use std::default::Default;

use html5ever::tokenizer::{BufferQueue, TokenizerResult};
use html5ever::tokenizer::{StartTag, TagToken};
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use html5ever::{local_name, tendril::*};
use tracing::{error, instrument, trace, warn};
use url::Url;

use crate::db::Website;

impl TokenSink for Website {
    type Handle = Vec<Website>;

    fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
        match token {
            TagToken(tag) => {
                if tag.kind == StartTag {
                    match tag.name {
                        // this should be all the html elements that have links
                        local_name!("a")
                        | local_name!("audio")
                        | local_name!("area")
                        | local_name!("img")
                        | local_name!("link")
                        | local_name!("object")
                        | local_name!("source")
                        | local_name!("base")
                        | local_name!("video") => {
                            let mut links = Vec::new();
                            for attr in &tag.attrs {
                                let attr_name = attr.name.local.to_string();
                                if attr_name == "src" || attr_name == "href" || attr_name == "data" {
                                    trace!(
                                        url = self.site.as_str(),
                                        "Found `{}` in html `{}` tag",
                                        &attr.value,
                                        tag.name
                                    );
                                    let url = try_get_url(&self.site, &attr.value);

                                    if let Some(mut parsed) = url {
                                        // strip the query and fragment so duplicates collapse
                                        parsed.set_query(None);
                                        parsed.set_fragment(None);
                                        trace!(
                                            url = self.site.as_str(),
                                            "Final cleaned URL: `{}`",
                                            parsed.to_string()
                                        );
                                        let web = Website::new(&parsed.to_string(), false);
                                        links.push(web);
                                    }
                                }
                            }
                            return TokenSinkResult::Script(links);
                        }
                        local_name!("button") | local_name!("meta") | local_name!("iframe") => {
                            // dbg!(attrs);
                        }
                        _ => {}
                    }
                }
            }
            _ => {}
        }
        TokenSinkResult::Continue
    }
}

/// Parses the passed site and returns all the sites it links to.
#[instrument(skip(data))]
pub async fn parse(site: &Website, data: &[u8]) -> Vec<Website> {
    trace!(url = site.site.as_str(), "Parsing {}", site.site.to_string());
    // prep work
    let mut other_sites: Vec<Website> = Vec::new();

    // change data into something that can be tokenized
    let s: Result<Tendril<fmt::UTF8>, ()> = Tendril::try_from_byte_slice(data);
    if let Ok(chunk) = s {
        // create buffer of tokens and push our input into it
        let token_buffer = BufferQueue::default();
        token_buffer.push_back(
            chunk
                .try_reinterpret::<fmt::UTF8>()
                .expect("Failed to reinterpret chunk!"),
        );
        // create the tokenizer
        let tokenizer = Tokenizer::new(site.clone(), TokenizerOpts::default());

        // go through the buffer; each `Script` result carries the links found so far
        while let TokenizerResult::Script(mut sites) = tokenizer.feed(&token_buffer) {
            other_sites.append(&mut sites);
        }
        assert!(token_buffer.is_empty());
        tokenizer.end();
    } else {
        warn!(
            url = site.site.as_str(),
            "Tendril failed to parse on: {}",
            site.site.to_string()
        );
    }

    other_sites
}

#[instrument]
fn try_get_url(parent: &Url, link: &str) -> Option<Url> {
    match Url::parse(link) {
        Ok(ok) => Some(ok),
        Err(e) => {
            if link.starts_with('#') {
                trace!(url = parent.as_str(), "Rejecting # url");
                None
            } else if link.starts_with("//") {
                // if a url starts with "//" it is assumed that it will adopt
                // the same scheme as its parent
                // https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute
                let scheme = parent.scheme();
                match Url::parse(&format!("{scheme}://{link}")) {
                    Ok(url) => Some(url),
                    Err(err) => {
                        error!("Failed parsing relative scheme url: {}", err);
                        None
                    }
                }
            } else {
                // This is some sort of relative url; try patching it up into an
                // absolute url.
                match e {
                    url::ParseError::RelativeUrlWithoutBase => {
                        // Is: scheme://host:port
                        let mut origin = parent.origin().ascii_serialization();
                        if !origin.ends_with('/') && !link.starts_with('/') {
                            origin += "/";
                        }
                        let url = origin.clone() + link;

                        if let Ok(url) = Url::parse(&url) {
                            trace!(url = parent.as_str(), "Built `{url}` from `{origin}` + `{}`", link.to_string());
                            Some(url)
                        } else {
                            error!(
                                "Failed to reconstruct a url from relative url: `{}` on site: `{}`. Failed url was: {}",
                                link,
                                parent.to_string(),
                                url
                            );
                            None
                        }
                    }
                    _ => {
                        error!("MISC error: {:?} {:?}", e, link);
                        None
                    }
                }
            }
        }
    }
}
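
// A minimal test sketch (not present in the original file) illustrating the
// resolution rules `try_get_url` implements above: absolute urls pass straight
// through, `#`-only fragments are rejected, `//` links adopt the parent's
// scheme, and bare relative links are joined onto the parent's *origin*, so
// the parent's path does not survive. The hosts and paths used here are
// hypothetical.
#[cfg(test)]
mod tests {
    use super::try_get_url;
    use url::Url;

    #[test]
    fn resolves_links_against_parent() {
        let parent = Url::parse("https://example.com/dir/page.html").unwrap();

        // Absolute urls parse as-is.
        let absolute = try_get_url(&parent, "https://other.example/x").unwrap();
        assert_eq!(absolute.as_str(), "https://other.example/x");

        // Fragment-only links never leave the page, so they are dropped.
        assert!(try_get_url(&parent, "#section").is_none());

        // Scheme-relative links inherit the parent's scheme.
        let scheme_rel = try_get_url(&parent, "//cdn.example/app.js").unwrap();
        assert_eq!(scheme_rel.as_str(), "https://cdn.example/app.js");

        // Everything else is joined onto the parent's origin, not its full
        // path: note that `/dir/` is discarded.
        let relative = try_get_url(&parent, "style.css").unwrap();
        assert_eq!(relative.as_str(), "https://example.com/style.css");
    }
}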