use std::default::Default;
use std::str::FromStr;

use html5ever::tokenizer::{BufferQueue, TokenizerResult};
use html5ever::tokenizer::{StartTag, TagToken};
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use html5ever::{local_name, tendril::*};
use surrealdb::engine::remote::ws::Client;
use surrealdb::Surreal;
use tracing::instrument;

use crate::db::Website;
use crate::Timer;

impl TokenSink for Website {
    type Handle = Vec<Website>;

    fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
        match token {
            TagToken(tag) => {
                if tag.kind == StartTag {
                    match tag.name {
                        // Tags whose attributes can carry outgoing links
                        local_name!("a")
                        | local_name!("audio")
                        | local_name!("area")
                        | local_name!("img")
                        | local_name!("link")
                        | local_name!("object")
                        | local_name!("source")
                        | local_name!("base")
                        | local_name!("video") => {
                            let mut links = Vec::new();
                            for attr in &tag.attrs {
                                let attr_name = attr.name.local.to_string();
                                if attr_name == "src" || attr_name == "href" || attr_name == "data" {
                                    // Get a clone of the current site object
                                    let mut web = self.clone();

                                    // Set the url
                                    let mut url = web.site;
                                    url.set_fragment(None); // removes #xyz
                                    let joined = url
                                        .join(&attr.value)
                                        .expect("Failed to join url during parsing!");
                                    web.site = joined;

                                    web.crawled = false;

                                    links.push(web);
                                }
                            }
                            // Returning Script pauses the tokenizer; the feed loop
                            // in `parse` collects these links and resumes.
                            return TokenSinkResult::Script(links);
                        }
                        local_name!("button") | local_name!("meta") | local_name!("iframe") => {
                            // dbg!(attrs);
                        }
                        _ => {}
                    }
                }
            }
            _ => {}
        }
        TokenSinkResult::Continue
    }
}

/// Marks `site` as crawled and stores it, then tokenizes `data` and records
/// every page it links to in the db.
#[instrument(skip_all)]
pub async fn parse(db: &Surreal<Client>, site: &mut Website, data: &str) {
    // update self in db
    site.set_crawled();
    site.store(db).await;

    // prep work
    let mut other_sites: Vec<Website> = Vec::new();
    {
        // using a block to prevent the compiler's async (Send) worries
        let _t = Timer::start("Parsed page");

        // change data into something that can be tokenized
        let chunk = Tendril::from_str(data).expect("Failed to parse string into Tendril!");
        // create a buffer of tokens and push our input into it
        let mut token_buffer = BufferQueue::default();
        token_buffer.push_back(
            chunk
                .try_reinterpret::<fmt::UTF8>()
                .expect("Failed to reinterpret chunk!"),
        );
        // create the tokenizer, using the site itself as the token sink
        let tokenizer = Tokenizer::new(site.clone(), TokenizerOpts::default());

        // drain the buffer; each Script result is a batch of links from the sink
        while let TokenizerResult::Script(mut sites) = tokenizer.feed(&mut token_buffer) {
            other_sites.append(&mut sites);
        }
        assert!(token_buffer.is_empty());
        tokenizer.end();
    }

    {
        // store each discovered link, then record the edges in the db
        let mut links_to = Vec::with_capacity(other_sites.len());

        for a in other_sites {
            let other = a.store(db).await;
            if let Some(o) = other {
                links_to.push(o);
            }
        }

        site.links_to(links_to, db).await;
    }
}
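
// ---------------------------------------------------------------------------
// A minimal, self-contained sketch of the same sink/feed pattern, kept here
// purely as a reference for how `TokenSinkResult::Script` drives the feed
// loop in `parse` above. It assumes html5ever >= 0.27 (the `&self` TokenSink
// API this file uses); `HrefSink` and `collects_hrefs` are illustrative names,
// not part of the crate. The sink collects raw `href` strings instead of
// `Website`s, so it needs no db or crate-local types.
// ---------------------------------------------------------------------------
#[cfg(test)]
mod sink_sketch {
    use std::str::FromStr;

    use html5ever::tendril::*;
    use html5ever::tokenizer::{
        BufferQueue, TagToken, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
        TokenizerResult,
    };

    // A sink that emits a batch of href values whenever a tag carries them.
    struct HrefSink;

    impl TokenSink for HrefSink {
        type Handle = Vec<String>;

        fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
            if let TagToken(tag) = token {
                let hrefs: Vec<String> = tag
                    .attrs
                    .iter()
                    .filter(|a| a.name.local.to_string() == "href")
                    .map(|a| a.value.to_string())
                    .collect();
                if !hrefs.is_empty() {
                    // Script pauses tokenization and hands the batch back to
                    // whoever called feed().
                    return TokenSinkResult::Script(hrefs);
                }
            }
            TokenSinkResult::Continue
        }
    }

    #[test]
    fn collects_hrefs() {
        let html = r#"<a href="/one">1</a><a href="/two">2</a>"#;
        let chunk = Tendril::from_str(html).expect("valid UTF-8");

        let mut queue = BufferQueue::default();
        queue.push_back(chunk.try_reinterpret::<fmt::UTF8>().expect("UTF-8 chunk"));

        let tokenizer = Tokenizer::new(HrefSink, TokenizerOpts::default());
        let mut found = Vec::new();
        // Same loop shape as `parse` above: keep feeding until the input is drained.
        while let TokenizerResult::Script(mut hrefs) = tokenizer.feed(&mut queue) {
            found.append(&mut hrefs);
        }
        tokenizer.end();

        assert_eq!(found, vec!["/one".to_string(), "/two".to_string()]);
    }
}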