From 0f8a3d721522b0a7b589c5e62612b2f06e4ad3b4 Mon Sep 17 00:00:00 2001
From: oliver
Date: Tue, 12 Nov 2024 23:08:09 -0700
Subject: [PATCH] using a custom parser now :)

Replace the markup5ever_rcdom DOM walk with a streaming html5ever
Tokenizer. A TokenSink in the new src/parser.rs collects links from
start tags and hands the found Website records back through
TokenizerResult::Script, which also lets the rcdom and xml5ever
dependencies be dropped.
---
 Cargo.lock    |  24 ----------
 Cargo.toml    |   1 -
 src/main.rs   | 121 ++++++--------------------------------------------
 src/parser.rs |  95 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 108 insertions(+), 133 deletions(-)
 create mode 100644 src/parser.rs

diff --git a/Cargo.lock b/Cargo.lock
index e205787..0732362 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1987,18 +1987,6 @@ dependencies = [
  "tendril",
 ]
 
-[[package]]
-name = "markup5ever_rcdom"
-version = "0.5.0-unofficial"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9cb12459c4cab18dcc580159590f404ad78c0a9c5435ace80288ed43abdce31"
-dependencies = [
- "html5ever 0.29.0",
- "markup5ever 0.14.0",
- "tendril",
- "xml5ever",
-]
-
 [[package]]
 name = "matchers"
 version = "0.1.0"
@@ -3653,7 +3641,6 @@ name = "surreal_spider"
 version = "0.1.0"
 dependencies = [
  "html5ever 0.29.0",
- "markup5ever_rcdom",
  "minio",
  "reqwest",
  "serde",
@@ -4725,17 +4712,6 @@ version = "0.8.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "af310deaae937e48a26602b730250b4949e125f468f11e6990be3e5304ddd96f"
 
-[[package]]
-name = "xml5ever"
-version = "0.20.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2278b4bf33071ba8e30368a59436c65eec8e01c49d5c29b3dfeb0cdc45331383"
-dependencies = [
- "log",
- "mac",
- "markup5ever 0.14.0",
-]
-
 [[package]]
 name = "xmltree"
 version = "0.11.0"
diff --git a/Cargo.toml b/Cargo.toml
index a9c7532..cba532f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,7 +5,6 @@ edition = "2021"
 
 [dependencies]
 html5ever = "0.29.0"
-markup5ever_rcdom = "0.5.0-unofficial"
 # minio = "0.1.0"
 minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"}
 reqwest = "0.12.9"
diff --git a/src/main.rs b/src/main.rs
index 56dc1b8..f19cd84 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,19 +1,16 @@
 extern crate html5ever;
-extern crate markup5ever_rcdom as rcdom;
+
+use std::time::Instant;
 
 use db::{connect, Website};
-use html5ever::{
-    local_name, parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts,
-};
-use rcdom::RcDom;
 use s3::S3;
-use std::time::Instant;
-use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};
+use surrealdb::{engine::remote::ws::Client, Surreal};
 use tracing::{debug, info, instrument, trace, trace_span};
 use tracing_subscriber::EnvFilter;
 
 mod db;
 mod s3;
+mod parser;
 
 struct Config<'a> {
     surreal_ns: &'a str,
@@ -39,19 +36,19 @@ async fn main() {
 
     let config = Config {
         surreal_ns: "test",
-        surreal_db: "v1.5",
+        surreal_db: "v1.7",
         surreal_url: "localhost:8000",
         surreal_username: "root",
         surreal_password: "root",
         s3_url: "http://localhost:9000",
-        s3_bucket: "v1.5",
+        s3_bucket: "v1.7",
         s3_access_key: "8tUJn7e1paMFZQr0PKIT",
         s3_secret_key: "uSMvYxNOeCejCUgXVqgTfYlUEcmiZY0xcZ91M9E0",
     };
 
     // Would probably take these in as parameters from a cli
     let starting_url = "https://oliveratkinson.net/";
-    let budget = 200;
+    let budget = 15;
     let mut crawled = 0;
 
     let s3 = S3::connect(&config).await.expect("Failed to connect to minio, aborting.");
@@ -75,11 +72,7 @@ async fn main() {
     let span = trace_span!("Loop");
     let span = span.enter();
     while crawled < budget {
-        let get_num = if budget - crawled < 100 {
-            budget - crawled
-        } else {
-            100
-        };
+        let get_num = if budget - crawled < 100 { budget - crawled } else { 100 };
         let uncrawled =
             get_uncrawled_links(&db, get_num).await;
         if uncrawled.len() == 0 {
@@ -93,6 +86,7 @@ async fn main() {
 
         for mut site in uncrawled {
             get(&mut site, &db, &reqwest, &s3, &mut crawled).await;
+            let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32);
             info!("Crawled {crawled} out of {budget} pages. ({percent})");
         }
 
@@ -103,7 +97,7 @@
 }
 
 #[instrument(skip_all)]
-/// A quick helper function for downloading a url
+/// Downloads, crawls, and stores a webpage.
 async fn get(
     site: &mut Website,
     db: &Surreal<Client>,
@@ -113,109 +107,20 @@
 ) {
     trace!("Get: {}", site.to_string());
     let timer = Timer::start("Got page");
-
     if let Ok(response) = reqwest.get(site.to_string()).send().await {
         timer.stop();
         // Get body
         let data = response.text().await.unwrap();
-        let opts = ParseOpts {
-            tree_builder: TreeBuilderOpts {
-                drop_doctype: true,
-                ..Default::default()
-            },
-            ..Default::default()
-        };
+
         // Store document
         s3.store(&data, &site.site).await;
-
-        // Get DOM
-        let dom = parse_document(RcDom::default(), opts)
-            .from_utf8()
-            .read_from(&mut data.as_bytes())
-            .unwrap();
-
-        // TODO save the dom to minio if a flag is set
-
-        // Modify record in database
-        site.set_crawled();
-        site.store(db).await;
-        trace!("Got: {}", site.to_string());
-
-        // Walk all the children nodes, searching for links to other pages.
-        let mut buffer = Vec::new();
-        let timer = Timer::start("Walked");
-        walk(&dom.document, &db, &site, &mut buffer).await;
-        timer.stop();
-
-        // Put all the found links into the database.
-        site.links_to(buffer, &db).await;
+        // Parse document and store relationships
+        parser::parse(db, site, data).await;
         *count += 1;
     }
     trace!("Failed to get: {}", site.to_string());
 }
 
-/// Walks the givin site, placing it's findings in the database
-async fn walk(
-    node: &rcdom::Handle,
-    db: &Surreal<Client>,
-    site: &Website,
-    links_to: &mut Vec<Thing>,
-) {
-    let span = trace_span!("Walk");
-    let span = span.enter();
-    // Match each node - node basically means element.
-    match &node.data {
-        rcdom::NodeData::Element {
-            name, attrs, ..
-        } => {
-            for attr in attrs.borrow().clone() {
-                match name.local {
-                    local_name!("a")
-                    | local_name!("audio")
-                    | local_name!("area")
-                    | local_name!("img")
-                    | local_name!("link")
-                    | local_name!("object")
-                    | local_name!("source")
-                    | local_name!("base")
-                    | local_name!("video") => {
-                        let attribute_name = attr.name.local.to_string();
-                        if attribute_name == "src"
-                            || attribute_name == "href"
-                            || attribute_name == "data"
-                        {
-                            // Get clone of the current site object
-                            let mut web = site.clone();
-
-                            // Set url
-                            let mut url = web.site;
-                            url.set_fragment(None); // removes #xyz
-                            let joined = url.join(&attr.value).unwrap();
-                            web.site = joined;
-                            // Set other attributes
-                            web.crawled = false;
-                            // TODO set element name
-                            // let element_name = name.local.to_string();
-                            if let Some(id) = web.store(db).await {
-                                links_to.push(id);
-                            }
-                        }
-                    }
-                    local_name!("button") | local_name!("meta") | local_name!("iframe") => {
-                        // dbg!(attrs);
-                    }
-                    _ => {}
-                };
-            }
-        }
-        _ => {}
-    };
-    drop(span);
-    for child in node.children.borrow().iter() {
-        Box::pin(walk(child, db, site, links_to)).await;
-    }
-}
-
 /// Returns uncrawled links
 async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Website> {
     if count > 100 {
diff --git a/src/parser.rs b/src/parser.rs
new file mode 100644
index 0000000..17143f6
--- /dev/null
+++ b/src/parser.rs
@@ -0,0 +1,95 @@
+use std::default::Default;
+use std::str::FromStr;
+
+use html5ever::tokenizer::{BufferQueue, TokenizerResult};
+use html5ever::tokenizer::{StartTag, TagToken};
+use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
+use html5ever::{local_name, tendril::*};
+use surrealdb::engine::remote::ws::Client;
+use surrealdb::Surreal;
+
+use crate::db::Website;
+
+#[derive(Clone)]
+struct LinkParser<'a> {
+    site: &'a Website,
+}
+
+impl TokenSink for LinkParser<'_> {
+    type Handle = Vec<Website>;
+
+    fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
+        match token {
+            TagToken(tag) => {
+                if tag.kind == StartTag {
+                    match tag.name {
+                        local_name!("a")
+                        | local_name!("audio")
+                        | local_name!("area")
+                        | local_name!("img")
+                        | local_name!("link")
+                        | local_name!("object")
+                        | local_name!("source")
+                        | local_name!("base")
+                        | local_name!("video") => {
+                            let mut links = Vec::new();
+                            for attr in &tag.attrs {
+                                let attr_name = attr.name.local.to_string();
+                                if attr_name == "src" || attr_name == "href" || attr_name == "data"
+                                {
+                                    // Get clone of the current site object
+                                    let mut web = self.site.clone();
+
+                                    // Set url
+                                    let mut url = web.site;
+                                    url.set_fragment(None); // removes #xyz
+                                    // Skip hrefs that don't join into a valid URL
+                                    let Ok(joined) = url.join(&attr.value) else { continue };
+                                    web.site = joined;
+                                    web.crawled = false;
+
+                                    links.push(web);
+                                }
+                            }
+
+                            return TokenSinkResult::Script(links);
+                        }
+                        local_name!("button") | local_name!("meta") | local_name!("iframe") => {
+                            // dbg!(attrs);
+                        }
+                        _ => {}
+                    }
+                }
+            }
+            _ => {}
+        }
+        TokenSinkResult::Continue
+    }
+}
+
+pub async fn parse(db: &Surreal<Client>, site: &mut Website, data: String) {
+    site.set_crawled();
+    site.store(db).await;
+
+    let sink = LinkParser { site };
+    let chunk = Tendril::from_str(&data).unwrap();
+    let mut input = BufferQueue::default();
+    input.push_back(chunk.try_reinterpret::<fmt::UTF8>().unwrap());
+
+    let tokenizer = Tokenizer::new(sink.clone(), TokenizerOpts::default());
+
+    let mut links_to = Vec::new();
+    while !input.is_empty() {
+        if let TokenizerResult::Script(s) = tokenizer.feed(&mut input) {
+            for mut web in s {
+                if let Some(id) = web.store(db).await {
+                    links_to.push(id);
+                }
+            }
+        }
+    }
+
+    sink.site.links_to(links_to, db).await;
+    assert!(input.is_empty());
+    tokenizer.end();
+}
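
For reference, a minimal, self-contained sketch of the html5ever 0.29 tokenizer
pattern that src/parser.rs relies on: a TokenSink driven through a BufferQueue
until the queue drains. The TagPrinter sink and the sample HTML are made up for
illustration (the real sink is LinkParser above); the feeding and end calls
mirror the ones in parser::parse.

use std::str::FromStr;

use html5ever::tendril::*;
use html5ever::tokenizer::{
    BufferQueue, TagToken, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
};

// Toy sink: prints every tag the tokenizer emits.
struct TagPrinter;

impl TokenSink for TagPrinter {
    type Handle = ();

    // html5ever 0.29 passes &self here, so a sink that wants to accumulate
    // state needs interior mutability; printing sidesteps that.
    fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
        if let TagToken(tag) = token {
            println!("{:?} {}", tag.kind, tag.name);
        }
        TokenSinkResult::Continue
    }
}

fn main() {
    let html = r#"<html><body><a href="/about">about</a></body></html>"#;

    // Same feeding pattern as parser::parse: wrap the input in a BufferQueue
    // and drive the tokenizer until the queue is drained.
    let chunk = Tendril::from_str(html).unwrap();
    let mut input = BufferQueue::default();
    input.push_back(chunk.try_reinterpret::<fmt::UTF8>().unwrap());

    let tokenizer = Tokenizer::new(TagPrinter, TokenizerOpts::default());
    while !input.is_empty() {
        let _ = tokenizer.feed(&mut input);
    }
    tokenizer.end();
}

Returning TokenSinkResult::Script from process_token, as LinkParser does,
pauses tokenization so the caller can act on the yielded handle before feeding
again; a sink like this one that only ever returns Continue never pauses.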