From 720adaa552fdd4b149172925ef82955ad8eb8391 Mon Sep 17 00:00:00 2001 From: oliver Date: Tue, 12 Nov 2024 17:50:06 -0700 Subject: [PATCH] added support for nearly all html tags that can have a link --- src/db.rs | 8 +++--- src/main.rs | 74 +++++++++++++++++++++++++++++++++++------------------ test | 1 - 3 files changed, 52 insertions(+), 31 deletions(-) delete mode 100644 test diff --git a/src/db.rs b/src/db.rs index f31000a..4b36ef9 100644 --- a/src/db.rs +++ b/src/db.rs @@ -15,7 +15,7 @@ pub struct Website { /// The url that this data is found at site: Url, /// Wether or not this link has been crawled yet - crawled: bool, + pub crawled: bool, #[serde(skip_serializing)] id: Option, } @@ -39,10 +39,6 @@ impl Website { self.crawled = true } - pub fn crawled(&mut self) -> &mut bool { - &mut self.crawled - } - pub fn mut_url(&mut self) -> &mut Url { &mut self.site } @@ -105,9 +101,11 @@ impl Website { if let Some(old) = response.take::>(0).unwrap() { // site exists already if let Some(id) = old.id { + // make sure to preserve the "crawled status" let mut new = self.clone(); new.crawled = old.crawled | new.crawled; + // update the record match db.upsert((id.tb, id.id.to_string())).content(new).await { Ok(e) => { if let Some(a) = e { diff --git a/src/main.rs b/src/main.rs index 8c0dd9a..feb9ba8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,9 @@ extern crate markup5ever_rcdom as rcdom; extern crate html5ever; -use std::{rc::Rc, time::Instant}; +use std::{path::is_separator, rc::Rc, time::Instant}; use db::{connect, Website}; -use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts}; +use html5ever::{local_name, parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts}; use rcdom::{Node, RcDom}; use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal}; use tracing::{debug, info, instrument, trace, trace_span, warn}; @@ -21,7 +21,8 @@ async fn main() { debug!("Starting..."); // Would probably take these in as parameters from a cli - let url = "https://oliveratkinson.net/"; + // let url = "https://oliveratkinson.net/"; + let url = "http://localhost:5500"; let budget = 50; let mut crawled = 0; @@ -36,9 +37,12 @@ async fn main() { // get() to work. let span = trace_span!("Pre-Loop"); let pre_loop_span = span.enter(); + // Download the site let mut site = Website::new(&url, false); let dom = get(&mut site, &db, &client).await.expect("Inital page returned None."); + crawl_wrapper(&dom, &db, &site, &mut crawled).await; + drop(pre_loop_span); let span = trace_span!("Loop"); @@ -49,6 +53,10 @@ async fn main() { } else {100}; let uncrawled = get_uncrawled_links(&db, get_num).await; + if uncrawled.len() == 0 { + info!("Had more budget but finished crawling everything."); + return; + } debug!("Crawling {} pages...", uncrawled.len()); let span = trace_span!("Crawling"); @@ -100,6 +108,8 @@ async fn get(site: &mut Website, db: &Surreal, getter: &reqwest::Client) .from_utf8() .read_from(&mut data.as_bytes()) .unwrap(); + + // TODO save the dom to minio if a flag is set site.set_crawled(); site.store(db).await; @@ -114,34 +124,48 @@ async fn get(site: &mut Website, db: &Surreal, getter: &reqwest::Client) async fn walk(node: &rcdom::Handle, db: &Surreal , site: &Website, links_to: &mut Vec) { let span = trace_span!("Walk"); let span = span.enter(); - + // Match each node - node basically means element. match &node.data { rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => { for attr in attrs.borrow().clone() { - if name.local.to_string() == "a" { - if attr.value.starts_with("mailto") { - trace!("Is mailto"); - // mailto link, lol - let _created: Option = db.create("email").content(db::Email { - email: attr.value.to_string(), - on: site.domain_str().to_owned(), - }).await.unwrap(); - } else { - let mut web = site.clone(); - let url = web.mut_url(); + match name.local { + local_name!("a") | + local_name!("audio") | + local_name!("area") | + local_name!("img") | + local_name!("link") | + local_name!("object") | + local_name!("source") | + local_name!("base") | + local_name!("video") => { + let attribute_name = attr.name.local.to_string(); + if attribute_name == "src" || attribute_name == "href" || attribute_name == "data" { + // Get clone of the current site object + let mut web = site.clone(); + + // Set url + let url = web.mut_url(); + url.set_fragment(None); // removes #xyz + let joined = url.join(&attr.value).unwrap(); + *url = joined; - // TODO remove #xyz - let joined = url.join(&attr.value).unwrap(); - *url = joined; + // Set other attributes + web.crawled = false; + // TODO set element name + // let element_name = name.local.to_string(); - let crawled = web.crawled(); - *crawled = false; - - if let Some(id) = web.store(db).await { - links_to.push(id); + if let Some(id) = web.store(db).await { + links_to.push(id); + } } + }, + local_name!("button") | + local_name!("meta") | + local_name!("iframe") => { + // dbg!(attrs); } - } + _ => {/**/} + }; }; }, _ => {}, @@ -183,4 +207,4 @@ impl<'a> Drop for Timer<'a> { let dif = self.start.elapsed().as_micros(); debug!("{}", format!("{} in {:.3}ms", self.msg, dif as f64/1000.)); } -} \ No newline at end of file +} diff --git a/test b/test deleted file mode 100644 index 52d456b..0000000 --- a/test +++ /dev/null @@ -1 +0,0 @@ -[{"result":[{"accessed_at":"2024-08-25T20:07:25.969525156Z","crawled":false,"domain":"google.com","id":"website:fd46b0cr5f5y3d57eje8","path":"/","url":"https://google.com"}],"status":"OK","time":"205.7