diff --git a/.vscode/launch.json b/.vscode/launch.json
index 93262ee..58b82e5 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -9,7 +9,7 @@
             "request": "launch",
             "name": "Debug executable 'surreal_spider'",
             "env": {
-                "RUST_LOG": "surreal_spider=trace",
+                "RUST_LOG": "surreal_spider=trace,reqwest=trace",
             },
             "cargo": {
                 "args": [
diff --git a/compose.yml b/compose.yml
index 830c890..49b676d 100644
--- a/compose.yml
+++ b/compose.yml
@@ -3,6 +3,8 @@ services:
     image: surrealdb/surrealdb:latest-dev
     ports:
       - 8000:8000
+    volumes:
+      - ./.surrealdb/:/mydata
     command:
       - start
       - --log
@@ -11,3 +13,4 @@
       - root
       - --pass
       - root
+      - rocksdb:/mydata/database.db
diff --git a/src/db.rs b/src/db.rs
index f12593e..76fc321 100644
--- a/src/db.rs
+++ b/src/db.rs
@@ -3,126 +3,117 @@ use surrealdb::{
     engine::remote::ws::{Client, Ws},
     opt::auth::Root,
     sql::Thing,
-    Surreal,
+    Response, Surreal,
 };
-use tracing::{debug, error, info, instrument};
+use tracing::{debug, error, info, instrument, trace};
 use url::Url;
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct Website {
     /// The url that this data is found at
     site: Url,
-    /// The url as defined in the <a> tag
-    href: Url,
     /// Whether or not this link has been crawled yet
     crawled: bool,
-    /// Whether or not the href was doctored
-    doctored_href: bool,
-    original_href: Option<String>,
+    #[serde(skip_serializing)]
+    id: Option<Thing>,
 }
 
 impl Website {
     /// Creates a blank site (assumes that url param is site's root)
-    pub fn new(url: &str, href: &str, crawled: bool) -> Self {
-        let mut new = Self::from(url);
-        new.crawled = crawled;
-        new.original_href = Some(href.to_string());
-        new.href =
-            match Url::parse(href) {
-                Ok(e) => e,
-                Err(e) => {
-                    match e {
-                        url::ParseError::RelativeUrlWithoutBase => {
-                            // Try to combine the scheme_host and href to get a usable domain
-                            new.doctored_href = true;
-
-                            let url = if !url.ends_with('/') && !href.starts_with('/') {
-                                format!("{url}/{href}")
-                            } else {
-                                format!("{url}{href}")
-                            };
-
-                            // paste the domain onto the beginning of the href
-                            Url::parse(&url).map_or_else(|err| {
-                                debug!("Parsing {url} with {href}");
-                                error!("{err} Failed to parse href into url on second try. Aborting");
-                                panic!("See error logs for more info.");
-                            }, |ok| ok)
-                        }
-                        _ => {
-                            error!("{e}");
-                            panic!("See error logs for more info.");
-                        }
-                    }
-                }
-            };
-        new
+    pub fn new(url: &str, crawled: bool) -> Self {
+        let site = match Url::parse(url) {
+            Ok(a) => a,
+            Err(_) => todo!(),
+        };
+        Self {
+            id: None,
+            crawled,
+            site,
+        }
     }
-    pub fn crawled(&mut self) {
+
+    pub fn set_crawled(&mut self) {
+        trace!("Set crawled to true");
         self.crawled = true
     }
-    pub fn href_str(&self) -> &str {
-        self.href.as_str()
+
+    pub fn crawled(&mut self) -> &mut bool {
+        &mut self.crawled
     }
-    pub fn site(&self) -> String {
-        self.site.to_string()
+
+    pub fn mut_url(&mut self) -> &mut Url {
+        &mut self.site
     }
+
     pub fn domain_str(&self) -> &str {
         self.site.as_str()
     }
+
+    #[instrument(skip_all)]
+    pub async fn links_to(&self, other: &Self, db: &Surreal<Client>) {
+        let from = self.site.to_string();
+        let to = other.site.to_string();
+        trace!("Linking {from} to {to}");
+
+        match db
+            .query("RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> (SELECT id FROM website WHERE site = $out)")
+            .bind(("in", from))
+            .bind(("out", to))
+            .await
+        {
+            Ok(e) => {
+                let _: Response = e;
+            },
+            Err(e) => {
+                error!("{}", e.to_string());
+            },
+        }
+    }
+
     #[instrument(skip_all)]
     pub async fn store(&mut self, db: &Surreal<Client>) {
-        // is root record?
-        if self.href.path() == "/" {
-            // Upsert is create or update
-            // Whereas Update is just update
-            let record = ("website", &self.href.to_string());
+        // check if it's been gone through before
+        let mut response = db
+            .query("SELECT * FROM ONLY website WHERE site = $site LIMIT 1")
+            .bind(("site", self.site.to_string()))
+            .await
+            .unwrap();
 
-            let crawled = if let Some(old) = db.select(record).await.unwrap() {
-                let old: Website = old; // infer type
-                old.crawled
-            } else {false};
+        if let Some(old) = response.take::<Option<Website>>(0).unwrap() {
+            // site exists already
+            if let Some(id) = old.id {
+                let mut new = self.clone();
+                new.crawled = old.crawled | new.crawled;
 
-            if !self.crawled {self.crawled = crawled};
-
-            match db.upsert(record).content(self.clone()).await {
-                Ok(e) => {
-                    if let Some(a) = &e {
-                        let _: &Record = a;
+                match db.upsert((id.tb, id.id.to_string())).content(new).await {
+                    Ok(e) => {
+                        if let Some(a) = &e {
+                            let _: &Record = a;
+                        }
                     }
-                }
-                Err(e) => {
-                    error!("{}", e);
-                },
-            };
+                    Err(e) => {
+                        error!("{}", e);
+                    }
+                };
+            }
         } else {
-            let _: Option<Record> = match db.create("website").content(self.clone()).await {
+            // site doesn't exist yet
+            match db.create("website").content(self.clone()).await {
                 Ok(e) => {
+                    let _: Option<Record> = e;
                     if let Some(a) = &e {
                         let _: &Record = a;
                     }
-                    e
                 }
-                Err(_) => todo!(),
+                Err(a) => error!("{:?}", a),
             };
         }
     }
 }
 
-impl From<&str> for Website {
-    /// site == href, crawled = false
-    fn from(value: &str) -> Self {
-        let site = match Url::parse(value) {
-            Ok(a) => a,
-            Err(_) => todo!(),
-        };
-        Self {
-            href: site.clone(),
-            crawled: false,
-            site,
-            doctored_href: false,
-            original_href: None,
-        }
+impl ToString for Website {
+    fn to_string(&self) -> String {
+        self.site.to_string()
     }
 }
 
@@ -149,7 +140,7 @@ pub async fn connect() -> surrealdb::Result<Surreal<Client>> {
         .await?;
 
     // Select a specific namespace / database
-    db.use_ns("test").use_db("test").await?;
+    db.use_ns("test").use_db("time").await?;
 
     Ok(db)
 }
diff --git a/src/main.rs b/src/main.rs
index af2fdde..cebe7b7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -6,7 +6,7 @@ use db::{connect, Website};
 use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
 use rcdom::{Node, RcDom};
 use surrealdb::{engine::remote::ws::Client, Surreal};
-use tracing::{debug, info, instrument};
+use tracing::{debug, info, instrument, trace, trace_span, warn};
 use tracing_subscriber::EnvFilter;
 
 mod db;
@@ -29,24 +29,37 @@ async fn main() {
 
     // Kick off the whole machine - This Website object doesn't matter, it's just to allow for
     // get() to work.
-    let mut site = Website::from(url);
+    let span = trace_span!("Pre-Loop");
+    let pre_loop_span = span.enter();
+    let mut site = Website::new(&url, false);
     let dom = get(&mut site, &db).await.expect("Initial page returned None.");
     crawled += 1;
     walk(&dom, &db, &site).await;
+    drop(pre_loop_span);
 
+    let span = trace_span!("Loop");
+    let span = span.enter();
+    // Can go up to 99 above budget because the retrieval function gets 100 no matter what
     while crawled < budget {
-        let uncrawled = get_uncrawled_links(&db).await;
+        let uncrawled = get_uncrawled_links(&db, 100).await;
         debug!("Crawling {} pages...", uncrawled.len());
 
+        let span = trace_span!("Crawling");
+        let _span = span.enter();
+
         for mut site in uncrawled {
             if let Some(dom) = get(&mut site, &db).await {
+                trace!("pre-walk checkpoint");
                 walk(&dom, &db, &site).await;
                 crawled += 1;
                 let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
                 info!("Crawled {crawled} out of {budget} pages. ({percent})");
({percent})"); + } else { + warn!("Failed to get {}", site.to_string()); } } } + drop(span); info!("Done"); } @@ -54,7 +67,8 @@ async fn main() { #[instrument(skip_all)] /// A quick helper function for downloading a url async fn get(site: &mut Website, db: &Surreal) -> Option> { - if let Ok(response) = reqwest::get(site.href_str()).await { + trace!("Get: {}", site.to_string()); + if let Ok(response) = reqwest::get(site.to_string()).await { let data = response.text().await.unwrap(); let opts = ParseOpts { tree_builder: TreeBuilderOpts { @@ -69,29 +83,42 @@ async fn get(site: &mut Website, db: &Surreal) -> Option> { .read_from(&mut data.as_bytes()) .unwrap(); - site.crawled(); + site.set_crawled(); site.store(db).await; + trace!("Got: {}", site.to_string()); return Some(dom.document); } + trace!("Failed to get: {}", site.to_string()); None } /// Walks the givin site, placing it's findings in the database -async fn walk(node: &rcdom::Handle, db: &Surreal , site_name: &Website) { - // Insert Or Update - // create_root(site_name, db).await; +async fn walk(node: &rcdom::Handle, db: &Surreal , site: &Website) { + let span = trace_span!("Walk"); + let span = span.enter(); match &node.data { rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => { for attr in attrs.borrow().clone() { if name.local.to_string() == "a" { if attr.value.starts_with("mailto") { + trace!("Is mailto"); // mailto link, lol let _created: Option = db.create("email").content(db::Email { email: attr.value.to_string() }).await.unwrap(); } else { - let mut web = Website::new(&site_name.site(), &attr.value, false); + let mut web = site.clone(); + let url = web.mut_url(); + + // TODO remove #xyz + let joined = url.join(&attr.value).unwrap(); + *url = joined; + + let crawled = web.crawled(); + *crawled = false; + + site.links_to(&web, &db).await; web.store(db).await; } } @@ -99,15 +126,23 @@ async fn walk(node: &rcdom::Handle, db: &Surreal , site_name: &Website) }, _ => {}, }; - + drop(span); for child in node.children.borrow().iter() { - Box::pin(walk(child, db, site_name)).await; + Box::pin(walk(child, db, site)).await; } } -/// Returns 0-50 uncrawled links (LIMIT = 50) -async fn get_uncrawled_links(db: &Surreal) -> Vec { - let mut response = db.query("SELECT * FROM website WHERE crawled = false LIMIT 50").await.expect("Hard-coded query failed..?"); +/// Returns uncrawled links +async fn get_uncrawled_links(db: &Surreal, mut count: usize) -> Vec { + if count > 100 { + count = 100 + } + + let mut response = db + .query("SELECT * FROM website WHERE crawled = false LIMIT $count") + .bind(("count", count)) + .await + .expect("Hard-coded query failed..?"); response.take(0).expect("Returned websites couldn't be parsed") }