From 5404d5c3e81b6958f4441adbfd832be80958eed2 Mon Sep 17 00:00:00 2001 From: oliver Date: Sat, 9 Nov 2024 23:30:57 -0700 Subject: [PATCH] it works :party: --- src/db.rs | 49 +++++++++++++++++++++++++++++++++++-------------- src/main.rs | 47 +++++++++++++++++++++++++++++++++-------------- 2 files changed, 68 insertions(+), 28 deletions(-) diff --git a/src/db.rs b/src/db.rs index 76fc321..f20ae84 100644 --- a/src/db.rs +++ b/src/db.rs @@ -5,7 +5,7 @@ use surrealdb::{ sql::Thing, Response, Surreal, }; -use tracing::{debug, error, info, instrument, trace}; +use tracing::{debug, error, instrument, trace, warn}; use url::Url; #[derive(Debug, Serialize, Deserialize, Clone)] @@ -50,19 +50,36 @@ impl Website { } #[instrument(skip_all)] - pub async fn links_to(&self, other: &Self, db: &Surreal) { - let from = self.site.to_string(); - let to = other.site.to_string(); - trace!("Linking {from} to {to}"); + pub async fn links_to(&self, other: Vec, db: &Surreal) { + let len = other.len(); + if len == 0 {return} + let from = self.site.to_string(); + // let to = other.site.to_string(); + trace!("Linking {from} to {} other pages.", other.len()); match db - .query("RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> (SELECT id FROM website WHERE site = $out)") + .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)") .bind(("in", from)) - .bind(("out", to)) + .bind(("out", other)) .await { - Ok(e) => { + Ok(mut e) => { + // The relate could technically "fail" (not relate anything), this just means that + // the query was ok. let _: Response = e; + if let Ok(vec) = e.take(0) { + let _: Vec = vec; + if let Some(num) = vec.get(0) { + if *num == len { + debug!("Link OK"); + return; + } else { + warn!("Didn't link all the records. {num}/{len}"); + return; + } + } + } + warn!("Linking request succeeded but couldn't verify the results."); }, Err(e) => { error!("{}", e.to_string()); @@ -71,7 +88,7 @@ impl Website { } #[instrument(skip_all)] - pub async fn store(&mut self, db: &Surreal) { + pub async fn store(&mut self, db: &Surreal) -> Option { // check if it's been gone thru before let mut response = db .query("SELECT * FROM ONLY website WHERE site = $site LIMIT 1") @@ -87,8 +104,9 @@ impl Website { match db.upsert((id.tb, id.id.to_string())).content(new).await { Ok(e) => { - if let Some(a) = &e { - let _: &Record = a; + if let Some(a) = e { + let _: Record = a; + return Some(a.id); } } Err(e) => { @@ -101,13 +119,15 @@ impl Website { match db.create("website").content(self.clone()).await { Ok(e) => { let _: Option = e; - if let Some(a) = &e { - let _: &Record = a; + if let Some(a) = e { + let _: Record = a; + return Some(a.id); } } Err(a) => error!("{:?}", a), }; } + None } } @@ -120,6 +140,7 @@ impl ToString for Website { #[derive(Debug, Serialize)] pub struct Email { pub email: String, + pub on: String, } #[derive(Debug, Deserialize)] @@ -140,7 +161,7 @@ pub async fn connect() -> surrealdb::Result> { .await?; // Select a specific namespace / database - db.use_ns("test").use_db("time").await?; + db.use_ns("test").use_db("v1.2").await?; Ok(db) } diff --git a/src/main.rs b/src/main.rs index cebe7b7..6dc17ee 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,11 +1,11 @@ extern crate markup5ever_rcdom as rcdom; extern crate html5ever; -use std::rc::Rc; +use std::{rc::Rc, time::Instant}; use db::{connect, Website}; use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts}; use rcdom::{Node, RcDom}; -use surrealdb::{engine::remote::ws::Client, Surreal}; +use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal}; use tracing::{debug, info, instrument, trace, trace_span, warn}; use tracing_subscriber::EnvFilter; @@ -33,15 +33,17 @@ async fn main() { let pre_loop_span = span.enter(); let mut site = Website::new(&url, false); let dom = get(&mut site, &db).await.expect("Inital page returned None."); - crawled += 1; - walk(&dom, &db, &site).await; + crawl_wrapper(&dom, &db, &site, &mut crawled).await; drop(pre_loop_span); let span = trace_span!("Loop"); let span = span.enter(); - // Can go upto 49 above budget because the reterival function gets 50 no matter what while crawled < budget { - let uncrawled = get_uncrawled_links(&db, 100).await; + let get_num = if budget - crawled < 100 { + budget - crawled + } else {100}; + + let uncrawled = get_uncrawled_links(&db, get_num).await; debug!("Crawling {} pages...", uncrawled.len()); let span = trace_span!("Crawling"); @@ -49,9 +51,10 @@ async fn main() { for mut site in uncrawled { if let Some(dom) = get(&mut site, &db).await { - trace!("pre-walk checkpoint"); - walk(&dom, &db, &site).await; - crawled += 1; + trace!("Pre-walk checkpoint"); + + crawl_wrapper(&dom, &db, &site, &mut crawled).await; + let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32); info!("Crawled {crawled} out of {budget} pages. ({percent})"); } else { @@ -64,11 +67,25 @@ async fn main() { info!("Done"); } +async fn crawl_wrapper(dom: &Rc, db: &Surreal, site: &Website, count: &mut usize) { + let mut buffer = Vec::new(); + let now = Instant::now(); + walk(&dom, &db, &site, &mut buffer).await; + let dif = now.elapsed().as_micros(); + trace!("{}", format!("Walked in {:.3}ms", dif as f64/1000.)); + site.links_to(buffer, &db).await; + *count += 1; +} + #[instrument(skip_all)] /// A quick helper function for downloading a url async fn get(site: &mut Website, db: &Surreal) -> Option> { trace!("Get: {}", site.to_string()); + let now = Instant::now(); if let Ok(response) = reqwest::get(site.to_string()).await { + let dif = now.elapsed().as_micros(); + trace!("{}", format!("Got page in {:.3}ms", dif as f64/1000.)); + let data = response.text().await.unwrap(); let opts = ParseOpts { tree_builder: TreeBuilderOpts { @@ -93,7 +110,7 @@ async fn get(site: &mut Website, db: &Surreal) -> Option> { } /// Walks the givin site, placing it's findings in the database -async fn walk(node: &rcdom::Handle, db: &Surreal , site: &Website) { +async fn walk(node: &rcdom::Handle, db: &Surreal , site: &Website, links_to: &mut Vec) { let span = trace_span!("Walk"); let span = span.enter(); @@ -105,7 +122,8 @@ async fn walk(node: &rcdom::Handle, db: &Surreal , site: &Website) { trace!("Is mailto"); // mailto link, lol let _created: Option = db.create("email").content(db::Email { - email: attr.value.to_string() + email: attr.value.to_string(), + on: site.domain_str().to_owned(), }).await.unwrap(); } else { let mut web = site.clone(); @@ -118,8 +136,9 @@ async fn walk(node: &rcdom::Handle, db: &Surreal , site: &Website) { let crawled = web.crawled(); *crawled = false; - site.links_to(&web, &db).await; - web.store(db).await; + if let Some(id) = web.store(db).await { + links_to.push(id); + } } } }; @@ -128,7 +147,7 @@ async fn walk(node: &rcdom::Handle, db: &Surreal , site: &Website) { }; drop(span); for child in node.children.borrow().iter() { - Box::pin(walk(child, db, site)).await; + Box::pin(walk(child, db, site, links_to)).await; } }