diff --git a/.gitignore b/.gitignore
index ea8c4bf..7b6584d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,4 @@
 /target
+perf.data
+flamegraph.svg
+perf.data.old
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 58b82e5..76fa95c 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -9,7 +9,7 @@
             "request": "launch",
             "name": "Debug executable 'surreal_spider'",
             "env": {
-                "RUST_LOG": "surreal_spider=trace,reqwest=trace",
+                "RUST_LOG": "surreal_spider=debug,reqwest=info",
             },
             "cargo": {
                 "args": [
diff --git a/src/db.rs b/src/db.rs
index f20ae84..b82f965 100644
--- a/src/db.rs
+++ b/src/db.rs
@@ -8,6 +8,8 @@ use surrealdb::{
 use tracing::{debug, error, instrument, trace, warn};
 use url::Url;
 
+use crate::Timer;
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct Website {
     /// The url that this data is found at
@@ -57,6 +59,10 @@ impl Website {
         let from = self.site.to_string();
         // let to = other.site.to_string();
         trace!("Linking {from} to {} other pages.", other.len());
+        let msg = format!("Linked {len} pages");
+        // NOTE: bind to a *named* `_timer`, not `let _ = timer` — the wildcard
+        // pattern drops (and logs) the guard immediately instead of at scope end.
+        let _timer = Timer::start(&msg);
         match db
             .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
             .bind(("in", from))
@@ -71,7 +77,7 @@
         let _: Vec<usize> = vec;
         if let Some(num) = vec.get(0) {
             if *num == len {
-                debug!("Link OK");
+                trace!("Link OK");
                 return;
             } else {
                 warn!("Didn't link all the records. {num}/{len}");
diff --git a/src/main.rs b/src/main.rs
index 6dc17ee..49332c0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -51,10 +48,7 @@ async fn main() {
     for mut site in uncrawled {
         if let Some(dom) = get(&mut site, &db).await {
-            trace!("Pre-walk checkpoint");
-
             crawl_wrapper(&dom, &db, &site, &mut crawled).await;
-
             let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
             info!("Crawled {crawled} out of {budget} pages. 
 ({percent})");
         } else {
@@ -69,10 +66,9 @@
 
 async fn crawl_wrapper(dom: &Rc, db: &Surreal, site: &Website, count: &mut usize) {
     let mut buffer = Vec::new();
-    let now = Instant::now();
+    let timer = Timer::start("Walked");
     walk(&dom, &db, &site, &mut buffer).await;
-    let dif = now.elapsed().as_micros();
-    trace!("{}", format!("Walked in {:.3}ms", dif as f64/1000.));
+    drop(timer);
     site.links_to(buffer, &db).await;
     *count += 1;
 }
@@ -81,10 +77,9 @@
 /// A quick helper function for downloading a url
 async fn get(site: &mut Website, db: &Surreal) -> Option> {
     trace!("Get: {}", site.to_string());
-    let now = Instant::now();
+    let timer = Timer::start("Got page");
     if let Ok(response) = reqwest::get(site.to_string()).await {
-        let dif = now.elapsed().as_micros();
-        trace!("{}", format!("Got page in {:.3}ms", dif as f64/1000.));
+        drop(timer);
         let data = response.text().await.unwrap();
 
         let opts = ParseOpts {
@@ -165,3 +160,21 @@ async fn get_uncrawled_links(db: &Surreal, mut count: usize) -> Vec<Website> {
+pub struct Timer<'a> {
+    start: Instant,
+    msg: &'a str,
+}
+
+impl<'a> Timer<'a> {
+    #[inline]
+    pub fn start(msg: &'a str) -> Self {
+        Self { start: Instant::now(), msg }
+    }
+}
+
+impl<'a> Drop for Timer<'a> {
+    fn drop(&mut self) {
+        let dif = self.start.elapsed().as_micros();
+        debug!("{}", format!("{} in {:.3}ms", self.msg, dif as f64/1000.));
+    }
+}
\ No newline at end of file