working, now onto speeding it up

This commit is contained in:
oliver 2024-11-10 20:24:04 -07:00
parent 5404d5c3e8
commit a9628ee5e4
4 changed files with 33 additions and 11 deletions

3
.gitignore vendored
View File

@ -1 +1,4 @@
/target /target
perf.data
flamegraph.svg
perf.data.old

2
.vscode/launch.json vendored
View File

@ -9,7 +9,7 @@
"request": "launch", "request": "launch",
"name": "Debug executable 'surreal_spider'", "name": "Debug executable 'surreal_spider'",
"env": { "env": {
"RUST_LOG": "surreal_spider=trace,reqwest=trace", "RUST_LOG": "surreal_spider=debug,reqwest=info",
}, },
"cargo": { "cargo": {
"args": [ "args": [

View File

@ -8,6 +8,8 @@ use surrealdb::{
use tracing::{debug, error, instrument, trace, warn}; use tracing::{debug, error, instrument, trace, warn};
use url::Url; use url::Url;
use crate::Timer;
#[derive(Debug, Serialize, Deserialize, Clone)] #[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Website { pub struct Website {
/// The url that this data is found at /// The url that this data is found at
@ -57,6 +59,10 @@ impl Website {
let from = self.site.to_string(); let from = self.site.to_string();
// let to = other.site.to_string(); // let to = other.site.to_string();
trace!("Linking {from} to {} other pages.", other.len()); trace!("Linking {from} to {} other pages.", other.len());
let msg = format!("Linked {len} pages");
let timer = Timer::start(&msg);
// prevent the timer from being dropped instantly.
let _ = timer;
match db match db
.query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)") .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
.bind(("in", from)) .bind(("in", from))
@ -71,7 +77,7 @@ impl Website {
let _: Vec<usize> = vec; let _: Vec<usize> = vec;
if let Some(num) = vec.get(0) { if let Some(num) = vec.get(0) {
if *num == len { if *num == len {
debug!("Link OK"); trace!("Link OK");
return; return;
} else { } else {
warn!("Didn't link all the records. {num}/{len}"); warn!("Didn't link all the records. {num}/{len}");

View File

@ -51,10 +51,7 @@ async fn main() {
for mut site in uncrawled { for mut site in uncrawled {
if let Some(dom) = get(&mut site, &db).await { if let Some(dom) = get(&mut site, &db).await {
trace!("Pre-walk checkpoint");
crawl_wrapper(&dom, &db, &site, &mut crawled).await; crawl_wrapper(&dom, &db, &site, &mut crawled).await;
let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32); let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
info!("Crawled {crawled} out of {budget} pages. ({percent})"); info!("Crawled {crawled} out of {budget} pages. ({percent})");
} else { } else {
@ -69,10 +66,9 @@ async fn main() {
async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, count: &mut usize) { async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, count: &mut usize) {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
let now = Instant::now(); let timer= Timer::start("Walked");
walk(&dom, &db, &site, &mut buffer).await; walk(&dom, &db, &site, &mut buffer).await;
let dif = now.elapsed().as_micros(); drop(timer);
trace!("{}", format!("Walked in {:.3}ms", dif as f64/1000.));
site.links_to(buffer, &db).await; site.links_to(buffer, &db).await;
*count += 1; *count += 1;
} }
@ -81,10 +77,9 @@ async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, cou
/// A quick helper function for downloading a url /// A quick helper function for downloading a url
async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> { async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
trace!("Get: {}", site.to_string()); trace!("Get: {}", site.to_string());
let now = Instant::now(); let timer = Timer::start("Got page");
if let Ok(response) = reqwest::get(site.to_string()).await { if let Ok(response) = reqwest::get(site.to_string()).await {
let dif = now.elapsed().as_micros(); drop(timer);
trace!("{}", format!("Got page in {:.3}ms", dif as f64/1000.));
let data = response.text().await.unwrap(); let data = response.text().await.unwrap();
let opts = ParseOpts { let opts = ParseOpts {
@ -165,3 +160,21 @@ async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Webs
response.take(0).expect("Returned websites couldn't be parsed") response.take(0).expect("Returned websites couldn't be parsed")
} }
/// A scope timer: records when it was created and, when dropped, logs how
/// long it lived together with a caller-supplied label.
///
/// The measurement ends when the value is dropped, so the timer must be bound
/// to a named variable (or dropped explicitly with `drop(timer)`) for the
/// reported duration to be meaningful. `#[must_use]` makes the compiler warn
/// when the value returned by a constructor is discarded on the spot.
#[must_use = "a Timer reports on drop; discarding it immediately measures nothing"]
#[derive(Debug)]
pub struct Timer<'a> {
    /// The instant the timer was started.
    start: Instant,
    /// Label placed at the front of the log line emitted on drop.
    msg: &'a str,
}
impl<'a> Timer<'a> {
#[inline]
pub fn start(msg: &'a str) -> Self {
Self { start: Instant::now(), msg }
}
}
impl Drop for Timer<'_> {
    /// Emits a `debug!` event of the form `"<msg> in <elapsed>ms"` when the
    /// timer goes out of scope or is dropped explicitly.
    fn drop(&mut self) {
        // Elapsed time is taken in whole microseconds, then shown as
        // milliseconds with three decimal places.
        let dif = self.start.elapsed().as_micros();
        // Hand the format arguments straight to `debug!` rather than
        // building an intermediate `String` with `format!` first.
        debug!("{} in {:.3}ms", self.msg, dif as f64 / 1000.);
    }
}