custom_engine #1

Merged
Oliver merged 16 commits from custom_engine into main 2024-11-13 00:53:31 +00:00
4 changed files with 33 additions and 11 deletions
Showing only changes of commit a9628ee5e4 - Show all commits

3
.gitignore vendored
View File

@ -1 +1,4 @@
/target
perf.data
flamegraph.svg
perf.data.old

2
.vscode/launch.json vendored
View File

@ -9,7 +9,7 @@
"request": "launch",
"name": "Debug executable 'surreal_spider'",
"env": {
"RUST_LOG": "surreal_spider=trace,reqwest=trace",
"RUST_LOG": "surreal_spider=debug,reqwest=info",
},
"cargo": {
"args": [

View File

@ -8,6 +8,8 @@ use surrealdb::{
use tracing::{debug, error, instrument, trace, warn};
use url::Url;
use crate::Timer;
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Website {
/// The url that this data is found at
@ -57,6 +59,10 @@ impl Website {
let from = self.site.to_string();
// let to = other.site.to_string();
trace!("Linking {from} to {} other pages.", other.len());
let msg = format!("Linked {len} pages");
let timer = Timer::start(&msg);
// NOTE(review): `let _ = timer;` does the opposite of this comment — the
// wildcard pattern drops the value immediately, so the timer logs ~0ms here.
// To keep it alive until end of scope, name the binding (e.g. `let _timer = timer;`)
// or simply keep using the `timer` binding from the line above.
let _ = timer;
match db
.query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
.bind(("in", from))
@ -71,7 +77,7 @@ impl Website {
let _: Vec<usize> = vec;
if let Some(num) = vec.get(0) {
if *num == len {
debug!("Link OK");
trace!("Link OK");
return;
} else {
warn!("Didn't link all the records. {num}/{len}");

View File

@ -51,10 +51,7 @@ async fn main() {
for mut site in uncrawled {
if let Some(dom) = get(&mut site, &db).await {
trace!("Pre-walk checkpoint");
crawl_wrapper(&dom, &db, &site, &mut crawled).await;
let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
info!("Crawled {crawled} out of {budget} pages. ({percent})");
} else {
@ -69,10 +66,9 @@ async fn main() {
async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, count: &mut usize) {
let mut buffer = Vec::new();
let now = Instant::now();
let timer= Timer::start("Walked");
walk(&dom, &db, &site, &mut buffer).await;
let dif = now.elapsed().as_micros();
trace!("{}", format!("Walked in {:.3}ms", dif as f64/1000.));
drop(timer);
site.links_to(buffer, &db).await;
*count += 1;
}
@ -81,10 +77,9 @@ async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, cou
/// A quick helper function for downloading a url
async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
trace!("Get: {}", site.to_string());
let now = Instant::now();
let timer = Timer::start("Got page");
if let Ok(response) = reqwest::get(site.to_string()).await {
let dif = now.elapsed().as_micros();
trace!("{}", format!("Got page in {:.3}ms", dif as f64/1000.));
drop(timer);
let data = response.text().await.unwrap();
let opts = ParseOpts {
@ -165,3 +160,21 @@ async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Webs
response.take(0).expect("Returned websites couldn't be parsed")
}
/// A lightweight RAII scope timer: records its creation time and logs the
/// elapsed duration at `debug` level when dropped.
///
/// Bind it to a named variable for the scope you want to measure.
/// `let _ = Timer::start(..)` does NOT keep it alive — the wildcard pattern
/// drops the value immediately, logging ~0ms.
#[must_use = "a Timer logs on drop; `let _ = ...` drops it immediately"]
pub struct Timer<'a> {
    // Moment the timer was started.
    start: Instant,
    // Label emitted on drop, e.g. "Walked" -> "Walked in 1.234ms".
    msg: &'a str,
}
impl<'a> Timer<'a> {
    /// Start timing now; `msg` is the label emitted when the timer drops.
    #[inline]
    pub fn start(msg: &'a str) -> Self {
        Self { start: Instant::now(), msg }
    }
}
impl<'a> Drop for Timer<'a> {
    fn drop(&mut self) {
        // Integer microseconds, displayed as fractional milliseconds.
        // Pass format args straight to the macro — the original built an
        // intermediate String via format! for no benefit.
        let dif = self.start.elapsed().as_micros();
        debug!("{} in {:.3}ms", self.msg, dif as f64 / 1000.);
    }
}