working, now onto speeding it up
This commit is contained in:
parent
5404d5c3e8
commit
a9628ee5e4
3
.gitignore
vendored
3
.gitignore
vendored
@ -1 +1,4 @@
|
||||
/target
|
||||
perf.data
|
||||
flamegraph.svg
|
||||
perf.data.old
|
2
.vscode/launch.json
vendored
2
.vscode/launch.json
vendored
@ -9,7 +9,7 @@
|
||||
"request": "launch",
|
||||
"name": "Debug executable 'surreal_spider'",
|
||||
"env": {
|
||||
"RUST_LOG": "surreal_spider=trace,reqwest=trace",
|
||||
"RUST_LOG": "surreal_spider=debug,reqwest=info",
|
||||
},
|
||||
"cargo": {
|
||||
"args": [
|
||||
|
@ -8,6 +8,8 @@ use surrealdb::{
|
||||
use tracing::{debug, error, instrument, trace, warn};
|
||||
use url::Url;
|
||||
|
||||
use crate::Timer;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct Website {
|
||||
/// The url that this data is found at
|
||||
@ -57,6 +59,10 @@ impl Website {
|
||||
let from = self.site.to_string();
|
||||
// let to = other.site.to_string();
|
||||
trace!("Linking {from} to {} other pages.", other.len());
|
||||
let msg = format!("Linked {len} pages");
|
||||
let timer = Timer::start(&msg);
|
||||
// `timer` lives until the end of this scope because it is a named binding; `let _ = timer;` moves nothing and only marks it as used.
|
||||
let _ = timer;
|
||||
match db
|
||||
.query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
|
||||
.bind(("in", from))
|
||||
@ -71,7 +77,7 @@ impl Website {
|
||||
let _: Vec<usize> = vec;
|
||||
if let Some(num) = vec.get(0) {
|
||||
if *num == len {
|
||||
debug!("Link OK");
|
||||
trace!("Link OK");
|
||||
return;
|
||||
} else {
|
||||
warn!("Didn't link all the records. {num}/{len}");
|
||||
|
31
src/main.rs
31
src/main.rs
@ -51,10 +51,7 @@ async fn main() {
|
||||
|
||||
for mut site in uncrawled {
|
||||
if let Some(dom) = get(&mut site, &db).await {
|
||||
trace!("Pre-walk checkpoint");
|
||||
|
||||
crawl_wrapper(&dom, &db, &site, &mut crawled).await;
|
||||
|
||||
let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
|
||||
info!("Crawled {crawled} out of {budget} pages. ({percent})");
|
||||
} else {
|
||||
@ -69,10 +66,9 @@ async fn main() {
|
||||
|
||||
async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, count: &mut usize) {
|
||||
let mut buffer = Vec::new();
|
||||
let now = Instant::now();
|
||||
let timer= Timer::start("Walked");
|
||||
walk(&dom, &db, &site, &mut buffer).await;
|
||||
let dif = now.elapsed().as_micros();
|
||||
trace!("{}", format!("Walked in {:.3}ms", dif as f64/1000.));
|
||||
drop(timer);
|
||||
site.links_to(buffer, &db).await;
|
||||
*count += 1;
|
||||
}
|
||||
@ -81,10 +77,9 @@ async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, cou
|
||||
/// A quick helper function for downloading a url
|
||||
async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
|
||||
trace!("Get: {}", site.to_string());
|
||||
let now = Instant::now();
|
||||
let timer = Timer::start("Got page");
|
||||
if let Ok(response) = reqwest::get(site.to_string()).await {
|
||||
let dif = now.elapsed().as_micros();
|
||||
trace!("{}", format!("Got page in {:.3}ms", dif as f64/1000.));
|
||||
drop(timer);
|
||||
|
||||
let data = response.text().await.unwrap();
|
||||
let opts = ParseOpts {
|
||||
@ -165,3 +160,21 @@ async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Webs
|
||||
response.take(0).expect("Returned websites couldn't be parsed")
|
||||
}
|
||||
|
||||
/// A scope-bound stopwatch that reports how long it was alive.
///
/// Create one with `Timer::start`. When the value is dropped, its `Drop` impl
/// logs `"<msg> in <elapsed>ms"` at debug level; call `drop(timer)` to end the
/// measurement before the end of the scope.
///
/// Marked `#[must_use]` because a timer that is not bound to a variable is
/// dropped right away and would report a meaningless near-zero duration.
#[must_use = "a Timer reports when dropped; an unbound Timer is dropped immediately"]
pub struct Timer<'a> {
    /// Instant at which the measurement began.
    start: Instant,
    /// Label placed at the front of the log line.
    msg: &'a str,
}
|
||||
|
||||
impl<'a> Timer<'a> {
|
||||
#[inline]
|
||||
pub fn start(msg: &'a str) -> Self {
|
||||
Self { start: Instant::now(), msg }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Drop for Timer<'a> {
|
||||
fn drop(&mut self) {
|
||||
let dif = self.start.elapsed().as_micros();
|
||||
debug!("{}", format!("{} in {:.3}ms", self.msg, dif as f64/1000.));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user