working, now onto speeding it up
This commit is contained in:
parent
5404d5c3e8
commit
a9628ee5e4
3
.gitignore
vendored
3
.gitignore
vendored
@ -1 +1,4 @@
|
|||||||
/target
|
/target
|
||||||
|
perf.data
|
||||||
|
flamegraph.svg
|
||||||
|
perf.data.old
|
2
.vscode/launch.json
vendored
2
.vscode/launch.json
vendored
@ -9,7 +9,7 @@
|
|||||||
"request": "launch",
|
"request": "launch",
|
||||||
"name": "Debug executable 'surreal_spider'",
|
"name": "Debug executable 'surreal_spider'",
|
||||||
"env": {
|
"env": {
|
||||||
"RUST_LOG": "surreal_spider=trace,reqwest=trace",
|
"RUST_LOG": "surreal_spider=debug,reqwest=info",
|
||||||
},
|
},
|
||||||
"cargo": {
|
"cargo": {
|
||||||
"args": [
|
"args": [
|
||||||
|
@ -8,6 +8,8 @@ use surrealdb::{
|
|||||||
use tracing::{debug, error, instrument, trace, warn};
|
use tracing::{debug, error, instrument, trace, warn};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
|
use crate::Timer;
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
pub struct Website {
|
pub struct Website {
|
||||||
/// The url that this data is found at
|
/// The url that this data is found at
|
||||||
@ -57,6 +59,10 @@ impl Website {
|
|||||||
let from = self.site.to_string();
|
let from = self.site.to_string();
|
||||||
// let to = other.site.to_string();
|
// let to = other.site.to_string();
|
||||||
trace!("Linking {from} to {} other pages.", other.len());
|
trace!("Linking {from} to {} other pages.", other.len());
|
||||||
|
let msg = format!("Linked {len} pages");
|
||||||
|
let timer = Timer::start(&msg);
|
||||||
|
// `timer` stays alive until the end of this scope; the `let _` below only marks it as used.
|
||||||
|
let _ = timer;
|
||||||
match db
|
match db
|
||||||
.query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
|
.query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
|
||||||
.bind(("in", from))
|
.bind(("in", from))
|
||||||
@ -71,7 +77,7 @@ impl Website {
|
|||||||
let _: Vec<usize> = vec;
|
let _: Vec<usize> = vec;
|
||||||
if let Some(num) = vec.get(0) {
|
if let Some(num) = vec.get(0) {
|
||||||
if *num == len {
|
if *num == len {
|
||||||
debug!("Link OK");
|
trace!("Link OK");
|
||||||
return;
|
return;
|
||||||
} else {
|
} else {
|
||||||
warn!("Didn't link all the records. {num}/{len}");
|
warn!("Didn't link all the records. {num}/{len}");
|
||||||
|
31
src/main.rs
31
src/main.rs
@ -51,10 +51,7 @@ async fn main() {
|
|||||||
|
|
||||||
for mut site in uncrawled {
|
for mut site in uncrawled {
|
||||||
if let Some(dom) = get(&mut site, &db).await {
|
if let Some(dom) = get(&mut site, &db).await {
|
||||||
trace!("Pre-walk checkpoint");
|
|
||||||
|
|
||||||
crawl_wrapper(&dom, &db, &site, &mut crawled).await;
|
crawl_wrapper(&dom, &db, &site, &mut crawled).await;
|
||||||
|
|
||||||
let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
|
let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
|
||||||
info!("Crawled {crawled} out of {budget} pages. ({percent})");
|
info!("Crawled {crawled} out of {budget} pages. ({percent})");
|
||||||
} else {
|
} else {
|
||||||
@ -69,10 +66,9 @@ async fn main() {
|
|||||||
|
|
||||||
async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, count: &mut usize) {
|
async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, count: &mut usize) {
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
let now = Instant::now();
|
let timer= Timer::start("Walked");
|
||||||
walk(&dom, &db, &site, &mut buffer).await;
|
walk(&dom, &db, &site, &mut buffer).await;
|
||||||
let dif = now.elapsed().as_micros();
|
drop(timer);
|
||||||
trace!("{}", format!("Walked in {:.3}ms", dif as f64/1000.));
|
|
||||||
site.links_to(buffer, &db).await;
|
site.links_to(buffer, &db).await;
|
||||||
*count += 1;
|
*count += 1;
|
||||||
}
|
}
|
||||||
@ -81,10 +77,9 @@ async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, cou
|
|||||||
/// A quick helper function for downloading a url
|
/// A quick helper function for downloading a url
|
||||||
async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
|
async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
|
||||||
trace!("Get: {}", site.to_string());
|
trace!("Get: {}", site.to_string());
|
||||||
let now = Instant::now();
|
let timer = Timer::start("Got page");
|
||||||
if let Ok(response) = reqwest::get(site.to_string()).await {
|
if let Ok(response) = reqwest::get(site.to_string()).await {
|
||||||
let dif = now.elapsed().as_micros();
|
drop(timer);
|
||||||
trace!("{}", format!("Got page in {:.3}ms", dif as f64/1000.));
|
|
||||||
|
|
||||||
let data = response.text().await.unwrap();
|
let data = response.text().await.unwrap();
|
||||||
let opts = ParseOpts {
|
let opts = ParseOpts {
|
||||||
@ -165,3 +160,21 @@ async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Webs
|
|||||||
response.take(0).expect("Returned websites couldn't be parsed")
|
response.take(0).expect("Returned websites couldn't be parsed")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A scope guard that measures how long it has been alive.
///
/// Create one with [`Timer::start`]. When the value is dropped, the time
/// elapsed since creation is logged together with `msg`.
#[must_use = "a Timer reports when it is dropped; bind it to a named variable to time a scope"]
#[derive(Debug)]
pub struct Timer<'a> {
    /// The moment the timer was created.
    start: Instant,
    /// Label placed in front of the elapsed time in the log line.
    msg: &'a str,
}
|
||||||
|
|
||||||
|
impl<'a> Timer<'a> {
|
||||||
|
#[inline]
|
||||||
|
pub fn start(msg: &'a str) -> Self {
|
||||||
|
Self { start: Instant::now(), msg }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Drop for Timer<'a> {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
let dif = self.start.elapsed().as_micros();
|
||||||
|
debug!("{}", format!("{} in {:.3}ms", self.msg, dif as f64/1000.));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user