working, now onto speeding it up
This commit is contained in:
		
							
								
								
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@@ -1 +1,4 @@
 | 
				
			|||||||
/target
 | 
					/target
 | 
				
			||||||
 | 
					perf.data
 | 
				
			||||||
 | 
					flamegraph.svg
 | 
				
			||||||
 | 
					perf.data.old
 | 
				
			||||||
							
								
								
									
										2
									
								
								.vscode/launch.json
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.vscode/launch.json
									
									
									
									
										vendored
									
									
								
							@@ -9,7 +9,7 @@
 | 
				
			|||||||
            "request": "launch",
 | 
					            "request": "launch",
 | 
				
			||||||
            "name": "Debug executable 'surreal_spider'",
 | 
					            "name": "Debug executable 'surreal_spider'",
 | 
				
			||||||
            "env": {
 | 
					            "env": {
 | 
				
			||||||
                "RUST_LOG": "surreal_spider=trace,reqwest=trace",
 | 
					                "RUST_LOG": "surreal_spider=debug,reqwest=info",
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            "cargo": {
 | 
					            "cargo": {
 | 
				
			||||||
                "args": [
 | 
					                "args": [
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -8,6 +8,8 @@ use surrealdb::{
 | 
				
			|||||||
use tracing::{debug, error, instrument, trace, warn};
 | 
					use tracing::{debug, error, instrument, trace, warn};
 | 
				
			||||||
use url::Url;
 | 
					use url::Url;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					use crate::Timer;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#[derive(Debug, Serialize, Deserialize, Clone)]
 | 
					#[derive(Debug, Serialize, Deserialize, Clone)]
 | 
				
			||||||
pub struct Website {
 | 
					pub struct Website {
 | 
				
			||||||
    /// The url that this data is found at
 | 
					    /// The url that this data is found at
 | 
				
			||||||
@@ -57,6 +59,10 @@ impl Website {
 | 
				
			|||||||
        let from = self.site.to_string();
 | 
					        let from = self.site.to_string();
 | 
				
			||||||
        // let to = other.site.to_string();
 | 
					        // let to = other.site.to_string();
 | 
				
			||||||
        trace!("Linking {from} to {} other pages.", other.len());
 | 
					        trace!("Linking {from} to {} other pages.", other.len());
 | 
				
			||||||
 | 
					        let msg = format!("Linked {len} pages");
 | 
				
			||||||
 | 
					        let timer = Timer::start(&msg);
 | 
				
			||||||
 | 
					        // prevent the timer from being dropped instantly.
 | 
				
			||||||
 | 
					        let _ = timer;
 | 
				
			||||||
        match db
 | 
					        match db
 | 
				
			||||||
            .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
 | 
					            .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
 | 
				
			||||||
            .bind(("in", from))
 | 
					            .bind(("in", from))
 | 
				
			||||||
@@ -71,7 +77,7 @@ impl Website {
 | 
				
			|||||||
                    let _: Vec<usize> = vec;
 | 
					                    let _: Vec<usize> = vec;
 | 
				
			||||||
                    if let Some(num) = vec.get(0) {
 | 
					                    if let Some(num) = vec.get(0) {
 | 
				
			||||||
                        if *num == len {
 | 
					                        if *num == len {
 | 
				
			||||||
                            debug!("Link OK");
 | 
					                            trace!("Link OK");
 | 
				
			||||||
                            return;
 | 
					                            return;
 | 
				
			||||||
                        } else {
 | 
					                        } else {
 | 
				
			||||||
                            warn!("Didn't link all the records. {num}/{len}");
 | 
					                            warn!("Didn't link all the records. {num}/{len}");
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										31
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										31
									
								
								src/main.rs
									
									
									
									
									
								
							@@ -51,10 +51,7 @@ async fn main() {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        for mut site in uncrawled {
 | 
					        for mut site in uncrawled {
 | 
				
			||||||
            if let Some(dom) = get(&mut site, &db).await {
 | 
					            if let Some(dom) = get(&mut site, &db).await {
 | 
				
			||||||
                trace!("Pre-walk checkpoint");
 | 
					 | 
				
			||||||
                
 | 
					 | 
				
			||||||
                crawl_wrapper(&dom, &db, &site, &mut crawled).await;
 | 
					                crawl_wrapper(&dom, &db, &site, &mut crawled).await;
 | 
				
			||||||
 | 
					 | 
				
			||||||
                let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
 | 
					                let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
 | 
				
			||||||
                info!("Crawled {crawled} out of {budget} pages. ({percent})");
 | 
					                info!("Crawled {crawled} out of {budget} pages. ({percent})");
 | 
				
			||||||
            } else {
 | 
					            } else {
 | 
				
			||||||
@@ -69,10 +66,9 @@ async fn main() {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, count: &mut usize) {
 | 
					async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, count: &mut usize) {
 | 
				
			||||||
    let mut buffer = Vec::new();
 | 
					    let mut buffer = Vec::new();
 | 
				
			||||||
    let now = Instant::now();
 | 
					    let timer= Timer::start("Walked");
 | 
				
			||||||
    walk(&dom, &db, &site, &mut buffer).await;
 | 
					    walk(&dom, &db, &site, &mut buffer).await;
 | 
				
			||||||
    let dif = now.elapsed().as_micros();
 | 
					    drop(timer);
 | 
				
			||||||
    trace!("{}", format!("Walked in {:.3}ms", dif as f64/1000.));
 | 
					 | 
				
			||||||
    site.links_to(buffer, &db).await;
 | 
					    site.links_to(buffer, &db).await;
 | 
				
			||||||
    *count += 1;
 | 
					    *count += 1;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -81,10 +77,9 @@ async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, cou
 | 
				
			|||||||
/// A quick helper function for downloading a url
 | 
					/// A quick helper function for downloading a url
 | 
				
			||||||
async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
 | 
					async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
 | 
				
			||||||
    trace!("Get: {}", site.to_string());
 | 
					    trace!("Get: {}", site.to_string());
 | 
				
			||||||
    let now = Instant::now();
 | 
					    let timer = Timer::start("Got page");
 | 
				
			||||||
    if let Ok(response) = reqwest::get(site.to_string()).await {
 | 
					    if let Ok(response) = reqwest::get(site.to_string()).await {
 | 
				
			||||||
        let dif = now.elapsed().as_micros();
 | 
					        drop(timer);
 | 
				
			||||||
        trace!("{}", format!("Got page in {:.3}ms", dif as f64/1000.));
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        let data = response.text().await.unwrap();
 | 
					        let data = response.text().await.unwrap();
 | 
				
			||||||
        let opts = ParseOpts {
 | 
					        let opts = ParseOpts {
 | 
				
			||||||
@@ -165,3 +160,21 @@ async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Webs
 | 
				
			|||||||
    response.take(0).expect("Returned websites couldn't be parsed")
 | 
					    response.take(0).expect("Returned websites couldn't be parsed")
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					pub struct Timer<'a> {
 | 
				
			||||||
 | 
					    start: Instant,
 | 
				
			||||||
 | 
					    msg: &'a str,
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					impl<'a> Timer<'a> {
 | 
				
			||||||
 | 
					    #[inline]
 | 
				
			||||||
 | 
					    pub fn start(msg: &'a str) -> Self {
 | 
				
			||||||
 | 
					        Self { start: Instant::now(), msg }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					impl<'a> Drop for Timer<'a> {
 | 
				
			||||||
 | 
					    fn drop(&mut self) {
 | 
				
			||||||
 | 
					        let dif = self.start.elapsed().as_micros();
 | 
				
			||||||
 | 
					        debug!("{}", format!("{} in {:.3}ms", self.msg, dif as f64/1000.));
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
		Reference in New Issue
	
	Block a user