it works :party:
This commit is contained in:
		
							
								
								
									
										49
									
								
								src/db.rs
									
									
									
									
									
								
							
							
						
						
									
										49
									
								
								src/db.rs
									
									
									
									
									
								
							@@ -5,7 +5,7 @@ use surrealdb::{
 | 
				
			|||||||
    sql::Thing,
 | 
					    sql::Thing,
 | 
				
			||||||
    Response, Surreal,
 | 
					    Response, Surreal,
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
use tracing::{debug, error, info, instrument, trace};
 | 
					use tracing::{debug, error, instrument, trace, warn};
 | 
				
			||||||
use url::Url;
 | 
					use url::Url;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#[derive(Debug, Serialize, Deserialize, Clone)]
 | 
					#[derive(Debug, Serialize, Deserialize, Clone)]
 | 
				
			||||||
@@ -50,19 +50,36 @@ impl Website {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    #[instrument(skip_all)]
 | 
					    #[instrument(skip_all)]
 | 
				
			||||||
    pub async fn links_to(&self, other: &Self, db: &Surreal<Client>) {
 | 
					    pub async fn links_to(&self, other: Vec<Thing>, db: &Surreal<Client>) {
 | 
				
			||||||
        let from = self.site.to_string();
 | 
					        let len = other.len();
 | 
				
			||||||
        let to = other.site.to_string();
 | 
					        if len == 0 {return}
 | 
				
			||||||
        trace!("Linking {from} to {to}");
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        let from = self.site.to_string();
 | 
				
			||||||
 | 
					        // let to = other.site.to_string();
 | 
				
			||||||
 | 
					        trace!("Linking {from} to {} other pages.", other.len());
 | 
				
			||||||
        match db
 | 
					        match db
 | 
				
			||||||
            .query("RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> (SELECT id FROM website WHERE site = $out)")
 | 
					            .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
 | 
				
			||||||
            .bind(("in", from))
 | 
					            .bind(("in", from))
 | 
				
			||||||
            .bind(("out", to))
 | 
					            .bind(("out", other))
 | 
				
			||||||
            .await
 | 
					            .await
 | 
				
			||||||
        {
 | 
					        {
 | 
				
			||||||
            Ok(e) => {
 | 
					            Ok(mut e) => {
 | 
				
			||||||
 | 
					                // The relate could technically "fail" (not relate anything), this just means that
 | 
				
			||||||
 | 
					                // the query was ok.
 | 
				
			||||||
                let _: Response = e;
 | 
					                let _: Response = e;
 | 
				
			||||||
 | 
					                if let Ok(vec) = e.take(0) {
 | 
				
			||||||
 | 
					                    let _: Vec<usize> = vec;
 | 
				
			||||||
 | 
					                    if let Some(num) = vec.get(0) {
 | 
				
			||||||
 | 
					                        if *num == len {
 | 
				
			||||||
 | 
					                            debug!("Link OK");
 | 
				
			||||||
 | 
					                            return;
 | 
				
			||||||
 | 
					                        } else {
 | 
				
			||||||
 | 
					                            warn!("Didn't link all the records. {num}/{len}");
 | 
				
			||||||
 | 
					                            return;
 | 
				
			||||||
 | 
					                        }
 | 
				
			||||||
 | 
					                    }
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					                warn!("Linking request succeeded but couldn't verify the results.");
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            Err(e) => {
 | 
					            Err(e) => {
 | 
				
			||||||
                error!("{}", e.to_string());
 | 
					                error!("{}", e.to_string());
 | 
				
			||||||
@@ -71,7 +88,7 @@ impl Website {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    #[instrument(skip_all)]
 | 
					    #[instrument(skip_all)]
 | 
				
			||||||
    pub async fn store(&mut self, db: &Surreal<Client>) {
 | 
					    pub async fn store(&mut self, db: &Surreal<Client>) -> Option<Thing> {
 | 
				
			||||||
        // check if it's been gone thru before
 | 
					        // check if it's been gone thru before
 | 
				
			||||||
        let mut response = db
 | 
					        let mut response = db
 | 
				
			||||||
            .query("SELECT * FROM ONLY website WHERE site = $site LIMIT 1")
 | 
					            .query("SELECT * FROM ONLY website WHERE site = $site LIMIT 1")
 | 
				
			||||||
@@ -87,8 +104,9 @@ impl Website {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
                match db.upsert((id.tb, id.id.to_string())).content(new).await {
 | 
					                match db.upsert((id.tb, id.id.to_string())).content(new).await {
 | 
				
			||||||
                    Ok(e) => {
 | 
					                    Ok(e) => {
 | 
				
			||||||
                        if let Some(a) = &e {
 | 
					                        if let Some(a) = e {
 | 
				
			||||||
                            let _: &Record = a;
 | 
					                            let _: Record = a;
 | 
				
			||||||
 | 
					                            return Some(a.id);
 | 
				
			||||||
                        }
 | 
					                        }
 | 
				
			||||||
                    }
 | 
					                    }
 | 
				
			||||||
                    Err(e) => {
 | 
					                    Err(e) => {
 | 
				
			||||||
@@ -101,13 +119,15 @@ impl Website {
 | 
				
			|||||||
            match db.create("website").content(self.clone()).await {
 | 
					            match db.create("website").content(self.clone()).await {
 | 
				
			||||||
                Ok(e) => {
 | 
					                Ok(e) => {
 | 
				
			||||||
                    let _: Option<Record> = e;
 | 
					                    let _: Option<Record> = e;
 | 
				
			||||||
                    if let Some(a) = &e {
 | 
					                    if let Some(a) = e {
 | 
				
			||||||
                        let _: &Record = a;
 | 
					                        let _: Record = a;
 | 
				
			||||||
 | 
					                        return Some(a.id);
 | 
				
			||||||
                    }
 | 
					                    }
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
                Err(a) => error!("{:?}", a),
 | 
					                Err(a) => error!("{:?}", a),
 | 
				
			||||||
            };
 | 
					            };
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					        None
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -120,6 +140,7 @@ impl ToString for Website {
 | 
				
			|||||||
#[derive(Debug, Serialize)]
 | 
					#[derive(Debug, Serialize)]
 | 
				
			||||||
pub struct Email {
 | 
					pub struct Email {
 | 
				
			||||||
    pub email: String,
 | 
					    pub email: String,
 | 
				
			||||||
 | 
					    pub on: String,
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#[derive(Debug, Deserialize)]
 | 
					#[derive(Debug, Deserialize)]
 | 
				
			||||||
@@ -140,7 +161,7 @@ pub async fn connect() -> surrealdb::Result<Surreal<Client>> {
 | 
				
			|||||||
    .await?;
 | 
					    .await?;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Select a specific namespace / database
 | 
					    // Select a specific namespace / database
 | 
				
			||||||
    db.use_ns("test").use_db("time").await?;
 | 
					    db.use_ns("test").use_db("v1.2").await?;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Ok(db)
 | 
					    Ok(db)
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										47
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										47
									
								
								src/main.rs
									
									
									
									
									
								
							@@ -1,11 +1,11 @@
 | 
				
			|||||||
extern crate markup5ever_rcdom as rcdom;
 | 
					extern crate markup5ever_rcdom as rcdom;
 | 
				
			||||||
extern crate html5ever;
 | 
					extern crate html5ever;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
use std::rc::Rc;
 | 
					use std::{rc::Rc, time::Instant};
 | 
				
			||||||
use db::{connect, Website};
 | 
					use db::{connect, Website};
 | 
				
			||||||
use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
 | 
					use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
 | 
				
			||||||
use rcdom::{Node, RcDom};
 | 
					use rcdom::{Node, RcDom};
 | 
				
			||||||
use surrealdb::{engine::remote::ws::Client, Surreal};
 | 
					use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};
 | 
				
			||||||
use tracing::{debug, info, instrument, trace, trace_span, warn};
 | 
					use tracing::{debug, info, instrument, trace, trace_span, warn};
 | 
				
			||||||
use tracing_subscriber::EnvFilter;
 | 
					use tracing_subscriber::EnvFilter;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -33,15 +33,17 @@ async fn main() {
 | 
				
			|||||||
    let pre_loop_span = span.enter();
 | 
					    let pre_loop_span = span.enter();
 | 
				
			||||||
    let mut site = Website::new(&url, false);
 | 
					    let mut site = Website::new(&url, false);
 | 
				
			||||||
    let dom = get(&mut site, &db).await.expect("Inital page returned None.");
 | 
					    let dom = get(&mut site, &db).await.expect("Inital page returned None.");
 | 
				
			||||||
    crawled += 1;
 | 
					    crawl_wrapper(&dom, &db, &site, &mut crawled).await;
 | 
				
			||||||
    walk(&dom, &db, &site).await;
 | 
					 | 
				
			||||||
    drop(pre_loop_span);
 | 
					    drop(pre_loop_span);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    let span = trace_span!("Loop");
 | 
					    let span = trace_span!("Loop");
 | 
				
			||||||
    let span = span.enter();
 | 
					    let span = span.enter();
 | 
				
			||||||
    // Can go upto 49 above budget because the reterival function gets 50 no matter what
 | 
					 | 
				
			||||||
    while crawled < budget {
 | 
					    while crawled < budget {
 | 
				
			||||||
        let uncrawled = get_uncrawled_links(&db, 100).await;
 | 
					        let get_num = if budget - crawled < 100 {
 | 
				
			||||||
 | 
					            budget - crawled
 | 
				
			||||||
 | 
					        } else {100};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        let uncrawled = get_uncrawled_links(&db, get_num).await;
 | 
				
			||||||
        debug!("Crawling {} pages...", uncrawled.len());
 | 
					        debug!("Crawling {} pages...", uncrawled.len());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        let span = trace_span!("Crawling");
 | 
					        let span = trace_span!("Crawling");
 | 
				
			||||||
@@ -49,9 +51,10 @@ async fn main() {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        for mut site in uncrawled {
 | 
					        for mut site in uncrawled {
 | 
				
			||||||
            if let Some(dom) = get(&mut site, &db).await {
 | 
					            if let Some(dom) = get(&mut site, &db).await {
 | 
				
			||||||
                trace!("pre-walk checkpoint");
 | 
					                trace!("Pre-walk checkpoint");
 | 
				
			||||||
                walk(&dom, &db, &site).await;
 | 
					                
 | 
				
			||||||
                crawled += 1;
 | 
					                crawl_wrapper(&dom, &db, &site, &mut crawled).await;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
 | 
					                let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
 | 
				
			||||||
                info!("Crawled {crawled} out of {budget} pages. ({percent})");
 | 
					                info!("Crawled {crawled} out of {budget} pages. ({percent})");
 | 
				
			||||||
            } else {
 | 
					            } else {
 | 
				
			||||||
@@ -64,11 +67,25 @@ async fn main() {
 | 
				
			|||||||
    info!("Done");
 | 
					    info!("Done");
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, count: &mut usize) {
 | 
				
			||||||
 | 
					    let mut buffer = Vec::new();
 | 
				
			||||||
 | 
					    let now = Instant::now();
 | 
				
			||||||
 | 
					    walk(&dom, &db, &site, &mut buffer).await;
 | 
				
			||||||
 | 
					    let dif = now.elapsed().as_micros();
 | 
				
			||||||
 | 
					    trace!("{}", format!("Walked in {:.3}ms", dif as f64/1000.));
 | 
				
			||||||
 | 
					    site.links_to(buffer, &db).await;
 | 
				
			||||||
 | 
					    *count += 1;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#[instrument(skip_all)]
 | 
					#[instrument(skip_all)]
 | 
				
			||||||
/// A quick helper function for downloading a url
 | 
					/// A quick helper function for downloading a url
 | 
				
			||||||
async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
 | 
					async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
 | 
				
			||||||
    trace!("Get: {}", site.to_string());
 | 
					    trace!("Get: {}", site.to_string());
 | 
				
			||||||
 | 
					    let now = Instant::now();
 | 
				
			||||||
    if let Ok(response) = reqwest::get(site.to_string()).await {
 | 
					    if let Ok(response) = reqwest::get(site.to_string()).await {
 | 
				
			||||||
 | 
					        let dif = now.elapsed().as_micros();
 | 
				
			||||||
 | 
					        trace!("{}", format!("Got page in {:.3}ms", dif as f64/1000.));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        let data = response.text().await.unwrap();
 | 
					        let data = response.text().await.unwrap();
 | 
				
			||||||
        let opts = ParseOpts {
 | 
					        let opts = ParseOpts {
 | 
				
			||||||
            tree_builder: TreeBuilderOpts {
 | 
					            tree_builder: TreeBuilderOpts {
 | 
				
			||||||
@@ -93,7 +110,7 @@ async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/// Walks the givin site, placing it's findings in the database
 | 
					/// Walks the givin site, placing it's findings in the database
 | 
				
			||||||
async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site: &Website) {
 | 
					async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site: &Website, links_to: &mut Vec<Thing>) {
 | 
				
			||||||
    let span = trace_span!("Walk");
 | 
					    let span = trace_span!("Walk");
 | 
				
			||||||
    let span = span.enter();
 | 
					    let span = span.enter();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -105,7 +122,8 @@ async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site: &Website) {
 | 
				
			|||||||
                        trace!("Is mailto");
 | 
					                        trace!("Is mailto");
 | 
				
			||||||
                        // mailto link, lol
 | 
					                        // mailto link, lol
 | 
				
			||||||
                        let _created: Option<db::Record> = db.create("email").content(db::Email {
 | 
					                        let _created: Option<db::Record> = db.create("email").content(db::Email {
 | 
				
			||||||
                            email: attr.value.to_string()
 | 
					                            email: attr.value.to_string(),
 | 
				
			||||||
 | 
					                            on: site.domain_str().to_owned(),
 | 
				
			||||||
                        }).await.unwrap();
 | 
					                        }).await.unwrap();
 | 
				
			||||||
                    } else {
 | 
					                    } else {
 | 
				
			||||||
                        let mut web = site.clone();
 | 
					                        let mut web = site.clone();
 | 
				
			||||||
@@ -118,8 +136,9 @@ async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site: &Website) {
 | 
				
			|||||||
                        let crawled = web.crawled();
 | 
					                        let crawled = web.crawled();
 | 
				
			||||||
                        *crawled = false;
 | 
					                        *crawled = false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                        site.links_to(&web, &db).await;
 | 
					                        if let Some(id) = web.store(db).await {
 | 
				
			||||||
                        web.store(db).await;
 | 
					                            links_to.push(id);
 | 
				
			||||||
 | 
					                        }
 | 
				
			||||||
                    }
 | 
					                    }
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
            };
 | 
					            };
 | 
				
			||||||
@@ -128,7 +147,7 @@ async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site: &Website) {
 | 
				
			|||||||
    };
 | 
					    };
 | 
				
			||||||
    drop(span);
 | 
					    drop(span);
 | 
				
			||||||
    for child in node.children.borrow().iter() {
 | 
					    for child in node.children.borrow().iter() {
 | 
				
			||||||
        Box::pin(walk(child, db, site)).await; 
 | 
					        Box::pin(walk(child, db, site, links_to)).await; 
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user