it works :party:
This commit is contained in:
parent
fd971bafbf
commit
5404d5c3e8
49
src/db.rs
49
src/db.rs
@ -5,7 +5,7 @@ use surrealdb::{
|
|||||||
sql::Thing,
|
sql::Thing,
|
||||||
Response, Surreal,
|
Response, Surreal,
|
||||||
};
|
};
|
||||||
use tracing::{debug, error, info, instrument, trace};
|
use tracing::{debug, error, instrument, trace, warn};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
@ -50,19 +50,36 @@ impl Website {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all)]
|
#[instrument(skip_all)]
|
||||||
pub async fn links_to(&self, other: &Self, db: &Surreal<Client>) {
|
pub async fn links_to(&self, other: Vec<Thing>, db: &Surreal<Client>) {
|
||||||
let from = self.site.to_string();
|
let len = other.len();
|
||||||
let to = other.site.to_string();
|
if len == 0 {return}
|
||||||
trace!("Linking {from} to {to}");
|
|
||||||
|
|
||||||
|
let from = self.site.to_string();
|
||||||
|
// let to = other.site.to_string();
|
||||||
|
trace!("Linking {from} to {} other pages.", other.len());
|
||||||
match db
|
match db
|
||||||
.query("RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> (SELECT id FROM website WHERE site = $out)")
|
.query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
|
||||||
.bind(("in", from))
|
.bind(("in", from))
|
||||||
.bind(("out", to))
|
.bind(("out", other))
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(e) => {
|
Ok(mut e) => {
|
||||||
|
// The relate could technically "fail" (not relate anything), this just means that
|
||||||
|
// the query was ok.
|
||||||
let _: Response = e;
|
let _: Response = e;
|
||||||
|
if let Ok(vec) = e.take(0) {
|
||||||
|
let _: Vec<usize> = vec;
|
||||||
|
if let Some(num) = vec.get(0) {
|
||||||
|
if *num == len {
|
||||||
|
debug!("Link OK");
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
warn!("Didn't link all the records. {num}/{len}");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
warn!("Linking request succeeded but couldn't verify the results.");
|
||||||
},
|
},
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("{}", e.to_string());
|
error!("{}", e.to_string());
|
||||||
@ -71,7 +88,7 @@ impl Website {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all)]
|
#[instrument(skip_all)]
|
||||||
pub async fn store(&mut self, db: &Surreal<Client>) {
|
pub async fn store(&mut self, db: &Surreal<Client>) -> Option<Thing> {
|
||||||
// check if it's been gone thru before
|
// check if it's been gone thru before
|
||||||
let mut response = db
|
let mut response = db
|
||||||
.query("SELECT * FROM ONLY website WHERE site = $site LIMIT 1")
|
.query("SELECT * FROM ONLY website WHERE site = $site LIMIT 1")
|
||||||
@ -87,8 +104,9 @@ impl Website {
|
|||||||
|
|
||||||
match db.upsert((id.tb, id.id.to_string())).content(new).await {
|
match db.upsert((id.tb, id.id.to_string())).content(new).await {
|
||||||
Ok(e) => {
|
Ok(e) => {
|
||||||
if let Some(a) = &e {
|
if let Some(a) = e {
|
||||||
let _: &Record = a;
|
let _: Record = a;
|
||||||
|
return Some(a.id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
@ -101,13 +119,15 @@ impl Website {
|
|||||||
match db.create("website").content(self.clone()).await {
|
match db.create("website").content(self.clone()).await {
|
||||||
Ok(e) => {
|
Ok(e) => {
|
||||||
let _: Option<Record> = e;
|
let _: Option<Record> = e;
|
||||||
if let Some(a) = &e {
|
if let Some(a) = e {
|
||||||
let _: &Record = a;
|
let _: Record = a;
|
||||||
|
return Some(a.id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(a) => error!("{:?}", a),
|
Err(a) => error!("{:?}", a),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -120,6 +140,7 @@ impl ToString for Website {
|
|||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
pub struct Email {
|
pub struct Email {
|
||||||
pub email: String,
|
pub email: String,
|
||||||
|
pub on: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
@ -140,7 +161,7 @@ pub async fn connect() -> surrealdb::Result<Surreal<Client>> {
|
|||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
// Select a specific namespace / database
|
// Select a specific namespace / database
|
||||||
db.use_ns("test").use_db("time").await?;
|
db.use_ns("test").use_db("v1.2").await?;
|
||||||
|
|
||||||
Ok(db)
|
Ok(db)
|
||||||
}
|
}
|
||||||
|
47
src/main.rs
47
src/main.rs
@ -1,11 +1,11 @@
|
|||||||
extern crate markup5ever_rcdom as rcdom;
|
extern crate markup5ever_rcdom as rcdom;
|
||||||
extern crate html5ever;
|
extern crate html5ever;
|
||||||
|
|
||||||
use std::rc::Rc;
|
use std::{rc::Rc, time::Instant};
|
||||||
use db::{connect, Website};
|
use db::{connect, Website};
|
||||||
use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
|
use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
|
||||||
use rcdom::{Node, RcDom};
|
use rcdom::{Node, RcDom};
|
||||||
use surrealdb::{engine::remote::ws::Client, Surreal};
|
use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};
|
||||||
use tracing::{debug, info, instrument, trace, trace_span, warn};
|
use tracing::{debug, info, instrument, trace, trace_span, warn};
|
||||||
use tracing_subscriber::EnvFilter;
|
use tracing_subscriber::EnvFilter;
|
||||||
|
|
||||||
@ -33,15 +33,17 @@ async fn main() {
|
|||||||
let pre_loop_span = span.enter();
|
let pre_loop_span = span.enter();
|
||||||
let mut site = Website::new(&url, false);
|
let mut site = Website::new(&url, false);
|
||||||
let dom = get(&mut site, &db).await.expect("Inital page returned None.");
|
let dom = get(&mut site, &db).await.expect("Inital page returned None.");
|
||||||
crawled += 1;
|
crawl_wrapper(&dom, &db, &site, &mut crawled).await;
|
||||||
walk(&dom, &db, &site).await;
|
|
||||||
drop(pre_loop_span);
|
drop(pre_loop_span);
|
||||||
|
|
||||||
let span = trace_span!("Loop");
|
let span = trace_span!("Loop");
|
||||||
let span = span.enter();
|
let span = span.enter();
|
||||||
// Can go up to 49 above budget because the retrieval function gets 50 no matter what
|
|
||||||
while crawled < budget {
|
while crawled < budget {
|
||||||
let uncrawled = get_uncrawled_links(&db, 100).await;
|
let get_num = if budget - crawled < 100 {
|
||||||
|
budget - crawled
|
||||||
|
} else {100};
|
||||||
|
|
||||||
|
let uncrawled = get_uncrawled_links(&db, get_num).await;
|
||||||
debug!("Crawling {} pages...", uncrawled.len());
|
debug!("Crawling {} pages...", uncrawled.len());
|
||||||
|
|
||||||
let span = trace_span!("Crawling");
|
let span = trace_span!("Crawling");
|
||||||
@ -49,9 +51,10 @@ async fn main() {
|
|||||||
|
|
||||||
for mut site in uncrawled {
|
for mut site in uncrawled {
|
||||||
if let Some(dom) = get(&mut site, &db).await {
|
if let Some(dom) = get(&mut site, &db).await {
|
||||||
trace!("pre-walk checkpoint");
|
trace!("Pre-walk checkpoint");
|
||||||
walk(&dom, &db, &site).await;
|
|
||||||
crawled += 1;
|
crawl_wrapper(&dom, &db, &site, &mut crawled).await;
|
||||||
|
|
||||||
let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
|
let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
|
||||||
info!("Crawled {crawled} out of {budget} pages. ({percent})");
|
info!("Crawled {crawled} out of {budget} pages. ({percent})");
|
||||||
} else {
|
} else {
|
||||||
@ -64,11 +67,25 @@ async fn main() {
|
|||||||
info!("Done");
|
info!("Done");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn crawl_wrapper(dom: &Rc<Node>, db: &Surreal<Client>, site: &Website, count: &mut usize) {
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
let now = Instant::now();
|
||||||
|
walk(&dom, &db, &site, &mut buffer).await;
|
||||||
|
let dif = now.elapsed().as_micros();
|
||||||
|
trace!("{}", format!("Walked in {:.3}ms", dif as f64/1000.));
|
||||||
|
site.links_to(buffer, &db).await;
|
||||||
|
*count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
#[instrument(skip_all)]
|
#[instrument(skip_all)]
|
||||||
/// A quick helper function for downloading a url
|
/// A quick helper function for downloading a url
|
||||||
async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
|
async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
|
||||||
trace!("Get: {}", site.to_string());
|
trace!("Get: {}", site.to_string());
|
||||||
|
let now = Instant::now();
|
||||||
if let Ok(response) = reqwest::get(site.to_string()).await {
|
if let Ok(response) = reqwest::get(site.to_string()).await {
|
||||||
|
let dif = now.elapsed().as_micros();
|
||||||
|
trace!("{}", format!("Got page in {:.3}ms", dif as f64/1000.));
|
||||||
|
|
||||||
let data = response.text().await.unwrap();
|
let data = response.text().await.unwrap();
|
||||||
let opts = ParseOpts {
|
let opts = ParseOpts {
|
||||||
tree_builder: TreeBuilderOpts {
|
tree_builder: TreeBuilderOpts {
|
||||||
@ -93,7 +110,7 @@ async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Walks the given site, placing its findings in the database
|
/// Walks the given site, placing its findings in the database
|
||||||
async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site: &Website) {
|
async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site: &Website, links_to: &mut Vec<Thing>) {
|
||||||
let span = trace_span!("Walk");
|
let span = trace_span!("Walk");
|
||||||
let span = span.enter();
|
let span = span.enter();
|
||||||
|
|
||||||
@ -105,7 +122,8 @@ async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site: &Website) {
|
|||||||
trace!("Is mailto");
|
trace!("Is mailto");
|
||||||
// mailto link, lol
|
// mailto link, lol
|
||||||
let _created: Option<db::Record> = db.create("email").content(db::Email {
|
let _created: Option<db::Record> = db.create("email").content(db::Email {
|
||||||
email: attr.value.to_string()
|
email: attr.value.to_string(),
|
||||||
|
on: site.domain_str().to_owned(),
|
||||||
}).await.unwrap();
|
}).await.unwrap();
|
||||||
} else {
|
} else {
|
||||||
let mut web = site.clone();
|
let mut web = site.clone();
|
||||||
@ -118,8 +136,9 @@ async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site: &Website) {
|
|||||||
let crawled = web.crawled();
|
let crawled = web.crawled();
|
||||||
*crawled = false;
|
*crawled = false;
|
||||||
|
|
||||||
site.links_to(&web, &db).await;
|
if let Some(id) = web.store(db).await {
|
||||||
web.store(db).await;
|
links_to.push(id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -128,7 +147,7 @@ async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site: &Website) {
|
|||||||
};
|
};
|
||||||
drop(span);
|
drop(span);
|
||||||
for child in node.children.borrow().iter() {
|
for child in node.children.borrow().iter() {
|
||||||
Box::pin(walk(child, db, site)).await;
|
Box::pin(walk(child, db, site, links_to)).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user