works more, but still not all the way
src/db.rs (132 changed lines)
@@ -1,22 +1,140 @@
 use serde::{Deserialize, Serialize};
-use surrealdb::{engine::remote::ws::{Client, Ws}, opt::auth::Root, sql::Thing, Surreal};
+use surrealdb::{
+    engine::remote::ws::{Client, Ws},
+    opt::auth::Root,
+    sql::Thing,
+    Surreal,
+};
 use tracing::{debug, error, info, instrument};
+use url::Url;

-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct Website {
-    pub site: String,
-    pub href: String,
-    pub crawled: bool
+    /// The url that this data is found at
+    site: Url,
+    /// The url as defined in the <a> tag
+    href: Url,
+    /// Whether or not this link has been crawled yet
+    crawled: bool,
+    /// Whether or not the href was doctored
+    doctored_href: bool,
+    original_href: Option<String>,
 }

+impl Website {
+    /// Creates a blank site (assumes that url param is site's root)
+    pub fn new(url: &str, href: &str, crawled: bool) -> Self {
+        let mut new = Self::from(url);
+        new.crawled = crawled;
+        new.original_href = Some(href.to_string());
+        new.href =
+            match Url::parse(href) {
+                Ok(e) => e,
+                Err(e) => {
+                    match e {
+                        url::ParseError::RelativeUrlWithoutBase => {
+                            // Try to combine the scheme_host and href to get a usable domain
+                            new.doctored_href = true;
+
+                            let url = if !url.ends_with('/') && !href.starts_with('/') {
+                                format!("{url}/{href}")
+                            } else {
+                                format!("{url}{href}")
+                            };
+
+                            // paste the domain onto the beginning of the href
+                            Url::parse(&url).map_or_else(|err| {
+                                debug!("Parsing {url} with {href}");
+                                error!("{err} Failed to parse href into url on second try. Aborting");
+                                panic!("See error logs for more info.");
+                            }, |ok| ok)
+                        }
+                        _ => {
+                            error!("{e}");
+                            panic!("See error logs for more info.");
+                        }
+                    }
+                }
+            };
+        new
+    }
+    pub fn crawled(&mut self) {
+        self.crawled = true
+    }
+    pub fn href_str(&self) -> &str {
+        self.href.as_str()
+    }
+    pub fn site(&self) -> String {
+        self.site.to_string()
+    }
+    pub fn domain_str(&self) -> &str {
+        self.site.as_str()
+    }
+    #[instrument(skip_all)]
+    pub async fn store(&mut self, db: &Surreal<Client>) {
+        // is root record?
+        if self.href.path() == "/" {
+            // Upsert is create or update
+            // Whereas Update is just update
+            let record = ("website", &self.href.to_string());
+
+            let crawled = if let Some(old) = db.select(record).await.unwrap() {
+                let old: Website = old; // infer type
+                old.crawled
+            } else {false};
+
+            if !self.crawled {self.crawled = crawled};
+
+            match db.upsert(record).content(self.clone()).await {
+                Ok(e) => {
+                    if let Some(a) = &e {
+                        let _: &Record = a;
+                    }
+                }
+                Err(e) => {
+                    error!("{}", e);
+                },
+            };
+        } else {
+            let _: Option<Record> = match db.create("website").content(self.clone()).await {
+                Ok(e) => {
+                    if let Some(a) = &e {
+                        let _: &Record = a;
+                    }
+                    e
+                }
+                Err(_) => todo!(),
+            };
+        }
+    }
+}
+
+impl From<&str> for Website {
+    /// site == href, crawled = false
+    fn from(value: &str) -> Self {
+        let site = match Url::parse(value) {
+            Ok(a) => a,
+            Err(_) => todo!(),
+        };
+        Self {
+            href: site.clone(),
+            crawled: false,
+            site,
+            doctored_href: false,
+            original_href: None,
+        }
+    }
+}
+
 #[derive(Debug, Serialize)]
 pub struct Email {
-    pub email: String
+    pub email: String,
 }

 #[derive(Debug, Deserialize)]
 pub struct Record {
-    #[allow(dead_code)]
-    id: Thing,
+    pub id: Thing,
 }

 pub async fn connect() -> surrealdb::Result<Surreal<Client>> {
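The trickiest part of the new Website::new is the fallback for relative hrefs: Url::parse rejects a bare path like "/about" with RelativeUrlWithoutBase, so the base site string is glued onto the front and parsed a second time, with doctored_href recording that the link was rewritten. A minimal standalone sketch of that behaviour (the base URL and path below are made-up placeholders, not values from this commit):

use url::Url;

fn main() {
    let base = "https://example.com"; // hypothetical site root
    let href = "/about";              // hypothetical relative link from an <a> tag

    // A bare path cannot be parsed on its own...
    assert!(matches!(
        Url::parse(href),
        Err(url::ParseError::RelativeUrlWithoutBase)
    ));

    // ...so the base is pasted onto the front, mirroring the doctoring in Website::new.
    let doctored = if !base.ends_with('/') && !href.starts_with('/') {
        format!("{base}/{href}")
    } else {
        format!("{base}{href}")
    };
    let parsed = Url::parse(&doctored).expect("second parse should succeed");
    assert_eq!(parsed.as_str(), "https://example.com/about");
}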
src/main.rs (121 changed lines)
@@ -2,87 +2,97 @@ extern crate markup5ever_rcdom as rcdom;
 extern crate html5ever;

 use std::rc::Rc;
-use db::connect;
+use db::{connect, Website};
 use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
 use rcdom::{Node, RcDom};
 use surrealdb::{engine::remote::ws::Client, Surreal};
-use tracing::{debug, error, info, warn};
+use tracing::{debug, info, instrument};
+use tracing_subscriber::EnvFilter;

 mod db;

 #[tokio::main]
 async fn main() {
-    tracing_subscriber::fmt::init();
+    tracing_subscriber::fmt()
+        .with_env_filter(EnvFilter::from_default_env())
+        .with_line_number(true)
+        .without_time()
+        .init();
     debug!("Starting...");

-    let url = "https://oliveratkinson.net";
-
-    let db = connect().await.expect("Failed to connect to db, aborting.");
-    let dom = get(url).await;
-
-    walk(&dom, &db, url).await;
+    // Would probably take these in as parameters from a cli
+    let url = "https://oliveratkinson.net/";
+    let budget = 50;
+    let mut crawled = 0;
+
+    let db = connect().await.expect("Failed to connect to db, aborting.");
+
+    // Kick off the whole machine - This Website object doesn't matter, it's just to allow for
+    // get() to work.
+    let mut site = Website::from(url);
+    let dom = get(&mut site, &db).await.expect("Initial page returned None.");
+    crawled += 1;
+    walk(&dom, &db, &site).await;
+
+    while crawled < budget {
+        let uncrawled = get_uncrawled_links(&db).await;
+        debug!("Crawling {} pages...", uncrawled.len());
+
+        for mut site in uncrawled {
+            if let Some(dom) = get(&mut site, &db).await {
+                walk(&dom, &db, &site).await;
+                crawled += 1;
+                let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
+                info!("Crawled {crawled} out of {budget} pages. ({percent})");
+            }
+        }
+    }

     info!("Done");
 }

-async fn get(url: &str) -> Rc<Node> {
-    let response = reqwest::get(url).await.unwrap();
-    let data = response.text().await.unwrap();
-
-    let opts = ParseOpts {
-        tree_builder: TreeBuilderOpts {
-            drop_doctype: true,
-            ..Default::default()
-        },
-        ..Default::default()
-    };
-
-    let dom = parse_document(RcDom::default(), opts)
-        .from_utf8()
-        .read_from(&mut data.as_bytes())
-        .unwrap();
-
-    dom.document
-}
+#[instrument(skip_all)]
+/// A quick helper function for downloading a url
+async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
+    if let Ok(response) = reqwest::get(site.href_str()).await {
+        let data = response.text().await.unwrap();
+        let opts = ParseOpts {
+            tree_builder: TreeBuilderOpts {
+                drop_doctype: true,
+                ..Default::default()
+            },
+            ..Default::default()
+        };
+        let dom = parse_document(RcDom::default(), opts)
+            .from_utf8()
+            .read_from(&mut data.as_bytes())
+            .unwrap();
+        site.crawled();
+        site.store(db).await;
+        return Some(dom.document);
+    }
+    None
+}

-async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
-    // Insert Or Update
-    let _: Option<Vec<db::Record>> = match db.upsert(("website", site_name)).content(db::Website { href: String::from("/"), crawled: true, site: site_name.to_string() } ).await {
-        Ok(e) => {
-            // Return this for type coercion
-            e
-        },
-        Err(e) => {
-            // error!("{}", e);
-            None
-        }
-    };
-    // create_root(site_name, db).await;
-
+/// Walks the given site, placing its findings in the database
+async fn walk(node: &rcdom::Handle, db: &Surreal<Client>, site_name: &Website) {
     match &node.data {
         rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
             for attr in attrs.borrow().clone() {
-                let name = name.local.to_string();
-                if name == "a" {
+                if name.local.to_string() == "a" {
                     if attr.value.starts_with("mailto") {
                         // mailto link, lol
-                        let created: Option<db::Record> = db.create("email").content(db::Email {
+                        let _created: Option<db::Record> = db.create("email").content(db::Email {
                             email: attr.value.to_string()
                         }).await.unwrap();
-                        warn!("{:?}", created)
                     } else {
-                        // FIXME this isn't actually creating records...?
-                        let _: Option<db::Record> = match db.create("website").content(db::Website {
-                            href: attr.value.to_string(),
-                            crawled: false,
-                            site: site_name.to_string()
-                        }).await {
-                            Ok(e) => {
-                                if let Some(a) = &e {
-                                    debug!("{:?}", a);
-                                }
-                                e
-                            },
-                            Err(_) => todo!(),
-                        };
+                        let mut web = Website::new(&site_name.site(), &attr.value, false);
+                        web.store(db).await;
                     }
                 }
             };
@@ -94,3 +104,10 @@ async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
         Box::pin(walk(child, db, site_name)).await;
     }
 }
+
+/// Returns 0-50 uncrawled links (LIMIT = 50)
+async fn get_uncrawled_links(db: &Surreal<Client>) -> Vec<Website> {
+    let mut response = db.query("SELECT * FROM website WHERE crawled = false LIMIT 50").await.expect("Hard-coded query failed..?");
+    response.take(0).expect("Returned websites couldn't be parsed")
+}
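get_uncrawled_links hard-codes LIMIT 50 into the query string, so the doc comment and the query have to be kept in sync by hand. A possible variant, sketched here with a hypothetical helper name and assuming the surrealdb query(..).bind(..) builder, would take the limit as a bound parameter instead:

use surrealdb::{engine::remote::ws::Client, Surreal};

use crate::db::Website;

/// Hypothetical variant: fetch up to `limit` uncrawled links instead of a fixed 50.
async fn get_uncrawled_links_with_limit(db: &Surreal<Client>, limit: usize) -> Vec<Website> {
    let mut response = db
        .query("SELECT * FROM website WHERE crawled = false LIMIT $limit")
        .bind(("limit", limit))
        .await
        .expect("Query failed");
    response.take(0).expect("Returned websites couldn't be parsed")
}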