This commit is contained in:
Oliver Atkinson
2024-10-31 15:09:48 -06:00
parent fbca067b1f
commit 3a46dd937b
3 changed files with 157 additions and 174 deletions

View File

@@ -1,12 +1,12 @@
extern crate markup5ever_rcdom as rcdom;
extern crate html5ever;
use std::{env, rc::Rc};
use std::rc::Rc;
use db::connect;
use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
use rcdom::{Node, RcDom};
use surrealdb::{engine::remote::ws::Client, Surreal};
use tracing::{debug, error, info, trace, warn};
use tracing::{debug, info, warn};
mod db;
@@ -16,13 +16,13 @@ async fn main() {
debug!("Starting...");
let url = "https://oliveratkinson.net";
let budget = "10";
let dom = get(url).await;
let db = connect().await.expect("Failed to connect to db, aborting.");
let dom = get(url).await;
warn!("Walking...");
walk(&dom, &db, url).await;
info!("Done");
}
async fn get(url: &str) -> Rc<Node> {
@@ -45,48 +45,42 @@ async fn get(url: &str) -> Rc<Node> {
}
async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
// Should potentially just check for the record first.
let created: Option<db::Record> = match db.create("website").content(db::Website { href: String::from("/"), crawled: true, site: site_name.to_string() } ).await {
Ok(e) => e,
// Insert Or Update
let _created: Vec<db::Record> = match db.upsert("website").content(db::Website { href: String::from("/"), crawled: true, site: site_name.to_string() } ).await {
Ok(e) => {
// Return this for type coercion
e
},
Err(e) => {
match e {
surrealdb::Error::Db(_) => todo!(),
surrealdb::Error::Api(api) => {
match api {
surrealdb::error::Api::Query(query) => {
error!(query);
None
},
_ => todo!(),
}
},
}
unimplemented!("{}", e);
}
};
info!{"{:?}", created};
match &node.data {
rcdom::NodeData::Document => (),
rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"),
rcdom::NodeData::Text { contents } => {},
rcdom::NodeData::Comment { contents } => debug!("comment"),
rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
for attr in attrs.borrow().clone() {
let name = name.local.to_string();
if name == "a" {
if attr.value.starts_with("mailto") {
// mailto link, lol
let created: Option<db::Record> = db.create("email").content(db::Email { email: attr.value.to_string() }).await.unwrap();
info!("{:?}", created)
let created: Option<db::Record> = db.create("email").content(db::Email {
email: attr.value.to_string()
}).await.unwrap();
warn!("{:?}", created)
} else {
let href = attr.value.to_string();
info!("{}", href);
// Every non-mailto link
let created: Option<db::Record> = db.create("website").content(db::Website { href: attr.value.to_string(), crawled: false, site: site_name.to_string() } ).await.unwrap();
info!{"{:?}", created};
let _created: Option<db::Record> = db.create("website").content(db::Website {
href,
crawled: false,
site: site_name.to_string()
}).await.unwrap();
}
}
};
},
_ => {},
};
for child in node.children.borrow().iter() {