clean up walk()

This commit is contained in:
Oliver Atkinson 2024-10-31 14:10:14 -06:00
parent 9324160e74
commit fbca067b1f
2 changed files with 13 additions and 38 deletions

View File

@ -3,7 +3,8 @@ use surrealdb::{engine::remote::ws::{Client, Ws}, opt::auth::Root, sql::Thing, S
#[derive(Debug, Serialize)] #[derive(Debug, Serialize)]
pub struct Website { pub struct Website {
pub url: String, pub site: String,
pub href: String,
pub crawled: bool pub crawled: bool
} }

View File

@ -45,10 +45,8 @@ async fn get(url: &str) -> Rc<Node> {
} }
async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) { async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
// Should potentially just check for the record first. // Should potentially just check for the record first.
let created: Option<db::Record> = match db.create("website").content(db::Website { url: site_name.to_string(), crawled: true } ).await { let created: Option<db::Record> = match db.create("website").content(db::Website { href: String::from("/"), crawled: true, site: site_name.to_string() } ).await {
Ok(e) => e, Ok(e) => e,
Err(e) => { Err(e) => {
match e { match e {
@ -65,57 +63,33 @@ async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
} }
} }
}; };
info!{"{:?}", created};
match &node.data { match &node.data {
rcdom::NodeData::Document => (), rcdom::NodeData::Document => (),
rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"), rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"),
rcdom::NodeData::Text { contents } => {}, rcdom::NodeData::Text { contents } => {},
rcdom::NodeData::Comment { contents } => debug!("comment"), rcdom::NodeData::Comment { contents } => debug!("comment"),
rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => { rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
let attrs = attrs.borrow().clone(); for attr in attrs.borrow().clone() {
for attr in attrs {
let name = name.local.to_string(); let name = name.local.to_string();
let link = &*attr.value;
trace!("element: {name}, attr: {link}");
if name == "a" { if name == "a" {
if link.starts_with("mailto") { if attr.value.starts_with("mailto") {
// mailto link, lol // mailto link, lol
warn!("{link}"); let created: Option<db::Record> = db.create("email").content(db::Email { email: attr.value.to_string() }).await.unwrap();
let created: Option<db::Record> = db.create("email").content(db::Email { email: link.to_owned() }).await.unwrap();
info!("{:?}", created) info!("{:?}", created)
} else if link.starts_with("http") {
// normal link
debug!("{link}")
} else if link.contains("/") {
// possibly a relative link?
//
// TODO This needs more logic handling. Needs to handle the following cases:
//
// Absolute links:
// /img.png
// /file-no-extension
//
// Relative Links:
//
// img.png
// file-no-extension
//
let link_name = format!("{site_name}/{link}");
debug!("{link_name}");
let created: Option<db::Record> = db.create("website").content(db::Website { url: link_name, crawled: false } ).await.unwrap();
} else { } else {
error!("Unhandled link type: {link}") // Every not-mailto link
let created: Option<db::Record> = db.create("website").content(db::Website { href: attr.value.to_string(), crawled: false, site: site_name.to_string() } ).await.unwrap();
info!{"{:?}", created};
} }
} }
}; };
}, },
rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
}; };
for child in &*node.children.borrow() { for child in node.children.borrow().iter() {
Box::pin(walk(&child, db, site_name)).await; Box::pin(walk(child, db, site_name)).await;
} }
} }