From fbca067b1f37b4226eb53e998cfe67db3cbfaf42 Mon Sep 17 00:00:00 2001 From: Oliver Atkinson Date: Thu, 31 Oct 2024 14:10:14 -0600 Subject: [PATCH] clean up walk() --- src/db.rs | 3 ++- src/main.rs | 48 +++++++++++------------------------------------- 2 files changed, 13 insertions(+), 38 deletions(-) diff --git a/src/db.rs b/src/db.rs index bc349c7..9d1cfa9 100644 --- a/src/db.rs +++ b/src/db.rs @@ -3,7 +3,8 @@ use surrealdb::{engine::remote::ws::{Client, Ws}, opt::auth::Root, sql::Thing, S #[derive(Debug, Serialize)] pub struct Website { - pub url: String, + pub site: String, + pub href: String, pub crawled: bool } diff --git a/src/main.rs b/src/main.rs index d805f9f..abfbe5e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -45,10 +45,8 @@ async fn get(url: &str) -> Rc { } async fn walk(node: &rcdom::Handle, db: &Surreal , site_name: &str) { - - // Should protentailly just check for the record first. - let created: Option = match db.create("website").content(db::Website { url: site_name.to_string(), crawled: true } ).await { + let created: Option = match db.create("website").content(db::Website { href: String::from("/"), crawled: true, site: site_name.to_string() } ).await { Ok(e) => e, Err(e) => { match e { @@ -65,57 +63,33 @@ async fn walk(node: &rcdom::Handle, db: &Surreal , site_name: &str) { } } }; + info!{"{:?}", created}; match &node.data { rcdom::NodeData::Document => (), + rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"), rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"), rcdom::NodeData::Text { contents } => {}, rcdom::NodeData::Comment { contents } => debug!("comment"), rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => { - let attrs = attrs.borrow().clone(); - for attr in attrs { + for attr in attrs.borrow().clone() { let name = name.local.to_string(); - let link = &*attr.value; - trace!("element: {name}, attr: {link}"); - if name == "a" { - if link.starts_with("mailto") { + if attr.value.starts_with("mailto") { // mailto link, lol - warn!("{link}"); - - let created: Option = db.create("email").content(db::Email { email: link.to_owned() }).await.unwrap(); + let created: Option = db.create("email").content(db::Email { email: attr.value.to_string() }).await.unwrap(); info!("{:?}", created) - - } else if link.starts_with("http") { - // normal link - debug!("{link}") - } else if link.contains("/") { - // possibly a relative link? - // - // TODO This needs more logic handling. Needs to handle the following cases: - // - // Absolute links: - // /img.png - // /file-no-extension - // - // Realtive Links: - // - // img.png - // file-no-extnesion - // - let link_name = format!("{site_name}/{link}"); - debug!("{link_name}"); - let created: Option = db.create("website").content(db::Website { url: link_name, crawled: false } ).await.unwrap(); } else { - error!("Unhandled link type: {link}") + // Every not-mailto link + let created: Option = db.create("website").content(db::Website { href: attr.value.to_string(), crawled: false, site: site_name.to_string() } ).await.unwrap(); + info!{"{:?}", created}; } } }; }, - rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"), }; - for child in &*node.children.borrow() { - Box::pin(walk(&child, db, site_name)).await; + for child in node.children.borrow().iter() { + Box::pin(walk(child, db, site_name)).await; } }