clean up walk()
This commit is contained in:
parent
9324160e74
commit
fbca067b1f
@ -3,7 +3,8 @@ use surrealdb::{engine::remote::ws::{Client, Ws}, opt::auth::Root, sql::Thing, S
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Website {
|
||||
pub url: String,
|
||||
pub site: String,
|
||||
pub href: String,
|
||||
pub crawled: bool
|
||||
}
|
||||
|
||||
|
48
src/main.rs
48
src/main.rs
@ -45,10 +45,8 @@ async fn get(url: &str) -> Rc<Node> {
|
||||
}
|
||||
|
||||
async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
|
||||
|
||||
|
||||
// Should potentially just check for the record first.
|
||||
let created: Option<db::Record> = match db.create("website").content(db::Website { url: site_name.to_string(), crawled: true } ).await {
|
||||
let created: Option<db::Record> = match db.create("website").content(db::Website { href: String::from("/"), crawled: true, site: site_name.to_string() } ).await {
|
||||
Ok(e) => e,
|
||||
Err(e) => {
|
||||
match e {
|
||||
@ -65,57 +63,33 @@ async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
|
||||
}
|
||||
}
|
||||
};
|
||||
info!{"{:?}", created};
|
||||
|
||||
match &node.data {
|
||||
rcdom::NodeData::Document => (),
|
||||
rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
|
||||
rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"),
|
||||
rcdom::NodeData::Text { contents } => {},
|
||||
rcdom::NodeData::Comment { contents } => debug!("comment"),
|
||||
rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
|
||||
let attrs = attrs.borrow().clone();
|
||||
for attr in attrs {
|
||||
for attr in attrs.borrow().clone() {
|
||||
let name = name.local.to_string();
|
||||
let link = &*attr.value;
|
||||
trace!("element: {name}, attr: {link}");
|
||||
|
||||
if name == "a" {
|
||||
if link.starts_with("mailto") {
|
||||
if attr.value.starts_with("mailto") {
|
||||
// mailto link, lol
|
||||
warn!("{link}");
|
||||
|
||||
let created: Option<db::Record> = db.create("email").content(db::Email { email: link.to_owned() }).await.unwrap();
|
||||
let created: Option<db::Record> = db.create("email").content(db::Email { email: attr.value.to_string() }).await.unwrap();
|
||||
info!("{:?}", created)
|
||||
|
||||
} else if link.starts_with("http") {
|
||||
// normal link
|
||||
debug!("{link}")
|
||||
} else if link.contains("/") {
|
||||
// possibly a relative link?
|
||||
//
|
||||
// TODO This needs more logic handling. Needs to handle the following cases:
|
||||
//
|
||||
// Absolute links:
|
||||
// /img.png
|
||||
// /file-no-extension
|
||||
//
|
||||
// Relative Links:
|
||||
//
|
||||
// img.png
|
||||
// file-no-extension
|
||||
//
|
||||
let link_name = format!("{site_name}/{link}");
|
||||
debug!("{link_name}");
|
||||
let created: Option<db::Record> = db.create("website").content(db::Website { url: link_name, crawled: false } ).await.unwrap();
|
||||
} else {
|
||||
error!("Unhandled link type: {link}")
|
||||
// Every non-mailto link
|
||||
let created: Option<db::Record> = db.create("website").content(db::Website { href: attr.value.to_string(), crawled: false, site: site_name.to_string() } ).await.unwrap();
|
||||
info!{"{:?}", created};
|
||||
}
|
||||
}
|
||||
};
|
||||
},
|
||||
rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
|
||||
};
|
||||
|
||||
for child in &*node.children.borrow() {
|
||||
Box::pin(walk(&child, db, site_name)).await;
|
||||
for child in node.children.borrow().iter() {
|
||||
Box::pin(walk(child, db, site_name)).await;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user