clean up walk()
This commit is contained in:
parent
9324160e74
commit
fbca067b1f
@ -3,7 +3,8 @@ use surrealdb::{engine::remote::ws::{Client, Ws}, opt::auth::Root, sql::Thing, S
|
|||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
pub struct Website {
|
pub struct Website {
|
||||||
pub url: String,
|
pub site: String,
|
||||||
|
pub href: String,
|
||||||
pub crawled: bool
|
pub crawled: bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
48
src/main.rs
48
src/main.rs
@ -45,10 +45,8 @@ async fn get(url: &str) -> Rc<Node> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
|
async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
|
||||||
|
|
||||||
|
|
||||||
// Should potentially just check for the record first.
|
// Should potentially just check for the record first.
|
||||||
let created: Option<db::Record> = match db.create("website").content(db::Website { url: site_name.to_string(), crawled: true } ).await {
|
let created: Option<db::Record> = match db.create("website").content(db::Website { href: String::from("/"), crawled: true, site: site_name.to_string() } ).await {
|
||||||
Ok(e) => e,
|
Ok(e) => e,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
match e {
|
match e {
|
||||||
@ -65,57 +63,33 @@ async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
info!{"{:?}", created};
|
||||||
|
|
||||||
match &node.data {
|
match &node.data {
|
||||||
rcdom::NodeData::Document => (),
|
rcdom::NodeData::Document => (),
|
||||||
|
rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
|
||||||
rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"),
|
rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"),
|
||||||
rcdom::NodeData::Text { contents } => {},
|
rcdom::NodeData::Text { contents } => {},
|
||||||
rcdom::NodeData::Comment { contents } => debug!("comment"),
|
rcdom::NodeData::Comment { contents } => debug!("comment"),
|
||||||
rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
|
rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
|
||||||
let attrs = attrs.borrow().clone();
|
for attr in attrs.borrow().clone() {
|
||||||
for attr in attrs {
|
|
||||||
let name = name.local.to_string();
|
let name = name.local.to_string();
|
||||||
let link = &*attr.value;
|
|
||||||
trace!("element: {name}, attr: {link}");
|
|
||||||
|
|
||||||
if name == "a" {
|
if name == "a" {
|
||||||
if link.starts_with("mailto") {
|
if attr.value.starts_with("mailto") {
|
||||||
// mailto link, lol
|
// mailto link, lol
|
||||||
warn!("{link}");
|
let created: Option<db::Record> = db.create("email").content(db::Email { email: attr.value.to_string() }).await.unwrap();
|
||||||
|
|
||||||
let created: Option<db::Record> = db.create("email").content(db::Email { email: link.to_owned() }).await.unwrap();
|
|
||||||
info!("{:?}", created)
|
info!("{:?}", created)
|
||||||
|
|
||||||
} else if link.starts_with("http") {
|
|
||||||
// normal link
|
|
||||||
debug!("{link}")
|
|
||||||
} else if link.contains("/") {
|
|
||||||
// possibly a relative link?
|
|
||||||
//
|
|
||||||
// TODO This needs more logic handling. Needs to handle the following cases:
|
|
||||||
//
|
|
||||||
// Absolute links:
|
|
||||||
// /img.png
|
|
||||||
// /file-no-extension
|
|
||||||
//
|
|
||||||
// Relative Links:
|
|
||||||
//
|
|
||||||
// img.png
|
|
||||||
// file-no-extension
|
|
||||||
//
|
|
||||||
let link_name = format!("{site_name}/{link}");
|
|
||||||
debug!("{link_name}");
|
|
||||||
let created: Option<db::Record> = db.create("website").content(db::Website { url: link_name, crawled: false } ).await.unwrap();
|
|
||||||
} else {
|
} else {
|
||||||
error!("Unhandled link type: {link}")
|
// Every not-mailto link
|
||||||
|
let created: Option<db::Record> = db.create("website").content(db::Website { href: attr.value.to_string(), crawled: false, site: site_name.to_string() } ).await.unwrap();
|
||||||
|
info!{"{:?}", created};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
},
|
},
|
||||||
rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
for child in &*node.children.borrow() {
|
for child in node.children.borrow().iter() {
|
||||||
Box::pin(walk(&child, db, site_name)).await;
|
Box::pin(walk(child, db, site_name)).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user