updates
This commit is contained in:
54
src/main.rs
54
src/main.rs
@@ -1,12 +1,12 @@
|
||||
extern crate markup5ever_rcdom as rcdom;
|
||||
extern crate html5ever;
|
||||
|
||||
use std::{env, rc::Rc};
|
||||
use std::rc::Rc;
|
||||
use db::connect;
|
||||
use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
|
||||
use rcdom::{Node, RcDom};
|
||||
use surrealdb::{engine::remote::ws::Client, Surreal};
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
mod db;
|
||||
|
||||
@@ -16,13 +16,13 @@ async fn main() {
|
||||
debug!("Starting...");
|
||||
|
||||
let url = "https://oliveratkinson.net";
|
||||
let budget = "10";
|
||||
|
||||
let dom = get(url).await;
|
||||
let db = connect().await.expect("Failed to connect to db, aborting.");
|
||||
let dom = get(url).await;
|
||||
|
||||
warn!("Walking...");
|
||||
walk(&dom, &db, url).await;
|
||||
|
||||
info!("Done");
|
||||
}
|
||||
|
||||
async fn get(url: &str) -> Rc<Node> {
|
||||
@@ -45,48 +45,42 @@ async fn get(url: &str) -> Rc<Node> {
|
||||
}
|
||||
|
||||
async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
|
||||
// Should potentially just check for the record first.
|
||||
let created: Option<db::Record> = match db.create("website").content(db::Website { href: String::from("/"), crawled: true, site: site_name.to_string() } ).await {
|
||||
Ok(e) => e,
|
||||
// Insert Or Update
|
||||
let _created: Vec<db::Record> = match db.upsert("website").content(db::Website { href: String::from("/"), crawled: true, site: site_name.to_string() } ).await {
|
||||
Ok(e) => {
|
||||
// Return this for type coercion
|
||||
e
|
||||
},
|
||||
Err(e) => {
|
||||
match e {
|
||||
surrealdb::Error::Db(_) => todo!(),
|
||||
surrealdb::Error::Api(api) => {
|
||||
match api {
|
||||
surrealdb::error::Api::Query(query) => {
|
||||
error!(query);
|
||||
None
|
||||
},
|
||||
_ => todo!(),
|
||||
}
|
||||
},
|
||||
}
|
||||
unimplemented!("{}", e);
|
||||
}
|
||||
};
|
||||
info!{"{:?}", created};
|
||||
|
||||
match &node.data {
|
||||
rcdom::NodeData::Document => (),
|
||||
rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
|
||||
rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"),
|
||||
rcdom::NodeData::Text { contents } => {},
|
||||
rcdom::NodeData::Comment { contents } => debug!("comment"),
|
||||
rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
|
||||
for attr in attrs.borrow().clone() {
|
||||
let name = name.local.to_string();
|
||||
if name == "a" {
|
||||
if attr.value.starts_with("mailto") {
|
||||
// mailto link, lol
|
||||
let created: Option<db::Record> = db.create("email").content(db::Email { email: attr.value.to_string() }).await.unwrap();
|
||||
info!("{:?}", created)
|
||||
let created: Option<db::Record> = db.create("email").content(db::Email {
|
||||
email: attr.value.to_string()
|
||||
}).await.unwrap();
|
||||
warn!("{:?}", created)
|
||||
} else {
|
||||
let href = attr.value.to_string();
|
||||
info!("{}", href);
|
||||
// Every not-mailto link
|
||||
let created: Option<db::Record> = db.create("website").content(db::Website { href: attr.value.to_string(), crawled: false, site: site_name.to_string() } ).await.unwrap();
|
||||
info!{"{:?}", created};
|
||||
let _created: Option<db::Record> = db.create("website").content(db::Website {
|
||||
href,
|
||||
crawled: false,
|
||||
site: site_name.to_string()
|
||||
}).await.unwrap();
|
||||
}
|
||||
}
|
||||
};
|
||||
},
|
||||
_ => {},
|
||||
};
|
||||
|
||||
for child in node.children.borrow().iter() {
|
||||
|
Reference in New Issue
Block a user