crawling 🕷️
This commit is contained in:
36
src/db.rs
Normal file
36
src/db.rs
Normal file
@@ -0,0 +1,36 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use surrealdb::{engine::remote::ws::{Client, Ws}, opt::auth::Root, sql::Thing, Surreal};
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Website {
|
||||
pub url: String,
|
||||
pub crawled: bool
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Email {
|
||||
pub email: String
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct Record {
|
||||
#[allow(dead_code)]
|
||||
id: Thing,
|
||||
}
|
||||
|
||||
pub async fn connect() -> surrealdb::Result<Surreal<Client>> {
|
||||
// Connect to the server
|
||||
let db = Surreal::new::<Ws>("127.0.0.1:8000").await?;
|
||||
|
||||
// Signin as a namespace, database, or root user
|
||||
db.signin(Root {
|
||||
username: "root",
|
||||
password: "root",
|
||||
})
|
||||
.await?;
|
||||
|
||||
// Select a specific namespace / database
|
||||
db.use_ns("test").use_db("test").await?;
|
||||
|
||||
Ok(db)
|
||||
}
|
93
src/main.rs
93
src/main.rs
@@ -1,11 +1,14 @@
|
||||
extern crate markup5ever_rcdom as rcdom;
|
||||
extern crate html5ever;
|
||||
|
||||
use std::env;
|
||||
use std::{env, rc::Rc};
|
||||
use db::connect;
|
||||
use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
|
||||
use rcdom::RcDom;
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use rcdom::{Node, RcDom};
|
||||
use surrealdb::{engine::remote::ws::Client, Surreal};
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
mod db;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
@@ -15,10 +18,17 @@ async fn main() {
|
||||
let url = "https://oliveratkinson.net";
|
||||
let budget = "10";
|
||||
|
||||
let dom = get(url).await;
|
||||
let db = connect().await.expect("Failed to connect to db, aborting.");
|
||||
|
||||
warn!("Walking...");
|
||||
walk(&dom, &db, url).await;
|
||||
}
|
||||
|
||||
async fn get(url: &str) -> Rc<Node> {
|
||||
let response = reqwest::get(url).await.unwrap();
|
||||
let data = response.text().await.unwrap();
|
||||
|
||||
|
||||
let opts = ParseOpts {
|
||||
tree_builder: TreeBuilderOpts {
|
||||
drop_doctype: true,
|
||||
@@ -31,28 +41,81 @@ async fn main() {
|
||||
.from_utf8()
|
||||
.read_from(&mut data.as_bytes())
|
||||
.unwrap();
|
||||
|
||||
let a = &dom.document;
|
||||
warn!("Walking...");
|
||||
|
||||
walk(a);
|
||||
dom.document
|
||||
}
|
||||
|
||||
fn walk(node: &rcdom::Handle) {
|
||||
async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
|
||||
|
||||
|
||||
// Should protentailly just check for the record first.
|
||||
let created: Option<db::Record> = match db.create("website").content(db::Website { url: site_name.to_string(), crawled: true } ).await {
|
||||
Ok(e) => e,
|
||||
Err(e) => {
|
||||
match e {
|
||||
surrealdb::Error::Db(_) => todo!(),
|
||||
surrealdb::Error::Api(api) => {
|
||||
match api {
|
||||
surrealdb::error::Api::Query(query) => {
|
||||
error!(query);
|
||||
None
|
||||
},
|
||||
_ => todo!(),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
match &node.data {
|
||||
rcdom::NodeData::Document => (),
|
||||
rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"),
|
||||
rcdom::NodeData::Text { contents } => {},
|
||||
rcdom::NodeData::Comment { contents } => debug!("comment"),
|
||||
rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
|
||||
attrs.borrow().iter().for_each(|attr| {
|
||||
let attrs = attrs.borrow().clone();
|
||||
for attr in attrs {
|
||||
let name = name.local.to_string();
|
||||
let internal = &*attr.value;
|
||||
debug!("element: {name}, attr: {internal}");
|
||||
});
|
||||
let link = &*attr.value;
|
||||
trace!("element: {name}, attr: {link}");
|
||||
|
||||
if name == "a" {
|
||||
if link.starts_with("mailto") {
|
||||
// mailto link, lol
|
||||
warn!("{link}");
|
||||
|
||||
let created: Option<db::Record> = db.create("email").content(db::Email { email: link.to_owned() }).await.unwrap();
|
||||
info!("{:?}", created)
|
||||
|
||||
} else if link.starts_with("http") {
|
||||
// normal link
|
||||
debug!("{link}")
|
||||
} else if link.contains("/") {
|
||||
// possibly a relative link?
|
||||
//
|
||||
// TODO This needs more logic handling. Needs to handle the following cases:
|
||||
//
|
||||
// Absolute links:
|
||||
// /img.png
|
||||
// /file-no-extension
|
||||
//
|
||||
// Realtive Links:
|
||||
//
|
||||
// img.png
|
||||
// file-no-extnesion
|
||||
//
|
||||
let link_name = format!("{site_name}/{link}");
|
||||
debug!("{link_name}");
|
||||
let created: Option<db::Record> = db.create("website").content(db::Website { url: link_name, crawled: false } ).await.unwrap();
|
||||
} else {
|
||||
error!("Unhandled link type: {link}")
|
||||
}
|
||||
}
|
||||
};
|
||||
},
|
||||
rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
|
||||
};
|
||||
|
||||
node.children.borrow().iter().for_each(|n| walk(n));
|
||||
for child in &*node.children.borrow() {
|
||||
Box::pin(walk(&child, db, site_name)).await;
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user