From c3997b0bb7b32d63995a0d2ed03eefb0ed9988eb Mon Sep 17 00:00:00 2001 From: oliver Date: Sat, 9 Nov 2024 11:30:32 -0700 Subject: [PATCH] works more, but still not all the way --- .vscode/launch.json | 3 + Cargo.lock | 316 +++++++++++++++++++++++++++++++++++++++++--- Cargo.toml | 3 +- src/db.rs | 132 +++++++++++++++++- src/main.rs | 121 +++++++++-------- 5 files changed, 498 insertions(+), 77 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 302a13f..93262ee 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -8,6 +8,9 @@ "type": "lldb", "request": "launch", "name": "Debug executable 'surreal_spider'", + "env": { + "RUST_LOG": "surreal_spider=trace", + }, "cargo": { "args": [ "build", diff --git a/Cargo.lock b/Cargo.lock index ac1146f..f256eb1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -829,6 +829,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.85", +] + [[package]] name = "dmp" version = "0.2.0" @@ -1438,6 +1449,124 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version 
= "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.85", +] + [[package]] name = "ident_case" version = "1.0.1" @@ -1446,12 +1575,23 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.5.0" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" dependencies = [ - "unicode-bidi", - "unicode-normalization", + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", ] [[package]] @@ -1562,7 +1702,7 @@ dependencies = [ "petgraph", "pico-args", "regex", - "regex-syntax", + "regex-syntax 0.8.5", "string_cache", "term", "tiny-keccak", @@ -1576,7 +1716,7 @@ version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "507460a910eb7b32ee961886ff48539633b788a36b65692b95f225b844c82553" dependencies = [ - "regex-automata", + "regex-automata 0.4.8", ] [[package]] @@ -1634,6 +1774,12 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "litemap" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" + [[package]] name = "lock_api" version = "0.4.12" @@ -1702,6 +1848,15 @@ dependencies = [ "xml5ever", ] +[[package]] +name = "matchers" +version = "0.1.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata 0.1.10", +] + [[package]] name = "matrixmultiply" version = "0.3.9" @@ -2516,8 +2671,17 @@ checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", - "regex-automata", - "regex-syntax", + "regex-automata 0.4.8", + "regex-syntax 0.8.5", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", ] [[package]] @@ -2528,9 +2692,15 @@ checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.5", ] +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.8.5" @@ -3273,6 +3443,7 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", + "url", ] [[package]] @@ -3451,6 +3622,17 @@ dependencies = [ "futures-core", ] +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.85", +] + [[package]] name = "system-configuration" version = "0.6.1" @@ -3583,6 +3765,16 @@ dependencies = [ "crunchy", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = 
"tinyvec" version = "1.8.0" @@ -3750,10 +3942,14 @@ version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" dependencies = [ + "matchers", "nu-ansi-term", + "once_cell", + "regex", "sharded-slab", "smallvec", "thread_local", + "tracing", "tracing-core", "tracing-log", ] @@ -3826,12 +4022,6 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df" -[[package]] -name = "unicode-bidi" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" - [[package]] name = "unicode-ident" version = "1.0.13" @@ -3883,13 +4073,14 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.2" +version = "2.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] @@ -3904,6 +4095,18 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "uuid" version = "1.11.0" @@ -4241,6 +4444,18 @@ dependencies = [ "memchr", ] +[[package]] +name = 
"write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + [[package]] name = "ws_stream_wasm" version = "0.7.4" @@ -4280,6 +4495,30 @@ dependencies = [ "markup5ever 0.14.0", ] +[[package]] +name = "yoke" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.85", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.7.35" @@ -4301,8 +4540,51 @@ dependencies = [ "syn 2.0.85", ] +[[package]] +name = "zerofrom" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.85", + "synstructure", +] + [[package]] name = "zeroize" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.85", +] diff --git a/Cargo.toml b/Cargo.toml index d1fdef0..b07be1d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,4 +11,5 @@ serde = { version = "1.0.214", features = ["derive"] } surrealdb = "2.0.4" tokio = { version="1.41.0", features = ["full"] } tracing = "0.1.40" -tracing-subscriber = "0.3.18" +tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +url = { version = "2.5.3", features = ["serde"] } diff --git a/src/db.rs b/src/db.rs index 9d1cfa9..f12593e 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,22 +1,140 @@ use serde::{Deserialize, Serialize}; -use surrealdb::{engine::remote::ws::{Client, Ws}, opt::auth::Root, sql::Thing, Surreal}; +use surrealdb::{ + engine::remote::ws::{Client, Ws}, + opt::auth::Root, + sql::Thing, + Surreal, +}; +use tracing::{debug, error, info, instrument}; +use url::Url; -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Deserialize, Clone)] pub struct Website { - pub site: String, - pub href: String, - pub crawled: bool + /// The url that this data is found at + site: Url, + /// The url as defined in the tag + href: Url, + /// Whether or not this link has been crawled yet + crawled: bool, + /// Whether or not the href was doctored + doctored_href: bool, + original_href: Option, +} + +impl Website { + /// Creates a blank site (assumes that url param is site's root) + pub fn new(url: &str, href: &str, crawled: bool) -> Self { + let mut new = Self::from(url); + new.crawled = crawled; + new.original_href = Some(href.to_string()); + new.href = + match Url::parse(href) { + Ok(e) => e, + Err(e) => { + match e { +
url::ParseError::RelativeUrlWithoutBase => { + // Try to combine the scheme_host and href to get a useable domain + new.doctored_href = true; + + let url = if !url.ends_with('/') && !href.starts_with('/') { + format!("{url}/{href}") + } else { + format!("{url}{href}") + }; + + // paste the domain onto the beginning of the href + Url::parse(&url).map_or_else(|err| { + debug!("Parsing {url} with {href}"); + error!("{err} Failed to parse href into url on second try. Aborting"); + panic!("See error logs for more info."); + }, |ok| ok) + } + _ => { + error!("{e}"); + panic!("See error logs for more info."); + } + } + } + }; + new + } + pub fn crawled(&mut self) { + self.crawled = true + } + pub fn href_str(&self) -> &str { + self.href.as_str() + } + pub fn site(&self) -> String { + self.site.to_string() + } + pub fn domain_str(&self) -> &str { + self.site.as_str() + } + #[instrument(skip_all)] + pub async fn store(&mut self, db: &Surreal) { + // is root record? + if self.href.path() == "/" { + // Upsert is create or update + // Whereas Update is just update + let record = ("website", &self.href.to_string()); + + let crawled = if let Some(old) = db.select(record).await.unwrap() { + let old: Website = old; // infer type + old.crawled + } else {false}; + + if !self.crawled {self.crawled = crawled}; + + match db.upsert(record).content(self.clone()).await { + Ok(e) => { + if let Some(a) = &e { + let _: &Record = a; + } + } + Err(e) => { + error!("{}", e); + }, + }; + } else { + let _: Option = match db.create("website").content(self.clone()).await { + Ok(e) => { + if let Some(a) = &e { + let _: &Record = a; + } + e + } + Err(_) => todo!(), + }; + } + } +} + +impl From<&str> for Website { + /// site == href, crawled = false + fn from(value: &str) -> Self { + let site = match Url::parse(value) { + Ok(a) => a, + Err(_) => todo!(), + }; + Self { + href: site.clone(), + crawled: false, + site, + doctored_href: false, + original_href: None, + } + } } #[derive(Debug, Serialize)] pub
struct Email { - pub email: String + pub email: String, } #[derive(Debug, Deserialize)] pub struct Record { #[allow(dead_code)] - id: Thing, + pub id: Thing, } pub async fn connect() -> surrealdb::Result> { diff --git a/src/main.rs b/src/main.rs index 40286b8..af2fdde 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,87 +2,97 @@ extern crate markup5ever_rcdom as rcdom; extern crate html5ever; use std::rc::Rc; -use db::connect; +use db::{connect, Website}; use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts}; use rcdom::{Node, RcDom}; use surrealdb::{engine::remote::ws::Client, Surreal}; -use tracing::{debug, error, info, warn}; +use tracing::{debug, info, instrument}; +use tracing_subscriber::EnvFilter; mod db; #[tokio::main] async fn main() { - tracing_subscriber::fmt::init(); + tracing_subscriber::fmt() + .with_env_filter(EnvFilter::from_default_env()) + .with_line_number(true) + .without_time() + .init(); debug!("Starting..."); - let url = "https://oliveratkinson.net"; - - let db = connect().await.expect("Failed to connect to db, aborting."); - let dom = get(url).await; + // Would probably take these in as parameters from a cli + let url = "https://oliveratkinson.net/"; + let budget = 50; + let mut crawled = 0; - walk(&dom, &db, url).await; + let db = connect().await.expect("Failed to connect to db, aborting."); + + // Kick off the whole machine - This Website object doesn't matter, it's just to allow for + // get() to work. 
+ let mut site = Website::from(url); + let dom = get(&mut site, &db).await.expect("Inital page returned None."); + crawled += 1; + walk(&dom, &db, &site).await; + + while crawled < budget { + let uncrawled = get_uncrawled_links(&db).await; + debug!("Crawling {} pages...", uncrawled.len()); + + for mut site in uncrawled { + if let Some(dom) = get(&mut site, &db).await { + walk(&dom, &db, &site).await; + crawled += 1; + let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32); + info!("Crawled {crawled} out of {budget} pages. ({percent})"); + } + } + } info!("Done"); } -async fn get(url: &str) -> Rc { - let response = reqwest::get(url).await.unwrap(); - let data = response.text().await.unwrap(); - - let opts = ParseOpts { - tree_builder: TreeBuilderOpts { - drop_doctype: true, +#[instrument(skip_all)] +/// A quick helper function for downloading a url +async fn get(site: &mut Website, db: &Surreal) -> Option> { + if let Ok(response) = reqwest::get(site.href_str()).await { + let data = response.text().await.unwrap(); + let opts = ParseOpts { + tree_builder: TreeBuilderOpts { + drop_doctype: true, + ..Default::default() + }, ..Default::default() - }, - ..Default::default() - }; + }; + + let dom = parse_document(RcDom::default(), opts) + .from_utf8() + .read_from(&mut data.as_bytes()) + .unwrap(); - let dom = parse_document(RcDom::default(), opts) - .from_utf8() - .read_from(&mut data.as_bytes()) - .unwrap(); - dom.document + site.crawled(); + site.store(db).await; + return Some(dom.document); + } + None } -async fn walk(node: &rcdom::Handle, db: &Surreal , site_name: &str) { +/// Walks the given site, placing its findings in the database +async fn walk(node: &rcdom::Handle, db: &Surreal , site_name: &Website) { // Insert Or Update - let _: Option> = match db.upsert(("website", site_name)).content(db::Website { href: String::from("/"), crawled: true, site: site_name.to_string() } ).await { - Ok(e) => { - // Return this for type coercion - e - }, -
Err(e) => { - // error!("{}", e); - None - } - }; + // create_root(site_name, db).await; match &node.data { rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => { for attr in attrs.borrow().clone() { - let name = name.local.to_string(); - if name == "a" { + if name.local.to_string() == "a" { if attr.value.starts_with("mailto") { // mailto link, lol - let created: Option = db.create("email").content(db::Email { + let _created: Option = db.create("email").content(db::Email { email: attr.value.to_string() }).await.unwrap(); - warn!("{:?}", created) } else { - // FIXME this isn't actually creating records...? - let _: Option = match db.create("website").content(db::Website { - href: attr.value.to_string(), - crawled: false, - site: site_name.to_string() - }).await { - Ok(e) => { - if let Some(a) = &e { - debug!("{:?}", a); - } - e - }, - Err(_) => todo!(), - }; + let mut web = Website::new(&site_name.site(), &attr.value, false); + web.store(db).await; } } }; @@ -94,3 +104,10 @@ async fn walk(node: &rcdom::Handle, db: &Surreal , site_name: &str) { Box::pin(walk(child, db, site_name)).await; } } + +/// Returns 0-50 uncrawled links (LIMIT = 50) +async fn get_uncrawled_links(db: &Surreal) -> Vec { + let mut response = db.query("SELECT * FROM website WHERE crawled = false LIMIT 50").await.expect("Hard-coded query failed..?"); + response.take(0).expect("Returned websites couldn't be parsed") +} +