From 1b9964a938fcb0814bf2789185ee38d240a74844 Mon Sep 17 00:00:00 2001
From: Rushmore75
Date: Thu, 20 Mar 2025 14:33:27 -0600
Subject: [PATCH] not yet

---
 Cargo.lock      | 38 ++++++++++----------
 Cargo.toml      |  2 +-
 Crawler.toml    |  8 ++---
 src/db.rs       | 93 +++++++++++++++++++++++++++++--------------------
 src/main.rs     | 20 ++++++-----
 src/setup.surql |  3 +-
 6 files changed, 93 insertions(+), 71 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index dd31000..c55ee61 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1961,6 +1961,25 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "internet_mapper"
+version = "0.1.0"
+dependencies = [
+ "base64 0.22.1",
+ "html5ever 0.29.1",
+ "metrics",
+ "metrics-exporter-prometheus",
+ "minio",
+ "reqwest",
+ "serde",
+ "surrealdb",
+ "tokio",
+ "toml",
+ "tracing",
+ "tracing-subscriber",
+ "url",
+]
+
 [[package]]
 name = "ipnet"
 version = "2.11.0"
@@ -4112,25 +4131,6 @@ version = "2.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
 
-[[package]]
-name = "surreal_spider"
-version = "0.1.0"
-dependencies = [
- "base64 0.22.1",
- "html5ever 0.29.1",
- "metrics",
- "metrics-exporter-prometheus",
- "minio",
- "reqwest",
- "serde",
- "surrealdb",
- "tokio",
- "toml",
- "tracing",
- "tracing-subscriber",
- "url",
-]
-
 [[package]]
 name = "surrealdb"
 version = "2.2.1"
diff --git a/Cargo.toml b/Cargo.toml
index 6a1d88e..fa499e4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "surreal_spider"
+name = "internet_mapper"
 version = "0.1.0"
 edition = "2021"
 
diff --git a/Crawler.toml b/Crawler.toml
index df0a253..a0dbad5 100644
--- a/Crawler.toml
+++ b/Crawler.toml
@@ -3,13 +3,13 @@ surreal_url = "localhost:8000"
 surreal_username = "root"
 surreal_password = "root"
 surreal_ns = "test"
-surreal_db = "v1.15.4"
+surreal_db = "v1.16"
 
 # Minio config
-s3_bucket = "v1.15.4"
+s3_bucket = "v1.16"
 s3_url = "http://localhost:9000"
-s3_access_key = "3ptjsHhRHCHlpCmgFy9n"
-s3_secret_key = "68CmV07YExeCxb8kJhosSauEizj5CAE7PINZIfQz"
+s3_access_key = "DwJfDDVIbmCmfAblwSqp"
+s3_secret_key = "V4UqvC1Vm4AwLE5FAhu2gxlMfvexTBQnDxuy8uZx"
 
 # Crawler config
 crawl_filter = "en.wikipedia.com"
diff --git a/src/db.rs b/src/db.rs
index 8aae6e8..4348031 100644
--- a/src/db.rs
+++ b/src/db.rs
@@ -5,7 +5,7 @@ use base64::{
 };
 use metrics::counter;
 use serde::{ser::SerializeStruct, Deserialize, Serialize};
-use std::{fmt::Debug, sync::LazyLock, time::Instant};
+use std::{collections::HashSet, fmt::Debug, sync::LazyLock, time::Instant};
 use surrealdb::{
     engine::remote::ws::{Client, Ws},
     opt::auth::Root,
@@ -29,7 +29,7 @@ const TIME_SPENT_ON_LOCK: &'static str = "surql_lock_waiting_ms";
 const STORE: &'static str = "surql_store_calls";
 const LINK: &'static str = "surql_link_calls";
 
-#[derive(Deserialize, Clone)]
+#[derive(Deserialize, Clone, Hash, Eq, PartialEq)]
 pub struct Website {
     /// The url that this data is found at
     pub site: Url,
@@ -40,12 +40,13 @@ pub struct Website {
 impl Serialize for Website {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
     where
-        S: serde::Serializer {
-            let mut state = serializer.serialize_struct("Website", 2)?;
-            state.serialize_field("crawled", &self.crawled)?;
-            // to_string() calls the correct naming of site
-            state.serialize_field("site", &self.site.to_string())?;
-            state.end()
+        S: serde::Serializer,
+    {
+        let mut state = serializer.serialize_struct("Website", 2)?;
+        state.serialize_field("crawled", &self.crawled)?;
+        // to_string() calls the correct naming of site
+        state.serialize_field("site", &self.site.to_string())?;
+        state.end()
     }
 }
@@ -63,10 +64,7 @@ impl Website {
             Ok(a) => a,
             Err(_) => todo!(),
         };
-        Self {
-            crawled,
-            site
-        }
+        Self { crawled, site }
     }
 
     pub fn set_crawled(&mut self) {
@@ -78,7 +76,10 @@ impl Website {
         let domain = match site.domain() {
             Some(s) => s.to_string(),
             None => {
-                warn!("Failed to get domain of URL: {}, falling back to 'localhost'", site.to_string());
+                warn!(
+                    "Failed to get domain of URL: {}, falling back to 'localhost'",
+                    site.to_string()
+                );
                 "localhost".to_string()
             }
         };
@@ -86,6 +87,7 @@ impl Website {
 
         domain + path
     }
+
     pub fn get_url_as_b64_path(site: &Url) -> String {
         let domain = site.domain().unwrap_or("DOMAIN").to_string();
         let path = &CUSTOM_ENGINE.encode(site.path());
@@ -99,7 +101,7 @@ impl Website {
         if len == 0 {
             return;
         }
-        
+
         let from = &self.site;
 
         // let to = other.site.to_string();
@@ -126,7 +128,10 @@ impl Website {
                         trace!("Link for {from} OK - {num}/{len}");
                         return;
                     } else {
-                        error!("Didn't link all the records. {num}/{len}. Surreal response: {:?}", e);
+                        error!(
+                            "Didn't link all the records. {num}/{len}. Surreal response: {:?}",
+                            e
+                        );
                         return;
                     }
                 }
@@ -139,48 +144,61 @@ impl Website {
         }
     }
 
+    pub async fn store_self(&self, db: &Surreal<Client>) {
+        counter!(STORE).increment(1);
+
+        db.query(
+            "INSERT INTO website $self
+                ON DUPLICATE KEY UPDATE
+                    crawled = crawled OR $input.crawled
+                RETURN VALUE id;
+            ",
+        )
+        .await
+        .expect("Failed to store self");
+    }
+
     // Insert ever item in the vec into surreal, crawled state will be preserved as TRUE
     // if already in the database as such or incoming data is TRUE.
-    pub async fn store_all(all: Vec<Website>, db: &Surreal<Client>) -> Vec<Thing> {
+    pub async fn store_all(all: HashSet<Website>, db: &Surreal<Client>) -> Vec<Thing> {
+        // NOTES:
+        //  * all incoming Websites come in as !crawled
+        //  * there are potentially duplicates in all
+
         counter!(STORE).increment(1);
-        let mut things = Vec::with_capacity(all.len());
 
-        // TODO this only allows for one thread to be in the database at a time.
-        // This is currently required since otherwise we get write errors.
-        // If the default `crawled` is set to false, we might not need to write any more
-        // than just the name. `accessed_at` is fun but not needed.
         let now = Instant::now();
         let lock = LOCK.lock().await;
         counter!(TIME_SPENT_ON_LOCK).increment(now.elapsed().as_millis() as u64);
 
+        let mut results = Vec::with_capacity(all.len());
+
         match db
             .query(
+                // TODO making this an upsert would make sense, but
+                // upserts seem to be broken.
+                //
+                // Doesn't look like upsert can take in an array, so insert
+                // it is...
+                //
                 "INSERT INTO website $array
                     ON DUPLICATE KEY UPDATE
-                        accessed_at = time::now(),
-                        crawled = crawled OR $input.crawled
-                    RETURN VALUE id;
-                ",
+                        last_write = time::now()
+                    RETURN VALUE id;" ,
             )
             .bind(("array", all))
             .await
         {
             Ok(mut id) => match id.take::<Vec<Thing>>(0) {
-                Ok(mut x) => things.append(&mut x),
-                Err(err) => match err {
-                    Api(error) => {
-                        eprintln!("{:?}", error);
-                        error!("{:?}", error);
-                    }
-                    _ => error!("{:?}", err),
-                },
+                Ok(mut x) => results.append(&mut x),
+                Err(err) => {
+                    error!("{:?}", err);
+                }
             },
-            Err(err) => {
-                error!("{:?}", err);
-            }
+            Err(err) => error!("{:?}", err),
         }
 
         drop(lock);
-        things
+        results
     }
 }
@@ -224,3 +242,4 @@ pub async fn connect(config: &Config) -> surrealdb::Result<Surreal<Client>> {
 
     Ok(db)
 }
+
diff --git a/src/main.rs b/src/main.rs
index 1d02d56..8127842 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3,10 +3,7 @@ extern crate html5ever;
 
 use std::{
-    fs::File,
-    io::Read,
-    net::{IpAddr, Ipv4Addr},
-    time::Instant,
+    collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr}, time::Instant
 };
 
 use db::{connect, Website};
@@ -175,6 +172,7 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
     // METRICS
     let g = gauge!(GET_IN_FLIGHT);
     g.increment(1);
+    counter!(GET_METRIC).increment(1);
     let timer = Timer::start("Got page");
 
     // Send the http request (get)
@@ -183,7 +181,6 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
     // METRICS
     timer.stop();
     g.decrement(1);
-    counter!(GET_METRIC).increment(1);
 
     // Get body from response
     let data = response
@@ -198,11 +195,18 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
 
     // update self in db
     site.set_crawled();
-    Website::store_all(vec![site.clone()], &db).await;
+    site.store_self(&db).await;
+
+    // de duplicate this list
+    let set = sites.iter().fold(HashSet::new(), |mut set, item| {
+        // TODO seems kinda dumb to clone everything.
+        set.insert(item.clone());
+        set
+    });
+    trace!("Shrunk items to store from {} to {}", sites.len(), set.len());
 
     // Store all the other sites so that we can link to them.
-    // let mut links_to = Vec::new();
-    let others = Website::store_all(sites, &db).await;
+    let others = Website::store_all(set, &db).await;
 
     // Make the database's links reflect the html links between sites
     site.links_to(others, &db).await;
diff --git a/src/setup.surql b/src/setup.surql
index 17b89f5..a3f73b1 100644
--- a/src/setup.surql
+++ b/src/setup.surql
@@ -5,5 +5,4 @@ DEFINE INDEX IF NOT EXISTS idx ON TABLE website COLUMNS site UNIQUE;
 
 DEFINE FIELD IF NOT EXISTS crawled ON TABLE website TYPE bool;
 
-DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now();
-DEFINE FIELD IF NOT EXISTS first_accessed_at ON TABLE website VALUE time::now();
+DEFINE FIELD IF NOT EXISTS created ON TABLE website VALUE time::now();