From 50606bb69e7b738c3f6825f389d71182d8e2f015 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Thu, 17 Apr 2025 09:59:23 -0600 Subject: [PATCH 1/2] It isnt quite working yet --- Crawler.toml | 14 +++++++------- src/db.rs | 10 +++++----- src/main.rs | 6 ++++-- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/Crawler.toml b/Crawler.toml index e57d08b..8cb179b 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -3,12 +3,12 @@ surreal_url = "localhost:8000" surreal_username = "root" surreal_password = "root" surreal_ns = "test" -surreal_db = "v1.20.3" +surreal_db = "v1.21.1" # Crawler config -crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" -# crawl_filter = "https://oliveratkinson.net" -start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" -# start_url = "https://oliveratkinson.net" -budget = 100 -batch_size = 5 +# crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" +crawl_filter = "https://oliveratkinson.net" +# start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" +start_url = "https://oliveratkinson.net" +budget = 1000 +batch_size = 500 diff --git a/src/db.rs b/src/db.rs index 49fbd6e..2cccfa7 100644 --- a/src/db.rs +++ b/src/db.rs @@ -20,6 +20,8 @@ pub struct Website { pub site: Url, /// Wether or not this link has been crawled yet pub crawled: bool, + /// 200, 404, etc + pub status_code: u16, } // manual impl to make tracing look nicer @@ -38,14 +40,11 @@ impl Website { }; Self { crawled, - site + site, + status_code: 0, } } - pub fn set_crawled(&mut self) { - self.crawled = true - } - // Insert ever item in the vec into surreal, crawled state will be preserved as TRUE // if already in the database as such or incoming data is TRUE. #[instrument(skip(db))] @@ -58,6 +57,7 @@ impl Website { "INSERT INTO website $array ON DUPLICATE KEY UPDATE accessed_at = time::now(), + status_code = $input.status_code, crawled = crawled OR $input.crawled RETURN VALUE id; ", diff --git a/src/main.rs b/src/main.rs index c0898a1..66c0fd5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -173,8 +173,8 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien // Send the http request (get) if let Ok(response) = request_builder.send().await { - // Get body from response let headers = response.headers(); + let code = response.status(); #[allow(non_snake_case)] let CT = headers.get("Content-Type"); @@ -201,6 +201,7 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien let mut writer = BufWriter::new(file); + // Get body from response // stream the response onto the disk let mut stream = response.bytes_stream(); @@ -243,7 +244,8 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien counter!(GET_METRIC).increment(1); // update self in db - site.set_crawled(); + site.crawled = true; + site.status_code = code.as_u16(); Website::store_all(vec![site], &db).await; } else { error!("File failed to cooperate: {:?}", path); -- 2.47.2 From 6790061e22005b9c9e5a313fe0502be018130614 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Wed, 9 Jul 2025 15:58:22 -0600 Subject: [PATCH 2/2] helper code --- src/db.rs | 7 ++++++- src/main.rs | 28 ++++++++++++++++++---------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/db.rs b/src/db.rs index 2cccfa7..9778edb 100644 --- a/src/db.rs +++ b/src/db.rs @@ -27,7 +27,11 @@ pub struct Website { // manual impl to make tracing look nicer impl Debug for Website { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("Website").field("site", &self.site).finish() + f.debug_struct("Website") + .field("host", &self.site.host()) + .field("path", &self.site.path()) + .field("status_code", &self.status_code) + .finish() } } @@ -52,6 +56,7 @@ impl Website { counter!(STORE).increment(1); let mut things = Vec::with_capacity(all.len()); + // FIXME failes *sometimes* because "Resource Busy" match db .query( "INSERT INTO website $array diff --git a/src/main.rs b/src/main.rs index 66c0fd5..289f351 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,4 @@ #![feature(ip_from)] -#![feature(async_closure)] #![warn(clippy::expect_used)] #![deny(clippy::unwrap_used)] @@ -20,7 +19,7 @@ use metrics_exporter_prometheus::PrometheusBuilder; use serde::Deserialize; use surrealdb::{engine::remote::ws::Client, Surreal}; use tokio::{io::{AsyncWriteExt, BufWriter}, task::JoinSet}; -use tracing::{debug, error, info, instrument, level_filters::LevelFilter, trace, trace_span, warn}; +use tracing::{debug, debug_span, error, info, instrument, level_filters::LevelFilter, trace, trace_span, warn}; use tracing_subscriber::{fmt, layer::SubscriberExt, EnvFilter, Layer, Registry}; mod db; @@ -180,8 +179,6 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien let CT = headers.get("Content-Type"); let ct = headers.get("content-type"); - if CT.is_none() && ct.is_none() { - } let ct = match (CT,ct) { (None, None) => { warn!("Server did not respond with Content-Type header. Url: {} Headers: ({:?})", site.site.to_string(), headers); @@ -192,19 +189,20 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien (Some(a), Some(_)) => a, }; + // create filepath (handles / -> /index.html) let path = filesystem::as_path(&site.site, ct); // make sure that the file is good to go if let Some(file) = filesystem::init(&path).await { - let should_parse = path.to_string_lossy().ends_with(".html"); - let mut buf: Vec = Vec::new(); - - let mut writer = BufWriter::new(file); - // Get body from response // stream the response onto the disk let mut stream = response.bytes_stream(); + let should_parse = path.to_string_lossy().ends_with(".html"); + let mut writer = BufWriter::new(file); + let mut buf: Vec = Vec::new(); + + // Write file to disk info!("Writing at: {:?}", path); while let Some(data) = stream.next().await { match data { @@ -212,6 +210,7 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien let _ = writer.write_all(&data).await; // If we are going to parse this file later, we will save it // into memory as well as the disk. + // We do this because the data here might be incomplete if should_parse { data.iter().for_each(|f| buf.push(*f)); } @@ -223,7 +222,12 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien } let _ = writer.flush(); + + // (If needed) Parse the file if should_parse { + let span = debug_span!("Should Parse"); + let enter = span.enter(); + // Parse document and get relationships let sites = parser::parse(&site, &buf).await; // De-duplicate this list @@ -237,6 +241,8 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien trace!("Saved {diff} from being entered into the db by de-duping"); // Store all the other sites so that we can link to them. let _ = Website::store_all(de_dupe_sites, &db).await; + + drop(enter); } // METRICS @@ -246,10 +252,12 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien // update self in db site.crawled = true; site.status_code = code.as_u16(); - Website::store_all(vec![site], &db).await; + Website::store_all(vec![site.clone()], &db).await; } else { error!("File failed to cooperate: {:?}", path); } + + trace!("Done processing: {}", &site.site); } else { error!("Failed to get: {}", &site.site); } -- 2.47.2