From 6790061e22005b9c9e5a313fe0502be018130614 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Wed, 9 Jul 2025 15:58:22 -0600 Subject: [PATCH] helper code --- src/db.rs | 7 ++++++- src/main.rs | 28 ++++++++++++++++++---------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/db.rs b/src/db.rs index 2cccfa7..9778edb 100644 --- a/src/db.rs +++ b/src/db.rs @@ -27,7 +27,11 @@ pub struct Website { // manual impl to make tracing look nicer impl Debug for Website { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("Website").field("site", &self.site).finish() + f.debug_struct("Website") + .field("host", &self.site.host()) + .field("path", &self.site.path()) + .field("status_code", &self.status_code) + .finish() } } @@ -52,6 +56,7 @@ impl Website { counter!(STORE).increment(1); let mut things = Vec::with_capacity(all.len()); + // FIXME failes *sometimes* because "Resource Busy" match db .query( "INSERT INTO website $array diff --git a/src/main.rs b/src/main.rs index 66c0fd5..289f351 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,4 @@ #![feature(ip_from)] -#![feature(async_closure)] #![warn(clippy::expect_used)] #![deny(clippy::unwrap_used)] @@ -20,7 +19,7 @@ use metrics_exporter_prometheus::PrometheusBuilder; use serde::Deserialize; use surrealdb::{engine::remote::ws::Client, Surreal}; use tokio::{io::{AsyncWriteExt, BufWriter}, task::JoinSet}; -use tracing::{debug, error, info, instrument, level_filters::LevelFilter, trace, trace_span, warn}; +use tracing::{debug, debug_span, error, info, instrument, level_filters::LevelFilter, trace, trace_span, warn}; use tracing_subscriber::{fmt, layer::SubscriberExt, EnvFilter, Layer, Registry}; mod db; @@ -180,8 +179,6 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien let CT = headers.get("Content-Type"); let ct = headers.get("content-type"); - if CT.is_none() && ct.is_none() { - } let ct = match (CT,ct) { (None, None) => { warn!("Server did not respond with Content-Type header. Url: {} Headers: ({:?})", site.site.to_string(), headers); @@ -192,19 +189,20 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien (Some(a), Some(_)) => a, }; + // create filepath (handles / -> /index.html) let path = filesystem::as_path(&site.site, ct); // make sure that the file is good to go if let Some(file) = filesystem::init(&path).await { - let should_parse = path.to_string_lossy().ends_with(".html"); - let mut buf: Vec = Vec::new(); - - let mut writer = BufWriter::new(file); - // Get body from response // stream the response onto the disk let mut stream = response.bytes_stream(); + let should_parse = path.to_string_lossy().ends_with(".html"); + let mut writer = BufWriter::new(file); + let mut buf: Vec = Vec::new(); + + // Write file to disk info!("Writing at: {:?}", path); while let Some(data) = stream.next().await { match data { @@ -212,6 +210,7 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien let _ = writer.write_all(&data).await; // If we are going to parse this file later, we will save it // into memory as well as the disk. + // We do this because the data here might be incomplete if should_parse { data.iter().for_each(|f| buf.push(*f)); } @@ -223,7 +222,12 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien } let _ = writer.flush(); + + // (If needed) Parse the file if should_parse { + let span = debug_span!("Should Parse"); + let enter = span.enter(); + // Parse document and get relationships let sites = parser::parse(&site, &buf).await; // De-duplicate this list @@ -237,6 +241,8 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien trace!("Saved {diff} from being entered into the db by de-duping"); // Store all the other sites so that we can link to them. let _ = Website::store_all(de_dupe_sites, &db).await; + + drop(enter); } // METRICS @@ -246,10 +252,12 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien // update self in db site.crawled = true; site.status_code = code.as_u16(); - Website::store_all(vec![site], &db).await; + Website::store_all(vec![site.clone()], &db).await; } else { error!("File failed to cooperate: {:?}", path); } + + trace!("Done processing: {}", &site.site); } else { error!("Failed to get: {}", &site.site); }