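Fix skipped downloads never being parsed: when the download is skipped, read the existing HTML file off disk so it can still be parsed, and include the file path in the "Failed to open file for testing" error log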

This commit is contained in:
2025-10-09 22:35:01 -06:00
parent 1e59ebd5c4
commit 73216f7003
2 changed files with 56 additions and 49 deletions

View File

@@ -52,7 +52,7 @@ pub async fn check_file_length(file: &PathBuf) -> Option<u64> {
} }
}, },
Err(err) => { Err(err) => {
error!("Failed to open file for testing... {}", err); error!("Failed to open file for testing... {:?} {}", file, err);
}, },
} }
None None

View File

@@ -22,7 +22,7 @@ use opentelemetry_sdk::{metrics::SdkMeterProvider, trace::SdkTracerProvider};
use serde::Deserialize; use serde::Deserialize;
use surrealdb::{engine::remote::ws::Client, Surreal}; use surrealdb::{engine::remote::ws::Client, Surreal};
use tokio::{ use tokio::{
io::{AsyncWriteExt, BufWriter}, io::{AsyncReadExt, AsyncWriteExt, BufWriter},
sync::RwLock, sync::RwLock,
task::JoinSet, task::JoinSet,
}; };
@@ -183,7 +183,7 @@ async fn process_single_thread(
#[instrument(skip(db, reqwest))] #[instrument(skip(db, reqwest))]
/// Downloads and crawls and stores a webpage. /// Downloads and crawls and stores a webpage.
/// It is acceptable to clone `db`, `reqwest`, and `s3` because they all use `Arc`s internally. - Noted by Oliver /// It is acceptable to clone `db`, `reqwest`, and `s3` because they all use `Arc`s internally. - Noted by Oliver
async fn process(site: Website, db: Surreal<Client>, reqwest: reqwest::Client) { async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
// METRICS // METRICS
debug!(url = &site.site.as_str(), "Process: {}", &site.site); debug!(url = &site.site.as_str(), "Process: {}", &site.site);
BEING_PROCESSED.add(1, &[]); BEING_PROCESSED.add(1, &[]);
@@ -251,17 +251,6 @@ async fn process(site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
} }
} }
let update_in_db = async |mut site: Website| {
// update self in db
site.crawled = true;
site.status_code = code.as_u16();
Website::store_all(vec![site.clone()], &db).await;
};
if skip_download {
trace!("Skipping download...");
update_in_db(site).await;
} else {
// make sure that the file is good to go // make sure that the file is good to go
if let Some(file) = filesystem::init(&tmp_path).await { if let Some(file) = filesystem::init(&tmp_path).await {
// Get body from response // Get body from response
@@ -269,9 +258,26 @@ async fn process(site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
let mut stream = response.bytes_stream(); let mut stream = response.bytes_stream();
let should_parse = real_path.to_string_lossy().ends_with(".html"); let should_parse = real_path.to_string_lossy().ends_with(".html");
let mut writer = BufWriter::new(file);
let mut buf: Vec<u8> = Vec::new(); let mut buf: Vec<u8> = Vec::new();
if skip_download && should_parse {
// since we are skipping the download we will just read the file off the disk to
// parse it
if let Ok(mut file) = tokio::fs::OpenOptions::new()
.read(true)
.open(&real_path).await
{
if let Err(err) = file.read_to_end(&mut buf).await {
warn!("Failed to read file off disk for parsing, {}", err);
}
}
}
// !!!DOWNLOADING TIME!!!
if !skip_download {
let mut writer = BufWriter::new(file);
// Write file to disk // Write file to disk
trace!("Writing at: {:?}", tmp_path); trace!("Writing at: {:?}", tmp_path);
BEING_STREAMED.add(1, &[]); BEING_STREAMED.add(1, &[]);
@@ -306,6 +312,7 @@ async fn process(site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
// stream_span.end(); // stream_span.end();
BEING_STREAMED.add(-1, &[]); BEING_STREAMED.add(-1, &[]);
}
// (If needed) Parse the file // (If needed) Parse the file
if should_parse { if should_parse {
@@ -333,10 +340,10 @@ async fn process(site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
} }
// update self in db // update self in db
update_in_db(site).await; site.crawled = true;
site.status_code = code.as_u16();
Website::store_all(vec![site.clone()], &db).await;
} }
}
} else { } else {
error!(url = site.site.as_str(), "Failed to get: {}", &site.site); error!(url = site.site.as_str(), "Failed to get: {}", &site.site);
} }