fix the issue where nothing works
This commit is contained in:
@@ -52,7 +52,7 @@ pub async fn check_file_length(file: &PathBuf) -> Option<u64> {
|
||||
}
|
||||
},
|
||||
Err(err) => {
|
||||
error!("Failed to open file for testing... {}", err);
|
||||
error!("Failed to open file for testing... {:?} {}", file, err);
|
||||
},
|
||||
}
|
||||
None
|
||||
|
41
src/main.rs
41
src/main.rs
@@ -22,7 +22,7 @@ use opentelemetry_sdk::{metrics::SdkMeterProvider, trace::SdkTracerProvider};
|
||||
use serde::Deserialize;
|
||||
use surrealdb::{engine::remote::ws::Client, Surreal};
|
||||
use tokio::{
|
||||
io::{AsyncWriteExt, BufWriter},
|
||||
io::{AsyncReadExt, AsyncWriteExt, BufWriter},
|
||||
sync::RwLock,
|
||||
task::JoinSet,
|
||||
};
|
||||
@@ -183,7 +183,7 @@ async fn process_single_thread(
|
||||
#[instrument(skip(db, reqwest))]
|
||||
/// Downloads and crawls and stores a webpage.
|
||||
/// It is acceptable to clone `db`, `reqwest`, and `s3` because they all use `Arc`s internally. - Noted by Oliver
|
||||
async fn process(site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
|
||||
async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
|
||||
// METRICS
|
||||
debug!(url = &site.site.as_str(), "Process: {}", &site.site);
|
||||
BEING_PROCESSED.add(1, &[]);
|
||||
@@ -251,17 +251,6 @@ async fn process(site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
|
||||
}
|
||||
}
|
||||
|
||||
let update_in_db = async |mut site: Website| {
|
||||
// update self in db
|
||||
site.crawled = true;
|
||||
site.status_code = code.as_u16();
|
||||
Website::store_all(vec![site.clone()], &db).await;
|
||||
};
|
||||
|
||||
if skip_download {
|
||||
trace!("Skipping download...");
|
||||
update_in_db(site).await;
|
||||
} else {
|
||||
// make sure that the file is good to go
|
||||
if let Some(file) = filesystem::init(&tmp_path).await {
|
||||
// Get body from response
|
||||
@@ -269,9 +258,26 @@ async fn process(site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
|
||||
let mut stream = response.bytes_stream();
|
||||
|
||||
let should_parse = real_path.to_string_lossy().ends_with(".html");
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
|
||||
if skip_download && should_parse {
|
||||
// since we are skipping the download we will just read the file off the disk to
|
||||
// parse it
|
||||
if let Ok(mut file) = tokio::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&real_path).await
|
||||
{
|
||||
if let Err(err) = file.read_to_end(&mut buf).await {
|
||||
warn!("Failed to read file off disk for parsing, {}", err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// !!!DOWNLOADING TIME!!!
|
||||
if !skip_download {
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
// Write file to disk
|
||||
trace!("Writing at: {:?}", tmp_path);
|
||||
BEING_STREAMED.add(1, &[]);
|
||||
@@ -306,6 +312,7 @@ async fn process(site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
|
||||
|
||||
// stream_span.end();
|
||||
BEING_STREAMED.add(-1, &[]);
|
||||
}
|
||||
|
||||
// (If needed) Parse the file
|
||||
if should_parse {
|
||||
@@ -333,10 +340,10 @@ async fn process(site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
|
||||
}
|
||||
|
||||
// update self in db
|
||||
update_in_db(site).await;
|
||||
site.crawled = true;
|
||||
site.status_code = code.as_u16();
|
||||
Website::store_all(vec![site.clone()], &db).await;
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
error!(url = site.site.as_str(), "Failed to get: {}", &site.site);
|
||||
}
|
||||
|
Reference in New Issue
Block a user