Compare commits

4 Commits

SHA1        Message                                         Date
2c339a36f9  handle checking for file better                 2025-10-09 23:00:11 -06:00
73216f7003  fix the issue where nothing works               2025-10-09 22:35:01 -06:00
1e59ebd5c4  even when not downloading, update the database  2025-10-09 22:13:06 -06:00
52d5e101d0  bragging                                        2025-10-09 22:03:19 -06:00
3 changed files with 69 additions and 46 deletions


@@ -40,14 +40,18 @@ $EDITOR Crawler.toml
 - [x] Allow for storing asynchronously - dropping the "links to" logic fixes this need
 - [x] Control crawler via config file (no recompliation needed)
 
-3/17/25: Took >1hr to crawl 100 pages
-3/19/25: Took 20min to crawl 1000 pages
-This ment we stored 1000 pages, 142,997 urls, and 1,425,798 links between the two.
-3/20/25: Took 5min to crawl 1000 pages
-3/21/25: Took 3min to crawl 1000 pages
+### Feats
+3/17/25: Took >1hr to crawl 100 pages.
+3/19/25: Took 20min to crawl 1000 pages.
+This ment we stored 1000 pages, 142,997 urls, and 1,425,798 links between the two.
+3/20/25: Took 5min to crawl 1000 pages.
+3/21/25: Took 3min to crawl 1000 pages.
+7/.../25: Downloaded just shy of 12TB of data from a remote server.
 
 # About


@@ -52,7 +52,10 @@ pub async fn check_file_length(file: &PathBuf) -> Option<u64> {
             }
         },
         Err(err) => {
-            error!("Failed to open file for testing... {}", err);
+            match err.kind() {
+                ErrorKind::NotFound => {/* ignore */},
+                _ => warn!("Failed to open file to check length. {:?} {}", file, err),
+            }
         },
     }
     None
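The new match arm treats a missing file as the expected case (the page simply has not been downloaded yet) and only warns on other I/O errors. As a point of comparison, here is a minimal, hypothetical sketch of the same idea that probes the length with tokio::fs::metadata instead of opening the file; it reuses the signature shown in the hunk header but is not the project's implementation, and it logs with eprintln! rather than the crate's warn! macro so it stays self-contained.

use std::{io::ErrorKind, path::PathBuf};

// Sketch only: probe the on-disk length without opening the file.
pub async fn check_file_length(file: &PathBuf) -> Option<u64> {
    match tokio::fs::metadata(file).await {
        Ok(meta) => Some(meta.len()),
        Err(err) => {
            match err.kind() {
                // Not downloaded yet; this is the normal case, so stay quiet.
                ErrorKind::NotFound => { /* ignore */ }
                // Anything else (permissions, I/O trouble) is worth surfacing.
                _ => eprintln!("Failed to check length of {:?}: {}", file, err),
            }
            None
        }
    }
}

#[tokio::main]
async fn main() {
    let path = PathBuf::from("downloads/example.html"); // hypothetical path
    match check_file_length(&path).await {
        Some(len) => println!("{} bytes already on disk", len),
        None => println!("no usable copy on disk"),
    }
}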


@@ -22,7 +22,7 @@ use opentelemetry_sdk::{metrics::SdkMeterProvider, trace::SdkTracerProvider};
 use serde::Deserialize;
 use surrealdb::{engine::remote::ws::Client, Surreal};
 use tokio::{
-    io::{AsyncWriteExt, BufWriter},
+    io::{AsyncReadExt, AsyncWriteExt, BufWriter},
     sync::RwLock,
     task::JoinSet,
 };
@@ -246,15 +246,14 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
                     if disk_len == len {
                         skip_download = true;
                     }
-                }
-            }
-        }
-    }
-
-    if skip_download {
-        trace!("Skipping download...");
-    } else {
+                } else {
+                    // File not found (or other error).
+                    // Program will continue on it's way, downloading content.
+                }
+            }
+        }
+    }
+
     // make sure that the file is good to go
     if let Some(file) = filesystem::init(&tmp_path).await {
         // Get body from response
@@ -262,9 +261,26 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
             let mut stream = response.bytes_stream();
             let should_parse = real_path.to_string_lossy().ends_with(".html");
-            let mut writer = BufWriter::new(file);
             let mut buf: Vec<u8> = Vec::new();
+
+            if skip_download && should_parse {
+                // since we are skipping the download we will just read the file off the disk to
+                // parse it
+                if let Ok(mut file) = tokio::fs::OpenOptions::new()
+                    .read(true)
+                    .open(&real_path).await
+                {
+                    if let Err(err) = file.read_to_end(&mut buf).await {
+                        warn!("Failed to read file off disk for parsing, {}", err);
+                    }
+                }
+            }
+
+            // !!!DOWNLOADING TIME!!!
+            if !skip_download {
+                let mut writer = BufWriter::new(file);
 
                 // Write file to disk
                 trace!("Writing at: {:?}", tmp_path);
                 BEING_STREAMED.add(1, &[]);
@@ -299,6 +315,7 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
                 // stream_span.end();
                 BEING_STREAMED.add(-1, &[]);
+            }
 
             // (If needed) Parse the file
             if should_parse {
@@ -330,7 +347,6 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
                 site.status_code = code.as_u16();
                 Website::store_all(vec![site.clone()], &db).await;
             }
-            }
         } else {
             error!(url = site.site.as_str(), "Failed to get: {}", &site.site);
         }
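Taken together, these hunks make sure buf ends up populated whether or not the download is skipped: when the on-disk copy already matches the Content-Length, the existing file is read back for parsing; otherwise the response is streamed to disk as before. That is what lets the "even when not downloading, update the database" commit do its job, since the parse step and Website::store_all now run in both cases. The following is a minimal, self-contained sketch of that control flow under assumed names (fetch_or_reuse, on_disk); it uses reqwest::Response::chunk instead of the crawler's bytes_stream loop and leaves out the .html check, the temp-file handling, and the metrics, so it illustrates the shape of the logic rather than the project's code.

use std::path::Path;
use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter};

// Sketch only: fill `buf` from disk when the download is skipped,
// otherwise stream the response body to disk while keeping a copy.
async fn fetch_or_reuse(url: &str, on_disk: &Path) -> reqwest::Result<Vec<u8>> {
    let mut response = reqwest::get(url).await?;

    // Skip the download when the on-disk copy already matches Content-Length.
    let skip_download = match (response.content_length(), tokio::fs::metadata(on_disk).await) {
        (Some(remote_len), Ok(meta)) => meta.len() == remote_len,
        _ => false,
    };

    let mut buf: Vec<u8> = Vec::new();
    if skip_download {
        // Reuse the existing file so it can still be parsed and stored.
        if let Ok(mut file) = tokio::fs::File::open(on_disk).await {
            let _ = file.read_to_end(&mut buf).await;
        }
    } else if let Ok(file) = tokio::fs::File::create(on_disk).await {
        // !!!DOWNLOADING TIME!!! -- stream to disk and keep the bytes around.
        let mut writer = BufWriter::new(file);
        while let Some(chunk) = response.chunk().await? {
            buf.extend_from_slice(&chunk);
            let _ = writer.write_all(&chunk).await;
        }
        let _ = writer.flush().await;
    }

    // `buf` is now ready for parsing and for the database update in either path.
    Ok(buf)
}

#[tokio::main]
async fn main() -> reqwest::Result<()> {
    // Hypothetical URL and path, purely for illustration.
    let body = fetch_or_reuse("https://example.com/", Path::new("example.html")).await?;
    println!("{} bytes available for parsing", body.len());
    Ok(())
}

Moving BufWriter::new(file) inside the download branch, as the diff does, also keeps ownership simple: the writer consumes the file handle, so it is only created when something will actually be written.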