Compare commits

4 Commits

SHA1 Message Date
2c339a36f9 handle checking for file better 2025-10-09 23:00:11 -06:00
73216f7003 fix the issue where nothing works 2025-10-09 22:35:01 -06:00
1e59ebd5c4 even when not downloading, update the database 2025-10-09 22:13:06 -06:00
52d5e101d0 bragging 2025-10-09 22:03:19 -06:00
3 changed files with 69 additions and 46 deletions

View File

@@ -40,14 +40,18 @@ $EDITOR Crawler.toml
 - [x] Allow for storing asynchronously - dropping the "links to" logic fixes this need
 - [x] Control crawler via config file (no recompilation needed)
-3/17/25: Took >1hr to crawl 100 pages
-3/19/25: Took 20min to crawl 1000 pages
+### Feats
+3/17/25: Took >1hr to crawl 100 pages.
+3/19/25: Took 20min to crawl 1000 pages.
 This meant we stored 1000 pages, 142,997 URLs, and 1,425,798 links between the two.
-3/20/25: Took 5min to crawl 1000 pages
+3/20/25: Took 5min to crawl 1000 pages.
-3/21/25: Took 3min to crawl 1000 pages
+3/21/25: Took 3min to crawl 1000 pages.
+7/.../25: Downloaded just shy of 12TB of data from a remote server.
 # About

View File

@@ -52,7 +52,10 @@ pub async fn check_file_length(file: &PathBuf) -> Option<u64> {
         }
     },
     Err(err) => {
-        error!("Failed to open file for testing... {}", err);
+        match err.kind() {
+            ErrorKind::NotFound => {/* ignore */},
+            _ => warn!("Failed to open file to check length. {:?} {}", file, err),
+        }
     },
 }
 None
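
Review note: the new arm silences only ErrorKind::NotFound, which is the expected "not downloaded yet" case; any other open failure still warns, and the function still falls through to None. A minimal sketch of how the whole helper plausibly fits together (the signature and the Err arm are from the diff; the Ok path here is an assumption):

```rust
use std::{io::ErrorKind, path::PathBuf};
use tokio::fs::File;
use tracing::warn;

// Sketch: report the on-disk length of `file`, or None if it cannot be read.
// A missing file is the normal "not downloaded yet" case, so NotFound is
// deliberately silent; any other error gets a warning.
pub async fn check_file_length(file: &PathBuf) -> Option<u64> {
    match File::open(file).await {
        Ok(handle) => match handle.metadata().await {
            Ok(meta) => return Some(meta.len()),
            Err(err) => warn!("Failed to read metadata. {:?} {}", file, err),
        },
        Err(err) => match err.kind() {
            ErrorKind::NotFound => { /* ignore */ },
            _ => warn!("Failed to open file to check length. {:?} {}", file, err),
        },
    }
    None
}
```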

View File

@@ -22,7 +22,7 @@ use opentelemetry_sdk::{metrics::SdkMeterProvider, trace::SdkTracerProvider};
 use serde::Deserialize;
 use surrealdb::{engine::remote::ws::Client, Surreal};
 use tokio::{
-    io::{AsyncWriteExt, BufWriter},
+    io::{AsyncReadExt, AsyncWriteExt, BufWriter},
     sync::RwLock,
     task::JoinSet,
 };
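
Review note on the import: read_to_end on a tokio::fs::File is supplied by the AsyncReadExt extension trait, so the trait must be in scope for the new read-from-disk path to compile. A standalone illustration (hypothetical helper, not from this repo):

```rust
use tokio::io::AsyncReadExt; // read_to_end() will not resolve without this trait in scope

async fn read_whole_file(path: &std::path::Path) -> std::io::Result<Vec<u8>> {
    let mut buf = Vec::new();
    let mut file = tokio::fs::File::open(path).await?;
    // read_to_end is a provided method of AsyncReadExt, not of File itself
    file.read_to_end(&mut buf).await?;
    Ok(buf)
}
```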
@@ -246,15 +246,14 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
                     if disk_len == len {
                         skip_download = true;
                     }
-                }
-            }
-        }
-    }
     if skip_download {
         trace!("Skipping download...");
+    } else {
+        // File not found (or other error).
+        // Program will continue on its way, downloading content.
+    }
 }
 }
-}
     // make sure that the file is good to go
     if let Some(file) = filesystem::init(&tmp_path).await {
         // Get body from response
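
Review note: the skip_download decision being reshuffled here amounts to comparing the server's Content-Length header with the bytes already on disk. A hedged sketch of that check as a helper (filesystem::check_file_length and real_path are from the diff; the helper's shape is an assumption):

```rust
use reqwest::Response;
use std::path::Path;

// Sketch: true when the cached copy already has the advertised length,
// so the body does not need to be streamed again.
async fn can_skip_download(response: &Response, real_path: &Path) -> bool {
    let disk_len = match filesystem::check_file_length(&real_path.to_path_buf()).await {
        Some(len) => len,
        None => return false, // nothing cached yet (NotFound is silent above)
    };
    response
        .headers()
        .get("content-length")
        .and_then(|value| value.to_str().ok())
        .and_then(|s| s.parse::<u64>().ok())
        .map(|remote_len| remote_len == disk_len)
        .unwrap_or(false)
}
```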
@@ -262,9 +261,26 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
     let mut stream = response.bytes_stream();
     let should_parse = real_path.to_string_lossy().ends_with(".html");
-    let mut writer = BufWriter::new(file);
+    let mut buf: Vec<u8> = Vec::new();
+    if skip_download && should_parse {
+        // since we are skipping the download we will just read the file off the disk to
+        // parse it
+        if let Ok(mut file) = tokio::fs::OpenOptions::new()
+            .read(true)
+            .open(&real_path).await
+        {
+            if let Err(err) = file.read_to_end(&mut buf).await {
+                warn!("Failed to read file off disk for parsing, {}", err);
+            }
+        }
+    }
+    // !!!DOWNLOADING TIME!!!
+    if !skip_download {
+        let mut writer = BufWriter::new(file);
         // Write file to disk
         trace!("Writing at: {:?}", tmp_path);
         BEING_STREAMED.add(1, &[]);
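
Review note: the streaming body elided between these hunks now runs only when the download is not skipped. A rough reconstruction of that write loop under the new layout (the loop itself is not shown in the diff, so this is an assumption); note the explicit flush, which a tokio BufWriter needs so buffered bytes are not lost on drop:

```rust
use futures_util::StreamExt;
use tokio::io::{AsyncWriteExt, BufWriter};

// Sketch: stream the response body to disk, keeping a copy of the bytes
// in `buf` when the page will be parsed afterwards.
async fn stream_to_disk<S>(
    mut stream: S,
    file: tokio::fs::File,
    buf: &mut Vec<u8>,
    should_parse: bool,
) -> std::io::Result<()>
where
    S: futures_util::Stream<Item = reqwest::Result<bytes::Bytes>> + Unpin,
{
    let mut writer = BufWriter::new(file);
    while let Some(chunk) = stream.next().await {
        let chunk = chunk.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
        if should_parse {
            buf.extend_from_slice(&chunk); // keep bytes for the parser
        }
        writer.write_all(&chunk).await?;
    }
    // without this, buffered bytes can be lost when the writer drops
    writer.flush().await
}
```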
@@ -299,6 +315,7 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
     // stream_span.end();
     BEING_STREAMED.add(-1, &[]);
+    }
     // (If needed) Parse the file
     if should_parse {
@@ -330,7 +347,6 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
             site.status_code = code.as_u16();
             Website::store_all(vec![site.clone()], &db).await;
         }
-        }
     } else {
         error!(url = site.site.as_str(), "Failed to get: {}", &site.site);
     }