Compare commits

4 Commits

SHA1 Message Date
2c339a36f9 handle checking for file better 2025-10-09 23:00:11 -06:00
73216f7003 fix the issue where nothing works 2025-10-09 22:35:01 -06:00
1e59ebd5c4 even when not downloading, update the database 2025-10-09 22:13:06 -06:00
52d5e101d0 bragging 2025-10-09 22:03:19 -06:00
3 changed files with 69 additions and 46 deletions

View File

@@ -40,14 +40,18 @@ $EDITOR Crawler.toml
 - [x] Allow for storing asynchronously - dropping the "links to" logic fixes this need
 - [x] Control crawler via config file (no recompliation needed)
 
-3/17/25: Took >1hr to crawl 100 pages
-3/19/25: Took 20min to crawl 1000 pages
+### Feats
+3/17/25: Took >1hr to crawl 100 pages.
+3/19/25: Took 20min to crawl 1000 pages.
 This ment we stored 1000 pages, 142,997 urls, and 1,425,798 links between the two.
-3/20/25: Took 5min to crawl 1000 pages
-3/21/25: Took 3min to crawl 1000 pages
+3/20/25: Took 5min to crawl 1000 pages.
+3/21/25: Took 3min to crawl 1000 pages.
+7/.../25: Downloaded just shy of 12TB of data from a remote server.
 
 # About

View File

@@ -52,7 +52,10 @@ pub async fn check_file_length(file: &PathBuf) -> Option<u64> {
             }
         },
         Err(err) => {
-            error!("Failed to open file for testing... {}", err);
+            match err.kind() {
+                ErrorKind::NotFound => {/* ignore */},
+                _ => warn!("Failed to open file to check length. {:?} {}", file, err),
+            }
         },
     }
     None
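
The new match arm treats a missing file as the normal "nothing downloaded yet" case and only warns on other open errors. For orientation, here is a minimal stand-alone sketch of how the whole function might read after this change; the Ok branch is not shown in the diff, so reading the length from file metadata and logging through the `tracing` macros are assumptions, not the repository's actual code.

use std::{io::ErrorKind, path::PathBuf};
use tokio::fs::File;
use tracing::warn;

// Hypothetical reconstruction: only the Err arm is taken from the diff above.
pub async fn check_file_length(file: &PathBuf) -> Option<u64> {
    match File::open(file).await {
        // Assumed Ok branch: report the on-disk length from metadata.
        Ok(handle) => match handle.metadata().await {
            Ok(meta) => return Some(meta.len()),
            Err(err) => warn!("Failed to read metadata of {:?}: {}", file, err),
        },
        Err(err) => match err.kind() {
            // A missing file just means nothing has been written yet; stay quiet.
            ErrorKind::NotFound => { /* ignore */ },
            _ => warn!("Failed to open file to check length. {:?} {}", file, err),
        },
    }
    None
}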

View File

@@ -22,7 +22,7 @@ use opentelemetry_sdk::{metrics::SdkMeterProvider, trace::SdkTracerProvider};
 use serde::Deserialize;
 use surrealdb::{engine::remote::ws::Client, Surreal};
 use tokio::{
-    io::{AsyncWriteExt, BufWriter},
+    io::{AsyncReadExt, AsyncWriteExt, BufWriter},
     sync::RwLock,
     task::JoinSet,
 };
@@ -246,24 +246,40 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
                     if disk_len == len {
                         skip_download = true;
                     }
+                } else {
+                    // File not found (or other error).
+                    // Program will continue on it's way, downloading content.
                 }
             }
         }
     }
 
-    // make sure that the file is good to go
-    if let Some(file) = filesystem::init(&tmp_path).await {
-        // Get body from response
-        // stream the response onto the disk
-        let mut stream = response.bytes_stream();
+    let should_parse = real_path.to_string_lossy().ends_with(".html");
 
-        if skip_download {
-            trace!("Skipping download...");
-        } else {
-            let should_parse = real_path.to_string_lossy().ends_with(".html");
+    // make sure that the file is good to go
+    if let Some(file) = filesystem::init(&tmp_path).await {
+        // Get body from response
+        // stream the response onto the disk
+        let mut stream = response.bytes_stream();
+        let mut buf: Vec<u8> = Vec::new();
+
+        if skip_download && should_parse {
+            // since we are skipping the download we will just read the file off the disk to
+            // parse it
+            if let Ok(mut file) = tokio::fs::OpenOptions::new()
+                .read(true)
+                .open(&real_path).await
+            {
+                if let Err(err) = file.read_to_end(&mut buf).await {
+                    warn!("Failed to read file off disk for parsing, {}", err);
+                }
+            }
+        }
+
+        // !!!DOWNLOADING TIME!!!
+        if !skip_download {
             let mut writer = BufWriter::new(file);
-            let mut buf: Vec<u8> = Vec::new();
 
             // Write file to disk
             trace!("Writing at: {:?}", tmp_path);
@@ -299,37 +315,37 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
             // stream_span.end();
             BEING_STREAMED.add(-1, &[]);
 
-            // (If needed) Parse the file
-            if should_parse {
-                BEING_PARSED.add(1, &[]);
-                // let mut parsing_span = TRACER.start("Parsing");
-
-                // Parse document and get relationships
-                let sites = parser::parse(&site, &buf).await;
-
-                // De-duplicate this list
-                let prev_len = sites.len();
-                let set = sites.into_iter().fold(HashSet::new(), |mut set, item| {
-                    set.insert(item);
-                    set
-                });
-                let de_dupe_sites: Vec<Website> = set.into_iter().collect();
-                let diff = prev_len - de_dupe_sites.len();
-                trace!("Saved {diff} from being entered into the db by de-duping");
-
-                // Store all the other sites so that we can link to them.
-                let _ = Website::store_all(de_dupe_sites, &db).await;
-
-                // parsing_span.end();
-                BEING_PARSED.add(-1, &[]);
-            } else {
-                trace!(url = site.site.as_str(), "Parse = False");
-            }
-
-            // update self in db
-            site.crawled = true;
-            site.status_code = code.as_u16();
-            Website::store_all(vec![site.clone()], &db).await;
         }
+
+        // (If needed) Parse the file
+        if should_parse {
+            BEING_PARSED.add(1, &[]);
+            // let mut parsing_span = TRACER.start("Parsing");
+
+            // Parse document and get relationships
+            let sites = parser::parse(&site, &buf).await;
+
+            // De-duplicate this list
+            let prev_len = sites.len();
+            let set = sites.into_iter().fold(HashSet::new(), |mut set, item| {
+                set.insert(item);
+                set
+            });
+            let de_dupe_sites: Vec<Website> = set.into_iter().collect();
+            let diff = prev_len - de_dupe_sites.len();
+            trace!("Saved {diff} from being entered into the db by de-duping");
+
+            // Store all the other sites so that we can link to them.
+            let _ = Website::store_all(de_dupe_sites, &db).await;
+
+            // parsing_span.end();
+            BEING_PARSED.add(-1, &[]);
+        } else {
+            trace!(url = site.site.as_str(), "Parse = False");
+        }
+
+        // update self in db
+        site.crawled = true;
+        site.status_code = code.as_u16();
+        Website::store_all(vec![site.clone()], &db).await;
     }
 } else {
     error!(url = site.site.as_str(), "Failed to get: {}", &site.site);
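
Read together, the two hunks above change the flow to: decide up front whether the page should be parsed, reuse the copy already on disk when the download is skipped, and run the parsing and database update unconditionally afterwards (the "even when not downloading, update the database" commit). The sketch below condenses that flow; `load_body` and the in-memory `downloaded` buffer are illustrative stand-ins, not helpers from this repository.

use std::path::Path;
use tokio::io::AsyncReadExt;

// Hypothetical, condensed view of the new skip_download handling.
async fn load_body(real_path: &Path, skip_download: bool, downloaded: Vec<u8>) -> std::io::Result<Vec<u8>> {
    let should_parse = real_path.to_string_lossy().ends_with(".html");
    let mut buf: Vec<u8> = Vec::new();

    if skip_download && should_parse {
        // The file is already on disk: read it back so it can still be parsed.
        let mut file = tokio::fs::OpenOptions::new()
            .read(true)
            .open(real_path)
            .await?;
        file.read_to_end(&mut buf).await?;
    } else if !skip_download {
        // Fresh download: persist it and keep a copy in memory for parsing.
        tokio::fs::write(real_path, &downloaded).await?;
        buf = downloaded;
    }

    // Whether or not the download ran, the caller goes on to parse `buf`
    // (when should_parse) and to mark the site as crawled in the database.
    Ok(buf)
}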