Compare commits
4 Commits
5b728bacd6 ... main
| Author | SHA1 | Date |
|---|---|---|
| | 2c339a36f9 | |
| | 73216f7003 | |
| | 1e59ebd5c4 | |
| | 52d5e101d0 | |
README.md (12 changed lines)

```diff
@@ -40,14 +40,18 @@ $EDITOR Crawler.toml
 - [x] Allow for storing asynchronously - dropping the "links to" logic fixes this need
 - [x] Control crawler via config file (no recompilation needed)
 
-3/17/25: Took >1hr to crawl 100 pages
+### Feats
 
-3/19/25: Took 20min to crawl 1000 pages
+3/17/25: Took >1hr to crawl 100 pages.
+
+3/19/25: Took 20min to crawl 1000 pages.
 This meant we stored 1000 pages, 142,997 urls, and 1,425,798 links between the two.
 
-3/20/25: Took 5min to crawl 1000 pages
+3/20/25: Took 5min to crawl 1000 pages.
 
-3/21/25: Took 3min to crawl 1000 pages
+3/21/25: Took 3min to crawl 1000 pages.
+
+7/.../25: Downloaded just shy of 12TB of data from a remote server.
 
 # About
 
```
```diff
@@ -52,7 +52,10 @@ pub async fn check_file_length(file: &PathBuf) -> Option<u64> {
            }
        },
        Err(err) => {
-            error!("Failed to open file for testing... {}", err);
+            match err.kind() {
+                ErrorKind::NotFound => {/* ignore */},
+                _ => warn!("Failed to open file to check length. {:?} {}", file, err),
+            }
        },
    }
    None
```
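The new `Err` arm treats a missing file as a normal case and only warns on anything unexpected. A minimal standalone sketch of the same pattern, assuming `tokio` and `tracing` as used elsewhere in this diff; the function name `file_len` and the surrounding code are illustrative, not the repo's exact implementation:

```rust
use std::{io::ErrorKind, path::PathBuf};

use tokio::fs::OpenOptions;
use tracing::warn;

/// Returns the on-disk length of `file`, or None if it does not exist
/// (or cannot be inspected). Illustrative sketch only.
async fn file_len(file: &PathBuf) -> Option<u64> {
    match OpenOptions::new().read(true).open(file).await {
        Ok(f) => match f.metadata().await {
            Ok(meta) => Some(meta.len()),
            Err(err) => {
                warn!("Failed to read metadata for {:?}: {}", file, err);
                None
            }
        },
        Err(err) => {
            match err.kind() {
                // A file that has not been downloaded yet is expected; stay quiet.
                ErrorKind::NotFound => {}
                // Anything else (permissions, etc.) is worth surfacing.
                _ => warn!("Failed to open {:?} to check length: {}", file, err),
            }
            None
        }
    }
}

#[tokio::main]
async fn main() {
    println!("{:?}", file_len(&PathBuf::from("Cargo.toml")).await);
}
```

Staying quiet on `ErrorKind::NotFound` fits the caller's logic below: a file that is not on disk yet simply means the crawler goes ahead and downloads it, so logging it as an error was just noise.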
src/main.rs (98 changed lines)

```diff
@@ -22,7 +22,7 @@ use opentelemetry_sdk::{metrics::SdkMeterProvider, trace::SdkTracerProvider};
 use serde::Deserialize;
 use surrealdb::{engine::remote::ws::Client, Surreal};
 use tokio::{
-    io::{AsyncWriteExt, BufWriter},
+    io::{AsyncReadExt, AsyncWriteExt, BufWriter},
     sync::RwLock,
     task::JoinSet,
 };
```
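The only change here is pulling `AsyncReadExt` into scope; that trait is what provides `read_to_end` on `tokio::fs::File`, which the reworked skip-download path below calls. A tiny sketch of the call it enables (the path is just an example):

```rust
use tokio::{fs::OpenOptions, io::AsyncReadExt};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let mut buf: Vec<u8> = Vec::new();
    let mut file = OpenOptions::new().read(true).open("Cargo.toml").await?;
    // Without `use tokio::io::AsyncReadExt`, this method call would not resolve.
    file.read_to_end(&mut buf).await?;
    println!("read {} bytes", buf.len());
    Ok(())
}
```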
```diff
@@ -246,24 +246,40 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
                     if disk_len == len {
                         skip_download = true;
                     }
+                } else {
+                    // File not found (or other error).
+                    // Program will continue on its way, downloading content.
                 }
             }
         }
     }
 
-    if skip_download {
-        trace!("Skipping download...");
-    } else {
-        // make sure that the file is good to go
-        if let Some(file) = filesystem::init(&tmp_path).await {
-            // Get body from response
-            // stream the response onto the disk
-            let mut stream = response.bytes_stream();
+    // make sure that the file is good to go
+    if let Some(file) = filesystem::init(&tmp_path).await {
+        // Get body from response
+        // stream the response onto the disk
+        let mut stream = response.bytes_stream();
+
+        let should_parse = real_path.to_string_lossy().ends_with(".html");
 
-            let should_parse = real_path.to_string_lossy().ends_with(".html");
+        let mut buf: Vec<u8> = Vec::new();
 
+        if skip_download && should_parse {
+            // since we are skipping the download we will just read the file off the disk to
+            // parse it
+            if let Ok(mut file) = tokio::fs::OpenOptions::new()
+                .read(true)
+                .open(&real_path).await
+            {
+                if let Err(err) = file.read_to_end(&mut buf).await {
+                    warn!("Failed to read file off disk for parsing, {}", err);
+                }
+            }
+        }
+
+        // !!!DOWNLOADING TIME!!!
+        if !skip_download {
             let mut writer = BufWriter::new(file);
-            let mut buf: Vec<u8> = Vec::new();
 
             // Write file to disk
             trace!("Writing at: {:?}", tmp_path);
```
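This hunk separates the decision to download from the decision to parse: when the on-disk copy already has the expected length, the body is read back from `real_path` into `buf` instead of being streamed from the network, so the parser still gets bytes to work with. A condensed sketch of that control flow, with hypothetical `download` and `parse` helpers standing in for the crawler's streaming and parsing code:

```rust
use std::path::Path;

use tokio::{fs::OpenOptions, io::AsyncReadExt};
use tracing::warn;

// Hypothetical stand-ins for the crawler's real streaming and parsing code.
async fn download(_url: &str, _dest: &Path) -> Vec<u8> { Vec::new() }
async fn parse(_body: &[u8]) {}

async fn fetch_or_reuse(url: &str, real_path: &Path, skip_download: bool) {
    let should_parse = real_path.to_string_lossy().ends_with(".html");
    let mut buf: Vec<u8> = Vec::new();

    if skip_download && should_parse {
        // Reuse the copy already on disk so the parser still gets a body.
        match OpenOptions::new().read(true).open(real_path).await {
            Ok(mut file) => {
                if let Err(err) = file.read_to_end(&mut buf).await {
                    warn!("Failed to read file off disk for parsing, {}", err);
                }
            }
            Err(err) => warn!("Could not reopen {:?}: {}", real_path, err),
        }
    }

    if !skip_download {
        // Only touch the network when the cached copy is missing or incomplete.
        buf = download(url, real_path).await;
    }

    if should_parse {
        parse(&buf).await;
    }
}

#[tokio::main]
async fn main() {
    fetch_or_reuse("https://example.com/", Path::new("page.html"), true).await;
}
```

In the actual diff the download branch still streams `response.bytes_stream()` through a `BufWriter` as before; only the read-back-for-parsing path is new.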
```diff
@@ -299,37 +315,37 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
             // stream_span.end();
             BEING_STREAMED.add(-1, &[]);
 
-            // (If needed) Parse the file
-            if should_parse {
-                BEING_PARSED.add(1, &[]);
-                // let mut parsing_span = TRACER.start("Parsing");
-
-                // Parse document and get relationships
-                let sites = parser::parse(&site, &buf).await;
-                // De-duplicate this list
-                let prev_len = sites.len();
-                let set = sites.into_iter().fold(HashSet::new(), |mut set, item| {
-                    set.insert(item);
-                    set
-                });
-                let de_dupe_sites: Vec<Website> = set.into_iter().collect();
-                let diff = prev_len - de_dupe_sites.len();
-                trace!("Saved {diff} from being entered into the db by de-duping");
-                // Store all the other sites so that we can link to them.
-                let _ = Website::store_all(de_dupe_sites, &db).await;
-
-                // parsing_span.end();
-                BEING_PARSED.add(-1, &[]);
-            } else {
-                trace!(url = site.site.as_str(), "Parse = False");
-            }
-
-            // update self in db
-            site.crawled = true;
-            site.status_code = code.as_u16();
-            Website::store_all(vec![site.clone()], &db).await;
         }
 
+        // (If needed) Parse the file
+        if should_parse {
+            BEING_PARSED.add(1, &[]);
+            // let mut parsing_span = TRACER.start("Parsing");
+
+            // Parse document and get relationships
+            let sites = parser::parse(&site, &buf).await;
+            // De-duplicate this list
+            let prev_len = sites.len();
+            let set = sites.into_iter().fold(HashSet::new(), |mut set, item| {
+                set.insert(item);
+                set
+            });
+            let de_dupe_sites: Vec<Website> = set.into_iter().collect();
+            let diff = prev_len - de_dupe_sites.len();
+            trace!("Saved {diff} from being entered into the db by de-duping");
+            // Store all the other sites so that we can link to them.
+            let _ = Website::store_all(de_dupe_sites, &db).await;
+
+            // parsing_span.end();
+            BEING_PARSED.add(-1, &[]);
+        } else {
+            trace!(url = site.site.as_str(), "Parse = False");
+        }
+
+        // update self in db
+        site.crawled = true;
+        site.status_code = code.as_u16();
+        Website::store_all(vec![site.clone()], &db).await;
     } else {
         error!(url = site.site.as_str(), "Failed to get: {}", &site.site);
```
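The parse branch, now outside the download-only block, still folds the extracted links into a `HashSet` before storing them, so each unique `Website` is written once per page. A small self-contained sketch of that de-duplication step, using `String` in place of the repo's `Website` type:

```rust
use std::collections::HashSet;

fn main() {
    // Stand-in for the Vec<Website> returned by parser::parse().
    let sites = vec![
        "https://example.com/a".to_string(),
        "https://example.com/b".to_string(),
        "https://example.com/a".to_string(),
    ];

    let prev_len = sites.len();

    // Same fold-into-a-HashSet shape as the diff above.
    let set = sites.into_iter().fold(HashSet::new(), |mut set, item| {
        set.insert(item);
        set
    });
    let de_dupe_sites: Vec<String> = set.into_iter().collect();

    let diff = prev_len - de_dupe_sites.len();
    println!("Saved {diff} from being entered into the db by de-duping");
}
```

`sites.into_iter().collect::<HashSet<_>>()` would do the same job in one call; the fold just makes the insert step explicit.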