Compare commits
4 Commits
5b728bacd6 ... main
| Author | SHA1 | Date |
|---|---|---|
| | 2c339a36f9 | |
| | 73216f7003 | |
| | 1e59ebd5c4 | |
| | 52d5e101d0 | |
README.md (12 changed lines)

```diff
@@ -40,14 +40,18 @@ $EDITOR Crawler.toml
 - [x] Allow for storing asynchronously - dropping the "links to" logic fixes this need
 - [x] Control crawler via config file (no recompilation needed)
 
-3/17/25: Took >1hr to crawl 100 pages
+### Feats
 
-3/19/25: Took 20min to crawl 1000 pages
+3/17/25: Took >1hr to crawl 100 pages.
+
+3/19/25: Took 20min to crawl 1000 pages.
 This meant we stored 1000 pages, 142,997 urls, and 1,425,798 links between the two.
 
-3/20/25: Took 5min to crawl 1000 pages
+3/20/25: Took 5min to crawl 1000 pages.
 
-3/21/25: Took 3min to crawl 1000 pages
+3/21/25: Took 3min to crawl 1000 pages.
+
+7/.../25: Downloaded just shy of 12TB of data from a remote server.
 
 # About
 
```
```diff
@@ -52,7 +52,10 @@ pub async fn check_file_length(file: &PathBuf) -> Option<u64> {
            }
        },
        Err(err) => {
-            error!("Failed to open file for testing... {}", err);
+            match err.kind() {
+                ErrorKind::NotFound => {/* ignore */},
+                _ => warn!("Failed to open file to check length. {:?} {}", file, err),
+            }
        },
    }
    None
```
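The new `Err` arm treats a missing file as a normal case and only warns on anything unexpected. A minimal standalone sketch of the same pattern, assuming `tokio` and `tracing` as used elsewhere in this diff; the function name `file_len` and the surrounding code are illustrative, not the repo's exact implementation:

```rust
use std::{io::ErrorKind, path::PathBuf};

use tokio::fs::OpenOptions;
use tracing::warn;

/// Returns the on-disk length of `file`, or None if it does not exist
/// (or cannot be inspected). Illustrative sketch only.
async fn file_len(file: &PathBuf) -> Option<u64> {
    match OpenOptions::new().read(true).open(file).await {
        Ok(f) => match f.metadata().await {
            Ok(meta) => Some(meta.len()),
            Err(err) => {
                warn!("Failed to read metadata for {:?}: {}", file, err);
                None
            }
        },
        Err(err) => {
            match err.kind() {
                // A file that has not been downloaded yet is expected; stay quiet.
                ErrorKind::NotFound => {}
                // Anything else (permissions, etc.) is worth surfacing.
                _ => warn!("Failed to open {:?} to check length: {}", file, err),
            }
            None
        }
    }
}

#[tokio::main]
async fn main() {
    println!("{:?}", file_len(&PathBuf::from("Cargo.toml")).await);
}
```

Staying quiet on `ErrorKind::NotFound` fits the caller's logic below: a file that is not on disk yet simply means the crawler goes ahead and downloads it, so logging it as an error was just noise.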
src/main.rs (98 changed lines)

```diff
@@ -22,7 +22,7 @@ use opentelemetry_sdk::{metrics::SdkMeterProvider, trace::SdkTracerProvider};
 use serde::Deserialize;
 use surrealdb::{engine::remote::ws::Client, Surreal};
 use tokio::{
-    io::{AsyncWriteExt, BufWriter},
+    io::{AsyncReadExt, AsyncWriteExt, BufWriter},
     sync::RwLock,
     task::JoinSet,
 };
```
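The only change here is pulling `AsyncReadExt` into scope; that trait is what provides `read_to_end` on `tokio::fs::File`, which the reworked skip-download path below calls. A tiny sketch of the call it enables (the path is just an example):

```rust
use tokio::{fs::OpenOptions, io::AsyncReadExt};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let mut buf: Vec<u8> = Vec::new();
    let mut file = OpenOptions::new().read(true).open("Cargo.toml").await?;
    // Without `use tokio::io::AsyncReadExt`, this method call would not resolve.
    file.read_to_end(&mut buf).await?;
    println!("read {} bytes", buf.len());
    Ok(())
}
```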
```diff
@@ -246,24 +246,40 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
                     if disk_len == len {
                         skip_download = true;
                     }
+                } else {
+                    // File not found (or other error).
+                    // Program will continue on its way, downloading content.
                 }
             }
         }
     }
 
-    if skip_download {
-        trace!("Skipping download...");
-    } else {
-        // make sure that the file is good to go
-        if let Some(file) = filesystem::init(&tmp_path).await {
-            // Get body from response
-            // stream the response onto the disk
-            let mut stream = response.bytes_stream();
+    // make sure that the file is good to go
+    if let Some(file) = filesystem::init(&tmp_path).await {
+        // Get body from response
+        // stream the response onto the disk
+        let mut stream = response.bytes_stream();
+
+        let should_parse = real_path.to_string_lossy().ends_with(".html");
 
-            let should_parse = real_path.to_string_lossy().ends_with(".html");
+        let mut buf: Vec<u8> = Vec::new();
 
+        if skip_download && should_parse {
+            // since we are skipping the download we will just read the file off the disk to
+            // parse it
+            if let Ok(mut file) = tokio::fs::OpenOptions::new()
+                .read(true)
+                .open(&real_path).await
+            {
+                if let Err(err) = file.read_to_end(&mut buf).await {
+                    warn!("Failed to read file off disk for parsing, {}", err);
+                }
+            }
+        }
+
+        // !!!DOWNLOADING TIME!!!
+        if !skip_download {
             let mut writer = BufWriter::new(file);
-            let mut buf: Vec<u8> = Vec::new();
 
             // Write file to disk
             trace!("Writing at: {:?}", tmp_path);
```
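This hunk separates the decision to download from the decision to parse: when the on-disk copy already has the expected length, the body is read back from `real_path` into `buf` instead of being streamed from the network, so the parser still gets bytes to work with. A condensed sketch of that control flow, with hypothetical `download` and `parse` helpers standing in for the crawler's streaming and parsing code:

```rust
use std::path::Path;

use tokio::{fs::OpenOptions, io::AsyncReadExt};
use tracing::warn;

// Hypothetical stand-ins for the crawler's real streaming and parsing code.
async fn download(_url: &str, _dest: &Path) -> Vec<u8> { Vec::new() }
async fn parse(_body: &[u8]) {}

async fn fetch_or_reuse(url: &str, real_path: &Path, skip_download: bool) {
    let should_parse = real_path.to_string_lossy().ends_with(".html");
    let mut buf: Vec<u8> = Vec::new();

    if skip_download && should_parse {
        // Reuse the copy already on disk so the parser still gets a body.
        match OpenOptions::new().read(true).open(real_path).await {
            Ok(mut file) => {
                if let Err(err) = file.read_to_end(&mut buf).await {
                    warn!("Failed to read file off disk for parsing, {}", err);
                }
            }
            Err(err) => warn!("Could not reopen {:?}: {}", real_path, err),
        }
    }

    if !skip_download {
        // Only touch the network when the cached copy is missing or incomplete.
        buf = download(url, real_path).await;
    }

    if should_parse {
        parse(&buf).await;
    }
}

#[tokio::main]
async fn main() {
    fetch_or_reuse("https://example.com/", Path::new("page.html"), true).await;
}
```

In the actual diff the download branch still streams `response.bytes_stream()` through a `BufWriter` as before; only the read-back-for-parsing path is new.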
```diff
@@ -299,37 +315,37 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
             // stream_span.end();
             BEING_STREAMED.add(-1, &[]);
 
-            // (If needed) Parse the file
-            if should_parse {
-                BEING_PARSED.add(1, &[]);
-                // let mut parsing_span = TRACER.start("Parsing");
-
-                // Parse document and get relationships
-                let sites = parser::parse(&site, &buf).await;
-                // De-duplicate this list
-                let prev_len = sites.len();
-                let set = sites.into_iter().fold(HashSet::new(), |mut set, item| {
-                    set.insert(item);
-                    set
-                });
-                let de_dupe_sites: Vec<Website> = set.into_iter().collect();
-                let diff = prev_len - de_dupe_sites.len();
-                trace!("Saved {diff} from being entered into the db by de-duping");
-                // Store all the other sites so that we can link to them.
-                let _ = Website::store_all(de_dupe_sites, &db).await;
-
-                // parsing_span.end();
-                BEING_PARSED.add(-1, &[]);
-            } else {
-                trace!(url = site.site.as_str(), "Parse = False");
-            }
-
-            // update self in db
-            site.crawled = true;
-            site.status_code = code.as_u16();
-            Website::store_all(vec![site.clone()], &db).await;
         }
 
+        // (If needed) Parse the file
+        if should_parse {
+            BEING_PARSED.add(1, &[]);
+            // let mut parsing_span = TRACER.start("Parsing");
+
+            // Parse document and get relationships
+            let sites = parser::parse(&site, &buf).await;
+            // De-duplicate this list
+            let prev_len = sites.len();
+            let set = sites.into_iter().fold(HashSet::new(), |mut set, item| {
+                set.insert(item);
+                set
+            });
+            let de_dupe_sites: Vec<Website> = set.into_iter().collect();
+            let diff = prev_len - de_dupe_sites.len();
+            trace!("Saved {diff} from being entered into the db by de-duping");
+            // Store all the other sites so that we can link to them.
+            let _ = Website::store_all(de_dupe_sites, &db).await;
+
+            // parsing_span.end();
+            BEING_PARSED.add(-1, &[]);
+        } else {
+            trace!(url = site.site.as_str(), "Parse = False");
+        }
+
+        // update self in db
+        site.crawled = true;
+        site.status_code = code.as_u16();
+        Website::store_all(vec![site.clone()], &db).await;
     } else {
         error!(url = site.site.as_str(), "Failed to get: {}", &site.site);
```
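The parse branch, now outside the download-only block, still folds the extracted links into a `HashSet` before storing them, so each unique `Website` is written once per page. A small self-contained sketch of that de-duplication step, using `String` in place of the repo's `Website` type:

```rust
use std::collections::HashSet;

fn main() {
    // Stand-in for the Vec<Website> returned by parser::parse().
    let sites = vec![
        "https://example.com/a".to_string(),
        "https://example.com/b".to_string(),
        "https://example.com/a".to_string(),
    ];

    let prev_len = sites.len();

    // Same fold-into-a-HashSet shape as the diff above.
    let set = sites.into_iter().fold(HashSet::new(), |mut set, item| {
        set.insert(item);
        set
    });
    let de_dupe_sites: Vec<String> = set.into_iter().collect();

    let diff = prev_len - de_dupe_sites.len();
    println!("Saved {diff} from being entered into the db by de-duping");
}
```

`sites.into_iter().collect::<HashSet<_>>()` would do the same job in one call; the fold just makes the insert step explicit.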