diff --git a/Crawler.toml b/Crawler.toml index 8cb179b..a7cdb42 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -6,9 +6,9 @@ surreal_ns = "test" surreal_db = "v1.21.1" # Crawler config -# crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" -crawl_filter = "https://oliveratkinson.net" -# start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" -start_url = "https://oliveratkinson.net" -budget = 1000 -batch_size = 500 +crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" +# crawl_filter = "https://oliveratkinson.net" +start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" +# start_url = "https://oliveratkinson.net" +budget = 100 +batch_size = 2 diff --git a/src/main.rs b/src/main.rs index d5e5d50..86c3f82 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,11 +1,12 @@ #![feature(ip_from)] +#![feature(path_add_extension)] #![warn(clippy::expect_used)] #![deny(clippy::unwrap_used)] extern crate html5ever; use std::{ - collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr}, sync::LazyLock, time::Instant + collections::HashSet, fs::File, io::Read, sync::LazyLock }; use futures_util::StreamExt; @@ -242,9 +243,15 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien // create filepath (handles / -> /index.html) let path = filesystem::as_path(&site.site, ct); + let mut tmp_path= path.clone(); + if !(tmp_path.add_extension("crawl_temp")) { + warn!("Failed to add extension to file"); + // fallback ig + tmp_path = tmp_path.with_extension("crawl_temp"); + } // make sure that the file is good to go - if let Some(file) = filesystem::init(&path).await { + if let Some(file) = filesystem::init(&tmp_path).await { // Get body from response // stream the response onto the disk let mut stream = response.bytes_stream(); @@ -274,9 +281,14 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien }, } } + let _ = writer.flush(); + // rename the temp file into the real file name + if let Err(err) = tokio::fs::rename(tmp_path, path).await { + error!("{}", err); + } + stream_span.end(); BEING_STREAMED.add(-1, &[]); - let _ = writer.flush(); // (If needed) Parse the file if should_parse {