Compare commits

..

2 Commits

Author SHA1 Message Date
865f9be8c0 Merge pull request 'works 😄' (#16) from tempfiles into main
Reviewed-on: #16
2025-07-16 02:26:14 +00:00
48abc73092 works 😄 2025-07-15 20:25:44 -06:00
2 changed files with 21 additions and 9 deletions

View File

@@ -6,9 +6,9 @@ surreal_ns = "test"
surreal_db = "v1.21.1" surreal_db = "v1.21.1"
# Crawler config # Crawler config
# crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/"
crawl_filter = "https://oliveratkinson.net" # crawl_filter = "https://oliveratkinson.net"
# start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/"
start_url = "https://oliveratkinson.net" # start_url = "https://oliveratkinson.net"
budget = 1000 budget = 100
batch_size = 500 batch_size = 2

View File

@@ -1,11 +1,12 @@
#![feature(ip_from)] #![feature(ip_from)]
#![feature(path_add_extension)]
#![warn(clippy::expect_used)] #![warn(clippy::expect_used)]
#![deny(clippy::unwrap_used)] #![deny(clippy::unwrap_used)]
extern crate html5ever; extern crate html5ever;
use std::{ use std::{
collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr}, sync::LazyLock, time::Instant collections::HashSet, fs::File, io::Read, sync::LazyLock
}; };
use futures_util::StreamExt; use futures_util::StreamExt;
@@ -242,9 +243,15 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
// create filepath (handles / -> /index.html) // create filepath (handles / -> /index.html)
let path = filesystem::as_path(&site.site, ct); let path = filesystem::as_path(&site.site, ct);
let mut tmp_path= path.clone();
if !(tmp_path.add_extension("crawl_temp")) {
warn!("Failed to add extension to file");
// fallback ig
tmp_path = tmp_path.with_extension("crawl_temp");
}
// make sure that the file is good to go // make sure that the file is good to go
if let Some(file) = filesystem::init(&path).await { if let Some(file) = filesystem::init(&tmp_path).await {
// Get body from response // Get body from response
// stream the response onto the disk // stream the response onto the disk
let mut stream = response.bytes_stream(); let mut stream = response.bytes_stream();
@@ -274,9 +281,14 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
}, },
} }
} }
let _ = writer.flush();
// rename the temp file into the real file name
if let Err(err) = tokio::fs::rename(tmp_path, path).await {
error!("{}", err);
}
stream_span.end(); stream_span.end();
BEING_STREAMED.add(-1, &[]); BEING_STREAMED.add(-1, &[]);
let _ = writer.flush();
// (If needed) Parse the file // (If needed) Parse the file
if should_parse { if should_parse {