Compare commits
No commits in common. "865f9be8c001ae89d7cd11b58c5caeb3bb1924eb" and "00618669764b6dfcb2d1d267ce72d71da8c5a840" have entirely different histories.
865f9be8c0
...
0061866976
12
Crawler.toml
12
Crawler.toml
@ -6,9 +6,9 @@ surreal_ns = "test"
|
|||||||
surreal_db = "v1.21.1"
|
surreal_db = "v1.21.1"
|
||||||
|
|
||||||
# Crawler config
|
# Crawler config
|
||||||
crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/"
|
# crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/"
|
||||||
# crawl_filter = "https://oliveratkinson.net"
|
crawl_filter = "https://oliveratkinson.net"
|
||||||
start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/"
|
# start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/"
|
||||||
# start_url = "https://oliveratkinson.net"
|
start_url = "https://oliveratkinson.net"
|
||||||
budget = 100
|
budget = 1000
|
||||||
batch_size = 2
|
batch_size = 500
|
||||||
|
18
src/main.rs
18
src/main.rs
@ -1,12 +1,11 @@
|
|||||||
#![feature(ip_from)]
|
#![feature(ip_from)]
|
||||||
#![feature(path_add_extension)]
|
|
||||||
#![warn(clippy::expect_used)]
|
#![warn(clippy::expect_used)]
|
||||||
#![deny(clippy::unwrap_used)]
|
#![deny(clippy::unwrap_used)]
|
||||||
|
|
||||||
extern crate html5ever;
|
extern crate html5ever;
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashSet, fs::File, io::Read, sync::LazyLock
|
collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr}, sync::LazyLock, time::Instant
|
||||||
};
|
};
|
||||||
|
|
||||||
use futures_util::StreamExt;
|
use futures_util::StreamExt;
|
||||||
@ -243,15 +242,9 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
|
|||||||
|
|
||||||
// create filepath (handles / -> /index.html)
|
// create filepath (handles / -> /index.html)
|
||||||
let path = filesystem::as_path(&site.site, ct);
|
let path = filesystem::as_path(&site.site, ct);
|
||||||
let mut tmp_path= path.clone();
|
|
||||||
if !(tmp_path.add_extension("crawl_temp")) {
|
|
||||||
warn!("Failed to add extension to file");
|
|
||||||
// fallback ig
|
|
||||||
tmp_path = tmp_path.with_extension("crawl_temp");
|
|
||||||
}
|
|
||||||
|
|
||||||
// make sure that the file is good to go
|
// make sure that the file is good to go
|
||||||
if let Some(file) = filesystem::init(&tmp_path).await {
|
if let Some(file) = filesystem::init(&path).await {
|
||||||
// Get body from response
|
// Get body from response
|
||||||
// stream the response onto the disk
|
// stream the response onto the disk
|
||||||
let mut stream = response.bytes_stream();
|
let mut stream = response.bytes_stream();
|
||||||
@ -281,14 +274,9 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let _ = writer.flush();
|
|
||||||
// rename the temp file into the real file name
|
|
||||||
if let Err(err) = tokio::fs::rename(tmp_path, path).await {
|
|
||||||
error!("{}", err);
|
|
||||||
}
|
|
||||||
|
|
||||||
stream_span.end();
|
stream_span.end();
|
||||||
BEING_STREAMED.add(-1, &[]);
|
BEING_STREAMED.add(-1, &[]);
|
||||||
|
let _ = writer.flush();
|
||||||
|
|
||||||
// (If needed) Parse the file
|
// (If needed) Parse the file
|
||||||
if should_parse {
|
if should_parse {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user