Compare commits

..

1 Commits

Author SHA1 Message Date
50606bb69e It isnt quite working yet 2025-04-17 09:59:23 -06:00
3 changed files with 16 additions and 14 deletions

View File

@ -3,12 +3,12 @@ surreal_url = "localhost:8000"
surreal_username = "root" surreal_username = "root"
surreal_password = "root" surreal_password = "root"
surreal_ns = "test" surreal_ns = "test"
surreal_db = "v1.20.3" surreal_db = "v1.21.1"
# Crawler config # Crawler config
crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" # crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/"
# crawl_filter = "https://oliveratkinson.net" crawl_filter = "https://oliveratkinson.net"
start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" # start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/"
# start_url = "https://oliveratkinson.net" start_url = "https://oliveratkinson.net"
budget = 100 budget = 1000
batch_size = 5 batch_size = 500

View File

@ -20,6 +20,8 @@ pub struct Website {
pub site: Url, pub site: Url,
/// Whether or not this link has been crawled yet /// Whether or not this link has been crawled yet
pub crawled: bool, pub crawled: bool,
/// 200, 404, etc
pub status_code: u16,
} }
// manual impl to make tracing look nicer // manual impl to make tracing look nicer
@ -38,14 +40,11 @@ impl Website {
}; };
Self { Self {
crawled, crawled,
site site,
status_code: 0,
} }
} }
pub fn set_crawled(&mut self) {
self.crawled = true
}
// Insert every item in the vec into surreal, crawled state will be preserved as TRUE // Insert every item in the vec into surreal, crawled state will be preserved as TRUE
// if already in the database as such or incoming data is TRUE. // if already in the database as such or incoming data is TRUE.
#[instrument(skip(db))] #[instrument(skip(db))]
@ -58,6 +57,7 @@ impl Website {
"INSERT INTO website $array "INSERT INTO website $array
ON DUPLICATE KEY UPDATE ON DUPLICATE KEY UPDATE
accessed_at = time::now(), accessed_at = time::now(),
status_code = $input.status_code,
crawled = crawled OR $input.crawled crawled = crawled OR $input.crawled
RETURN VALUE id; RETURN VALUE id;
", ",

View File

@ -173,8 +173,8 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
// Send the http request (get) // Send the http request (get)
if let Ok(response) = request_builder.send().await { if let Ok(response) = request_builder.send().await {
// Get body from response
let headers = response.headers(); let headers = response.headers();
let code = response.status();
#[allow(non_snake_case)] #[allow(non_snake_case)]
let CT = headers.get("Content-Type"); let CT = headers.get("Content-Type");
@ -201,6 +201,7 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
let mut writer = BufWriter::new(file); let mut writer = BufWriter::new(file);
// Get body from response
// stream the response onto the disk // stream the response onto the disk
let mut stream = response.bytes_stream(); let mut stream = response.bytes_stream();
@ -243,7 +244,8 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
counter!(GET_METRIC).increment(1); counter!(GET_METRIC).increment(1);
// update self in db // update self in db
site.set_crawled(); site.crawled = true;
site.status_code = code.as_u16();
Website::store_all(vec![site], &db).await; Website::store_all(vec![site], &db).await;
} else { } else {
error!("File failed to cooperate: {:?}", path); error!("File failed to cooperate: {:?}", path);