diff --git a/Crawler.toml b/Crawler.toml index e57d08b..8cb179b 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -3,12 +3,12 @@ surreal_url = "localhost:8000" surreal_username = "root" surreal_password = "root" surreal_ns = "test" -surreal_db = "v1.20.3" +surreal_db = "v1.21.1" # Crawler config -crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" -# crawl_filter = "https://oliveratkinson.net" -start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" -# start_url = "https://oliveratkinson.net" -budget = 100 -batch_size = 5 +# crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" +crawl_filter = "https://oliveratkinson.net" +# start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" +start_url = "https://oliveratkinson.net" +budget = 1000 +batch_size = 500 diff --git a/src/db.rs b/src/db.rs index 49fbd6e..2cccfa7 100644 --- a/src/db.rs +++ b/src/db.rs @@ -20,6 +20,8 @@ pub struct Website { pub site: Url, /// Wether or not this link has been crawled yet pub crawled: bool, + /// 200, 404, etc + pub status_code: u16, } // manual impl to make tracing look nicer @@ -38,14 +40,11 @@ impl Website { }; Self { crawled, - site + site, + status_code: 0, } } - pub fn set_crawled(&mut self) { - self.crawled = true - } - // Insert ever item in the vec into surreal, crawled state will be preserved as TRUE // if already in the database as such or incoming data is TRUE. #[instrument(skip(db))] @@ -58,6 +57,7 @@ impl Website { "INSERT INTO website $array ON DUPLICATE KEY UPDATE accessed_at = time::now(), + status_code = $input.status_code, crawled = crawled OR $input.crawled RETURN VALUE id; ", diff --git a/src/main.rs b/src/main.rs index c0898a1..66c0fd5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -173,8 +173,8 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien // Send the http request (get) if let Ok(response) = request_builder.send().await { - // Get body from response let headers = response.headers(); + let code = response.status(); #[allow(non_snake_case)] let CT = headers.get("Content-Type"); @@ -201,6 +201,7 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien let mut writer = BufWriter::new(file); + // Get body from response // stream the response onto the disk let mut stream = response.bytes_stream(); @@ -243,7 +244,8 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien counter!(GET_METRIC).increment(1); // update self in db - site.set_crawled(); + site.crawled = true; + site.status_code = code.as_u16(); Website::store_all(vec![site], &db).await; } else { error!("File failed to cooperate: {:?}", path);