Merge pull request 'works 😄' (#16) from tempfiles into main
Reviewed-on: #16
This commit is contained in:
		
							
								
								
									
										12
									
								
								Crawler.toml
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								Crawler.toml
									
									
									
									
									
								
							@@ -6,9 +6,9 @@ surreal_ns = "test"
 | 
				
			|||||||
surreal_db = "v1.21.1"
 | 
					surreal_db = "v1.21.1"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Crawler config
 | 
					# Crawler config
 | 
				
			||||||
# crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" 
 | 
					crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" 
 | 
				
			||||||
crawl_filter = "https://oliveratkinson.net" 
 | 
					# crawl_filter = "https://oliveratkinson.net" 
 | 
				
			||||||
# start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/"
 | 
					start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/"
 | 
				
			||||||
start_url = "https://oliveratkinson.net"
 | 
					# start_url = "https://oliveratkinson.net"
 | 
				
			||||||
budget = 1000
 | 
					budget = 100
 | 
				
			||||||
batch_size = 500
 | 
					batch_size = 2
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										18
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										18
									
								
								src/main.rs
									
									
									
									
									
								
							@@ -1,11 +1,12 @@
 | 
				
			|||||||
#![feature(ip_from)]
 | 
					#![feature(ip_from)]
 | 
				
			||||||
 | 
					#![feature(path_add_extension)]
 | 
				
			||||||
#![warn(clippy::expect_used)]
 | 
					#![warn(clippy::expect_used)]
 | 
				
			||||||
#![deny(clippy::unwrap_used)]
 | 
					#![deny(clippy::unwrap_used)]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
extern crate html5ever;
 | 
					extern crate html5ever;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
use std::{
 | 
					use std::{
 | 
				
			||||||
    collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr}, sync::LazyLock, time::Instant
 | 
					    collections::HashSet, fs::File, io::Read, sync::LazyLock
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
use futures_util::StreamExt;
 | 
					use futures_util::StreamExt;
 | 
				
			||||||
@@ -242,9 +243,15 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        // create filepath (handles / -> /index.html)
 | 
					        // create filepath (handles / -> /index.html)
 | 
				
			||||||
        let path = filesystem::as_path(&site.site, ct);
 | 
					        let path = filesystem::as_path(&site.site, ct);
 | 
				
			||||||
 | 
					        let mut tmp_path= path.clone();
 | 
				
			||||||
 | 
					        if !(tmp_path.add_extension("crawl_temp")) {
 | 
				
			||||||
 | 
					            warn!("Failed to add extension to file");
 | 
				
			||||||
 | 
					            // fallback ig
 | 
				
			||||||
 | 
					            tmp_path = tmp_path.with_extension("crawl_temp");
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // make sure that the file is good to go
 | 
					        // make sure that the file is good to go
 | 
				
			||||||
        if let Some(file) = filesystem::init(&path).await {
 | 
					        if let Some(file) = filesystem::init(&tmp_path).await {
 | 
				
			||||||
            // Get body from response
 | 
					            // Get body from response
 | 
				
			||||||
            // stream the response onto the disk
 | 
					            // stream the response onto the disk
 | 
				
			||||||
            let mut stream = response.bytes_stream();
 | 
					            let mut stream = response.bytes_stream();
 | 
				
			||||||
@@ -274,9 +281,14 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
 | 
				
			|||||||
                    },
 | 
					                    },
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
 | 
					            let _ = writer.flush();
 | 
				
			||||||
 | 
					            // rename the temp file into the real file name
 | 
				
			||||||
 | 
					            if let Err(err) = tokio::fs::rename(tmp_path, path).await {
 | 
				
			||||||
 | 
					                error!("{}", err);
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            stream_span.end();
 | 
					            stream_span.end();
 | 
				
			||||||
            BEING_STREAMED.add(-1, &[]);
 | 
					            BEING_STREAMED.add(-1, &[]);
 | 
				
			||||||
            let _ = writer.flush();
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
            // (If needed) Parse the file
 | 
					            // (If needed) Parse the file
 | 
				
			||||||
            if should_parse {
 | 
					            if should_parse {
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user