change up how files are discovered
This commit is contained in:
		@@ -5,7 +5,10 @@ use tracing::{debug, error, instrument, trace, warn};
 | 
				
			|||||||
use url::Url;
 | 
					use url::Url;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#[instrument(skip(data))]
 | 
					#[instrument(skip(data))]
 | 
				
			||||||
pub async fn store(data: &str, url: &Url) {
 | 
					/// Returns whether or not the saved file should be parsed.
 | 
				
			||||||
 | 
					/// If the file is just data, like an image, it doesn't need to be parsed.
 | 
				
			||||||
 | 
					/// If it's html, then it does need to be parsed.
 | 
				
			||||||
 | 
					pub async fn store(data: &str, url: &Url) -> bool {
 | 
				
			||||||
    // extract data from url to save it accurately
 | 
					    // extract data from url to save it accurately
 | 
				
			||||||
    let url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path());
 | 
					    let url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -21,19 +24,20 @@ pub async fn store(data: &str, url: &Url) {
 | 
				
			|||||||
        (url_path.clone(), "index.html".into())
 | 
					        (url_path.clone(), "index.html".into())
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let should_parse = filename.ends_with(".html");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    debug!("Writing at: {:?} {:?}", basepath, filename);
 | 
					    debug!("Writing at: {:?} {:?}", basepath, filename);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // create the folders
 | 
					    // create the folders
 | 
				
			||||||
    if let Err(err) = fs::create_dir_all(&basepath).await {
 | 
					    if let Err(err) = fs::create_dir_all(&basepath).await {
 | 
				
			||||||
        error!("Dir creation: {err} {:?}", basepath);
 | 
					        error!("Dir creation: {err} {:?}", basepath);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
        // FIXME I don't think this handles index.html files well...
 | 
					 | 
				
			||||||
        // TODO this should probably append .html to non-described files
 | 
					 | 
				
			||||||
        // create the file if that was successful
 | 
					 | 
				
			||||||
        if let Err(err) = fs::write(&basepath.join(filename), data).await {
 | 
					        if let Err(err) = fs::write(&basepath.join(filename), data).await {
 | 
				
			||||||
            error!("File creation: {err} {:?}", url_path);
 | 
					            error!("File creation: {err} {:?}", url_path);
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    should_parse
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
fn valid_file_extension(take: &&OsStr) -> bool {
 | 
					fn valid_file_extension(take: &&OsStr) -> bool {
 | 
				
			||||||
@@ -41,35 +45,14 @@ fn valid_file_extension(take: &&OsStr) -> bool {
 | 
				
			|||||||
    let all = los.split('.');
 | 
					    let all = los.split('.');
 | 
				
			||||||
    match all.last() {
 | 
					    match all.last() {
 | 
				
			||||||
        Some(s) => {
 | 
					        Some(s) => {
 | 
				
			||||||
            match s.to_lowercase().as_str() {
 | 
					            // FIXME: the TLD list also contains TLDs that double as real file extensions (e.g. .zip),
 | 
				
			||||||
                "html" => true,
 | 
					            // which could cause problems: a file with such an extension is reported as not having a valid file extension
 | 
				
			||||||
                "css" => true,
 | 
					            let all_domains = include_str!("tlds-alpha-by-domain.txt");
 | 
				
			||||||
                "js" => true,
 | 
					 | 
				
			||||||
                "ts" => true,
 | 
					 | 
				
			||||||
                "otf" => true, // font
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
                "png" => true,
 | 
					            // check if it is a domain
 | 
				
			||||||
                "svg" => true,
 | 
					            match all_domains.lines().map(str::to_lowercase).find(|x| x==s.to_lowercase().as_str()) {
 | 
				
			||||||
                "jpg" => true,
 | 
					                Some(_) => false,
 | 
				
			||||||
                "jpeg" => true,
 | 
					                None => true
 | 
				
			||||||
                "mp4" => true,
 | 
					 | 
				
			||||||
                "mp3" => true,
 | 
					 | 
				
			||||||
                "webp" => true,
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                "pdf" => true,
 | 
					 | 
				
			||||||
                "json" => true,
 | 
					 | 
				
			||||||
                "xml" => true,
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                // IGNORE
 | 
					 | 
				
			||||||
                // TODO Should this be a list of all domains?
 | 
					 | 
				
			||||||
                "org" => false,
 | 
					 | 
				
			||||||
                "com" => false,
 | 
					 | 
				
			||||||
                "net" => false,
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                _ => {
 | 
					 | 
				
			||||||
                    warn!("Might be forgetting a file extension: {s}");
 | 
					 | 
				
			||||||
                    false
 | 
					 | 
				
			||||||
                }
 | 
					 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
        None => false,
 | 
					        None => false,
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										1444
									
								
								src/tlds-alpha-by-domain.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1444
									
								
								src/tlds-alpha-by-domain.txt
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Reference in New Issue
	
	Block a user