From 808790a7c3bfd21d0c639fdfcecf4591ff889bbd Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 21 Mar 2025 07:11:51 +0000 Subject: [PATCH] file patch; --- Crawler.toml | 2 +- src/filesystem.rs | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/Crawler.toml b/Crawler.toml index 7072e33..a23d0f4 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -3,7 +3,7 @@ surreal_url = "localhost:8000" surreal_username = "root" surreal_password = "root" surreal_ns = "test" -surreal_db = "v1.18.1" +surreal_db = "v1.19.1" # Crawler config crawl_filter = "en.wikipedia.com" diff --git a/src/filesystem.rs b/src/filesystem.rs index 578ae7e..e92b9aa 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,7 +1,7 @@ -use std::path::PathBuf; +use std::{ffi::OsStr, path::PathBuf}; use tokio::fs; -use tracing::{error, instrument, trace}; +use tracing::{error, instrument, trace, warn}; use url::Url; #[instrument(skip(data))] @@ -10,7 +10,7 @@ pub async fn store(data: &str, url: &Url) { let url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path()); // if it's a file - let (basepath, filename) = if url_path.extension().is_some() { + let (basepath, filename) = if url_path.extension().filter(valid_file_extension).is_some() { // get everything up till the file let basepath = url_path.ancestors().skip(1).take(1).collect::(); // get the file name @@ -33,3 +33,36 @@ pub async fn store(data: &str, url: &Url) { } } } + +fn valid_file_extension(take: &&OsStr) -> bool { + let los = take.to_string_lossy(); + let all = los.split('.'); + match all.last() { + Some(s) => { + match s.to_lowercase().as_str() { + "html" => true, + "css" => true, + "js" => true, + "ts" => true, + "otf" => true, // font + + "png" => true, + "svg" => true, + "jpg" => true, + "jpeg" => true, + "mp4" => true, + "mp3" => true, + "webp" => true, + + "pdf" => true, + "json" => true, + "xml" => true, + _ => { + warn!("Might be forgetting a file extension: {s}"); + false + } + } + }, + None => false, + } +}