file patch;

This commit is contained in:
Oliver 2025-03-21 07:11:51 +00:00
parent 2de01b2a0e
commit 808790a7c3
2 changed files with 37 additions and 4 deletions

View File

@ -3,7 +3,7 @@ surreal_url = "localhost:8000"
surreal_username = "root"
surreal_password = "root"
surreal_ns = "test"
surreal_db = "v1.18.1"
surreal_db = "v1.19.1"
# Crawler config
crawl_filter = "en.wikipedia.com"

View File

@ -1,7 +1,7 @@
use std::path::PathBuf;
use std::{ffi::OsStr, path::PathBuf};
use tokio::fs;
use tracing::{error, instrument, trace};
use tracing::{error, instrument, trace, warn};
use url::Url;
#[instrument(skip(data))]
@ -10,7 +10,7 @@ pub async fn store(data: &str, url: &Url) {
let url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path());
// if it's a file
let (basepath, filename) = if url_path.extension().is_some() {
let (basepath, filename) = if url_path.extension().filter(valid_file_extension).is_some() {
// get everything up till the file
let basepath = url_path.ancestors().skip(1).take(1).collect::<PathBuf>();
// get the file name
@ -33,3 +33,36 @@ pub async fn store(data: &str, url: &Url) {
}
}
}
/// Returns `true` when `take` names one of the file extensions that should be treated as a
/// real file, compared case-insensitively.
///
/// Anything that is not on the list is reported through `warn!` (so a missing entry is easy
/// to spot in the logs) and rejected with `false`.
///
/// The `&&OsStr` parameter lets this function be handed directly to `Option::filter` on the
/// `Option<&OsStr>` produced by `Path::extension()`.
fn valid_file_extension(take: &&OsStr) -> bool {
    /// Accepted extensions, all lowercase ASCII.
    const KNOWN_EXTENSIONS: [&str; 15] = [
        "html", "css", "js", "ts", "otf", // otf: font
        "png", "svg", "jpg", "jpeg", "mp4", "mp3", "webp", "pdf", "json", "xml",
    ];

    // Non-UTF-8 bytes are replaced lossily; such an extension simply will not match.
    let lossy = take.to_string_lossy();

    // Only the text after the final '.' is considered. `Path::extension()` never contains a
    // dot, but a dotted input is tolerated by looking at its last segment.
    let ext = lossy.rsplit_once('.').map_or(&*lossy, |(_, tail)| tail);

    // Every known extension is ASCII, so an ASCII case-insensitive comparison is sufficient
    // and avoids allocating a lowercased copy.
    if KNOWN_EXTENSIONS
        .iter()
        .any(|known| ext.eq_ignore_ascii_case(known))
    {
        return true;
    }

    warn!("Might be forgetting a file extension: {ext}");
    false
}