file patch;
This commit is contained in:
parent
2de01b2a0e
commit
808790a7c3
@ -3,7 +3,7 @@ surreal_url = "localhost:8000"
|
|||||||
surreal_username = "root"
|
surreal_username = "root"
|
||||||
surreal_password = "root"
|
surreal_password = "root"
|
||||||
surreal_ns = "test"
|
surreal_ns = "test"
|
||||||
surreal_db = "v1.18.1"
|
surreal_db = "v1.19.1"
|
||||||
|
|
||||||
# Crawler config
|
# Crawler config
|
||||||
crawl_filter = "en.wikipedia.com"
|
crawl_filter = "en.wikipedia.com"
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::path::PathBuf;
|
use std::{ffi::OsStr, path::PathBuf};
|
||||||
|
|
||||||
use tokio::fs;
|
use tokio::fs;
|
||||||
use tracing::{error, instrument, trace};
|
use tracing::{error, instrument, trace, warn};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
#[instrument(skip(data))]
|
#[instrument(skip(data))]
|
||||||
@ -10,7 +10,7 @@ pub async fn store(data: &str, url: &Url) {
|
|||||||
let url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path());
|
let url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path());
|
||||||
|
|
||||||
// if it's a file
|
// if it's a file
|
||||||
let (basepath, filename) = if url_path.extension().is_some() {
|
let (basepath, filename) = if url_path.extension().filter(valid_file_extension).is_some() {
|
||||||
// get everything up till the file
|
// get everything up till the file
|
||||||
let basepath = url_path.ancestors().skip(1).take(1).collect::<PathBuf>();
|
let basepath = url_path.ancestors().skip(1).take(1).collect::<PathBuf>();
|
||||||
// get the file name
|
// get the file name
|
||||||
@ -33,3 +33,36 @@ pub async fn store(data: &str, url: &Url) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn valid_file_extension(take: &&OsStr) -> bool {
|
||||||
|
let los = take.to_string_lossy();
|
||||||
|
let all = los.split('.');
|
||||||
|
match all.last() {
|
||||||
|
Some(s) => {
|
||||||
|
match s.to_lowercase().as_str() {
|
||||||
|
"html" => true,
|
||||||
|
"css" => true,
|
||||||
|
"js" => true,
|
||||||
|
"ts" => true,
|
||||||
|
"otf" => true, // font
|
||||||
|
|
||||||
|
"png" => true,
|
||||||
|
"svg" => true,
|
||||||
|
"jpg" => true,
|
||||||
|
"jpeg" => true,
|
||||||
|
"mp4" => true,
|
||||||
|
"mp3" => true,
|
||||||
|
"webp" => true,
|
||||||
|
|
||||||
|
"pdf" => true,
|
||||||
|
"json" => true,
|
||||||
|
"xml" => true,
|
||||||
|
_ => {
|
||||||
|
warn!("Might be forgetting a file extension: {s}");
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
None => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user