From add6f00ed6dc1b61fc1fffbb61af539fd92f7a7b Mon Sep 17 00:00:00 2001
From: Rushmore75
Date: Mon, 31 Mar 2025 14:53:10 -0600
Subject: [PATCH] no recomp needed

---
 Crawler.toml      |  7 ++++---
 README.md         |  5 +++--
 src/filesystem.rs |  7 +++++++
 src/main.rs       | 16 ++++++++++++++--
 4 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/Crawler.toml b/Crawler.toml
index 8940c1b..0048238 100644
--- a/Crawler.toml
+++ b/Crawler.toml
@@ -3,8 +3,9 @@ surreal_url = "localhost:8000"
 surreal_username = "root"
 surreal_password = "root"
 surreal_ns = "test"
-surreal_db = "v1.19.2"
+surreal_db = "v1.19.5"
 
 # Crawler config
-crawl_filter = "en.wikipedia.com"
-budget = 1000
+crawl_filter = "en.wikipedia.org"
+start_url = "https://en.wikipedia.org"
+budget = 100
diff --git a/README.md b/README.md
index 230af74..a223089 100644
--- a/README.md
+++ b/README.md
@@ -4,11 +4,12 @@ Crawls sites saving all the found links to a surrealdb database. It then proceed
 
 ### TODO
 
-- [ ] Domain filtering - prevent the crawler from going on alternate versions of wikipedia.
+- [x] Domain filtering - prevent the crawler from going on alternate versions of wikipedia.
 - [ ] Conditionally save content - based on filename or file contents
 - [x] GUI / TUI ? - Graphana
 - [x] Better asynchronous getting of the sites. Currently it all happens serially.
-- [ ] Allow for storing asynchronously
+- [x] Allow for storing asynchronously - dropping the "links to" logic fixes this need
+- [x] Control crawler via config file (no recompilation needed)
 
 3/17/25: Took >1hr to crawl 100 pages
 
diff --git a/src/filesystem.rs b/src/filesystem.rs
index 46c1af3..5a253f8 100644
--- a/src/filesystem.rs
+++ b/src/filesystem.rs
@@ -59,6 +59,13 @@ fn valid_file_extension(take: &&OsStr) -> bool {
         "pdf" => true,
         "json" => true,
         "xml" => true,
+
+        // IGNORE
+        // TODO Should this be a list of all domains?
+        "org" => false,
+        "com" => false,
+        "net" => false,
+
         _ => {
             warn!("Might be forgetting a file extension: {s}");
             false
diff --git a/src/main.rs b/src/main.rs
index 9f20638..dc13d1e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -33,11 +33,14 @@ struct Config {
     surreal_password: String,
 
     crawl_filter: String,
+    start_url: String,
     budget: usize,
 }
 
 #[tokio::main]
 async fn main() {
+    println!("Logs and metrics are provided to the Grafana dashboard");
+
     let writer = std::fs::OpenOptions::new()
         .append(true)
         .create(true)
@@ -70,8 +73,7 @@ async fn main() {
         .expect("failed to install recorder/exporter");
 
     info!("Starting...");
-    // Would probably take these in as parameters from a cli
-    let starting_url = "https://en.wikipedia.org/";
+
     // When getting uncrawled pages, name must contain this variable. "" will effectively get ignored.
     // let crawl_filter = "en.wikipedia.org/";
     // let budget = 50;
@@ -82,6 +84,7 @@ async fn main() {
     let _ = file.read_to_string(&mut buf);
 
     let config: Config = toml::from_str(&buf).expect("Failed to parse Crawler.toml");
+    let starting_url = &config.start_url;
 
     let db = connect(&config)
         .await
@@ -138,6 +141,15 @@ async fn main() {
     }
     drop(span);
 
+    if let Ok(mut ok) = db.query("count(select id from website where crawled = true)").await {
+        let res = ok.take::<Option<usize>>(0);
+        if let Ok(i) = res {
+            if let Some(n) = i {
+                info!("Total crawled pages now equals {n}");
+            }
+        }
+    }
+
     info!("Done");
 }
 