diff --git a/Cargo.lock b/Cargo.lock
index dd31000..c55ee61 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1961,6 +1961,25 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "internet_mapper"
+version = "0.1.0"
+dependencies = [
+ "base64 0.22.1",
+ "html5ever 0.29.1",
+ "metrics",
+ "metrics-exporter-prometheus",
+ "minio",
+ "reqwest",
+ "serde",
+ "surrealdb",
+ "tokio",
+ "toml",
+ "tracing",
+ "tracing-subscriber",
+ "url",
+]
+
 [[package]]
 name = "ipnet"
 version = "2.11.0"
@@ -4112,25 +4131,6 @@ version = "2.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
 
-[[package]]
-name = "surreal_spider"
-version = "0.1.0"
-dependencies = [
- "base64 0.22.1",
- "html5ever 0.29.1",
- "metrics",
- "metrics-exporter-prometheus",
- "minio",
- "reqwest",
- "serde",
- "surrealdb",
- "tokio",
- "toml",
- "tracing",
- "tracing-subscriber",
- "url",
-]
-
 [[package]]
 name = "surrealdb"
 version = "2.2.1"
diff --git a/Cargo.toml b/Cargo.toml
index 6a1d88e..fa499e4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "surreal_spider"
+name = "internet_mapper"
 version = "0.1.0"
 edition = "2021"
 
diff --git a/Crawler.toml b/Crawler.toml
index df0a253..d536db5 100644
--- a/Crawler.toml
+++ b/Crawler.toml
@@ -3,13 +3,13 @@ surreal_url = "localhost:8000"
 surreal_username = "root"
 surreal_password = "root"
 surreal_ns = "test"
-surreal_db = "v1.15.4"
+surreal_db = "v1.17"
 
 # Minio config
-s3_bucket = "v1.15.4"
+s3_bucket = "v1.17"
 s3_url = "http://localhost:9000"
-s3_access_key = "3ptjsHhRHCHlpCmgFy9n"
-s3_secret_key = "68CmV07YExeCxb8kJhosSauEizj5CAE7PINZIfQz"
+s3_access_key = "Ok6s9uQEvKrqRoGZdacm"
+s3_secret_key = "qubeSkP787c7QZu4TvtnuwPTGIAq6ETPupCxvv6K"
 
 # Crawler config
 crawl_filter = "en.wikipedia.com"
diff --git a/README.md b/README.md
index 4832c6b..6b2890c 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,6 @@
 
 Crawls sites saving all the found links to a surrealdb database. It then proceeds to take batches of 100 uncrawled links untill the crawl budget is reached. It saves the data of each site in a minio database.
-
 
 ### TODO
 
 - [ ] Domain filtering - prevent the crawler from going on alternate versions of wikipedia.
@@ -11,5 +10,14 @@ Crawls sites saving all the found links to a surrealdb database. It then proceed
 - [x] Conditionally save content - based on filename or file contents
 - [x] GUI / TUI ? - Graphana
 - [x] Better asynchronous getting of the sites. Currently it all happens serially.
 - [ ] Allow for storing asynchronously
-3/19/25: Took 20min to crawl 100 pages
-This ment we stored 100 pages, 142,997 urls, and 1,425,798 links between the two.
+3/17/25: Took >1hr to crawl 100 pages
+
+3/19/25: Took 20min to crawl 1000 pages
+This ment we stored 1000 pages, 142,997 urls, and 1,425,798 links between the two.
+
+3/20/25: Took 5min to crawl 1000 pages
+
+# About
+
+![Screenshot](/pngs/graphana.png)
+
diff --git a/pngs/graphana.png b/pngs/graphana.png
new file mode 100644
index 0000000..70aa9a5
Binary files /dev/null and b/pngs/graphana.png differ
diff --git a/src/db.rs b/src/db.rs
index 8aae6e8..deceb27 100644
--- a/src/db.rs
+++ b/src/db.rs
@@ -10,14 +10,13 @@ use surrealdb::{
     engine::remote::ws::{Client, Ws},
     opt::auth::Root,
     sql::Thing,
-    Error::Api,
     Response, Surreal,
 };
 use tokio::sync::Mutex;
 use tracing::{error, instrument, trace, warn};
 use url::Url;
 
-use crate::{Config, Timer};
+use crate::Config;
 
 // static LOCK: LazyLock<Arc<Mutex<bool>>> = LazyLock::new(|| Arc::new(Mutex::new(true)));
 static LOCK: LazyLock<Mutex<bool>> = LazyLock::new(|| Mutex::new(true));
@@ -25,9 +24,9 @@ static LOCK: LazyLock<Mutex<bool>> = LazyLock::new(|| Mutex::new(true));
 const CUSTOM_ENGINE: engine::GeneralPurpose =
     engine::GeneralPurpose::new(&alphabet::URL_SAFE, general_purpose::NO_PAD);
 
-const TIME_SPENT_ON_LOCK: &'static str = "surql_lock_waiting_ms";
-const STORE: &'static str = "surql_store_calls";
-const LINK: &'static str = "surql_link_calls";
+const TIME_SPENT_ON_LOCK: &str = "surql_lock_waiting_ms";
+const STORE: &str = "surql_store_calls";
+const LINK: &str = "surql_link_calls";
 
 #[derive(Deserialize, Clone)]
 pub struct Website {
@@ -86,6 +85,7 @@ impl Website {
 
         domain + path
     }
+
     pub fn get_url_as_b64_path(site: &Url) -> String {
         let domain = site.domain().unwrap_or("DOMAIN").to_string();
         let path = &CUSTOM_ENGINE.encode(site.path());
@@ -104,10 +104,6 @@ impl Website {
         // let to = other.site.to_string();
         trace!("Linking {} pages to {from}", other.len());
-        let msg = format!("Linked {len} pages to {from}");
-        let timer = Timer::start(&msg);
-        // prevent the timer from being dropped instantly.
-        let _ = timer;
 
         counter!(LINK).increment(1);
         match db
             .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
@@ -121,7 +117,7 @@ impl Website {
                 let _: Response = e;
                 if let Ok(vec) = e.take(0) {
                     let _: Vec<usize> = vec;
-                    if let Some(num) = vec.get(0) {
+                    if let Some(num) = vec.first() {
                         if *num == len {
                             trace!("Link for {from} OK - {num}/{len}");
                             return;
@@ -167,13 +163,7 @@ impl Website {
            {
                Ok(mut id) => match id.take::<Vec<Thing>>(0) {
                    Ok(mut x) => things.append(&mut x),
-                   Err(err) => match err {
-                       Api(error) => {
-                           eprintln!("{:?}", error);
-                           error!("{:?}", error);
-                       }
-                       _ => error!("{:?}", err),
-                   },
+                   Err(err) => error!("{:?}", err),
                },
                Err(err) => {
                    error!("{:?}", err);
@@ -224,3 +214,4 @@ pub async fn connect(config: &Config) -> surrealdb::Result<Surreal<Client>> {
 
     Ok(db)
 }
+
diff --git a/src/main.rs b/src/main.rs
index 1d02d56..fcdd10b 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -6,7 +6,6 @@ use std::{
     fs::File,
     io::Read,
     net::{IpAddr, Ipv4Addr},
-    time::Instant,
 };
 
 use db::{connect, Website};
@@ -16,17 +15,17 @@ use s3::S3;
 use serde::Deserialize;
 use surrealdb::{engine::remote::ws::Client, Surreal};
 use tokio::task::JoinSet;
-use tracing::{debug, error, info, instrument, trace, trace_span, warn};
+use tracing::{debug, error, info, instrument, trace, trace_span};
 use tracing_subscriber::{fmt, layer::SubscriberExt, EnvFilter, Layer, Registry};
 
 mod db;
 mod parser;
 mod s3;
 
-const GET_METRIC: &'static str = "total_gets";
-const GET_IN_FLIGHT: &'static str = "gets_in_flight";
-const SITES_CRAWLED: &'static str = "pages_crawled";
-const BEING_PROCESSED: &'static str = "pages_being_processed";
+const GET_METRIC: &str = "total_gets";
+const GET_IN_FLIGHT: &str = "gets_in_flight";
+const SITES_CRAWLED: &str = "pages_crawled";
+const BEING_PROCESSED: &str = "pages_being_processed";
 
 #[derive(Deserialize)]
 struct Config {
@@ -47,8 +46,6 @@ struct Config {
 
 #[tokio::main]
 async fn main() {
-    let total_runtime = Timer::start("Completed");
-
     let writer = std::fs::OpenOptions::new()
         .append(true)
         .create(true)
@@ -110,7 +107,7 @@ async fn main() {
     let span = trace_span!("Pre-Loop");
     let pre_loop_span = span.enter();
     // Download the site
-    let site = Website::new(&starting_url, false);
+    let site = Website::new(starting_url, false);
     process(site, db.clone(), reqwest.clone(), s3.clone()).await;
     drop(pre_loop_span);
 
@@ -125,7 +122,7 @@ async fn main() {
         };
 
         let uncrawled = get_uncrawled_links(&db, get_num, config.crawl_filter.clone()).await;
-        if uncrawled.len() == 0 {
+        if uncrawled.is_empty() {
             info!("Had more budget but finished crawling everything.");
             return;
         }
@@ -146,7 +143,7 @@ async fn main() {
 
         let c = counter!(SITES_CRAWLED);
         // As futures complete runs code in while block
-        while let Some(_) = futures.join_next().await {
+        while futures.join_next().await.is_some() {
             c.increment(1);
             gauge!(BEING_PROCESSED).decrement(1);
             crawled += 1;
@@ -156,7 +153,6 @@ async fn main() {
     drop(span);
 
     debug!("Done");
-    drop(total_runtime);
 }
 
 #[instrument(skip(db, s3, reqwest))]
async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
     // METRICS
     trace!("Process: {}", &site.site);
 
-    let timer = Timer::start("Built request");
     // Build the request
-    let request_builder = reqwest.get(&site.site.to_string());
-    // METRICS
-    timer.stop();
+    let request_builder = reqwest.get(site.site.to_string());
 
     // METRICS
     let g = gauge!(GET_IN_FLIGHT);
     g.increment(1);
 
-    let timer = Timer::start("Got page");
     // Send the http request (get)
     if let Ok(response) = request_builder.send().await {
         // METRICS
-        timer.stop();
         g.decrement(1);
 
         counter!(GET_METRIC).increment(1);
@@ -198,14 +189,14 @@
 
         // update self in db
         site.set_crawled();
-        Website::store_all(vec![site.clone()], &db).await;
+        Website::store_all(vec![site], &db).await;
 
         // Store all the other sites so that we can link to them.
         // let mut links_to = Vec::new();
-        let others = Website::store_all(sites, &db).await;
+        let _ = Website::store_all(sites, &db).await;
 
         // Make the database's links reflect the html links between sites
-        site.links_to(others, &db).await;
+        // site.links_to(others, &db).await;
     } else {
         error!("Failed to get: {}", &site.site);
     }
@@ -234,33 +225,3 @@ async fn get_uncrawled_links(
         .expect("Returned websites couldn't be parsed")
 }
 
-pub struct Timer<'a> {
-    start: Instant,
-    msg: &'a str,
-}
-
-impl<'a> Timer<'a> {
-    #[inline]
-    pub fn start(msg: &'a str) -> Self {
-        Self {
-            start: Instant::now(),
-            msg,
-        }
-    }
-    pub fn stop(&self) -> f64 {
-        let dif = self.start.elapsed().as_micros();
-        let ms = dif as f64 / 1000.;
-
-        if ms > 200. {
-            warn!("{}", format!("{} in {:.3}ms", self.msg, ms));
-        }
-
-        ms
-    }
-}
-
-impl Drop for Timer<'_> {
-    fn drop(&mut self) {
-        self.stop();
-    }
-}
diff --git a/src/parser.rs b/src/parser.rs
index fc6d5fe..3fc1276 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -8,7 +8,6 @@ use html5ever::{local_name, tendril::*};
 use tracing::instrument;
 
 use crate::db::Website;
-use crate::Timer;
 
 impl TokenSink for Website {
     type Handle = Vec<Website>;
@@ -69,12 +68,11 @@ impl TokenSink for Website {
 pub async fn parse(site: &Website, data: &str) -> Vec<Website> {
     // prep work
     let mut other_sites: Vec<Website> = Vec::new();
-    let _t = Timer::start("Parsed page");
     // change data into something that can be tokenized
-    let chunk = Tendril::from_str(&data).expect("Failed to parse string into Tendril!");
+    let chunk = Tendril::from_str(data).expect("Failed to parse string into Tendril!");
     // create buffer of tokens and push our input into it
-    let mut token_buffer = BufferQueue::default();
+    let token_buffer = BufferQueue::default();
     token_buffer.push_back(
         chunk
             .try_reinterpret::<fmt::UTF8>()
             .expect("Failed to reinterpret chunk!"),
@@ -84,7 +82,7 @@ pub async fn parse(site: &Website, data: &str) -> Vec<Website> {
     let tokenizer = Tokenizer::new(site.clone(), TokenizerOpts::default());
 
     // go thru buffer
-    while let TokenizerResult::Script(mut sites) = tokenizer.feed(&mut token_buffer) {
+    while let TokenizerResult::Script(mut sites) = tokenizer.feed(&token_buffer) {
         other_sites.append(&mut sites);
         // other_sites.push(sites);
     }
@@ -93,3 +91,4 @@ pub async fn parse(site: &Website, data: &str) -> Vec<Website> {
 
     other_sites
 }
+
diff --git a/src/s3.rs b/src/s3.rs
index f984606..4efc2b4 100644
--- a/src/s3.rs
+++ b/src/s3.rs
@@ -10,9 +10,9 @@ use minio::s3::{
 use tracing::{instrument, trace, warn};
 use url::Url;
 
-use crate::{db::Website, Config, Timer};
+use crate::{db::Website, Config};
 
-const S3_ROUND_TRIP_METRIC: &'static str = "s3_trips";
+const S3_ROUND_TRIP_METRIC: &str = "s3_trips";
 
 #[derive(Clone)]
 pub struct S3 {
@@ -65,14 +65,12 @@ impl S3 {
     #[instrument(name = "s3_store", skip_all)]
     pub async fn store(&self, data: &str, url: &Url) {
         let counter = counter!(S3_ROUND_TRIP_METRIC);
-        let t = Timer::start("Stored page");
-        let _ = t; // prevent compiler drop
 
         let filename = Website::get_url_as_string(url);
         trace!("Storing {} as {filename}", url.to_string());
         counter.increment(1);
 
-        let _ = match &self
+        match &self
             .client
             .put_object_content(&self.bucket_name, &filename, data.to_owned())
             .send()
@@ -99,3 +97,4 @@
         };
     }
 }
+