From 66581cc4538f22998161e2b2a39348909e764588 Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 21 Mar 2025 05:59:40 +0000 Subject: [PATCH 01/14] getting there --- .gitignore | 3 +- Crawler.toml | 8 +--- docker/prometheus.yaml | 3 +- src/db.rs | 90 ++----------------------------------- src/filesystem.rs | 24 ++++++++++ src/main.rs | 57 +++++++++++------------ src/parser.rs | 61 ++++++++++++++++++------- src/s3.rs | 100 ----------------------------------------- 8 files changed, 101 insertions(+), 245 deletions(-) create mode 100644 src/filesystem.rs delete mode 100644 src/s3.rs diff --git a/.gitignore b/.gitignore index dbaa546..87fb73a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ perf.data flamegraph.svg perf.data.old -/docker/logs/* \ No newline at end of file +/docker/logs/* +/downloaded \ No newline at end of file diff --git a/Crawler.toml b/Crawler.toml index d536db5..7072e33 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -3,13 +3,7 @@ surreal_url = "localhost:8000" surreal_username = "root" surreal_password = "root" surreal_ns = "test" -surreal_db = "v1.17" - -# Minio config -s3_bucket = "v1.17" -s3_url = "http://localhost:9000" -s3_access_key = "Ok6s9uQEvKrqRoGZdacm" -s3_secret_key = "qubeSkP787c7QZu4TvtnuwPTGIAq6ETPupCxvv6K" +surreal_db = "v1.18.1" # Crawler config crawl_filter = "en.wikipedia.com" diff --git a/docker/prometheus.yaml b/docker/prometheus.yaml index cb43a89..9133e15 100644 --- a/docker/prometheus.yaml +++ b/docker/prometheus.yaml @@ -7,7 +7,8 @@ scrape_configs: static_configs: # change this your machine's ip, localhost won't work # because localhost refers to the docker container. - - targets: ['172.20.239.48:2500'] + # - targets: ['172.20.239.48:2500'] + - targets: ['192.168.8.209:2500'] - job_name: loki static_configs: - targets: ['loki:3100'] diff --git a/src/db.rs b/src/db.rs index deceb27..03a1850 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,16 +1,11 @@ -use base64::{ - alphabet, - engine::{self, general_purpose}, - Engine, -}; use metrics::counter; -use serde::{ser::SerializeStruct, Deserialize, Serialize}; +use serde::{Deserialize, Serialize}; use std::{fmt::Debug, sync::LazyLock, time::Instant}; use surrealdb::{ engine::remote::ws::{Client, Ws}, opt::auth::Root, sql::Thing, - Response, Surreal, + Surreal, }; use tokio::sync::Mutex; use tracing::{error, instrument, trace, warn}; @@ -18,17 +13,12 @@ use url::Url; use crate::Config; -// static LOCK: LazyLock>> = LazyLock::new(|| Arc::new(Mutex::new(true))); static LOCK: LazyLock> = LazyLock::new(|| Mutex::new(true)); -const CUSTOM_ENGINE: engine::GeneralPurpose = - engine::GeneralPurpose::new(&alphabet::URL_SAFE, general_purpose::NO_PAD); - const TIME_SPENT_ON_LOCK: &str = "surql_lock_waiting_ms"; const STORE: &str = "surql_store_calls"; -const LINK: &str = "surql_link_calls"; -#[derive(Deserialize, Clone)] +#[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Hash)] pub struct Website { /// The url that this data is found at pub site: Url, @@ -36,18 +26,6 @@ pub struct Website { pub crawled: bool, } -impl Serialize for Website { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer { - let mut state = serializer.serialize_struct("Website", 2)?; - state.serialize_field("crawled", &self.crawled)?; - // to_string() calls the correct naming of site - state.serialize_field("site", &self.site.to_string())?; - state.end() - } -} - // manual impl to make tracing look nicer impl Debug for Website { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -73,68 +51,6 
@@ impl Website { self.crawled = true } - pub fn get_url_as_string(site: &Url) -> String { - let domain = match site.domain() { - Some(s) => s.to_string(), - None => { - warn!("Failed to get domain of URL: {}, falling back to 'localhost'", site.to_string()); - "localhost".to_string() - } - }; - let path = site.path(); - - domain + path - } - - pub fn get_url_as_b64_path(site: &Url) -> String { - let domain = site.domain().unwrap_or("DOMAIN").to_string(); - let path = &CUSTOM_ENGINE.encode(site.path()); - - domain + path - } - - #[instrument(skip_all)] - pub async fn links_to(&self, other: Vec, db: &Surreal) { - let len = other.len(); - if len == 0 { - return; - } - - let from = &self.site; - - // let to = other.site.to_string(); - trace!("Linking {} pages to {from}", other.len()); - counter!(LINK).increment(1); - match db - .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)") - .bind(("in", from.clone())) - .bind(("out", other)) - .await - { - Ok(mut e) => { - // The relate could technically "fail" (not relate anything), this just means that - // the query was ok. - let _: Response = e; - if let Ok(vec) = e.take(0) { - let _: Vec = vec; - if let Some(num) = vec.first() { - if *num == len { - trace!("Link for {from} OK - {num}/{len}"); - return; - } else { - error!("Didn't link all the records. {num}/{len}. Surreal response: {:?}", e); - return; - } - } - } - error!("Linking request succeeded but couldn't verify the results."); - } - Err(e) => { - error!("{}", e.to_string()); - } - } - } - // Insert ever item in the vec into surreal, crawled state will be preserved as TRUE // if already in the database as such or incoming data is TRUE. pub async fn store_all(all: Vec, db: &Surreal) -> Vec { diff --git a/src/filesystem.rs b/src/filesystem.rs new file mode 100644 index 0000000..9a3f761 --- /dev/null +++ b/src/filesystem.rs @@ -0,0 +1,24 @@ +use std::path::PathBuf; + +use tokio::fs; +use tracing::{error, instrument, trace}; +use url::Url; + +#[instrument(skip(data))] +pub async fn store(data: &str, url: &Url) { + let path = PathBuf::from("./downloaded".to_string() + url.path()); + let basepath = path.ancestors().skip(1).take(1).collect::(); + trace!("Built path: {:?} and base path: {:?}", &path, &basepath); + if let Err(err) = fs::create_dir_all(&basepath).await { + let ex = path.ancestors().fold(String::new(), |mut s, item| { + s += ", "; + s += &item.to_string_lossy().to_string(); + s + }); + error!("Dir creation: {err} {:?} {ex}", basepath); + } else { + if let Err(err) = fs::write(&path, data).await { + error!("File creation: {err} {:?}", path); + } + } +} diff --git a/src/main.rs b/src/main.rs index fcdd10b..c5c96cd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,24 +3,21 @@ extern crate html5ever; use std::{ - fs::File, - io::Read, - net::{IpAddr, Ipv4Addr}, + collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr}, str::FromStr, time }; use db::{connect, Website}; use metrics::{counter, gauge}; use metrics_exporter_prometheus::PrometheusBuilder; -use s3::S3; use serde::Deserialize; use surrealdb::{engine::remote::ws::Client, Surreal}; use tokio::task::JoinSet; -use tracing::{debug, error, info, instrument, trace, trace_span}; +use tracing::{debug, error, info, instrument, level_filters::LevelFilter, trace, trace_span}; use tracing_subscriber::{fmt, layer::SubscriberExt, EnvFilter, Layer, Registry}; mod db; mod parser; -mod s3; +mod filesystem; const GET_METRIC: &str = "total_gets"; const GET_IN_FLIGHT: &str = "gets_in_flight"; @@ -35,11 +32,6 
@@ struct Config { surreal_username: String, surreal_password: String, - s3_url: String, - s3_bucket: String, - s3_access_key: String, - s3_secret_key: String, - crawl_filter: String, budget: usize, } @@ -52,7 +44,9 @@ async fn main() { .open("./docker/logs/tracing.log") .expect("Couldn't make log file!"); - let filter = EnvFilter::from_default_env(); + let filter = EnvFilter::builder() + .with_default_directive(LevelFilter::DEBUG.into()) + .from_env_lossy(); let registry = Registry::default().with( fmt::layer() @@ -75,7 +69,7 @@ async fn main() { .install() .expect("failed to install recorder/exporter"); - debug!("Starting..."); + info!("Starting..."); // Would probably take these in as parameters from a cli let starting_url = "https://en.wikipedia.org/"; // When getting uncrawled pages, name must contain this variable. "" will effectively get ignored. @@ -92,9 +86,6 @@ async fn main() { let db = connect(&config) .await .expect("Failed to connect to surreal, aborting."); - let s3 = S3::connect(&config) - .await - .expect("Failed to connect to minio, aborting.\n\nThis probably means you need to login to the minio console and get a new access key!\n\n(Probably here) http://localhost:9001/access-keys/new-account\n\n"); let reqwest = reqwest::Client::builder() // .use_rustls_tls() @@ -108,7 +99,7 @@ async fn main() { let pre_loop_span = span.enter(); // Download the site let site = Website::new(starting_url, false); - process(site, db.clone(), reqwest.clone(), s3.clone()).await; + process(site, db.clone(), reqwest.clone()).await; drop(pre_loop_span); @@ -126,20 +117,15 @@ async fn main() { info!("Had more budget but finished crawling everything."); return; } - debug!("Crawling {} pages...", uncrawled.len()); - - let span = trace_span!("Crawling"); - let _ = span.enter(); { let mut futures = JoinSet::new(); for site in uncrawled { gauge!(BEING_PROCESSED).increment(1); - futures.spawn(process(site, db.clone(), reqwest.clone(), s3.clone())); + futures.spawn(process(site, db.clone(), reqwest.clone())); // let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32); // info!("Crawled {crawled} out of {budget} pages. ({percent})"); } - debug!("Joining {} futures...", futures.len()); let c = counter!(SITES_CRAWLED); // As futures complete runs code in while block @@ -152,13 +138,13 @@ async fn main() { } drop(span); - debug!("Done"); + info!("Done"); } -#[instrument(skip(db, s3, reqwest))] +#[instrument(skip(db, reqwest))] /// Downloads and crawls and stores a webpage. /// It is acceptable to clone `db`, `reqwest`, and `s3` because they all use `Arc`s internally. - Noted by Oliver -async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Client, s3: S3) { +async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Client) { // METRICS trace!("Process: {}", &site.site); @@ -182,7 +168,7 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien .await .expect("Failed to read http response's body!"); // Store document - s3.store(&data, &site.site).await; + filesystem::store(&data, &site.site).await; // Parse document and get relationships let sites = parser::parse(&site, &data).await; @@ -191,12 +177,19 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien site.set_crawled(); Website::store_all(vec![site], &db).await; - // Store all the other sites so that we can link to them. 
- // let mut links_to = Vec::new(); - let _ = Website::store_all(sites, &db).await; + // De-duplicate this list + let prev_len = sites.len(); + let set = sites.into_iter().fold(HashSet::new(), |mut set,item| { + set.insert(item); + set + }); + let de_dupe_sites: Vec = set.into_iter().collect(); + let diff = prev_len - de_dupe_sites.len(); + trace!("Saved {diff} from being entered into the db by de-duping"); + + // Store all the other sites so that we can link to them. + let _ = Website::store_all(de_dupe_sites, &db).await; - // Make the database's links reflect the html links between sites - // site.links_to(others, &db).await; } else { error!("Failed to get: {}", &site.site); } diff --git a/src/parser.rs b/src/parser.rs index 3fc1276..49001e0 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -5,18 +5,22 @@ use html5ever::tokenizer::{BufferQueue, TokenizerResult}; use html5ever::tokenizer::{StartTag, TagToken}; use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; use html5ever::{local_name, tendril::*}; -use tracing::instrument; +use tracing::{debug, error, info, instrument, trace, warn}; +use url::Url; use crate::db::Website; impl TokenSink for Website { type Handle = Vec; + #[instrument(skip(token, _line_number))] fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult { match token { TagToken(tag) => { if tag.kind == StartTag { match tag.name { + // this should be all the html + // elements that have links local_name!("a") | local_name!("audio") | local_name!("area") @@ -31,23 +35,46 @@ impl TokenSink for Website { let attr_name = attr.name.local.to_string(); if attr_name == "src" || attr_name == "href" || attr_name == "data" { - // Get clone of the current site object - let mut web = self.clone(); - - // Set url - let mut url = web.site; - url.set_fragment(None); // removes #xyz - let joined = url - .join(&attr.value) - .expect("Failed to join url during parsing!"); - web.site = joined; - - web.crawled = false; - - links.push(web); + let url: Option = match Url::parse(&attr.value) { + Ok(ok) => { + trace!("Found `{}` in the html on `{}` tag", ok.to_string(), tag.name); + Some(ok) + }, + Err(e) => { + if attr.value.eq(&Tendril::from_char('#')) { + trace!("Rejecting # url"); + None + } else { + match e { + url::ParseError::RelativeUrlWithoutBase => { + let origin = self.site.origin().ascii_serialization(); + let url = origin.clone() + &attr.value; + trace!("Built `{url}` from `{origin} + {}`", &attr.value.to_string()); + if let Ok(url) = Url::parse(&url) { + trace!("Saved relative url `{}` AS: `{}`", &attr.value, url); + Some(url) + } else { + error!("Failed to reconstruct a url from relative url: `{}` on site: `{}`", &attr.value, self.site.to_string()); + None + } + }, + _ => { + error!("MISC error: {:?} {:?}", e, &attr.value); + None + }, + } + } + }, + }; + if let Some(mut parsed) = url { + parsed.set_query(None); + parsed.set_fragment(None); + debug!("Final cleaned URL: `{}`", parsed.to_string()); + let web = Website::new(&parsed.to_string(), false); + links.push(web); + } } } - return TokenSinkResult::Script(links); } local_name!("button") | local_name!("meta") | local_name!("iframe") => { @@ -76,7 +103,7 @@ pub async fn parse(site: &Website, data: &str) -> Vec { token_buffer.push_back( chunk .try_reinterpret::() - .expect("Failed to reinterprt chunk!"), + .expect("Failed to reinterpret chunk!"), ); // create the tokenizer let tokenizer = Tokenizer::new(site.clone(), TokenizerOpts::default()); diff --git a/src/s3.rs b/src/s3.rs 
deleted file mode 100644 index 4efc2b4..0000000 --- a/src/s3.rs +++ /dev/null @@ -1,100 +0,0 @@ -use metrics::counter; -use minio::s3::{ - args::{BucketExistsArgs, MakeBucketArgs}, - client::ClientBuilder, - creds::StaticProvider, - error::Error, - http::BaseUrl, - Client, -}; -use tracing::{instrument, trace, warn}; -use url::Url; - -use crate::{db::Website, Config}; - -const S3_ROUND_TRIP_METRIC: &str = "s3_trips"; - -#[derive(Clone)] -pub struct S3 { - bucket_name: String, - client: Client, -} - -impl S3 { - #[instrument(skip_all, name = "S3")] - pub async fn connect(config: &Config) -> Result { - let base_url = config - .s3_url - .parse::() - .expect("Failed to parse url into BaseUrl"); - - let static_provider = - StaticProvider::new(&config.s3_access_key, &config.s3_secret_key, None); - - let client = ClientBuilder::new(base_url) - .provider(Some(Box::new(static_provider))) - .build()?; - - trace!("Checking bucket..."); - let exists = client - .bucket_exists( - &BucketExistsArgs::new(&config.s3_bucket) - .expect("Failed to check if bucket exists"), - ) - .await?; - counter!(S3_ROUND_TRIP_METRIC).increment(1); - - if !exists { - trace!("Creating bucket..."); - client - .make_bucket( - &MakeBucketArgs::new(&config.s3_bucket).expect("Failed to create bucket!"), - ) - .await?; - } - counter!(S3_ROUND_TRIP_METRIC).increment(1); - - trace!("Connection successful"); - - Ok(Self { - bucket_name: config.s3_bucket.to_owned(), - client, - }) - } - - #[instrument(name = "s3_store", skip_all)] - pub async fn store(&self, data: &str, url: &Url) { - let counter = counter!(S3_ROUND_TRIP_METRIC); - - let filename = Website::get_url_as_string(url); - trace!("Storing {} as {filename}", url.to_string()); - - counter.increment(1); - match &self - .client - .put_object_content(&self.bucket_name, &filename, data.to_owned()) - .send() - .await - { - Ok(_) => {} - Err(err) => match err { - Error::InvalidObjectName(_) => { - // This code will really only run if the url has non-english chars - warn!("Tried storing invalid object name, retrying with Base64 encoding. 
Last try."); - - let filename: String = Website::get_url_as_b64_path(url); - - counter.increment(1); - let _ = &self - .client - .put_object_content(&self.bucket_name, &filename, data.to_owned()) - .send() - .await - .unwrap(); - } - _ => {} - }, - }; - } -} - From a23429104c50260af8e304c53629c9008949c306 Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 21 Mar 2025 06:03:34 +0000 Subject: [PATCH 02/14] dead code removal --- src/db.rs | 2 +- src/filesystem.rs | 2 +- src/main.rs | 2 +- src/parser.rs | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/db.rs b/src/db.rs index 03a1850..b5e913a 100644 --- a/src/db.rs +++ b/src/db.rs @@ -8,7 +8,7 @@ use surrealdb::{ Surreal, }; use tokio::sync::Mutex; -use tracing::{error, instrument, trace, warn}; +use tracing::{error, instrument, trace}; use url::Url; use crate::Config; diff --git a/src/filesystem.rs b/src/filesystem.rs index 9a3f761..d95af58 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -8,7 +8,7 @@ use url::Url; pub async fn store(data: &str, url: &Url) { let path = PathBuf::from("./downloaded".to_string() + url.path()); let basepath = path.ancestors().skip(1).take(1).collect::(); - trace!("Built path: {:?} and base path: {:?}", &path, &basepath); + trace!("Save path: {:?} and base path: {:?}", &path, &basepath); if let Err(err) = fs::create_dir_all(&basepath).await { let ex = path.ancestors().fold(String::new(), |mut s, item| { s += ", "; diff --git a/src/main.rs b/src/main.rs index c5c96cd..399fe0a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,7 +3,7 @@ extern crate html5ever; use std::{ - collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr}, str::FromStr, time + collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr} }; use db::{connect, Website}; diff --git a/src/parser.rs b/src/parser.rs index 49001e0..ac55bf0 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -5,7 +5,7 @@ use html5ever::tokenizer::{BufferQueue, TokenizerResult}; use html5ever::tokenizer::{StartTag, TagToken}; use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; use html5ever::{local_name, tendril::*}; -use tracing::{debug, error, info, instrument, trace, warn}; +use tracing::{debug, error, instrument, trace, warn}; use url::Url; use crate::db::Website; @@ -41,7 +41,7 @@ impl TokenSink for Website { Some(ok) }, Err(e) => { - if attr.value.eq(&Tendril::from_char('#')) { + if attr.value.starts_with('#') { trace!("Rejecting # url"); None } else { From be0fd5505b31367c2a7530520fe1403db1c0f4a0 Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 21 Mar 2025 06:48:17 +0000 Subject: [PATCH 03/14] i think the files work better --- src/filesystem.rs | 36 ++++++++++++++++++++++++------------ src/main.rs | 2 ++ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/filesystem.rs b/src/filesystem.rs index d95af58..8ecfd49 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,24 +1,36 @@ use std::path::PathBuf; +use reqwest::header::HeaderValue; use tokio::fs; use tracing::{error, instrument, trace}; use url::Url; #[instrument(skip(data))] pub async fn store(data: &str, url: &Url) { - let path = PathBuf::from("./downloaded".to_string() + url.path()); - let basepath = path.ancestors().skip(1).take(1).collect::(); - trace!("Save path: {:?} and base path: {:?}", &path, &basepath); - if let Err(err) = fs::create_dir_all(&basepath).await { - let ex = path.ancestors().fold(String::new(), |mut s, item| { - s += ", "; - s += &item.to_string_lossy().to_string(); - s - }); - 
error!("Dir creation: {err} {:?} {ex}", basepath); + // extract data from url to save it accurately + let url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path()); + + // if it's a file + let (basepath, filename) = if url_path.extension().is_some() { + // get everything up till the file + let basepath = url_path.ancestors().skip(1).take(1).collect::(); + // get the file name + let filename = url_path.file_name().expect("This should exist").to_string_lossy(); + trace!("Save path: {:?} and base path: {:?}", &url_path, &basepath); + (basepath, filename.to_string()) } else { - if let Err(err) = fs::write(&path, data).await { - error!("File creation: {err} {:?}", path); + (url_path.clone(), "index.html".into()) + }; + + // create the folders + if let Err(err) = fs::create_dir_all(&basepath).await { + error!("Dir creation: {err} {:?}", basepath); + } else { + // FIXME I don't think this handles index.html files well... + // TODO this should probably append .html to non-described files + // create the file if that was successful + if let Err(err) = fs::write(&basepath.join(filename), data).await { + error!("File creation: {err} {:?}", url_path); } } } diff --git a/src/main.rs b/src/main.rs index 399fe0a..f93dac5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ use std::{ }; use db::{connect, Website}; +use filesystem::FileType; use metrics::{counter, gauge}; use metrics_exporter_prometheus::PrometheusBuilder; use serde::Deserialize; @@ -167,6 +168,7 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien .text() .await .expect("Failed to read http response's body!"); + // Store document filesystem::store(&data, &site.site).await; From 2de01b2a0e512fa25e798eec586766e782a725d4 Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 21 Mar 2025 06:48:39 +0000 Subject: [PATCH 04/14] remove removed code --- src/filesystem.rs | 1 - src/main.rs | 1 - 2 files changed, 2 deletions(-) diff --git a/src/filesystem.rs b/src/filesystem.rs index 8ecfd49..578ae7e 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,6 +1,5 @@ use std::path::PathBuf; -use reqwest::header::HeaderValue; use tokio::fs; use tracing::{error, instrument, trace}; use url::Url; diff --git a/src/main.rs b/src/main.rs index f93dac5..9f20638 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,7 +7,6 @@ use std::{ }; use db::{connect, Website}; -use filesystem::FileType; use metrics::{counter, gauge}; use metrics_exporter_prometheus::PrometheusBuilder; use serde::Deserialize; From 808790a7c3bfd21d0c639fdfcecf4591ff889bbd Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 21 Mar 2025 07:11:51 +0000 Subject: [PATCH 05/14] file patch; --- Crawler.toml | 2 +- src/filesystem.rs | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/Crawler.toml b/Crawler.toml index 7072e33..a23d0f4 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -3,7 +3,7 @@ surreal_url = "localhost:8000" surreal_username = "root" surreal_password = "root" surreal_ns = "test" -surreal_db = "v1.18.1" +surreal_db = "v1.19.1" # Crawler config crawl_filter = "en.wikipedia.com" diff --git a/src/filesystem.rs b/src/filesystem.rs index 578ae7e..e92b9aa 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,7 +1,7 @@ -use std::path::PathBuf; +use std::{ffi::OsStr, path::PathBuf}; use tokio::fs; -use tracing::{error, instrument, trace}; +use tracing::{error, instrument, trace, warn}; use url::Url; #[instrument(skip(data))] @@ -10,7 +10,7 @@ pub async fn 
store(data: &str, url: &Url) { let url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path()); // if it's a file - let (basepath, filename) = if url_path.extension().is_some() { + let (basepath, filename) = if url_path.extension().filter(valid_file_extension).is_some() { // get everything up till the file let basepath = url_path.ancestors().skip(1).take(1).collect::(); // get the file name @@ -33,3 +33,36 @@ pub async fn store(data: &str, url: &Url) { } } } + +fn valid_file_extension(take: &&OsStr) -> bool { + let los = take.to_string_lossy(); + let all = los.split('.'); + match all.last() { + Some(s) => { + match s.to_lowercase().as_str() { + "html" => true, + "css" => true, + "js" => true, + "ts" => true, + "otf" => true, // font + + "png" => true, + "svg" => true, + "jpg" => true, + "jpeg" => true, + "mp4" => true, + "mp3" => true, + "webp" => true, + + "pdf" => true, + "json" => true, + "xml" => true, + _ => { + warn!("Might be forgetting a file extension: {s}"); + false + } + } + }, + None => false, + } +} From b750d88d48b74fbf9373c8ed46150c13e358dabe Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Fri, 21 Mar 2025 11:42:43 -0600 Subject: [PATCH 06/14] working filesystem storage --- Crawler.toml | 2 +- docker/compose.yml | 16 -------- docker/prometheus.yaml | 8 +--- src/filesystem.rs | 4 +- src/parser.rs | 90 ++++++++++++++++++++++++++---------------- 5 files changed, 63 insertions(+), 57 deletions(-) diff --git a/Crawler.toml b/Crawler.toml index a23d0f4..8940c1b 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -3,7 +3,7 @@ surreal_url = "localhost:8000" surreal_username = "root" surreal_password = "root" surreal_ns = "test" -surreal_db = "v1.19.1" +surreal_db = "v1.19.2" # Crawler config crawl_filter = "en.wikipedia.com" diff --git a/docker/compose.yml b/docker/compose.yml index 7cb6dcc..747afa0 100644 --- a/docker/compose.yml +++ b/docker/compose.yml @@ -14,22 +14,6 @@ services: - --pass - root - rocksdb:/mydata/database.db - minio: - image: quay.io/minio/minio - ports: - - 9000:9000 - - 9001:9001 - environment: - - MINIO_ROOT_USER=root - - MINIO_ROOT_PASSWORD=an8charpassword - - MINIO_PROMETHEUS_AUTH_TYPE=public - volumes: - - minio_storage:/data - command: - - server - - /data - - --console-address - - ":9001" alloy: image: grafana/alloy:latest diff --git a/docker/prometheus.yaml b/docker/prometheus.yaml index 9133e15..ffc1e24 100644 --- a/docker/prometheus.yaml +++ b/docker/prometheus.yaml @@ -7,15 +7,11 @@ scrape_configs: static_configs: # change this your machine's ip, localhost won't work # because localhost refers to the docker container. 
- # - targets: ['172.20.239.48:2500'] - - targets: ['192.168.8.209:2500'] + - targets: ['172.20.239.48:2500'] + #- targets: ['192.168.8.209:2500'] - job_name: loki static_configs: - targets: ['loki:3100'] - job_name: prometheus static_configs: - targets: ['localhost:9090'] - - job_name: minio - metrics_path: /minio/v2/metrics/cluster - static_configs: - - targets: ['minio:9000'] diff --git a/src/filesystem.rs b/src/filesystem.rs index e92b9aa..85fcf30 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,7 +1,7 @@ use std::{ffi::OsStr, path::PathBuf}; use tokio::fs; -use tracing::{error, instrument, trace, warn}; +use tracing::{debug, error, info, instrument, trace, warn}; use url::Url; #[instrument(skip(data))] @@ -21,6 +21,8 @@ pub async fn store(data: &str, url: &Url) { (url_path.clone(), "index.html".into()) }; + info!("Writing at: {:?} {:?}", basepath, filename); + // create the folders if let Err(err) = fs::create_dir_all(&basepath).await { error!("Dir creation: {err} {:?}", basepath); diff --git a/src/parser.rs b/src/parser.rs index ac55bf0..c1e87e2 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -19,8 +19,7 @@ impl TokenSink for Website { TagToken(tag) => { if tag.kind == StartTag { match tag.name { - // this should be all the html - // elements that have links + // this should be all the html elements that have links local_name!("a") | local_name!("audio") | local_name!("area") @@ -35,37 +34,9 @@ impl TokenSink for Website { let attr_name = attr.name.local.to_string(); if attr_name == "src" || attr_name == "href" || attr_name == "data" { - let url: Option = match Url::parse(&attr.value) { - Ok(ok) => { - trace!("Found `{}` in the html on `{}` tag", ok.to_string(), tag.name); - Some(ok) - }, - Err(e) => { - if attr.value.starts_with('#') { - trace!("Rejecting # url"); - None - } else { - match e { - url::ParseError::RelativeUrlWithoutBase => { - let origin = self.site.origin().ascii_serialization(); - let url = origin.clone() + &attr.value; - trace!("Built `{url}` from `{origin} + {}`", &attr.value.to_string()); - if let Ok(url) = Url::parse(&url) { - trace!("Saved relative url `{}` AS: `{}`", &attr.value, url); - Some(url) - } else { - error!("Failed to reconstruct a url from relative url: `{}` on site: `{}`", &attr.value, self.site.to_string()); - None - } - }, - _ => { - error!("MISC error: {:?} {:?}", e, &attr.value); - None - }, - } - } - }, - }; + trace!("Found `{}` in html `{}` tag", &attr.value, tag.name); + let url = try_get_url(&self.site, &attr.value); + if let Some(mut parsed) = url { parsed.set_query(None); parsed.set_fragment(None); @@ -119,3 +90,56 @@ pub async fn parse(site: &Website, data: &str) -> Vec { other_sites } +#[instrument] +fn try_get_url(parent: &Url, link: &str) -> Option { + match Url::parse(link) { + Ok(ok) => Some(ok), + Err(e) => { + if link.starts_with('#') { + trace!("Rejecting # url"); + None + } else if link.starts_with("//") { + // if a url starts with "//" is assumed that it will adopt + // the same scheme as it's parent + // https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute + let scheme = parent.scheme(); + + match Url::parse(&format!("{scheme}://{}", link)) { + Ok(url) => Some(url), + Err(err) => { + error!("Failed parsing realative scheme url: {}", err); + None + } + } + } else { + // # This is some sort of realative url, gonna try patching it up into an absolute + // url + match e { + url::ParseError::RelativeUrlWithoutBase => { + // Is: scheme://host:port + let origin = 
parent.origin().ascii_serialization(); + let url = origin.clone() + link; + + trace!("Built `{url}` from `{origin} + {}`", link.to_string()); + + if let Ok(url) = Url::parse(&url) { + trace!("Saved relative url `{}` AS: `{}`", link, url); + Some(url) + } else { + error!( + "Failed to reconstruct a url from relative url: `{}` on site: `{}`", + link, + parent.to_string() + ); + None + } + } + _ => { + error!("MISC error: {:?} {:?}", e, link); + None + } + } + } + } + } +} From 96a3ca092ab73874442f19f812ead07c4e989208 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Fri, 21 Mar 2025 12:11:05 -0600 Subject: [PATCH 07/14] :) --- src/db.rs | 15 +-------------- src/filesystem.rs | 4 ++-- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/src/db.rs b/src/db.rs index b5e913a..4083ddc 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,21 +1,17 @@ use metrics::counter; +use std::fmt::Debug; use serde::{Deserialize, Serialize}; -use std::{fmt::Debug, sync::LazyLock, time::Instant}; use surrealdb::{ engine::remote::ws::{Client, Ws}, opt::auth::Root, sql::Thing, Surreal, }; -use tokio::sync::Mutex; use tracing::{error, instrument, trace}; use url::Url; use crate::Config; -static LOCK: LazyLock> = LazyLock::new(|| Mutex::new(true)); - -const TIME_SPENT_ON_LOCK: &str = "surql_lock_waiting_ms"; const STORE: &str = "surql_store_calls"; #[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Hash)] @@ -57,14 +53,6 @@ impl Website { counter!(STORE).increment(1); let mut things = Vec::with_capacity(all.len()); - // TODO this only allows for one thread to be in the database at a time. - // This is currently required since otherwise we get write errors. - // If the default `crawled` is set to false, we might not need to write any more - // than just the name. `accessed_at` is fun but not needed. 
- let now = Instant::now(); - let lock = LOCK.lock().await; - counter!(TIME_SPENT_ON_LOCK).increment(now.elapsed().as_millis() as u64); - match db .query( "INSERT INTO website $array @@ -85,7 +73,6 @@ impl Website { error!("{:?}", err); } } - drop(lock); things } } diff --git a/src/filesystem.rs b/src/filesystem.rs index 85fcf30..46c1af3 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,7 +1,7 @@ use std::{ffi::OsStr, path::PathBuf}; use tokio::fs; -use tracing::{debug, error, info, instrument, trace, warn}; +use tracing::{debug, error, instrument, trace, warn}; use url::Url; #[instrument(skip(data))] @@ -21,7 +21,7 @@ pub async fn store(data: &str, url: &Url) { (url_path.clone(), "index.html".into()) }; - info!("Writing at: {:?} {:?}", basepath, filename); + debug!("Writing at: {:?} {:?}", basepath, filename); // create the folders if let Err(err) = fs::create_dir_all(&basepath).await { From 6fc71c7a7891fc11fd9318a86dafc52a087fbc03 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Fri, 21 Mar 2025 12:14:29 -0600 Subject: [PATCH 08/14] add speed improvements --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 6b2890c..230af74 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,8 @@ This ment we stored 1000 pages, 142,997 urls, and 1,425,798 links between the tw 3/20/25: Took 5min to crawl 1000 pages +3/21/25: Took 3min to crawl 1000 pages + # About ![Screenshot](/pngs/graphana.png) From 03cbcd9ae0c24d4428007b9772427defbebaf7a1 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Mon, 31 Mar 2025 14:18:11 -0600 Subject: [PATCH 09/14] remove minio code --- docker/compose.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/compose.yml b/docker/compose.yml index 747afa0..efe16f2 100644 --- a/docker/compose.yml +++ b/docker/compose.yml @@ -66,4 +66,3 @@ volumes: grafana_storage: alloy_storage: surrealdb_storage: - minio_storage: From 4a433a1a772f7d51dc117ca1dbcc7f125b4c3840 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Mon, 31 Mar 2025 14:18:37 -0600 Subject: [PATCH 10/14] This function sometimes throws errors, this logging should help --- src/db.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/db.rs b/src/db.rs index 4083ddc..06cddde 100644 --- a/src/db.rs +++ b/src/db.rs @@ -49,6 +49,7 @@ impl Website { // Insert ever item in the vec into surreal, crawled state will be preserved as TRUE // if already in the database as such or incoming data is TRUE. + #[instrument(skip(db))] pub async fn store_all(all: Vec, db: &Surreal) -> Vec { counter!(STORE).increment(1); let mut things = Vec::with_capacity(all.len()); From add6f00ed6dc1b61fc1fffbb61af539fd92f7a7b Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Mon, 31 Mar 2025 14:53:10 -0600 Subject: [PATCH 11/14] no recomp needed --- Crawler.toml | 7 ++++--- README.md | 5 +++-- src/filesystem.rs | 7 +++++++ src/main.rs | 16 ++++++++++++++-- 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/Crawler.toml b/Crawler.toml index 8940c1b..0048238 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -3,8 +3,9 @@ surreal_url = "localhost:8000" surreal_username = "root" surreal_password = "root" surreal_ns = "test" -surreal_db = "v1.19.2" +surreal_db = "v1.19.5" # Crawler config -crawl_filter = "en.wikipedia.com" -budget = 1000 +crawl_filter = "en.wikipedia.org" +start_url = "https://en.wikipedia.org" +budget = 100 diff --git a/README.md b/README.md index 230af74..a223089 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,12 @@ Crawls sites saving all the found links to a surrealdb database. 
It then proceed ### TODO -- [ ] Domain filtering - prevent the crawler from going on alternate versions of wikipedia. +- [x] Domain filtering - prevent the crawler from going on alternate versions of wikipedia. - [ ] Conditionally save content - based on filename or file contents - [x] GUI / TUI ? - Graphana - [x] Better asynchronous getting of the sites. Currently it all happens serially. -- [ ] Allow for storing asynchronously +- [x] Allow for storing asynchronously - dropping the "links to" logic fixes this need +- [x] Control crawler via config file (no recompliation needed) 3/17/25: Took >1hr to crawl 100 pages diff --git a/src/filesystem.rs b/src/filesystem.rs index 46c1af3..5a253f8 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -59,6 +59,13 @@ fn valid_file_extension(take: &&OsStr) -> bool { "pdf" => true, "json" => true, "xml" => true, + + // IGNORE + // TODO Should this be a list of all domains? + "org" => false, + "com" => false, + "net" => false, + _ => { warn!("Might be forgetting a file extension: {s}"); false diff --git a/src/main.rs b/src/main.rs index 9f20638..dc13d1e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -33,11 +33,14 @@ struct Config { surreal_password: String, crawl_filter: String, + start_url: String, budget: usize, } #[tokio::main] async fn main() { + println!("Logs and metrics are provided to the Grafana dashboard"); + let writer = std::fs::OpenOptions::new() .append(true) .create(true) @@ -70,8 +73,7 @@ async fn main() { .expect("failed to install recorder/exporter"); info!("Starting..."); - // Would probably take these in as parameters from a cli - let starting_url = "https://en.wikipedia.org/"; + // When getting uncrawled pages, name must contain this variable. "" will effectively get ignored. // let crawl_filter = "en.wikipedia.org/"; // let budget = 50; @@ -82,6 +84,7 @@ async fn main() { let _ = file.read_to_string(&mut buf); let config: Config = toml::from_str(&buf).expect("Failed to parse Crawler.toml"); + let starting_url = &config.start_url; let db = connect(&config) .await @@ -138,6 +141,15 @@ async fn main() { } drop(span); + if let Ok(mut ok) = db.query("count(select id from website where crawled = true)").await { + let res = ok.take::>(0); + if let Ok(i) = res { + if let Some(n) = i { + info!("Total crawled pages now equals {n}"); + } + } + } + info!("Done"); } From a9465dda6e32d86a92d0b14e23a1513db9272bf4 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Mon, 31 Mar 2025 15:05:18 -0600 Subject: [PATCH 12/14] add instructions --- README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/README.md b/README.md index a223089..7ec5c6d 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,35 @@ Crawls sites saving all the found links to a surrealdb database. It then proceeds to take batches of 100 uncrawled links untill the crawl budget is reached. It saves the data of each site in a minio database. +## How to use + +1. Clone the repo and `cd` into it. +2. Build the repo with `cargo build -r` +3. Start the docker conatiners + 1. cd into the docker folder `cd docker` + 2. Bring up the docker containers `docker compose up -d` +4. From the project's root, edit the `Crawler.toml` file to your liking. +5. Run with `./target/release/internet_mapper` + +You can view stats of the project at `http://:3000/dashboards` + +```bash +# Untested script but probably works +git clone https://git.oliveratkinson.net/Oliver/internet_mapper.git +cd internet_mapper + +cargo build -r + +cd docker +docker compose up -d +cd .. 
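+# Set crawl_filter, start_url, and budget in Crawler.toml before running (see step 4 above)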
+ +$EDITOR Crawler.toml + +./target/release/internet_mapper + +``` + ### TODO - [x] Domain filtering - prevent the crawler from going on alternate versions of wikipedia. From 94912e9125c3746ed882b4bbbf0fdf4ce12f670b Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Tue, 15 Apr 2025 09:06:57 -0600 Subject: [PATCH 13/14] change up how files are discovered --- src/filesystem.rs | 47 +- src/tlds-alpha-by-domain.txt | 1444 ++++++++++++++++++++++++++++++++++ 2 files changed, 1459 insertions(+), 32 deletions(-) create mode 100644 src/tlds-alpha-by-domain.txt diff --git a/src/filesystem.rs b/src/filesystem.rs index 5a253f8..81ad057 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -5,7 +5,10 @@ use tracing::{debug, error, instrument, trace, warn}; use url::Url; #[instrument(skip(data))] -pub async fn store(data: &str, url: &Url) { +/// Returns whether or not the saved file should be parsed. +/// If the file is just data, like an image, it doesn't need to be parsed. +/// If it's html, then it does need to be parsed. +pub async fn store(data: &str, url: &Url) -> bool { // extract data from url to save it accurately let url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path()); @@ -21,19 +24,20 @@ pub async fn store(data: &str, url: &Url) { (url_path.clone(), "index.html".into()) }; + let should_parse = filename.ends_with(".html"); + debug!("Writing at: {:?} {:?}", basepath, filename); // create the folders if let Err(err) = fs::create_dir_all(&basepath).await { error!("Dir creation: {err} {:?}", basepath); } else { - // FIXME I don't think this handles index.html files well... - // TODO this should probably append .html to non-described files - // create the file if that was successful if let Err(err) = fs::write(&basepath.join(filename), data).await { error!("File creation: {err} {:?}", url_path); } } + + should_parse } fn valid_file_extension(take: &&OsStr) -> bool { @@ -41,35 +45,14 @@ fn valid_file_extension(take: &&OsStr) -> bool { let all = los.split('.'); match all.last() { Some(s) => { - match s.to_lowercase().as_str() { - "html" => true, - "css" => true, - "js" => true, - "ts" => true, - "otf" => true, // font + // FIXME it's worth noting that the dumb tlds like .zip are in here, + // which could cause problems + let all_domains = include_str!("tlds-alpha-by-domain.txt"); - "png" => true, - "svg" => true, - "jpg" => true, - "jpeg" => true, - "mp4" => true, - "mp3" => true, - "webp" => true, - - "pdf" => true, - "json" => true, - "xml" => true, - - // IGNORE - // TODO Should this be a list of all domains? 
- "org" => false, - "com" => false, - "net" => false, - - _ => { - warn!("Might be forgetting a file extension: {s}"); - false - } + // check if it is a domain + match all_domains.lines().map(str::to_lowercase).find(|x| x==s.to_lowercase().as_str()) { + Some(_) => false, + None => true } }, None => false, diff --git a/src/tlds-alpha-by-domain.txt b/src/tlds-alpha-by-domain.txt new file mode 100644 index 0000000..327f649 --- /dev/null +++ b/src/tlds-alpha-by-domain.txt @@ -0,0 +1,1444 @@ +# Version 2025041500, Last Updated Tue Apr 15 07:07:01 2025 UTC +AAA +AARP +ABB +ABBOTT +ABBVIE +ABC +ABLE +ABOGADO +ABUDHABI +AC +ACADEMY +ACCENTURE +ACCOUNTANT +ACCOUNTANTS +ACO +ACTOR +AD +ADS +ADULT +AE +AEG +AERO +AETNA +AF +AFL +AFRICA +AG +AGAKHAN +AGENCY +AI +AIG +AIRBUS +AIRFORCE +AIRTEL +AKDN +AL +ALIBABA +ALIPAY +ALLFINANZ +ALLSTATE +ALLY +ALSACE +ALSTOM +AM +AMAZON +AMERICANEXPRESS +AMERICANFAMILY +AMEX +AMFAM +AMICA +AMSTERDAM +ANALYTICS +ANDROID +ANQUAN +ANZ +AO +AOL +APARTMENTS +APP +APPLE +AQ +AQUARELLE +AR +ARAB +ARAMCO +ARCHI +ARMY +ARPA +ART +ARTE +AS +ASDA +ASIA +ASSOCIATES +AT +ATHLETA +ATTORNEY +AU +AUCTION +AUDI +AUDIBLE +AUDIO +AUSPOST +AUTHOR +AUTO +AUTOS +AW +AWS +AX +AXA +AZ +AZURE +BA +BABY +BAIDU +BANAMEX +BAND +BANK +BAR +BARCELONA +BARCLAYCARD +BARCLAYS +BAREFOOT +BARGAINS +BASEBALL +BASKETBALL +BAUHAUS +BAYERN +BB +BBC +BBT +BBVA +BCG +BCN +BD +BE +BEATS +BEAUTY +BEER +BENTLEY +BERLIN +BEST +BESTBUY +BET +BF +BG +BH +BHARTI +BI +BIBLE +BID +BIKE +BING +BINGO +BIO +BIZ +BJ +BLACK +BLACKFRIDAY +BLOCKBUSTER +BLOG +BLOOMBERG +BLUE +BM +BMS +BMW +BN +BNPPARIBAS +BO +BOATS +BOEHRINGER +BOFA +BOM +BOND +BOO +BOOK +BOOKING +BOSCH +BOSTIK +BOSTON +BOT +BOUTIQUE +BOX +BR +BRADESCO +BRIDGESTONE +BROADWAY +BROKER +BROTHER +BRUSSELS +BS +BT +BUILD +BUILDERS +BUSINESS +BUY +BUZZ +BV +BW +BY +BZ +BZH +CA +CAB +CAFE +CAL +CALL +CALVINKLEIN +CAM +CAMERA +CAMP +CANON +CAPETOWN +CAPITAL +CAPITALONE +CAR +CARAVAN +CARDS +CARE +CAREER +CAREERS +CARS +CASA +CASE +CASH +CASINO +CAT +CATERING +CATHOLIC +CBA +CBN +CBRE +CC +CD +CENTER +CEO +CERN +CF +CFA +CFD +CG +CH +CHANEL +CHANNEL +CHARITY +CHASE +CHAT +CHEAP +CHINTAI +CHRISTMAS +CHROME +CHURCH +CI +CIPRIANI +CIRCLE +CISCO +CITADEL +CITI +CITIC +CITY +CK +CL +CLAIMS +CLEANING +CLICK +CLINIC +CLINIQUE +CLOTHING +CLOUD +CLUB +CLUBMED +CM +CN +CO +COACH +CODES +COFFEE +COLLEGE +COLOGNE +COM +COMMBANK +COMMUNITY +COMPANY +COMPARE +COMPUTER +COMSEC +CONDOS +CONSTRUCTION +CONSULTING +CONTACT +CONTRACTORS +COOKING +COOL +COOP +CORSICA +COUNTRY +COUPON +COUPONS +COURSES +CPA +CR +CREDIT +CREDITCARD +CREDITUNION +CRICKET +CROWN +CRS +CRUISE +CRUISES +CU +CUISINELLA +CV +CW +CX +CY +CYMRU +CYOU +CZ +DAD +DANCE +DATA +DATE +DATING +DATSUN +DAY +DCLK +DDS +DE +DEAL +DEALER +DEALS +DEGREE +DELIVERY +DELL +DELOITTE +DELTA +DEMOCRAT +DENTAL +DENTIST +DESI +DESIGN +DEV +DHL +DIAMONDS +DIET +DIGITAL +DIRECT +DIRECTORY +DISCOUNT +DISCOVER +DISH +DIY +DJ +DK +DM +DNP +DO +DOCS +DOCTOR +DOG +DOMAINS +DOT +DOWNLOAD +DRIVE +DTV +DUBAI +DUNLOP +DUPONT +DURBAN +DVAG +DVR +DZ +EARTH +EAT +EC +ECO +EDEKA +EDU +EDUCATION +EE +EG +EMAIL +EMERCK +ENERGY +ENGINEER +ENGINEERING +ENTERPRISES +EPSON +EQUIPMENT +ER +ERICSSON +ERNI +ES +ESQ +ESTATE +ET +EU +EUROVISION +EUS +EVENTS +EXCHANGE +EXPERT +EXPOSED +EXPRESS +EXTRASPACE +FAGE +FAIL +FAIRWINDS +FAITH +FAMILY +FAN +FANS +FARM +FARMERS +FASHION +FAST +FEDEX +FEEDBACK +FERRARI +FERRERO +FI +FIDELITY +FIDO +FILM +FINAL +FINANCE +FINANCIAL +FIRE +FIRESTONE +FIRMDALE +FISH +FISHING +FIT +FITNESS +FJ +FK +FLICKR +FLIGHTS +FLIR 
+FLORIST +FLOWERS +FLY +FM +FO +FOO +FOOD +FOOTBALL +FORD +FOREX +FORSALE +FORUM +FOUNDATION +FOX +FR +FREE +FRESENIUS +FRL +FROGANS +FRONTIER +FTR +FUJITSU +FUN +FUND +FURNITURE +FUTBOL +FYI +GA +GAL +GALLERY +GALLO +GALLUP +GAME +GAMES +GAP +GARDEN +GAY +GB +GBIZ +GD +GDN +GE +GEA +GENT +GENTING +GEORGE +GF +GG +GGEE +GH +GI +GIFT +GIFTS +GIVES +GIVING +GL +GLASS +GLE +GLOBAL +GLOBO +GM +GMAIL +GMBH +GMO +GMX +GN +GODADDY +GOLD +GOLDPOINT +GOLF +GOO +GOODYEAR +GOOG +GOOGLE +GOP +GOT +GOV +GP +GQ +GR +GRAINGER +GRAPHICS +GRATIS +GREEN +GRIPE +GROCERY +GROUP +GS +GT +GU +GUCCI +GUGE +GUIDE +GUITARS +GURU +GW +GY +HAIR +HAMBURG +HANGOUT +HAUS +HBO +HDFC +HDFCBANK +HEALTH +HEALTHCARE +HELP +HELSINKI +HERE +HERMES +HIPHOP +HISAMITSU +HITACHI +HIV +HK +HKT +HM +HN +HOCKEY +HOLDINGS +HOLIDAY +HOMEDEPOT +HOMEGOODS +HOMES +HOMESENSE +HONDA +HORSE +HOSPITAL +HOST +HOSTING +HOT +HOTELS +HOTMAIL +HOUSE +HOW +HR +HSBC +HT +HU +HUGHES +HYATT +HYUNDAI +IBM +ICBC +ICE +ICU +ID +IE +IEEE +IFM +IKANO +IL +IM +IMAMAT +IMDB +IMMO +IMMOBILIEN +IN +INC +INDUSTRIES +INFINITI +INFO +ING +INK +INSTITUTE +INSURANCE +INSURE +INT +INTERNATIONAL +INTUIT +INVESTMENTS +IO +IPIRANGA +IQ +IR +IRISH +IS +ISMAILI +IST +ISTANBUL +IT +ITAU +ITV +JAGUAR +JAVA +JCB +JE +JEEP +JETZT +JEWELRY +JIO +JLL +JM +JMP +JNJ +JO +JOBS +JOBURG +JOT +JOY +JP +JPMORGAN +JPRS +JUEGOS +JUNIPER +KAUFEN +KDDI +KE +KERRYHOTELS +KERRYPROPERTIES +KFH +KG +KH +KI +KIA +KIDS +KIM +KINDLE +KITCHEN +KIWI +KM +KN +KOELN +KOMATSU +KOSHER +KP +KPMG +KPN +KR +KRD +KRED +KUOKGROUP +KW +KY +KYOTO +KZ +LA +LACAIXA +LAMBORGHINI +LAMER +LANCASTER +LAND +LANDROVER +LANXESS +LASALLE +LAT +LATINO +LATROBE +LAW +LAWYER +LB +LC +LDS +LEASE +LECLERC +LEFRAK +LEGAL +LEGO +LEXUS +LGBT +LI +LIDL +LIFE +LIFEINSURANCE +LIFESTYLE +LIGHTING +LIKE +LILLY +LIMITED +LIMO +LINCOLN +LINK +LIVE +LIVING +LK +LLC +LLP +LOAN +LOANS +LOCKER +LOCUS +LOL +LONDON +LOTTE +LOTTO +LOVE +LPL +LPLFINANCIAL +LR +LS +LT +LTD +LTDA +LU +LUNDBECK +LUXE +LUXURY +LV +LY +MA +MADRID +MAIF +MAISON +MAKEUP +MAN +MANAGEMENT +MANGO +MAP +MARKET +MARKETING +MARKETS +MARRIOTT +MARSHALLS +MATTEL +MBA +MC +MCKINSEY +MD +ME +MED +MEDIA +MEET +MELBOURNE +MEME +MEMORIAL +MEN +MENU +MERCKMSD +MG +MH +MIAMI +MICROSOFT +MIL +MINI +MINT +MIT +MITSUBISHI +MK +ML +MLB +MLS +MM +MMA +MN +MO +MOBI +MOBILE +MODA +MOE +MOI +MOM +MONASH +MONEY +MONSTER +MORMON +MORTGAGE +MOSCOW +MOTO +MOTORCYCLES +MOV +MOVIE +MP +MQ +MR +MS +MSD +MT +MTN +MTR +MU +MUSEUM +MUSIC +MV +MW +MX +MY +MZ +NA +NAB +NAGOYA +NAME +NAVY +NBA +NC +NE +NEC +NET +NETBANK +NETFLIX +NETWORK +NEUSTAR +NEW +NEWS +NEXT +NEXTDIRECT +NEXUS +NF +NFL +NG +NGO +NHK +NI +NICO +NIKE +NIKON +NINJA +NISSAN +NISSAY +NL +NO +NOKIA +NORTON +NOW +NOWRUZ +NOWTV +NP +NR +NRA +NRW +NTT +NU +NYC +NZ +OBI +OBSERVER +OFFICE +OKINAWA +OLAYAN +OLAYANGROUP +OLLO +OM +OMEGA +ONE +ONG +ONL +ONLINE +OOO +OPEN +ORACLE +ORANGE +ORG +ORGANIC +ORIGINS +OSAKA +OTSUKA +OTT +OVH +PA +PAGE +PANASONIC +PARIS +PARS +PARTNERS +PARTS +PARTY +PAY +PCCW +PE +PET +PF +PFIZER +PG +PH +PHARMACY +PHD +PHILIPS +PHONE +PHOTO +PHOTOGRAPHY +PHOTOS +PHYSIO +PICS +PICTET +PICTURES +PID +PIN +PING +PINK +PIONEER +PIZZA +PK +PL +PLACE +PLAY +PLAYSTATION +PLUMBING +PLUS +PM +PN +PNC +POHL +POKER +POLITIE +PORN +POST +PR +PRAMERICA +PRAXI +PRESS +PRIME +PRO +PROD +PRODUCTIONS +PROF +PROGRESSIVE +PROMO +PROPERTIES +PROPERTY +PROTECTION +PRU +PRUDENTIAL +PS +PT +PUB +PW +PWC +PY +QA +QPON +QUEBEC +QUEST +RACING +RADIO +RE +READ +REALESTATE +REALTOR +REALTY +RECIPES +RED +REDSTONE +REDUMBRELLA +REHAB +REISE 
+REISEN +REIT +RELIANCE +REN +RENT +RENTALS +REPAIR +REPORT +REPUBLICAN +REST +RESTAURANT +REVIEW +REVIEWS +REXROTH +RICH +RICHARDLI +RICOH +RIL +RIO +RIP +RO +ROCKS +RODEO +ROGERS +ROOM +RS +RSVP +RU +RUGBY +RUHR +RUN +RW +RWE +RYUKYU +SA +SAARLAND +SAFE +SAFETY +SAKURA +SALE +SALON +SAMSCLUB +SAMSUNG +SANDVIK +SANDVIKCOROMANT +SANOFI +SAP +SARL +SAS +SAVE +SAXO +SB +SBI +SBS +SC +SCB +SCHAEFFLER +SCHMIDT +SCHOLARSHIPS +SCHOOL +SCHULE +SCHWARZ +SCIENCE +SCOT +SD +SE +SEARCH +SEAT +SECURE +SECURITY +SEEK +SELECT +SENER +SERVICES +SEVEN +SEW +SEX +SEXY +SFR +SG +SH +SHANGRILA +SHARP +SHELL +SHIA +SHIKSHA +SHOES +SHOP +SHOPPING +SHOUJI +SHOW +SI +SILK +SINA +SINGLES +SITE +SJ +SK +SKI +SKIN +SKY +SKYPE +SL +SLING +SM +SMART +SMILE +SN +SNCF +SO +SOCCER +SOCIAL +SOFTBANK +SOFTWARE +SOHU +SOLAR +SOLUTIONS +SONG +SONY +SOY +SPA +SPACE +SPORT +SPOT +SR +SRL +SS +ST +STADA +STAPLES +STAR +STATEBANK +STATEFARM +STC +STCGROUP +STOCKHOLM +STORAGE +STORE +STREAM +STUDIO +STUDY +STYLE +SU +SUCKS +SUPPLIES +SUPPLY +SUPPORT +SURF +SURGERY +SUZUKI +SV +SWATCH +SWISS +SX +SY +SYDNEY +SYSTEMS +SZ +TAB +TAIPEI +TALK +TAOBAO +TARGET +TATAMOTORS +TATAR +TATTOO +TAX +TAXI +TC +TCI +TD +TDK +TEAM +TECH +TECHNOLOGY +TEL +TEMASEK +TENNIS +TEVA +TF +TG +TH +THD +THEATER +THEATRE +TIAA +TICKETS +TIENDA +TIPS +TIRES +TIROL +TJ +TJMAXX +TJX +TK +TKMAXX +TL +TM +TMALL +TN +TO +TODAY +TOKYO +TOOLS +TOP +TORAY +TOSHIBA +TOTAL +TOURS +TOWN +TOYOTA +TOYS +TR +TRADE +TRADING +TRAINING +TRAVEL +TRAVELERS +TRAVELERSINSURANCE +TRUST +TRV +TT +TUBE +TUI +TUNES +TUSHU +TV +TVS +TW +TZ +UA +UBANK +UBS +UG +UK +UNICOM +UNIVERSITY +UNO +UOL +UPS +US +UY +UZ +VA +VACATIONS +VANA +VANGUARD +VC +VE +VEGAS +VENTURES +VERISIGN +VERSICHERUNG +VET +VG +VI +VIAJES +VIDEO +VIG +VIKING +VILLAS +VIN +VIP +VIRGIN +VISA +VISION +VIVA +VIVO +VLAANDEREN +VN +VODKA +VOLVO +VOTE +VOTING +VOTO +VOYAGE +VU +WALES +WALMART +WALTER +WANG +WANGGOU +WATCH +WATCHES +WEATHER +WEATHERCHANNEL +WEBCAM +WEBER +WEBSITE +WED +WEDDING +WEIBO +WEIR +WF +WHOSWHO +WIEN +WIKI +WILLIAMHILL +WIN +WINDOWS +WINE +WINNERS +WME +WOLTERSKLUWER +WOODSIDE +WORK +WORKS +WORLD +WOW +WS +WTC +WTF +XBOX +XEROX +XIHUAN +XIN +XN--11B4C3D +XN--1CK2E1B +XN--1QQW23A +XN--2SCRJ9C +XN--30RR7Y +XN--3BST00M +XN--3DS443G +XN--3E0B707E +XN--3HCRJ9C +XN--3PXU8K +XN--42C2D9A +XN--45BR5CYL +XN--45BRJ9C +XN--45Q11C +XN--4DBRK0CE +XN--4GBRIM +XN--54B7FTA0CC +XN--55QW42G +XN--55QX5D +XN--5SU34J936BGSG +XN--5TZM5G +XN--6FRZ82G +XN--6QQ986B3XL +XN--80ADXHKS +XN--80AO21A +XN--80AQECDR1A +XN--80ASEHDB +XN--80ASWG +XN--8Y0A063A +XN--90A3AC +XN--90AE +XN--90AIS +XN--9DBQ2A +XN--9ET52U +XN--9KRT00A +XN--B4W605FERD +XN--BCK1B9A5DRE4C +XN--C1AVG +XN--C2BR7G +XN--CCK2B3B +XN--CCKWCXETD +XN--CG4BKI +XN--CLCHC0EA0B2G2A9GCD +XN--CZR694B +XN--CZRS0T +XN--CZRU2D +XN--D1ACJ3B +XN--D1ALF +XN--E1A4C +XN--ECKVDTC9D +XN--EFVY88H +XN--FCT429K +XN--FHBEI +XN--FIQ228C5HS +XN--FIQ64B +XN--FIQS8S +XN--FIQZ9S +XN--FJQ720A +XN--FLW351E +XN--FPCRJ9C3D +XN--FZC2C9E2C +XN--FZYS8D69UVGM +XN--G2XX48C +XN--GCKR3F0F +XN--GECRJ9C +XN--GK3AT1E +XN--H2BREG3EVE +XN--H2BRJ9C +XN--H2BRJ9C8C +XN--HXT814E +XN--I1B6B1A6A2E +XN--IMR513N +XN--IO0A7I +XN--J1AEF +XN--J1AMH +XN--J6W193G +XN--JLQ480N2RG +XN--JVR189M +XN--KCRX77D1X4A +XN--KPRW13D +XN--KPRY57D +XN--KPUT3I +XN--L1ACC +XN--LGBBAT1AD8J +XN--MGB9AWBF +XN--MGBA3A3EJT +XN--MGBA3A4F16A +XN--MGBA7C0BBN0A +XN--MGBAAM7A8H +XN--MGBAB2BD +XN--MGBAH1A3HJKRD +XN--MGBAI9AZGQP6J +XN--MGBAYH7GPA +XN--MGBBH1A +XN--MGBBH1A71E +XN--MGBC0A9AZCG +XN--MGBCA7DZDO +XN--MGBCPQ6GPA1A +XN--MGBERP4A5D4AR 
+XN--MGBGU82A +XN--MGBI4ECEXP +XN--MGBPL2FH +XN--MGBT3DHD +XN--MGBTX2B +XN--MGBX4CD0AB +XN--MIX891F +XN--MK1BU44C +XN--MXTQ1M +XN--NGBC5AZD +XN--NGBE9E0A +XN--NGBRX +XN--NODE +XN--NQV7F +XN--NQV7FS00EMA +XN--NYQY26A +XN--O3CW4H +XN--OGBPF8FL +XN--OTU796D +XN--P1ACF +XN--P1AI +XN--PGBS0DH +XN--PSSY2U +XN--Q7CE6A +XN--Q9JYB4C +XN--QCKA1PMC +XN--QXA6A +XN--QXAM +XN--RHQV96G +XN--ROVU88B +XN--RVC1E0AM3E +XN--S9BRJ9C +XN--SES554G +XN--T60B56A +XN--TCKWE +XN--TIQ49XQYJ +XN--UNUP4Y +XN--VERMGENSBERATER-CTB +XN--VERMGENSBERATUNG-PWB +XN--VHQUV +XN--VUQ861B +XN--W4R85EL8FHU5DNRA +XN--W4RS40L +XN--WGBH1C +XN--WGBL6A +XN--XHQ521B +XN--XKC2AL3HYE2A +XN--XKC2DL3A5EE0H +XN--Y9A3AQ +XN--YFRO4I67O +XN--YGBI2AMMX +XN--ZFR164B +XXX +XYZ +YACHTS +YAHOO +YAMAXUN +YANDEX +YE +YODOBASHI +YOGA +YOKOHAMA +YOU +YOUTUBE +YT +YUN +ZA +ZAPPOS +ZARA +ZERO +ZIP +ZM +ZONE +ZUERICH +ZW From c08a20ac003baeda95f3bb607da9b397f0ffbdc8 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Tue, 15 Apr 2025 09:07:16 -0600 Subject: [PATCH 14/14] cleanup and more accuratly use metrics --- src/main.rs | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/src/main.rs b/src/main.rs index dc13d1e..36ecb59 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,6 +24,8 @@ const GET_IN_FLIGHT: &str = "gets_in_flight"; const SITES_CRAWLED: &str = "pages_crawled"; const BEING_PROCESSED: &str = "pages_being_processed"; +const BATCH_SIZE: usize = 2; + #[derive(Deserialize)] struct Config { surreal_ns: String, @@ -109,13 +111,7 @@ async fn main() { let span = trace_span!("Loop"); let span = span.enter(); while crawled < config.budget { - let get_num = if config.budget - crawled < 100 { - config.budget - crawled - } else { - 100 - }; - - let uncrawled = get_uncrawled_links(&db, get_num, config.crawl_filter.clone()).await; + let uncrawled = get_uncrawled_links(&db, config.budget - crawled, config.crawl_filter.clone()).await; if uncrawled.is_empty() { info!("Had more budget but finished crawling everything."); return; @@ -170,39 +166,44 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien // Send the http request (get) if let Ok(response) = request_builder.send().await { - // METRICS - g.decrement(1); - counter!(GET_METRIC).increment(1); - + // TODO if this will fail if the object we are downloading is + // larger than the memory of the device it's running on. + // We should store it *as* we download it then parse it in-place. // Get body from response let data = response .text() .await .expect("Failed to read http response's body!"); - // Store document - filesystem::store(&data, &site.site).await; + // METRICS + g.decrement(1); + counter!(GET_METRIC).increment(1); - // Parse document and get relationships - let sites = parser::parse(&site, &data).await; + // Store document + let should_parse = filesystem::store(&data, &site.site).await; + + if should_parse { + // Parse document and get relationships + let sites = parser::parse(&site, &data).await; + // De-duplicate this list + let prev_len = sites.len(); + let set = sites.into_iter().fold(HashSet::new(), |mut set,item| { + set.insert(item); + set + }); + let de_dupe_sites: Vec = set.into_iter().collect(); + let diff = prev_len - de_dupe_sites.len(); + trace!("Saved {diff} from being entered into the db by de-duping"); + + // Store all the other sites so that we can link to them. 
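+                // store_all inserts these with crawled = false; rows already stored as crawled = true keep that state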
+ let _ = Website::store_all(de_dupe_sites, &db).await; + } + // update self in db site.set_crawled(); Website::store_all(vec![site], &db).await; - // De-duplicate this list - let prev_len = sites.len(); - let set = sites.into_iter().fold(HashSet::new(), |mut set,item| { - set.insert(item); - set - }); - let de_dupe_sites: Vec = set.into_iter().collect(); - let diff = prev_len - de_dupe_sites.len(); - trace!("Saved {diff} from being entered into the db by de-duping"); - - // Store all the other sites so that we can link to them. - let _ = Website::store_all(de_dupe_sites, &db).await; - } else { error!("Failed to get: {}", &site.site); } @@ -215,9 +216,11 @@ async fn get_uncrawled_links( mut count: usize, filter: String, ) -> Vec { - if count > 100 { - count = 100 + + if count > BATCH_SIZE { + count = BATCH_SIZE; } + debug!("Getting uncrawled links"); let mut response = db