From 66581cc4538f22998161e2b2a39348909e764588 Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 21 Mar 2025 05:59:40 +0000 Subject: [PATCH 01/14] getting there --- .gitignore | 3 +- Crawler.toml | 8 +--- docker/prometheus.yaml | 3 +- src/db.rs | 90 ++----------------------------------- src/filesystem.rs | 24 ++++++++++ src/main.rs | 57 +++++++++++------------ src/parser.rs | 61 ++++++++++++++++++------- src/s3.rs | 100 ----------------------------------------- 8 files changed, 101 insertions(+), 245 deletions(-) create mode 100644 src/filesystem.rs delete mode 100644 src/s3.rs diff --git a/.gitignore b/.gitignore index dbaa546..87fb73a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ perf.data flamegraph.svg perf.data.old -/docker/logs/* \ No newline at end of file +/docker/logs/* +/downloaded \ No newline at end of file diff --git a/Crawler.toml b/Crawler.toml index d536db5..7072e33 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -3,13 +3,7 @@ surreal_url = "localhost:8000" surreal_username = "root" surreal_password = "root" surreal_ns = "test" -surreal_db = "v1.17" - -# Minio config -s3_bucket = "v1.17" -s3_url = "http://localhost:9000" -s3_access_key = "Ok6s9uQEvKrqRoGZdacm" -s3_secret_key = "qubeSkP787c7QZu4TvtnuwPTGIAq6ETPupCxvv6K" +surreal_db = "v1.18.1" # Crawler config crawl_filter = "en.wikipedia.com" diff --git a/docker/prometheus.yaml b/docker/prometheus.yaml index cb43a89..9133e15 100644 --- a/docker/prometheus.yaml +++ b/docker/prometheus.yaml @@ -7,7 +7,8 @@ scrape_configs: static_configs: # change this your machine's ip, localhost won't work # because localhost refers to the docker container. - - targets: ['172.20.239.48:2500'] + # - targets: ['172.20.239.48:2500'] + - targets: ['192.168.8.209:2500'] - job_name: loki static_configs: - targets: ['loki:3100'] diff --git a/src/db.rs b/src/db.rs index deceb27..03a1850 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,16 +1,11 @@ -use base64::{ - alphabet, - engine::{self, general_purpose}, - Engine, -}; use metrics::counter; -use serde::{ser::SerializeStruct, Deserialize, Serialize}; +use serde::{Deserialize, Serialize}; use std::{fmt::Debug, sync::LazyLock, time::Instant}; use surrealdb::{ engine::remote::ws::{Client, Ws}, opt::auth::Root, sql::Thing, - Response, Surreal, + Surreal, }; use tokio::sync::Mutex; use tracing::{error, instrument, trace, warn}; @@ -18,17 +13,12 @@ use url::Url; use crate::Config; -// static LOCK: LazyLock>> = LazyLock::new(|| Arc::new(Mutex::new(true))); static LOCK: LazyLock> = LazyLock::new(|| Mutex::new(true)); -const CUSTOM_ENGINE: engine::GeneralPurpose = - engine::GeneralPurpose::new(&alphabet::URL_SAFE, general_purpose::NO_PAD); - const TIME_SPENT_ON_LOCK: &str = "surql_lock_waiting_ms"; const STORE: &str = "surql_store_calls"; -const LINK: &str = "surql_link_calls"; -#[derive(Deserialize, Clone)] +#[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Hash)] pub struct Website { /// The url that this data is found at pub site: Url, @@ -36,18 +26,6 @@ pub struct Website { pub crawled: bool, } -impl Serialize for Website { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer { - let mut state = serializer.serialize_struct("Website", 2)?; - state.serialize_field("crawled", &self.crawled)?; - // to_string() calls the correct naming of site - state.serialize_field("site", &self.site.to_string())?; - state.end() - } -} - // manual impl to make tracing look nicer impl Debug for Website { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -73,68 +51,6 
@@ impl Website { self.crawled = true } - pub fn get_url_as_string(site: &Url) -> String { - let domain = match site.domain() { - Some(s) => s.to_string(), - None => { - warn!("Failed to get domain of URL: {}, falling back to 'localhost'", site.to_string()); - "localhost".to_string() - } - }; - let path = site.path(); - - domain + path - } - - pub fn get_url_as_b64_path(site: &Url) -> String { - let domain = site.domain().unwrap_or("DOMAIN").to_string(); - let path = &CUSTOM_ENGINE.encode(site.path()); - - domain + path - } - - #[instrument(skip_all)] - pub async fn links_to(&self, other: Vec, db: &Surreal) { - let len = other.len(); - if len == 0 { - return; - } - - let from = &self.site; - - // let to = other.site.to_string(); - trace!("Linking {} pages to {from}", other.len()); - counter!(LINK).increment(1); - match db - .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)") - .bind(("in", from.clone())) - .bind(("out", other)) - .await - { - Ok(mut e) => { - // The relate could technically "fail" (not relate anything), this just means that - // the query was ok. - let _: Response = e; - if let Ok(vec) = e.take(0) { - let _: Vec = vec; - if let Some(num) = vec.first() { - if *num == len { - trace!("Link for {from} OK - {num}/{len}"); - return; - } else { - error!("Didn't link all the records. {num}/{len}. Surreal response: {:?}", e); - return; - } - } - } - error!("Linking request succeeded but couldn't verify the results."); - } - Err(e) => { - error!("{}", e.to_string()); - } - } - } - // Insert ever item in the vec into surreal, crawled state will be preserved as TRUE // if already in the database as such or incoming data is TRUE. pub async fn store_all(all: Vec, db: &Surreal) -> Vec { diff --git a/src/filesystem.rs b/src/filesystem.rs new file mode 100644 index 0000000..9a3f761 --- /dev/null +++ b/src/filesystem.rs @@ -0,0 +1,24 @@ +use std::path::PathBuf; + +use tokio::fs; +use tracing::{error, instrument, trace}; +use url::Url; + +#[instrument(skip(data))] +pub async fn store(data: &str, url: &Url) { + let path = PathBuf::from("./downloaded".to_string() + url.path()); + let basepath = path.ancestors().skip(1).take(1).collect::(); + trace!("Built path: {:?} and base path: {:?}", &path, &basepath); + if let Err(err) = fs::create_dir_all(&basepath).await { + let ex = path.ancestors().fold(String::new(), |mut s, item| { + s += ", "; + s += &item.to_string_lossy().to_string(); + s + }); + error!("Dir creation: {err} {:?} {ex}", basepath); + } else { + if let Err(err) = fs::write(&path, data).await { + error!("File creation: {err} {:?}", path); + } + } +} diff --git a/src/main.rs b/src/main.rs index fcdd10b..c5c96cd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,24 +3,21 @@ extern crate html5ever; use std::{ - fs::File, - io::Read, - net::{IpAddr, Ipv4Addr}, + collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr}, str::FromStr, time }; use db::{connect, Website}; use metrics::{counter, gauge}; use metrics_exporter_prometheus::PrometheusBuilder; -use s3::S3; use serde::Deserialize; use surrealdb::{engine::remote::ws::Client, Surreal}; use tokio::task::JoinSet; -use tracing::{debug, error, info, instrument, trace, trace_span}; +use tracing::{debug, error, info, instrument, level_filters::LevelFilter, trace, trace_span}; use tracing_subscriber::{fmt, layer::SubscriberExt, EnvFilter, Layer, Registry}; mod db; mod parser; -mod s3; +mod filesystem; const GET_METRIC: &str = "total_gets"; const GET_IN_FLIGHT: &str = "gets_in_flight"; @@ -35,11 +32,6 
@@ struct Config { surreal_username: String, surreal_password: String, - s3_url: String, - s3_bucket: String, - s3_access_key: String, - s3_secret_key: String, - crawl_filter: String, budget: usize, } @@ -52,7 +44,9 @@ async fn main() { .open("./docker/logs/tracing.log") .expect("Couldn't make log file!"); - let filter = EnvFilter::from_default_env(); + let filter = EnvFilter::builder() + .with_default_directive(LevelFilter::DEBUG.into()) + .from_env_lossy(); let registry = Registry::default().with( fmt::layer() @@ -75,7 +69,7 @@ async fn main() { .install() .expect("failed to install recorder/exporter"); - debug!("Starting..."); + info!("Starting..."); // Would probably take these in as parameters from a cli let starting_url = "https://en.wikipedia.org/"; // When getting uncrawled pages, name must contain this variable. "" will effectively get ignored. @@ -92,9 +86,6 @@ async fn main() { let db = connect(&config) .await .expect("Failed to connect to surreal, aborting."); - let s3 = S3::connect(&config) - .await - .expect("Failed to connect to minio, aborting.\n\nThis probably means you need to login to the minio console and get a new access key!\n\n(Probably here) http://localhost:9001/access-keys/new-account\n\n"); let reqwest = reqwest::Client::builder() // .use_rustls_tls() @@ -108,7 +99,7 @@ async fn main() { let pre_loop_span = span.enter(); // Download the site let site = Website::new(starting_url, false); - process(site, db.clone(), reqwest.clone(), s3.clone()).await; + process(site, db.clone(), reqwest.clone()).await; drop(pre_loop_span); @@ -126,20 +117,15 @@ async fn main() { info!("Had more budget but finished crawling everything."); return; } - debug!("Crawling {} pages...", uncrawled.len()); - - let span = trace_span!("Crawling"); - let _ = span.enter(); { let mut futures = JoinSet::new(); for site in uncrawled { gauge!(BEING_PROCESSED).increment(1); - futures.spawn(process(site, db.clone(), reqwest.clone(), s3.clone())); + futures.spawn(process(site, db.clone(), reqwest.clone())); // let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32); // info!("Crawled {crawled} out of {budget} pages. ({percent})"); } - debug!("Joining {} futures...", futures.len()); let c = counter!(SITES_CRAWLED); // As futures complete runs code in while block @@ -152,13 +138,13 @@ async fn main() { } drop(span); - debug!("Done"); + info!("Done"); } -#[instrument(skip(db, s3, reqwest))] +#[instrument(skip(db, reqwest))] /// Downloads and crawls and stores a webpage. /// It is acceptable to clone `db`, `reqwest`, and `s3` because they all use `Arc`s internally. - Noted by Oliver -async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Client, s3: S3) { +async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Client) { // METRICS trace!("Process: {}", &site.site); @@ -182,7 +168,7 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien .await .expect("Failed to read http response's body!"); // Store document - s3.store(&data, &site.site).await; + filesystem::store(&data, &site.site).await; // Parse document and get relationships let sites = parser::parse(&site, &data).await; @@ -191,12 +177,19 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien site.set_crawled(); Website::store_all(vec![site], &db).await; - // Store all the other sites so that we can link to them. 
- // let mut links_to = Vec::new(); - let _ = Website::store_all(sites, &db).await; + // De-duplicate this list + let prev_len = sites.len(); + let set = sites.into_iter().fold(HashSet::new(), |mut set,item| { + set.insert(item); + set + }); + let de_dupe_sites: Vec = set.into_iter().collect(); + let diff = prev_len - de_dupe_sites.len(); + trace!("Saved {diff} from being entered into the db by de-duping"); + + // Store all the other sites so that we can link to them. + let _ = Website::store_all(de_dupe_sites, &db).await; - // Make the database's links reflect the html links between sites - // site.links_to(others, &db).await; } else { error!("Failed to get: {}", &site.site); } diff --git a/src/parser.rs b/src/parser.rs index 3fc1276..49001e0 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -5,18 +5,22 @@ use html5ever::tokenizer::{BufferQueue, TokenizerResult}; use html5ever::tokenizer::{StartTag, TagToken}; use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; use html5ever::{local_name, tendril::*}; -use tracing::instrument; +use tracing::{debug, error, info, instrument, trace, warn}; +use url::Url; use crate::db::Website; impl TokenSink for Website { type Handle = Vec; + #[instrument(skip(token, _line_number))] fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult { match token { TagToken(tag) => { if tag.kind == StartTag { match tag.name { + // this should be all the html + // elements that have links local_name!("a") | local_name!("audio") | local_name!("area") @@ -31,23 +35,46 @@ impl TokenSink for Website { let attr_name = attr.name.local.to_string(); if attr_name == "src" || attr_name == "href" || attr_name == "data" { - // Get clone of the current site object - let mut web = self.clone(); - - // Set url - let mut url = web.site; - url.set_fragment(None); // removes #xyz - let joined = url - .join(&attr.value) - .expect("Failed to join url during parsing!"); - web.site = joined; - - web.crawled = false; - - links.push(web); + let url: Option = match Url::parse(&attr.value) { + Ok(ok) => { + trace!("Found `{}` in the html on `{}` tag", ok.to_string(), tag.name); + Some(ok) + }, + Err(e) => { + if attr.value.eq(&Tendril::from_char('#')) { + trace!("Rejecting # url"); + None + } else { + match e { + url::ParseError::RelativeUrlWithoutBase => { + let origin = self.site.origin().ascii_serialization(); + let url = origin.clone() + &attr.value; + trace!("Built `{url}` from `{origin} + {}`", &attr.value.to_string()); + if let Ok(url) = Url::parse(&url) { + trace!("Saved relative url `{}` AS: `{}`", &attr.value, url); + Some(url) + } else { + error!("Failed to reconstruct a url from relative url: `{}` on site: `{}`", &attr.value, self.site.to_string()); + None + } + }, + _ => { + error!("MISC error: {:?} {:?}", e, &attr.value); + None + }, + } + } + }, + }; + if let Some(mut parsed) = url { + parsed.set_query(None); + parsed.set_fragment(None); + debug!("Final cleaned URL: `{}`", parsed.to_string()); + let web = Website::new(&parsed.to_string(), false); + links.push(web); + } } } - return TokenSinkResult::Script(links); } local_name!("button") | local_name!("meta") | local_name!("iframe") => { @@ -76,7 +103,7 @@ pub async fn parse(site: &Website, data: &str) -> Vec { token_buffer.push_back( chunk .try_reinterpret::() - .expect("Failed to reinterprt chunk!"), + .expect("Failed to reinterpret chunk!"), ); // create the tokenizer let tokenizer = Tokenizer::new(site.clone(), TokenizerOpts::default()); diff --git a/src/s3.rs b/src/s3.rs 
deleted file mode 100644 index 4efc2b4..0000000 --- a/src/s3.rs +++ /dev/null @@ -1,100 +0,0 @@ -use metrics::counter; -use minio::s3::{ - args::{BucketExistsArgs, MakeBucketArgs}, - client::ClientBuilder, - creds::StaticProvider, - error::Error, - http::BaseUrl, - Client, -}; -use tracing::{instrument, trace, warn}; -use url::Url; - -use crate::{db::Website, Config}; - -const S3_ROUND_TRIP_METRIC: &str = "s3_trips"; - -#[derive(Clone)] -pub struct S3 { - bucket_name: String, - client: Client, -} - -impl S3 { - #[instrument(skip_all, name = "S3")] - pub async fn connect(config: &Config) -> Result { - let base_url = config - .s3_url - .parse::() - .expect("Failed to parse url into BaseUrl"); - - let static_provider = - StaticProvider::new(&config.s3_access_key, &config.s3_secret_key, None); - - let client = ClientBuilder::new(base_url) - .provider(Some(Box::new(static_provider))) - .build()?; - - trace!("Checking bucket..."); - let exists = client - .bucket_exists( - &BucketExistsArgs::new(&config.s3_bucket) - .expect("Failed to check if bucket exists"), - ) - .await?; - counter!(S3_ROUND_TRIP_METRIC).increment(1); - - if !exists { - trace!("Creating bucket..."); - client - .make_bucket( - &MakeBucketArgs::new(&config.s3_bucket).expect("Failed to create bucket!"), - ) - .await?; - } - counter!(S3_ROUND_TRIP_METRIC).increment(1); - - trace!("Connection successful"); - - Ok(Self { - bucket_name: config.s3_bucket.to_owned(), - client, - }) - } - - #[instrument(name = "s3_store", skip_all)] - pub async fn store(&self, data: &str, url: &Url) { - let counter = counter!(S3_ROUND_TRIP_METRIC); - - let filename = Website::get_url_as_string(url); - trace!("Storing {} as {filename}", url.to_string()); - - counter.increment(1); - match &self - .client - .put_object_content(&self.bucket_name, &filename, data.to_owned()) - .send() - .await - { - Ok(_) => {} - Err(err) => match err { - Error::InvalidObjectName(_) => { - // This code will really only run if the url has non-english chars - warn!("Tried storing invalid object name, retrying with Base64 encoding. 
Last try."); - - let filename: String = Website::get_url_as_b64_path(url); - - counter.increment(1); - let _ = &self - .client - .put_object_content(&self.bucket_name, &filename, data.to_owned()) - .send() - .await - .unwrap(); - } - _ => {} - }, - }; - } -} - From a23429104c50260af8e304c53629c9008949c306 Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 21 Mar 2025 06:03:34 +0000 Subject: [PATCH 02/14] dead code removal --- src/db.rs | 2 +- src/filesystem.rs | 2 +- src/main.rs | 2 +- src/parser.rs | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/db.rs b/src/db.rs index 03a1850..b5e913a 100644 --- a/src/db.rs +++ b/src/db.rs @@ -8,7 +8,7 @@ use surrealdb::{ Surreal, }; use tokio::sync::Mutex; -use tracing::{error, instrument, trace, warn}; +use tracing::{error, instrument, trace}; use url::Url; use crate::Config; diff --git a/src/filesystem.rs b/src/filesystem.rs index 9a3f761..d95af58 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -8,7 +8,7 @@ use url::Url; pub async fn store(data: &str, url: &Url) { let path = PathBuf::from("./downloaded".to_string() + url.path()); let basepath = path.ancestors().skip(1).take(1).collect::(); - trace!("Built path: {:?} and base path: {:?}", &path, &basepath); + trace!("Save path: {:?} and base path: {:?}", &path, &basepath); if let Err(err) = fs::create_dir_all(&basepath).await { let ex = path.ancestors().fold(String::new(), |mut s, item| { s += ", "; diff --git a/src/main.rs b/src/main.rs index c5c96cd..399fe0a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,7 +3,7 @@ extern crate html5ever; use std::{ - collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr}, str::FromStr, time + collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr} }; use db::{connect, Website}; diff --git a/src/parser.rs b/src/parser.rs index 49001e0..ac55bf0 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -5,7 +5,7 @@ use html5ever::tokenizer::{BufferQueue, TokenizerResult}; use html5ever::tokenizer::{StartTag, TagToken}; use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; use html5ever::{local_name, tendril::*}; -use tracing::{debug, error, info, instrument, trace, warn}; +use tracing::{debug, error, instrument, trace, warn}; use url::Url; use crate::db::Website; @@ -41,7 +41,7 @@ impl TokenSink for Website { Some(ok) }, Err(e) => { - if attr.value.eq(&Tendril::from_char('#')) { + if attr.value.starts_with('#') { trace!("Rejecting # url"); None } else { From be0fd5505b31367c2a7530520fe1403db1c0f4a0 Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 21 Mar 2025 06:48:17 +0000 Subject: [PATCH 03/14] i think the files work better --- src/filesystem.rs | 36 ++++++++++++++++++++++++------------ src/main.rs | 2 ++ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/filesystem.rs b/src/filesystem.rs index d95af58..8ecfd49 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,24 +1,36 @@ use std::path::PathBuf; +use reqwest::header::HeaderValue; use tokio::fs; use tracing::{error, instrument, trace}; use url::Url; #[instrument(skip(data))] pub async fn store(data: &str, url: &Url) { - let path = PathBuf::from("./downloaded".to_string() + url.path()); - let basepath = path.ancestors().skip(1).take(1).collect::(); - trace!("Save path: {:?} and base path: {:?}", &path, &basepath); - if let Err(err) = fs::create_dir_all(&basepath).await { - let ex = path.ancestors().fold(String::new(), |mut s, item| { - s += ", "; - s += &item.to_string_lossy().to_string(); - s - }); - 
error!("Dir creation: {err} {:?} {ex}", basepath); + // extract data from url to save it accurately + let url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path()); + + // if it's a file + let (basepath, filename) = if url_path.extension().is_some() { + // get everything up till the file + let basepath = url_path.ancestors().skip(1).take(1).collect::(); + // get the file name + let filename = url_path.file_name().expect("This should exist").to_string_lossy(); + trace!("Save path: {:?} and base path: {:?}", &url_path, &basepath); + (basepath, filename.to_string()) } else { - if let Err(err) = fs::write(&path, data).await { - error!("File creation: {err} {:?}", path); + (url_path.clone(), "index.html".into()) + }; + + // create the folders + if let Err(err) = fs::create_dir_all(&basepath).await { + error!("Dir creation: {err} {:?}", basepath); + } else { + // FIXME I don't think this handles index.html files well... + // TODO this should probably append .html to non-described files + // create the file if that was successful + if let Err(err) = fs::write(&basepath.join(filename), data).await { + error!("File creation: {err} {:?}", url_path); } } } diff --git a/src/main.rs b/src/main.rs index 399fe0a..f93dac5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ use std::{ }; use db::{connect, Website}; +use filesystem::FileType; use metrics::{counter, gauge}; use metrics_exporter_prometheus::PrometheusBuilder; use serde::Deserialize; @@ -167,6 +168,7 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien .text() .await .expect("Failed to read http response's body!"); + // Store document filesystem::store(&data, &site.site).await; From 2de01b2a0e512fa25e798eec586766e782a725d4 Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 21 Mar 2025 06:48:39 +0000 Subject: [PATCH 04/14] remove removed code --- src/filesystem.rs | 1 - src/main.rs | 1 - 2 files changed, 2 deletions(-) diff --git a/src/filesystem.rs b/src/filesystem.rs index 8ecfd49..578ae7e 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,6 +1,5 @@ use std::path::PathBuf; -use reqwest::header::HeaderValue; use tokio::fs; use tracing::{error, instrument, trace}; use url::Url; diff --git a/src/main.rs b/src/main.rs index f93dac5..9f20638 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,7 +7,6 @@ use std::{ }; use db::{connect, Website}; -use filesystem::FileType; use metrics::{counter, gauge}; use metrics_exporter_prometheus::PrometheusBuilder; use serde::Deserialize; From 808790a7c3bfd21d0c639fdfcecf4591ff889bbd Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 21 Mar 2025 07:11:51 +0000 Subject: [PATCH 05/14] file patch; --- Crawler.toml | 2 +- src/filesystem.rs | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/Crawler.toml b/Crawler.toml index 7072e33..a23d0f4 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -3,7 +3,7 @@ surreal_url = "localhost:8000" surreal_username = "root" surreal_password = "root" surreal_ns = "test" -surreal_db = "v1.18.1" +surreal_db = "v1.19.1" # Crawler config crawl_filter = "en.wikipedia.com" diff --git a/src/filesystem.rs b/src/filesystem.rs index 578ae7e..e92b9aa 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,7 +1,7 @@ -use std::path::PathBuf; +use std::{ffi::OsStr, path::PathBuf}; use tokio::fs; -use tracing::{error, instrument, trace}; +use tracing::{error, instrument, trace, warn}; use url::Url; #[instrument(skip(data))] @@ -10,7 +10,7 @@ pub async fn 
store(data: &str, url: &Url) { let url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path()); // if it's a file - let (basepath, filename) = if url_path.extension().is_some() { + let (basepath, filename) = if url_path.extension().filter(valid_file_extension).is_some() { // get everything up till the file let basepath = url_path.ancestors().skip(1).take(1).collect::(); // get the file name @@ -33,3 +33,36 @@ pub async fn store(data: &str, url: &Url) { } } } + +fn valid_file_extension(take: &&OsStr) -> bool { + let los = take.to_string_lossy(); + let all = los.split('.'); + match all.last() { + Some(s) => { + match s.to_lowercase().as_str() { + "html" => true, + "css" => true, + "js" => true, + "ts" => true, + "otf" => true, // font + + "png" => true, + "svg" => true, + "jpg" => true, + "jpeg" => true, + "mp4" => true, + "mp3" => true, + "webp" => true, + + "pdf" => true, + "json" => true, + "xml" => true, + _ => { + warn!("Might be forgetting a file extension: {s}"); + false + } + } + }, + None => false, + } +} From b750d88d48b74fbf9373c8ed46150c13e358dabe Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Fri, 21 Mar 2025 11:42:43 -0600 Subject: [PATCH 06/14] working filesystem storage --- Crawler.toml | 2 +- docker/compose.yml | 16 -------- docker/prometheus.yaml | 8 +--- src/filesystem.rs | 4 +- src/parser.rs | 90 ++++++++++++++++++++++++++---------------- 5 files changed, 63 insertions(+), 57 deletions(-) diff --git a/Crawler.toml b/Crawler.toml index a23d0f4..8940c1b 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -3,7 +3,7 @@ surreal_url = "localhost:8000" surreal_username = "root" surreal_password = "root" surreal_ns = "test" -surreal_db = "v1.19.1" +surreal_db = "v1.19.2" # Crawler config crawl_filter = "en.wikipedia.com" diff --git a/docker/compose.yml b/docker/compose.yml index 7cb6dcc..747afa0 100644 --- a/docker/compose.yml +++ b/docker/compose.yml @@ -14,22 +14,6 @@ services: - --pass - root - rocksdb:/mydata/database.db - minio: - image: quay.io/minio/minio - ports: - - 9000:9000 - - 9001:9001 - environment: - - MINIO_ROOT_USER=root - - MINIO_ROOT_PASSWORD=an8charpassword - - MINIO_PROMETHEUS_AUTH_TYPE=public - volumes: - - minio_storage:/data - command: - - server - - /data - - --console-address - - ":9001" alloy: image: grafana/alloy:latest diff --git a/docker/prometheus.yaml b/docker/prometheus.yaml index 9133e15..ffc1e24 100644 --- a/docker/prometheus.yaml +++ b/docker/prometheus.yaml @@ -7,15 +7,11 @@ scrape_configs: static_configs: # change this your machine's ip, localhost won't work # because localhost refers to the docker container. 
- # - targets: ['172.20.239.48:2500'] - - targets: ['192.168.8.209:2500'] + - targets: ['172.20.239.48:2500'] + #- targets: ['192.168.8.209:2500'] - job_name: loki static_configs: - targets: ['loki:3100'] - job_name: prometheus static_configs: - targets: ['localhost:9090'] - - job_name: minio - metrics_path: /minio/v2/metrics/cluster - static_configs: - - targets: ['minio:9000'] diff --git a/src/filesystem.rs b/src/filesystem.rs index e92b9aa..85fcf30 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,7 +1,7 @@ use std::{ffi::OsStr, path::PathBuf}; use tokio::fs; -use tracing::{error, instrument, trace, warn}; +use tracing::{debug, error, info, instrument, trace, warn}; use url::Url; #[instrument(skip(data))] @@ -21,6 +21,8 @@ pub async fn store(data: &str, url: &Url) { (url_path.clone(), "index.html".into()) }; + info!("Writing at: {:?} {:?}", basepath, filename); + // create the folders if let Err(err) = fs::create_dir_all(&basepath).await { error!("Dir creation: {err} {:?}", basepath); diff --git a/src/parser.rs b/src/parser.rs index ac55bf0..c1e87e2 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -19,8 +19,7 @@ impl TokenSink for Website { TagToken(tag) => { if tag.kind == StartTag { match tag.name { - // this should be all the html - // elements that have links + // this should be all the html elements that have links local_name!("a") | local_name!("audio") | local_name!("area") @@ -35,37 +34,9 @@ impl TokenSink for Website { let attr_name = attr.name.local.to_string(); if attr_name == "src" || attr_name == "href" || attr_name == "data" { - let url: Option = match Url::parse(&attr.value) { - Ok(ok) => { - trace!("Found `{}` in the html on `{}` tag", ok.to_string(), tag.name); - Some(ok) - }, - Err(e) => { - if attr.value.starts_with('#') { - trace!("Rejecting # url"); - None - } else { - match e { - url::ParseError::RelativeUrlWithoutBase => { - let origin = self.site.origin().ascii_serialization(); - let url = origin.clone() + &attr.value; - trace!("Built `{url}` from `{origin} + {}`", &attr.value.to_string()); - if let Ok(url) = Url::parse(&url) { - trace!("Saved relative url `{}` AS: `{}`", &attr.value, url); - Some(url) - } else { - error!("Failed to reconstruct a url from relative url: `{}` on site: `{}`", &attr.value, self.site.to_string()); - None - } - }, - _ => { - error!("MISC error: {:?} {:?}", e, &attr.value); - None - }, - } - } - }, - }; + trace!("Found `{}` in html `{}` tag", &attr.value, tag.name); + let url = try_get_url(&self.site, &attr.value); + if let Some(mut parsed) = url { parsed.set_query(None); parsed.set_fragment(None); @@ -119,3 +90,56 @@ pub async fn parse(site: &Website, data: &str) -> Vec { other_sites } +#[instrument] +fn try_get_url(parent: &Url, link: &str) -> Option { + match Url::parse(link) { + Ok(ok) => Some(ok), + Err(e) => { + if link.starts_with('#') { + trace!("Rejecting # url"); + None + } else if link.starts_with("//") { + // if a url starts with "//" is assumed that it will adopt + // the same scheme as it's parent + // https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute + let scheme = parent.scheme(); + + match Url::parse(&format!("{scheme}://{}", link)) { + Ok(url) => Some(url), + Err(err) => { + error!("Failed parsing realative scheme url: {}", err); + None + } + } + } else { + // # This is some sort of realative url, gonna try patching it up into an absolute + // url + match e { + url::ParseError::RelativeUrlWithoutBase => { + // Is: scheme://host:port + let origin = 
parent.origin().ascii_serialization(); + let url = origin.clone() + link; + + trace!("Built `{url}` from `{origin} + {}`", link.to_string()); + + if let Ok(url) = Url::parse(&url) { + trace!("Saved relative url `{}` AS: `{}`", link, url); + Some(url) + } else { + error!( + "Failed to reconstruct a url from relative url: `{}` on site: `{}`", + link, + parent.to_string() + ); + None + } + } + _ => { + error!("MISC error: {:?} {:?}", e, link); + None + } + } + } + } + } +} From 96a3ca092ab73874442f19f812ead07c4e989208 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Fri, 21 Mar 2025 12:11:05 -0600 Subject: [PATCH 07/14] :) --- src/db.rs | 15 +-------------- src/filesystem.rs | 4 ++-- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/src/db.rs b/src/db.rs index b5e913a..4083ddc 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,21 +1,17 @@ use metrics::counter; +use std::fmt::Debug; use serde::{Deserialize, Serialize}; -use std::{fmt::Debug, sync::LazyLock, time::Instant}; use surrealdb::{ engine::remote::ws::{Client, Ws}, opt::auth::Root, sql::Thing, Surreal, }; -use tokio::sync::Mutex; use tracing::{error, instrument, trace}; use url::Url; use crate::Config; -static LOCK: LazyLock> = LazyLock::new(|| Mutex::new(true)); - -const TIME_SPENT_ON_LOCK: &str = "surql_lock_waiting_ms"; const STORE: &str = "surql_store_calls"; #[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Hash)] @@ -57,14 +53,6 @@ impl Website { counter!(STORE).increment(1); let mut things = Vec::with_capacity(all.len()); - // TODO this only allows for one thread to be in the database at a time. - // This is currently required since otherwise we get write errors. - // If the default `crawled` is set to false, we might not need to write any more - // than just the name. `accessed_at` is fun but not needed. 
- let now = Instant::now(); - let lock = LOCK.lock().await; - counter!(TIME_SPENT_ON_LOCK).increment(now.elapsed().as_millis() as u64); - match db .query( "INSERT INTO website $array @@ -85,7 +73,6 @@ impl Website { error!("{:?}", err); } } - drop(lock); things } } diff --git a/src/filesystem.rs b/src/filesystem.rs index 85fcf30..46c1af3 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,7 +1,7 @@ use std::{ffi::OsStr, path::PathBuf}; use tokio::fs; -use tracing::{debug, error, info, instrument, trace, warn}; +use tracing::{debug, error, instrument, trace, warn}; use url::Url; #[instrument(skip(data))] @@ -21,7 +21,7 @@ pub async fn store(data: &str, url: &Url) { (url_path.clone(), "index.html".into()) }; - info!("Writing at: {:?} {:?}", basepath, filename); + debug!("Writing at: {:?} {:?}", basepath, filename); // create the folders if let Err(err) = fs::create_dir_all(&basepath).await { From 6fc71c7a7891fc11fd9318a86dafc52a087fbc03 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Fri, 21 Mar 2025 12:14:29 -0600 Subject: [PATCH 08/14] add speed improvements --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 6b2890c..230af74 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,8 @@ This ment we stored 1000 pages, 142,997 urls, and 1,425,798 links between the tw 3/20/25: Took 5min to crawl 1000 pages +3/21/25: Took 3min to crawl 1000 pages + # About ![Screenshot](/pngs/graphana.png) From 03cbcd9ae0c24d4428007b9772427defbebaf7a1 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Mon, 31 Mar 2025 14:18:11 -0600 Subject: [PATCH 09/14] remove minio code --- docker/compose.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/compose.yml b/docker/compose.yml index 747afa0..efe16f2 100644 --- a/docker/compose.yml +++ b/docker/compose.yml @@ -66,4 +66,3 @@ volumes: grafana_storage: alloy_storage: surrealdb_storage: - minio_storage: From 4a433a1a772f7d51dc117ca1dbcc7f125b4c3840 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Mon, 31 Mar 2025 14:18:37 -0600 Subject: [PATCH 10/14] This function sometimes throws errors, this logging should help --- src/db.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/db.rs b/src/db.rs index 4083ddc..06cddde 100644 --- a/src/db.rs +++ b/src/db.rs @@ -49,6 +49,7 @@ impl Website { // Insert ever item in the vec into surreal, crawled state will be preserved as TRUE // if already in the database as such or incoming data is TRUE. + #[instrument(skip(db))] pub async fn store_all(all: Vec, db: &Surreal) -> Vec { counter!(STORE).increment(1); let mut things = Vec::with_capacity(all.len()); From add6f00ed6dc1b61fc1fffbb61af539fd92f7a7b Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Mon, 31 Mar 2025 14:53:10 -0600 Subject: [PATCH 11/14] no recomp needed --- Crawler.toml | 7 ++++--- README.md | 5 +++-- src/filesystem.rs | 7 +++++++ src/main.rs | 16 ++++++++++++++-- 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/Crawler.toml b/Crawler.toml index 8940c1b..0048238 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -3,8 +3,9 @@ surreal_url = "localhost:8000" surreal_username = "root" surreal_password = "root" surreal_ns = "test" -surreal_db = "v1.19.2" +surreal_db = "v1.19.5" # Crawler config -crawl_filter = "en.wikipedia.com" -budget = 1000 +crawl_filter = "en.wikipedia.org" +start_url = "https://en.wikipedia.org" +budget = 100 diff --git a/README.md b/README.md index 230af74..a223089 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,12 @@ Crawls sites saving all the found links to a surrealdb database. 
It then proceed ### TODO -- [ ] Domain filtering - prevent the crawler from going on alternate versions of wikipedia. +- [x] Domain filtering - prevent the crawler from going on alternate versions of wikipedia. - [ ] Conditionally save content - based on filename or file contents - [x] GUI / TUI ? - Graphana - [x] Better asynchronous getting of the sites. Currently it all happens serially. -- [ ] Allow for storing asynchronously +- [x] Allow for storing asynchronously - dropping the "links to" logic fixes this need +- [x] Control crawler via config file (no recompliation needed) 3/17/25: Took >1hr to crawl 100 pages diff --git a/src/filesystem.rs b/src/filesystem.rs index 46c1af3..5a253f8 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -59,6 +59,13 @@ fn valid_file_extension(take: &&OsStr) -> bool { "pdf" => true, "json" => true, "xml" => true, + + // IGNORE + // TODO Should this be a list of all domains? + "org" => false, + "com" => false, + "net" => false, + _ => { warn!("Might be forgetting a file extension: {s}"); false diff --git a/src/main.rs b/src/main.rs index 9f20638..dc13d1e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -33,11 +33,14 @@ struct Config { surreal_password: String, crawl_filter: String, + start_url: String, budget: usize, } #[tokio::main] async fn main() { + println!("Logs and metrics are provided to the Grafana dashboard"); + let writer = std::fs::OpenOptions::new() .append(true) .create(true) @@ -70,8 +73,7 @@ async fn main() { .expect("failed to install recorder/exporter"); info!("Starting..."); - // Would probably take these in as parameters from a cli - let starting_url = "https://en.wikipedia.org/"; + // When getting uncrawled pages, name must contain this variable. "" will effectively get ignored. // let crawl_filter = "en.wikipedia.org/"; // let budget = 50; @@ -82,6 +84,7 @@ async fn main() { let _ = file.read_to_string(&mut buf); let config: Config = toml::from_str(&buf).expect("Failed to parse Crawler.toml"); + let starting_url = &config.start_url; let db = connect(&config) .await @@ -138,6 +141,15 @@ async fn main() { } drop(span); + if let Ok(mut ok) = db.query("count(select id from website where crawled = true)").await { + let res = ok.take::>(0); + if let Ok(i) = res { + if let Some(n) = i { + info!("Total crawled pages now equals {n}"); + } + } + } + info!("Done"); } From a9465dda6e32d86a92d0b14e23a1513db9272bf4 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Mon, 31 Mar 2025 15:05:18 -0600 Subject: [PATCH 12/14] add instructions --- README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/README.md b/README.md index a223089..7ec5c6d 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,35 @@ Crawls sites saving all the found links to a surrealdb database. It then proceeds to take batches of 100 uncrawled links untill the crawl budget is reached. It saves the data of each site in a minio database. +## How to use + +1. Clone the repo and `cd` into it. +2. Build the repo with `cargo build -r` +3. Start the docker conatiners + 1. cd into the docker folder `cd docker` + 2. Bring up the docker containers `docker compose up -d` +4. From the project's root, edit the `Crawler.toml` file to your liking. +5. Run with `./target/release/internet_mapper` + +You can view stats of the project at `http://:3000/dashboards` + +```bash +# Untested script but probably works +git clone https://git.oliveratkinson.net/Oliver/internet_mapper.git +cd internet_mapper + +cargo build -r + +cd docker +docker compose up -d +cd .. 
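+# Set crawl_filter, start_url, and budget in Crawler.toml before running (see step 4 above)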
+ +$EDITOR Crawler.toml + +./target/release/internet_mapper + +``` + ### TODO - [x] Domain filtering - prevent the crawler from going on alternate versions of wikipedia. From 94912e9125c3746ed882b4bbbf0fdf4ce12f670b Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Tue, 15 Apr 2025 09:06:57 -0600 Subject: [PATCH 13/14] change up how files are discovered --- src/filesystem.rs | 47 +- src/tlds-alpha-by-domain.txt | 1444 ++++++++++++++++++++++++++++++++++ 2 files changed, 1459 insertions(+), 32 deletions(-) create mode 100644 src/tlds-alpha-by-domain.txt diff --git a/src/filesystem.rs b/src/filesystem.rs index 5a253f8..81ad057 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -5,7 +5,10 @@ use tracing::{debug, error, instrument, trace, warn}; use url::Url; #[instrument(skip(data))] -pub async fn store(data: &str, url: &Url) { +/// Returns whether or not the saved file should be parsed. +/// If the file is just data, like an image, it doesn't need to be parsed. +/// If it's html, then it does need to be parsed. +pub async fn store(data: &str, url: &Url) -> bool { // extract data from url to save it accurately let url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path()); @@ -21,19 +24,20 @@ pub async fn store(data: &str, url: &Url) { (url_path.clone(), "index.html".into()) }; + let should_parse = filename.ends_with(".html"); + debug!("Writing at: {:?} {:?}", basepath, filename); // create the folders if let Err(err) = fs::create_dir_all(&basepath).await { error!("Dir creation: {err} {:?}", basepath); } else { - // FIXME I don't think this handles index.html files well... - // TODO this should probably append .html to non-described files - // create the file if that was successful if let Err(err) = fs::write(&basepath.join(filename), data).await { error!("File creation: {err} {:?}", url_path); } } + + should_parse } fn valid_file_extension(take: &&OsStr) -> bool { @@ -41,35 +45,14 @@ fn valid_file_extension(take: &&OsStr) -> bool { let all = los.split('.'); match all.last() { Some(s) => { - match s.to_lowercase().as_str() { - "html" => true, - "css" => true, - "js" => true, - "ts" => true, - "otf" => true, // font + // FIXME it's worth noting that the dumb tlds like .zip are in here, + // which could cause problems + let all_domains = include_str!("tlds-alpha-by-domain.txt"); - "png" => true, - "svg" => true, - "jpg" => true, - "jpeg" => true, - "mp4" => true, - "mp3" => true, - "webp" => true, - - "pdf" => true, - "json" => true, - "xml" => true, - - // IGNORE - // TODO Should this be a list of all domains? 
- "org" => false, - "com" => false, - "net" => false, - - _ => { - warn!("Might be forgetting a file extension: {s}"); - false - } + // check if it is a domain + match all_domains.lines().map(str::to_lowercase).find(|x| x==s.to_lowercase().as_str()) { + Some(_) => false, + None => true } }, None => false, diff --git a/src/tlds-alpha-by-domain.txt b/src/tlds-alpha-by-domain.txt new file mode 100644 index 0000000..327f649 --- /dev/null +++ b/src/tlds-alpha-by-domain.txt @@ -0,0 +1,1444 @@ +# Version 2025041500, Last Updated Tue Apr 15 07:07:01 2025 UTC +AAA +AARP +ABB +ABBOTT +ABBVIE +ABC +ABLE +ABOGADO +ABUDHABI +AC +ACADEMY +ACCENTURE +ACCOUNTANT +ACCOUNTANTS +ACO +ACTOR +AD +ADS +ADULT +AE +AEG +AERO +AETNA +AF +AFL +AFRICA +AG +AGAKHAN +AGENCY +AI +AIG +AIRBUS +AIRFORCE +AIRTEL +AKDN +AL +ALIBABA +ALIPAY +ALLFINANZ +ALLSTATE +ALLY +ALSACE +ALSTOM +AM +AMAZON +AMERICANEXPRESS +AMERICANFAMILY +AMEX +AMFAM +AMICA +AMSTERDAM +ANALYTICS +ANDROID +ANQUAN +ANZ +AO +AOL +APARTMENTS +APP +APPLE +AQ +AQUARELLE +AR +ARAB +ARAMCO +ARCHI +ARMY +ARPA +ART +ARTE +AS +ASDA +ASIA +ASSOCIATES +AT +ATHLETA +ATTORNEY +AU +AUCTION +AUDI +AUDIBLE +AUDIO +AUSPOST +AUTHOR +AUTO +AUTOS +AW +AWS +AX +AXA +AZ +AZURE +BA +BABY +BAIDU +BANAMEX +BAND +BANK +BAR +BARCELONA +BARCLAYCARD +BARCLAYS +BAREFOOT +BARGAINS +BASEBALL +BASKETBALL +BAUHAUS +BAYERN +BB +BBC +BBT +BBVA +BCG +BCN +BD +BE +BEATS +BEAUTY +BEER +BENTLEY +BERLIN +BEST +BESTBUY +BET +BF +BG +BH +BHARTI +BI +BIBLE +BID +BIKE +BING +BINGO +BIO +BIZ +BJ +BLACK +BLACKFRIDAY +BLOCKBUSTER +BLOG +BLOOMBERG +BLUE +BM +BMS +BMW +BN +BNPPARIBAS +BO +BOATS +BOEHRINGER +BOFA +BOM +BOND +BOO +BOOK +BOOKING +BOSCH +BOSTIK +BOSTON +BOT +BOUTIQUE +BOX +BR +BRADESCO +BRIDGESTONE +BROADWAY +BROKER +BROTHER +BRUSSELS +BS +BT +BUILD +BUILDERS +BUSINESS +BUY +BUZZ +BV +BW +BY +BZ +BZH +CA +CAB +CAFE +CAL +CALL +CALVINKLEIN +CAM +CAMERA +CAMP +CANON +CAPETOWN +CAPITAL +CAPITALONE +CAR +CARAVAN +CARDS +CARE +CAREER +CAREERS +CARS +CASA +CASE +CASH +CASINO +CAT +CATERING +CATHOLIC +CBA +CBN +CBRE +CC +CD +CENTER +CEO +CERN +CF +CFA +CFD +CG +CH +CHANEL +CHANNEL +CHARITY +CHASE +CHAT +CHEAP +CHINTAI +CHRISTMAS +CHROME +CHURCH +CI +CIPRIANI +CIRCLE +CISCO +CITADEL +CITI +CITIC +CITY +CK +CL +CLAIMS +CLEANING +CLICK +CLINIC +CLINIQUE +CLOTHING +CLOUD +CLUB +CLUBMED +CM +CN +CO +COACH +CODES +COFFEE +COLLEGE +COLOGNE +COM +COMMBANK +COMMUNITY +COMPANY +COMPARE +COMPUTER +COMSEC +CONDOS +CONSTRUCTION +CONSULTING +CONTACT +CONTRACTORS +COOKING +COOL +COOP +CORSICA +COUNTRY +COUPON +COUPONS +COURSES +CPA +CR +CREDIT +CREDITCARD +CREDITUNION +CRICKET +CROWN +CRS +CRUISE +CRUISES +CU +CUISINELLA +CV +CW +CX +CY +CYMRU +CYOU +CZ +DAD +DANCE +DATA +DATE +DATING +DATSUN +DAY +DCLK +DDS +DE +DEAL +DEALER +DEALS +DEGREE +DELIVERY +DELL +DELOITTE +DELTA +DEMOCRAT +DENTAL +DENTIST +DESI +DESIGN +DEV +DHL +DIAMONDS +DIET +DIGITAL +DIRECT +DIRECTORY +DISCOUNT +DISCOVER +DISH +DIY +DJ +DK +DM +DNP +DO +DOCS +DOCTOR +DOG +DOMAINS +DOT +DOWNLOAD +DRIVE +DTV +DUBAI +DUNLOP +DUPONT +DURBAN +DVAG +DVR +DZ +EARTH +EAT +EC +ECO +EDEKA +EDU +EDUCATION +EE +EG +EMAIL +EMERCK +ENERGY +ENGINEER +ENGINEERING +ENTERPRISES +EPSON +EQUIPMENT +ER +ERICSSON +ERNI +ES +ESQ +ESTATE +ET +EU +EUROVISION +EUS +EVENTS +EXCHANGE +EXPERT +EXPOSED +EXPRESS +EXTRASPACE +FAGE +FAIL +FAIRWINDS +FAITH +FAMILY +FAN +FANS +FARM +FARMERS +FASHION +FAST +FEDEX +FEEDBACK +FERRARI +FERRERO +FI +FIDELITY +FIDO +FILM +FINAL +FINANCE +FINANCIAL +FIRE +FIRESTONE +FIRMDALE +FISH +FISHING +FIT +FITNESS +FJ +FK +FLICKR +FLIGHTS +FLIR 
+FLORIST +FLOWERS +FLY +FM +FO +FOO +FOOD +FOOTBALL +FORD +FOREX +FORSALE +FORUM +FOUNDATION +FOX +FR +FREE +FRESENIUS +FRL +FROGANS +FRONTIER +FTR +FUJITSU +FUN +FUND +FURNITURE +FUTBOL +FYI +GA +GAL +GALLERY +GALLO +GALLUP +GAME +GAMES +GAP +GARDEN +GAY +GB +GBIZ +GD +GDN +GE +GEA +GENT +GENTING +GEORGE +GF +GG +GGEE +GH +GI +GIFT +GIFTS +GIVES +GIVING +GL +GLASS +GLE +GLOBAL +GLOBO +GM +GMAIL +GMBH +GMO +GMX +GN +GODADDY +GOLD +GOLDPOINT +GOLF +GOO +GOODYEAR +GOOG +GOOGLE +GOP +GOT +GOV +GP +GQ +GR +GRAINGER +GRAPHICS +GRATIS +GREEN +GRIPE +GROCERY +GROUP +GS +GT +GU +GUCCI +GUGE +GUIDE +GUITARS +GURU +GW +GY +HAIR +HAMBURG +HANGOUT +HAUS +HBO +HDFC +HDFCBANK +HEALTH +HEALTHCARE +HELP +HELSINKI +HERE +HERMES +HIPHOP +HISAMITSU +HITACHI +HIV +HK +HKT +HM +HN +HOCKEY +HOLDINGS +HOLIDAY +HOMEDEPOT +HOMEGOODS +HOMES +HOMESENSE +HONDA +HORSE +HOSPITAL +HOST +HOSTING +HOT +HOTELS +HOTMAIL +HOUSE +HOW +HR +HSBC +HT +HU +HUGHES +HYATT +HYUNDAI +IBM +ICBC +ICE +ICU +ID +IE +IEEE +IFM +IKANO +IL +IM +IMAMAT +IMDB +IMMO +IMMOBILIEN +IN +INC +INDUSTRIES +INFINITI +INFO +ING +INK +INSTITUTE +INSURANCE +INSURE +INT +INTERNATIONAL +INTUIT +INVESTMENTS +IO +IPIRANGA +IQ +IR +IRISH +IS +ISMAILI +IST +ISTANBUL +IT +ITAU +ITV +JAGUAR +JAVA +JCB +JE +JEEP +JETZT +JEWELRY +JIO +JLL +JM +JMP +JNJ +JO +JOBS +JOBURG +JOT +JOY +JP +JPMORGAN +JPRS +JUEGOS +JUNIPER +KAUFEN +KDDI +KE +KERRYHOTELS +KERRYPROPERTIES +KFH +KG +KH +KI +KIA +KIDS +KIM +KINDLE +KITCHEN +KIWI +KM +KN +KOELN +KOMATSU +KOSHER +KP +KPMG +KPN +KR +KRD +KRED +KUOKGROUP +KW +KY +KYOTO +KZ +LA +LACAIXA +LAMBORGHINI +LAMER +LANCASTER +LAND +LANDROVER +LANXESS +LASALLE +LAT +LATINO +LATROBE +LAW +LAWYER +LB +LC +LDS +LEASE +LECLERC +LEFRAK +LEGAL +LEGO +LEXUS +LGBT +LI +LIDL +LIFE +LIFEINSURANCE +LIFESTYLE +LIGHTING +LIKE +LILLY +LIMITED +LIMO +LINCOLN +LINK +LIVE +LIVING +LK +LLC +LLP +LOAN +LOANS +LOCKER +LOCUS +LOL +LONDON +LOTTE +LOTTO +LOVE +LPL +LPLFINANCIAL +LR +LS +LT +LTD +LTDA +LU +LUNDBECK +LUXE +LUXURY +LV +LY +MA +MADRID +MAIF +MAISON +MAKEUP +MAN +MANAGEMENT +MANGO +MAP +MARKET +MARKETING +MARKETS +MARRIOTT +MARSHALLS +MATTEL +MBA +MC +MCKINSEY +MD +ME +MED +MEDIA +MEET +MELBOURNE +MEME +MEMORIAL +MEN +MENU +MERCKMSD +MG +MH +MIAMI +MICROSOFT +MIL +MINI +MINT +MIT +MITSUBISHI +MK +ML +MLB +MLS +MM +MMA +MN +MO +MOBI +MOBILE +MODA +MOE +MOI +MOM +MONASH +MONEY +MONSTER +MORMON +MORTGAGE +MOSCOW +MOTO +MOTORCYCLES +MOV +MOVIE +MP +MQ +MR +MS +MSD +MT +MTN +MTR +MU +MUSEUM +MUSIC +MV +MW +MX +MY +MZ +NA +NAB +NAGOYA +NAME +NAVY +NBA +NC +NE +NEC +NET +NETBANK +NETFLIX +NETWORK +NEUSTAR +NEW +NEWS +NEXT +NEXTDIRECT +NEXUS +NF +NFL +NG +NGO +NHK +NI +NICO +NIKE +NIKON +NINJA +NISSAN +NISSAY +NL +NO +NOKIA +NORTON +NOW +NOWRUZ +NOWTV +NP +NR +NRA +NRW +NTT +NU +NYC +NZ +OBI +OBSERVER +OFFICE +OKINAWA +OLAYAN +OLAYANGROUP +OLLO +OM +OMEGA +ONE +ONG +ONL +ONLINE +OOO +OPEN +ORACLE +ORANGE +ORG +ORGANIC +ORIGINS +OSAKA +OTSUKA +OTT +OVH +PA +PAGE +PANASONIC +PARIS +PARS +PARTNERS +PARTS +PARTY +PAY +PCCW +PE +PET +PF +PFIZER +PG +PH +PHARMACY +PHD +PHILIPS +PHONE +PHOTO +PHOTOGRAPHY +PHOTOS +PHYSIO +PICS +PICTET +PICTURES +PID +PIN +PING +PINK +PIONEER +PIZZA +PK +PL +PLACE +PLAY +PLAYSTATION +PLUMBING +PLUS +PM +PN +PNC +POHL +POKER +POLITIE +PORN +POST +PR +PRAMERICA +PRAXI +PRESS +PRIME +PRO +PROD +PRODUCTIONS +PROF +PROGRESSIVE +PROMO +PROPERTIES +PROPERTY +PROTECTION +PRU +PRUDENTIAL +PS +PT +PUB +PW +PWC +PY +QA +QPON +QUEBEC +QUEST +RACING +RADIO +RE +READ +REALESTATE +REALTOR +REALTY +RECIPES +RED +REDSTONE +REDUMBRELLA +REHAB +REISE 
+REISEN +REIT +RELIANCE +REN +RENT +RENTALS +REPAIR +REPORT +REPUBLICAN +REST +RESTAURANT +REVIEW +REVIEWS +REXROTH +RICH +RICHARDLI +RICOH +RIL +RIO +RIP +RO +ROCKS +RODEO +ROGERS +ROOM +RS +RSVP +RU +RUGBY +RUHR +RUN +RW +RWE +RYUKYU +SA +SAARLAND +SAFE +SAFETY +SAKURA +SALE +SALON +SAMSCLUB +SAMSUNG +SANDVIK +SANDVIKCOROMANT +SANOFI +SAP +SARL +SAS +SAVE +SAXO +SB +SBI +SBS +SC +SCB +SCHAEFFLER +SCHMIDT +SCHOLARSHIPS +SCHOOL +SCHULE +SCHWARZ +SCIENCE +SCOT +SD +SE +SEARCH +SEAT +SECURE +SECURITY +SEEK +SELECT +SENER +SERVICES +SEVEN +SEW +SEX +SEXY +SFR +SG +SH +SHANGRILA +SHARP +SHELL +SHIA +SHIKSHA +SHOES +SHOP +SHOPPING +SHOUJI +SHOW +SI +SILK +SINA +SINGLES +SITE +SJ +SK +SKI +SKIN +SKY +SKYPE +SL +SLING +SM +SMART +SMILE +SN +SNCF +SO +SOCCER +SOCIAL +SOFTBANK +SOFTWARE +SOHU +SOLAR +SOLUTIONS +SONG +SONY +SOY +SPA +SPACE +SPORT +SPOT +SR +SRL +SS +ST +STADA +STAPLES +STAR +STATEBANK +STATEFARM +STC +STCGROUP +STOCKHOLM +STORAGE +STORE +STREAM +STUDIO +STUDY +STYLE +SU +SUCKS +SUPPLIES +SUPPLY +SUPPORT +SURF +SURGERY +SUZUKI +SV +SWATCH +SWISS +SX +SY +SYDNEY +SYSTEMS +SZ +TAB +TAIPEI +TALK +TAOBAO +TARGET +TATAMOTORS +TATAR +TATTOO +TAX +TAXI +TC +TCI +TD +TDK +TEAM +TECH +TECHNOLOGY +TEL +TEMASEK +TENNIS +TEVA +TF +TG +TH +THD +THEATER +THEATRE +TIAA +TICKETS +TIENDA +TIPS +TIRES +TIROL +TJ +TJMAXX +TJX +TK +TKMAXX +TL +TM +TMALL +TN +TO +TODAY +TOKYO +TOOLS +TOP +TORAY +TOSHIBA +TOTAL +TOURS +TOWN +TOYOTA +TOYS +TR +TRADE +TRADING +TRAINING +TRAVEL +TRAVELERS +TRAVELERSINSURANCE +TRUST +TRV +TT +TUBE +TUI +TUNES +TUSHU +TV +TVS +TW +TZ +UA +UBANK +UBS +UG +UK +UNICOM +UNIVERSITY +UNO +UOL +UPS +US +UY +UZ +VA +VACATIONS +VANA +VANGUARD +VC +VE +VEGAS +VENTURES +VERISIGN +VERSICHERUNG +VET +VG +VI +VIAJES +VIDEO +VIG +VIKING +VILLAS +VIN +VIP +VIRGIN +VISA +VISION +VIVA +VIVO +VLAANDEREN +VN +VODKA +VOLVO +VOTE +VOTING +VOTO +VOYAGE +VU +WALES +WALMART +WALTER +WANG +WANGGOU +WATCH +WATCHES +WEATHER +WEATHERCHANNEL +WEBCAM +WEBER +WEBSITE +WED +WEDDING +WEIBO +WEIR +WF +WHOSWHO +WIEN +WIKI +WILLIAMHILL +WIN +WINDOWS +WINE +WINNERS +WME +WOLTERSKLUWER +WOODSIDE +WORK +WORKS +WORLD +WOW +WS +WTC +WTF +XBOX +XEROX +XIHUAN +XIN +XN--11B4C3D +XN--1CK2E1B +XN--1QQW23A +XN--2SCRJ9C +XN--30RR7Y +XN--3BST00M +XN--3DS443G +XN--3E0B707E +XN--3HCRJ9C +XN--3PXU8K +XN--42C2D9A +XN--45BR5CYL +XN--45BRJ9C +XN--45Q11C +XN--4DBRK0CE +XN--4GBRIM +XN--54B7FTA0CC +XN--55QW42G +XN--55QX5D +XN--5SU34J936BGSG +XN--5TZM5G +XN--6FRZ82G +XN--6QQ986B3XL +XN--80ADXHKS +XN--80AO21A +XN--80AQECDR1A +XN--80ASEHDB +XN--80ASWG +XN--8Y0A063A +XN--90A3AC +XN--90AE +XN--90AIS +XN--9DBQ2A +XN--9ET52U +XN--9KRT00A +XN--B4W605FERD +XN--BCK1B9A5DRE4C +XN--C1AVG +XN--C2BR7G +XN--CCK2B3B +XN--CCKWCXETD +XN--CG4BKI +XN--CLCHC0EA0B2G2A9GCD +XN--CZR694B +XN--CZRS0T +XN--CZRU2D +XN--D1ACJ3B +XN--D1ALF +XN--E1A4C +XN--ECKVDTC9D +XN--EFVY88H +XN--FCT429K +XN--FHBEI +XN--FIQ228C5HS +XN--FIQ64B +XN--FIQS8S +XN--FIQZ9S +XN--FJQ720A +XN--FLW351E +XN--FPCRJ9C3D +XN--FZC2C9E2C +XN--FZYS8D69UVGM +XN--G2XX48C +XN--GCKR3F0F +XN--GECRJ9C +XN--GK3AT1E +XN--H2BREG3EVE +XN--H2BRJ9C +XN--H2BRJ9C8C +XN--HXT814E +XN--I1B6B1A6A2E +XN--IMR513N +XN--IO0A7I +XN--J1AEF +XN--J1AMH +XN--J6W193G +XN--JLQ480N2RG +XN--JVR189M +XN--KCRX77D1X4A +XN--KPRW13D +XN--KPRY57D +XN--KPUT3I +XN--L1ACC +XN--LGBBAT1AD8J +XN--MGB9AWBF +XN--MGBA3A3EJT +XN--MGBA3A4F16A +XN--MGBA7C0BBN0A +XN--MGBAAM7A8H +XN--MGBAB2BD +XN--MGBAH1A3HJKRD +XN--MGBAI9AZGQP6J +XN--MGBAYH7GPA +XN--MGBBH1A +XN--MGBBH1A71E +XN--MGBC0A9AZCG +XN--MGBCA7DZDO +XN--MGBCPQ6GPA1A +XN--MGBERP4A5D4AR 
+XN--MGBGU82A +XN--MGBI4ECEXP +XN--MGBPL2FH +XN--MGBT3DHD +XN--MGBTX2B +XN--MGBX4CD0AB +XN--MIX891F +XN--MK1BU44C +XN--MXTQ1M +XN--NGBC5AZD +XN--NGBE9E0A +XN--NGBRX +XN--NODE +XN--NQV7F +XN--NQV7FS00EMA +XN--NYQY26A +XN--O3CW4H +XN--OGBPF8FL +XN--OTU796D +XN--P1ACF +XN--P1AI +XN--PGBS0DH +XN--PSSY2U +XN--Q7CE6A +XN--Q9JYB4C +XN--QCKA1PMC +XN--QXA6A +XN--QXAM +XN--RHQV96G +XN--ROVU88B +XN--RVC1E0AM3E +XN--S9BRJ9C +XN--SES554G +XN--T60B56A +XN--TCKWE +XN--TIQ49XQYJ +XN--UNUP4Y +XN--VERMGENSBERATER-CTB +XN--VERMGENSBERATUNG-PWB +XN--VHQUV +XN--VUQ861B +XN--W4R85EL8FHU5DNRA +XN--W4RS40L +XN--WGBH1C +XN--WGBL6A +XN--XHQ521B +XN--XKC2AL3HYE2A +XN--XKC2DL3A5EE0H +XN--Y9A3AQ +XN--YFRO4I67O +XN--YGBI2AMMX +XN--ZFR164B +XXX +XYZ +YACHTS +YAHOO +YAMAXUN +YANDEX +YE +YODOBASHI +YOGA +YOKOHAMA +YOU +YOUTUBE +YT +YUN +ZA +ZAPPOS +ZARA +ZERO +ZIP +ZM +ZONE +ZUERICH +ZW From c08a20ac003baeda95f3bb607da9b397f0ffbdc8 Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Tue, 15 Apr 2025 09:07:16 -0600 Subject: [PATCH 14/14] cleanup and more accuratly use metrics --- src/main.rs | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/src/main.rs b/src/main.rs index dc13d1e..36ecb59 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,6 +24,8 @@ const GET_IN_FLIGHT: &str = "gets_in_flight"; const SITES_CRAWLED: &str = "pages_crawled"; const BEING_PROCESSED: &str = "pages_being_processed"; +const BATCH_SIZE: usize = 2; + #[derive(Deserialize)] struct Config { surreal_ns: String, @@ -109,13 +111,7 @@ async fn main() { let span = trace_span!("Loop"); let span = span.enter(); while crawled < config.budget { - let get_num = if config.budget - crawled < 100 { - config.budget - crawled - } else { - 100 - }; - - let uncrawled = get_uncrawled_links(&db, get_num, config.crawl_filter.clone()).await; + let uncrawled = get_uncrawled_links(&db, config.budget - crawled, config.crawl_filter.clone()).await; if uncrawled.is_empty() { info!("Had more budget but finished crawling everything."); return; @@ -170,39 +166,44 @@ async fn process(mut site: Website, db: Surreal, reqwest: reqwest::Clien // Send the http request (get) if let Ok(response) = request_builder.send().await { - // METRICS - g.decrement(1); - counter!(GET_METRIC).increment(1); - + // TODO if this will fail if the object we are downloading is + // larger than the memory of the device it's running on. + // We should store it *as* we download it then parse it in-place. // Get body from response let data = response .text() .await .expect("Failed to read http response's body!"); - // Store document - filesystem::store(&data, &site.site).await; + // METRICS + g.decrement(1); + counter!(GET_METRIC).increment(1); - // Parse document and get relationships - let sites = parser::parse(&site, &data).await; + // Store document + let should_parse = filesystem::store(&data, &site.site).await; + + if should_parse { + // Parse document and get relationships + let sites = parser::parse(&site, &data).await; + // De-duplicate this list + let prev_len = sites.len(); + let set = sites.into_iter().fold(HashSet::new(), |mut set,item| { + set.insert(item); + set + }); + let de_dupe_sites: Vec = set.into_iter().collect(); + let diff = prev_len - de_dupe_sites.len(); + trace!("Saved {diff} from being entered into the db by de-duping"); + + // Store all the other sites so that we can link to them. 
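+                // store_all inserts these with crawled = false; rows already stored as crawled = true keep that state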
+ let _ = Website::store_all(de_dupe_sites, &db).await; + } + // update self in db site.set_crawled(); Website::store_all(vec![site], &db).await; - // De-duplicate this list - let prev_len = sites.len(); - let set = sites.into_iter().fold(HashSet::new(), |mut set,item| { - set.insert(item); - set - }); - let de_dupe_sites: Vec = set.into_iter().collect(); - let diff = prev_len - de_dupe_sites.len(); - trace!("Saved {diff} from being entered into the db by de-duping"); - - // Store all the other sites so that we can link to them. - let _ = Website::store_all(de_dupe_sites, &db).await; - } else { error!("Failed to get: {}", &site.site); } @@ -215,9 +216,11 @@ async fn get_uncrawled_links( mut count: usize, filter: String, ) -> Vec { - if count > 100 { - count = 100 + + if count > BATCH_SIZE { + count = BATCH_SIZE; } + debug!("Getting uncrawled links"); let mut response = db