Compare commits
No commits in common. "1b9964a938fcb0814bf2789185ee38d240a74844" and "71b7b2d7bc06c0d7a7bcabeb84bc18c0edf2af57" have entirely different histories.
1b9964a938 ... 71b7b2d7bc
Cargo.lock (38 changes, generated)
@@ -1961,25 +1961,6 @@ dependencies = [
  "generic-array",
 ]

-[[package]]
-name = "internet_mapper"
-version = "0.1.0"
-dependencies = [
- "base64 0.22.1",
- "html5ever 0.29.1",
- "metrics",
- "metrics-exporter-prometheus",
- "minio",
- "reqwest",
- "serde",
- "surrealdb",
- "tokio",
- "toml",
- "tracing",
- "tracing-subscriber",
- "url",
-]
-
 [[package]]
 name = "ipnet"
 version = "2.11.0"
@ -4131,6 +4112,25 @@ version = "2.6.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
|
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "surreal_spider"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"base64 0.22.1",
|
||||||
|
"html5ever 0.29.1",
|
||||||
|
"metrics",
|
||||||
|
"metrics-exporter-prometheus",
|
||||||
|
"minio",
|
||||||
|
"reqwest",
|
||||||
|
"serde",
|
||||||
|
"surrealdb",
|
||||||
|
"tokio",
|
||||||
|
"toml",
|
||||||
|
"tracing",
|
||||||
|
"tracing-subscriber",
|
||||||
|
"url",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "surrealdb"
|
name = "surrealdb"
|
||||||
version = "2.2.1"
|
version = "2.2.1"
|
||||||
|
Cargo.toml

@@ -1,5 +1,5 @@
 [package]
-name = "internet_mapper"
+name = "surreal_spider"
 version = "0.1.0"
 edition = "2021"

@@ -3,13 +3,13 @@ surreal_url = "localhost:8000"
 surreal_username = "root"
 surreal_password = "root"
 surreal_ns = "test"
-surreal_db = "v1.16"
+surreal_db = "v1.15.4"

 # Minio config
-s3_bucket = "v1.16"
+s3_bucket = "v1.15.4"
 s3_url = "http://localhost:9000"
-s3_access_key = "DwJfDDVIbmCmfAblwSqp"
-s3_secret_key = "V4UqvC1Vm4AwLE5FAhu2gxlMfvexTBQnDxuy8uZx"
+s3_access_key = "3ptjsHhRHCHlpCmgFy9n"
+s3_secret_key = "68CmV07YExeCxb8kJhosSauEizj5CAE7PINZIfQz"

 # Crawler config
 crawl_filter = "en.wikipedia.com"
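For orientation: these keys feed the crate's Config value (the src/db.rs diff below shows connect(config: &Config)). A hypothetical serde mirror of the keys above, assuming String fields and the toml crate already present in the dependency list; the real struct definition is not part of this diff:

    use serde::Deserialize;

    // Hypothetical mirror of the config keys above; the crate's real
    // Config is defined elsewhere and may differ.
    #[derive(Deserialize)]
    struct Config {
        surreal_url: String,
        surreal_username: String,
        surreal_password: String,
        surreal_ns: String,
        surreal_db: String,
        s3_bucket: String,
        s3_url: String,
        s3_access_key: String,
        s3_secret_key: String,
        crawl_filter: String,
    }

    fn load_config(path: &str) -> Config {
        let text = std::fs::read_to_string(path).expect("config file should exist");
        toml::from_str(&text).expect("config file should be valid TOML")
    }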
@@ -7,9 +7,6 @@ Crawls sites saving all the found links to a surrealdb database. It then proceed

 - [ ] Domain filtering - prevent the crawler from going on alternate versions of wikipedia.
 - [ ] Conditionally save content - based on filename or file contents
-- [x] GUI / TUI ? - Graphana
-- [x] Better asynchronous getting of the sites. Currently it all happens serially.
-- [ ] Allow for storing asynchronously
-
-3/19/25: Took 20min to crawl 100 pages
+- [ ] GUI / TUI ?
+- [ ] Better asynchronous getting of the sites. Currently it all happens serially.3/19/25: Took 20min to crawl 100 pages
 This ment we stored 100 pages, 142,997 urls, and 1,425,798 links between the two.
src/db.rs (93 changes)
@@ -5,7 +5,7 @@ use base64::{
 };
 use metrics::counter;
 use serde::{ser::SerializeStruct, Deserialize, Serialize};
-use std::{collections::HashSet, fmt::Debug, sync::LazyLock, time::Instant};
+use std::{fmt::Debug, sync::LazyLock, time::Instant};
 use surrealdb::{
     engine::remote::ws::{Client, Ws},
     opt::auth::Root,
@@ -29,7 +29,7 @@ const TIME_SPENT_ON_LOCK: &'static str = "surql_lock_waiting_ms";
 const STORE: &'static str = "surql_store_calls";
 const LINK: &'static str = "surql_link_calls";

-#[derive(Deserialize, Clone, Hash, Eq, PartialEq)]
+#[derive(Deserialize, Clone)]
 pub struct Website {
     /// The url that this data is found at
     pub site: Url,
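Context for this derive change: the left side needs Hash, Eq, and PartialEq so Website values can be collected into a HashSet for de-duplication (see the src/main.rs hunk further down); the right side drops them and works with plain Vecs. A minimal sketch of that dependency, assuming the url crate's Url, which itself implements Hash and Eq:

    use std::collections::HashSet;
    use url::Url;

    // Reduced stand-in for the diff's Website; Hash/Eq/PartialEq are
    // exactly what make the HashSet collect below compile.
    #[derive(Clone, Hash, Eq, PartialEq)]
    struct Website {
        site: Url,
        crawled: bool,
    }

    fn dedup(sites: Vec<Website>) -> HashSet<Website> {
        // Duplicate sites collapse because HashSet keys on Hash + Eq.
        sites.into_iter().collect()
    }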
@@ -40,13 +40,12 @@ pub struct Website {
 impl Serialize for Website {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
     where
-        S: serde::Serializer,
-    {
+        S: serde::Serializer {
         let mut state = serializer.serialize_struct("Website", 2)?;
         state.serialize_field("crawled", &self.crawled)?;
         // to_string() calls the correct naming of site
         state.serialize_field("site", &self.site.to_string())?;
         state.end()
     }
 }

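Only brace placement changes in this hunk; the hand-rolled Serialize is the same on both sides. Its job is to emit exactly two fields and flatten site to a string via Url::to_string(). A self-contained sketch of the same impl, with serde_json (not a dependency of this crate) used purely to make the output visible:

    use serde::ser::{Serialize, SerializeStruct, Serializer};
    use url::Url;

    struct Website {
        site: Url,
        crawled: bool,
    }

    // Same shape as the impl in the diff: two fields, Url written as a string.
    impl Serialize for Website {
        fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
            let mut state = serializer.serialize_struct("Website", 2)?;
            state.serialize_field("crawled", &self.crawled)?;
            state.serialize_field("site", &self.site.to_string())?;
            state.end()
        }
    }

    fn main() {
        let w = Website { site: Url::parse("https://example.com/").unwrap(), crawled: false };
        // Prints {"crawled":false,"site":"https://example.com/"}
        println!("{}", serde_json::to_string(&w).unwrap());
    }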
@@ -64,7 +63,10 @@ impl Website {
             Ok(a) => a,
             Err(_) => todo!(),
         };
-        Self { crawled, site }
+        Self {
+            crawled,
+            site
+        }
     }

     pub fn set_crawled(&mut self) {
@@ -76,10 +78,7 @@ impl Website {
         let domain = match site.domain() {
             Some(s) => s.to_string(),
             None => {
-                warn!(
-                    "Failed to get domain of URL: {}, falling back to 'localhost'",
-                    site.to_string()
-                );
+                warn!("Failed to get domain of URL: {}, falling back to 'localhost'", site.to_string());
                 "localhost".to_string()
             }
         };
@@ -87,7 +86,6 @@ impl Website {
-
         domain + path
     }

     pub fn get_url_as_b64_path(site: &Url) -> String {
         let domain = site.domain().unwrap_or("DOMAIN").to_string();
         let path = &CUSTOM_ENGINE.encode(site.path());
@@ -101,7 +99,7 @@ impl Website {
         if len == 0 {
             return;
         }

         let from = &self.site;

         // let to = other.site.to_string();
@@ -128,10 +126,7 @@ impl Website {
                     trace!("Link for {from} OK - {num}/{len}");
                     return;
                 } else {
-                    error!(
-                        "Didn't link all the records. {num}/{len}. Surreal response: {:?}",
-                        e
-                    );
+                    error!("Didn't link all the records. {num}/{len}. Surreal response: {:?}", e);
                     return;
                 }
             }
@@ -144,61 +139,48 @@ impl Website {
         }
     }

-    pub async fn store_self(&self, db: &Surreal<Client>) {
-        counter!(STORE).increment(1);
-
-        db.query(
-            "INSERT INTO website $self
-                ON DUPLICATE KEY UPDATE
-                    crawled = crawled OR $input.crawled
-                RETURN VALUE id;
-            ",
-        )
-        .await
-        .expect("Failed to store self");
-    }
-
     // Insert ever item in the vec into surreal, crawled state will be preserved as TRUE
     // if already in the database as such or incoming data is TRUE.
-    pub async fn store_all(all: HashSet<Self>, db: &Surreal<Client>) -> Vec<Thing> {
-        // NOTES:
-        // * all incoming Websites come in as !crawled
-        // * there are potentially duplicates in all
-
+    pub async fn store_all(all: Vec<Self>, db: &Surreal<Client>) -> Vec<Thing> {
         counter!(STORE).increment(1);
+        let mut things = Vec::with_capacity(all.len());

+        // TODO this only allows for one thread to be in the database at a time.
+        // This is currently required since otherwise we get write errors.
+        // If the default `crawled` is set to false, we might not need to write any more
+        // than just the name. `accessed_at` is fun but not needed.
         let now = Instant::now();
         let lock = LOCK.lock().await;
         counter!(TIME_SPENT_ON_LOCK).increment(now.elapsed().as_millis() as u64);

-        let mut results = Vec::with_capacity(all.len());
         match db
             .query(
-                // TODO making this an upsert would make sense, but
-                // upserts seem to be broken.
-                //
-                // Doesn't look like upsert can take in an array, so insert
-                // it is...
-                //
                 "INSERT INTO website $array
                     ON DUPLICATE KEY UPDATE
-                        last_write = time::now()
-                    RETURN VALUE id;"
-            ,
+                        accessed_at = time::now(),
+                        crawled = crawled OR $input.crawled
+                    RETURN VALUE id;
+                ",
             )
             .bind(("array", all))
             .await
         {
             Ok(mut id) => match id.take::<Vec<Thing>>(0) {
-                Ok(mut x) => results.append(&mut x),
-                Err(err) => {
-                    error!("{:?}", err);
-                }
+                Ok(mut x) => things.append(&mut x),
+                Err(err) => match err {
+                    Api(error) => {
+                        eprintln!("{:?}", error);
+                        error!("{:?}", error);
+                    }
+                    _ => error!("{:?}", err),
+                },
             },
-            Err(err) => error!("{:?}", err),
+            Err(err) => {
+                error!("{:?}", err);
+            }
         }
         drop(lock);
-        results
+        things
     }
 }

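The net effect of this hunk: store_self is gone, store_all takes a Vec instead of a HashSet, and the upsert now refreshes accessed_at and ORs crawled so a record never flips back to uncrawled. A trimmed sketch of the resulting call shape against the surrealdb 2.x client, using the crate's Website type; the helper name is hypothetical, the query text comes from the right side of the diff:

    use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};

    // Sketch only: error handling reduced to expect(), and the global
    // write lock from the diff is omitted.
    async fn upsert_all(all: Vec<Website>, db: &Surreal<Client>) -> Vec<Thing> {
        let mut response = db
            .query(
                "INSERT INTO website $array
                    ON DUPLICATE KEY UPDATE
                        accessed_at = time::now(),
                        crawled = crawled OR $input.crawled
                    RETURN VALUE id;",
            )
            .bind(("array", all))
            .await
            .expect("INSERT should succeed");
        // Statement 0 is the INSERT; RETURN VALUE id makes it yield record ids.
        response.take::<Vec<Thing>>(0).expect("ids should deserialize")
    }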
@@ -242,4 +224,3 @@ pub async fn connect(config: &Config) -> surrealdb::Result<Surreal<Client>> {

     Ok(db)
 }
-
src/main.rs (20 changes)
@@ -3,7 +3,10 @@
 extern crate html5ever;

 use std::{
-    collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr}, time::Instant
+    fs::File,
+    io::Read,
+    net::{IpAddr, Ipv4Addr},
+    time::Instant,
 };

 use db::{connect, Website};
@@ -172,7 +175,6 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
     // METRICS
     let g = gauge!(GET_IN_FLIGHT);
     g.increment(1);
-    counter!(GET_METRIC).increment(1);
     let timer = Timer::start("Got page");

     // Send the http request (get)
@@ -181,6 +183,7 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
     // METRICS
     timer.stop();
     g.decrement(1);
+    counter!(GET_METRIC).increment(1);

     // Get body from response
     let data = response
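Worth noting about these two hunks: counter!(GET_METRIC) moves from before the request to after it, so the counter now counts finished GETs rather than attempts, while the gauge still brackets the in-flight window. The pattern with the metrics crate macros, as a small sketch with hypothetical metric names:

    use metrics::{counter, gauge};

    const GET_IN_FLIGHT: &str = "gets_in_flight";    // hypothetical names; the real
    const GET_METRIC: &str = "total_gets_completed"; // constants live in main.rs

    async fn tracked_get(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
        let g = gauge!(GET_IN_FLIGHT);
        g.increment(1); // request enters flight
        let response = client.get(url).send().await;
        g.decrement(1); // request leaves flight
        counter!(GET_METRIC).increment(1); // counted only once the GET has finished
        response?.text().await
    }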
@@ -195,18 +198,11 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien

     // update self in db
     site.set_crawled();
-    site.store_self(&db).await;
+    Website::store_all(vec![site.clone()], &db).await;

-    // de duplicate this list
-    let set = sites.iter().fold(HashSet::new(), |mut set, item| {
-        // TODO seems kinda dumb to clone everything.
-        set.insert(item.clone());
-        set
-    });
-    trace!("Shrunk items to store from {} to {}", sites.len(), set.len());
-
     // Store all the other sites so that we can link to them.
-    let others = Website::store_all(set, &db).await;
+    // let mut links_to = Vec::new();
+    let others = Website::store_all(sites, &db).await;

     // Make the database's links reflect the html links between sites
     site.links_to(others, &db).await;
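The deleted middle of this hunk is the HashSet de-duplication that justified the Hash/Eq derives in db.rs. The fold it used is equivalent to a plain collect; a runnable comparison, with String standing in for Website:

    use std::collections::HashSet;

    fn main() {
        let sites = vec!["a".to_string(), "a".to_string(), "b".to_string()];

        // The fold from the deleted lines...
        let set = sites.iter().fold(HashSet::new(), |mut set, item| {
            set.insert(item.clone());
            set
        });
        // ...is the same as collecting, which avoids the explicit closure.
        let collected: HashSet<String> = sites.iter().cloned().collect();

        assert_eq!(set, collected);
        println!("Shrunk items to store from {} to {}", sites.len(), set.len());
    }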
@@ -5,4 +5,5 @@ DEFINE INDEX IF NOT EXISTS idx ON TABLE website COLUMNS site UNIQUE;

 DEFINE FIELD IF NOT EXISTS crawled ON TABLE website TYPE bool;

-DEFINE FIELD IF NOT EXISTS created ON TABLE website VALUE time::now();
+DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now();
+DEFINE FIELD IF NOT EXISTS first_accessed_at ON TABLE website VALUE time::now();
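Schema note: the right side replaces created with accessed_at and adds first_accessed_at. Both use VALUE time::now(), which SurrealDB evaluates on each write, so as defined here the two fields track the same timestamp; the name suggests first_accessed_at was meant to be set once, which would need extra handling not shown in this diff. A sketch of applying these definitions through the Rust client instead of a .surql file; the helper is hypothetical:

    use surrealdb::{engine::remote::ws::Client, Surreal};

    // Hypothetical helper; the project presumably loads the .surql file instead.
    async fn define_schema(db: &Surreal<Client>) -> surrealdb::Result<()> {
        db.query(
            "DEFINE FIELD IF NOT EXISTS crawled ON TABLE website TYPE bool;
             DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now();
             DEFINE FIELD IF NOT EXISTS first_accessed_at ON TABLE website VALUE time::now();",
        )
        .await?
        .check()?;
        Ok(())
    }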