Compare commits


No commits in common. "spider-rs" and "main" have entirely different histories.

12 changed files with 539 additions and 1763 deletions

.gitignore (vendored, 1 line changed)

@@ -1,6 +1,5 @@
/target
/.surrealdb
/.minio
perf.data
flamegraph.svg
perf.data.old

.vscode/launch.json (vendored, 2 lines changed)

@@ -9,7 +9,7 @@
"request": "launch",
"name": "Debug executable 'surreal_spider'",
"env": {
"RUST_LOG": "surreal_spider=trace,reqwest=info",
"RUST_LOG": "surreal_spider=debug,reqwest=info",
},
"cargo": {
"args": [

Cargo.lock (generated, 1786 lines changed)

File diff suppressed because it is too large.

Cargo.toml

@@ -5,13 +5,11 @@ edition = "2021"
[dependencies]
html5ever = "0.29.0"
# minio = "0.1.0"
minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"}
markup5ever_rcdom = "0.5.0-unofficial"
reqwest = "0.12.9"
serde = { version = "1.0", features = ["derive"] }
spider = { version = "2.21", features = ["sync"] }
surrealdb = "2.0"
tokio = { version="1.41", features = ["full"] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
serde = { version = "1.0.214", features = ["derive"] }
surrealdb = "2.0.4"
tokio = { version="1.41.0", features = ["full"] }
tracing = "0.1.40"
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
url = { version = "2.5.3", features = ["serde"] }

README.md

@@ -1,11 +1,23 @@
# Surreal Crawler
Crawls sites, saving all the found links to a SurrealDB database. It then proceeds to take batches of 100 uncrawled links until the crawl budget is reached. It saves the data of each site in a MinIO database.
Mapping with a budget of 1000 (crawl 1000 sites, so many more links are actually discovered) on [my website](https://oliveratkinson.net) on 8/26/2024 took 1m9s.
This includes the crawl, loading into the database, and linking sites. (Locally hosted SurrealDB instance.)
### TODO
This run created 4299 site links with 23286 links between the sites. (It found my git site, which really bolsters those numbers.)
## Install / Build
* You will need Rust to compile the crawler: [rustup.rs](https://rustup.rs)
* You need python3 (it comes installed on most Linux distros) and poetry for dependency management.
* Install `pipx` and `python3`
* Then: `pipx install poetry`
* Then: `poetry install` to install the project dependencies
* You need to install [SurrealDB](https://surrealdb.com)
## Use
Just run `./crawl.sh {url}` and it will start crawling. You can tweak the budget inside [crawl.sh](https://git.oliveratkinson.net/Oliver/internet_mapper/src/branch/main/crawl.sh) if you want.
You can also prefix the command with `time` to benchmark the system, such as: `time ./crawl.sh https://discord.com`.
- [ ] Domain filtering - prevent the crawler from going on alternate versions of wikipedia.
- [ ] Conditionally save content - based on filename or file contents
- [ ] GUI / TUI ?
- [ ] Better asynchronous getting of the sites. Currently it all happens serially.


@@ -1,5 +1,5 @@
services:
surreal:
db:
image: surrealdb/surrealdb:latest-dev
ports:
- 8000:8000
@@ -14,18 +14,3 @@ services:
- --pass
- root
- rocksdb:/mydata/database.db
minio:
image: quay.io/minio/minio
ports:
- 9000:9000
- 9001:9001
environment:
- MINIO_ROOT_USER=root
- MINIO_ROOT_PASSWORD=an8charpassword
volumes:
- ./.minio/:/data
command:
- server
- /data
- --console-address
- ":9001"

schema.surql (new file, 2 lines added)

@@ -0,0 +1,2 @@
DEFINE TABLE website SCHEMALESS;
DEFINE FIELD accessed_at ON TABLE website VALUE time::now();
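
The VALUE time::now() clause means accessed_at is filled in by the database whenever a website record is written, so callers never set it themselves. A minimal sketch of writing such a record through the surrealdb 2.x Rust SDK (NewPage below is a hypothetical stand-in for the Website struct in db.rs, not a type from the repository):

use serde::{Deserialize, Serialize};
use surrealdb::{engine::remote::ws::Client, Surreal};

// Hypothetical stand-in for the Website struct; accessed_at is deliberately
// absent because the schema's VALUE time::now() clause supplies it server-side.
#[derive(Serialize, Deserialize)]
struct NewPage {
    site: String,
    crawled: bool,
}

async fn store_example(db: &Surreal<Client>) -> surrealdb::Result<()> {
    // create() sends the record to the `website` table and returns the stored
    // copy, which now carries the database-generated accessed_at timestamp.
    let _created: Option<NewPage> = db
        .create("website")
        .content(NewPage {
            site: "https://example.com/".into(),
            crawled: false,
        })
        .await?;
    Ok(())
}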

src/db.rs

@@ -8,12 +8,12 @@ use surrealdb::{
use tracing::{error, instrument, trace, warn};
use url::Url;
use crate::{Config, Timer};
use crate::Timer;
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Website {
/// The url that this data is found at
pub site: Url,
site: Url,
/// Whether or not this link has been crawled yet
pub crawled: bool,
#[serde(skip_serializing)]
@@ -22,20 +22,35 @@ pub struct Website {
impl Website {
/// Creates a blank site (assumes that url param is site's root)
pub fn new(url: Url, crawled: bool) -> Self {
pub fn new(url: &str, crawled: bool) -> Self {
let site = match Url::parse(url) {
Ok(a) => a,
Err(_) => todo!(),
};
Self {
id: None,
crawled,
site: url,
site,
}
}
pub fn set_crawled(&mut self) {
trace!("Set crawled to true");
self.crawled = true
}
pub fn mut_url(&mut self) -> &mut Url {
&mut self.site
}
#[instrument(skip_all)]
pub async fn links_to(&self, other: Vec<Thing>, db: &Surreal<Client>) {
let len = other.len();
if len == 0 {return}
let from = self.site.to_string();
// let to = other.site.to_string();
trace!("Linking {from} to {} other pages.", other.len());
let msg = format!("Linked {len} pages");
let timer = Timer::start(&msg);
// prevent the timer from being dropped instantly.
@@ -54,6 +69,7 @@ impl Website {
let _: Vec<usize> = vec;
if let Some(num) = vec.get(0) {
if *num == len {
trace!("Link OK");
return;
} else {
warn!("Didn't link all the records. {num}/{len}");
@@ -133,30 +149,19 @@ pub struct Record {
pub id: Thing,
}
#[instrument(skip_all, name = "SurrealDB")]
pub async fn connect(config: &Config<'_>) -> surrealdb::Result<Surreal<Client>> {
trace!("Establishing connection to surreal...");
pub async fn connect() -> surrealdb::Result<Surreal<Client>> {
// Connect to the server
let db = Surreal::new::<Ws>(config.surreal_url).await?;
let db = Surreal::new::<Ws>("127.0.0.1:8000").await?;
trace!("Logging in...");
// Signin as a namespace, database, or root user
db.signin(Root {
username: config.surreal_username,
password: config.surreal_password,
username: "root",
password: "root",
})
.await?;
// Select a specific namespace / database
db
.use_ns(config.surreal_ns)
.use_db(config.surreal_db)
.await?;
let setup = include_bytes!("setup.surql");
let file = setup.iter().map(|c| *c as char).collect::<String>();
db.query(file).await.expect("Failed to setup surreal tables.");
db.use_ns("test").use_db("v1.2").await?;
Ok(db)
}

src/main.rs

@@ -1,116 +1,189 @@
extern crate html5ever;
use std::time::Instant;
extern crate markup5ever_rcdom as rcdom;
use db::{connect, Website};
use parser::parse;
use s3::S3;
use surrealdb::{engine::remote::ws::Client, Surreal};
use tokio::sync::broadcast::Receiver;
use tracing::{debug, info, trace, trace_span};
use html5ever::{
local_name, parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts,
};
use rcdom::RcDom;
use std::time::Instant;
use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};
use tracing::{debug, info, instrument, trace, trace_span};
use tracing_subscriber::EnvFilter;
use url::Url;
mod db;
mod parser;
mod s3;
struct Config<'a> {
surreal_ns: &'a str,
surreal_db: &'a str,
surreal_url: &'a str,
surreal_username: &'a str,
surreal_password: &'a str,
s3_url: &'a str,
s3_bucket: &'a str,
s3_access_key: &'a str,
s3_secret_key: &'a str,
}
#[tokio::main]
async fn main() {
let total_runtime = Timer::start("Completed");
tracing_subscriber::fmt()
.with_env_filter(EnvFilter::from_default_env())
.with_line_number(true)
// .without_time()
.without_time()
.init();
debug!("Starting...");
let config = Config {
surreal_url: "localhost:8000",
surreal_username: "root",
surreal_password: "root",
surreal_ns: "test",
surreal_db: "spider",
s3_bucket: "spider",
s3_url: "http://localhost:9000",
s3_access_key: "0zv7GbLQsw4ZI8TclMps",
s3_secret_key: "5dB7QkGFw7fYbUJ5LpHk2GbWR7Bl710HlRz4NbzB",
};
// Would probably take these in as parameters from a cli
let starting_url = "https://oliveratkinson.net/";
let url = "https://oliveratkinson.net/";
// let url = "http://localhost:5500";
let budget = 1000;
let mut crawled = 0;
let s3 = S3::connect(&config)
.await
.expect("Failed to connect to minio, aborting.");
let db = connect(&config)
.await
.expect("Failed to connect to surreal, aborting.");
let db = connect().await.expect("Failed to connect to db, aborting.");
let mut site = spider::website::Website::new(&starting_url)
.with_limit(5)
.with_depth(0)
let client = reqwest::Client::builder()
// .use_rustls_tls()
.build()
.unwrap();
let mut rx: Receiver<spider::page::Page> = site.subscribe(0).unwrap();
// Kick off the whole machine - This Website object doesn't matter, it's just to allow for
// get() to work.
let span = trace_span!("Pre-Loop");
let pre_loop_span = span.enter();
// Download the site
let mut site = Website::new(&url, false);
get(&mut site, &db, &client, &mut crawled).await;
let subscriber = tokio::spawn(async move {
let span = trace_span!("Sub");
drop(pre_loop_span);
let span = trace_span!("Loop");
let span = span.enter();
while let Ok(res) = rx.recv().await {
// Get body
let data = res.get_html();
let url = Url::parse(res.get_url()).unwrap();
while crawled < budget {
let get_num = if budget - crawled < 100 {
budget - crawled
} else {
100
};
trace!("Got '{}'", url.to_string());
// Store document
s3.store(&data, &url).await;
// Parse document and store relationships
let mut page = Website::new(url, true);
page.store(&db).await;
// Relate this page to all the pages it links to
let span = trace_span!("Linking");
let span = span.enter();
let found_links = parse( &page, data).await;
let mut stored_links = Vec::new();
for mut link in found_links {
if let Some(id) = link.store(&db).await {
stored_links.push(id);
let uncrawled = get_uncrawled_links(&db, get_num).await;
if uncrawled.len() == 0 {
info!("Had more budget but finished crawling everything.");
return;
}
debug!("Crawling {} pages...", uncrawled.len());
let span = trace_span!("Crawling");
let _ = span.enter();
for mut site in uncrawled {
get(&mut site, &db, &client, &mut crawled).await;
let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32);
info!("Crawled {crawled} out of {budget} pages. ({percent})");
}
page.links_to(stored_links, &db).await;
drop(span);
}
drop(span);
});
let timer = Timer::start("Crawled");
site.crawl().await;
site.unsubscribe();
drop(timer);
subscriber.await.unwrap();
info!("Done");
drop(total_runtime);
}
#[instrument(skip_all)]
/// A quick helper function for downloading a url
async fn get(
site: &mut Website,
db: &Surreal<Client>,
request_client: &reqwest::Client,
count: &mut usize,
) {
trace!("Get: {}", site.to_string());
let timer = Timer::start("Got page");
if let Ok(response) = request_client.get(site.to_string()).send().await {
timer.stop();
// Get body
let data = response.text().await.unwrap();
let opts = ParseOpts {
tree_builder: TreeBuilderOpts {
drop_doctype: true,
..Default::default()
},
..Default::default()
};
// Get DOM
let dom = parse_document(RcDom::default(), opts)
.from_utf8()
.read_from(&mut data.as_bytes())
.unwrap();
// TODO save the dom to minio if a flag is set
// Modify record in database
site.set_crawled();
site.store(db).await;
trace!("Got: {}", site.to_string());
// Walk all the children nodes, searching for links to other pages.
let mut buffer = Vec::new();
let timer = Timer::start("Walked");
walk(&dom.document, &db, &site, &mut buffer).await;
timer.stop();
// Put all the found links into the database.
site.links_to(buffer, &db).await;
*count += 1;
} else {
trace!("Failed to get: {}", site.to_string());
}
}
/// Walks the given site, placing its findings in the database
async fn walk(
node: &rcdom::Handle,
db: &Surreal<Client>,
site: &Website,
links_to: &mut Vec<Thing>,
) {
let span = trace_span!("Walk");
let span = span.enter();
// Match each node - node basically means element.
match &node.data {
rcdom::NodeData::Element { name, attrs, .. } => {
for attr in attrs.borrow().clone() {
match name.local {
local_name!("a")
| local_name!("audio")
| local_name!("area")
| local_name!("img")
| local_name!("link")
| local_name!("object")
| local_name!("source")
| local_name!("base")
| local_name!("video") => {
let attribute_name = attr.name.local.to_string();
if attribute_name == "src"
|| attribute_name == "href"
|| attribute_name == "data"
{
// Get clone of the current site object
let mut web = site.clone();
// Set url
let url = web.mut_url();
url.set_fragment(None); // removes #xyz
let joined = url.join(&attr.value).unwrap();
*url = joined;
// Set other attributes
web.crawled = false;
// TODO set element name
// let element_name = name.local.to_string();
if let Some(id) = web.store(db).await {
links_to.push(id);
}
}
}
local_name!("button") | local_name!("meta") | local_name!("iframe") => {
// dbg!(attrs);
}
_ => {}
};
}
}
_ => {}
};
drop(span);
for child in node.children.borrow().iter() {
Box::pin(walk(child, db, site, links_to)).await;
}
}
/// Returns uncrawled links
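
The compare view truncates the file here, just before the body of get_uncrawled_links. Going only by the call site above (get_uncrawled_links(&db, get_num).await, yielding a collection of Website values to iterate over), a minimal sketch of such a helper might look like the following; it reuses the imports already at the top of main.rs, and the query text is an assumption, not the code from the repository:

// Hypothetical sketch of the truncated helper. Only the name and call shape
// are taken from the loop above; the SurrealQL query itself is assumed.
async fn get_uncrawled_links(db: &Surreal<Client>, count: usize) -> Vec<Website> {
    // `count` is a plain usize computed from the budget, so interpolating it
    // into the query string is safe here.
    let query = format!("SELECT * FROM website WHERE crawled = false LIMIT {count}");
    let mut response = db
        .query(query)
        .await
        .expect("Failed to query uncrawled links");
    // Deserialize the first (and only) statement's results into Website records.
    response.take(0).expect("Failed to parse uncrawled links")
}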

src/parser.rs

@@ -1,92 +0,0 @@
use std::default::Default;
use std::str::FromStr;
use html5ever::tokenizer::{BufferQueue, TokenizerResult};
use html5ever::tokenizer::{StartTag, TagToken};
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use html5ever::{local_name, tendril::*};
use tracing::{instrument, trace};
use crate::db::Website;
#[derive(Clone)]
struct LinkParser<'a> {
site: &'a Website,
}
impl TokenSink for LinkParser<'_> {
type Handle = Vec<Website>;
fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
match token {
TagToken(tag) => {
if tag.kind == StartTag {
match tag.name {
local_name!("a")
| local_name!("audio")
| local_name!("area")
| local_name!("img")
| local_name!("link")
| local_name!("object")
| local_name!("source")
| local_name!("base")
| local_name!("video") => {
let mut links = Vec::new();
for attr in &tag.attrs {
let attr_name = attr.name.local.to_string();
if attr_name == "src" || attr_name == "href" || attr_name == "data"
{
// Get clone of the current site object
let mut web = self.site.clone();
// Set url
let mut url = web.site;
url.set_fragment(None); // removes #xyz
let joined = url.join(&attr.value).unwrap();
web.site = joined;
web.crawled = false;
links.push(web);
}
}
return TokenSinkResult::Script(links);
}
local_name!("button") | local_name!("meta") | local_name!("iframe") => {
// dbg!(attrs);
}
_ => {}
}
}
}
_ => {}
}
TokenSinkResult::Continue
}
}
#[instrument(skip_all)]
pub async fn parse(site: &Website, data: String) -> Vec<Website> {
let sink = LinkParser { site };
let chunk = Tendril::from_str(&data).unwrap();
let mut input = BufferQueue::default();
input.push_back(chunk.try_reinterpret::<fmt::UTF8>().unwrap());
let token = Tokenizer::new(sink.clone(), TokenizerOpts::default());
// let mut links_to = Vec::new();
let mut res = Vec::new();
while !input.is_empty() {
if let TokenizerResult::Script(mut s) = token.feed(&mut input) {
res.append(&mut s);
}
}
trace!("Found {} links.", res.len());
assert!(input.is_empty());
token.end();
res
}

src/s3.rs

@@ -1,76 +0,0 @@
use minio::s3::{
args::{BucketExistsArgs, MakeBucketArgs}, client::ClientBuilder, creds::StaticProvider, error::Error, http::BaseUrl, types::S3Api, Client
};
use tracing::{instrument, trace};
use url::Url;
use crate::Config;
pub struct S3 {
bucket_name: String,
client: Client,
}
impl S3 {
#[instrument(skip_all, name = "S3")]
pub async fn connect(config: &Config<'_>) -> Result<Self, Error> {
let base_url = config.s3_url.parse::<BaseUrl>().unwrap();
let static_provider =
StaticProvider::new(&config.s3_access_key, &config.s3_secret_key, None);
let client = ClientBuilder::new(base_url)
.provider(Some(Box::new(static_provider)))
.build()?;
trace!("Checking bucket...");
let exists = client
.bucket_exists(&BucketExistsArgs::new(&config.s3_bucket).unwrap())
.await?;
if !exists {
trace!("Creating bucket...");
client
.make_bucket(&MakeBucketArgs::new(&config.s3_bucket).unwrap())
.await?;
}
trace!("Connection successfull");
Ok(Self {
bucket_name: config.s3_bucket.to_owned(),
client: client,
})
}
pub async fn store(&self, data: &str, name: &Url) {
if let Some(domain) = name.domain() {
let filename = domain.to_string() + name.path();
let _ = &self
.client
.put_object_content(&self.bucket_name, &filename, data.to_owned())
.send()
.await
.unwrap();
}
}
pub async fn _get(&self, name: &Url) -> Option<String> {
if let Some(domain) = name.domain() {
let filename = domain.to_string() + name.path();
let data = self
.client
.get_object(&self.bucket_name, &filename)
.send()
.await
.unwrap();
if let Ok(segments) = data.content.to_segmented_bytes().await {
return Some(segments.to_bytes().iter().map(|c| *c as char).collect::<String>())
}
}
None
}
}

src/setup.surql

@@ -1,2 +0,0 @@
DEFINE TABLE IF NOT EXISTS website SCHEMALESS;
DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now();