changes

helper code
It isnt quite working yet
2025-08-04 23:02:05 +00:00 · 2025-07-09 15:58:22 -06:00 · 2025-04-17 09:59:23 -06:00 · 2025-04-17 15:39:49 +00:00 · 2025-04-17 09:36:27 -06:00 · 2025-04-17 09:36:13 -06:00
22 changed files with 2981 additions and 891 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,8 @@
 /target
 /.surrealdb
+/.minio
 perf.data
 flamegraph.svg
-perf.data.old
+perf.data.old
+/docker/logs/*
+/downloaded
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -7,18 +7,15 @@
        {
            "type": "lldb",
            "request": "launch",
-            "name": "Debug executable 'surreal_spider'",
-            "env": {
-                "RUST_LOG": "surreal_spider=debug,reqwest=info",
-            },
+            "name": "Debug executable 'internet_mapper'",
            "cargo": {
                "args": [
                    "build",
-                    "--bin=surreal_spider",
-                    "--package=surreal_spider"
+                    "--bin=internet_mapper",
+                    "--package=internet_mapper"
                ],
                "filter": {
-                    "name": "surreal_spider",
+                    "name": "internet_mapper",
                    "kind": "bin"
                }
            },
@@ -28,16 +25,16 @@
        {
            "type": "lldb",
            "request": "launch",
-            "name": "Debug unit tests in executable 'surreal_spider'",
+            "name": "Debug unit tests in executable 'internet_mapper'",
            "cargo": {
                "args": [
                    "test",
                    "--no-run",
-                    "--bin=surreal_spider",
-                    "--package=surreal_spider"
+                    "--bin=internet_mapper",
+                    "--package=internet_mapper"
                ],
                "filter": {
-                    "name": "surreal_spider",
+                    "name": "internet_mapper",
                    "kind": "bin"
                }
            },
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -0,0 +1,8 @@
+{
+    "cSpell.words": [
+        "creds",
+        "reqwest",
+        "rustls",
+        "surql",
+    ]
+}
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,15 +1,21 @@
 [package]
-name = "surreal_spider"
+name = "internet_mapper"
 version = "0.1.0"
 edition = "2021"

 [dependencies]
-html5ever = "0.29.0"
-markup5ever_rcdom = "0.5.0-unofficial"
-reqwest = "0.12.9"
-serde = { version = "1.0.214", features = ["derive"] }
-surrealdb = "2.0.4"
+base64 = "0.22.1"
+futures-util = "0.3.31"
+html5ever = "0.29"
+metrics = "0.24.1"
+metrics-exporter-prometheus = { version = "0.16.2", features=["http-listener"]}
+# minio = "0.1.0"
+minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"}
+reqwest = { version = "0.12", features = ["gzip", "default", "rustls-tls", "stream"] }
+serde = { version = "1.0", features = ["derive"] }
+surrealdb = "2.2"
 tokio = { version="1.41.0", features = ["full"] }
-tracing = "0.1.40"
-tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
-url = { version = "2.5.3", features = ["serde"] }
+toml = "0.8.20"
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter", "local-time", "json"] }
+url = { version = "2.5", features = ["serde"] }
--- a/Crawler.toml
+++ b/Crawler.toml
@@ -0,0 +1,12 @@
+# Surreal config
+surreal_url = "localhost:8000"
+surreal_username = "root"
+surreal_password = "root"
+surreal_ns = "test"
+surreal_db = "v1.21.1"
+
+# Crawler config
+crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI" 
+start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI"
+budget = 10000
+batch_size = 50
--- a/README.md
+++ b/README.md
@@ -1,23 +1,55 @@
 # Surreal Crawler

-Mapping with a budget of 1000 (crawl 1000 sites, so many more links are actually discovered), on [my webiste](https://oliveratkinson.net) on 8/26/2024 took 1m9s.
+Crawls sites saving all the found links to a surrealdb database. It then proceeds to take batches of 100 uncrawled links untill the crawl budget is reached. It saves the data of each site in a minio database.

-This is including the crawl and loading into the database and linking sites. (Locally hosted surreal db instance)
+## How to use

-This run created 4299 site links with 23286 links between the sites. (It found my this git site which really bolsters those numbers.)
+1. Clone the repo and `cd` into it.
+2. Build the repo with `cargo build -r`
+3. Start the docker conatiners
+	1. cd into the docker folder `cd docker`
+	2. Bring up the docker containers `docker compose up -d`
+4. From the project's root, edit the `Crawler.toml` file to your liking.
+5. Run with `./target/release/internet_mapper`

-## Install / Build
+You can view stats of the project at `http://<your-ip>:3000/dashboards`

-* You will need rust to compile the crawler [rustup.rs](https://rustup.rs)
-* You need python3 (will come installed on most linux distros) and poetry for dependancy management.
-    * Install `pipx`, `python3`
-    * Then: `pipx install poetry`
-    * Then: `poetry install` to install the project dependancies
-* You need to install [surrealdb](https://surrealdb.com)
+```bash
+# Untested script but probably works
+git clone https://git.oliveratkinson.net/Oliver/internet_mapper.git
+cd internet_mapper

-## Use
+cargo build -r

-Just run `./crawl.sh {url}` and it will start crawling. You can tweak the budget inside [crawl.sh](https://git.oliveratkinson.net/Oliver/internet_mapper/src/branch/main/crawl.sh) if you want.
+cd docker
+docker compose up -d
+cd ..

-You can also prefix the command with `time` to benchmark the system, such as: `time ./crawl.sh https://discord.com`.
+$EDITOR Crawler.toml
+
+./target/release/internet_mapper
+
+```
+
+### TODO
+
+- [x] Domain filtering - prevent the crawler from going on alternate versions of wikipedia.
+- [ ] Conditionally save content - based on filename or file contents
+- [x] GUI / TUI ? - Graphana
+- [x] Better asynchronous getting of the sites. Currently it all happens serially.
+- [x] Allow for storing asynchronously - dropping the "links to" logic fixes this need
+- [x] Control crawler via config file (no recompliation needed)
+
+3/17/25: Took >1hr to crawl 100 pages
+
+3/19/25: Took 20min to crawl 1000 pages
+This ment we stored 1000 pages, 142,997 urls, and 1,425,798 links between the two.
+
+3/20/25: Took 5min to crawl 1000 pages
+
+3/21/25: Took 3min to crawl 1000 pages
+
+# About
+
+![Screenshot](/pngs/graphana.png)

--- a/compose.yml
+++ b/compose.yml
@@ -1,16 +0,0 @@
-services:
-  db:
-    image: surrealdb/surrealdb:latest-dev
-    ports:
-    - 8000:8000
-    volumes:
-      - ./.surrealdb/:/mydata
-    command:
-      - start
-      - --log
-      - debug
-      - --user
-      - root
-      - --pass
-      - root
-      - rocksdb:/mydata/database.db
--- a/docker/alloy.conf
+++ b/docker/alloy.conf
@@ -0,0 +1,14 @@
+local.file_match "tmplogs" {
+    path_targets = [{"__path__" = "/tmp/alloy-logs/*.log"}]
+}
+
+loki.source.file "local_files" {
+    targets    = local.file_match.tmplogs.targets
+    forward_to = [loki.write.local_loki.receiver]
+}
+
+loki.write "local_loki" {
+    endpoint {
+        url = "http://loki:3100/loki/api/v1/push"
+    }
+}
--- a/docker/compose.yml
+++ b/docker/compose.yml
@@ -0,0 +1,68 @@
+services:
+  surreal:
+    image: surrealdb/surrealdb:latest-dev
+    ports:
+    - 8000:8000
+    volumes:
+      - surrealdb_storage:/mydata
+    command:
+      - start
+      - --log
+      - debug
+      - --user
+      - root
+      - --pass
+      - root
+      - rocksdb:/mydata/database.db
+
+  alloy:
+    image: grafana/alloy:latest
+    ports:
+      - 12345:12345
+    volumes:
+      # if you change this, you also need to change it in the alloy config file
+      - ./logs/:/tmp/alloy-logs
+      - ./alloy.conf:/etc/alloy/config.alloy
+      - alloy_storage:/var/lib/alloy
+    command:  run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
+
+  #logs
+  loki:
+    image: grafana/loki:latest
+    ports:
+      - 3100:3100
+    command: -config.file=/etc/loki/local-config.yaml
+    volumes:
+      - ./loki.yaml:/etc/loki/local-config.yaml
+
+  # Metrics collector
+  prometheus:
+    image: prom/prometheus:latest
+    expose:
+      - 9090
+    volumes:
+      - ./prometheus.yaml:/etc/prometheus/prometheus.yml
+      # persist data
+      - prometheus_storage:/prometheus
+    command: --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml
+
+  # Everything viewer
+  grafana:
+    image: grafana/grafana:latest
+    volumes:
+      - ./grafana.yaml:/etc/grafana/provisioning/datasources/datasources.yaml
+      - ./dashboards:/var/lib/grafana/dashboards
+      - grafana_storage:/var/lib/grafana
+    environment:
+      - GF_AUTH_ANONYMOUS_ENABLED=true
+      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
+      - GF_AUTH_DISABLE_LOGIN_FORM=true
+      - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
+    ports:
+      - 3000:3000
+  
+volumes:
+  prometheus_storage:
+  grafana_storage:
+  alloy_storage:
+  surrealdb_storage:
--- a/docker/dashboards/crawler-dashboard.json
+++ b/docker/dashboards/crawler-dashboard.json
@@ -0,0 +1,648 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_PROMETHEUS",
+      "label": "Prometheus",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "prometheus",
+      "pluginName": "Prometheus"
+    },
+    {
+      "name": "DS_LOKI",
+      "label": "Loki",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "loki",
+      "pluginName": "Loki"
+    }
+  ],
+  "__elements": {},
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "11.3.1"
+    },
+    {
+      "type": "panel",
+      "id": "logs",
+      "name": "Logs",
+      "version": ""
+    },
+    {
+      "type": "datasource",
+      "id": "loki",
+      "name": "Loki",
+      "version": "1.0.0"
+    },
+    {
+      "type": "datasource",
+      "id": "prometheus",
+      "name": "Prometheus",
+      "version": "1.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "stat",
+      "name": "Stat",
+      "version": ""
+    },
+    {
+      "type": "panel",
+      "id": "timeseries",
+      "name": "Time series",
+      "version": ""
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": 300000,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 0,
+        "y": 0
+      },
+      "id": 5,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "surql_trips",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "legendFormat": "Trips to Surreal",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "s3_trips",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Trips to S3",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "pages_crawled",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "total crawled",
+          "range": true,
+          "refId": "C",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "pages_being_processed",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Pages being processed",
+          "range": true,
+          "refId": "E",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "gets_in_flight",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "D",
+          "useBackend": false
+        }
+      ],
+      "title": "Crawler stats",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": 300000,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 9,
+        "x": 8,
+        "y": 0
+      },
+      "id": 6,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "surql_trips",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "legendFormat": "Trips to Surreal",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "surql_link_calls",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "link calls",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "surql_store_calls",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "store calls",
+          "range": true,
+          "refId": "C",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "pages_being_processed",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Pages being processed",
+          "range": true,
+          "refId": "E",
+          "useBackend": false
+        }
+      ],
+      "title": "Surreal stats",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "This is across all threads, so this isn't wall clock time",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 7,
+        "x": 17,
+        "y": 0
+      },
+      "id": 7,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "percentChangeColorMode": "standard",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "showPercentChange": false,
+        "textMode": "auto",
+        "wideLayout": true
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "surql_lock_waiting_ms",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Time spend waiting on lock",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "loki",
+        "uid": "${DS_LOKI}"
+      },
+      "gridPos": {
+        "h": 18,
+        "w": 24,
+        "x": 0,
+        "y": 8
+      },
+      "id": 1,
+      "options": {
+        "dedupStrategy": "none",
+        "enableLogDetails": true,
+        "prettifyLogMessage": false,
+        "showCommonLabels": false,
+        "showLabels": false,
+        "showTime": false,
+        "sortOrder": "Descending",
+        "wrapLogMessage": false
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "loki",
+            "uid": "${DS_LOKI}"
+          },
+          "editorMode": "code",
+          "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `ERROR` | line_format \"{{.threadId}} {{.filename_extracted}}:{{.line_number}}  {{.fields_message}}\"",
+          "queryType": "range",
+          "refId": "A"
+        }
+      ],
+      "title": "Errors",
+      "type": "logs"
+    },
+    {
+      "datasource": {
+        "type": "loki",
+        "uid": "${DS_LOKI}"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 26
+      },
+      "id": 2,
+      "options": {
+        "dedupStrategy": "none",
+        "enableLogDetails": true,
+        "prettifyLogMessage": false,
+        "showCommonLabels": false,
+        "showLabels": false,
+        "showTime": false,
+        "sortOrder": "Descending",
+        "wrapLogMessage": false
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "loki",
+            "uid": "${DS_LOKI}"
+          },
+          "editorMode": "code",
+          "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `DEBUG` | line_format \"{{.fields_message}}\"",
+          "queryType": "range",
+          "refId": "A"
+        }
+      ],
+      "title": "Debug",
+      "type": "logs"
+    },
+    {
+      "datasource": {
+        "type": "loki",
+        "uid": "${DS_LOKI}"
+      },
+      "gridPos": {
+        "h": 16,
+        "w": 12,
+        "x": 12,
+        "y": 26
+      },
+      "id": 4,
+      "options": {
+        "dedupStrategy": "none",
+        "enableLogDetails": true,
+        "prettifyLogMessage": false,
+        "showCommonLabels": false,
+        "showLabels": false,
+        "showTime": false,
+        "sortOrder": "Descending",
+        "wrapLogMessage": false
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "loki",
+            "uid": "${DS_LOKI}"
+          },
+          "editorMode": "code",
+          "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `TRACE` | line_format \"{{.fields_message}}\"",
+          "queryType": "range",
+          "refId": "A"
+        }
+      ],
+      "title": "Trace",
+      "type": "logs"
+    },
+    {
+      "datasource": {
+        "type": "loki",
+        "uid": "${DS_LOKI}"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 34
+      },
+      "id": 3,
+      "options": {
+        "dedupStrategy": "none",
+        "enableLogDetails": true,
+        "prettifyLogMessage": false,
+        "showCommonLabels": false,
+        "showLabels": false,
+        "showTime": false,
+        "sortOrder": "Descending",
+        "wrapLogMessage": false
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "loki",
+            "uid": "${DS_LOKI}"
+          },
+          "editorMode": "code",
+          "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `WARN` | line_format \"{{.fields_message}}\"",
+          "queryType": "range",
+          "refId": "A"
+        }
+      ],
+      "title": "Warnings",
+      "type": "logs"
+    }
+  ],
+  "schemaVersion": 40,
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "datasource": {
+          "type": "loki",
+          "uid": "P8E80F9AEF21F6940"
+        },
+        "filters": [],
+        "name": "Filters",
+        "type": "adhoc"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-5m",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "Crawler",
+  "uid": "ceg90x34pqgowd",
+  "version": 21,
+  "weekStart": ""
+}
--- a/docker/grafana.yaml
+++ b/docker/grafana.yaml
@@ -0,0 +1,24 @@
+apiVersion: 1
+
+datasources:
+- name: Loki
+  type: loki
+  access: proxy 
+  orgId: 1
+  url: http://loki:3100
+  basicAuth: false
+  isDefault: true 
+  version: 1
+  editable: false
+- name: Prometheus
+  type: prometheus
+  uid: prometheus
+  access: proxy
+  orgId: 1
+  url: http://prometheus:9090
+  basicAuth: false
+  isDefault: false
+  version: 1
+  editable: false
+  jsonData:
+    httpMethod: GET
--- a/docker/loki.yaml
+++ b/docker/loki.yaml
@@ -0,0 +1,62 @@
+# this is mostly the default config from grafana's website
+
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+  grpc_listen_port: 9096
+  log_level: info
+  grpc_server_max_concurrent_streams: 1000
+
+common:
+  instance_addr: 127.0.0.1
+  path_prefix: /tmp/loki
+  storage:
+    filesystem:
+      chunks_directory: /tmp/loki/chunks
+      rules_directory: /tmp/loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+
+query_range:
+  results_cache:
+    cache:
+      embedded_cache:
+        enabled: true
+        max_size_mb: 100
+
+limits_config:
+  metric_aggregation_enabled: true
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+pattern_ingester:
+  enabled: true
+  metric_aggregation:
+    loki_address: localhost:3100
+
+frontend:
+  encoding: protobuf
+
+# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration
+# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/
+#
+# Statistics help us better understand how Loki is used, and they show us performance
+# levels for most users. This helps us prioritize features and documentation.
+# For more information on what's sent, look at
+# https://github.com/grafana/loki/blob/main/pkg/analytics/stats.go
+# Refer to the buildReport method to see what goes into a report.
+#
+# If you would like to disable reporting, uncomment the following lines:
+analytics:
+  reporting_enabled: false
--- a/docker/prometheus.yaml
+++ b/docker/prometheus.yaml
@@ -0,0 +1,17 @@
+global:
+  scrape_interval: 5s 
+  query_log_file: /etc/prometheus/query.log
+
+scrape_configs:
+  - job_name: crawler
+    static_configs:
+    # change this your machine's ip, localhost won't work
+    # because localhost refers to the docker container.
+      - targets: ['192.168.1.200:2500']
+        #- targets: ['192.168.8.209:2500']
+  - job_name: loki
+    static_configs:
+      - targets: ['loki:3100']
+  - job_name: prometheus
+    static_configs:
+      - targets: ['localhost:9090']
--- a/jsconfig.json
+++ b/jsconfig.json
@@ -0,0 +1,16 @@
+{
+    "compilerOptions": {
+        "module": "ESNext",
+        "moduleResolution": "Bundler",
+        "target": "ES2022",
+        "jsx": "react",
+        "allowImportingTsExtensions": true,
+        "strictNullChecks": true,
+        "strictFunctionTypes": true
+    },
+    "exclude": [
+        "node_modules",
+        "**/node_modules/*"
+    ],
+    "typeAcquisition": {"include": ["firefox-webext-browser"]}
+}
--- a/pngs/graphana.png
+++ b/pngs/graphana.png
--- a/schema.surql
+++ b/schema.surql
@@ -1,2 +0,0 @@
-DEFINE TABLE website SCHEMALESS;
-    DEFINE FIELD accessed_at ON TABLE website VALUE time::now();
--- a/src/db.rs
+++ b/src/db.rs
@@ -1,23 +1,38 @@
+use metrics::counter;
+use std::fmt::Debug;
 use serde::{Deserialize, Serialize};
 use surrealdb::{
    engine::remote::ws::{Client, Ws},
    opt::auth::Root,
    sql::Thing,
-    Response, Surreal,
+    Surreal,
 };
-use tracing::{error, instrument, trace, warn};
+use tracing::{error, instrument, trace};
 use url::Url;

-use crate::Timer;
+use crate::Config;

-#[derive(Debug, Serialize, Deserialize, Clone)]
+const STORE: &str = "surql_store_calls";
+
+#[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Hash)]
 pub struct Website {
    /// The url that this data is found at
-    site: Url,
+    pub site: Url,
    /// Wether or not this link has been crawled yet
    pub crawled: bool,
-    #[serde(skip_serializing)]
-    id: Option<Thing>,
+    /// 200, 404, etc
+    pub status_code: u16,
+}
+
+// manual impl to make tracing look nicer
+impl Debug for Website {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Website")
+            .field("host", &self.site.host())
+            .field("path", &self.site.path())
+            .field("status_code", &self.status_code)
+            .finish()
+    }
 }

 impl Website {
@@ -28,112 +43,42 @@ impl Website {
            Err(_) => todo!(),
        };
        Self {
-            id: None,
            crawled,
            site,
+            status_code: 0,
        }
    }

-    pub fn set_crawled(&mut self) {
-        trace!("Set crawled to true");
-        self.crawled = true
-    }
+    // Insert ever item in the vec into surreal, crawled state will be preserved as TRUE
+    // if already in the database as such or incoming data is TRUE.
+    #[instrument(skip(db))]
+    pub async fn store_all(all: Vec<Self>, db: &Surreal<Client>) -> Vec<Thing> {
+        counter!(STORE).increment(1);
+        let mut things = Vec::with_capacity(all.len());

-    pub fn mut_url(&mut self) -> &mut Url {
-        &mut self.site
-    }
-
-    #[instrument(skip_all)]
-    pub async fn links_to(&self, other: Vec<Thing>, db: &Surreal<Client>) {
-        let len = other.len();
-        if len == 0 {return}
-
-        let from = self.site.to_string();
-        // let to = other.site.to_string();
-        trace!("Linking {from} to {} other pages.", other.len());
-        let msg = format!("Linked {len} pages");
-        let timer = Timer::start(&msg);
-        // prevent the timer from being dropped instantly.
-        let _ = timer;
+        // FIXME failes *sometimes* because "Resource Busy"
        match db
-            .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
-            .bind(("in", from))
-            .bind(("out", other))
+            .query(
+                "INSERT INTO website $array
+                    ON DUPLICATE KEY UPDATE
+                        accessed_at = time::now(),
+                        status_code = $input.status_code,
+                        crawled = crawled OR $input.crawled
+                    RETURN VALUE id;
+                 ",
+            )
+            .bind(("array", all))
            .await
        {
-            Ok(mut e) => {
-                // The relate could technically "fail" (not relate anything), this just means that
-                // the query was ok.
-                let _: Response = e;
-                if let Ok(vec) = e.take(0) {
-                    let _: Vec<usize> = vec;
-                    if let Some(num) = vec.get(0) {
-                        if *num == len {
-                            trace!("Link OK");
-                            return;
-                        } else {
-                            warn!("Didn't link all the records. {num}/{len}");
-                            return;
-                        }
-                    }
-                }
-                warn!("Linking request succeeded but couldn't verify the results.");
+            Ok(mut id) => match id.take::<Vec<Thing>>(0) {
+                Ok(mut x) => things.append(&mut x),
+                Err(err) => error!("{:?}", err),
            },
-            Err(e) => {
-                error!("{}", e.to_string());
-            },
-        }
-    }
-
-    #[instrument(skip_all)]
-    pub async fn store(&mut self, db: &Surreal<Client>) -> Option<Thing> {
-        // check if it's been gone thru before
-        let mut response = db
-            .query("SELECT * FROM ONLY website WHERE site = $site LIMIT 1")
-            .bind(("site", self.site.to_string()))
-            .await
-            .unwrap();
-
-        if let Some(old) = response.take::<Option<Website>>(0).unwrap() {
-            // site exists already
-            if let Some(id) = old.id {
-                // make sure to preserve the "crawled status"
-                let mut new = self.clone();
-                new.crawled = old.crawled | new.crawled;
-
-                // update the record
-                match db.upsert((id.tb, id.id.to_string())).content(new).await {
-                    Ok(e) => {
-                        if let Some(a) = e {
-                            let _: Record = a;
-                            return Some(a.id);
-                        }
-                    }
-                    Err(e) => {
-                        error!("{}", e);
-                    }
-                };
+            Err(err) => {
+                error!("{:?}", err);
            }
-        } else {
-            // sites hasn't existed yet
-            match db.create("website").content(self.clone()).await {
-                Ok(e) => {
-                    let _: Option<Record> = e;
-                    if let Some(a) = e {
-                        let _: Record = a;
-                        return Some(a.id);
-                    }
-                }
-                Err(a) => error!("{:?}", a),
-            };
        }
-        None
-    }
-}
-
-impl ToString for Website {
-    fn to_string(&self) -> String {
-        self.site.to_string()
+        things
    }
 }

@@ -149,19 +94,32 @@ pub struct Record {
    pub id: Thing,
 }

-pub async fn connect() -> surrealdb::Result<Surreal<Client>> {
+#[instrument(skip_all, name = "SurrealDB")]
+pub async fn connect(config: &Config) -> surrealdb::Result<Surreal<Client>> {
+    trace!("Establishing connection to surreal...");
    // Connect to the server
-    let db = Surreal::new::<Ws>("127.0.0.1:8000").await?;
+    let db = Surreal::new::<Ws>(&config.surreal_url).await?;

+    trace!("Logging in...");
    // Signin as a namespace, database, or root user
    db.signin(Root {
-        username: "root",
-        password: "root",
+        username: &config.surreal_username,
+        password: &config.surreal_password,
    })
    .await?;

    // Select a specific namespace / database
-    db.use_ns("test").use_db("v1.2").await?;
+    db.use_ns(&config.surreal_ns)
+        .use_db(&config.surreal_db)
+        .await?;
+
+    let setup = include_bytes!("setup.surql");
+    let file = setup.iter().map(|c| *c as char).collect::<String>();
+
+    db.query(file)
+        .await
+        .expect("Failed to setup surreal tables.");

    Ok(db)
 }
+
--- a/src/filesystem.rs
+++ b/src/filesystem.rs
@@ -0,0 +1,66 @@
+use std::{io::ErrorKind, path::PathBuf};
+
+use reqwest::header::HeaderValue;
+use tokio::fs;
+use tracing::{error, trace, warn};
+use url::Url;
+
+pub fn as_path(url: &Url, content_type: &HeaderValue) -> PathBuf {
+    // extract data from url to save it accurately
+    let mut url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path());
+
+    if let Ok(header) = content_type.to_str() {
+        // text/html; charset=UTF-8; option=value
+        let ttype = if let Some((t, _)) = header.split_once(';') {
+            t
+        } else {
+            header
+        };
+
+        if let Some((ttype, subtype)) = ttype.split_once('/') {
+            trace!("Found Content-Type to be: {ttype}/{subtype} for {}", url.to_string());
+            // If the Content-Type header is "*/html" (most likely "text/html") and the path's
+            // extension is anything but html:
+            if subtype=="html" && !url_path.extension().is_some_and(|f| f=="html" || f=="htm" ) {
+                // time to slap a index.html to the end of that path there!
+                url_path = url_path.join("index.html");
+            }
+        }
+    } else {
+        warn!("Header: {:?} couldn't be parsed into a string!", content_type);
+    }
+    trace!("Final path for {} is: {:?}", url, url_path);
+
+    url_path
+}
+
+pub async fn init(filename: &PathBuf) -> Option<fs::File> {
+    let file = async || tokio::fs::OpenOptions::new()
+        .append(true)
+        .create(true)
+        .open(&filename).await;
+
+    match file().await {
+        Ok(ok) => Some(ok),
+        Err(err) => {
+            // the file/folder isn't found
+            if err.kind() == ErrorKind::NotFound {
+                if let Some(parent ) = &filename.parent() {
+                    // create the folders
+                    if let Err(err) = fs::create_dir_all(&parent).await {
+                        error!("Dir creation: {err} {:?}", filename);
+                        eprintln!("{}", err)
+                    } else if let Ok(ok) = file().await {
+                        return Some(ok);
+                    }
+                } else {
+                    error!("Couldn't get file's parents: {:?}", &filename);
+                }
+            } else {
+                error!("File open error: {err} {:?}", filename);
+            }
+            // we don't care about other errors, we can't/shouldn't fix them
+            None
+        }
+    }
+}
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,199 +1,285 @@
+#![feature(ip_from)]
+#![warn(clippy::expect_used)]
+#![deny(clippy::unwrap_used)]
+
 extern crate html5ever;
-extern crate markup5ever_rcdom as rcdom;
+
+use futures_util::StreamExt;
+
+use std::{
+    collections::HashSet,
+    fs::File,
+    io::Read,
+    net::{IpAddr, Ipv4Addr},
+};

 use db::{connect, Website};
-use html5ever::{
-    local_name, parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts,
-};
-use rcdom::RcDom;
-use std::time::Instant;
-use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};
-use tracing::{debug, info, instrument, trace, trace_span};
-use tracing_subscriber::EnvFilter;
+use metrics::{counter, gauge};
+use metrics_exporter_prometheus::PrometheusBuilder;
+use serde::Deserialize;
+use surrealdb::{engine::remote::ws::Client, Surreal};
+use tokio::{io::{AsyncWriteExt, BufWriter}, task::JoinSet};
+use tracing::{debug, debug_span, error, info, instrument, level_filters::LevelFilter, trace, trace_span, warn};
+use tracing_subscriber::{fmt, layer::SubscriberExt, EnvFilter, Layer, Registry};

 mod db;
+mod filesystem;
+mod parser;
+
+const GET_METRIC: &str = "total_gets";
+const GET_IN_FLIGHT: &str = "gets_in_flight";
+const SITES_CRAWLED: &str = "pages_crawled";
+const BEING_PROCESSED: &str = "pages_being_processed";
+
+#[derive(Deserialize)]
+struct Config {
+    surreal_ns: String,
+    surreal_db: String,
+    surreal_url: String,
+    surreal_username: String,
+    surreal_password: String,
+
+    crawl_filter: String,
+    start_url: String,
+    budget: usize,
+    batch_size: usize,
+}

 #[tokio::main]
 async fn main() {
-    tracing_subscriber::fmt()
-        .with_env_filter(EnvFilter::from_default_env())
-        .with_line_number(true)
-        .without_time()
-        .init();
-    debug!("Starting...");
+    println!("Logs and metrics are provided to the Grafana dashboard");

-    // Would probably take these in as parameters from a cli
-    let url = "https://oliveratkinson.net/";
-    // let url = "http://localhost:5500";
-    let budget = 1000;
+    let writer = std::fs::OpenOptions::new()
+        .append(true)
+        .create(true)
+        .open("./docker/logs/tracing.log")
+        .expect("Couldn't make log file!");
+
+    let filter = EnvFilter::builder()
+        .with_default_directive(LevelFilter::DEBUG.into())
+        .from_env_lossy();
+
+    let registry = Registry::default().with(
+        fmt::layer()
+            .with_line_number(true)
+            .with_thread_ids(true)
+            .with_file(true)
+            .json()
+            .with_writer(writer)
+            .with_filter(filter)
+    );
+
+    tracing::subscriber::set_global_default(registry).expect("Failed to set default subscriber");
+
+    let builder = PrometheusBuilder::new();
+    builder
+        .with_http_listener(std::net::SocketAddr::new(
+            IpAddr::V4(Ipv4Addr::from_octets([0, 0, 0, 0])),
+            2500,
+        ))
+        .install()
+        .expect("failed to install recorder/exporter");
+
+    info!("Starting...");
+
+    // When getting uncrawled pages, name must contain this variable. "" will effectively get ignored.
+    // let crawl_filter = "en.wikipedia.org/";
+    // let budget = 50;
    let mut crawled = 0;

-    let db = connect().await.expect("Failed to connect to db, aborting.");
+    let mut file = File::open("./Crawler.toml").expect("Failed to read Crawler.toml");
+    let mut buf = String::new();
+    let _ = file.read_to_string(&mut buf);

-    let client = reqwest::Client::builder()
+    let config: Config = toml::from_str(&buf).expect("Failed to parse Crawler.toml");
+    let starting_url = &config.start_url;
+
+    let db = connect(&config)
+        .await
+        .expect("Failed to connect to surreal, aborting.");
+
+    let reqwest = reqwest::Client::builder()
        // .use_rustls_tls()
+        .gzip(true)
        .build()
-        .unwrap();
+        .expect("Failed to build reqwest client.");

    // Kick off the whole machine - This Website object doesn't matter, it's just to allow for
    // get() to work.
    let span = trace_span!("Pre-Loop");
    let pre_loop_span = span.enter();
    // Download the site
-    let mut site = Website::new(&url, false);
-    get(&mut site, &db, &client, &mut crawled).await;
+    let site = Website::new(starting_url, false);
+    process(site, db.clone(), reqwest.clone()).await;

    drop(pre_loop_span);

    let span = trace_span!("Loop");
    let span = span.enter();
-    while crawled < budget {
-        let get_num = if budget - crawled < 100 {
-            budget - crawled
-        } else {
-            100
-        };
-
-        let uncrawled = get_uncrawled_links(&db, get_num).await;
-        if uncrawled.len() == 0 {
+    while crawled < config.budget {
+        let uncrawled =
+            get_uncrawled_links(&db, config.budget - crawled, config.crawl_filter.clone(), &config).await;
+        if uncrawled.is_empty() {
            info!("Had more budget but finished crawling everything.");
            return;
        }
-        debug!("Crawling {} pages...", uncrawled.len());

-        let span = trace_span!("Crawling");
-        let _ = span.enter();
+        {
+            let mut futures = JoinSet::new();
+            for site in uncrawled {
+                gauge!(BEING_PROCESSED).increment(1);
+                futures.spawn(process(site, db.clone(), reqwest.clone()));
+                // let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32);
+                // info!("Crawled {crawled} out of {budget} pages. ({percent})");
+            }

-        for mut site in uncrawled {
-            get(&mut site, &db, &client, &mut crawled).await;
-            let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32);
-            info!("Crawled {crawled} out of {budget} pages. ({percent})");
+            let c = counter!(SITES_CRAWLED);
+            // As futures complete runs code in while block
+            while futures.join_next().await.is_some() {
+                c.increment(1);
+                gauge!(BEING_PROCESSED).decrement(1);
+                crawled += 1;
+            }
        }
    }
    drop(span);

+    if let Ok(mut ok) = db
+        .query("count(select id from website where crawled = true)")
+        .await
+    {
+        let res = ok.take::<Option<usize>>(0);
+        if let Ok(Some(n)) = res {
+            info!("Total crawled pages now equals {n}");
+        }
+    }
+
    info!("Done");
 }

-#[instrument(skip_all)]
-/// A quick helper function for downloading a url
-async fn get(
-    site: &mut Website,
-    db: &Surreal<Client>,
-    request_client: &reqwest::Client,
-    count: &mut usize,
-) {
-    trace!("Get: {}", site.to_string());
-    let timer = Timer::start("Got page");
+#[instrument(skip(db, reqwest))]
+/// Downloads and crawls and stores a webpage.
+/// It is acceptable to clone `db`, `reqwest`, and `s3` because they all use `Arc`s internally. - Noted by Oliver
+async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
+    // METRICS
+    trace!("Process: {}", &site.site);
+    // Build the request
+    let request_builder = reqwest.get(site.site.to_string());

-    if let Ok(response) = request_client.get(site.to_string()).send().await {
-        timer.stop();
+    // METRICS
+    let g = gauge!(GET_IN_FLIGHT);
+    g.increment(1);

-        // Get body
-        let data = response.text().await.unwrap();
-        let opts = ParseOpts {
-            tree_builder: TreeBuilderOpts {
-                drop_doctype: true,
-                ..Default::default()
+    // Send the http request (get)
+    if let Ok(response) = request_builder.send().await {
+        let headers = response.headers();
+        let code = response.status();
+
+        #[allow(non_snake_case)]
+        let CT = headers.get("Content-Type");
+        let ct = headers.get("content-type");
+
+        let ct = match (CT,ct) {
+            (None, None) => {
+                warn!("Server did not respond with Content-Type header. Url: {} Headers: ({:?})", site.site.to_string(), headers);
+                return
            },
-            ..Default::default()
+            (None, Some(a)) => a,
+            (Some(a), None) => a,
+            (Some(a), Some(_)) => a,
        };
-        // Get DOM
-        let dom = parse_document(RcDom::default(), opts)
-            .from_utf8()
-            .read_from(&mut data.as_bytes())
-            .unwrap();

-        // TODO save the dom to minio if a flag is set
+        // create filepath (handles / -> /index.html)
+        let path = filesystem::as_path(&site.site, ct);

-        // Modify record in database
-        site.set_crawled();
-        site.store(db).await;
-        trace!("Got: {}", site.to_string());
+        // make sure that the file is good to go
+        if let Some(file) = filesystem::init(&path).await {
+            // Get body from response
+            // stream the response onto the disk
+            let mut stream = response.bytes_stream();

-        // Walk all the children nodes, searching for links to other pages.
-        let mut buffer = Vec::new();
-        let timer = Timer::start("Walked");
-        walk(&dom.document, &db, &site, &mut buffer).await;
-        timer.stop();
+            let should_parse = path.to_string_lossy().ends_with(".html");
+            let mut writer = BufWriter::new(file);
+            let mut buf: Vec<u8> = Vec::new();

-        // Put all the found links into the database.
-        site.links_to(buffer, &db).await;
-        *count += 1;
-    }
-    trace!("Failed to get: {}", site.to_string());
-}
-
-/// Walks the givin site, placing it's findings in the database
-async fn walk(
-    node: &rcdom::Handle,
-    db: &Surreal<Client>,
-    site: &Website,
-    links_to: &mut Vec<Thing>,
-) {
-    let span = trace_span!("Walk");
-    let span = span.enter();
-    // Match each node - node basically means element.
-    match &node.data {
-        rcdom::NodeData::Element { name, attrs, .. } => {
-            for attr in attrs.borrow().clone() {
-                match name.local {
-                    local_name!("a")
-                    | local_name!("audio")
-                    | local_name!("area")
-                    | local_name!("img")
-                    | local_name!("link")
-                    | local_name!("object")
-                    | local_name!("source")
-                    | local_name!("base")
-                    | local_name!("video") => {
-                        let attribute_name = attr.name.local.to_string();
-                        if attribute_name == "src"
-                            || attribute_name == "href"
-                            || attribute_name == "data"
-                        {
-                            // Get clone of the current site object
-                            let mut web = site.clone();
-
-                            // Set url
-                            let url = web.mut_url();
-                            url.set_fragment(None); // removes #xyz
-                            let joined = url.join(&attr.value).unwrap();
-                            *url = joined;
-
-                            // Set other attributes
-                            web.crawled = false;
-                            // TODO set element name
-                            // let element_name = name.local.to_string();
-
-                            if let Some(id) = web.store(db).await {
-                                links_to.push(id);
-                            }
+            // Write file to disk
+            info!("Writing at: {:?}", path);
+            while let Some(data) = stream.next().await {
+                match data {
+                    Ok(data) => {
+                        let _ = writer.write_all(&data).await;
+                        // If we are going to parse this file later, we will save it
+                        // into memory as well as the disk.
+                        // We do this because the data here might be incomplete
+                        if should_parse {
+                            data.iter().for_each(|f| buf.push(*f));
                        }
-                    }
-                    local_name!("button") | local_name!("meta") | local_name!("iframe") => {
-                        // dbg!(attrs);
-                    }
-                    _ => {}
-                };
+                    },
+                    Err(err) => {
+                        eprintln!("{}", err)
+                    },
+                }
            }
+            let _ = writer.flush();
+
+
+            // (If needed) Parse the file
+            if should_parse {
+                let span = debug_span!("Should Parse");
+                let enter = span.enter();
+
+                // Parse document and get relationships
+                let sites = parser::parse(&site, &buf).await;
+                // De-duplicate this list
+                let prev_len = sites.len();
+                let set = sites.into_iter().fold(HashSet::new(), |mut set, item| {
+                    set.insert(item);
+                    set
+                });
+                let de_dupe_sites: Vec<Website> = set.into_iter().collect();
+                let diff = prev_len - de_dupe_sites.len();
+                trace!("Saved {diff} from being entered into the db by de-duping");
+                // Store all the other sites so that we can link to them.
+                let _ = Website::store_all(de_dupe_sites, &db).await;
+
+                drop(enter);
+            }
+
+            // METRICS
+            g.decrement(1);
+            counter!(GET_METRIC).increment(1);
+
+            // update self in db
+            site.crawled = true;
+            site.status_code = code.as_u16();
+            Website::store_all(vec![site.clone()], &db).await;
+        } else {
+            error!("File failed to cooperate: {:?}", path);
        }
-        _ => {}
-    };
-    drop(span);
-    for child in node.children.borrow().iter() {
-        Box::pin(walk(child, db, site, links_to)).await;
+
+        trace!("Done processing: {}", &site.site);
+    } else {
+        error!("Failed to get: {}", &site.site);
    }
 }

 /// Returns uncrawled links
-async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Website> {
-    if count > 100 {
-        count = 100
+#[instrument(skip(db, config))]
+async fn get_uncrawled_links(
+    db: &Surreal<Client>,
+    mut count: usize,
+    filter: String,
+    config: &Config,
+) -> Vec<Website> {
+    if count > config.batch_size {
+        count = config.batch_size;
    }

+    debug!("Getting {} uncrawled links", count);
+
    let mut response = db
-        .query("SELECT * FROM website WHERE crawled = false LIMIT $count")
+        .query("SELECT * FROM website WHERE crawled = false AND site ~ type::string($format) LIMIT $count;")
+        .bind(("format", filter))
        .bind(("count", count))
        .await
        .expect("Hard-coded query failed..?");
@@ -201,30 +287,3 @@ async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Webs
        .take(0)
        .expect("Returned websites couldn't be parsed")
 }
-
-pub struct Timer<'a> {
-    start: Instant,
-    msg: &'a str,
-}
-
-impl<'a> Timer<'a> {
-    #[inline]
-    pub fn start(msg: &'a str) -> Self {
-        Self {
-            start: Instant::now(),
-            msg,
-        }
-    }
-    pub fn stop(&self) -> f64 {
-        let dif = self.start.elapsed().as_micros();
-        let ms = dif as f64 / 1000.;
-        debug!("{}", format!("{} in {:.3}ms", self.msg, ms));
-        ms
-    }
-}
-
-impl Drop for Timer<'_> {
-    fn drop(&mut self) {
-        self.stop();
-    }
-}
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -0,0 +1,151 @@
+use std::default::Default;
+
+use html5ever::tokenizer::{BufferQueue, TokenizerResult};
+use html5ever::tokenizer::{StartTag, TagToken};
+use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
+use html5ever::{local_name, tendril::*};
+use tracing::{debug, error, instrument, trace, warn};
+use url::Url;
+
+use crate::db::Website;
+
+impl TokenSink for Website {
+    type Handle = Vec<Website>;
+
+    #[instrument(skip(token, _line_number))]
+    fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
+        match token {
+            TagToken(tag) => {
+                if tag.kind == StartTag {
+                    match tag.name {
+                        // this should be all the html elements that have links
+                        local_name!("a")
+                        | local_name!("audio")
+                        | local_name!("area")
+                        | local_name!("img")
+                        | local_name!("link")
+                        | local_name!("object")
+                        | local_name!("source")
+                        | local_name!("base")
+                        | local_name!("video") => {
+                            let mut links = Vec::new();
+                            for attr in &tag.attrs {
+                                let attr_name = attr.name.local.to_string();
+                                if attr_name == "src" || attr_name == "href" || attr_name == "data"
+                                {
+                                    trace!("Found `{}` in html `{}` tag", &attr.value, tag.name);
+                                    let url = try_get_url(&self.site, &attr.value);
+
+                                    if let Some(mut parsed) = url {
+                                        parsed.set_query(None);
+                                        parsed.set_fragment(None);
+                                        trace!("Final cleaned URL: `{}`", parsed.to_string());
+                                        let web = Website::new(&parsed.to_string(), false);
+                                        links.push(web);
+                                    }
+                                }
+                            }
+                            return TokenSinkResult::Script(links);
+                        }
+                        local_name!("button") | local_name!("meta") | local_name!("iframe") => {
+                            // dbg!(attrs);
+                        }
+                        _ => {}
+                    }
+                }
+            }
+            _ => {}
+        }
+        TokenSinkResult::Continue
+    }
+}
+
+#[instrument(skip_all)]
+/// Parses the passed site and returns all the sites it links to.
+pub async fn parse(site: &Website, data: &[u8]) -> Vec<Website> {
+    debug!("Parsing {}", site.site.to_string());
+    // prep work
+    let mut other_sites: Vec<Website> = Vec::new();
+
+    // change data into something that can be tokenized
+    let s: Result<Tendril<fmt::UTF8>, ()> = Tendril::try_from_byte_slice(data);
+    if let Ok(chunk) = s {
+        // create buffer of tokens and push our input into it
+        let token_buffer = BufferQueue::default();
+        token_buffer.push_back(
+            chunk
+                .try_reinterpret::<fmt::UTF8>()
+                .expect("Failed to reinterpret chunk!"),
+        );
+        // create the tokenizer
+        let tokenizer = Tokenizer::new(site.clone(), TokenizerOpts::default());
+
+        // go thru buffer
+        while let TokenizerResult::Script(mut sites) = tokenizer.feed(&token_buffer) {
+            other_sites.append(&mut sites);
+            // other_sites.push(sites);
+        }
+        assert!(token_buffer.is_empty());
+        tokenizer.end();
+    } else {
+        warn!("Tendril failed to parse on: {}", site.site.to_string());
+    }
+
+    other_sites
+}
+
+#[instrument]
+fn try_get_url(parent: &Url, link: &str) -> Option<Url> {
+    match Url::parse(link) {
+        Ok(ok) => Some(ok),
+        Err(e) => {
+            if link.starts_with('#') {
+                trace!("Rejecting # url");
+                None
+            } else if link.starts_with("//") {
+                // if a url starts with "//" is assumed that it will adopt
+                // the same scheme as it's parent
+                // https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute
+                let scheme = parent.scheme();
+
+                match Url::parse(&format!("{scheme}://{}", link)) {
+                    Ok(url) => Some(url),
+                    Err(err) => {
+                        error!("Failed parsing relative scheme url: {}", err);
+                        None
+                    }
+                }
+            } else {
+                // # This is some sort of realative url, gonna try patching it up into an absolute
+                // url
+                match e {
+                    url::ParseError::RelativeUrlWithoutBase => {
+                        // Is: scheme://host:port
+                        let mut origin = parent.origin().ascii_serialization();
+                        if !origin.ends_with('/') && !link.starts_with('/') {
+                            origin += "/";
+                        }
+                        let url = origin.clone() + link;
+
+                        if let Ok(url) = Url::parse(&url) {
+                            trace!("Built `{url}` from `{origin} + `{}`", link.to_string());
+                            Some(url)
+                        } else {
+                            error!(
+                                "Failed to reconstruct a url from relative url: `{}` on site: `{}`. Failed url was: {}",
+                                link,
+                                parent.to_string(),
+                                url
+                            );
+                            None
+                        }
+                    }
+                    _ => {
+                        error!("MISC error: {:?} {:?}", e, link);
+                        None
+                    }
+                }
+            }
+        }
+    }
+}
--- a/src/setup.surql
+++ b/src/setup.surql
@@ -0,0 +1,9 @@
+DEFINE TABLE IF NOT EXISTS website SCHEMALESS;
+
+DEFINE FIELD IF NOT EXISTS site ON TABLE website TYPE string;
+DEFINE INDEX IF NOT EXISTS idx ON TABLE website COLUMNS site UNIQUE;
+
+DEFINE FIELD IF NOT EXISTS crawled ON TABLE website TYPE bool;
+
+DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now();
+DEFINE FIELD IF NOT EXISTS first_accessed_at ON TABLE website VALUE time::now();
Author	SHA1	Message	Date
table	f7a3ca8fd7	changes	2025-08-04 23:02:05 +00:00
Rushmore75	6790061e22	helper code	2025-07-09 15:58:22 -06:00
Rushmore75	50606bb69e	It isnt quite working yet	2025-04-17 09:59:23 -06:00
Oliver	5850f19cab	Merge pull request 'stream_response' (#6 ) from stream_response into main Reviewed-on: #6	2025-04-17 15:39:49 +00:00
Rushmore75	2c8546e30a	logging cleanup	2025-04-17 09:36:27 -06:00
Rushmore75	4e619d0ebc	logging cleanup	2025-04-17 09:36:13 -06:00
Rushmore75	647c4cd324	work off content-type header	2025-04-17 09:35:57 -06:00
Rushmore75	7fab961d76	no longer how this is working	2025-04-17 09:35:26 -06:00
Rushmore75	d3fff194f4	logging updates	2025-04-17 08:17:37 -06:00
Rushmore75	3497312fd4	de-enshitified file saving logic	2025-04-17 08:17:29 -06:00
Oliver	0fd76b1734	Merge pull request 'stream_response' (#4 ) from stream_response into main Reviewed-on: #4	2025-04-15 21:23:54 +00:00
Rushmore75	9bfa8f9108	batch_size	2025-04-15 13:38:28 -06:00
Rushmore75	bdb1094a30	steam data to the disk	2025-04-15 13:07:47 -06:00
Rushmore75	9aa2d9ce22	code settings	2025-04-15 13:06:53 -06:00
Oliver	4b557a923c	Merge pull request 'foss_storage' (#3 ) from foss_storage into main Reviewed-on: #3	2025-04-15 15:11:59 +00:00
Rushmore75	c08a20ac00	cleanup and more accuratly use metrics	2025-04-15 09:07:16 -06:00
Rushmore75	94912e9125	change up how files are discovered	2025-04-15 09:06:57 -06:00
Rushmore75	a9465dda6e	add instructions	2025-03-31 15:05:18 -06:00
Rushmore75	add6f00ed6	no recomp needed	2025-03-31 14:53:10 -06:00
Rushmore75	4a433a1a77	This function sometimes throws errors, this logging should help	2025-03-31 14:18:37 -06:00
Rushmore75	03cbcd9ae0	remove minio code	2025-03-31 14:18:11 -06:00
Rushmore75	6fc71c7a78	add speed improvements	2025-03-21 12:14:29 -06:00
Rushmore75	96a3ca092a	:)	2025-03-21 12:11:05 -06:00
Rushmore75	b750d88d48	working filesystem storage	2025-03-21 11:42:43 -06:00
Oliver	808790a7c3	file patch;	2025-03-21 07:11:51 +00:00
Oliver	2de01b2a0e	remove removed code	2025-03-21 06:48:39 +00:00
Oliver	be0fd5505b	i think the files work better	2025-03-21 06:48:17 +00:00
Oliver	a23429104c	dead code removal	2025-03-21 06:03:34 +00:00
Oliver	66581cc453	getting there	2025-03-21 05:59:40 +00:00
Rushmore75	7df19a480f	updates	2025-03-20 15:11:01 -06:00
Rushmore75	b9c1f0b492	readme updates	2025-03-19 15:05:32 -06:00
Rushmore75	71b7b2d7bc	it works and it is awesome	2025-03-19 15:04:00 -06:00
Rushmore75	bac3cd9d1d	add most recent long run	2025-03-19 15:03:49 -06:00
Rushmore75	1f6a0acce3	shutup spellchecker	2025-03-19 15:03:39 -06:00
Rushmore75	53dbf53ab9	newest settings	2025-03-19 15:03:24 -06:00
Rushmore75	0477bb26e4	viz improvements	2025-03-19 15:03:11 -06:00
Rushmore75	6409baaffb	Reducted trips to surreal by x500	2025-03-19 12:41:08 -06:00
Oliver	135a7e4957	Merge pull request 'multithreading' (#2 ) from multithreading into main Reviewed-on: #2	2025-03-19 05:00:59 +00:00
Oliver	9aa34b3eee	epic metrics	2025-03-19 04:59:50 +00:00
Rushmore75	de80418c00	better logging	2025-03-18 16:09:46 -06:00
Rushmore75	e3e4175f51	logging improvements	2025-03-18 15:25:56 -06:00
Rushmore75	d11e7dd27c	the biggest 1 line improvement ever	2025-03-18 15:25:40 -06:00
Rushmore75	f2a3e836a0	spelling and clippy	2025-03-18 15:08:29 -06:00
Rushmore75	3b4e6a40ce	minimize vec resizing	2025-03-18 15:07:50 -06:00
Rushmore75	bd0b946245	fixed tracing	2025-03-18 15:02:32 -06:00
Rushmore75	b7540a4680	checkpoint - onto profiling	2025-03-18 10:53:06 -06:00
Oliver Atkinson	82929fd0fc	updating for base64	2024-12-13 13:28:24 -07:00
Oliver Atkinson	f42e770a10	moved to other repo	2024-12-13 11:01:35 -07:00
Oliver Atkinson	611a1e923b	starting on the extension	2024-12-12 15:32:04 -07:00
Oliver Atkinson	298ad39a79	rename	2024-12-12 14:59:54 -07:00
Oliver Atkinson	215056e493	use contains operator for better output	2024-12-12 14:26:49 -07:00
Oliver Atkinson	22be3b2f61	updating deps	2024-12-12 14:14:38 -07:00
Oliver Atkinson	c1c8cf07bb	unifed settings for testing	2024-12-12 11:42:07 -07:00
oliver	0f8a3d7215	using a custom parser now :)	2024-11-12 23:08:09 -07:00
oliver	574a370f30	readme updates	2024-11-12 21:24:57 -07:00
oliver	eaa79b749e	prepare get function for s3	2024-11-12 21:19:05 -07:00
oliver	2c28d69d55	add s3 support	2024-11-12 21:03:58 -07:00