checkpoint

add speed improvements
:)
2025-07-10 18:46:25 -06:00 · 2025-03-21 12:14:29 -06:00 · 2025-03-21 12:11:05 -06:00 · 2025-03-21 11:42:43 -06:00 · 2025-03-21 07:11:51 +00:00 · 2025-03-21 06:48:39 +00:00
22 changed files with 2398 additions and 2643 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,8 @@
 /target
 /.surrealdb
+/.minio
 perf.data
 flamegraph.svg
-perf.data.old
+perf.data.old
+/docker/logs/*
+/downloaded
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -9,7 +9,7 @@
            "request": "launch",
            "name": "Debug executable 'surreal_spider'",
            "env": {
-                "RUST_LOG": "surreal_spider=debug,reqwest=info",
+                "RUST_LOG": "surreal_spider=trace,reqwest=info",
            },
            "cargo": {
                "args": [
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -0,0 +1,8 @@
+{
+    "cSpell.words": [
+        "creds",
+        "reqwest",
+        "rustls",
+        "surql"
+    ]
+}
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,15 +1,20 @@
 [package]
-name = "surreal_spider"
+name = "internet_mapper"
 version = "0.1.0"
 edition = "2021"

 [dependencies]
-html5ever = "0.29.0"
-markup5ever_rcdom = "0.5.0-unofficial"
-reqwest = "0.12.9"
-serde = { version = "1.0.214", features = ["derive"] }
-surrealdb = "2.0.4"
+base64 = "0.22.1"
+html5ever = "0.29"
+metrics = "0.24.1"
+metrics-exporter-prometheus = { version = "0.16.2", features=["http-listener"]}
+# minio = "0.1.0"
+minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"}
+reqwest = { version = "0.12", features = ["gzip", "default", "rustls-tls"] }
+rusqlite = { version = "0.34.0", features = ["bundled"] }
+serde = { version = "1.0", features = ["derive"] }
 tokio = { version="1.41.0", features = ["full"] }
-tracing = "0.1.40"
-tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
-url = { version = "2.5.3", features = ["serde"] }
+toml = "0.8.20"
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter", "local-time", "json"] }
+url = { version = "2.5", features = ["serde"] }
--- a/Crawler.toml
+++ b/Crawler.toml
@@ -0,0 +1,10 @@
+# Surreal config
+surreal_url = "localhost:8000"
+surreal_username = "root"
+surreal_password = "root"
+surreal_ns = "test"
+surreal_db = "v1.19.2"
+
+# Crawler config
+crawl_filter = "en.wikipedia.com" 
+budget = 1000
--- a/README.md
+++ b/README.md
@@ -1,23 +1,25 @@
 # Surreal Crawler

-Mapping with a budget of 1000 (crawl 1000 sites, so many more links are actually discovered), on [my webiste](https://oliveratkinson.net) on 8/26/2024 took 1m9s.
+Crawls sites, saving all the found links to a surrealdb database. It then proceeds to take batches of 100 uncrawled links until the crawl budget is reached. It saves the data of each site in a minio database.

-This is including the crawl and loading into the database and linking sites. (Locally hosted surreal db instance)
+### TODO

-This run created 4299 site links with 23286 links between the sites. (It found my this git site which really bolsters those numbers.)
+- [ ] Domain filtering - prevent the crawler from going on alternate versions of wikipedia.
+- [ ] Conditionally save content - based on filename or file contents
+- [x] GUI / TUI ? - Grafana
+- [x] Better asynchronous getting of the sites. Currently it all happens serially.
+- [ ] Allow for storing asynchronously

-## Install / Build
+3/17/25: Took >1hr to crawl 100 pages

-* You will need rust to compile the crawler [rustup.rs](https://rustup.rs)
-* You need python3 (will come installed on most linux distros) and poetry for dependancy management.
-    * Install `pipx`, `python3`
-    * Then: `pipx install poetry`
-    * Then: `poetry install` to install the project dependancies
-* You need to install [surrealdb](https://surrealdb.com)
+3/19/25: Took 20min to crawl 1000 pages
+This meant we stored 1000 pages, 142,997 urls, and 1,425,798 links between the two.

-## Use
+3/20/25: Took 5min to crawl 1000 pages

-Just run `./crawl.sh {url}` and it will start crawling. You can tweak the budget inside [crawl.sh](https://git.oliveratkinson.net/Oliver/internet_mapper/src/branch/main/crawl.sh) if you want.
+3/21/25: Took 3min to crawl 1000 pages

-You can also prefix the command with `time` to benchmark the system, such as: `time ./crawl.sh https://discord.com`.
+# About
+
+![Screenshot](/pngs/graphana.png)

--- a/compose.yml
+++ b/compose.yml
@@ -1,16 +0,0 @@
-services:
-  db:
-    image: surrealdb/surrealdb:latest-dev
-    ports:
-    - 8000:8000
-    volumes:
-      - ./.surrealdb/:/mydata
-    command:
-      - start
-      - --log
-      - debug
-      - --user
-      - root
-      - --pass
-      - root
-      - rocksdb:/mydata/database.db
--- a/docker/alloy.conf
+++ b/docker/alloy.conf
@@ -0,0 +1,14 @@
+local.file_match "tmplogs" {
+    path_targets = [{"__path__" = "/tmp/alloy-logs/*.log"}]
+}
+
+loki.source.file "local_files" {
+    targets    = local.file_match.tmplogs.targets
+    forward_to = [loki.write.local_loki.receiver]
+}
+
+loki.write "local_loki" {
+    endpoint {
+        url = "http://loki:3100/loki/api/v1/push"
+    }
+}
--- a/docker/compose.yml
+++ b/docker/compose.yml
@@ -0,0 +1,69 @@
+services:
+  surreal:
+    image: surrealdb/surrealdb:latest-dev
+    ports:
+    - 8000:8000
+    volumes:
+      - surrealdb_storage:/mydata
+    command:
+      - start
+      - --log
+      - debug
+      - --user
+      - root
+      - --pass
+      - root
+      - rocksdb:/mydata/database.db
+
+  alloy:
+    image: grafana/alloy:latest
+    ports:
+      - 12345:12345
+    volumes:
+      # if you change this, you also need to change it in the alloy config file
+      - ./logs/:/tmp/alloy-logs
+      - ./alloy.conf:/etc/alloy/config.alloy
+      - alloy_storage:/var/lib/alloy
+    command:  run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
+
+  #logs
+  loki:
+    image: grafana/loki:latest
+    ports:
+      - 3100:3100
+    command: -config.file=/etc/loki/local-config.yaml
+    volumes:
+      - ./loki.yaml:/etc/loki/local-config.yaml
+
+  # Metrics collector
+  prometheus:
+    image: prom/prometheus:latest
+    expose:
+      - 9090
+    volumes:
+      - ./prometheus.yaml:/etc/prometheus/prometheus.yml
+      # persist data
+      - prometheus_storage:/prometheus
+    command: --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml
+
+  # Everything viewer
+  grafana:
+    image: grafana/grafana:latest
+    volumes:
+      - ./grafana.yaml:/etc/grafana/provisioning/datasources/datasources.yaml
+      - ./dashboards:/var/lib/grafana/dashboards
+      - grafana_storage:/var/lib/grafana
+    environment:
+      - GF_AUTH_ANONYMOUS_ENABLED=true
+      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
+      - GF_AUTH_DISABLE_LOGIN_FORM=true
+      - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
+    ports:
+      - 3000:3000
+  
+volumes:
+  prometheus_storage:
+  grafana_storage:
+  alloy_storage:
+  surrealdb_storage:
+  minio_storage:
--- a/docker/dashboards/crawler-dashboard.json
+++ b/docker/dashboards/crawler-dashboard.json
@@ -0,0 +1,648 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_PROMETHEUS",
+      "label": "Prometheus",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "prometheus",
+      "pluginName": "Prometheus"
+    },
+    {
+      "name": "DS_LOKI",
+      "label": "Loki",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "loki",
+      "pluginName": "Loki"
+    }
+  ],
+  "__elements": {},
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "11.3.1"
+    },
+    {
+      "type": "panel",
+      "id": "logs",
+      "name": "Logs",
+      "version": ""
+    },
+    {
+      "type": "datasource",
+      "id": "loki",
+      "name": "Loki",
+      "version": "1.0.0"
+    },
+    {
+      "type": "datasource",
+      "id": "prometheus",
+      "name": "Prometheus",
+      "version": "1.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "stat",
+      "name": "Stat",
+      "version": ""
+    },
+    {
+      "type": "panel",
+      "id": "timeseries",
+      "name": "Time series",
+      "version": ""
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": 300000,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 0,
+        "y": 0
+      },
+      "id": 5,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "surql_trips",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "legendFormat": "Trips to Surreal",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "s3_trips",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Trips to S3",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "pages_crawled",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "total crawled",
+          "range": true,
+          "refId": "C",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "pages_being_processed",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Pages being processed",
+          "range": true,
+          "refId": "E",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "gets_in_flight",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "D",
+          "useBackend": false
+        }
+      ],
+      "title": "Crawler stats",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": 300000,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 9,
+        "x": 8,
+        "y": 0
+      },
+      "id": 6,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "surql_trips",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "legendFormat": "Trips to Surreal",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "surql_link_calls",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "link calls",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "surql_store_calls",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "store calls",
+          "range": true,
+          "refId": "C",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "pages_being_processed",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Pages being processed",
+          "range": true,
+          "refId": "E",
+          "useBackend": false
+        }
+      ],
+      "title": "Surreal stats",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "This is across all threads, so this isn't wall clock time",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 7,
+        "x": 17,
+        "y": 0
+      },
+      "id": 7,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "percentChangeColorMode": "standard",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "showPercentChange": false,
+        "textMode": "auto",
+        "wideLayout": true
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "surql_lock_waiting_ms",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Time spent waiting on lock",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "loki",
+        "uid": "${DS_LOKI}"
+      },
+      "gridPos": {
+        "h": 18,
+        "w": 24,
+        "x": 0,
+        "y": 8
+      },
+      "id": 1,
+      "options": {
+        "dedupStrategy": "none",
+        "enableLogDetails": true,
+        "prettifyLogMessage": false,
+        "showCommonLabels": false,
+        "showLabels": false,
+        "showTime": false,
+        "sortOrder": "Descending",
+        "wrapLogMessage": false
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "loki",
+            "uid": "${DS_LOKI}"
+          },
+          "editorMode": "code",
+          "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `ERROR` | line_format \"{{.threadId}} {{.filename_extracted}}:{{.line_number}}  {{.fields_message}}\"",
+          "queryType": "range",
+          "refId": "A"
+        }
+      ],
+      "title": "Errors",
+      "type": "logs"
+    },
+    {
+      "datasource": {
+        "type": "loki",
+        "uid": "${DS_LOKI}"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 26
+      },
+      "id": 2,
+      "options": {
+        "dedupStrategy": "none",
+        "enableLogDetails": true,
+        "prettifyLogMessage": false,
+        "showCommonLabels": false,
+        "showLabels": false,
+        "showTime": false,
+        "sortOrder": "Descending",
+        "wrapLogMessage": false
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "loki",
+            "uid": "${DS_LOKI}"
+          },
+          "editorMode": "code",
+          "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `DEBUG` | line_format \"{{.fields_message}}\"",
+          "queryType": "range",
+          "refId": "A"
+        }
+      ],
+      "title": "Debug",
+      "type": "logs"
+    },
+    {
+      "datasource": {
+        "type": "loki",
+        "uid": "${DS_LOKI}"
+      },
+      "gridPos": {
+        "h": 16,
+        "w": 12,
+        "x": 12,
+        "y": 26
+      },
+      "id": 4,
+      "options": {
+        "dedupStrategy": "none",
+        "enableLogDetails": true,
+        "prettifyLogMessage": false,
+        "showCommonLabels": false,
+        "showLabels": false,
+        "showTime": false,
+        "sortOrder": "Descending",
+        "wrapLogMessage": false
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "loki",
+            "uid": "${DS_LOKI}"
+          },
+          "editorMode": "code",
+          "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `TRACE` | line_format \"{{.fields_message}}\"",
+          "queryType": "range",
+          "refId": "A"
+        }
+      ],
+      "title": "Trace",
+      "type": "logs"
+    },
+    {
+      "datasource": {
+        "type": "loki",
+        "uid": "${DS_LOKI}"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 34
+      },
+      "id": 3,
+      "options": {
+        "dedupStrategy": "none",
+        "enableLogDetails": true,
+        "prettifyLogMessage": false,
+        "showCommonLabels": false,
+        "showLabels": false,
+        "showTime": false,
+        "sortOrder": "Descending",
+        "wrapLogMessage": false
+      },
+      "pluginVersion": "11.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "loki",
+            "uid": "${DS_LOKI}"
+          },
+          "editorMode": "code",
+          "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `WARN` | line_format \"{{.fields_message}}\"",
+          "queryType": "range",
+          "refId": "A"
+        }
+      ],
+      "title": "Warnings",
+      "type": "logs"
+    }
+  ],
+  "schemaVersion": 40,
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "datasource": {
+          "type": "loki",
+          "uid": "P8E80F9AEF21F6940"
+        },
+        "filters": [],
+        "name": "Filters",
+        "type": "adhoc"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-5m",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "Crawler",
+  "uid": "ceg90x34pqgowd",
+  "version": 21,
+  "weekStart": ""
+}
--- a/docker/grafana.yaml
+++ b/docker/grafana.yaml
@@ -0,0 +1,24 @@
+apiVersion: 1
+
+datasources:
+- name: Loki
+  type: loki
+  access: proxy 
+  orgId: 1
+  url: http://loki:3100
+  basicAuth: false
+  isDefault: true 
+  version: 1
+  editable: false
+- name: Prometheus
+  type: prometheus
+  uid: prometheus
+  access: proxy
+  orgId: 1
+  url: http://prometheus:9090
+  basicAuth: false
+  isDefault: false
+  version: 1
+  editable: false
+  jsonData:
+    httpMethod: GET
--- a/docker/loki.yaml
+++ b/docker/loki.yaml
@@ -0,0 +1,62 @@
+# this is mostly the default config from grafana's website
+
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+  grpc_listen_port: 9096
+  log_level: info
+  grpc_server_max_concurrent_streams: 1000
+
+common:
+  instance_addr: 127.0.0.1
+  path_prefix: /tmp/loki
+  storage:
+    filesystem:
+      chunks_directory: /tmp/loki/chunks
+      rules_directory: /tmp/loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+
+query_range:
+  results_cache:
+    cache:
+      embedded_cache:
+        enabled: true
+        max_size_mb: 100
+
+limits_config:
+  metric_aggregation_enabled: true
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+pattern_ingester:
+  enabled: true
+  metric_aggregation:
+    loki_address: localhost:3100
+
+frontend:
+  encoding: protobuf
+
+# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration
+# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/
+#
+# Statistics help us better understand how Loki is used, and they show us performance
+# levels for most users. This helps us prioritize features and documentation.
+# For more information on what's sent, look at
+# https://github.com/grafana/loki/blob/main/pkg/analytics/stats.go
+# Refer to the buildReport method to see what goes into a report.
+#
+# Reporting is disabled by the following lines; remove them to re-enable it:
+analytics:
+  reporting_enabled: false
--- a/docker/prometheus.yaml
+++ b/docker/prometheus.yaml
@@ -0,0 +1,17 @@
+global:
+  scrape_interval: 5s 
+  query_log_file: /etc/prometheus/query.log
+
+scrape_configs:
+  - job_name: crawler
+    static_configs:
+    # change this to your machine's ip, localhost won't work
+    # because localhost refers to the docker container.
+      - targets: ['172.20.239.48:2500']
+        #- targets: ['192.168.8.209:2500']
+  - job_name: loki
+    static_configs:
+      - targets: ['loki:3100']
+  - job_name: prometheus
+    static_configs:
+      - targets: ['localhost:9090']
--- a/jsconfig.json
+++ b/jsconfig.json
@@ -0,0 +1,16 @@
+{
+    "compilerOptions": {
+        "module": "ESNext",
+        "moduleResolution": "Bundler",
+        "target": "ES2022",
+        "jsx": "react",
+        "allowImportingTsExtensions": true,
+        "strictNullChecks": true,
+        "strictFunctionTypes": true
+    },
+    "exclude": [
+        "node_modules",
+        "**/node_modules/*"
+    ],
+    "typeAcquisition": {"include": ["firefox-webext-browser"]}
+}
--- a/pngs/graphana.png
+++ b/pngs/graphana.png
--- a/schema.surql
+++ b/schema.surql
@@ -1,2 +0,0 @@
-DEFINE TABLE website SCHEMALESS;
-    DEFINE FIELD accessed_at ON TABLE website VALUE time::now();
--- a/src/db.rs
+++ b/src/db.rs
@@ -1,23 +1,27 @@
+use metrics::counter;
+use rusqlite::Connection;
+use std::fmt::Debug;
 use serde::{Deserialize, Serialize};
-use surrealdb::{
-    engine::remote::ws::{Client, Ws},
-    opt::auth::Root,
-    sql::Thing,
-    Response, Surreal,
-};
-use tracing::{error, instrument, trace, warn};
+use tracing::{error, instrument, trace};
 use url::Url;

-use crate::Timer;
+use crate::Config;

-#[derive(Debug, Serialize, Deserialize, Clone)]
+const STORE: &str = "surql_store_calls";
+
+#[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Hash)]
 pub struct Website {
    /// The url that this data is found at
-    site: Url,
+    pub site: Url,
    /// Wether or not this link has been crawled yet
    pub crawled: bool,
-    #[serde(skip_serializing)]
-    id: Option<Thing>,
+}
+
+// manual impl to make tracing look nicer
+impl Debug for Website {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Website").field("site", &self.site).finish()
+    }
 }

 impl Website {
@@ -28,9 +32,8 @@ impl Website {
            Err(_) => todo!(),
        };
        Self {
-            id: None,
            crawled,
-            site,
+            site
        }
    }

@@ -39,101 +42,39 @@ impl Website {
        self.crawled = true
    }

-    pub fn mut_url(&mut self) -> &mut Url {
-        &mut self.site
-    }
+    // Insert every item in the vec into surreal, crawled state will be preserved as TRUE
+    // if already in the database as such or incoming data is TRUE.
+    pub async fn store_all(all: Vec<Self>, db: &Connection) {
+        counter!(STORE).increment(1);
+        let mut things = Vec::with_capacity(all.len());

-    #[instrument(skip_all)]
-    pub async fn links_to(&self, other: Vec<Thing>, db: &Surreal<Client>) {
-        let len = other.len();
-        if len == 0 {return}
+        rusqlite::ParamsFromIter;
+
+        db.execute("",
+            params![]
+        );

-        let from = self.site.to_string();
-        // let to = other.site.to_string();
-        trace!("Linking {from} to {} other pages.", other.len());
-        let msg = format!("Linked {len} pages");
-        let timer = Timer::start(&msg);
-        // prevent the timer from being dropped instantly.
-        let _ = timer;
        match db
-            .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)")
-            .bind(("in", from))
-            .bind(("out", other))
+            .query(
+                "INSERT INTO website $array
+                    ON DUPLICATE KEY UPDATE
+                        accessed_at = time::now(),
+                        crawled = crawled OR $input.crawled
+                    RETURN VALUE id;
+                 ",
+            )
+            .bind(("array", all))
            .await
        {
-            Ok(mut e) => {
-                // The relate could technically "fail" (not relate anything), this just means that
-                // the query was ok.
-                let _: Response = e;
-                if let Ok(vec) = e.take(0) {
-                    let _: Vec<usize> = vec;
-                    if let Some(num) = vec.get(0) {
-                        if *num == len {
-                            trace!("Link OK");
-                            return;
-                        } else {
-                            warn!("Didn't link all the records. {num}/{len}");
-                            return;
-                        }
-                    }
-                }
-                warn!("Linking request succeeded but couldn't verify the results.");
+            Ok(mut id) => match id.take::<Vec<Thing>>(0) {
+                Ok(mut x) => things.append(&mut x),
+                Err(err) => error!("{:?}", err),
            },
-            Err(e) => {
-                error!("{}", e.to_string());
-            },
-        }
-    }
-
-    #[instrument(skip_all)]
-    pub async fn store(&mut self, db: &Surreal<Client>) -> Option<Thing> {
-        // check if it's been gone thru before
-        let mut response = db
-            .query("SELECT * FROM ONLY website WHERE site = $site LIMIT 1")
-            .bind(("site", self.site.to_string()))
-            .await
-            .unwrap();
-
-        if let Some(old) = response.take::<Option<Website>>(0).unwrap() {
-            // site exists already
-            if let Some(id) = old.id {
-                // make sure to preserve the "crawled status"
-                let mut new = self.clone();
-                new.crawled = old.crawled | new.crawled;
-
-                // update the record
-                match db.upsert((id.tb, id.id.to_string())).content(new).await {
-                    Ok(e) => {
-                        if let Some(a) = e {
-                            let _: Record = a;
-                            return Some(a.id);
-                        }
-                    }
-                    Err(e) => {
-                        error!("{}", e);
-                    }
-                };
+            Err(err) => {
+                error!("{:?}", err);
            }
-        } else {
-            // sites hasn't existed yet
-            match db.create("website").content(self.clone()).await {
-                Ok(e) => {
-                    let _: Option<Record> = e;
-                    if let Some(a) = e {
-                        let _: Record = a;
-                        return Some(a.id);
-                    }
-                }
-                Err(a) => error!("{:?}", a),
-            };
        }
-        None
-    }
-}
-
-impl ToString for Website {
-    fn to_string(&self) -> String {
-        self.site.to_string()
+        things
    }
 }

@@ -149,19 +90,10 @@ pub struct Record {
    pub id: Thing,
 }

-pub async fn connect() -> surrealdb::Result<Surreal<Client>> {
+/// Opens the SQLite database used by the crawler.
+///
+/// The database path is hard-coded to `./squeelite.db`. `config` is accepted but is
+/// not read anywhere in this function body.
+///
+/// # Errors
+/// Returns the `rusqlite::Error` produced by `Connection::open`.
+#[instrument(skip_all, name = "sqlite_connect")]
+pub async fn connect(config: &Config) -> Result<Connection, rusqlite::Error> {
+    trace!("Establishing connection to sqlite...");
    // Connect to the server
-    let db = Surreal::new::<Ws>("127.0.0.1:8000").await?;
-
-    // Signin as a namespace, database, or root user
-    db.signin(Root {
-        username: "root",
-        password: "root",
-    })
-    .await?;
-
-    // Select a specific namespace / database
-    db.use_ns("test").use_db("v1.2").await?;
-
-    Ok(db)
+    Connection::open("./squeelite.db")
 }
+
--- a/src/filesystem.rs
+++ b/src/filesystem.rs
@@ -0,0 +1,70 @@
+use std::{ffi::OsStr, path::PathBuf};
+
+use tokio::fs;
+use tracing::{debug, error, instrument, trace, warn};
+use url::Url;
+
+/// Writes `data` to disk under `./downloaded/<domain><url path>`.
+///
+/// When the url path ends in an extension accepted by `valid_file_extension`, the
+/// last path segment is used as the file name. Otherwise the whole path is treated
+/// as a directory and the data is written to `index.html` inside it.
+///
+/// Failures are logged with `error!` and otherwise ignored; nothing is returned.
+#[instrument(skip(data))]
+pub async fn store(data: &str, url: &Url) {
+    // Mirror the url on disk; urls without a domain share one fallback folder
+    let domain = url.domain().unwrap_or("UnknownDomain");
+    let url_path = PathBuf::from(format!("./downloaded/{}{}", domain, url.path()));
+
+    let names_a_file = url_path.extension().filter(valid_file_extension).is_some();
+    let (basepath, filename) = match names_a_file {
+        true => {
+            // everything up to (but excluding) the file itself
+            let basepath = url_path
+                .parent()
+                .map(|dir| dir.to_path_buf())
+                .unwrap_or_default();
+            let filename = url_path
+                .file_name()
+                .expect("This should exist")
+                .to_string_lossy()
+                .to_string();
+            trace!("Save path: {:?} and base path: {:?}", &url_path, &basepath);
+            (basepath, filename)
+        }
+        false => (url_path.clone(), String::from("index.html")),
+    };
+
+    debug!("Writing at: {:?} {:?}", basepath, filename);
+
+    // The folders have to exist before the file can be written
+    if let Err(err) = fs::create_dir_all(&basepath).await {
+        error!("Dir creation: {err} {:?}", basepath);
+        return;
+    }
+
+    // FIXME I don't think this handles index.html files well...
+    // TODO this should probably append .html to non-described files
+    let target = basepath.join(filename);
+    if let Err(err) = fs::write(&target, data).await {
+        error!("File creation: {err} {:?}", url_path);
+    }
+}
+
+/// Returns `true` when `take` is one of the file extensions the crawler recognises.
+///
+/// The comparison is case-insensitive and only looks at the text after the last `.`.
+/// Unrecognised extensions are reported with `warn!` and rejected.
+fn valid_file_extension(take: &&OsStr) -> bool {
+    let los = take.to_string_lossy();
+    let Some(s) = los.rsplit('.').next() else {
+        return false;
+    };
+
+    let recognised = matches!(
+        s.to_lowercase().as_str(),
+        // documents, scripts and fonts
+        "html" | "css" | "js" | "ts" | "otf"
+        // media
+        | "png" | "svg" | "jpg" | "jpeg" | "mp4" | "mp3" | "webp"
+        // data
+        | "pdf" | "json" | "xml"
+    );
+
+    if !recognised {
+        warn!("Might be forgetting a file extension: {s}");
+    }
+    recognised
+}
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,73 +1,139 @@
+#![feature(ip_from)]
+
 extern crate html5ever;
-extern crate markup5ever_rcdom as rcdom;
+
+use std::{
+    collections::HashSet, fs::File, io::Read, net::{IpAddr, Ipv4Addr}
+};

 use db::{connect, Website};
-use html5ever::{
-    local_name, parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts,
-};
-use rcdom::RcDom;
-use std::time::Instant;
-use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};
-use tracing::{debug, info, instrument, trace, trace_span};
-use tracing_subscriber::EnvFilter;
+use metrics::{counter, gauge};
+use metrics_exporter_prometheus::PrometheusBuilder;
+use serde::Deserialize;
+use surrealdb::{engine::remote::ws::Client, Surreal};
+use tokio::task::JoinSet;
+use tracing::{debug, error, info, instrument, level_filters::LevelFilter, trace, trace_span};
+use tracing_subscriber::{fmt, layer::SubscriberExt, EnvFilter, Layer, Registry};

 mod db;
+mod parser;
+mod filesystem;
+
+// Names of the metrics recorded through the `counter!` / `gauge!` macros.
+/// Counter: bumped in `process` for every GET request that returned a response.
+const GET_METRIC: &str = "total_gets";
+/// Gauge: GET requests currently in flight.
+const GET_IN_FLIGHT: &str = "gets_in_flight";
+/// Counter: bumped in `main` each time a spawned `process` task is joined.
+const SITES_CRAWLED: &str = "pages_crawled";
+/// Gauge: `process` tasks that have been spawned but not yet joined.
+const BEING_PROCESSED: &str = "pages_being_processed";
+
+/// Runtime settings, deserialized in `main` from `./Crawler.toml`.
+#[derive(Deserialize)]
+struct Config {
+    // Connection settings. None of these fields are read by the code visible in this diff.
+    // NOTE(review): presumably left over from the SurrealDB backend — confirm they are still needed.
+    surreal_ns: String,
+    surreal_db: String,
+    surreal_url: String,
+    surreal_username: String,
+    surreal_password: String,
+
+    /// Passed to `get_uncrawled_links`, which binds it as `$format` in its `site ~ ...` condition.
+    crawl_filter: String,
+    /// Upper bound for the crawl loop in `main` (`while crawled < config.budget`).
+    budget: usize,
+}

 #[tokio::main]
 async fn main() {
-    tracing_subscriber::fmt()
-        .with_env_filter(EnvFilter::from_default_env())
-        .with_line_number(true)
-        .without_time()
-        .init();
-    debug!("Starting...");
+    let writer = std::fs::OpenOptions::new()
+        .append(true)
+        .create(true)
+        .open("./docker/logs/tracing.log")
+        .expect("Couldn't make log file!");

+    let filter = EnvFilter::builder()
+        .with_default_directive(LevelFilter::DEBUG.into())
+        .from_env_lossy();
+
+    let registry = Registry::default().with(
+        fmt::layer()
+            .with_line_number(true)
+            .with_thread_ids(true)
+            .with_file(true)
+            .json()
+            .with_writer(writer)
+            .with_filter(filter)
+    );
+
+    tracing::subscriber::set_global_default(registry).expect("Failed to set default subscriber");
+
+    let builder = PrometheusBuilder::new();
+    builder
+        .with_http_listener(std::net::SocketAddr::new(
+            IpAddr::V4(Ipv4Addr::from_octets([0, 0, 0, 0])),
+            2500,
+        ))
+        .install()
+        .expect("failed to install recorder/exporter");
+
+    info!("Starting...");
    // Would probably take these in as parameters from a cli
-    let url = "https://oliveratkinson.net/";
-    // let url = "http://localhost:5500";
-    let budget = 1000;
+    let starting_url = "https://en.wikipedia.org/";
+    // When getting uncrawled pages, name must contain this variable. "" will effectively get ignored.
+    // let crawl_filter = "en.wikipedia.org/";
+    // let budget = 50;
    let mut crawled = 0;

-    let db = connect().await.expect("Failed to connect to db, aborting.");
+    let mut file = File::open("./Crawler.toml").expect("Failed to read Crawler.toml");
+    let mut buf = String::new();
+    let _ = file.read_to_string(&mut buf);

-    let client = reqwest::Client::builder()
+    let config: Config = toml::from_str(&buf).expect("Failed to parse Crawler.toml");
+
+    let db = connect(&config)
+        .await
+        .expect("Failed to connect to surreal, aborting.");
+
+    let reqwest = reqwest::Client::builder()
        // .use_rustls_tls()
+        .gzip(true)
        .build()
-        .unwrap();
+        .expect("Failed to build reqwest client.");

    // Kick off the whole machine - This Website object doesn't matter, it's just to allow for
    // get() to work.
    let span = trace_span!("Pre-Loop");
    let pre_loop_span = span.enter();
    // Download the site
-    let mut site = Website::new(&url, false);
-    get(&mut site, &db, &client, &mut crawled).await;
+    let site = Website::new(starting_url, false);
+    process(site, db.clone(), reqwest.clone()).await;

    drop(pre_loop_span);

    let span = trace_span!("Loop");
    let span = span.enter();
-    while crawled < budget {
-        let get_num = if budget - crawled < 100 {
-            budget - crawled
+    while crawled < config.budget {
+        let get_num = if config.budget - crawled < 100 {
+            config.budget - crawled
        } else {
            100
        };

-        let uncrawled = get_uncrawled_links(&db, get_num).await;
-        if uncrawled.len() == 0 {
+        let uncrawled = get_uncrawled_links(&db, get_num, config.crawl_filter.clone()).await;
+        if uncrawled.is_empty() {
            info!("Had more budget but finished crawling everything.");
            return;
        }
-        debug!("Crawling {} pages...", uncrawled.len());

-        let span = trace_span!("Crawling");
-        let _ = span.enter();
+        {
+            let mut futures = JoinSet::new();
+            for site in uncrawled {
+                gauge!(BEING_PROCESSED).increment(1);
+                futures.spawn(process(site, db.clone(), reqwest.clone()));
+                // let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32);
+                // info!("Crawled {crawled} out of {budget} pages. ({percent})");
+            }

-        for mut site in uncrawled {
-            get(&mut site, &db, &client, &mut crawled).await;
-            let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32);
-            info!("Crawled {crawled} out of {budget} pages. ({percent})");
+            let c = counter!(SITES_CRAWLED);
+            // As futures complete runs code in while block
+            while futures.join_next().await.is_some() {
+                c.increment(1);
+                gauge!(BEING_PROCESSED).decrement(1);
+                crawled += 1;
+            }
        }
    }
    drop(span);
@@ -75,125 +141,76 @@ async fn main() {
    info!("Done");
 }

-#[instrument(skip_all)]
-/// A quick helper function for downloading a url
-async fn get(
-    site: &mut Website,
-    db: &Surreal<Client>,
-    request_client: &reqwest::Client,
-    count: &mut usize,
-) {
-    trace!("Get: {}", site.to_string());
-    let timer = Timer::start("Got page");
+#[instrument(skip(db, reqwest))]
+/// Downloads a webpage, stores its body, parses it for links and records the results in the db.
+///
+/// `db` and `reqwest` are taken by value so every spawned task owns its own handle. The
+/// original author notes that cloning them is acceptable because they use `Arc`s internally.
+///
+/// When the GET request fails an error is logged and nothing is stored.
+///
+/// # Panics
+/// Panics if the response body cannot be read.
+async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Client) {
+    trace!("Process: {}", &site.site);
+    // Build the request
+    let request_builder = reqwest.get(site.site.to_string());

-    if let Ok(response) = request_client.get(site.to_string()).send().await {
-        timer.stop();
+    // METRICS
+    let g = gauge!(GET_IN_FLIGHT);
+    g.increment(1);

-        // Get body
-        let data = response.text().await.unwrap();
-        let opts = ParseOpts {
-            tree_builder: TreeBuilderOpts {
-                drop_doctype: true,
-                ..Default::default()
-            },
-            ..Default::default()
-        };
-        // Get DOM
-        let dom = parse_document(RcDom::default(), opts)
-            .from_utf8()
-            .read_from(&mut data.as_bytes())
-            .unwrap();
+    // Send the http request (get)
+    let response = request_builder.send().await;
+
+    // METRICS
+    // The request is over whether it succeeded or not. Decrementing only on success
+    // would leave the in-flight gauge permanently inflated by every failed request.
+    g.decrement(1);
+
+    if let Ok(response) = response {

-        // TODO save the dom to minio if a flag is set
+        // METRICS
+        counter!(GET_METRIC).increment(1);

-        // Modify record in database
+        // Get body from response
+        let data = response
+            .text()
+            .await
+            .expect("Failed to read http response's body!");
+
+        // Store document
+        filesystem::store(&data, &site.site).await;
+
+        // Parse document and get relationships
+        let sites = parser::parse(&site, &data).await;
+
+        // update self in db
        site.set_crawled();
-        site.store(db).await;
-        trace!("Got: {}", site.to_string());
+        Website::store_all(vec![site], &db).await;

-        // Walk all the children nodes, searching for links to other pages.
-        let mut buffer = Vec::new();
-        let timer = Timer::start("Walked");
-        walk(&dom.document, &db, &site, &mut buffer).await;
-        timer.stop();
+        // De-duplicate this list
+        let prev_len = sites.len();
+        let de_dupe_sites: Vec<Website> = sites
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .into_iter()
+            .collect();
+        let diff = prev_len - de_dupe_sites.len();
+        trace!("Saved {diff} from being entered into the db by de-duping");

-        // Put all the found links into the database.
-        site.links_to(buffer, &db).await;
-        *count += 1;
-    }
-    trace!("Failed to get: {}", site.to_string());
-}
+        // Store all the other sites so that we can link to them.
+        let _ = Website::store_all(de_dupe_sites, &db).await;

-/// Walks the givin site, placing it's findings in the database
-async fn walk(
-    node: &rcdom::Handle,
-    db: &Surreal<Client>,
-    site: &Website,
-    links_to: &mut Vec<Thing>,
-) {
-    let span = trace_span!("Walk");
-    let span = span.enter();
-    // Match each node - node basically means element.
-    match &node.data {
-        rcdom::NodeData::Element { name, attrs, .. } => {
-            for attr in attrs.borrow().clone() {
-                match name.local {
-                    local_name!("a")
-                    | local_name!("audio")
-                    | local_name!("area")
-                    | local_name!("img")
-                    | local_name!("link")
-                    | local_name!("object")
-                    | local_name!("source")
-                    | local_name!("base")
-                    | local_name!("video") => {
-                        let attribute_name = attr.name.local.to_string();
-                        if attribute_name == "src"
-                            || attribute_name == "href"
-                            || attribute_name == "data"
-                        {
-                            // Get clone of the current site object
-                            let mut web = site.clone();
-
-                            // Set url
-                            let url = web.mut_url();
-                            url.set_fragment(None); // removes #xyz
-                            let joined = url.join(&attr.value).unwrap();
-                            *url = joined;
-
-                            // Set other attributes
-                            web.crawled = false;
-                            // TODO set element name
-                            // let element_name = name.local.to_string();
-
-                            if let Some(id) = web.store(db).await {
-                                links_to.push(id);
-                            }
-                        }
-                    }
-                    local_name!("button") | local_name!("meta") | local_name!("iframe") => {
-                        // dbg!(attrs);
-                    }
-                    _ => {}
-                };
-            }
-        }
-        _ => {}
-    };
-    drop(span);
-    for child in node.children.borrow().iter() {
-        Box::pin(walk(child, db, site, links_to)).await;
+    } else {
+        error!("Failed to get: {}", &site.site);
    }
 }

 /// Returns uncrawled links
-async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Website> {
+#[instrument(skip(db))]
+async fn get_uncrawled_links(
+    db: &Surreal<Client>,
+    mut count: usize,
+    filter: String,
+) -> Vec<Website> {
    if count > 100 {
        count = 100
    }
+    debug!("Getting uncrawled links");

    let mut response = db
-        .query("SELECT * FROM website WHERE crawled = false LIMIT $count")
+        .query("SELECT * FROM website WHERE crawled = false AND site ~ type::string($format) LIMIT $count;")
+        .bind(("format", filter))
        .bind(("count", count))
        .await
        .expect("Hard-coded query failed..?");
@@ -202,29 +219,3 @@ async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Webs
        .expect("Returned websites couldn't be parsed")
 }

-pub struct Timer<'a> {
-    start: Instant,
-    msg: &'a str,
-}
-
-impl<'a> Timer<'a> {
-    #[inline]
-    pub fn start(msg: &'a str) -> Self {
-        Self {
-            start: Instant::now(),
-            msg,
-        }
-    }
-    pub fn stop(&self) -> f64 {
-        let dif = self.start.elapsed().as_micros();
-        let ms = dif as f64 / 1000.;
-        debug!("{}", format!("{} in {:.3}ms", self.msg, ms));
-        ms
-    }
-}
-
-impl Drop for Timer<'_> {
-    fn drop(&mut self) {
-        self.stop();
-    }
-}
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -0,0 +1,145 @@
+use std::default::Default;
+use std::str::FromStr;
+
+use html5ever::tokenizer::{BufferQueue, TokenizerResult};
+use html5ever::tokenizer::{StartTag, TagToken};
+use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
+use html5ever::{local_name, tendril::*};
+use tracing::{debug, error, instrument, trace, warn};
+use url::Url;
+
+use crate::db::Website;
+
+/// Lets a `Website` act as the tokenizer's sink: every link-bearing start tag is
+/// turned into a list of `Website`s and handed back through `TokenSinkResult::Script`.
+impl TokenSink for Website {
+    type Handle = Vec<Website>;
+
+    #[instrument(skip(token, _line_number))]
+    fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
+        // Only opening tags are inspected; every other token is passed over
+        let tag = match token {
+            TagToken(tag) if tag.kind == StartTag => tag,
+            _ => return TokenSinkResult::Continue,
+        };
+
+        // this should be all the html elements that have links
+        let has_links = matches!(
+            tag.name,
+            local_name!("a")
+                | local_name!("audio")
+                | local_name!("area")
+                | local_name!("img")
+                | local_name!("link")
+                | local_name!("object")
+                | local_name!("source")
+                | local_name!("base")
+                | local_name!("video")
+        );
+        if !has_links {
+            // `button`, `meta` and `iframe` were singled out as candidates but are not handled
+            return TokenSinkResult::Continue;
+        }
+
+        let mut links = Vec::new();
+        for attr in &tag.attrs {
+            let attr_name = attr.name.local.to_string();
+            if !matches!(attr_name.as_str(), "src" | "href" | "data") {
+                continue;
+            }
+            trace!("Found `{}` in html `{}` tag", &attr.value, tag.name);
+
+            let Some(mut parsed) = try_get_url(&self.site, &attr.value) else {
+                continue;
+            };
+            // Query and fragment are stripped before the url is recorded
+            parsed.set_query(None);
+            parsed.set_fragment(None);
+            debug!("Final cleaned URL: `{}`", parsed.to_string());
+            links.push(Website::new(&parsed.to_string(), false));
+        }
+
+        // Returned even when empty, exactly one result per link-bearing tag
+        TokenSinkResult::Script(links)
+    }
+}
+
+/// Tokenizes `data` as html and returns every `Website` that `site` links to.
+///
+/// A clone of `site` is used as the token sink. Each time the tokenizer pauses with a
+/// batch of links, that batch is added to the result.
+///
+/// # Panics
+/// Panics if `data` cannot be turned into a UTF-8 tendril, or if input is left in the
+/// buffer once the tokenizer reports that it is done.
+#[instrument(skip_all)]
+pub async fn parse(site: &Website, data: &str) -> Vec<Website> {
+    let mut other_sites: Vec<Website> = Vec::new();
+
+    // The tokenizer reads from a queue of tendrils, so wrap the input in one
+    let chunk = Tendril::from_str(data).expect("Failed to parse string into Tendril!");
+    let utf8_chunk = chunk
+        .try_reinterpret::<fmt::UTF8>()
+        .expect("Failed to reinterpret chunk!");
+    let token_buffer = BufferQueue::default();
+    token_buffer.push_back(utf8_chunk);
+
+    let tokenizer = Tokenizer::new(site.clone(), TokenizerOpts::default());
+
+    // Keep feeding until the tokenizer stops handing back batches of links
+    loop {
+        match tokenizer.feed(&token_buffer) {
+            TokenizerResult::Script(batch) => other_sites.extend(batch),
+            TokenizerResult::Done => break,
+        }
+    }
+    assert!(token_buffer.is_empty());
+    tokenizer.end();
+
+    other_sites
+}
+
+/// Resolves `link`, as found in a document fetched from `parent`, into an absolute url.
+///
+/// * Absolute links are returned as parsed.
+/// * Links starting with `#` only point inside the current document and are rejected.
+/// * Every other relative form (`//host/path`, `/path`, `path`, `../path`, `?query`) is
+///   resolved against `parent` with `Url::join`. That keeps the parent's scheme and
+///   directory instead of gluing the link onto the origin, which used to turn
+///   `page.html` into the host `example.compage.html`.
+///
+/// Returns `None` (after logging) when the link cannot be turned into a url.
+#[instrument]
+fn try_get_url(parent: &Url, link: &str) -> Option<Url> {
+    match Url::parse(link) {
+        Ok(url) => Some(url),
+        Err(url::ParseError::RelativeUrlWithoutBase) => {
+            if link.starts_with('#') {
+                trace!("Rejecting # url");
+                return None;
+            }
+
+            match parent.join(link) {
+                Ok(url) => {
+                    trace!("Saved relative url `{}` AS: `{}`", link, url);
+                    Some(url)
+                }
+                Err(err) => {
+                    error!(
+                        "Failed to reconstruct a url from relative url: `{}` on site: `{}` ({})",
+                        link, parent, err
+                    );
+                    None
+                }
+            }
+        }
+        Err(e) => {
+            error!("MISC error: {:?} {:?}", e, link);
+            None
+        }
+    }
+}
--- a/src/setup.surql
+++ b/src/setup.surql
@@ -0,0 +1,9 @@
+-- Schema for the `website` table. Every statement uses IF NOT EXISTS.
+DEFINE TABLE IF NOT EXISTS website SCHEMALESS;
+
+-- One record per `site` value: the index on `site` is UNIQUE.
+DEFINE FIELD IF NOT EXISTS site ON TABLE website TYPE string;
+DEFINE INDEX IF NOT EXISTS idx ON TABLE website COLUMNS site UNIQUE;
+
+DEFINE FIELD IF NOT EXISTS crawled ON TABLE website TYPE bool;
+
+-- NOTE(review): both timestamp fields below have identical definitions (VALUE time::now()).
+-- Confirm that `first_accessed_at` really keeps its first value when a record is updated.
+DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now();
+DEFINE FIELD IF NOT EXISTS first_accessed_at ON TABLE website VALUE time::now();
Author	SHA1	Message	Date
Oliver	4989a59ddf	checkpoint	2025-07-10 18:46:25 -06:00
Rushmore75	6fc71c7a78	add speed improvements	2025-03-21 12:14:29 -06:00
Rushmore75	96a3ca092a	:)	2025-03-21 12:11:05 -06:00
Rushmore75	b750d88d48	working filesystem storage	2025-03-21 11:42:43 -06:00
Oliver	808790a7c3	file patch;	2025-03-21 07:11:51 +00:00
Oliver	2de01b2a0e	remove removed code	2025-03-21 06:48:39 +00:00
Oliver	be0fd5505b	i think the files work better	2025-03-21 06:48:17 +00:00
Oliver	a23429104c	dead code removal	2025-03-21 06:03:34 +00:00
Oliver	66581cc453	getting there	2025-03-21 05:59:40 +00:00
Rushmore75	7df19a480f	updates	2025-03-20 15:11:01 -06:00
Rushmore75	b9c1f0b492	readme updates	2025-03-19 15:05:32 -06:00
Rushmore75	71b7b2d7bc	it works and it is awesome	2025-03-19 15:04:00 -06:00
Rushmore75	bac3cd9d1d	add most recent long run	2025-03-19 15:03:49 -06:00
Rushmore75	1f6a0acce3	shutup spellchecker	2025-03-19 15:03:39 -06:00
Rushmore75	53dbf53ab9	newest settings	2025-03-19 15:03:24 -06:00
Rushmore75	0477bb26e4	viz improvements	2025-03-19 15:03:11 -06:00
Rushmore75	6409baaffb	Reducted trips to surreal by x500	2025-03-19 12:41:08 -06:00
Oliver	135a7e4957	Merge pull request 'multithreading' (#2 ) from multithreading into main Reviewed-on: #2	2025-03-19 05:00:59 +00:00
Oliver	9aa34b3eee	epic metrics	2025-03-19 04:59:50 +00:00
Rushmore75	de80418c00	better logging	2025-03-18 16:09:46 -06:00
Rushmore75	e3e4175f51	logging improvements	2025-03-18 15:25:56 -06:00
Rushmore75	d11e7dd27c	the biggest 1 line improvement ever	2025-03-18 15:25:40 -06:00
Rushmore75	f2a3e836a0	spelling and clippy	2025-03-18 15:08:29 -06:00
Rushmore75	3b4e6a40ce	minimize vec resizing	2025-03-18 15:07:50 -06:00
Rushmore75	bd0b946245	fixed tracing	2025-03-18 15:02:32 -06:00
Rushmore75	b7540a4680	checkpoint - onto profiling	2025-03-18 10:53:06 -06:00
Oliver Atkinson	82929fd0fc	updating for base64	2024-12-13 13:28:24 -07:00
Oliver Atkinson	f42e770a10	moved to other repo	2024-12-13 11:01:35 -07:00
Oliver Atkinson	611a1e923b	starting on the extension	2024-12-12 15:32:04 -07:00
Oliver Atkinson	298ad39a79	rename	2024-12-12 14:59:54 -07:00
Oliver Atkinson	215056e493	use contains operator for better output	2024-12-12 14:26:49 -07:00
Oliver Atkinson	22be3b2f61	updating deps	2024-12-12 14:14:38 -07:00
Oliver Atkinson	c1c8cf07bb	unifed settings for testing	2024-12-12 11:42:07 -07:00
oliver	0f8a3d7215	using a custom parser now :)	2024-11-12 23:08:09 -07:00
oliver	574a370f30	readme updates	2024-11-12 21:24:57 -07:00
oliver	eaa79b749e	prepare get function for s3	2024-11-12 21:19:05 -07:00
oliver	2c28d69d55	add s3 support	2024-11-12 21:03:58 -07:00