Compare commits
	
		
			6 Commits
		
	
	
		
			135a7e4957
			...
			71b7b2d7bc
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 71b7b2d7bc | |||
| bac3cd9d1d | |||
| 1f6a0acce3 | |||
| 53dbf53ab9 | |||
| 0477bb26e4 | |||
| 6409baaffb | 
							
								
								
									
										8
									
								
								.vscode/settings.json
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								.vscode/settings.json
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,8 @@ | ||||
| { | ||||
|     "cSpell.words": [ | ||||
|         "creds", | ||||
|         "reqwest", | ||||
|         "rustls", | ||||
|         "surql" | ||||
|     ] | ||||
| } | ||||
							
								
								
									
										10
									
								
								Crawler.toml
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								Crawler.toml
									
									
									
									
									
								
							| @@ -3,14 +3,14 @@ surreal_url = "localhost:8000" | ||||
| surreal_username = "root" | ||||
| surreal_password = "root" | ||||
| surreal_ns = "test" | ||||
| surreal_db = "v1.12" | ||||
| surreal_db = "v1.15.4" | ||||
|  | ||||
| # Minio config | ||||
| s3_bucket = "v1.12" | ||||
| s3_bucket = "v1.15.4" | ||||
| s3_url = "http://localhost:9000" | ||||
| s3_access_key = "jLDPKGuu513VENc8kJwX" | ||||
| s3_secret_key = "4T1nymEzsGYOlKSAb1WX7V3stnQn9a5ZoTQjDfcL" | ||||
| s3_access_key = "3ptjsHhRHCHlpCmgFy9n" | ||||
| s3_secret_key = "68CmV07YExeCxb8kJhosSauEizj5CAE7PINZIfQz" | ||||
|  | ||||
| # Crawler config | ||||
| crawl_filter = "en.wikipedia.com"  | ||||
| budget = 200 | ||||
| budget = 1000 | ||||
|   | ||||
| @@ -8,4 +8,5 @@ Crawls sites saving all the found links to a surrealdb database. It then proceed | ||||
| - [ ] Domain filtering - prevent the crawler from going on alternate versions of wikipedia. | ||||
| - [ ] Conditionally save content - based on filename or file contents | ||||
| - [ ] GUI / TUI ? | ||||
| - [ ] Better asynchronous getting of the sites. Currently it all happens serially. | ||||
| - [ ] Better asynchronous getting of the sites. Currently it all happens serially.3/19/25: Took 20min to crawl 100 pages | ||||
| This ment we stored 100 pages, 142,997 urls, and 1,425,798 links between the two. | ||||
|   | ||||
| @@ -22,6 +22,7 @@ services: | ||||
|     environment: | ||||
|       - MINIO_ROOT_USER=root | ||||
|       - MINIO_ROOT_PASSWORD=an8charpassword | ||||
|       - MINIO_PROMETHEUS_AUTH_TYPE=public | ||||
|     volumes: | ||||
|       - minio_storage:/data | ||||
|     command: | ||||
| @@ -66,6 +67,7 @@ services: | ||||
|     image: grafana/grafana:latest | ||||
|     volumes: | ||||
|       - ./grafana.yaml:/etc/grafana/provisioning/datasources/datasources.yaml | ||||
|       - ./dashboards:/var/lib/grafana/dashboards | ||||
|       - grafana_storage:/var/lib/grafana | ||||
|     environment: | ||||
|       - GF_AUTH_ANONYMOUS_ENABLED=true | ||||
| @@ -80,4 +82,4 @@ volumes: | ||||
|   grafana_storage: | ||||
|   alloy_storage: | ||||
|   surrealdb_storage: | ||||
|   minio_storage: | ||||
|   minio_storage: | ||||
|   | ||||
							
								
								
									
										648
									
								
								docker/dashboards/crawler-dashboard.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										648
									
								
								docker/dashboards/crawler-dashboard.json
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,648 @@ | ||||
| { | ||||
|   "__inputs": [ | ||||
|     { | ||||
|       "name": "DS_PROMETHEUS", | ||||
|       "label": "Prometheus", | ||||
|       "description": "", | ||||
|       "type": "datasource", | ||||
|       "pluginId": "prometheus", | ||||
|       "pluginName": "Prometheus" | ||||
|     }, | ||||
|     { | ||||
|       "name": "DS_LOKI", | ||||
|       "label": "Loki", | ||||
|       "description": "", | ||||
|       "type": "datasource", | ||||
|       "pluginId": "loki", | ||||
|       "pluginName": "Loki" | ||||
|     } | ||||
|   ], | ||||
|   "__elements": {}, | ||||
|   "__requires": [ | ||||
|     { | ||||
|       "type": "grafana", | ||||
|       "id": "grafana", | ||||
|       "name": "Grafana", | ||||
|       "version": "11.3.1" | ||||
|     }, | ||||
|     { | ||||
|       "type": "panel", | ||||
|       "id": "logs", | ||||
|       "name": "Logs", | ||||
|       "version": "" | ||||
|     }, | ||||
|     { | ||||
|       "type": "datasource", | ||||
|       "id": "loki", | ||||
|       "name": "Loki", | ||||
|       "version": "1.0.0" | ||||
|     }, | ||||
|     { | ||||
|       "type": "datasource", | ||||
|       "id": "prometheus", | ||||
|       "name": "Prometheus", | ||||
|       "version": "1.0.0" | ||||
|     }, | ||||
|     { | ||||
|       "type": "panel", | ||||
|       "id": "stat", | ||||
|       "name": "Stat", | ||||
|       "version": "" | ||||
|     }, | ||||
|     { | ||||
|       "type": "panel", | ||||
|       "id": "timeseries", | ||||
|       "name": "Time series", | ||||
|       "version": "" | ||||
|     } | ||||
|   ], | ||||
|   "annotations": { | ||||
|     "list": [ | ||||
|       { | ||||
|         "builtIn": 1, | ||||
|         "datasource": { | ||||
|           "type": "grafana", | ||||
|           "uid": "-- Grafana --" | ||||
|         }, | ||||
|         "enable": true, | ||||
|         "hide": true, | ||||
|         "iconColor": "rgba(0, 211, 255, 1)", | ||||
|         "name": "Annotations & Alerts", | ||||
|         "type": "dashboard" | ||||
|       } | ||||
|     ] | ||||
|   }, | ||||
|   "editable": true, | ||||
|   "fiscalYearStartMonth": 0, | ||||
|   "graphTooltip": 0, | ||||
|   "id": null, | ||||
|   "links": [], | ||||
|   "panels": [ | ||||
|     { | ||||
|       "datasource": { | ||||
|         "type": "prometheus", | ||||
|         "uid": "${DS_PROMETHEUS}" | ||||
|       }, | ||||
|       "fieldConfig": { | ||||
|         "defaults": { | ||||
|           "color": { | ||||
|             "mode": "palette-classic" | ||||
|           }, | ||||
|           "custom": { | ||||
|             "axisBorderShow": false, | ||||
|             "axisCenteredZero": false, | ||||
|             "axisColorMode": "text", | ||||
|             "axisLabel": "", | ||||
|             "axisPlacement": "auto", | ||||
|             "barAlignment": 0, | ||||
|             "barWidthFactor": 0.6, | ||||
|             "drawStyle": "line", | ||||
|             "fillOpacity": 0, | ||||
|             "gradientMode": "none", | ||||
|             "hideFrom": { | ||||
|               "legend": false, | ||||
|               "tooltip": false, | ||||
|               "viz": false | ||||
|             }, | ||||
|             "insertNulls": false, | ||||
|             "lineInterpolation": "linear", | ||||
|             "lineWidth": 1, | ||||
|             "pointSize": 5, | ||||
|             "scaleDistribution": { | ||||
|               "type": "linear" | ||||
|             }, | ||||
|             "showPoints": "auto", | ||||
|             "spanNulls": 300000, | ||||
|             "stacking": { | ||||
|               "group": "A", | ||||
|               "mode": "none" | ||||
|             }, | ||||
|             "thresholdsStyle": { | ||||
|               "mode": "off" | ||||
|             } | ||||
|           }, | ||||
|           "mappings": [], | ||||
|           "thresholds": { | ||||
|             "mode": "absolute", | ||||
|             "steps": [ | ||||
|               { | ||||
|                 "color": "green", | ||||
|                 "value": null | ||||
|               }, | ||||
|               { | ||||
|                 "color": "red", | ||||
|                 "value": 80 | ||||
|               } | ||||
|             ] | ||||
|           } | ||||
|         }, | ||||
|         "overrides": [] | ||||
|       }, | ||||
|       "gridPos": { | ||||
|         "h": 8, | ||||
|         "w": 8, | ||||
|         "x": 0, | ||||
|         "y": 0 | ||||
|       }, | ||||
|       "id": 5, | ||||
|       "options": { | ||||
|         "legend": { | ||||
|           "calcs": [], | ||||
|           "displayMode": "list", | ||||
|           "placement": "bottom", | ||||
|           "showLegend": true | ||||
|         }, | ||||
|         "tooltip": { | ||||
|           "mode": "single", | ||||
|           "sort": "none" | ||||
|         } | ||||
|       }, | ||||
|       "pluginVersion": "11.3.1", | ||||
|       "targets": [ | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "prometheus", | ||||
|             "uid": "${DS_PROMETHEUS}" | ||||
|           }, | ||||
|           "disableTextWrap": false, | ||||
|           "editorMode": "builder", | ||||
|           "expr": "surql_trips", | ||||
|           "fullMetaSearch": false, | ||||
|           "includeNullMetadata": true, | ||||
|           "legendFormat": "Trips to Surreal", | ||||
|           "range": true, | ||||
|           "refId": "A", | ||||
|           "useBackend": false | ||||
|         }, | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "prometheus", | ||||
|             "uid": "${DS_PROMETHEUS}" | ||||
|           }, | ||||
|           "disableTextWrap": false, | ||||
|           "editorMode": "builder", | ||||
|           "expr": "s3_trips", | ||||
|           "fullMetaSearch": false, | ||||
|           "hide": false, | ||||
|           "includeNullMetadata": true, | ||||
|           "instant": false, | ||||
|           "legendFormat": "Trips to S3", | ||||
|           "range": true, | ||||
|           "refId": "B", | ||||
|           "useBackend": false | ||||
|         }, | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "prometheus", | ||||
|             "uid": "${DS_PROMETHEUS}" | ||||
|           }, | ||||
|           "disableTextWrap": false, | ||||
|           "editorMode": "builder", | ||||
|           "expr": "pages_crawled", | ||||
|           "fullMetaSearch": false, | ||||
|           "hide": false, | ||||
|           "includeNullMetadata": true, | ||||
|           "instant": false, | ||||
|           "legendFormat": "total crawled", | ||||
|           "range": true, | ||||
|           "refId": "C", | ||||
|           "useBackend": false | ||||
|         }, | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "prometheus", | ||||
|             "uid": "${DS_PROMETHEUS}" | ||||
|           }, | ||||
|           "disableTextWrap": false, | ||||
|           "editorMode": "builder", | ||||
|           "expr": "pages_being_processed", | ||||
|           "fullMetaSearch": false, | ||||
|           "hide": false, | ||||
|           "includeNullMetadata": true, | ||||
|           "instant": false, | ||||
|           "legendFormat": "Pages being processed", | ||||
|           "range": true, | ||||
|           "refId": "E", | ||||
|           "useBackend": false | ||||
|         }, | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "prometheus", | ||||
|             "uid": "${DS_PROMETHEUS}" | ||||
|           }, | ||||
|           "disableTextWrap": false, | ||||
|           "editorMode": "builder", | ||||
|           "expr": "gets_in_flight", | ||||
|           "fullMetaSearch": false, | ||||
|           "hide": false, | ||||
|           "includeNullMetadata": true, | ||||
|           "instant": false, | ||||
|           "legendFormat": "__auto", | ||||
|           "range": true, | ||||
|           "refId": "D", | ||||
|           "useBackend": false | ||||
|         } | ||||
|       ], | ||||
|       "title": "Crawler stats", | ||||
|       "type": "timeseries" | ||||
|     }, | ||||
|     { | ||||
|       "datasource": { | ||||
|         "type": "prometheus", | ||||
|         "uid": "${DS_PROMETHEUS}" | ||||
|       }, | ||||
|       "fieldConfig": { | ||||
|         "defaults": { | ||||
|           "color": { | ||||
|             "mode": "palette-classic" | ||||
|           }, | ||||
|           "custom": { | ||||
|             "axisBorderShow": false, | ||||
|             "axisCenteredZero": false, | ||||
|             "axisColorMode": "text", | ||||
|             "axisLabel": "", | ||||
|             "axisPlacement": "auto", | ||||
|             "barAlignment": 0, | ||||
|             "barWidthFactor": 0.6, | ||||
|             "drawStyle": "line", | ||||
|             "fillOpacity": 0, | ||||
|             "gradientMode": "none", | ||||
|             "hideFrom": { | ||||
|               "legend": false, | ||||
|               "tooltip": false, | ||||
|               "viz": false | ||||
|             }, | ||||
|             "insertNulls": false, | ||||
|             "lineInterpolation": "linear", | ||||
|             "lineWidth": 1, | ||||
|             "pointSize": 5, | ||||
|             "scaleDistribution": { | ||||
|               "type": "linear" | ||||
|             }, | ||||
|             "showPoints": "auto", | ||||
|             "spanNulls": 300000, | ||||
|             "stacking": { | ||||
|               "group": "A", | ||||
|               "mode": "none" | ||||
|             }, | ||||
|             "thresholdsStyle": { | ||||
|               "mode": "off" | ||||
|             } | ||||
|           }, | ||||
|           "mappings": [], | ||||
|           "thresholds": { | ||||
|             "mode": "absolute", | ||||
|             "steps": [ | ||||
|               { | ||||
|                 "color": "green", | ||||
|                 "value": null | ||||
|               }, | ||||
|               { | ||||
|                 "color": "red", | ||||
|                 "value": 80 | ||||
|               } | ||||
|             ] | ||||
|           } | ||||
|         }, | ||||
|         "overrides": [] | ||||
|       }, | ||||
|       "gridPos": { | ||||
|         "h": 8, | ||||
|         "w": 9, | ||||
|         "x": 8, | ||||
|         "y": 0 | ||||
|       }, | ||||
|       "id": 6, | ||||
|       "options": { | ||||
|         "legend": { | ||||
|           "calcs": [], | ||||
|           "displayMode": "list", | ||||
|           "placement": "bottom", | ||||
|           "showLegend": true | ||||
|         }, | ||||
|         "tooltip": { | ||||
|           "mode": "single", | ||||
|           "sort": "none" | ||||
|         } | ||||
|       }, | ||||
|       "pluginVersion": "11.3.1", | ||||
|       "targets": [ | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "prometheus", | ||||
|             "uid": "${DS_PROMETHEUS}" | ||||
|           }, | ||||
|           "disableTextWrap": false, | ||||
|           "editorMode": "builder", | ||||
|           "expr": "surql_trips", | ||||
|           "fullMetaSearch": false, | ||||
|           "includeNullMetadata": true, | ||||
|           "legendFormat": "Trips to Surreal", | ||||
|           "range": true, | ||||
|           "refId": "A", | ||||
|           "useBackend": false | ||||
|         }, | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "prometheus", | ||||
|             "uid": "${DS_PROMETHEUS}" | ||||
|           }, | ||||
|           "disableTextWrap": false, | ||||
|           "editorMode": "builder", | ||||
|           "expr": "surql_link_calls", | ||||
|           "fullMetaSearch": false, | ||||
|           "hide": false, | ||||
|           "includeNullMetadata": true, | ||||
|           "instant": false, | ||||
|           "legendFormat": "link calls", | ||||
|           "range": true, | ||||
|           "refId": "B", | ||||
|           "useBackend": false | ||||
|         }, | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "prometheus", | ||||
|             "uid": "${DS_PROMETHEUS}" | ||||
|           }, | ||||
|           "disableTextWrap": false, | ||||
|           "editorMode": "builder", | ||||
|           "expr": "surql_store_calls", | ||||
|           "fullMetaSearch": false, | ||||
|           "hide": false, | ||||
|           "includeNullMetadata": true, | ||||
|           "instant": false, | ||||
|           "legendFormat": "store calls", | ||||
|           "range": true, | ||||
|           "refId": "C", | ||||
|           "useBackend": false | ||||
|         }, | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "prometheus", | ||||
|             "uid": "${DS_PROMETHEUS}" | ||||
|           }, | ||||
|           "disableTextWrap": false, | ||||
|           "editorMode": "builder", | ||||
|           "expr": "pages_being_processed", | ||||
|           "fullMetaSearch": false, | ||||
|           "hide": false, | ||||
|           "includeNullMetadata": true, | ||||
|           "instant": false, | ||||
|           "legendFormat": "Pages being processed", | ||||
|           "range": true, | ||||
|           "refId": "E", | ||||
|           "useBackend": false | ||||
|         } | ||||
|       ], | ||||
|       "title": "Surreal stats", | ||||
|       "type": "timeseries" | ||||
|     }, | ||||
|     { | ||||
|       "datasource": { | ||||
|         "type": "prometheus", | ||||
|         "uid": "${DS_PROMETHEUS}" | ||||
|       }, | ||||
|       "description": "This is across all threads, so this isn't wall clock time", | ||||
|       "fieldConfig": { | ||||
|         "defaults": { | ||||
|           "color": { | ||||
|             "mode": "thresholds" | ||||
|           }, | ||||
|           "mappings": [], | ||||
|           "thresholds": { | ||||
|             "mode": "absolute", | ||||
|             "steps": [ | ||||
|               { | ||||
|                 "color": "green", | ||||
|                 "value": null | ||||
|               } | ||||
|             ] | ||||
|           }, | ||||
|           "unit": "ms" | ||||
|         }, | ||||
|         "overrides": [] | ||||
|       }, | ||||
|       "gridPos": { | ||||
|         "h": 8, | ||||
|         "w": 7, | ||||
|         "x": 17, | ||||
|         "y": 0 | ||||
|       }, | ||||
|       "id": 7, | ||||
|       "options": { | ||||
|         "colorMode": "value", | ||||
|         "graphMode": "area", | ||||
|         "justifyMode": "auto", | ||||
|         "orientation": "auto", | ||||
|         "percentChangeColorMode": "standard", | ||||
|         "reduceOptions": { | ||||
|           "calcs": [ | ||||
|             "lastNotNull" | ||||
|           ], | ||||
|           "fields": "", | ||||
|           "values": false | ||||
|         }, | ||||
|         "showPercentChange": false, | ||||
|         "textMode": "auto", | ||||
|         "wideLayout": true | ||||
|       }, | ||||
|       "pluginVersion": "11.3.1", | ||||
|       "targets": [ | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "prometheus", | ||||
|             "uid": "${DS_PROMETHEUS}" | ||||
|           }, | ||||
|           "disableTextWrap": false, | ||||
|           "editorMode": "builder", | ||||
|           "expr": "surql_lock_waiting_ms", | ||||
|           "fullMetaSearch": false, | ||||
|           "includeNullMetadata": true, | ||||
|           "legendFormat": "__auto", | ||||
|           "range": true, | ||||
|           "refId": "A", | ||||
|           "useBackend": false | ||||
|         } | ||||
|       ], | ||||
|       "title": "Time spend waiting on lock", | ||||
|       "type": "stat" | ||||
|     }, | ||||
|     { | ||||
|       "datasource": { | ||||
|         "type": "loki", | ||||
|         "uid": "${DS_LOKI}" | ||||
|       }, | ||||
|       "gridPos": { | ||||
|         "h": 18, | ||||
|         "w": 24, | ||||
|         "x": 0, | ||||
|         "y": 8 | ||||
|       }, | ||||
|       "id": 1, | ||||
|       "options": { | ||||
|         "dedupStrategy": "none", | ||||
|         "enableLogDetails": true, | ||||
|         "prettifyLogMessage": false, | ||||
|         "showCommonLabels": false, | ||||
|         "showLabels": false, | ||||
|         "showTime": false, | ||||
|         "sortOrder": "Descending", | ||||
|         "wrapLogMessage": false | ||||
|       }, | ||||
|       "pluginVersion": "11.3.1", | ||||
|       "targets": [ | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "loki", | ||||
|             "uid": "${DS_LOKI}" | ||||
|           }, | ||||
|           "editorMode": "code", | ||||
|           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `ERROR` | line_format \"{{.threadId}} {{.filename_extracted}}:{{.line_number}}  {{.fields_message}}\"", | ||||
|           "queryType": "range", | ||||
|           "refId": "A" | ||||
|         } | ||||
|       ], | ||||
|       "title": "Errors", | ||||
|       "type": "logs" | ||||
|     }, | ||||
|     { | ||||
|       "datasource": { | ||||
|         "type": "loki", | ||||
|         "uid": "${DS_LOKI}" | ||||
|       }, | ||||
|       "gridPos": { | ||||
|         "h": 8, | ||||
|         "w": 12, | ||||
|         "x": 0, | ||||
|         "y": 26 | ||||
|       }, | ||||
|       "id": 2, | ||||
|       "options": { | ||||
|         "dedupStrategy": "none", | ||||
|         "enableLogDetails": true, | ||||
|         "prettifyLogMessage": false, | ||||
|         "showCommonLabels": false, | ||||
|         "showLabels": false, | ||||
|         "showTime": false, | ||||
|         "sortOrder": "Descending", | ||||
|         "wrapLogMessage": false | ||||
|       }, | ||||
|       "pluginVersion": "11.3.1", | ||||
|       "targets": [ | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "loki", | ||||
|             "uid": "${DS_LOKI}" | ||||
|           }, | ||||
|           "editorMode": "code", | ||||
|           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `DEBUG` | line_format \"{{.fields_message}}\"", | ||||
|           "queryType": "range", | ||||
|           "refId": "A" | ||||
|         } | ||||
|       ], | ||||
|       "title": "Debug", | ||||
|       "type": "logs" | ||||
|     }, | ||||
|     { | ||||
|       "datasource": { | ||||
|         "type": "loki", | ||||
|         "uid": "${DS_LOKI}" | ||||
|       }, | ||||
|       "gridPos": { | ||||
|         "h": 16, | ||||
|         "w": 12, | ||||
|         "x": 12, | ||||
|         "y": 26 | ||||
|       }, | ||||
|       "id": 4, | ||||
|       "options": { | ||||
|         "dedupStrategy": "none", | ||||
|         "enableLogDetails": true, | ||||
|         "prettifyLogMessage": false, | ||||
|         "showCommonLabels": false, | ||||
|         "showLabels": false, | ||||
|         "showTime": false, | ||||
|         "sortOrder": "Descending", | ||||
|         "wrapLogMessage": false | ||||
|       }, | ||||
|       "pluginVersion": "11.3.1", | ||||
|       "targets": [ | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "loki", | ||||
|             "uid": "${DS_LOKI}" | ||||
|           }, | ||||
|           "editorMode": "code", | ||||
|           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `TRACE` | line_format \"{{.fields_message}}\"", | ||||
|           "queryType": "range", | ||||
|           "refId": "A" | ||||
|         } | ||||
|       ], | ||||
|       "title": "Trace", | ||||
|       "type": "logs" | ||||
|     }, | ||||
|     { | ||||
|       "datasource": { | ||||
|         "type": "loki", | ||||
|         "uid": "${DS_LOKI}" | ||||
|       }, | ||||
|       "gridPos": { | ||||
|         "h": 8, | ||||
|         "w": 12, | ||||
|         "x": 0, | ||||
|         "y": 34 | ||||
|       }, | ||||
|       "id": 3, | ||||
|       "options": { | ||||
|         "dedupStrategy": "none", | ||||
|         "enableLogDetails": true, | ||||
|         "prettifyLogMessage": false, | ||||
|         "showCommonLabels": false, | ||||
|         "showLabels": false, | ||||
|         "showTime": false, | ||||
|         "sortOrder": "Descending", | ||||
|         "wrapLogMessage": false | ||||
|       }, | ||||
|       "pluginVersion": "11.3.1", | ||||
|       "targets": [ | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "loki", | ||||
|             "uid": "${DS_LOKI}" | ||||
|           }, | ||||
|           "editorMode": "code", | ||||
|           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `WARN` | line_format \"{{.fields_message}}\"", | ||||
|           "queryType": "range", | ||||
|           "refId": "A" | ||||
|         } | ||||
|       ], | ||||
|       "title": "Warnings", | ||||
|       "type": "logs" | ||||
|     } | ||||
|   ], | ||||
|   "schemaVersion": 40, | ||||
|   "tags": [], | ||||
|   "templating": { | ||||
|     "list": [ | ||||
|       { | ||||
|         "datasource": { | ||||
|           "type": "loki", | ||||
|           "uid": "P8E80F9AEF21F6940" | ||||
|         }, | ||||
|         "filters": [], | ||||
|         "name": "Filters", | ||||
|         "type": "adhoc" | ||||
|       } | ||||
|     ] | ||||
|   }, | ||||
|   "time": { | ||||
|     "from": "now-5m", | ||||
|     "to": "now" | ||||
|   }, | ||||
|   "timepicker": {}, | ||||
|   "timezone": "browser", | ||||
|   "title": "Crawler", | ||||
|   "uid": "ceg90x34pqgowd", | ||||
|   "version": 21, | ||||
|   "weekStart": "" | ||||
| } | ||||
| @@ -1,223 +0,0 @@ | ||||
| { | ||||
|   "__inputs": [ | ||||
|     { | ||||
|       "name": "DS_LOKI", | ||||
|       "label": "Loki", | ||||
|       "description": "", | ||||
|       "type": "datasource", | ||||
|       "pluginId": "loki", | ||||
|       "pluginName": "Loki" | ||||
|     } | ||||
|   ], | ||||
|   "__elements": {}, | ||||
|   "__requires": [ | ||||
|     { | ||||
|       "type": "grafana", | ||||
|       "id": "grafana", | ||||
|       "name": "Grafana", | ||||
|       "version": "11.3.1" | ||||
|     }, | ||||
|     { | ||||
|       "type": "panel", | ||||
|       "id": "logs", | ||||
|       "name": "Logs", | ||||
|       "version": "" | ||||
|     }, | ||||
|     { | ||||
|       "type": "datasource", | ||||
|       "id": "loki", | ||||
|       "name": "Loki", | ||||
|       "version": "1.0.0" | ||||
|     } | ||||
|   ], | ||||
|   "annotations": { | ||||
|     "list": [ | ||||
|       { | ||||
|         "builtIn": 1, | ||||
|         "datasource": { | ||||
|           "type": "grafana", | ||||
|           "uid": "-- Grafana --" | ||||
|         }, | ||||
|         "enable": true, | ||||
|         "hide": true, | ||||
|         "iconColor": "rgba(0, 211, 255, 1)", | ||||
|         "name": "Annotations & Alerts", | ||||
|         "type": "dashboard" | ||||
|       } | ||||
|     ] | ||||
|   }, | ||||
|   "editable": true, | ||||
|   "fiscalYearStartMonth": 0, | ||||
|   "graphTooltip": 0, | ||||
|   "id": null, | ||||
|   "links": [], | ||||
|   "panels": [ | ||||
|     { | ||||
|       "datasource": { | ||||
|         "type": "loki", | ||||
|         "uid": "${DS_LOKI}" | ||||
|       }, | ||||
|       "gridPos": { | ||||
|         "h": 8, | ||||
|         "w": 12, | ||||
|         "x": 0, | ||||
|         "y": 0 | ||||
|       }, | ||||
|       "id": 1, | ||||
|       "options": { | ||||
|         "dedupStrategy": "none", | ||||
|         "enableLogDetails": true, | ||||
|         "prettifyLogMessage": false, | ||||
|         "showCommonLabels": false, | ||||
|         "showLabels": false, | ||||
|         "showTime": false, | ||||
|         "sortOrder": "Descending", | ||||
|         "wrapLogMessage": false | ||||
|       }, | ||||
|       "pluginVersion": "11.3.1", | ||||
|       "targets": [ | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "loki", | ||||
|             "uid": "${DS_LOKI}" | ||||
|           }, | ||||
|           "editorMode": "code", | ||||
|           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `ERROR` | line_format \"{{.fields_message}}\"", | ||||
|           "queryType": "range", | ||||
|           "refId": "A" | ||||
|         } | ||||
|       ], | ||||
|       "title": "Errors", | ||||
|       "type": "logs" | ||||
|     }, | ||||
|     { | ||||
|       "datasource": { | ||||
|         "type": "loki", | ||||
|         "uid": "${DS_LOKI}" | ||||
|       }, | ||||
|       "gridPos": { | ||||
|         "h": 8, | ||||
|         "w": 12, | ||||
|         "x": 12, | ||||
|         "y": 0 | ||||
|       }, | ||||
|       "id": 3, | ||||
|       "options": { | ||||
|         "dedupStrategy": "none", | ||||
|         "enableLogDetails": true, | ||||
|         "prettifyLogMessage": false, | ||||
|         "showCommonLabels": false, | ||||
|         "showLabels": false, | ||||
|         "showTime": false, | ||||
|         "sortOrder": "Descending", | ||||
|         "wrapLogMessage": false | ||||
|       }, | ||||
|       "pluginVersion": "11.3.1", | ||||
|       "targets": [ | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "loki", | ||||
|             "uid": "${DS_LOKI}" | ||||
|           }, | ||||
|           "editorMode": "code", | ||||
|           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `WARN` | line_format \"{{.fields_message}}\"", | ||||
|           "queryType": "range", | ||||
|           "refId": "A" | ||||
|         } | ||||
|       ], | ||||
|       "title": "Warnings", | ||||
|       "type": "logs" | ||||
|     }, | ||||
|     { | ||||
|       "datasource": { | ||||
|         "type": "loki", | ||||
|         "uid": "${DS_LOKI}" | ||||
|       }, | ||||
|       "gridPos": { | ||||
|         "h": 8, | ||||
|         "w": 12, | ||||
|         "x": 0, | ||||
|         "y": 8 | ||||
|       }, | ||||
|       "id": 2, | ||||
|       "options": { | ||||
|         "dedupStrategy": "none", | ||||
|         "enableLogDetails": true, | ||||
|         "prettifyLogMessage": false, | ||||
|         "showCommonLabels": false, | ||||
|         "showLabels": false, | ||||
|         "showTime": false, | ||||
|         "sortOrder": "Descending", | ||||
|         "wrapLogMessage": false | ||||
|       }, | ||||
|       "pluginVersion": "11.3.1", | ||||
|       "targets": [ | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "loki", | ||||
|             "uid": "${DS_LOKI}" | ||||
|           }, | ||||
|           "editorMode": "code", | ||||
|           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `DEBUG` | line_format \"{{.fields_message}}\"", | ||||
|           "queryType": "range", | ||||
|           "refId": "A" | ||||
|         } | ||||
|       ], | ||||
|       "title": "Debug", | ||||
|       "type": "logs" | ||||
|     }, | ||||
|     { | ||||
|       "datasource": { | ||||
|         "type": "loki", | ||||
|         "uid": "${DS_LOKI}" | ||||
|       }, | ||||
|       "gridPos": { | ||||
|         "h": 8, | ||||
|         "w": 12, | ||||
|         "x": 12, | ||||
|         "y": 8 | ||||
|       }, | ||||
|       "id": 4, | ||||
|       "options": { | ||||
|         "dedupStrategy": "none", | ||||
|         "enableLogDetails": true, | ||||
|         "prettifyLogMessage": false, | ||||
|         "showCommonLabels": false, | ||||
|         "showLabels": false, | ||||
|         "showTime": false, | ||||
|         "sortOrder": "Descending", | ||||
|         "wrapLogMessage": false | ||||
|       }, | ||||
|       "pluginVersion": "11.3.1", | ||||
|       "targets": [ | ||||
|         { | ||||
|           "datasource": { | ||||
|             "type": "loki", | ||||
|             "uid": "${DS_LOKI}" | ||||
|           }, | ||||
|           "editorMode": "code", | ||||
|           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `TRACE` | line_format \"{{.fields_message}}\"", | ||||
|           "queryType": "range", | ||||
|           "refId": "A" | ||||
|         } | ||||
|       ], | ||||
|       "title": "Trace", | ||||
|       "type": "logs" | ||||
|     } | ||||
|   ], | ||||
|   "schemaVersion": 40, | ||||
|   "tags": [], | ||||
|   "templating": { | ||||
|     "list": [] | ||||
|   }, | ||||
|   "time": { | ||||
|     "from": "now-6h", | ||||
|     "to": "now" | ||||
|   }, | ||||
|   "timepicker": {}, | ||||
|   "timezone": "browser", | ||||
|   "title": "New dashboard", | ||||
|   "uid": "ceg90x34pqgowd", | ||||
|   "version": 4, | ||||
|   "weekStart": "" | ||||
| } | ||||
| @@ -7,10 +7,14 @@ scrape_configs: | ||||
|     static_configs: | ||||
|     # change this your machine's ip, localhost won't work | ||||
|     # because localhost refers to the docker container. | ||||
|       - targets: ['192.168.8.209:2500'] | ||||
|       - targets: ['172.20.239.48:2500'] | ||||
|   - job_name: loki | ||||
|     static_configs: | ||||
|       - targets: ['loki:3100'] | ||||
|   - job_name: prometheus | ||||
|     static_configs: | ||||
|       - targets: ['localhost:9090'] | ||||
|   - job_name: minio | ||||
|     metrics_path: /minio/v2/metrics/cluster | ||||
|     static_configs: | ||||
|       - targets: ['minio:9000'] | ||||
|   | ||||
							
								
								
									
										211
									
								
								src/db.rs
									
									
									
									
									
								
							
							
						
						
									
										211
									
								
								src/db.rs
									
									
									
									
									
								
							| @@ -1,33 +1,58 @@ | ||||
| use std::fmt::Debug; | ||||
| use metrics::counter; | ||||
| use serde::{Deserialize, Serialize}; | ||||
| use surrealdb::{ | ||||
|     engine::remote::ws::{Client, Ws}, error::Db, opt::auth::Root, sql::Thing, Response, Surreal | ||||
| use base64::{ | ||||
|     alphabet, | ||||
|     engine::{self, general_purpose}, | ||||
|     Engine, | ||||
| }; | ||||
| use metrics::counter; | ||||
| use serde::{ser::SerializeStruct, Deserialize, Serialize}; | ||||
| use std::{fmt::Debug, sync::LazyLock, time::Instant}; | ||||
| use surrealdb::{ | ||||
|     engine::remote::ws::{Client, Ws}, | ||||
|     opt::auth::Root, | ||||
|     sql::Thing, | ||||
|     Error::Api, | ||||
|     Response, Surreal, | ||||
| }; | ||||
| use tokio::sync::Mutex; | ||||
| use tracing::{error, instrument, trace, warn}; | ||||
| use url::Url; | ||||
|  | ||||
| use crate::{Config, Timer}; | ||||
|  | ||||
| const ROUND_TRIP_METRIC: &'static str = "surql_trips"; | ||||
| // static LOCK: LazyLock<Arc<Mutex<bool>>> = LazyLock::new(|| Arc::new(Mutex::new(true))); | ||||
| static LOCK: LazyLock<Mutex<bool>> = LazyLock::new(|| Mutex::new(true)); | ||||
|  | ||||
| const CUSTOM_ENGINE: engine::GeneralPurpose = | ||||
|     engine::GeneralPurpose::new(&alphabet::URL_SAFE, general_purpose::NO_PAD); | ||||
|  | ||||
| const TIME_SPENT_ON_LOCK: &'static str = "surql_lock_waiting_ms"; | ||||
| const STORE: &'static str = "surql_store_calls"; | ||||
| const LINK: &'static str = "surql_link_calls"; | ||||
|  | ||||
| #[derive(Serialize, Deserialize, Clone)] | ||||
| #[derive(Deserialize, Clone)] | ||||
| pub struct Website { | ||||
|     /// The url that this data is found at | ||||
|     pub site: Url, | ||||
|     /// Wether or not this link has been crawled yet | ||||
|     pub crawled: bool, | ||||
|     #[serde(skip_serializing)] | ||||
|     id: Option<Thing>, | ||||
| } | ||||
|  | ||||
| impl Serialize for Website { | ||||
|     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> | ||||
|     where | ||||
|         S: serde::Serializer { | ||||
|             let mut state = serializer.serialize_struct("Website", 2)?; | ||||
|             state.serialize_field("crawled", &self.crawled)?; | ||||
|             // to_string() calls the correct naming of site | ||||
|             state.serialize_field("site", &self.site.to_string())?; | ||||
|             state.end() | ||||
|     } | ||||
| } | ||||
|  | ||||
| // manual impl to make tracing look nicer | ||||
| impl Debug for Website { | ||||
|     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||||
|         let site = (self.site.domain().unwrap_or("n/a")).to_string() + self.site.path(); | ||||
|         f.debug_struct("Website").field("site", &site).finish() | ||||
|         f.debug_struct("Website").field("site", &self.site).finish() | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -39,9 +64,8 @@ impl Website { | ||||
|             Err(_) => todo!(), | ||||
|         }; | ||||
|         Self { | ||||
|             id: None, | ||||
|             crawled, | ||||
|             site, | ||||
|             site | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -50,24 +74,44 @@ impl Website { | ||||
|         self.crawled = true | ||||
|     } | ||||
|  | ||||
|     pub fn get_url_as_string(site: &Url) -> String { | ||||
|         let domain = match site.domain() { | ||||
|             Some(s) => s.to_string(), | ||||
|             None => { | ||||
|                 warn!("Failed to get domain of URL: {}, falling back to 'localhost'", site.to_string()); | ||||
|                 "localhost".to_string() | ||||
|             } | ||||
|         }; | ||||
|         let path = site.path(); | ||||
|  | ||||
|         domain + path | ||||
|     } | ||||
|     pub fn get_url_as_b64_path(site: &Url) -> String { | ||||
|         let domain = site.domain().unwrap_or("DOMAIN").to_string(); | ||||
|         let path = &CUSTOM_ENGINE.encode(site.path()); | ||||
|  | ||||
|         domain + path | ||||
|     } | ||||
|  | ||||
|     #[instrument(skip_all)] | ||||
|     pub async fn links_to(&self, other: Vec<Thing>, db: &Surreal<Client>) { | ||||
|  | ||||
|         let len = other.len(); | ||||
|         if len == 0 {return} | ||||
|         if len == 0 { | ||||
|             return; | ||||
|         } | ||||
|          | ||||
|         let from = &self.site; | ||||
|  | ||||
|         let from = self.site.to_string(); | ||||
|         // let to = other.site.to_string(); | ||||
|         trace!("Linking {from} to {} other pages.", other.len()); | ||||
|         let msg = format!("Linked {len} pages"); | ||||
|         trace!("Linking {} pages to {from}", other.len()); | ||||
|         let msg = format!("Linked {len} pages to {from}"); | ||||
|         let timer = Timer::start(&msg); | ||||
|         // prevent the timer from being dropped instantly. | ||||
|         let _ = timer; | ||||
|         counter!(ROUND_TRIP_METRIC).increment(1); | ||||
|         counter!(LINK).increment(1); | ||||
|         match db | ||||
|             .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)") | ||||
|             .bind(("in", from)) | ||||
|             .bind(("in", from.clone())) | ||||
|             .bind(("out", other)) | ||||
|             .await | ||||
|         { | ||||
| @@ -79,90 +123,64 @@ impl Website { | ||||
|                     let _: Vec<usize> = vec; | ||||
|                     if let Some(num) = vec.get(0) { | ||||
|                         if *num == len { | ||||
|                             trace!("Link OK"); | ||||
|                             trace!("Link for {from} OK - {num}/{len}"); | ||||
|                             return; | ||||
|                         } else { | ||||
|                             warn!("Didn't link all the records. {num}/{len}"); | ||||
|                             error!("Didn't link all the records. {num}/{len}. Surreal response: {:?}", e); | ||||
|                             return; | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|                 warn!("Linking request succeeded but couldn't verify the results."); | ||||
|             }, | ||||
|                 error!("Linking request succeeded but couldn't verify the results."); | ||||
|             } | ||||
|             Err(e) => { | ||||
|                 error!("{}", e.to_string()); | ||||
|             }, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     #[instrument(name = "surql_store", skip_all)] | ||||
|     pub async fn store(&self, db: &Surreal<Client>) -> Option<Thing> { | ||||
|         counter!(STORE).increment(1); | ||||
|         let counter = counter!(ROUND_TRIP_METRIC); | ||||
|         let t = Timer::start("Stored link"); | ||||
|         let _ = t; | ||||
|         counter.increment(1); | ||||
|  | ||||
|         // check if it's been gone thru before | ||||
|         let mut response = db | ||||
|             .query("SELECT * FROM ONLY website WHERE site = $site LIMIT 1") | ||||
|             .bind(("site", self.site.to_string())) | ||||
|             .await | ||||
|             .expect("Failed to check surreal for duplicates!"); | ||||
|  | ||||
|         if let Some(old) = response.take::<Option<Website>>(0).expect("Failed to read response from surreal for duplicates.") { | ||||
|             // site exists already | ||||
|             if let Some(id) = old.id { | ||||
|                 // make sure to preserve the "crawled status" | ||||
|                 let mut new = self.clone(); | ||||
|                 new.crawled = old.crawled | new.crawled; | ||||
|  | ||||
|                 counter.increment(1); | ||||
|                 // update the record | ||||
|                 match db.upsert((id.tb, id.id.to_string())).content(new).await { | ||||
|                     Ok(e) => { | ||||
|                         if let Some(a) = e { | ||||
|                             let _: Record = a; | ||||
|                             return Some(a.id); | ||||
|                         } | ||||
|                     } | ||||
|                     Err(e) => { | ||||
|                         match e { | ||||
|                             surrealdb::Error::Db(error) => { | ||||
|                                 match error { | ||||
|                                     Db::QueryCancelled => todo!(), | ||||
|                                     Db::QueryNotExecuted => todo!(), | ||||
|                                     Db::QueryNotExecutedDetail { message: _ } => todo!(), | ||||
|                                    _=>{}, | ||||
|                                 } | ||||
|                             }, | ||||
|                             _=>{}, | ||||
|                         } | ||||
|                         // error!("{}", e); | ||||
|                     } | ||||
|                 }; | ||||
|             } | ||||
|         } else { | ||||
|             counter.increment(1); | ||||
|             // sites hasn't existed yet | ||||
|             match db.create("website").content(self.clone()).await { | ||||
|                 Ok(e) => { | ||||
|                     let _: Option<Record> = e; | ||||
|                     if let Some(a) = e { | ||||
|                         let _: Record = a; | ||||
|                         return Some(a.id); | ||||
|                     } | ||||
|                 } | ||||
|                 Err(a) => error!("{:?}", a), | ||||
|             }; | ||||
|         } | ||||
|         None | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl ToString for Website { | ||||
|     fn to_string(&self) -> String { | ||||
|         self.site.to_string() | ||||
|     // Insert ever item in the vec into surreal, crawled state will be preserved as TRUE | ||||
|     // if already in the database as such or incoming data is TRUE. | ||||
|     pub async fn store_all(all: Vec<Self>, db: &Surreal<Client>) -> Vec<Thing> { | ||||
|         counter!(STORE).increment(1); | ||||
|         let mut things = Vec::with_capacity(all.len()); | ||||
|  | ||||
|         // TODO this only allows for one thread to be in the database at a time. | ||||
|         // This is currently required since otherwise we get write errors. | ||||
|         // If the default `crawled` is set to false, we might not need to write any more | ||||
|         // than just the name. `accessed_at` is fun but not needed. | ||||
|         let now = Instant::now(); | ||||
|         let lock = LOCK.lock().await; | ||||
|         counter!(TIME_SPENT_ON_LOCK).increment(now.elapsed().as_millis() as u64); | ||||
|  | ||||
|         match db | ||||
|             .query( | ||||
|                 "INSERT INTO website $array | ||||
|                     ON DUPLICATE KEY UPDATE | ||||
|                         accessed_at = time::now(), | ||||
|                         crawled = crawled OR $input.crawled | ||||
|                     RETURN VALUE id; | ||||
|                  ", | ||||
|             ) | ||||
|             .bind(("array", all)) | ||||
|             .await | ||||
|         { | ||||
|             Ok(mut id) => match id.take::<Vec<Thing>>(0) { | ||||
|                 Ok(mut x) => things.append(&mut x), | ||||
|                 Err(err) => match err { | ||||
|                     Api(error) => { | ||||
|                         eprintln!("{:?}", error); | ||||
|                         error!("{:?}", error); | ||||
|                     } | ||||
|                     _ => error!("{:?}", err), | ||||
|                 }, | ||||
|             }, | ||||
|             Err(err) => { | ||||
|                 error!("{:?}", err); | ||||
|             } | ||||
|         } | ||||
|         drop(lock); | ||||
|         things | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -193,15 +211,16 @@ pub async fn connect(config: &Config) -> surrealdb::Result<Surreal<Client>> { | ||||
|     .await?; | ||||
|  | ||||
|     // Select a specific namespace / database | ||||
|     db | ||||
|         .use_ns(&config.surreal_ns) | ||||
|     db.use_ns(&config.surreal_ns) | ||||
|         .use_db(&config.surreal_db) | ||||
|         .await?; | ||||
|      | ||||
|  | ||||
|     let setup = include_bytes!("setup.surql"); | ||||
|     let file = setup.iter().map(|c| *c as char).collect::<String>(); | ||||
|  | ||||
|     db.query(file).await.expect("Failed to setup surreal tables."); | ||||
|     db.query(file) | ||||
|         .await | ||||
|         .expect("Failed to setup surreal tables."); | ||||
|  | ||||
|     Ok(db) | ||||
| } | ||||
|   | ||||
							
								
								
									
										95
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										95
									
								
								src/main.rs
									
									
									
									
									
								
							| @@ -2,7 +2,12 @@ | ||||
|  | ||||
| extern crate html5ever; | ||||
|  | ||||
| use std::{fs::File, io::Read, net::{IpAddr, Ipv4Addr}, time::Instant}; | ||||
| use std::{ | ||||
|     fs::File, | ||||
|     io::Read, | ||||
|     net::{IpAddr, Ipv4Addr}, | ||||
|     time::Instant, | ||||
| }; | ||||
|  | ||||
| use db::{connect, Website}; | ||||
| use metrics::{counter, gauge}; | ||||
| @@ -11,7 +16,7 @@ use s3::S3; | ||||
| use serde::Deserialize; | ||||
| use surrealdb::{engine::remote::ws::Client, Surreal}; | ||||
| use tokio::task::JoinSet; | ||||
| use tracing::{debug, info, instrument, trace, trace_span, warn}; | ||||
| use tracing::{debug, error, info, instrument, trace, trace_span, warn}; | ||||
| use tracing_subscriber::{fmt, layer::SubscriberExt, EnvFilter, Layer, Registry}; | ||||
|  | ||||
| mod db; | ||||
| @@ -50,25 +55,26 @@ async fn main() { | ||||
|         .open("./docker/logs/tracing.log") | ||||
|         .expect("Couldn't make log file!"); | ||||
|  | ||||
|     let registry = Registry::default() | ||||
|         .with( | ||||
|             fmt::layer() | ||||
|                 .with_line_number(true) | ||||
|                 .with_thread_ids(true) | ||||
|                 .with_file(true) | ||||
|                 // .with_timer(LocalTime::rfc_3339()) // Loki or alloy does this automatically | ||||
|                 .json() | ||||
|                 .with_writer(writer) | ||||
|                 // .with_filter(EnvFilter::from_default_env()) | ||||
|         ); | ||||
|     let filter = EnvFilter::from_default_env(); | ||||
|  | ||||
|     let registry = Registry::default().with( | ||||
|         fmt::layer() | ||||
|             .with_line_number(true) | ||||
|             .with_thread_ids(true) | ||||
|             .with_file(true) | ||||
|             .json() | ||||
|             .with_writer(writer) | ||||
|             .with_filter(filter) | ||||
|     ); | ||||
|  | ||||
|     tracing::subscriber::set_global_default(registry).expect("Failed to set default subscriber"); | ||||
|  | ||||
|  | ||||
|     let builder = PrometheusBuilder::new(); | ||||
|     builder.with_http_listener( | ||||
|             std::net::SocketAddr::new(IpAddr::V4(Ipv4Addr::from_octets([0,0,0,0])), 2500) | ||||
|         ) | ||||
|     builder | ||||
|         .with_http_listener(std::net::SocketAddr::new( | ||||
|             IpAddr::V4(Ipv4Addr::from_octets([0, 0, 0, 0])), | ||||
|             2500, | ||||
|         )) | ||||
|         .install() | ||||
|         .expect("failed to install recorder/exporter"); | ||||
|  | ||||
| @@ -80,7 +86,6 @@ async fn main() { | ||||
|     // let budget = 50; | ||||
|     let mut crawled = 0; | ||||
|  | ||||
|      | ||||
|     let mut file = File::open("./Crawler.toml").expect("Failed to read Crawler.toml"); | ||||
|     let mut buf = String::new(); | ||||
|     let _ = file.read_to_string(&mut buf); | ||||
| @@ -106,7 +111,7 @@ async fn main() { | ||||
|     let pre_loop_span = span.enter(); | ||||
|     // Download the site | ||||
|     let site = Website::new(&starting_url, false); | ||||
|     get(site, db.clone(), reqwest.clone(), s3.clone()).await; | ||||
|     process(site, db.clone(), reqwest.clone(), s3.clone()).await; | ||||
|  | ||||
|     drop(pre_loop_span); | ||||
|  | ||||
| @@ -133,7 +138,7 @@ async fn main() { | ||||
|             let mut futures = JoinSet::new(); | ||||
|             for site in uncrawled { | ||||
|                 gauge!(BEING_PROCESSED).increment(1); | ||||
|                 futures.spawn(get(site, db.clone(), reqwest.clone(), s3.clone())); | ||||
|                 futures.spawn(process(site, db.clone(), reqwest.clone(), s3.clone())); | ||||
|                 // let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32); | ||||
|                 // info!("Crawled {crawled} out of {budget} pages. ({percent})"); | ||||
|             } | ||||
| @@ -150,40 +155,60 @@ async fn main() { | ||||
|     } | ||||
|     drop(span); | ||||
|  | ||||
|     info!("Done"); | ||||
|     debug!("Done"); | ||||
|     drop(total_runtime); | ||||
| } | ||||
|  | ||||
| #[instrument(skip (db, s3, reqwest))] | ||||
| #[instrument(skip(db, s3, reqwest))] | ||||
| /// Downloads and crawls and stores a webpage. | ||||
| /// It is acceptable to clone `db`, `reqwest`, and `s3` because they all use `Arc`s internally. - Noted by Oliver  | ||||
| async fn get(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Client, s3: S3) { | ||||
|     trace!("Get: {}", site.to_string()); | ||||
|  | ||||
| /// It is acceptable to clone `db`, `reqwest`, and `s3` because they all use `Arc`s internally. - Noted by Oliver | ||||
| async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Client, s3: S3) { | ||||
|      | ||||
|     // METRICS | ||||
|     trace!("Process: {}", &site.site); | ||||
|     let timer = Timer::start("Built request"); | ||||
|     let request_builder = reqwest.get(site.to_string()); | ||||
|     // Build the request | ||||
|     let request_builder = reqwest.get(&site.site.to_string()); | ||||
|     // METRICS | ||||
|     timer.stop(); | ||||
|  | ||||
|     // METRICS | ||||
|     let g = gauge!(GET_IN_FLIGHT); | ||||
|     g.increment(1); | ||||
|     let timer = Timer::start("Got page"); | ||||
|  | ||||
|     // Send the http request (get) | ||||
|     if let Ok(response) = request_builder.send().await { | ||||
|  | ||||
|         // METRICS | ||||
|         timer.stop(); | ||||
|         g.decrement(1); | ||||
|         counter!(GET_METRIC).increment(1); | ||||
|         debug!("Getting body..."); | ||||
|  | ||||
|         // Get body | ||||
|         let data = response.text().await.expect("Failed to read http response's body!"); | ||||
|         // Get body from response | ||||
|         let data = response | ||||
|             .text() | ||||
|             .await | ||||
|             .expect("Failed to read http response's body!"); | ||||
|         // Store document | ||||
|         s3.store(&data, &site.site).await; | ||||
|         // Parse document and store relationships | ||||
|         parser::parse(&db, &mut site, &data).await; | ||||
|         return; | ||||
|  | ||||
|         // Parse document and get relationships | ||||
|         let sites = parser::parse(&site, &data).await; | ||||
|          | ||||
|         // update self in db | ||||
|         site.set_crawled(); | ||||
|         Website::store_all(vec![site.clone()], &db).await; | ||||
|  | ||||
|         // Store all the other sites so that we can link to them. | ||||
|         // let mut links_to = Vec::new(); | ||||
|         let others = Website::store_all(sites, &db).await; | ||||
|  | ||||
|         // Make the database's links reflect the html links between sites | ||||
|         site.links_to(others, &db).await; | ||||
|     } else { | ||||
|         error!("Failed to get: {}", &site.site); | ||||
|     } | ||||
|     trace!("Failed to get: {}", site.to_string()); | ||||
| } | ||||
|  | ||||
| /// Returns uncrawled links | ||||
| @@ -228,8 +253,6 @@ impl<'a> Timer<'a> { | ||||
|  | ||||
|         if ms > 200. { | ||||
|             warn!("{}", format!("{} in {:.3}ms", self.msg, ms)); | ||||
|         } else { | ||||
|             trace!("{}", format!("{} in {:.3}ms", self.msg, ms)); | ||||
|         } | ||||
|  | ||||
|         ms | ||||
|   | ||||
| @@ -5,8 +5,6 @@ use html5ever::tokenizer::{BufferQueue, TokenizerResult}; | ||||
| use html5ever::tokenizer::{StartTag, TagToken}; | ||||
| use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; | ||||
| use html5ever::{local_name, tendril::*}; | ||||
| use surrealdb::engine::remote::ws::Client; | ||||
| use surrealdb::Surreal; | ||||
| use tracing::instrument; | ||||
|  | ||||
| use crate::db::Website; | ||||
| @@ -40,7 +38,9 @@ impl TokenSink for Website { | ||||
|                                     // Set url | ||||
|                                     let mut url = web.site; | ||||
|                                     url.set_fragment(None); // removes #xyz | ||||
|                                     let joined = url.join(&attr.value).expect("Failed to join url during parsing!"); | ||||
|                                     let joined = url | ||||
|                                         .join(&attr.value) | ||||
|                                         .expect("Failed to join url during parsing!"); | ||||
|                                     web.site = joined; | ||||
|  | ||||
|                                     web.crawled = false; | ||||
| @@ -65,45 +65,31 @@ impl TokenSink for Website { | ||||
| } | ||||
|  | ||||
| #[instrument(skip_all)] | ||||
| pub async fn parse(db: &Surreal<Client>, site: &mut Website, data: &str) { | ||||
|     // update self in db | ||||
|     site.set_crawled(); | ||||
|     site.store(db).await; | ||||
|  | ||||
| /// Parses the passed site and returns all the sites it links to. | ||||
| pub async fn parse(site: &Website, data: &str) -> Vec<Website> { | ||||
|     // prep work | ||||
|     let mut other_sites: Vec<Website> = Vec::new(); | ||||
|     { // using blocks to prevent compiler's async worries | ||||
|         let _t = Timer::start("Parsed page"); | ||||
|     let _t = Timer::start("Parsed page"); | ||||
|  | ||||
|         // change data into something that can be tokenized | ||||
|         let chunk = Tendril::from_str(&data).expect("Failed to parse string into Tendril!"); | ||||
|         // create buffer of tokens and push our input into it | ||||
|         let mut token_buffer = BufferQueue::default(); | ||||
|         token_buffer.push_back(chunk.try_reinterpret::<fmt::UTF8>().expect("Failed to reinterprt chunk!")); | ||||
|         // create the tokenizer | ||||
|         let tokenizer = Tokenizer::new(site.clone(), TokenizerOpts::default()); | ||||
|     // change data into something that can be tokenized | ||||
|     let chunk = Tendril::from_str(&data).expect("Failed to parse string into Tendril!"); | ||||
|     // create buffer of tokens and push our input into it | ||||
|     let mut token_buffer = BufferQueue::default(); | ||||
|     token_buffer.push_back( | ||||
|         chunk | ||||
|             .try_reinterpret::<fmt::UTF8>() | ||||
|             .expect("Failed to reinterprt chunk!"), | ||||
|     ); | ||||
|     // create the tokenizer | ||||
|     let tokenizer = Tokenizer::new(site.clone(), TokenizerOpts::default()); | ||||
|  | ||||
|         // go thru buffer | ||||
|         while let TokenizerResult::Script(mut sites) = tokenizer.feed(&mut token_buffer) { | ||||
|             other_sites.append(&mut sites); | ||||
|             // other_sites.push(sites); | ||||
|         } | ||||
|  | ||||
|         assert!(token_buffer.is_empty()); | ||||
|         tokenizer.end(); | ||||
|     // go thru buffer | ||||
|     while let TokenizerResult::Script(mut sites) = tokenizer.feed(&mut token_buffer) { | ||||
|         other_sites.append(&mut sites); | ||||
|         // other_sites.push(sites); | ||||
|     } | ||||
|     assert!(token_buffer.is_empty()); | ||||
|     tokenizer.end(); | ||||
|  | ||||
|     { | ||||
|         let mut links_to = Vec::with_capacity(other_sites.len()); | ||||
|  | ||||
|         for a in other_sites { | ||||
|  | ||||
|             let other = a.store(db).await; | ||||
|             if let Some(o) = other { | ||||
|                 links_to.push(o); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         site.links_to(links_to, db).await; | ||||
|     } | ||||
|     other_sites | ||||
| } | ||||
|   | ||||
							
								
								
									
										68
									
								
								src/s3.rs
									
									
									
									
									
								
							
							
						
						
									
										68
									
								
								src/s3.rs
									
									
									
									
									
								
							| @@ -1,4 +1,3 @@ | ||||
| use base64::{alphabet, engine::{self, general_purpose}, Engine}; | ||||
| use metrics::counter; | ||||
| use minio::s3::{ | ||||
|     args::{BucketExistsArgs, MakeBucketArgs}, | ||||
| @@ -11,11 +10,9 @@ use minio::s3::{ | ||||
| use tracing::{instrument, trace, warn}; | ||||
| use url::Url; | ||||
|  | ||||
| use crate::{Config, Timer}; | ||||
|          | ||||
| const CUSTOM_ENGINE: engine::GeneralPurpose = engine::GeneralPurpose::new(&alphabet::URL_SAFE, general_purpose::NO_PAD); | ||||
| use crate::{db::Website, Config, Timer}; | ||||
|  | ||||
| const ROUND_TRIP_METRIC: &'static str = "s3_trips"; | ||||
| const S3_ROUND_TRIP_METRIC: &'static str = "s3_trips"; | ||||
|  | ||||
| #[derive(Clone)] | ||||
| pub struct S3 { | ||||
| @@ -45,6 +42,7 @@ impl S3 { | ||||
|                     .expect("Failed to check if bucket exists"), | ||||
|             ) | ||||
|             .await?; | ||||
|         counter!(S3_ROUND_TRIP_METRIC).increment(1); | ||||
|  | ||||
|         if !exists { | ||||
|             trace!("Creating bucket..."); | ||||
| @@ -54,52 +52,50 @@ impl S3 { | ||||
|                 ) | ||||
|                 .await?; | ||||
|         } | ||||
|         counter!(S3_ROUND_TRIP_METRIC).increment(1); | ||||
|  | ||||
|         trace!("Connection successful"); | ||||
|  | ||||
|         Ok(Self { | ||||
|             bucket_name: config.s3_bucket.to_owned(), | ||||
|             client: client, | ||||
|             client, | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     #[instrument(name = "s3_store", skip_all)] | ||||
|     pub async fn store(&self, data: &str, url: &Url) { | ||||
|         let counter = counter!(ROUND_TRIP_METRIC); | ||||
|         let counter = counter!(S3_ROUND_TRIP_METRIC); | ||||
|         let t = Timer::start("Stored page"); | ||||
|         let _ = t; // prevent compiler drop | ||||
|         if let Some(domain) = url.domain() { | ||||
|             let filename = domain.to_owned() + url.path(); | ||||
|  | ||||
|             trace!("Created filename: {filename} from raw: {}", url.to_string()); | ||||
|         let filename = Website::get_url_as_string(url); | ||||
|         trace!("Storing {} as {filename}", url.to_string()); | ||||
|  | ||||
|             counter.increment(1); | ||||
|             let _ = match &self | ||||
|                 .client | ||||
|                 .put_object_content(&self.bucket_name, &filename, data.to_owned()) | ||||
|                 .send() | ||||
|                 .await { | ||||
|                     Ok(_) => {}, | ||||
|                     Err(err) => { | ||||
|                         match err { | ||||
|                             Error::InvalidObjectName(_) => { | ||||
|         counter.increment(1); | ||||
|         let _ = match &self | ||||
|             .client | ||||
|             .put_object_content(&self.bucket_name, &filename, data.to_owned()) | ||||
|             .send() | ||||
|             .await | ||||
|         { | ||||
|             Ok(_) => {} | ||||
|             Err(err) => match err { | ||||
|                 Error::InvalidObjectName(_) => { | ||||
|                     // This code will really only run if the url has non-english chars | ||||
|                     warn!("Tried storing invalid object name, retrying with Base64 encoding. Last try."); | ||||
|  | ||||
|                                 warn!("Tried storing invalid object name, retrying with Base64 encoding. Last try."); | ||||
|                                  | ||||
|                                 let filename: String = domain.to_owned() + &CUSTOM_ENGINE.encode(url.path()); | ||||
|                     let filename: String = Website::get_url_as_b64_path(url); | ||||
|  | ||||
|                                 counter.increment(1); | ||||
|                                 let _ = &self | ||||
|                                     .client | ||||
|                                     .put_object_content(&self.bucket_name, &filename, data.to_owned()) | ||||
|                                     .send() | ||||
|                                     .await | ||||
|                                     .unwrap(); | ||||
|                             }, | ||||
|                             _ => {}, | ||||
|                        } | ||||
|                     }, | ||||
|                 }; | ||||
|         } | ||||
|                     counter.increment(1); | ||||
|                     let _ = &self | ||||
|                         .client | ||||
|                         .put_object_content(&self.bucket_name, &filename, data.to_owned()) | ||||
|                         .send() | ||||
|                         .await | ||||
|                         .unwrap(); | ||||
|                 } | ||||
|                 _ => {} | ||||
|             }, | ||||
|         }; | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,3 +1,9 @@ | ||||
| DEFINE TABLE IF NOT EXISTS website SCHEMALESS; | ||||
| DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now(); | ||||
|  | ||||
| DEFINE FIELD IF NOT EXISTS site ON TABLE website TYPE string; | ||||
| DEFINE INDEX IF NOT EXISTS idx ON TABLE website COLUMNS site UNIQUE; | ||||
|  | ||||
| DEFINE FIELD IF NOT EXISTS crawled ON TABLE website TYPE bool; | ||||
|  | ||||
| DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now(); | ||||
| DEFINE FIELD IF NOT EXISTS first_accessed_at ON TABLE website VALUE time::now(); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user