Compare commits
	
		
			60 Commits
		
	
	
		
			135a7e4957
			...
			main
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 2c339a36f9 | |||
| 73216f7003 | |||
| 1e59ebd5c4 | |||
| 52d5e101d0 | |||
| 5b728bacd6 | |||
| b0fe7f4761 | |||
| 5ade5e36df | |||
| 95b8af0356 | |||
| ad8d7c606d | |||
| f3a51065b5 | |||
| 343d3a7570 | |||
| e535bcc295 | |||
| a0fd81d956 | |||
| 5cbba33a09 | |||
| 83def7ba27 | |||
| 76e78cc745 | |||
| b4038b76dd | |||
| caa523f1eb | |||
| f7bb0eef16 | |||
| 865f9be8c0 | |||
| 48abc73092 | |||
| 0061866976 | |||
| 9662b68b0d | |||
| 6f98001d8e | |||
| 6790061e22 | |||
| 50606bb69e | |||
| 5850f19cab | |||
| 2c8546e30a | |||
| 4e619d0ebc | |||
| 647c4cd324 | |||
| 7fab961d76 | |||
| d3fff194f4 | |||
| 3497312fd4 | |||
| 0fd76b1734 | |||
| 9bfa8f9108 | |||
| bdb1094a30 | |||
| 9aa2d9ce22 | |||
| 4b557a923c | |||
| c08a20ac00 | |||
| 94912e9125 | |||
| a9465dda6e | |||
| add6f00ed6 | |||
| 4a433a1a77 | |||
| 03cbcd9ae0 | |||
| 6fc71c7a78 | |||
| 96a3ca092a | |||
| b750d88d48 | |||
| 808790a7c3 | |||
| 2de01b2a0e | |||
| be0fd5505b | |||
| a23429104c | |||
| 66581cc453 | |||
| 7df19a480f | |||
| b9c1f0b492 | |||
| 71b7b2d7bc | |||
| bac3cd9d1d | |||
| 1f6a0acce3 | |||
| 53dbf53ab9 | |||
| 0477bb26e4 | |||
| 6409baaffb | 
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -5,3 +5,5 @@ perf.data | |||||||
| flamegraph.svg | flamegraph.svg | ||||||
| perf.data.old | perf.data.old | ||||||
| /docker/logs/* | /docker/logs/* | ||||||
|  | /downloaded | ||||||
|  | /Crawler.toml | ||||||
|   | |||||||
							
								
								
									
										19
									
								
								.vscode/launch.json
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										19
									
								
								.vscode/launch.json
									
									
									
									
										vendored
									
									
								
							| @@ -7,18 +7,15 @@ | |||||||
|         { |         { | ||||||
|             "type": "lldb", |             "type": "lldb", | ||||||
|             "request": "launch", |             "request": "launch", | ||||||
|             "name": "Debug executable 'surreal_spider'", |             "name": "Debug executable 'internet_mapper'", | ||||||
|             "env": { |  | ||||||
|                 "RUST_LOG": "surreal_spider=trace,reqwest=info", |  | ||||||
|             }, |  | ||||||
|             "cargo": { |             "cargo": { | ||||||
|                 "args": [ |                 "args": [ | ||||||
|                     "build", |                     "build", | ||||||
|                     "--bin=surreal_spider", |                     "--bin=internet_mapper", | ||||||
|                     "--package=surreal_spider" |                     "--package=internet_mapper" | ||||||
|                 ], |                 ], | ||||||
|                 "filter": { |                 "filter": { | ||||||
|                     "name": "surreal_spider", |                     "name": "internet_mapper", | ||||||
|                     "kind": "bin" |                     "kind": "bin" | ||||||
|                 } |                 } | ||||||
|             }, |             }, | ||||||
| @@ -28,16 +25,16 @@ | |||||||
|         { |         { | ||||||
|             "type": "lldb", |             "type": "lldb", | ||||||
|             "request": "launch", |             "request": "launch", | ||||||
|             "name": "Debug unit tests in executable 'surreal_spider'", |             "name": "Debug unit tests in executable 'internet_mapper'", | ||||||
|             "cargo": { |             "cargo": { | ||||||
|                 "args": [ |                 "args": [ | ||||||
|                     "test", |                     "test", | ||||||
|                     "--no-run", |                     "--no-run", | ||||||
|                     "--bin=surreal_spider", |                     "--bin=internet_mapper", | ||||||
|                     "--package=surreal_spider" |                     "--package=internet_mapper" | ||||||
|                 ], |                 ], | ||||||
|                 "filter": { |                 "filter": { | ||||||
|                     "name": "surreal_spider", |                     "name": "internet_mapper", | ||||||
|                     "kind": "bin" |                     "kind": "bin" | ||||||
|                 } |                 } | ||||||
|             }, |             }, | ||||||
|   | |||||||
							
								
								
									
										8
									
								
								.vscode/settings.json
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								.vscode/settings.json
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,8 @@ | |||||||
|  | { | ||||||
|  |     "cSpell.words": [ | ||||||
|  |         "creds", | ||||||
|  |         "reqwest", | ||||||
|  |         "rustls", | ||||||
|  |         "surql", | ||||||
|  |     ] | ||||||
|  | } | ||||||
							
								
								
									
										466
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										466
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -103,62 +103,18 @@ dependencies = [ | |||||||
|  "libc", |  "libc", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "anstream" |  | ||||||
| version = "0.6.18" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" |  | ||||||
| dependencies = [ |  | ||||||
|  "anstyle", |  | ||||||
|  "anstyle-parse", |  | ||||||
|  "anstyle-query", |  | ||||||
|  "anstyle-wincon", |  | ||||||
|  "colorchoice", |  | ||||||
|  "is_terminal_polyfill", |  | ||||||
|  "utf8parse", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "anstyle" |  | ||||||
| version = "1.0.10" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" |  | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "anstyle-parse" |  | ||||||
| version = "0.2.6" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" |  | ||||||
| dependencies = [ |  | ||||||
|  "utf8parse", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "anstyle-query" |  | ||||||
| version = "1.1.2" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" |  | ||||||
| dependencies = [ |  | ||||||
|  "windows-sys 0.59.0", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "anstyle-wincon" |  | ||||||
| version = "3.0.7" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" |  | ||||||
| dependencies = [ |  | ||||||
|  "anstyle", |  | ||||||
|  "once_cell", |  | ||||||
|  "windows-sys 0.59.0", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "any_ascii" | name = "any_ascii" | ||||||
| version = "0.3.2" | version = "0.3.2" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "ea50b14b7a4b9343f8c627a7a53c52076482bd4bdad0a24fd3ec533ed616cc2c" | checksum = "ea50b14b7a4b9343f8c627a7a53c52076482bd4bdad0a24fd3ec533ed616cc2c" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "anyhow" | ||||||
|  | version = "1.0.98" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "approx" | name = "approx" | ||||||
| version = "0.4.0" | version = "0.4.0" | ||||||
| @@ -325,17 +281,6 @@ dependencies = [ | |||||||
|  "serde_json", |  "serde_json", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "async-recursion" |  | ||||||
| version = "1.1.1" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" |  | ||||||
| dependencies = [ |  | ||||||
|  "proc-macro2", |  | ||||||
|  "quote", |  | ||||||
|  "syn 2.0.100", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "async-stream" | name = "async-stream" | ||||||
| version = "0.3.6" | version = "0.3.6" | ||||||
| @@ -815,12 +760,6 @@ dependencies = [ | |||||||
|  "cc", |  "cc", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "colorchoice" |  | ||||||
| version = "1.0.3" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "concurrent-queue" | name = "concurrent-queue" | ||||||
| version = "2.5.0" | version = "2.5.0" | ||||||
| @@ -871,21 +810,6 @@ dependencies = [ | |||||||
|  "libc", |  "libc", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "crc" |  | ||||||
| version = "3.2.1" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636" |  | ||||||
| dependencies = [ |  | ||||||
|  "crc-catalog", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "crc-catalog" |  | ||||||
| version = "2.4.0" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "crc32fast" | name = "crc32fast" | ||||||
| version = "1.4.2" | version = "1.4.2" | ||||||
| @@ -984,20 +908,6 @@ dependencies = [ | |||||||
|  "parking_lot_core", |  "parking_lot_core", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "dashmap" |  | ||||||
| version = "6.1.0" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" |  | ||||||
| dependencies = [ |  | ||||||
|  "cfg-if", |  | ||||||
|  "crossbeam-utils", |  | ||||||
|  "hashbrown 0.14.5", |  | ||||||
|  "lock_api", |  | ||||||
|  "once_cell", |  | ||||||
|  "parking_lot_core", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "data-encoding" | name = "data-encoding" | ||||||
| version = "2.8.0" | version = "2.8.0" | ||||||
| @@ -1014,17 +924,6 @@ dependencies = [ | |||||||
|  "serde", |  "serde", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "derivative" |  | ||||||
| version = "2.2.0" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" |  | ||||||
| dependencies = [ |  | ||||||
|  "proc-macro2", |  | ||||||
|  "quote", |  | ||||||
|  "syn 1.0.109", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "deunicode" | name = "deunicode" | ||||||
| version = "1.6.1" | version = "1.6.1" | ||||||
| @@ -1135,29 +1034,6 @@ version = "0.1.2" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" | checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "env_filter" |  | ||||||
| version = "0.1.3" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" |  | ||||||
| dependencies = [ |  | ||||||
|  "log", |  | ||||||
|  "regex", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "env_logger" |  | ||||||
| version = "0.11.7" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "c3716d7a920fb4fac5d84e9d4bce8ceb321e9414b4409da61b07b75c1e3d0697" |  | ||||||
| dependencies = [ |  | ||||||
|  "anstream", |  | ||||||
|  "anstyle", |  | ||||||
|  "env_filter", |  | ||||||
|  "jiff", |  | ||||||
|  "log", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "equivalent" | name = "equivalent" | ||||||
| version = "1.0.2" | version = "1.0.2" | ||||||
| @@ -1727,6 +1603,19 @@ dependencies = [ | |||||||
|  "webpki-roots", |  "webpki-roots", | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "hyper-timeout" | ||||||
|  | version = "0.5.2" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" | ||||||
|  | dependencies = [ | ||||||
|  |  "hyper", | ||||||
|  |  "hyper-util", | ||||||
|  |  "pin-project-lite", | ||||||
|  |  "tokio", | ||||||
|  |  "tower-service", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "hyper-tls" | name = "hyper-tls" | ||||||
| version = "0.6.0" | version = "0.6.0" | ||||||
| @@ -1961,18 +1850,35 @@ dependencies = [ | |||||||
|  "generic-array", |  "generic-array", | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "internet_mapper" | ||||||
|  | version = "0.1.0" | ||||||
|  | dependencies = [ | ||||||
|  |  "base64 0.22.1", | ||||||
|  |  "futures-util", | ||||||
|  |  "html5ever 0.29.1", | ||||||
|  |  "metrics", | ||||||
|  |  "metrics-exporter-prometheus", | ||||||
|  |  "opentelemetry", | ||||||
|  |  "opentelemetry-otlp", | ||||||
|  |  "opentelemetry_sdk", | ||||||
|  |  "rand 0.9.1", | ||||||
|  |  "reqwest", | ||||||
|  |  "serde", | ||||||
|  |  "surrealdb", | ||||||
|  |  "tokio", | ||||||
|  |  "toml", | ||||||
|  |  "tracing", | ||||||
|  |  "tracing-subscriber", | ||||||
|  |  "url", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "ipnet" | name = "ipnet" | ||||||
| version = "2.11.0" | version = "2.11.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" | checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "is_terminal_polyfill" |  | ||||||
| version = "1.70.1" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "itertools" | name = "itertools" | ||||||
| version = "0.10.5" | version = "0.10.5" | ||||||
| @@ -2006,30 +1912,6 @@ version = "1.0.15" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "jiff" |  | ||||||
| version = "0.2.4" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "d699bc6dfc879fb1bf9bdff0d4c56f0884fc6f0d0eb0fba397a6d00cd9a6b85e" |  | ||||||
| dependencies = [ |  | ||||||
|  "jiff-static", |  | ||||||
|  "log", |  | ||||||
|  "portable-atomic", |  | ||||||
|  "portable-atomic-util", |  | ||||||
|  "serde", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "jiff-static" |  | ||||||
| version = "0.2.4" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "8d16e75759ee0aa64c57a56acbf43916987b20c77373cb7e808979e02b93c9f9" |  | ||||||
| dependencies = [ |  | ||||||
|  "proc-macro2", |  | ||||||
|  "quote", |  | ||||||
|  "syn 2.0.100", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "jobserver" | name = "jobserver" | ||||||
| version = "0.1.32" | version = "0.1.32" | ||||||
| @@ -2274,12 +2156,6 @@ dependencies = [ | |||||||
|  "digest", |  "digest", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "md5" |  | ||||||
| version = "0.7.0" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "memchr" | name = "memchr" | ||||||
| version = "2.7.4" | version = "2.7.4" | ||||||
| @@ -2378,46 +2254,6 @@ version = "0.2.1" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" | checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "minio" |  | ||||||
| version = "0.2.0-alpha" |  | ||||||
| source = "git+https://github.com/minio/minio-rs.git?rev=c28f576#c28f576cb8f8cf47fb941bb9db62b2cbd6f080c1" |  | ||||||
| dependencies = [ |  | ||||||
|  "async-recursion", |  | ||||||
|  "async-trait", |  | ||||||
|  "base64 0.22.1", |  | ||||||
|  "byteorder", |  | ||||||
|  "bytes", |  | ||||||
|  "chrono", |  | ||||||
|  "crc", |  | ||||||
|  "dashmap 6.1.0", |  | ||||||
|  "derivative", |  | ||||||
|  "env_logger", |  | ||||||
|  "futures-util", |  | ||||||
|  "hex", |  | ||||||
|  "hmac", |  | ||||||
|  "home", |  | ||||||
|  "http", |  | ||||||
|  "hyper", |  | ||||||
|  "lazy_static", |  | ||||||
|  "log", |  | ||||||
|  "md5", |  | ||||||
|  "multimap", |  | ||||||
|  "os_info", |  | ||||||
|  "percent-encoding", |  | ||||||
|  "rand 0.8.5", |  | ||||||
|  "regex", |  | ||||||
|  "reqwest", |  | ||||||
|  "serde", |  | ||||||
|  "serde_json", |  | ||||||
|  "sha2", |  | ||||||
|  "tokio", |  | ||||||
|  "tokio-stream", |  | ||||||
|  "tokio-util", |  | ||||||
|  "urlencoding", |  | ||||||
|  "xmltree", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "miniz_oxide" | name = "miniz_oxide" | ||||||
| version = "0.8.5" | version = "0.8.5" | ||||||
| @@ -2455,15 +2291,6 @@ dependencies = [ | |||||||
|  "version_check", |  "version_check", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "multimap" |  | ||||||
| version = "0.10.0" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03" |  | ||||||
| dependencies = [ |  | ||||||
|  "serde", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "nanoid" | name = "nanoid" | ||||||
| version = "0.4.0" | version = "0.4.0" | ||||||
| @@ -2716,14 +2543,77 @@ dependencies = [ | |||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "os_info" | name = "opentelemetry" | ||||||
| version = "3.10.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "2a604e53c24761286860eba4e2c8b23a0161526476b1de520139d69cdb85a6b5" | checksum = "aaf416e4cb72756655126f7dd7bb0af49c674f4c1b9903e80c009e0c37e552e6" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "log", |  "futures-core", | ||||||
|  "serde", |  "futures-sink", | ||||||
|  "windows-sys 0.52.0", |  "js-sys", | ||||||
|  |  "pin-project-lite", | ||||||
|  |  "thiserror 2.0.12", | ||||||
|  |  "tracing", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "opentelemetry-http" | ||||||
|  | version = "0.30.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "50f6639e842a97dbea8886e3439710ae463120091e2e064518ba8e716e6ac36d" | ||||||
|  | dependencies = [ | ||||||
|  |  "async-trait", | ||||||
|  |  "bytes", | ||||||
|  |  "http", | ||||||
|  |  "opentelemetry", | ||||||
|  |  "reqwest", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "opentelemetry-otlp" | ||||||
|  | version = "0.30.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "dbee664a43e07615731afc539ca60c6d9f1a9425e25ca09c57bc36c87c55852b" | ||||||
|  | dependencies = [ | ||||||
|  |  "http", | ||||||
|  |  "opentelemetry", | ||||||
|  |  "opentelemetry-http", | ||||||
|  |  "opentelemetry-proto", | ||||||
|  |  "opentelemetry_sdk", | ||||||
|  |  "prost", | ||||||
|  |  "reqwest", | ||||||
|  |  "thiserror 2.0.12", | ||||||
|  |  "tokio", | ||||||
|  |  "tonic", | ||||||
|  |  "tracing", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "opentelemetry-proto" | ||||||
|  | version = "0.30.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "2e046fd7660710fe5a05e8748e70d9058dc15c94ba914e7c4faa7c728f0e8ddc" | ||||||
|  | dependencies = [ | ||||||
|  |  "opentelemetry", | ||||||
|  |  "opentelemetry_sdk", | ||||||
|  |  "prost", | ||||||
|  |  "tonic", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "opentelemetry_sdk" | ||||||
|  | version = "0.30.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "11f644aa9e5e31d11896e024305d7e3c98a88884d9f8919dbf37a9991bc47a4b" | ||||||
|  | dependencies = [ | ||||||
|  |  "futures-channel", | ||||||
|  |  "futures-executor", | ||||||
|  |  "futures-util", | ||||||
|  |  "opentelemetry", | ||||||
|  |  "percent-encoding", | ||||||
|  |  "rand 0.9.1", | ||||||
|  |  "serde_json", | ||||||
|  |  "thiserror 2.0.12", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| @@ -2903,6 +2793,26 @@ version = "0.5.0" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "5be167a7af36ee22fe3115051bc51f6e6c7054c9348e28deb4f49bd6f705a315" | checksum = "5be167a7af36ee22fe3115051bc51f6e6c7054c9348e28deb4f49bd6f705a315" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "pin-project" | ||||||
|  | version = "1.1.10" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" | ||||||
|  | dependencies = [ | ||||||
|  |  "pin-project-internal", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "pin-project-internal" | ||||||
|  | version = "1.1.10" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" | ||||||
|  | dependencies = [ | ||||||
|  |  "proc-macro2", | ||||||
|  |  "quote", | ||||||
|  |  "syn 2.0.100", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "pin-project-lite" | name = "pin-project-lite" | ||||||
| version = "0.2.16" | version = "0.2.16" | ||||||
| @@ -2927,15 +2837,6 @@ version = "1.11.0" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" | checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "portable-atomic-util" |  | ||||||
| version = "0.2.4" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" |  | ||||||
| dependencies = [ |  | ||||||
|  "portable-atomic", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "powerfmt" | name = "powerfmt" | ||||||
| version = "0.2.0" | version = "0.2.0" | ||||||
| @@ -2985,6 +2886,29 @@ dependencies = [ | |||||||
|  "unicode-ident", |  "unicode-ident", | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "prost" | ||||||
|  | version = "0.13.5" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" | ||||||
|  | dependencies = [ | ||||||
|  |  "bytes", | ||||||
|  |  "prost-derive", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "prost-derive" | ||||||
|  | version = "0.13.5" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" | ||||||
|  | dependencies = [ | ||||||
|  |  "anyhow", | ||||||
|  |  "itertools 0.13.0", | ||||||
|  |  "proc-macro2", | ||||||
|  |  "quote", | ||||||
|  |  "syn 2.0.100", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "psl-types" | name = "psl-types" | ||||||
| version = "2.0.11" | version = "2.0.11" | ||||||
| @@ -3075,7 +2999,7 @@ checksum = "b820744eb4dc9b57a3398183639c511b5a26d2ed702cedd3febaa1393caa22cc" | |||||||
| dependencies = [ | dependencies = [ | ||||||
|  "bytes", |  "bytes", | ||||||
|  "getrandom 0.3.2", |  "getrandom 0.3.2", | ||||||
|  "rand 0.9.0", |  "rand 0.9.1", | ||||||
|  "ring", |  "ring", | ||||||
|  "rustc-hash 2.1.1", |  "rustc-hash 2.1.1", | ||||||
|  "rustls", |  "rustls", | ||||||
| @@ -3146,13 +3070,12 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "rand" | name = "rand" | ||||||
| version = "0.9.0" | version = "0.9.1" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" | checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "rand_chacha 0.9.0", |  "rand_chacha 0.9.0", | ||||||
|  "rand_core 0.9.3", |  "rand_core 0.9.3", | ||||||
|  "zerocopy 0.8.23", |  | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| @@ -3346,6 +3269,7 @@ dependencies = [ | |||||||
|  "base64 0.22.1", |  "base64 0.22.1", | ||||||
|  "bytes", |  "bytes", | ||||||
|  "encoding_rs", |  "encoding_rs", | ||||||
|  |  "futures-channel", | ||||||
|  "futures-core", |  "futures-core", | ||||||
|  "futures-util", |  "futures-util", | ||||||
|  "h2", |  "h2", | ||||||
| @@ -4112,25 +4036,6 @@ version = "2.6.1" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" | checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "surreal_spider" |  | ||||||
| version = "0.1.0" |  | ||||||
| dependencies = [ |  | ||||||
|  "base64 0.22.1", |  | ||||||
|  "html5ever 0.29.1", |  | ||||||
|  "metrics", |  | ||||||
|  "metrics-exporter-prometheus", |  | ||||||
|  "minio", |  | ||||||
|  "reqwest", |  | ||||||
|  "serde", |  | ||||||
|  "surrealdb", |  | ||||||
|  "tokio", |  | ||||||
|  "toml", |  | ||||||
|  "tracing", |  | ||||||
|  "tracing-subscriber", |  | ||||||
|  "url", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "surrealdb" | name = "surrealdb" | ||||||
| version = "2.2.1" | version = "2.2.1" | ||||||
| @@ -4195,7 +4100,7 @@ dependencies = [ | |||||||
|  "cedar-policy", |  "cedar-policy", | ||||||
|  "chrono", |  "chrono", | ||||||
|  "ciborium", |  "ciborium", | ||||||
|  "dashmap 5.5.3", |  "dashmap", | ||||||
|  "deunicode", |  "deunicode", | ||||||
|  "dmp", |  "dmp", | ||||||
|  "fst", |  "fst", | ||||||
| @@ -4618,6 +4523,32 @@ dependencies = [ | |||||||
|  "winnow", |  "winnow", | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "tonic" | ||||||
|  | version = "0.13.1" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9" | ||||||
|  | dependencies = [ | ||||||
|  |  "async-trait", | ||||||
|  |  "base64 0.22.1", | ||||||
|  |  "bytes", | ||||||
|  |  "http", | ||||||
|  |  "http-body", | ||||||
|  |  "http-body-util", | ||||||
|  |  "hyper", | ||||||
|  |  "hyper-timeout", | ||||||
|  |  "hyper-util", | ||||||
|  |  "percent-encoding", | ||||||
|  |  "pin-project", | ||||||
|  |  "prost", | ||||||
|  |  "tokio", | ||||||
|  |  "tokio-stream", | ||||||
|  |  "tower", | ||||||
|  |  "tower-layer", | ||||||
|  |  "tower-service", | ||||||
|  |  "tracing", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "tower" | name = "tower" | ||||||
| version = "0.5.2" | version = "0.5.2" | ||||||
| @@ -4626,11 +4557,15 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" | |||||||
| dependencies = [ | dependencies = [ | ||||||
|  "futures-core", |  "futures-core", | ||||||
|  "futures-util", |  "futures-util", | ||||||
|  |  "indexmap 2.8.0", | ||||||
|  "pin-project-lite", |  "pin-project-lite", | ||||||
|  |  "slab", | ||||||
|  "sync_wrapper", |  "sync_wrapper", | ||||||
|  "tokio", |  "tokio", | ||||||
|  |  "tokio-util", | ||||||
|  "tower-layer", |  "tower-layer", | ||||||
|  "tower-service", |  "tower-service", | ||||||
|  |  "tracing", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| @@ -4776,7 +4711,7 @@ version = "1.2.1" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "470dbf6591da1b39d43c14523b2b469c86879a53e8b758c8e090a470fe7b1fbe" | checksum = "470dbf6591da1b39d43c14523b2b469c86879a53e8b758c8e090a470fe7b1fbe" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "rand 0.9.0", |  "rand 0.9.1", | ||||||
|  "serde", |  "serde", | ||||||
|  "web-time", |  "web-time", | ||||||
| ] | ] | ||||||
| @@ -4872,12 +4807,6 @@ version = "1.0.4" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" | checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "utf8parse" |  | ||||||
| version = "0.2.2" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "uuid" | name = "uuid" | ||||||
| version = "1.16.0" | version = "1.16.0" | ||||||
| @@ -5418,21 +5347,6 @@ dependencies = [ | |||||||
|  "tap", |  "tap", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "xml-rs" |  | ||||||
| version = "0.8.25" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "c5b940ebc25896e71dd073bad2dbaa2abfe97b0a391415e22ad1326d9c54e3c4" |  | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "xmltree" |  | ||||||
| version = "0.11.0" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "b619f8c85654798007fb10afa5125590b43b088c225a25fc2fec100a9fad0fc6" |  | ||||||
| dependencies = [ |  | ||||||
|  "xml-rs", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "yoke" | name = "yoke" | ||||||
| version = "0.7.5" | version = "0.7.5" | ||||||
|   | |||||||
							
								
								
									
										11
									
								
								Cargo.toml
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								Cargo.toml
									
									
									
									
									
								
							| @@ -1,16 +1,19 @@ | |||||||
| [package] | [package] | ||||||
| name = "surreal_spider" | name = "internet_mapper" | ||||||
| version = "0.1.0" | version = "0.1.0" | ||||||
| edition = "2021" | edition = "2021" | ||||||
|  |  | ||||||
| [dependencies] | [dependencies] | ||||||
| base64 = "0.22.1" | base64 = "0.22.1" | ||||||
|  | futures-util = "0.3.31" | ||||||
| html5ever = "0.29" | html5ever = "0.29" | ||||||
| metrics = "0.24.1" | metrics = "0.24.1" | ||||||
| metrics-exporter-prometheus = { version = "0.16.2", features=["http-listener"]} | metrics-exporter-prometheus = { version = "0.16.2", features=["http-listener"]} | ||||||
| # minio = "0.1.0" | opentelemetry = "0.30.0" | ||||||
| minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"} | opentelemetry-otlp = { version = "0.30.0", features = ["metrics", "trace", "logs", "grpc-tonic"] } | ||||||
| reqwest = { version = "0.12", features = ["gzip", "default", "rustls-tls"] } | opentelemetry_sdk = "0.30.0" | ||||||
|  | rand = "0.9.1" | ||||||
|  | reqwest = { version = "0.12", features = ["gzip", "default", "rustls-tls", "stream"] } | ||||||
| serde = { version = "1.0", features = ["derive"] } | serde = { version = "1.0", features = ["derive"] } | ||||||
| surrealdb = "2.2" | surrealdb = "2.2" | ||||||
| tokio = { version="1.41.0", features = ["full"] } | tokio = { version="1.41.0", features = ["full"] } | ||||||
|   | |||||||
							
								
								
									
										24
									
								
								Crawler.toml
									
									
									
									
									
								
							
							
						
						
									
										24
									
								
								Crawler.toml
									
									
									
									
									
								
							| @@ -1,16 +1,22 @@ | |||||||
|  | # Visability config | ||||||
|  | # Alloy (for Tempo) | ||||||
|  | tracing_endpoint = "http://localhost:4317" | ||||||
|  | # Prometheus | ||||||
|  | metrics_endpoint = "http://localhost:9090/api/v1/otlp/v1/metrics" | ||||||
|  | # Alloy (for Loki) | ||||||
|  | log_file = "./docker/logs/tracing.log" | ||||||
|  |  | ||||||
| # Surreal config | # Surreal config | ||||||
| surreal_url = "localhost:8000" | surreal_url = "localhost:8000" | ||||||
| surreal_username = "root" | surreal_username = "root" | ||||||
| surreal_password = "root" | surreal_password = "root" | ||||||
| surreal_ns = "test" | surreal_ns = "test" | ||||||
| surreal_db = "v1.12" | surreal_db = "v1.21.1" | ||||||
|  |  | ||||||
| # Minio config |  | ||||||
| s3_bucket = "v1.12" |  | ||||||
| s3_url = "http://localhost:9000" |  | ||||||
| s3_access_key = "jLDPKGuu513VENc8kJwX" |  | ||||||
| s3_secret_key = "4T1nymEzsGYOlKSAb1WX7V3stnQn9a5ZoTQjDfcL" |  | ||||||
|  |  | ||||||
| # Crawler config | # Crawler config | ||||||
| crawl_filter = "en.wikipedia.com"  | crawl_filter = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/"  | ||||||
| budget = 200 | # crawl_filter = "https://oliveratkinson.net"  | ||||||
|  | start_url = "https://ftpgeoinfo.msl.mt.gov/Data/Spatial/MSDI/Imagery/2023_NAIP/UTM_County_Mosaics/" | ||||||
|  | # start_url = "https://oliveratkinson.net" | ||||||
|  | budget = 100 | ||||||
|  | batch_size = 2 | ||||||
|   | |||||||
							
								
								
									
										54
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										54
									
								
								README.md
									
									
									
									
									
								
							| @@ -2,10 +2,58 @@ | |||||||
|  |  | ||||||
| Crawls sites saving all the found links to a surrealdb database. It then proceeds to take batches of 100 uncrawled links untill the crawl budget is reached. It saves the data of each site in a minio database. | Crawls sites saving all the found links to a surrealdb database. It then proceeds to take batches of 100 uncrawled links untill the crawl budget is reached. It saves the data of each site in a minio database. | ||||||
|  |  | ||||||
|  | ## How to use | ||||||
|  |  | ||||||
|  | 1. Clone the repo and `cd` into it. | ||||||
|  | 2. Build the repo with `cargo build -r` | ||||||
|  | 3. Start the docker conatiners | ||||||
|  | 	1. cd into the docker folder `cd docker` | ||||||
|  | 	2. Bring up the docker containers `docker compose up -d` | ||||||
|  | 4. From the project's root, edit the `Crawler.toml` file to your liking. | ||||||
|  | 5. Run with `./target/release/internet_mapper` | ||||||
|  |  | ||||||
|  | You can view stats of the project at `http://<your-ip>:3000/dashboards` | ||||||
|  |  | ||||||
|  | ```bash | ||||||
|  | # Untested script but probably works | ||||||
|  | git clone https://git.oliveratkinson.net/Oliver/internet_mapper.git | ||||||
|  | cd internet_mapper | ||||||
|  |  | ||||||
|  | cargo build -r | ||||||
|  |  | ||||||
|  | cd docker | ||||||
|  | docker compose up -d | ||||||
|  | cd .. | ||||||
|  |  | ||||||
|  | $EDITOR Crawler.toml | ||||||
|  |  | ||||||
|  | ./target/release/internet_mapper | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  |  | ||||||
| ### TODO | ### TODO | ||||||
|  |  | ||||||
| - [ ] Domain filtering - prevent the crawler from going on alternate versions of wikipedia. | - [x] Domain filtering - prevent the crawler from going on alternate versions of wikipedia. | ||||||
| - [ ] Conditionally save content - based on filename or file contents | - [ ] Conditionally save content - based on filename or file contents | ||||||
| - [ ] GUI / TUI ? | - [x] GUI / TUI ? - Graphana | ||||||
| - [ ] Better asynchronous getting of the sites. Currently it all happens serially. | - [x] Better asynchronous getting of the sites. Currently it all happens serially. | ||||||
|  | - [x] Allow for storing asynchronously - dropping the "links to" logic fixes this need | ||||||
|  | - [x] Control crawler via config file (no recompliation needed) | ||||||
|  |  | ||||||
|  | ### Feats | ||||||
|  |  | ||||||
|  | 3/17/25: Took >1hr to crawl 100 pages. | ||||||
|  |  | ||||||
|  | 3/19/25: Took 20min to crawl 1000 pages. | ||||||
|  | This ment we stored 1000 pages, 142,997 urls, and 1,425,798 links between the two. | ||||||
|  |  | ||||||
|  | 3/20/25: Took 5min to crawl 1000 pages. | ||||||
|  |  | ||||||
|  | 3/21/25: Took 3min to crawl 1000 pages. | ||||||
|  |  | ||||||
|  | 7/.../25: Downloaded just shy of 12TB of data from a remote server. | ||||||
|  |  | ||||||
|  | # About | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,8 +3,8 @@ local.file_match "tmplogs" { | |||||||
| } | } | ||||||
|  |  | ||||||
| loki.source.file "local_files" { | loki.source.file "local_files" { | ||||||
|     targets    = local.file_match.tmplogs.targets |     targets    	= local.file_match.tmplogs.targets | ||||||
|     forward_to = [loki.write.local_loki.receiver] |     forward_to 	= [loki.write.local_loki.receiver] | ||||||
| } | } | ||||||
|  |  | ||||||
| loki.write "local_loki" { | loki.write "local_loki" { | ||||||
| @@ -12,3 +12,25 @@ loki.write "local_loki" { | |||||||
|         url = "http://loki:3100/loki/api/v1/push" |         url = "http://loki:3100/loki/api/v1/push" | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | otelcol.receiver.otlp "otlp_receiver" { | ||||||
|  |   grpc { | ||||||
|  |     endpoint = "0.0.0.0:4317" | ||||||
|  |   } | ||||||
|  |   http { | ||||||
|  |     endpoint = "0.0.0.0:4318" | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   output { | ||||||
|  |     traces = [otelcol.exporter.otlp.tempo.input,] | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | otelcol.exporter.otlp "tempo" { | ||||||
|  |   client { | ||||||
|  |     endpoint = "tempo:4317" | ||||||
|  |     tls { | ||||||
|  | 			insecure = true | ||||||
|  | 		} | ||||||
|  |   } | ||||||
|  | } | ||||||
|   | |||||||
| @@ -1,4 +1,6 @@ | |||||||
| services: | services: | ||||||
|  |  | ||||||
|  |   # Database | ||||||
|   surreal: |   surreal: | ||||||
|     image: surrealdb/surrealdb:latest-dev |     image: surrealdb/surrealdb:latest-dev | ||||||
|     ports: |     ports: | ||||||
| @@ -14,22 +16,19 @@ services: | |||||||
|       - --pass |       - --pass | ||||||
|       - root |       - root | ||||||
|       - rocksdb:/mydata/database.db |       - rocksdb:/mydata/database.db | ||||||
|   minio: |  | ||||||
|     image: quay.io/minio/minio |  | ||||||
|     ports: |  | ||||||
|       - 9000:9000 |  | ||||||
|       - 9001:9001 |  | ||||||
|     environment: |  | ||||||
|       - MINIO_ROOT_USER=root |  | ||||||
|       - MINIO_ROOT_PASSWORD=an8charpassword |  | ||||||
|     volumes: |  | ||||||
|       - minio_storage:/data |  | ||||||
|     command: |  | ||||||
|       - server |  | ||||||
|       - /data |  | ||||||
|       - --console-address |  | ||||||
|       - ":9001" |  | ||||||
|  |  | ||||||
|  |   # Tracing | ||||||
|  |   tempo: | ||||||
|  |     image: grafana/tempo:latest | ||||||
|  |     command: [ "-config.file=/etc/tempo.yaml" ] | ||||||
|  |     volumes: | ||||||
|  |       - ./tempo.yaml:/etc/tempo.yaml | ||||||
|  |       - tempo_storage:/var/tempo | ||||||
|  |     ports: | ||||||
|  |       - 3200:3200 # self metrics for prometheus | ||||||
|  |       - 4317:4317 # otlp grpc - (alloy)  | ||||||
|  |  | ||||||
|  |   # Log scraper | ||||||
|   alloy: |   alloy: | ||||||
|     image: grafana/alloy:latest |     image: grafana/alloy:latest | ||||||
|     ports: |     ports: | ||||||
| @@ -39,9 +38,13 @@ services: | |||||||
|       - ./logs/:/tmp/alloy-logs |       - ./logs/:/tmp/alloy-logs | ||||||
|       - ./alloy.conf:/etc/alloy/config.alloy |       - ./alloy.conf:/etc/alloy/config.alloy | ||||||
|       - alloy_storage:/var/lib/alloy |       - alloy_storage:/var/lib/alloy | ||||||
|     command:  run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy |     command: | ||||||
|  |       - run | ||||||
|  |       - --server.http.listen-addr=0.0.0.0:12345 | ||||||
|  |       - --storage.path=/var/lib/alloy/data | ||||||
|  |       - /etc/alloy/config.alloy | ||||||
|  |  | ||||||
|   #logs |   # Log storage / analysis | ||||||
|   loki: |   loki: | ||||||
|     image: grafana/loki:latest |     image: grafana/loki:latest | ||||||
|     ports: |     ports: | ||||||
| @@ -50,22 +53,28 @@ services: | |||||||
|     volumes: |     volumes: | ||||||
|       - ./loki.yaml:/etc/loki/local-config.yaml |       - ./loki.yaml:/etc/loki/local-config.yaml | ||||||
|  |  | ||||||
|   # Metrics collector |   # Metrics | ||||||
|   prometheus: |   prometheus: | ||||||
|     image: prom/prometheus:latest |     image: prom/prometheus:latest | ||||||
|     expose: |     ports: | ||||||
|       - 9090 |       - 9090:9090 | ||||||
|     volumes: |     volumes: | ||||||
|       - ./prometheus.yaml:/etc/prometheus/prometheus.yml |       - ./prometheus.yaml:/etc/prometheus/prometheus.yml | ||||||
|       # persist data |       # persist data | ||||||
|       - prometheus_storage:/prometheus |       # - prometheus_storage:/prometheus | ||||||
|     command: --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml |     command: | ||||||
|  |       - --enable-feature=native-histograms | ||||||
|  |       - --web.enable-remote-write-receiver | ||||||
|  |       - --web.enable-lifecycle | ||||||
|  |       - --web.enable-otlp-receiver | ||||||
|  |       - --config.file=/etc/prometheus/prometheus.yml | ||||||
|  |  | ||||||
|   # Everything viewer |   # Everything viewer | ||||||
|   grafana: |   grafana: | ||||||
|     image: grafana/grafana:latest |     image: grafana/grafana:latest | ||||||
|     volumes: |     volumes: | ||||||
|       - ./grafana.yaml:/etc/grafana/provisioning/datasources/datasources.yaml |       - ./grafana.yaml:/etc/grafana/provisioning/datasources/datasources.yaml | ||||||
|  |       - ./dashboards:/var/lib/grafana/dashboards | ||||||
|       - grafana_storage:/var/lib/grafana |       - grafana_storage:/var/lib/grafana | ||||||
|     environment: |     environment: | ||||||
|       - GF_AUTH_ANONYMOUS_ENABLED=true |       - GF_AUTH_ANONYMOUS_ENABLED=true | ||||||
| @@ -80,4 +89,4 @@ volumes: | |||||||
|   grafana_storage: |   grafana_storage: | ||||||
|   alloy_storage: |   alloy_storage: | ||||||
|   surrealdb_storage: |   surrealdb_storage: | ||||||
|   minio_storage: |   tempo_storage: | ||||||
|   | |||||||
							
								
								
									
										648
									
								
								docker/dashboards/crawler-dashboard.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										648
									
								
								docker/dashboards/crawler-dashboard.json
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,648 @@ | |||||||
|  | { | ||||||
|  |   "__inputs": [ | ||||||
|  |     { | ||||||
|  |       "name": "DS_PROMETHEUS", | ||||||
|  |       "label": "Prometheus", | ||||||
|  |       "description": "", | ||||||
|  |       "type": "datasource", | ||||||
|  |       "pluginId": "prometheus", | ||||||
|  |       "pluginName": "Prometheus" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "name": "DS_LOKI", | ||||||
|  |       "label": "Loki", | ||||||
|  |       "description": "", | ||||||
|  |       "type": "datasource", | ||||||
|  |       "pluginId": "loki", | ||||||
|  |       "pluginName": "Loki" | ||||||
|  |     } | ||||||
|  |   ], | ||||||
|  |   "__elements": {}, | ||||||
|  |   "__requires": [ | ||||||
|  |     { | ||||||
|  |       "type": "grafana", | ||||||
|  |       "id": "grafana", | ||||||
|  |       "name": "Grafana", | ||||||
|  |       "version": "11.3.1" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "type": "panel", | ||||||
|  |       "id": "logs", | ||||||
|  |       "name": "Logs", | ||||||
|  |       "version": "" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "type": "datasource", | ||||||
|  |       "id": "loki", | ||||||
|  |       "name": "Loki", | ||||||
|  |       "version": "1.0.0" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "type": "datasource", | ||||||
|  |       "id": "prometheus", | ||||||
|  |       "name": "Prometheus", | ||||||
|  |       "version": "1.0.0" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "type": "panel", | ||||||
|  |       "id": "stat", | ||||||
|  |       "name": "Stat", | ||||||
|  |       "version": "" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "type": "panel", | ||||||
|  |       "id": "timeseries", | ||||||
|  |       "name": "Time series", | ||||||
|  |       "version": "" | ||||||
|  |     } | ||||||
|  |   ], | ||||||
|  |   "annotations": { | ||||||
|  |     "list": [ | ||||||
|  |       { | ||||||
|  |         "builtIn": 1, | ||||||
|  |         "datasource": { | ||||||
|  |           "type": "grafana", | ||||||
|  |           "uid": "-- Grafana --" | ||||||
|  |         }, | ||||||
|  |         "enable": true, | ||||||
|  |         "hide": true, | ||||||
|  |         "iconColor": "rgba(0, 211, 255, 1)", | ||||||
|  |         "name": "Annotations & Alerts", | ||||||
|  |         "type": "dashboard" | ||||||
|  |       } | ||||||
|  |     ] | ||||||
|  |   }, | ||||||
|  |   "editable": true, | ||||||
|  |   "fiscalYearStartMonth": 0, | ||||||
|  |   "graphTooltip": 0, | ||||||
|  |   "id": null, | ||||||
|  |   "links": [], | ||||||
|  |   "panels": [ | ||||||
|  |     { | ||||||
|  |       "datasource": { | ||||||
|  |         "type": "prometheus", | ||||||
|  |         "uid": "${DS_PROMETHEUS}" | ||||||
|  |       }, | ||||||
|  |       "fieldConfig": { | ||||||
|  |         "defaults": { | ||||||
|  |           "color": { | ||||||
|  |             "mode": "palette-classic" | ||||||
|  |           }, | ||||||
|  |           "custom": { | ||||||
|  |             "axisBorderShow": false, | ||||||
|  |             "axisCenteredZero": false, | ||||||
|  |             "axisColorMode": "text", | ||||||
|  |             "axisLabel": "", | ||||||
|  |             "axisPlacement": "auto", | ||||||
|  |             "barAlignment": 0, | ||||||
|  |             "barWidthFactor": 0.6, | ||||||
|  |             "drawStyle": "line", | ||||||
|  |             "fillOpacity": 0, | ||||||
|  |             "gradientMode": "none", | ||||||
|  |             "hideFrom": { | ||||||
|  |               "legend": false, | ||||||
|  |               "tooltip": false, | ||||||
|  |               "viz": false | ||||||
|  |             }, | ||||||
|  |             "insertNulls": false, | ||||||
|  |             "lineInterpolation": "linear", | ||||||
|  |             "lineWidth": 1, | ||||||
|  |             "pointSize": 5, | ||||||
|  |             "scaleDistribution": { | ||||||
|  |               "type": "linear" | ||||||
|  |             }, | ||||||
|  |             "showPoints": "auto", | ||||||
|  |             "spanNulls": 300000, | ||||||
|  |             "stacking": { | ||||||
|  |               "group": "A", | ||||||
|  |               "mode": "none" | ||||||
|  |             }, | ||||||
|  |             "thresholdsStyle": { | ||||||
|  |               "mode": "off" | ||||||
|  |             } | ||||||
|  |           }, | ||||||
|  |           "mappings": [], | ||||||
|  |           "thresholds": { | ||||||
|  |             "mode": "absolute", | ||||||
|  |             "steps": [ | ||||||
|  |               { | ||||||
|  |                 "color": "green", | ||||||
|  |                 "value": null | ||||||
|  |               }, | ||||||
|  |               { | ||||||
|  |                 "color": "red", | ||||||
|  |                 "value": 80 | ||||||
|  |               } | ||||||
|  |             ] | ||||||
|  |           } | ||||||
|  |         }, | ||||||
|  |         "overrides": [] | ||||||
|  |       }, | ||||||
|  |       "gridPos": { | ||||||
|  |         "h": 8, | ||||||
|  |         "w": 8, | ||||||
|  |         "x": 0, | ||||||
|  |         "y": 0 | ||||||
|  |       }, | ||||||
|  |       "id": 5, | ||||||
|  |       "options": { | ||||||
|  |         "legend": { | ||||||
|  |           "calcs": [], | ||||||
|  |           "displayMode": "list", | ||||||
|  |           "placement": "bottom", | ||||||
|  |           "showLegend": true | ||||||
|  |         }, | ||||||
|  |         "tooltip": { | ||||||
|  |           "mode": "single", | ||||||
|  |           "sort": "none" | ||||||
|  |         } | ||||||
|  |       }, | ||||||
|  |       "pluginVersion": "11.3.1", | ||||||
|  |       "targets": [ | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "prometheus", | ||||||
|  |             "uid": "${DS_PROMETHEUS}" | ||||||
|  |           }, | ||||||
|  |           "disableTextWrap": false, | ||||||
|  |           "editorMode": "builder", | ||||||
|  |           "expr": "surql_trips", | ||||||
|  |           "fullMetaSearch": false, | ||||||
|  |           "includeNullMetadata": true, | ||||||
|  |           "legendFormat": "Trips to Surreal", | ||||||
|  |           "range": true, | ||||||
|  |           "refId": "A", | ||||||
|  |           "useBackend": false | ||||||
|  |         }, | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "prometheus", | ||||||
|  |             "uid": "${DS_PROMETHEUS}" | ||||||
|  |           }, | ||||||
|  |           "disableTextWrap": false, | ||||||
|  |           "editorMode": "builder", | ||||||
|  |           "expr": "s3_trips", | ||||||
|  |           "fullMetaSearch": false, | ||||||
|  |           "hide": false, | ||||||
|  |           "includeNullMetadata": true, | ||||||
|  |           "instant": false, | ||||||
|  |           "legendFormat": "Trips to S3", | ||||||
|  |           "range": true, | ||||||
|  |           "refId": "B", | ||||||
|  |           "useBackend": false | ||||||
|  |         }, | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "prometheus", | ||||||
|  |             "uid": "${DS_PROMETHEUS}" | ||||||
|  |           }, | ||||||
|  |           "disableTextWrap": false, | ||||||
|  |           "editorMode": "builder", | ||||||
|  |           "expr": "pages_crawled", | ||||||
|  |           "fullMetaSearch": false, | ||||||
|  |           "hide": false, | ||||||
|  |           "includeNullMetadata": true, | ||||||
|  |           "instant": false, | ||||||
|  |           "legendFormat": "total crawled", | ||||||
|  |           "range": true, | ||||||
|  |           "refId": "C", | ||||||
|  |           "useBackend": false | ||||||
|  |         }, | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "prometheus", | ||||||
|  |             "uid": "${DS_PROMETHEUS}" | ||||||
|  |           }, | ||||||
|  |           "disableTextWrap": false, | ||||||
|  |           "editorMode": "builder", | ||||||
|  |           "expr": "pages_being_processed", | ||||||
|  |           "fullMetaSearch": false, | ||||||
|  |           "hide": false, | ||||||
|  |           "includeNullMetadata": true, | ||||||
|  |           "instant": false, | ||||||
|  |           "legendFormat": "Pages being processed", | ||||||
|  |           "range": true, | ||||||
|  |           "refId": "E", | ||||||
|  |           "useBackend": false | ||||||
|  |         }, | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "prometheus", | ||||||
|  |             "uid": "${DS_PROMETHEUS}" | ||||||
|  |           }, | ||||||
|  |           "disableTextWrap": false, | ||||||
|  |           "editorMode": "builder", | ||||||
|  |           "expr": "gets_in_flight", | ||||||
|  |           "fullMetaSearch": false, | ||||||
|  |           "hide": false, | ||||||
|  |           "includeNullMetadata": true, | ||||||
|  |           "instant": false, | ||||||
|  |           "legendFormat": "__auto", | ||||||
|  |           "range": true, | ||||||
|  |           "refId": "D", | ||||||
|  |           "useBackend": false | ||||||
|  |         } | ||||||
|  |       ], | ||||||
|  |       "title": "Crawler stats", | ||||||
|  |       "type": "timeseries" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "datasource": { | ||||||
|  |         "type": "prometheus", | ||||||
|  |         "uid": "${DS_PROMETHEUS}" | ||||||
|  |       }, | ||||||
|  |       "fieldConfig": { | ||||||
|  |         "defaults": { | ||||||
|  |           "color": { | ||||||
|  |             "mode": "palette-classic" | ||||||
|  |           }, | ||||||
|  |           "custom": { | ||||||
|  |             "axisBorderShow": false, | ||||||
|  |             "axisCenteredZero": false, | ||||||
|  |             "axisColorMode": "text", | ||||||
|  |             "axisLabel": "", | ||||||
|  |             "axisPlacement": "auto", | ||||||
|  |             "barAlignment": 0, | ||||||
|  |             "barWidthFactor": 0.6, | ||||||
|  |             "drawStyle": "line", | ||||||
|  |             "fillOpacity": 0, | ||||||
|  |             "gradientMode": "none", | ||||||
|  |             "hideFrom": { | ||||||
|  |               "legend": false, | ||||||
|  |               "tooltip": false, | ||||||
|  |               "viz": false | ||||||
|  |             }, | ||||||
|  |             "insertNulls": false, | ||||||
|  |             "lineInterpolation": "linear", | ||||||
|  |             "lineWidth": 1, | ||||||
|  |             "pointSize": 5, | ||||||
|  |             "scaleDistribution": { | ||||||
|  |               "type": "linear" | ||||||
|  |             }, | ||||||
|  |             "showPoints": "auto", | ||||||
|  |             "spanNulls": 300000, | ||||||
|  |             "stacking": { | ||||||
|  |               "group": "A", | ||||||
|  |               "mode": "none" | ||||||
|  |             }, | ||||||
|  |             "thresholdsStyle": { | ||||||
|  |               "mode": "off" | ||||||
|  |             } | ||||||
|  |           }, | ||||||
|  |           "mappings": [], | ||||||
|  |           "thresholds": { | ||||||
|  |             "mode": "absolute", | ||||||
|  |             "steps": [ | ||||||
|  |               { | ||||||
|  |                 "color": "green", | ||||||
|  |                 "value": null | ||||||
|  |               }, | ||||||
|  |               { | ||||||
|  |                 "color": "red", | ||||||
|  |                 "value": 80 | ||||||
|  |               } | ||||||
|  |             ] | ||||||
|  |           } | ||||||
|  |         }, | ||||||
|  |         "overrides": [] | ||||||
|  |       }, | ||||||
|  |       "gridPos": { | ||||||
|  |         "h": 8, | ||||||
|  |         "w": 9, | ||||||
|  |         "x": 8, | ||||||
|  |         "y": 0 | ||||||
|  |       }, | ||||||
|  |       "id": 6, | ||||||
|  |       "options": { | ||||||
|  |         "legend": { | ||||||
|  |           "calcs": [], | ||||||
|  |           "displayMode": "list", | ||||||
|  |           "placement": "bottom", | ||||||
|  |           "showLegend": true | ||||||
|  |         }, | ||||||
|  |         "tooltip": { | ||||||
|  |           "mode": "single", | ||||||
|  |           "sort": "none" | ||||||
|  |         } | ||||||
|  |       }, | ||||||
|  |       "pluginVersion": "11.3.1", | ||||||
|  |       "targets": [ | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "prometheus", | ||||||
|  |             "uid": "${DS_PROMETHEUS}" | ||||||
|  |           }, | ||||||
|  |           "disableTextWrap": false, | ||||||
|  |           "editorMode": "builder", | ||||||
|  |           "expr": "surql_trips", | ||||||
|  |           "fullMetaSearch": false, | ||||||
|  |           "includeNullMetadata": true, | ||||||
|  |           "legendFormat": "Trips to Surreal", | ||||||
|  |           "range": true, | ||||||
|  |           "refId": "A", | ||||||
|  |           "useBackend": false | ||||||
|  |         }, | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "prometheus", | ||||||
|  |             "uid": "${DS_PROMETHEUS}" | ||||||
|  |           }, | ||||||
|  |           "disableTextWrap": false, | ||||||
|  |           "editorMode": "builder", | ||||||
|  |           "expr": "surql_link_calls", | ||||||
|  |           "fullMetaSearch": false, | ||||||
|  |           "hide": false, | ||||||
|  |           "includeNullMetadata": true, | ||||||
|  |           "instant": false, | ||||||
|  |           "legendFormat": "link calls", | ||||||
|  |           "range": true, | ||||||
|  |           "refId": "B", | ||||||
|  |           "useBackend": false | ||||||
|  |         }, | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "prometheus", | ||||||
|  |             "uid": "${DS_PROMETHEUS}" | ||||||
|  |           }, | ||||||
|  |           "disableTextWrap": false, | ||||||
|  |           "editorMode": "builder", | ||||||
|  |           "expr": "surql_store_calls", | ||||||
|  |           "fullMetaSearch": false, | ||||||
|  |           "hide": false, | ||||||
|  |           "includeNullMetadata": true, | ||||||
|  |           "instant": false, | ||||||
|  |           "legendFormat": "store calls", | ||||||
|  |           "range": true, | ||||||
|  |           "refId": "C", | ||||||
|  |           "useBackend": false | ||||||
|  |         }, | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "prometheus", | ||||||
|  |             "uid": "${DS_PROMETHEUS}" | ||||||
|  |           }, | ||||||
|  |           "disableTextWrap": false, | ||||||
|  |           "editorMode": "builder", | ||||||
|  |           "expr": "pages_being_processed", | ||||||
|  |           "fullMetaSearch": false, | ||||||
|  |           "hide": false, | ||||||
|  |           "includeNullMetadata": true, | ||||||
|  |           "instant": false, | ||||||
|  |           "legendFormat": "Pages being processed", | ||||||
|  |           "range": true, | ||||||
|  |           "refId": "E", | ||||||
|  |           "useBackend": false | ||||||
|  |         } | ||||||
|  |       ], | ||||||
|  |       "title": "Surreal stats", | ||||||
|  |       "type": "timeseries" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "datasource": { | ||||||
|  |         "type": "prometheus", | ||||||
|  |         "uid": "${DS_PROMETHEUS}" | ||||||
|  |       }, | ||||||
|  |       "description": "This is across all threads, so this isn't wall clock time", | ||||||
|  |       "fieldConfig": { | ||||||
|  |         "defaults": { | ||||||
|  |           "color": { | ||||||
|  |             "mode": "thresholds" | ||||||
|  |           }, | ||||||
|  |           "mappings": [], | ||||||
|  |           "thresholds": { | ||||||
|  |             "mode": "absolute", | ||||||
|  |             "steps": [ | ||||||
|  |               { | ||||||
|  |                 "color": "green", | ||||||
|  |                 "value": null | ||||||
|  |               } | ||||||
|  |             ] | ||||||
|  |           }, | ||||||
|  |           "unit": "ms" | ||||||
|  |         }, | ||||||
|  |         "overrides": [] | ||||||
|  |       }, | ||||||
|  |       "gridPos": { | ||||||
|  |         "h": 8, | ||||||
|  |         "w": 7, | ||||||
|  |         "x": 17, | ||||||
|  |         "y": 0 | ||||||
|  |       }, | ||||||
|  |       "id": 7, | ||||||
|  |       "options": { | ||||||
|  |         "colorMode": "value", | ||||||
|  |         "graphMode": "area", | ||||||
|  |         "justifyMode": "auto", | ||||||
|  |         "orientation": "auto", | ||||||
|  |         "percentChangeColorMode": "standard", | ||||||
|  |         "reduceOptions": { | ||||||
|  |           "calcs": [ | ||||||
|  |             "lastNotNull" | ||||||
|  |           ], | ||||||
|  |           "fields": "", | ||||||
|  |           "values": false | ||||||
|  |         }, | ||||||
|  |         "showPercentChange": false, | ||||||
|  |         "textMode": "auto", | ||||||
|  |         "wideLayout": true | ||||||
|  |       }, | ||||||
|  |       "pluginVersion": "11.3.1", | ||||||
|  |       "targets": [ | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "prometheus", | ||||||
|  |             "uid": "${DS_PROMETHEUS}" | ||||||
|  |           }, | ||||||
|  |           "disableTextWrap": false, | ||||||
|  |           "editorMode": "builder", | ||||||
|  |           "expr": "surql_lock_waiting_ms", | ||||||
|  |           "fullMetaSearch": false, | ||||||
|  |           "includeNullMetadata": true, | ||||||
|  |           "legendFormat": "__auto", | ||||||
|  |           "range": true, | ||||||
|  |           "refId": "A", | ||||||
|  |           "useBackend": false | ||||||
|  |         } | ||||||
|  |       ], | ||||||
|  |       "title": "Time spend waiting on lock", | ||||||
|  |       "type": "stat" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "datasource": { | ||||||
|  |         "type": "loki", | ||||||
|  |         "uid": "${DS_LOKI}" | ||||||
|  |       }, | ||||||
|  |       "gridPos": { | ||||||
|  |         "h": 18, | ||||||
|  |         "w": 24, | ||||||
|  |         "x": 0, | ||||||
|  |         "y": 8 | ||||||
|  |       }, | ||||||
|  |       "id": 1, | ||||||
|  |       "options": { | ||||||
|  |         "dedupStrategy": "none", | ||||||
|  |         "enableLogDetails": true, | ||||||
|  |         "prettifyLogMessage": false, | ||||||
|  |         "showCommonLabels": false, | ||||||
|  |         "showLabels": false, | ||||||
|  |         "showTime": false, | ||||||
|  |         "sortOrder": "Descending", | ||||||
|  |         "wrapLogMessage": false | ||||||
|  |       }, | ||||||
|  |       "pluginVersion": "11.3.1", | ||||||
|  |       "targets": [ | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "loki", | ||||||
|  |             "uid": "${DS_LOKI}" | ||||||
|  |           }, | ||||||
|  |           "editorMode": "code", | ||||||
|  |           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `ERROR` | line_format \"{{.threadId}} {{.filename_extracted}}:{{.line_number}}  {{.fields_message}}\"", | ||||||
|  |           "queryType": "range", | ||||||
|  |           "refId": "A" | ||||||
|  |         } | ||||||
|  |       ], | ||||||
|  |       "title": "Errors", | ||||||
|  |       "type": "logs" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "datasource": { | ||||||
|  |         "type": "loki", | ||||||
|  |         "uid": "${DS_LOKI}" | ||||||
|  |       }, | ||||||
|  |       "gridPos": { | ||||||
|  |         "h": 8, | ||||||
|  |         "w": 12, | ||||||
|  |         "x": 0, | ||||||
|  |         "y": 26 | ||||||
|  |       }, | ||||||
|  |       "id": 2, | ||||||
|  |       "options": { | ||||||
|  |         "dedupStrategy": "none", | ||||||
|  |         "enableLogDetails": true, | ||||||
|  |         "prettifyLogMessage": false, | ||||||
|  |         "showCommonLabels": false, | ||||||
|  |         "showLabels": false, | ||||||
|  |         "showTime": false, | ||||||
|  |         "sortOrder": "Descending", | ||||||
|  |         "wrapLogMessage": false | ||||||
|  |       }, | ||||||
|  |       "pluginVersion": "11.3.1", | ||||||
|  |       "targets": [ | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "loki", | ||||||
|  |             "uid": "${DS_LOKI}" | ||||||
|  |           }, | ||||||
|  |           "editorMode": "code", | ||||||
|  |           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `DEBUG` | line_format \"{{.fields_message}}\"", | ||||||
|  |           "queryType": "range", | ||||||
|  |           "refId": "A" | ||||||
|  |         } | ||||||
|  |       ], | ||||||
|  |       "title": "Debug", | ||||||
|  |       "type": "logs" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "datasource": { | ||||||
|  |         "type": "loki", | ||||||
|  |         "uid": "${DS_LOKI}" | ||||||
|  |       }, | ||||||
|  |       "gridPos": { | ||||||
|  |         "h": 16, | ||||||
|  |         "w": 12, | ||||||
|  |         "x": 12, | ||||||
|  |         "y": 26 | ||||||
|  |       }, | ||||||
|  |       "id": 4, | ||||||
|  |       "options": { | ||||||
|  |         "dedupStrategy": "none", | ||||||
|  |         "enableLogDetails": true, | ||||||
|  |         "prettifyLogMessage": false, | ||||||
|  |         "showCommonLabels": false, | ||||||
|  |         "showLabels": false, | ||||||
|  |         "showTime": false, | ||||||
|  |         "sortOrder": "Descending", | ||||||
|  |         "wrapLogMessage": false | ||||||
|  |       }, | ||||||
|  |       "pluginVersion": "11.3.1", | ||||||
|  |       "targets": [ | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "loki", | ||||||
|  |             "uid": "${DS_LOKI}" | ||||||
|  |           }, | ||||||
|  |           "editorMode": "code", | ||||||
|  |           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `TRACE` | line_format \"{{.fields_message}}\"", | ||||||
|  |           "queryType": "range", | ||||||
|  |           "refId": "A" | ||||||
|  |         } | ||||||
|  |       ], | ||||||
|  |       "title": "Trace", | ||||||
|  |       "type": "logs" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "datasource": { | ||||||
|  |         "type": "loki", | ||||||
|  |         "uid": "${DS_LOKI}" | ||||||
|  |       }, | ||||||
|  |       "gridPos": { | ||||||
|  |         "h": 8, | ||||||
|  |         "w": 12, | ||||||
|  |         "x": 0, | ||||||
|  |         "y": 34 | ||||||
|  |       }, | ||||||
|  |       "id": 3, | ||||||
|  |       "options": { | ||||||
|  |         "dedupStrategy": "none", | ||||||
|  |         "enableLogDetails": true, | ||||||
|  |         "prettifyLogMessage": false, | ||||||
|  |         "showCommonLabels": false, | ||||||
|  |         "showLabels": false, | ||||||
|  |         "showTime": false, | ||||||
|  |         "sortOrder": "Descending", | ||||||
|  |         "wrapLogMessage": false | ||||||
|  |       }, | ||||||
|  |       "pluginVersion": "11.3.1", | ||||||
|  |       "targets": [ | ||||||
|  |         { | ||||||
|  |           "datasource": { | ||||||
|  |             "type": "loki", | ||||||
|  |             "uid": "${DS_LOKI}" | ||||||
|  |           }, | ||||||
|  |           "editorMode": "code", | ||||||
|  |           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `WARN` | line_format \"{{.fields_message}}\"", | ||||||
|  |           "queryType": "range", | ||||||
|  |           "refId": "A" | ||||||
|  |         } | ||||||
|  |       ], | ||||||
|  |       "title": "Warnings", | ||||||
|  |       "type": "logs" | ||||||
|  |     } | ||||||
|  |   ], | ||||||
|  |   "schemaVersion": 40, | ||||||
|  |   "tags": [], | ||||||
|  |   "templating": { | ||||||
|  |     "list": [ | ||||||
|  |       { | ||||||
|  |         "datasource": { | ||||||
|  |           "type": "loki", | ||||||
|  |           "uid": "P8E80F9AEF21F6940" | ||||||
|  |         }, | ||||||
|  |         "filters": [], | ||||||
|  |         "name": "Filters", | ||||||
|  |         "type": "adhoc" | ||||||
|  |       } | ||||||
|  |     ] | ||||||
|  |   }, | ||||||
|  |   "time": { | ||||||
|  |     "from": "now-5m", | ||||||
|  |     "to": "now" | ||||||
|  |   }, | ||||||
|  |   "timepicker": {}, | ||||||
|  |   "timezone": "browser", | ||||||
|  |   "title": "Crawler", | ||||||
|  |   "uid": "ceg90x34pqgowd", | ||||||
|  |   "version": 21, | ||||||
|  |   "weekStart": "" | ||||||
|  | } | ||||||
| @@ -22,3 +22,20 @@ datasources: | |||||||
|   editable: false |   editable: false | ||||||
|   jsonData: |   jsonData: | ||||||
|     httpMethod: GET |     httpMethod: GET | ||||||
|  | - name: Tempo | ||||||
|  |   type: tempo | ||||||
|  |   access: proxy | ||||||
|  |   orgId: 1 | ||||||
|  |   url: http://tempo:3200 | ||||||
|  |   basicAuth: false | ||||||
|  |   isDefault: false | ||||||
|  |   version: 1 | ||||||
|  |   editable: true | ||||||
|  |   apiVersion: 1 | ||||||
|  |   uid: tempo | ||||||
|  |   jsonData: | ||||||
|  |     httpMethod: GET | ||||||
|  |     serviceMap: | ||||||
|  |       datasourceUid: prometheus | ||||||
|  |     streamingEnabled: | ||||||
|  |       search: true | ||||||
|   | |||||||
| @@ -1,223 +0,0 @@ | |||||||
| { |  | ||||||
|   "__inputs": [ |  | ||||||
|     { |  | ||||||
|       "name": "DS_LOKI", |  | ||||||
|       "label": "Loki", |  | ||||||
|       "description": "", |  | ||||||
|       "type": "datasource", |  | ||||||
|       "pluginId": "loki", |  | ||||||
|       "pluginName": "Loki" |  | ||||||
|     } |  | ||||||
|   ], |  | ||||||
|   "__elements": {}, |  | ||||||
|   "__requires": [ |  | ||||||
|     { |  | ||||||
|       "type": "grafana", |  | ||||||
|       "id": "grafana", |  | ||||||
|       "name": "Grafana", |  | ||||||
|       "version": "11.3.1" |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|       "type": "panel", |  | ||||||
|       "id": "logs", |  | ||||||
|       "name": "Logs", |  | ||||||
|       "version": "" |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|       "type": "datasource", |  | ||||||
|       "id": "loki", |  | ||||||
|       "name": "Loki", |  | ||||||
|       "version": "1.0.0" |  | ||||||
|     } |  | ||||||
|   ], |  | ||||||
|   "annotations": { |  | ||||||
|     "list": [ |  | ||||||
|       { |  | ||||||
|         "builtIn": 1, |  | ||||||
|         "datasource": { |  | ||||||
|           "type": "grafana", |  | ||||||
|           "uid": "-- Grafana --" |  | ||||||
|         }, |  | ||||||
|         "enable": true, |  | ||||||
|         "hide": true, |  | ||||||
|         "iconColor": "rgba(0, 211, 255, 1)", |  | ||||||
|         "name": "Annotations & Alerts", |  | ||||||
|         "type": "dashboard" |  | ||||||
|       } |  | ||||||
|     ] |  | ||||||
|   }, |  | ||||||
|   "editable": true, |  | ||||||
|   "fiscalYearStartMonth": 0, |  | ||||||
|   "graphTooltip": 0, |  | ||||||
|   "id": null, |  | ||||||
|   "links": [], |  | ||||||
|   "panels": [ |  | ||||||
|     { |  | ||||||
|       "datasource": { |  | ||||||
|         "type": "loki", |  | ||||||
|         "uid": "${DS_LOKI}" |  | ||||||
|       }, |  | ||||||
|       "gridPos": { |  | ||||||
|         "h": 8, |  | ||||||
|         "w": 12, |  | ||||||
|         "x": 0, |  | ||||||
|         "y": 0 |  | ||||||
|       }, |  | ||||||
|       "id": 1, |  | ||||||
|       "options": { |  | ||||||
|         "dedupStrategy": "none", |  | ||||||
|         "enableLogDetails": true, |  | ||||||
|         "prettifyLogMessage": false, |  | ||||||
|         "showCommonLabels": false, |  | ||||||
|         "showLabels": false, |  | ||||||
|         "showTime": false, |  | ||||||
|         "sortOrder": "Descending", |  | ||||||
|         "wrapLogMessage": false |  | ||||||
|       }, |  | ||||||
|       "pluginVersion": "11.3.1", |  | ||||||
|       "targets": [ |  | ||||||
|         { |  | ||||||
|           "datasource": { |  | ||||||
|             "type": "loki", |  | ||||||
|             "uid": "${DS_LOKI}" |  | ||||||
|           }, |  | ||||||
|           "editorMode": "code", |  | ||||||
|           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `ERROR` | line_format \"{{.fields_message}}\"", |  | ||||||
|           "queryType": "range", |  | ||||||
|           "refId": "A" |  | ||||||
|         } |  | ||||||
|       ], |  | ||||||
|       "title": "Errors", |  | ||||||
|       "type": "logs" |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|       "datasource": { |  | ||||||
|         "type": "loki", |  | ||||||
|         "uid": "${DS_LOKI}" |  | ||||||
|       }, |  | ||||||
|       "gridPos": { |  | ||||||
|         "h": 8, |  | ||||||
|         "w": 12, |  | ||||||
|         "x": 12, |  | ||||||
|         "y": 0 |  | ||||||
|       }, |  | ||||||
|       "id": 3, |  | ||||||
|       "options": { |  | ||||||
|         "dedupStrategy": "none", |  | ||||||
|         "enableLogDetails": true, |  | ||||||
|         "prettifyLogMessage": false, |  | ||||||
|         "showCommonLabels": false, |  | ||||||
|         "showLabels": false, |  | ||||||
|         "showTime": false, |  | ||||||
|         "sortOrder": "Descending", |  | ||||||
|         "wrapLogMessage": false |  | ||||||
|       }, |  | ||||||
|       "pluginVersion": "11.3.1", |  | ||||||
|       "targets": [ |  | ||||||
|         { |  | ||||||
|           "datasource": { |  | ||||||
|             "type": "loki", |  | ||||||
|             "uid": "${DS_LOKI}" |  | ||||||
|           }, |  | ||||||
|           "editorMode": "code", |  | ||||||
|           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `WARN` | line_format \"{{.fields_message}}\"", |  | ||||||
|           "queryType": "range", |  | ||||||
|           "refId": "A" |  | ||||||
|         } |  | ||||||
|       ], |  | ||||||
|       "title": "Warnings", |  | ||||||
|       "type": "logs" |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|       "datasource": { |  | ||||||
|         "type": "loki", |  | ||||||
|         "uid": "${DS_LOKI}" |  | ||||||
|       }, |  | ||||||
|       "gridPos": { |  | ||||||
|         "h": 8, |  | ||||||
|         "w": 12, |  | ||||||
|         "x": 0, |  | ||||||
|         "y": 8 |  | ||||||
|       }, |  | ||||||
|       "id": 2, |  | ||||||
|       "options": { |  | ||||||
|         "dedupStrategy": "none", |  | ||||||
|         "enableLogDetails": true, |  | ||||||
|         "prettifyLogMessage": false, |  | ||||||
|         "showCommonLabels": false, |  | ||||||
|         "showLabels": false, |  | ||||||
|         "showTime": false, |  | ||||||
|         "sortOrder": "Descending", |  | ||||||
|         "wrapLogMessage": false |  | ||||||
|       }, |  | ||||||
|       "pluginVersion": "11.3.1", |  | ||||||
|       "targets": [ |  | ||||||
|         { |  | ||||||
|           "datasource": { |  | ||||||
|             "type": "loki", |  | ||||||
|             "uid": "${DS_LOKI}" |  | ||||||
|           }, |  | ||||||
|           "editorMode": "code", |  | ||||||
|           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `DEBUG` | line_format \"{{.fields_message}}\"", |  | ||||||
|           "queryType": "range", |  | ||||||
|           "refId": "A" |  | ||||||
|         } |  | ||||||
|       ], |  | ||||||
|       "title": "Debug", |  | ||||||
|       "type": "logs" |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|       "datasource": { |  | ||||||
|         "type": "loki", |  | ||||||
|         "uid": "${DS_LOKI}" |  | ||||||
|       }, |  | ||||||
|       "gridPos": { |  | ||||||
|         "h": 8, |  | ||||||
|         "w": 12, |  | ||||||
|         "x": 12, |  | ||||||
|         "y": 8 |  | ||||||
|       }, |  | ||||||
|       "id": 4, |  | ||||||
|       "options": { |  | ||||||
|         "dedupStrategy": "none", |  | ||||||
|         "enableLogDetails": true, |  | ||||||
|         "prettifyLogMessage": false, |  | ||||||
|         "showCommonLabels": false, |  | ||||||
|         "showLabels": false, |  | ||||||
|         "showTime": false, |  | ||||||
|         "sortOrder": "Descending", |  | ||||||
|         "wrapLogMessage": false |  | ||||||
|       }, |  | ||||||
|       "pluginVersion": "11.3.1", |  | ||||||
|       "targets": [ |  | ||||||
|         { |  | ||||||
|           "datasource": { |  | ||||||
|             "type": "loki", |  | ||||||
|             "uid": "${DS_LOKI}" |  | ||||||
|           }, |  | ||||||
|           "editorMode": "code", |  | ||||||
|           "expr": "{filename=\"/tmp/alloy-logs/tracing.log\"} | json | level = `TRACE` | line_format \"{{.fields_message}}\"", |  | ||||||
|           "queryType": "range", |  | ||||||
|           "refId": "A" |  | ||||||
|         } |  | ||||||
|       ], |  | ||||||
|       "title": "Trace", |  | ||||||
|       "type": "logs" |  | ||||||
|     } |  | ||||||
|   ], |  | ||||||
|   "schemaVersion": 40, |  | ||||||
|   "tags": [], |  | ||||||
|   "templating": { |  | ||||||
|     "list": [] |  | ||||||
|   }, |  | ||||||
|   "time": { |  | ||||||
|     "from": "now-6h", |  | ||||||
|     "to": "now" |  | ||||||
|   }, |  | ||||||
|   "timepicker": {}, |  | ||||||
|   "timezone": "browser", |  | ||||||
|   "title": "New dashboard", |  | ||||||
|   "uid": "ceg90x34pqgowd", |  | ||||||
|   "version": 4, |  | ||||||
|   "weekStart": "" |  | ||||||
| } |  | ||||||
| @@ -1,16 +1,15 @@ | |||||||
| global: | global: | ||||||
|   scrape_interval: 5s  |   scrape_interval: 60s  | ||||||
|   query_log_file: /etc/prometheus/query.log |   query_log_file: /etc/prometheus/query.log | ||||||
|  |  | ||||||
| scrape_configs: | scrape_configs: | ||||||
|   - job_name: crawler |   # Crawler configs get pushed with OTLP | ||||||
|  | #  - job_name: 'loki' | ||||||
|  | #    static_configs: | ||||||
|  | #      - targets: ['loki:3100'] | ||||||
|  | #  - job_name: 'prometheus' | ||||||
|  | #    static_configs: | ||||||
|  | #      - targets: ['localhost:9090'] | ||||||
|  |   - job_name: 'tempo' | ||||||
|     static_configs: |     static_configs: | ||||||
|     # change this your machine's ip, localhost won't work |       - targets: ['tempo:3200'] | ||||||
|     # because localhost refers to the docker container. |  | ||||||
|       - targets: ['192.168.8.209:2500'] |  | ||||||
|   - job_name: loki |  | ||||||
|     static_configs: |  | ||||||
|       - targets: ['loki:3100'] |  | ||||||
|   - job_name: prometheus |  | ||||||
|     static_configs: |  | ||||||
|       - targets: ['localhost:9090'] |  | ||||||
|   | |||||||
							
								
								
									
										48
									
								
								docker/tempo.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								docker/tempo.yaml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,48 @@ | |||||||
|  | stream_over_http_enabled: true | ||||||
|  | server: | ||||||
|  |   http_listen_port: 3200 | ||||||
|  |   log_level: info | ||||||
|  |  | ||||||
|  | query_frontend: | ||||||
|  |   search: | ||||||
|  |     duration_slo: 5s | ||||||
|  |     throughput_bytes_slo: 1.073741824e+09 | ||||||
|  |     metadata_slo: | ||||||
|  |       duration_slo: 5s | ||||||
|  |       throughput_bytes_slo: 1.073741824e+09 | ||||||
|  |   trace_by_id: | ||||||
|  |     duration_slo: 5s | ||||||
|  |  | ||||||
|  | distributor: | ||||||
|  |   receivers: | ||||||
|  |     otlp: | ||||||
|  |       protocols: | ||||||
|  |         grpc: | ||||||
|  |           endpoint: "tempo:4317" | ||||||
|  |  | ||||||
|  | metrics_generator: | ||||||
|  |   registry: | ||||||
|  |     external_labels: | ||||||
|  |       source: tempo | ||||||
|  |       cluster: docker-compose | ||||||
|  |   storage: | ||||||
|  |     path: /var/tempo/generator/wal | ||||||
|  |     remote_write: | ||||||
|  |       - url: http://prometheus:9090/api/v1/write | ||||||
|  |         send_exemplars: true | ||||||
|  |   traces_storage: | ||||||
|  |     path: /var/tempo/generator/traces | ||||||
|  |  | ||||||
|  | storage: | ||||||
|  |   trace: | ||||||
|  |     backend: local # backend configuration to use | ||||||
|  |     wal: | ||||||
|  |       path: /var/tempo/wal # where to store the wal locally | ||||||
|  |     local: | ||||||
|  |       path: /var/tempo/blocks | ||||||
|  |  | ||||||
|  | overrides: | ||||||
|  |   defaults: | ||||||
|  |     metrics_generator: | ||||||
|  |       processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator | ||||||
|  |       generate_native_histograms: both | ||||||
							
								
								
									
										
											BIN
										
									
								
								pngs/graphana.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								pngs/graphana.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 264 KiB | 
							
								
								
									
										211
									
								
								src/db.rs
									
									
									
									
									
								
							
							
						
						
									
										211
									
								
								src/db.rs
									
									
									
									
									
								
							| @@ -1,33 +1,39 @@ | |||||||
| use std::fmt::Debug; |  | ||||||
| use metrics::counter; | use metrics::counter; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
|  | use std::{fmt::Debug, time::Duration}; | ||||||
| use surrealdb::{ | use surrealdb::{ | ||||||
|     engine::remote::ws::{Client, Ws}, error::Db, opt::auth::Root, sql::Thing, Response, Surreal |     engine::remote::ws::{Client, Ws}, | ||||||
|  |     opt::auth::Root, | ||||||
|  |     sql::Thing, | ||||||
|  |     Surreal, | ||||||
| }; | }; | ||||||
| use tracing::{error, instrument, trace, warn}; | use tokio::time::sleep; | ||||||
|  | use tracing::{error, instrument, trace}; | ||||||
| use url::Url; | use url::Url; | ||||||
|  |  | ||||||
| use crate::{Config, Timer}; | use crate::Config; | ||||||
|  |  | ||||||
| const ROUND_TRIP_METRIC: &'static str = "surql_trips"; | const STORE: &str = "surql_store_calls"; | ||||||
| const STORE: &'static str = "surql_store_calls"; |  | ||||||
| const LINK: &'static str = "surql_link_calls"; |  | ||||||
|  |  | ||||||
| #[derive(Serialize, Deserialize, Clone)] | #[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Hash)] | ||||||
| pub struct Website { | pub struct Website { | ||||||
|  |     pub id: Option<Thing>, | ||||||
|     /// The url that this data is found at |     /// The url that this data is found at | ||||||
|     pub site: Url, |     pub site: Url, | ||||||
|     /// Wether or not this link has been crawled yet |     /// Wether or not this link has been crawled yet | ||||||
|     pub crawled: bool, |     pub crawled: bool, | ||||||
|     #[serde(skip_serializing)] |     /// 200, 404, etc | ||||||
|     id: Option<Thing>, |     pub status_code: u16, | ||||||
| } | } | ||||||
|  |  | ||||||
| // manual impl to make tracing look nicer | // manual impl to make tracing look nicer | ||||||
| impl Debug for Website { | impl Debug for Website { | ||||||
|     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||||||
|         let site = (self.site.domain().unwrap_or("n/a")).to_string() + self.site.path(); |         f.debug_struct("Website") | ||||||
|         f.debug_struct("Website").field("site", &site).finish() |             .field("host", &self.site.host()) | ||||||
|  |             .field("path", &self.site.path()) | ||||||
|  |             .field("status_code", &self.status_code) | ||||||
|  |             .finish() | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -39,145 +45,87 @@ impl Website { | |||||||
|             Err(_) => todo!(), |             Err(_) => todo!(), | ||||||
|         }; |         }; | ||||||
|         Self { |         Self { | ||||||
|             id: None, |  | ||||||
|             crawled, |             crawled, | ||||||
|             site, |             site, | ||||||
|  |             status_code: 0, | ||||||
|  |             id: None, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn set_crawled(&mut self) { |     // Insert ever item in the vec into surreal, crawled state will be preserved as TRUE | ||||||
|         trace!("Set crawled to true"); |     // if already in the database as such or incoming data is TRUE. | ||||||
|         self.crawled = true |     #[instrument(skip(db))] | ||||||
|     } |     pub async fn store_all(all: Vec<Self>, db: &Surreal<Client>) -> Vec<Thing> { | ||||||
|  |         counter!(STORE).increment(1); | ||||||
|  |         let mut things = Vec::with_capacity(all.len()); | ||||||
|  |  | ||||||
|     #[instrument(skip_all)] |  | ||||||
|     pub async fn links_to(&self, other: Vec<Thing>, db: &Surreal<Client>) { |  | ||||||
|  |  | ||||||
|         let len = other.len(); |  | ||||||
|         if len == 0 {return} |  | ||||||
|  |  | ||||||
|         let from = self.site.to_string(); |  | ||||||
|         // let to = other.site.to_string(); |  | ||||||
|         trace!("Linking {from} to {} other pages.", other.len()); |  | ||||||
|         let msg = format!("Linked {len} pages"); |  | ||||||
|         let timer = Timer::start(&msg); |  | ||||||
|         // prevent the timer from being dropped instantly. |  | ||||||
|         let _ = timer; |  | ||||||
|         counter!(ROUND_TRIP_METRIC).increment(1); |  | ||||||
|         counter!(LINK).increment(1); |  | ||||||
|         match db |         match db | ||||||
|             .query("COUNT(RELATE (SELECT id FROM website WHERE site = $in) -> links_to -> $out)") |             .query( | ||||||
|             .bind(("in", from)) |                 "INSERT INTO website $array | ||||||
|             .bind(("out", other)) |                     ON DUPLICATE KEY UPDATE | ||||||
|  |                         accessed_at = time::now(), | ||||||
|  |                         status_code = $input.status_code, | ||||||
|  |                         processing = false, | ||||||
|  |                         crawled = crawled OR $input.crawled | ||||||
|  |                     RETURN VALUE id; | ||||||
|  |                  ", | ||||||
|  |             ) | ||||||
|  |             .bind(("array", all)) | ||||||
|             .await |             .await | ||||||
|         { |         { | ||||||
|             Ok(mut e) => { |             Ok(mut id) => match id.take::<Vec<Thing>>(0) { | ||||||
|                 // The relate could technically "fail" (not relate anything), this just means that |                 Ok(mut x) => things.append(&mut x), | ||||||
|                 // the query was ok. |                 Err(err) => error!("{:?}", err), | ||||||
|                 let _: Response = e; |  | ||||||
|                 if let Ok(vec) = e.take(0) { |  | ||||||
|                     let _: Vec<usize> = vec; |  | ||||||
|                     if let Some(num) = vec.get(0) { |  | ||||||
|                         if *num == len { |  | ||||||
|                             trace!("Link OK"); |  | ||||||
|                             return; |  | ||||||
|                         } else { |  | ||||||
|                             warn!("Didn't link all the records. {num}/{len}"); |  | ||||||
|                             return; |  | ||||||
|                         } |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 warn!("Linking request succeeded but couldn't verify the results."); |  | ||||||
|             }, |             }, | ||||||
|             Err(e) => { |             Err(err) => { | ||||||
|                 error!("{}", e.to_string()); |                 error!("{:?}", err); | ||||||
|             }, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[instrument(name = "surql_store", skip_all)] |  | ||||||
|     pub async fn store(&self, db: &Surreal<Client>) -> Option<Thing> { |  | ||||||
|         counter!(STORE).increment(1); |  | ||||||
|         let counter = counter!(ROUND_TRIP_METRIC); |  | ||||||
|         let t = Timer::start("Stored link"); |  | ||||||
|         let _ = t; |  | ||||||
|         counter.increment(1); |  | ||||||
|  |  | ||||||
|         // check if it's been gone thru before |  | ||||||
|         let mut response = db |  | ||||||
|             .query("SELECT * FROM ONLY website WHERE site = $site LIMIT 1") |  | ||||||
|             .bind(("site", self.site.to_string())) |  | ||||||
|             .await |  | ||||||
|             .expect("Failed to check surreal for duplicates!"); |  | ||||||
|  |  | ||||||
|         if let Some(old) = response.take::<Option<Website>>(0).expect("Failed to read response from surreal for duplicates.") { |  | ||||||
|             // site exists already |  | ||||||
|             if let Some(id) = old.id { |  | ||||||
|                 // make sure to preserve the "crawled status" |  | ||||||
|                 let mut new = self.clone(); |  | ||||||
|                 new.crawled = old.crawled | new.crawled; |  | ||||||
|  |  | ||||||
|                 counter.increment(1); |  | ||||||
|                 // update the record |  | ||||||
|                 match db.upsert((id.tb, id.id.to_string())).content(new).await { |  | ||||||
|                     Ok(e) => { |  | ||||||
|                         if let Some(a) = e { |  | ||||||
|                             let _: Record = a; |  | ||||||
|                             return Some(a.id); |  | ||||||
|                         } |  | ||||||
|                     } |  | ||||||
|                     Err(e) => { |  | ||||||
|                         match e { |  | ||||||
|                             surrealdb::Error::Db(error) => { |  | ||||||
|                                 match error { |  | ||||||
|                                     Db::QueryCancelled => todo!(), |  | ||||||
|                                     Db::QueryNotExecuted => todo!(), |  | ||||||
|                                     Db::QueryNotExecutedDetail { message: _ } => todo!(), |  | ||||||
|                                    _=>{}, |  | ||||||
|                                 } |  | ||||||
|                             }, |  | ||||||
|                             _=>{}, |  | ||||||
|                         } |  | ||||||
|                         // error!("{}", e); |  | ||||||
|                     } |  | ||||||
|                 }; |  | ||||||
|             } |             } | ||||||
|         } else { |  | ||||||
|             counter.increment(1); |  | ||||||
|             // sites hasn't existed yet |  | ||||||
|             match db.create("website").content(self.clone()).await { |  | ||||||
|                 Ok(e) => { |  | ||||||
|                     let _: Option<Record> = e; |  | ||||||
|                     if let Some(a) = e { |  | ||||||
|                         let _: Record = a; |  | ||||||
|                         return Some(a.id); |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 Err(a) => error!("{:?}", a), |  | ||||||
|             }; |  | ||||||
|         } |         } | ||||||
|         None |         things | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl ToString for Website { | /// Returns uncrawled links | ||||||
|     fn to_string(&self) -> String { | #[instrument(skip(db, config))] | ||||||
|         self.site.to_string() | pub async fn get_next(db: &Surreal<Client>, config: &Config) -> Option<Website> { | ||||||
|  |     let mut res: Option<Website> = None; | ||||||
|  |     let mut fails = 0; | ||||||
|  |  | ||||||
|  |     while res == None { | ||||||
|  |         let mut response = db | ||||||
|  |             .query("fn::get_next($format)") | ||||||
|  |             .bind(("format", config.crawl_filter.to_string())) | ||||||
|  |             .await | ||||||
|  |             .expect("Hard-coded query failed..?"); | ||||||
|  |  | ||||||
|  |         res = match response.take(0) { | ||||||
|  |             Ok(ok) => ok, | ||||||
|  |             Err(_err) => { | ||||||
|  |                 // basically just CSMA/CA | ||||||
|  |                 let delay = rand::random_range(10..10_000); | ||||||
|  |                 sleep(Duration::from_millis(delay)).await; | ||||||
|  |                 fails += 1; | ||||||
|  |                 // Don't get stuck here forever, failing... | ||||||
|  |                 // (most I've seen is 1) | ||||||
|  |                 if fails > 5 { | ||||||
|  |                     error!("Max attempts to get_next() reached... ({fails})"); | ||||||
|  |                     return None | ||||||
|  |                 } | ||||||
|  |                 None | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     res | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, Serialize)] | #[derive(Debug, Serialize)] | ||||||
|  | #[allow(dead_code)] | ||||||
| pub struct Email { | pub struct Email { | ||||||
|     pub email: String, |     pub email: String, | ||||||
|     pub on: String, |     pub on: String, | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, Deserialize)] |  | ||||||
| pub struct Record { |  | ||||||
|     #[allow(dead_code)] |  | ||||||
|     pub id: Thing, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[instrument(skip_all, name = "SurrealDB")] | #[instrument(skip_all, name = "SurrealDB")] | ||||||
| pub async fn connect(config: &Config) -> surrealdb::Result<Surreal<Client>> { | pub async fn connect(config: &Config) -> surrealdb::Result<Surreal<Client>> { | ||||||
|     trace!("Establishing connection to surreal..."); |     trace!("Establishing connection to surreal..."); | ||||||
| @@ -193,15 +141,16 @@ pub async fn connect(config: &Config) -> surrealdb::Result<Surreal<Client>> { | |||||||
|     .await?; |     .await?; | ||||||
|  |  | ||||||
|     // Select a specific namespace / database |     // Select a specific namespace / database | ||||||
|     db |     db.use_ns(&config.surreal_ns) | ||||||
|         .use_ns(&config.surreal_ns) |  | ||||||
|         .use_db(&config.surreal_db) |         .use_db(&config.surreal_db) | ||||||
|         .await?; |         .await?; | ||||||
|  |  | ||||||
|     let setup = include_bytes!("setup.surql"); |     let setup = include_bytes!("setup.surql"); | ||||||
|     let file = setup.iter().map(|c| *c as char).collect::<String>(); |     let init_commands = setup.iter().map(|c| *c as char).collect::<String>(); | ||||||
|  |  | ||||||
|     db.query(file).await.expect("Failed to setup surreal tables."); |     db.query(init_commands) | ||||||
|  |         .await | ||||||
|  |         .expect("Failed to setup surreal tables."); | ||||||
|  |  | ||||||
|     Ok(db) |     Ok(db) | ||||||
| } | } | ||||||
|   | |||||||
							
								
								
									
										105
									
								
								src/filesystem.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										105
									
								
								src/filesystem.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,105 @@ | |||||||
|  | use std::{io::ErrorKind, path::PathBuf}; | ||||||
|  |  | ||||||
|  | use reqwest::header::HeaderValue; | ||||||
|  | use tokio::fs; | ||||||
|  | use tracing::{error, trace, warn}; | ||||||
|  | use url::Url; | ||||||
|  |  | ||||||
|  | pub fn as_path(url: &Url, content_type: &HeaderValue) -> PathBuf { | ||||||
|  |     // extract data from url to save it accurately | ||||||
|  |     let mut url_path = PathBuf::from("./downloaded/".to_string() + url.domain().unwrap_or("UnknownDomain") + url.path()); | ||||||
|  |  | ||||||
|  |     if let Ok(header) = content_type.to_str() { | ||||||
|  |         // text/html; charset=UTF-8; option=value | ||||||
|  |         let ttype = if let Some((t, _)) = header.split_once(';') { | ||||||
|  |             t | ||||||
|  |         } else { | ||||||
|  |             header | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         if let Some((ttype, subtype)) = ttype.split_once('/') { | ||||||
|  |             trace!(url = url.to_string(), main_type = ttype, sub_type = subtype, "Found Content-Type to be: {ttype}/{subtype}"); | ||||||
|  |             // If the Content-Type header is "*/html" (most likely "text/html") and the path's | ||||||
|  |             // extension is anything but html: | ||||||
|  |             if subtype=="html" && !url_path.extension().is_some_and(|f| f=="html" || f=="htm" ) { | ||||||
|  |                 // time to slap a index.html to the end of that path there! | ||||||
|  |                 url_path = url_path.join("index.html"); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } else { | ||||||
|  |         warn!("Header: {:?} couldn't be parsed into a string!", content_type); | ||||||
|  |     } | ||||||
|  |     trace!(url = url.to_string(), path = &*url_path.to_string_lossy(), "Converted URL into path"); | ||||||
|  |  | ||||||
|  |     url_path | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub async fn check_file_length(file: &PathBuf) -> Option<u64> { | ||||||
|  |     match tokio::fs::OpenOptions::new() | ||||||
|  |         .write(false) | ||||||
|  |         .read(true) | ||||||
|  |         .create(false) | ||||||
|  |         .open(file).await | ||||||
|  |     { | ||||||
|  |         Ok(file) => { | ||||||
|  |             match file.metadata().await { | ||||||
|  |                 Ok(meta) => { | ||||||
|  |                     return Some(meta.len()) | ||||||
|  |                 }, | ||||||
|  |                 Err(err) => { | ||||||
|  |                     error!("Failed to get metadata. {}", err) | ||||||
|  |                 }, | ||||||
|  |             } | ||||||
|  |         }, | ||||||
|  |         Err(err) => { | ||||||
|  |             match err.kind() { | ||||||
|  |                 ErrorKind::NotFound => {/* ignore */}, | ||||||
|  |                 _ => warn!("Failed to open file to check length. {:?} {}", file, err), | ||||||
|  |             } | ||||||
|  |         }, | ||||||
|  |     } | ||||||
|  |     None | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub async fn init(filename: &PathBuf) -> Option<fs::File> { | ||||||
|  |     let file = async || tokio::fs::OpenOptions::new() | ||||||
|  |         .write(true) | ||||||
|  |         .append(false) | ||||||
|  |         .create(true) | ||||||
|  |         .open(&filename).await; | ||||||
|  |  | ||||||
|  |     match file().await { | ||||||
|  |         Ok(ok) => { | ||||||
|  |             trace!("Initialized file {}", filename.to_str().unwrap_or("N/A")); | ||||||
|  |             Some(ok) | ||||||
|  |         }, | ||||||
|  |         Err(err) => { | ||||||
|  |             // the file/folder isn't found | ||||||
|  |             if err.kind() == ErrorKind::NotFound { | ||||||
|  |                 if let Some(parent ) = &filename.parent() { | ||||||
|  |                     // create the folders | ||||||
|  |                     if let Err(err) = fs::create_dir_all(&parent).await { | ||||||
|  |                         error!("Dir creation: {err} {:?}", filename); | ||||||
|  |                     } else if let Ok(ok) = file().await { | ||||||
|  |                         return Some(ok); | ||||||
|  |                     } | ||||||
|  |                 } else { | ||||||
|  |                     error!("Couldn't get file's parents: {:?}", &filename); | ||||||
|  |                 } | ||||||
|  |             } else if err.kind() == ErrorKind::NotADirectory { | ||||||
|  |                 // Example: | ||||||
|  |                 // 1. example.com/user | ||||||
|  |                 // 2. example.com/user/post | ||||||
|  |                 // If file 1 exists it will prevent file 2 from existing | ||||||
|  |                 // FIXME | ||||||
|  |  | ||||||
|  |                 error!("One of the parent directories is actually a file...") | ||||||
|  |             } else { | ||||||
|  |                 error!("File open error: {err} {:?}", filename); | ||||||
|  |             } | ||||||
|  |             // we don't care about other errors, we can't/shouldn't fix them | ||||||
|  |             None | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										526
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										526
									
								
								src/main.rs
									
									
									
									
									
								
							| @@ -1,98 +1,116 @@ | |||||||
| #![feature(ip_from)] | #![feature(ip_from)] | ||||||
|  | #![feature(path_add_extension)] | ||||||
|  | #![deny(clippy::unwrap_used)] | ||||||
|  |  | ||||||
| extern crate html5ever; | extern crate html5ever; | ||||||
|  |  | ||||||
| use std::{fs::File, io::Read, net::{IpAddr, Ipv4Addr}, time::Instant}; | use std::{ | ||||||
|  |     collections::HashSet, | ||||||
|  |     fs::File, | ||||||
|  |     io::Read, | ||||||
|  |     sync::{Arc, LazyLock}, | ||||||
|  | }; | ||||||
|  |  | ||||||
| use db::{connect, Website}; | use db::{connect, Website}; | ||||||
| use metrics::{counter, gauge}; | use futures_util::StreamExt; | ||||||
| use metrics_exporter_prometheus::PrometheusBuilder; | use opentelemetry::{ | ||||||
| use s3::S3; |     global::{self}, | ||||||
|  |     metrics::{Counter, Meter, UpDownCounter}, | ||||||
|  | }; | ||||||
|  | use opentelemetry_otlp::{Protocol, WithExportConfig}; | ||||||
|  | use opentelemetry_sdk::{metrics::SdkMeterProvider, trace::SdkTracerProvider}; | ||||||
| use serde::Deserialize; | use serde::Deserialize; | ||||||
| use surrealdb::{engine::remote::ws::Client, Surreal}; | use surrealdb::{engine::remote::ws::Client, Surreal}; | ||||||
| use tokio::task::JoinSet; | use tokio::{ | ||||||
| use tracing::{debug, info, instrument, trace, trace_span, warn}; |     io::{AsyncReadExt, AsyncWriteExt, BufWriter}, | ||||||
|  |     sync::RwLock, | ||||||
|  |     task::JoinSet, | ||||||
|  | }; | ||||||
|  | use tracing::{debug, error, info, instrument, level_filters::LevelFilter, trace, warn}; | ||||||
| use tracing_subscriber::{fmt, layer::SubscriberExt, EnvFilter, Layer, Registry}; | use tracing_subscriber::{fmt, layer::SubscriberExt, EnvFilter, Layer, Registry}; | ||||||
|  |  | ||||||
| mod db; | use crate::db::get_next; | ||||||
| mod parser; |  | ||||||
| mod s3; |  | ||||||
|  |  | ||||||
| const GET_METRIC: &'static str = "total_gets"; | mod db; | ||||||
| const GET_IN_FLIGHT: &'static str = "gets_in_flight"; | mod filesystem; | ||||||
| const SITES_CRAWLED: &'static str = "pages_crawled"; | mod parser; | ||||||
| const BEING_PROCESSED: &'static str = "pages_being_processed"; |  | ||||||
|  | static METER: LazyLock<Meter> = LazyLock::new(|| global::meter("Internet_Mapper")); | ||||||
|  | static BATCH_SIZE: LazyLock<Counter<u64>> = | ||||||
|  |     LazyLock::new(|| METER.u64_counter("crawler_batch_size").build()); | ||||||
|  | static BEING_PROCESSED: LazyLock<UpDownCounter<i64>> = LazyLock::new(|| { | ||||||
|  |     METER | ||||||
|  |         .i64_up_down_counter("crawler_pages_being_processed") | ||||||
|  |         .build() | ||||||
|  | }); | ||||||
|  | static BEING_PARSED: LazyLock<UpDownCounter<i64>> = LazyLock::new(|| { | ||||||
|  |     METER | ||||||
|  |         .i64_up_down_counter("crawler_pages_being_parsed") | ||||||
|  |         .build() | ||||||
|  | }); | ||||||
|  | static BEING_STREAMED: LazyLock<UpDownCounter<i64>> = LazyLock::new(|| { | ||||||
|  |     METER | ||||||
|  |         .i64_up_down_counter("crawler_pages_being_streamed") | ||||||
|  |         .build() | ||||||
|  | }); | ||||||
|  | static GET_IN_FLIGHT: LazyLock<UpDownCounter<i64>> = | ||||||
|  |     LazyLock::new(|| METER.i64_up_down_counter("crawler_gets_in_flight").build()); | ||||||
|  | static TOTAL_BYTES_DOWN: LazyLock<Counter<u64>> = | ||||||
|  |     LazyLock::new(|| METER.u64_counter("crawler_total_bytes_down").build()); | ||||||
|  | static SITES_CRAWLED: LazyLock<Counter<u64>> = | ||||||
|  |     LazyLock::new(|| METER.u64_counter("crawler_total_sites_crawled").build()); | ||||||
|  |  | ||||||
|  | static CONFIG: LazyLock<Config> = LazyLock::new(|| { | ||||||
|  |     let mut file = File::open("./Crawler.toml").expect("Failed to read Crawler.toml"); | ||||||
|  |     let mut buf = String::new(); | ||||||
|  |     let _ = file.read_to_string(&mut buf); | ||||||
|  |  | ||||||
|  |     let config: Config = toml::from_str(&buf).expect("Failed to parse Crawler.toml"); | ||||||
|  |     config | ||||||
|  | }); | ||||||
|  |  | ||||||
|  | // FIXME Traces aren't working on multiple threads, they block | ||||||
|  | // static TRACER: LazyLock<BoxedTracer> = LazyLock::new(|| global::tracer("Internet_Mapper")); | ||||||
|  |  | ||||||
| #[derive(Deserialize)] | #[derive(Deserialize)] | ||||||
| struct Config { | struct Config { | ||||||
|  |     tracing_endpoint: String, | ||||||
|  |     metrics_endpoint: String, | ||||||
|  |     log_file: String, | ||||||
|  |  | ||||||
|     surreal_ns: String, |     surreal_ns: String, | ||||||
|     surreal_db: String, |     surreal_db: String, | ||||||
|     surreal_url: String, |     surreal_url: String, | ||||||
|     surreal_username: String, |     surreal_username: String, | ||||||
|     surreal_password: String, |     surreal_password: String, | ||||||
|  |  | ||||||
|     s3_url: String, |  | ||||||
|     s3_bucket: String, |  | ||||||
|     s3_access_key: String, |  | ||||||
|     s3_secret_key: String, |  | ||||||
|  |  | ||||||
|     crawl_filter: String, |     crawl_filter: String, | ||||||
|  |     start_url: String, | ||||||
|     budget: usize, |     budget: usize, | ||||||
|  |     batch_size: usize, | ||||||
| } | } | ||||||
|  |  | ||||||
| #[tokio::main] | #[tokio::main] | ||||||
| async fn main() { | async fn main() { | ||||||
|     let total_runtime = Timer::start("Completed"); |     println!("Logs and metrics are provided to the Grafana dashboard"); | ||||||
|  |  | ||||||
|     let writer = std::fs::OpenOptions::new() |     // Start TRACE / LOGGING / METRICS | ||||||
|         .append(true) |     load_logging(&CONFIG); // this seems to be working ok | ||||||
|         .create(true) |     global::set_tracer_provider(load_tracing(&CONFIG)); | ||||||
|         .open("./docker/logs/tracing.log") |     global::set_meter_provider(load_metrics(&CONFIG)); | ||||||
|         .expect("Couldn't make log file!"); |  | ||||||
|  |  | ||||||
|     let registry = Registry::default() |     BATCH_SIZE.add(CONFIG.batch_size as u64, &[]); | ||||||
|         .with( |  | ||||||
|             fmt::layer() |  | ||||||
|                 .with_line_number(true) |  | ||||||
|                 .with_thread_ids(true) |  | ||||||
|                 .with_file(true) |  | ||||||
|                 // .with_timer(LocalTime::rfc_3339()) // Loki or alloy does this automatically |  | ||||||
|                 .json() |  | ||||||
|                 .with_writer(writer) |  | ||||||
|                 // .with_filter(EnvFilter::from_default_env()) |  | ||||||
|         ); |  | ||||||
|  |  | ||||||
|     tracing::subscriber::set_global_default(registry).expect("Failed to set default subscriber"); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     let builder = PrometheusBuilder::new(); |  | ||||||
|     builder.with_http_listener( |  | ||||||
|             std::net::SocketAddr::new(IpAddr::V4(Ipv4Addr::from_octets([0,0,0,0])), 2500) |  | ||||||
|         ) |  | ||||||
|         .install() |  | ||||||
|         .expect("failed to install recorder/exporter"); |  | ||||||
|  |  | ||||||
|     debug!("Starting..."); |  | ||||||
|     // Would probably take these in as parameters from a cli |  | ||||||
|     let starting_url = "https://en.wikipedia.org/"; |  | ||||||
|     // When getting uncrawled pages, name must contain this variable. "" will effectively get ignored. |     // When getting uncrawled pages, name must contain this variable. "" will effectively get ignored. | ||||||
|     // let crawl_filter = "en.wikipedia.org/"; |     // let crawl_filter = "en.wikipedia.org/"; | ||||||
|     // let budget = 50; |     // let budget = 50; | ||||||
|     let mut crawled = 0; |     let crawled = Arc::new(RwLock::new(0)); | ||||||
|  |  | ||||||
|  |     let starting_url = &CONFIG.start_url; | ||||||
|  |  | ||||||
|     let mut file = File::open("./Crawler.toml").expect("Failed to read Crawler.toml"); |     let db = connect(&CONFIG) | ||||||
|     let mut buf = String::new(); |  | ||||||
|     let _ = file.read_to_string(&mut buf); |  | ||||||
|  |  | ||||||
|     let config: Config = toml::from_str(&buf).expect("Failed to parse Crawler.toml"); |  | ||||||
|  |  | ||||||
|     let db = connect(&config) |  | ||||||
|         .await |         .await | ||||||
|         .expect("Failed to connect to surreal, aborting."); |         .expect("Failed to connect to surreal, aborting."); | ||||||
|     let s3 = S3::connect(&config) |  | ||||||
|         .await |  | ||||||
|         .expect("Failed to connect to minio, aborting.\n\nThis probably means you need to login to the minio console and get a new access key!\n\n(Probably here) http://localhost:9001/access-keys/new-account\n\n"); |  | ||||||
|  |  | ||||||
|     let reqwest = reqwest::Client::builder() |     let reqwest = reqwest::Client::builder() | ||||||
|         // .use_rustls_tls() |         // .use_rustls_tls() | ||||||
| @@ -102,142 +120,296 @@ async fn main() { | |||||||
|  |  | ||||||
|     // Kick off the whole machine - This Website object doesn't matter, it's just to allow for |     // Kick off the whole machine - This Website object doesn't matter, it's just to allow for | ||||||
|     // get() to work. |     // get() to work. | ||||||
|     let span = trace_span!("Pre-Loop"); |     // let mut span = TRACER.start("Pre-Loop"); | ||||||
|     let pre_loop_span = span.enter(); |     let site = Website::new(starting_url, false); | ||||||
|     // Download the site |     process(site, db.clone(), reqwest.clone()).await; | ||||||
|     let site = Website::new(&starting_url, false); |     // span.end(); | ||||||
|     get(site, db.clone(), reqwest.clone(), s3.clone()).await; |  | ||||||
|  |  | ||||||
|     drop(pre_loop_span); |     // let mut main_loop_span= TRACER.start("Main-Loop"); | ||||||
|  |     let mut futures = JoinSet::new(); | ||||||
|  |     for _ in 0..CONFIG.batch_size { | ||||||
|  |         futures.spawn(process_single_thread( | ||||||
|  |             &CONFIG, | ||||||
|  |             db.clone(), | ||||||
|  |             reqwest.clone(), | ||||||
|  |             crawled.clone(), | ||||||
|  |         )); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     let span = trace_span!("Loop"); |     while let Some(_) = futures.join_next().await { | ||||||
|     let span = span.enter(); |         // Budget - Threads - This thread (1) | ||||||
|     while crawled < config.budget { |         // Would roughly be the acceptable amount at which a thread should exit | ||||||
|         let get_num = if config.budget - crawled < 100 { |         if *(crawled.read().await) < CONFIG.budget - CONFIG.batch_size - 1 { | ||||||
|             config.budget - crawled |             warn!("Thread terminated early, restarting"); | ||||||
|         } else { |             futures.spawn(process_single_thread( | ||||||
|             100 |                 &CONFIG, | ||||||
|         }; |                 db.clone(), | ||||||
|  |                 reqwest.clone(), | ||||||
|         let uncrawled = get_uncrawled_links(&db, get_num, config.crawl_filter.clone()).await; |                 crawled.clone(), | ||||||
|         if uncrawled.len() == 0 { |             )); | ||||||
|             info!("Had more budget but finished crawling everything."); |  | ||||||
|             return; |  | ||||||
|         } |  | ||||||
|         debug!("Crawling {} pages...", uncrawled.len()); |  | ||||||
|  |  | ||||||
|         let span = trace_span!("Crawling"); |  | ||||||
|         let _ = span.enter(); |  | ||||||
|  |  | ||||||
|         { |  | ||||||
|             let mut futures = JoinSet::new(); |  | ||||||
|             for site in uncrawled { |  | ||||||
|                 gauge!(BEING_PROCESSED).increment(1); |  | ||||||
|                 futures.spawn(get(site, db.clone(), reqwest.clone(), s3.clone())); |  | ||||||
|                 // let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32); |  | ||||||
|                 // info!("Crawled {crawled} out of {budget} pages. ({percent})"); |  | ||||||
|             } |  | ||||||
|             debug!("Joining {} futures...", futures.len()); |  | ||||||
|  |  | ||||||
|             let c = counter!(SITES_CRAWLED); |  | ||||||
|             // As futures complete runs code in while block |  | ||||||
|             while let Some(_) = futures.join_next().await { |  | ||||||
|                 c.increment(1); |  | ||||||
|                 gauge!(BEING_PROCESSED).decrement(1); |  | ||||||
|                 crawled += 1; |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|     drop(span); |  | ||||||
|  |     futures.join_all().await; | ||||||
|  |     // main_loop_span.end(); | ||||||
|  |  | ||||||
|     info!("Done"); |     info!("Done"); | ||||||
|     drop(total_runtime); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| #[instrument(skip (db, s3, reqwest))] | async fn process_single_thread( | ||||||
|  |     config: &Config, | ||||||
|  |     db: Surreal<Client>, | ||||||
|  |     reqwest: reqwest::Client, | ||||||
|  |     crawled: Arc<RwLock<usize>>, | ||||||
|  | ) { | ||||||
|  |     while *(crawled.read().await) < config.budget { | ||||||
|  |         let uncrawled = get_next(&db.clone(), &config).await; | ||||||
|  |         match uncrawled { | ||||||
|  |             Some(site) => { | ||||||
|  |                 process(site, db.clone(), reqwest.clone()).await; | ||||||
|  |                 SITES_CRAWLED.add(1, &[]); | ||||||
|  |                 // Somehow this write doesn't hang on the while's read? | ||||||
|  |                 let mut c = crawled.write().await; | ||||||
|  |                 *c += 1; | ||||||
|  |             } | ||||||
|  |             None => { | ||||||
|  |                 warn!("fn::get_next() returned None"); | ||||||
|  |                 return; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[instrument(skip(db, reqwest))] | ||||||
| /// Downloads and crawls and stores a webpage. | /// Downloads and crawls and stores a webpage. | ||||||
| /// It is acceptable to clone `db`, `reqwest`, and `s3` because they all use `Arc`s internally. - Noted by Oliver | /// It is acceptable to clone `db`, `reqwest`, and `s3` because they all use `Arc`s internally. - Noted by Oliver | ||||||
| async fn get(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Client, s3: S3) { | async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Client) { | ||||||
|     trace!("Get: {}", site.to_string()); |     // METRICS | ||||||
|  |     debug!(url = &site.site.as_str(), "Process: {}", &site.site); | ||||||
|  |     BEING_PROCESSED.add(1, &[]); | ||||||
|  |     // let mut process_span = TRACER.start("Process"); | ||||||
|  |  | ||||||
|     let timer = Timer::start("Built request"); |     // Build the request | ||||||
|     let request_builder = reqwest.get(site.to_string()); |     let request_builder = reqwest.get(site.site.to_string()); | ||||||
|     timer.stop(); |  | ||||||
|  |  | ||||||
|     let g = gauge!(GET_IN_FLIGHT); |  | ||||||
|     g.increment(1); |  | ||||||
|     let timer = Timer::start("Got page"); |  | ||||||
|  |  | ||||||
|  |     // Send the http request (get) | ||||||
|  |     GET_IN_FLIGHT.add(1, &[]); | ||||||
|     if let Ok(response) = request_builder.send().await { |     if let Ok(response) = request_builder.send().await { | ||||||
|  |         let mut skip_download = false; | ||||||
|  |  | ||||||
|         timer.stop(); |         GET_IN_FLIGHT.add(-1, &[]); | ||||||
|         g.decrement(1); |  | ||||||
|         counter!(GET_METRIC).increment(1); |  | ||||||
|         debug!("Getting body..."); |  | ||||||
|  |  | ||||||
|         // Get body |         let headers = response.headers(); | ||||||
|         let data = response.text().await.expect("Failed to read http response's body!"); |         let code = response.status(); | ||||||
|         // Store document |         if code != 200 { | ||||||
|         s3.store(&data, &site.site).await; |             warn!("{code} for {}", site.site.as_str()); | ||||||
|         // Parse document and store relationships |  | ||||||
|         parser::parse(&db, &mut site, &data).await; |  | ||||||
|         return; |  | ||||||
|     } |  | ||||||
|     trace!("Failed to get: {}", site.to_string()); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// Returns uncrawled links |  | ||||||
| #[instrument(skip(db))] |  | ||||||
| async fn get_uncrawled_links( |  | ||||||
|     db: &Surreal<Client>, |  | ||||||
|     mut count: usize, |  | ||||||
|     filter: String, |  | ||||||
| ) -> Vec<Website> { |  | ||||||
|     if count > 100 { |  | ||||||
|         count = 100 |  | ||||||
|     } |  | ||||||
|     debug!("Getting uncrawled links"); |  | ||||||
|  |  | ||||||
|     let mut response = db |  | ||||||
|         .query("SELECT * FROM website WHERE crawled = false AND site ~ type::string($format) LIMIT $count;") |  | ||||||
|         .bind(("format", filter)) |  | ||||||
|         .bind(("count", count)) |  | ||||||
|         .await |  | ||||||
|         .expect("Hard-coded query failed..?"); |  | ||||||
|     response |  | ||||||
|         .take(0) |  | ||||||
|         .expect("Returned websites couldn't be parsed") |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct Timer<'a> { |  | ||||||
|     start: Instant, |  | ||||||
|     msg: &'a str, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<'a> Timer<'a> { |  | ||||||
|     #[inline] |  | ||||||
|     pub fn start(msg: &'a str) -> Self { |  | ||||||
|         Self { |  | ||||||
|             start: Instant::now(), |  | ||||||
|             msg, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     pub fn stop(&self) -> f64 { |  | ||||||
|         let dif = self.start.elapsed().as_micros(); |  | ||||||
|         let ms = dif as f64 / 1000.; |  | ||||||
|  |  | ||||||
|         if ms > 200. { |  | ||||||
|             warn!("{}", format!("{} in {:.3}ms", self.msg, ms)); |  | ||||||
|         } else { |  | ||||||
|             trace!("{}", format!("{} in {:.3}ms", self.msg, ms)); |  | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         ms |         #[allow(non_snake_case)] | ||||||
|  |         let CT = headers.get("Content-Type"); | ||||||
|  |         let ct = headers.get("content-type"); | ||||||
|  |  | ||||||
|  |         let ct = match (CT, ct) { | ||||||
|  |             (None, None) => { | ||||||
|  |                 warn!( | ||||||
|  |                     "Server did not respond with Content-Type header. Url: {} Headers: ({:?})", | ||||||
|  |                     site.site.to_string(), | ||||||
|  |                     headers | ||||||
|  |                 ); | ||||||
|  |                 return; | ||||||
|  |             } | ||||||
|  |             (None, Some(a)) => a, | ||||||
|  |             (Some(a), None) => a, | ||||||
|  |             (Some(a), Some(_)) => a, | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         // create filepath (handles / -> /index.html) | ||||||
|  |         let real_path = filesystem::as_path(&site.site, ct); | ||||||
|  |         let mut tmp_path = real_path.clone(); | ||||||
|  |         if !(tmp_path.add_extension("crawl_temp")) { | ||||||
|  |             warn!("Failed to add extension to file"); | ||||||
|  |             // fallback ig | ||||||
|  |             tmp_path = tmp_path.with_extension("crawl_temp"); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // CODE FOR UPDATING DOWNLOADED CONTENT: | ||||||
|  |         // Check the Content-Length header (we assume the server is telling the truth) (I don't see | ||||||
|  |         // a reason for it to lie in this case). | ||||||
|  |         // And see if the file on the disk is the same length. | ||||||
|  |         // Yes, technically this isn't perfect, but the other option is storing ETags, which I | ||||||
|  |         // don't want to do right now. | ||||||
|  |         if let Some(len) = headers.get("Content-Length") { | ||||||
|  |             if let Ok(s) = len.to_str() { | ||||||
|  |                 // length is in bytes | ||||||
|  |                 if let Ok(len) = s.parse::<u64>() { | ||||||
|  |                     if let Some(disk_len) = filesystem::check_file_length(&real_path).await { | ||||||
|  |                         if disk_len == len { | ||||||
|  |                             skip_download = true; | ||||||
|  |                         } | ||||||
|  |                     } else { | ||||||
|  |                         // File not found (or other error). | ||||||
|  |                         // Program will continue on it's way, downloading content. | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // make sure that the file is good to go | ||||||
|  |         if let Some(file) = filesystem::init(&tmp_path).await { | ||||||
|  |             // Get body from response | ||||||
|  |             // stream the response onto the disk | ||||||
|  |             let mut stream = response.bytes_stream(); | ||||||
|  |  | ||||||
|  |             let should_parse = real_path.to_string_lossy().ends_with(".html"); | ||||||
|  |  | ||||||
|  |             let mut buf: Vec<u8> = Vec::new(); | ||||||
|  |  | ||||||
|  |             if skip_download && should_parse { | ||||||
|  |                 // since we are skipping the download we will just read the file off the disk to | ||||||
|  |                 // parse it | ||||||
|  |                 if let Ok(mut file) = tokio::fs::OpenOptions::new()  | ||||||
|  |                     .read(true) | ||||||
|  |                     .open(&real_path).await | ||||||
|  |                 { | ||||||
|  |                     if let Err(err) = file.read_to_end(&mut buf).await { | ||||||
|  |                         warn!("Failed to read file off disk for parsing, {}", err); | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |             }  | ||||||
|  |  | ||||||
|  |             // !!!DOWNLOADING TIME!!! | ||||||
|  |             if !skip_download { | ||||||
|  |                 let mut writer = BufWriter::new(file); | ||||||
|  |  | ||||||
|  |                 // Write file to disk | ||||||
|  |                 trace!("Writing at: {:?}", tmp_path); | ||||||
|  |                 BEING_STREAMED.add(1, &[]); | ||||||
|  |                 // let mut stream_span = TRACER.start("Stream"); | ||||||
|  |                 while let Some(data) = stream.next().await { | ||||||
|  |                     match data { | ||||||
|  |                         Ok(data) => { | ||||||
|  |                             TOTAL_BYTES_DOWN.add(data.len() as u64, &[]); | ||||||
|  |                             let _ = writer.write_all(&data).await; | ||||||
|  |                             // If we are going to parse this file later, we will save it | ||||||
|  |                             // into memory as well as the disk. | ||||||
|  |                             // We do this because the data here might be incomplete | ||||||
|  |                             if should_parse { | ||||||
|  |                                 data.iter().for_each(|f| buf.push(*f)); | ||||||
|  |                             } | ||||||
|  |                         } | ||||||
|  |                         Err(err) => { | ||||||
|  |                             error!("{err}") | ||||||
|  |                         } | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |                 let _ = writer.flush().await; | ||||||
|  |                 // rename the temp file into the real file name | ||||||
|  |                 if let Err(err) = tokio::fs::rename(&tmp_path, &real_path).await { | ||||||
|  |                     error!( | ||||||
|  |                         from = &*tmp_path.to_string_lossy(), | ||||||
|  |                         to = &*real_path.to_string_lossy(), | ||||||
|  |                         "Error renaming file: {}", | ||||||
|  |                         err | ||||||
|  |                     ); | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 // stream_span.end(); | ||||||
|  |                 BEING_STREAMED.add(-1, &[]); | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             // (If needed) Parse the file | ||||||
|  |             if should_parse { | ||||||
|  |                 BEING_PARSED.add(1, &[]); | ||||||
|  |                 // let mut parsing_span = TRACER.start("Parsing"); | ||||||
|  |  | ||||||
|  |                 // Parse document and get relationships | ||||||
|  |                 let sites = parser::parse(&site, &buf).await; | ||||||
|  |                 // De-duplicate this list | ||||||
|  |                 let prev_len = sites.len(); | ||||||
|  |                 let set = sites.into_iter().fold(HashSet::new(), |mut set, item| { | ||||||
|  |                     set.insert(item); | ||||||
|  |                     set | ||||||
|  |                 }); | ||||||
|  |                 let de_dupe_sites: Vec<Website> = set.into_iter().collect(); | ||||||
|  |                 let diff = prev_len - de_dupe_sites.len(); | ||||||
|  |                 trace!("Saved {diff} from being entered into the db by de-duping"); | ||||||
|  |                 // Store all the other sites so that we can link to them. | ||||||
|  |                 let _ = Website::store_all(de_dupe_sites, &db).await; | ||||||
|  |  | ||||||
|  |                 // parsing_span.end(); | ||||||
|  |                 BEING_PARSED.add(-1, &[]); | ||||||
|  |             } else { | ||||||
|  |                 trace!(url = site.site.as_str(), "Parse = False"); | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             // update self in db | ||||||
|  |             site.crawled = true; | ||||||
|  |             site.status_code = code.as_u16(); | ||||||
|  |             Website::store_all(vec![site.clone()], &db).await; | ||||||
|  |         } | ||||||
|  |     } else { | ||||||
|  |         error!(url = site.site.as_str(), "Failed to get: {}", &site.site); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     // process_span.end(); | ||||||
|  |     BEING_PROCESSED.add(-1, &[]); | ||||||
| } | } | ||||||
|  |  | ||||||
| impl Drop for Timer<'_> { | fn load_tracing(config: &Config) -> SdkTracerProvider { | ||||||
|     fn drop(&mut self) { |     // Send spans to Alloy (which will send them to Tempo) | ||||||
|         self.stop(); |     let otlp_span = opentelemetry_otlp::SpanExporter::builder() | ||||||
|     } |         .with_tonic() | ||||||
|  |         .with_endpoint(config.tracing_endpoint.clone()) | ||||||
|  |         .build() | ||||||
|  |         .unwrap(); | ||||||
|  |     let tracer_provider = opentelemetry_sdk::trace::SdkTracerProvider::builder() | ||||||
|  |         .with_simple_exporter(otlp_span) | ||||||
|  |         .build(); | ||||||
|  |     tracer_provider | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn load_logging(config: &Config) { | ||||||
|  |     //    let otlp_log = opentelemetry_otlp::LogExporter::builder() | ||||||
|  |     //        .with_tonic() | ||||||
|  |     //        .with_endpoint(endpoint) | ||||||
|  |     //        .build() | ||||||
|  |     //        .unwrap(); | ||||||
|  |     // let logger_provider = opentelemetry_sdk::logs::SdkLoggerProvider::builder() | ||||||
|  |     //    .with_simple_exporter(otlp_log) | ||||||
|  |     //    .build(); | ||||||
|  |     let writer = std::fs::OpenOptions::new() | ||||||
|  |         .append(true) | ||||||
|  |         .create(true) | ||||||
|  |         .open(config.log_file.clone()) | ||||||
|  |         .expect("Couldn't make log file!"); | ||||||
|  |  | ||||||
|  |     let filter = EnvFilter::builder() | ||||||
|  |         .with_default_directive(LevelFilter::DEBUG.into()) | ||||||
|  |         .from_env_lossy(); | ||||||
|  |  | ||||||
|  |     let registry = Registry::default().with( | ||||||
|  |         fmt::layer() | ||||||
|  |             .with_line_number(true) | ||||||
|  |             .with_thread_ids(true) | ||||||
|  |             .with_file(true) | ||||||
|  |             .json() | ||||||
|  |             .with_writer(writer) | ||||||
|  |             .with_filter(filter), | ||||||
|  |     ); | ||||||
|  |  | ||||||
|  |     tracing::subscriber::set_global_default(registry).expect("Failed to set default subscriber"); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn load_metrics(config: &Config) -> SdkMeterProvider { | ||||||
|  |     //  Send metrics to Prometheus | ||||||
|  |     let otlp_metrics = opentelemetry_otlp::MetricExporter::builder() | ||||||
|  |         .with_http() | ||||||
|  |         .with_protocol(Protocol::HttpBinary) | ||||||
|  |         .with_endpoint(config.metrics_endpoint.clone()) | ||||||
|  |         .build() | ||||||
|  |         .unwrap(); | ||||||
|  |     let metrics_provider = opentelemetry_sdk::metrics::SdkMeterProvider::builder() | ||||||
|  |         .with_periodic_exporter(otlp_metrics) // default delay is 60s, turn down to like 15 | ||||||
|  |         .build(); | ||||||
|  |     metrics_provider | ||||||
| } | } | ||||||
|   | |||||||
							
								
								
									
										119
									
								
								src/parser.rs
									
									
									
									
									
								
							
							
						
						
									
										119
									
								
								src/parser.rs
									
									
									
									
									
								
							| @@ -1,16 +1,13 @@ | |||||||
| use std::default::Default; | use std::default::Default; | ||||||
| use std::str::FromStr; |  | ||||||
|  |  | ||||||
| use html5ever::tokenizer::{BufferQueue, TokenizerResult}; | use html5ever::tokenizer::{BufferQueue, TokenizerResult}; | ||||||
| use html5ever::tokenizer::{StartTag, TagToken}; | use html5ever::tokenizer::{StartTag, TagToken}; | ||||||
| use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; | use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; | ||||||
| use html5ever::{local_name, tendril::*}; | use html5ever::{local_name, tendril::*}; | ||||||
| use surrealdb::engine::remote::ws::Client; | use tracing::{error, instrument, trace, warn}; | ||||||
| use surrealdb::Surreal; | use url::Url; | ||||||
| use tracing::instrument; |  | ||||||
|  |  | ||||||
| use crate::db::Website; | use crate::db::Website; | ||||||
| use crate::Timer; |  | ||||||
|  |  | ||||||
| impl TokenSink for Website { | impl TokenSink for Website { | ||||||
|     type Handle = Vec<Website>; |     type Handle = Vec<Website>; | ||||||
| @@ -20,6 +17,7 @@ impl TokenSink for Website { | |||||||
|             TagToken(tag) => { |             TagToken(tag) => { | ||||||
|                 if tag.kind == StartTag { |                 if tag.kind == StartTag { | ||||||
|                     match tag.name { |                     match tag.name { | ||||||
|  |                         // this should be all the html elements that have links | ||||||
|                         local_name!("a") |                         local_name!("a") | ||||||
|                         | local_name!("audio") |                         | local_name!("audio") | ||||||
|                         | local_name!("area") |                         | local_name!("area") | ||||||
| @@ -34,21 +32,18 @@ impl TokenSink for Website { | |||||||
|                                 let attr_name = attr.name.local.to_string(); |                                 let attr_name = attr.name.local.to_string(); | ||||||
|                                 if attr_name == "src" || attr_name == "href" || attr_name == "data" |                                 if attr_name == "src" || attr_name == "href" || attr_name == "data" | ||||||
|                                 { |                                 { | ||||||
|                                     // Get clone of the current site object |                                     trace!(url = self.site.as_str(),"Found `{}` in html `{}` tag", &attr.value, tag.name); | ||||||
|                                     let mut web = self.clone(); |                                     let url = try_get_url(&self.site, &attr.value); | ||||||
|  |  | ||||||
|                                     // Set url |                                     if let Some(mut parsed) = url { | ||||||
|                                     let mut url = web.site; |                                         parsed.set_query(None); | ||||||
|                                     url.set_fragment(None); // removes #xyz |                                         parsed.set_fragment(None); | ||||||
|                                     let joined = url.join(&attr.value).expect("Failed to join url during parsing!"); |                                         trace!(url = self.site.as_str(), "Final cleaned URL: `{}`", parsed.to_string()); | ||||||
|                                     web.site = joined; |                                         let web = Website::new(&parsed.to_string(), false); | ||||||
|  |                                         links.push(web); | ||||||
|                                     web.crawled = false; |                                     } | ||||||
|  |  | ||||||
|                                     links.push(web); |  | ||||||
|                                 } |                                 } | ||||||
|                             } |                             } | ||||||
|  |  | ||||||
|                             return TokenSinkResult::Script(links); |                             return TokenSinkResult::Script(links); | ||||||
|                         } |                         } | ||||||
|                         local_name!("button") | local_name!("meta") | local_name!("iframe") => { |                         local_name!("button") | local_name!("meta") | local_name!("iframe") => { | ||||||
| @@ -64,46 +59,92 @@ impl TokenSink for Website { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| #[instrument(skip_all)] | #[instrument(skip(data))] | ||||||
| pub async fn parse(db: &Surreal<Client>, site: &mut Website, data: &str) { | /// Parses the passed site and returns all the sites it links to. | ||||||
|     // update self in db | pub async fn parse(site: &Website, data: &[u8]) -> Vec<Website> { | ||||||
|     site.set_crawled(); |     trace!(url = site.site.as_str(), "Parsing {}", site.site.to_string()); | ||||||
|     site.store(db).await; |  | ||||||
|  |  | ||||||
|     // prep work |     // prep work | ||||||
|     let mut other_sites: Vec<Website> = Vec::new(); |     let mut other_sites: Vec<Website> = Vec::new(); | ||||||
|     { // using blocks to prevent compiler's async worries |  | ||||||
|         let _t = Timer::start("Parsed page"); |  | ||||||
|  |  | ||||||
|         // change data into something that can be tokenized |     // change data into something that can be tokenized | ||||||
|         let chunk = Tendril::from_str(&data).expect("Failed to parse string into Tendril!"); |     let s: Result<Tendril<fmt::UTF8>, ()> = Tendril::try_from_byte_slice(data); | ||||||
|  |     if let Ok(chunk) = s { | ||||||
|         // create buffer of tokens and push our input into it |         // create buffer of tokens and push our input into it | ||||||
|         let mut token_buffer = BufferQueue::default(); |         let token_buffer = BufferQueue::default(); | ||||||
|         token_buffer.push_back(chunk.try_reinterpret::<fmt::UTF8>().expect("Failed to reinterprt chunk!")); |         token_buffer.push_back( | ||||||
|  |             chunk | ||||||
|  |                 .try_reinterpret::<fmt::UTF8>() | ||||||
|  |                 .expect("Failed to reinterpret chunk!"), | ||||||
|  |         ); | ||||||
|         // create the tokenizer |         // create the tokenizer | ||||||
|         let tokenizer = Tokenizer::new(site.clone(), TokenizerOpts::default()); |         let tokenizer = Tokenizer::new(site.clone(), TokenizerOpts::default()); | ||||||
|  |  | ||||||
|         // go thru buffer |         // go thru buffer | ||||||
|         while let TokenizerResult::Script(mut sites) = tokenizer.feed(&mut token_buffer) { |         while let TokenizerResult::Script(mut sites) = tokenizer.feed(&token_buffer) { | ||||||
|             other_sites.append(&mut sites); |             other_sites.append(&mut sites); | ||||||
|             // other_sites.push(sites); |             // other_sites.push(sites); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         assert!(token_buffer.is_empty()); |         assert!(token_buffer.is_empty()); | ||||||
|         tokenizer.end(); |         tokenizer.end(); | ||||||
|  |     } else { | ||||||
|  |         warn!(url = site.site.as_str(), "Tendril failed to parse on: {}", site.site.to_string()); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     { |     other_sites | ||||||
|         let mut links_to = Vec::with_capacity(other_sites.len()); | } | ||||||
|  |  | ||||||
|         for a in other_sites { | #[instrument] | ||||||
|  | fn try_get_url(parent: &Url, link: &str) -> Option<Url> { | ||||||
|  |     match Url::parse(link) { | ||||||
|  |         Ok(ok) => Some(ok), | ||||||
|  |         Err(e) => { | ||||||
|  |             if link.starts_with('#') { | ||||||
|  |                 trace!(url = parent.as_str(), "Rejecting # url"); | ||||||
|  |                 None | ||||||
|  |             } else if link.starts_with("//") { | ||||||
|  |                 // if a url starts with "//" is assumed that it will adopt | ||||||
|  |                 // the same scheme as it's parent | ||||||
|  |                 // https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute | ||||||
|  |                 let scheme = parent.scheme(); | ||||||
|  |  | ||||||
|             let other = a.store(db).await; |                 match Url::parse(&format!("{scheme}://{link}")) { | ||||||
|             if let Some(o) = other { |                     Ok(url) => Some(url), | ||||||
|                 links_to.push(o); |                     Err(err) => { | ||||||
|  |                         error!("Failed parsing relative scheme url: {}", err); | ||||||
|  |                         None | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |             } else { | ||||||
|  |                 // # This is some sort of relative url, gonna try patching it up into an absolute | ||||||
|  |                 // url | ||||||
|  |                 match e { | ||||||
|  |                     url::ParseError::RelativeUrlWithoutBase => { | ||||||
|  |                         // Is: scheme://host:port | ||||||
|  |                         let mut origin = parent.origin().ascii_serialization(); | ||||||
|  |                         if !origin.ends_with('/') && !link.starts_with('/') { | ||||||
|  |                             origin += "/"; | ||||||
|  |                         } | ||||||
|  |                         let url = origin.clone() + link; | ||||||
|  |  | ||||||
|  |                         if let Ok(url) = Url::parse(&url) { | ||||||
|  |                             trace!(url = parent.as_str(), "Built `{url}` from `{origin} + `{}`", link.to_string()); | ||||||
|  |                             Some(url) | ||||||
|  |                         } else { | ||||||
|  |                             error!( | ||||||
|  |                                 "Failed to reconstruct a url from relative url: `{}` on site: `{}`. Failed url was: {}", | ||||||
|  |                                 link, | ||||||
|  |                                 parent.to_string(), | ||||||
|  |                                 url | ||||||
|  |                             ); | ||||||
|  |                             None | ||||||
|  |                         } | ||||||
|  |                     } | ||||||
|  |                     _ => { | ||||||
|  |                         error!("MISC error: {:?} {:?}", e, link); | ||||||
|  |                         None | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         site.links_to(links_to, db).await; |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
							
								
								
									
										105
									
								
								src/s3.rs
									
									
									
									
									
								
							
							
						
						
									
										105
									
								
								src/s3.rs
									
									
									
									
									
								
							| @@ -1,105 +0,0 @@ | |||||||
| use base64::{alphabet, engine::{self, general_purpose}, Engine}; |  | ||||||
| use metrics::counter; |  | ||||||
| use minio::s3::{ |  | ||||||
|     args::{BucketExistsArgs, MakeBucketArgs}, |  | ||||||
|     client::ClientBuilder, |  | ||||||
|     creds::StaticProvider, |  | ||||||
|     error::Error, |  | ||||||
|     http::BaseUrl, |  | ||||||
|     Client, |  | ||||||
| }; |  | ||||||
| use tracing::{instrument, trace, warn}; |  | ||||||
| use url::Url; |  | ||||||
|  |  | ||||||
| use crate::{Config, Timer}; |  | ||||||
|          |  | ||||||
| const CUSTOM_ENGINE: engine::GeneralPurpose = engine::GeneralPurpose::new(&alphabet::URL_SAFE, general_purpose::NO_PAD); |  | ||||||
|  |  | ||||||
| const ROUND_TRIP_METRIC: &'static str = "s3_trips"; |  | ||||||
|  |  | ||||||
| #[derive(Clone)] |  | ||||||
| pub struct S3 { |  | ||||||
|     bucket_name: String, |  | ||||||
|     client: Client, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl S3 { |  | ||||||
|     #[instrument(skip_all, name = "S3")] |  | ||||||
|     pub async fn connect(config: &Config) -> Result<Self, Error> { |  | ||||||
|         let base_url = config |  | ||||||
|             .s3_url |  | ||||||
|             .parse::<BaseUrl>() |  | ||||||
|             .expect("Failed to parse url into BaseUrl"); |  | ||||||
|  |  | ||||||
|         let static_provider = |  | ||||||
|             StaticProvider::new(&config.s3_access_key, &config.s3_secret_key, None); |  | ||||||
|  |  | ||||||
|         let client = ClientBuilder::new(base_url) |  | ||||||
|             .provider(Some(Box::new(static_provider))) |  | ||||||
|             .build()?; |  | ||||||
|  |  | ||||||
|         trace!("Checking bucket..."); |  | ||||||
|         let exists = client |  | ||||||
|             .bucket_exists( |  | ||||||
|                 &BucketExistsArgs::new(&config.s3_bucket) |  | ||||||
|                     .expect("Failed to check if bucket exists"), |  | ||||||
|             ) |  | ||||||
|             .await?; |  | ||||||
|  |  | ||||||
|         if !exists { |  | ||||||
|             trace!("Creating bucket..."); |  | ||||||
|             client |  | ||||||
|                 .make_bucket( |  | ||||||
|                     &MakeBucketArgs::new(&config.s3_bucket).expect("Failed to create bucket!"), |  | ||||||
|                 ) |  | ||||||
|                 .await?; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         trace!("Connection successful"); |  | ||||||
|  |  | ||||||
|         Ok(Self { |  | ||||||
|             bucket_name: config.s3_bucket.to_owned(), |  | ||||||
|             client: client, |  | ||||||
|         }) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[instrument(name = "s3_store", skip_all)] |  | ||||||
|     pub async fn store(&self, data: &str, url: &Url) { |  | ||||||
|         let counter = counter!(ROUND_TRIP_METRIC); |  | ||||||
|         let t = Timer::start("Stored page"); |  | ||||||
|         let _ = t; // prevent compiler drop |  | ||||||
|         if let Some(domain) = url.domain() { |  | ||||||
|             let filename = domain.to_owned() + url.path(); |  | ||||||
|  |  | ||||||
|             trace!("Created filename: {filename} from raw: {}", url.to_string()); |  | ||||||
|  |  | ||||||
|             counter.increment(1); |  | ||||||
|             let _ = match &self |  | ||||||
|                 .client |  | ||||||
|                 .put_object_content(&self.bucket_name, &filename, data.to_owned()) |  | ||||||
|                 .send() |  | ||||||
|                 .await { |  | ||||||
|                     Ok(_) => {}, |  | ||||||
|                     Err(err) => { |  | ||||||
|                         match err { |  | ||||||
|                             Error::InvalidObjectName(_) => { |  | ||||||
|  |  | ||||||
|                                 warn!("Tried storing invalid object name, retrying with Base64 encoding. Last try."); |  | ||||||
|                                  |  | ||||||
|                                 let filename: String = domain.to_owned() + &CUSTOM_ENGINE.encode(url.path()); |  | ||||||
|  |  | ||||||
|                                 counter.increment(1); |  | ||||||
|                                 let _ = &self |  | ||||||
|                                     .client |  | ||||||
|                                     .put_object_content(&self.bucket_name, &filename, data.to_owned()) |  | ||||||
|                                     .send() |  | ||||||
|                                     .await |  | ||||||
|                                     .unwrap(); |  | ||||||
|                             }, |  | ||||||
|                             _ => {}, |  | ||||||
|                        } |  | ||||||
|                     }, |  | ||||||
|                 }; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| @@ -1,3 +1,18 @@ | |||||||
| DEFINE TABLE IF NOT EXISTS website SCHEMALESS; | DEFINE TABLE IF NOT EXISTS website SCHEMALESS; | ||||||
| DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now(); |  | ||||||
|  | DEFINE FIELD IF NOT EXISTS site ON TABLE website TYPE string; | ||||||
| DEFINE INDEX IF NOT EXISTS idx ON TABLE website COLUMNS site UNIQUE; | DEFINE INDEX IF NOT EXISTS idx ON TABLE website COLUMNS site UNIQUE; | ||||||
|  |  | ||||||
|  | DEFINE FIELD IF NOT EXISTS crawled ON TABLE website TYPE bool; | ||||||
|  | DEFINE FIELD IF NOT EXISTS processing ON TABLE website TYPE bool DEFAULT false; | ||||||
|  |  | ||||||
|  | DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now(); | ||||||
|  | DEFINE FIELD IF NOT EXISTS first_accessed_at ON TABLE website VALUE time::now(); | ||||||
|  |  | ||||||
|  | DEFINE FUNCTION OVERWRITE fn::get_next($filter: string) { | ||||||
|  |     LET $site = SELECT * FROM ONLY website WHERE crawled = false AND processing = false AND site ~ type::string($filter) LIMIT 1; | ||||||
|  |     UPDATE $site.id SET processing = true; | ||||||
|  |     RETURN $site | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | UPDATE website SET processing = false WHERE processing = true; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user