From 2c28d69d55aac73744709096d21410576c2a0f21 Mon Sep 17 00:00:00 2001 From: oliver Date: Tue, 12 Nov 2024 21:03:58 -0700 Subject: [PATCH] add s3 support --- .gitignore | 1 + .vscode/launch.json | 2 +- Cargo.lock | 254 +++++++++++++++++++++++++++++++++++++++++++- Cargo.toml | 2 + compose.yml | 17 ++- schema.surql | 2 - src/db.rs | 29 +++-- src/main.rs | 56 +++++++--- src/s3.rs | 63 +++++++++++ src/setup.surql | 2 + 10 files changed, 398 insertions(+), 30 deletions(-) delete mode 100644 schema.surql create mode 100644 src/s3.rs create mode 100644 src/setup.surql diff --git a/.gitignore b/.gitignore index a968872..301630e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ /target /.surrealdb +/.minio perf.data flamegraph.svg perf.data.old \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json index 76fa95c..43deeb5 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -9,7 +9,7 @@ "request": "launch", "name": "Debug executable 'surreal_spider'", "env": { - "RUST_LOG": "surreal_spider=debug,reqwest=info", + "RUST_LOG": "surreal_spider=trace,reqwest=info", }, "cargo": { "args": [ diff --git a/Cargo.lock b/Cargo.lock index f256eb1..e205787 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "Inflector" @@ -103,6 +103,55 @@ dependencies = [ "libc", ] +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +dependencies = [ + "anstyle", + "windows-sys 0.59.0", +] + [[package]] name = "any_ascii" version = "0.3.2" @@ -262,6 +311,17 @@ dependencies = [ "serde_json", ] +[[package]] +name = "async-recursion" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.85", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -665,6 +725,12 @@ dependencies = [ "inout", ] +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + [[package]] name = "concurrent-queue" version = "2.5.0" @@ -705,6 +771,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crossbeam-utils" version = "0.8.20" @@ -775,6 +856,20 @@ dependencies = [ "parking_lot_core", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "data-encoding" version = "2.6.0" @@ -791,6 +886,17 @@ dependencies = [ "serde", ] +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "deunicode" version = "1.6.0" @@ -895,6 +1001,29 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" +[[package]] +name = "env_filter" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -1279,6 +1408,15 @@ dependencies = [ "digest", ] +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "html5ever" version = "0.27.0" @@ -1347,6 +1485,12 @@ version = "1.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + [[package]] name = "humantime" version = "2.1.0" @@ -1366,6 +1510,7 @@ dependencies = [ "http", "http-body", "httparse", + "httpdate", "itoa", "pin-project-lite", "smallvec", @@ -1631,6 +1776,12 @@ version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.10.5" @@ -1877,6 +2028,12 @@ dependencies = [ "digest", ] +[[package]] +name = "md5" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" + [[package]] name = "memchr" version = "2.7.4" @@ -1922,6 +2079,46 @@ dependencies = [ "unicase", ] +[[package]] +name = "minio" +version = "0.2.0-alpha" +source = "git+https://github.com/minio/minio-rs.git?rev=c28f576#c28f576cb8f8cf47fb941bb9db62b2cbd6f080c1" +dependencies = [ + "async-recursion", + "async-trait", + "base64 0.22.1", + "byteorder", + "bytes", + "chrono", + "crc", + "dashmap 6.1.0", + "derivative", + "env_logger", + "futures-util", + "hex", + "hmac", + "home", + "http", + "hyper", + "lazy_static", + "log", + "md5", + "multimap", + "os_info", + "percent-encoding", + "rand", + "regex", + "reqwest", + "serde", + "serde_json", + "sha2", + "tokio", + "tokio-stream", + "tokio-util", + "urlencoding", + "xmltree", +] + [[package]] name = "miniz_oxide" version = "0.8.0" @@ -1960,6 +2157,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "multimap" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03" +dependencies = [ + "serde", +] + [[package]] name = "nanoid" version = "0.4.0" @@ -2183,6 +2389,17 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "os_info" +version = "3.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae99c7fa6dd38c7cafe1ec085e804f8f555a2f8659b0dbe03f1f9963a9b51092" +dependencies = [ + "log", + "serde", + "windows-sys 0.52.0", +] + [[package]] name = "overload" version = "0.1.1" @@ -3437,6 +3654,7 @@ version = "0.1.0" dependencies = [ "html5ever 0.29.0", "markup5ever_rcdom", + "minio", "reqwest", "serde", "surrealdb", @@ -3510,7 +3728,7 @@ dependencies = [ "cedar-policy", "chrono", "ciborium", - "dashmap", + "dashmap 5.5.3", "deunicode", "dmp", "fst", @@ -3840,6 +4058,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-stream" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f4e6ce100d0eb49a2734f8c0812bcd324cf357d21810932c5df6b96ef2b86f1" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-tungstenite" version = "0.23.1" @@ -4107,6 +4336,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.11.0" @@ -4484,6 +4719,12 @@ dependencies = [ "tap", ] +[[package]] +name = "xml-rs" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af310deaae937e48a26602b730250b4949e125f468f11e6990be3e5304ddd96f" + [[package]] name = "xml5ever" version = "0.20.0" @@ -4495,6 +4736,15 @@ dependencies = [ "markup5ever 0.14.0", ] +[[package]] +name = "xmltree" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b619f8c85654798007fb10afa5125590b43b088c225a25fc2fec100a9fad0fc6" +dependencies = [ + "xml-rs", +] + [[package]] name = "yoke" version = "0.7.4" diff --git a/Cargo.toml b/Cargo.toml index b07be1d..a9c7532 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,8 @@ edition = "2021" [dependencies] html5ever = "0.29.0" markup5ever_rcdom = "0.5.0-unofficial" +# minio = "0.1.0" +minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"} reqwest = "0.12.9" serde = { version = "1.0.214", features = ["derive"] } surrealdb = "2.0.4" diff --git a/compose.yml b/compose.yml index 49b676d..797688f 100644 --- a/compose.yml +++ b/compose.yml @@ -1,5 +1,5 @@ services: - db: + surreal: image: surrealdb/surrealdb:latest-dev ports: - 8000:8000 @@ -14,3 +14,18 @@ services: - --pass - root - rocksdb:/mydata/database.db + minio: + image: quay.io/minio/minio + ports: + - 9000:9000 + - 9001:9001 + environment: + - MINIO_ROOT_USER=root + - MINIO_ROOT_PASSWORD=an8charpassword + volumes: + - ./.minio/:/data + command: + - server + - /data + - --console-address + - ":9001" diff --git a/schema.surql b/schema.surql deleted file mode 100644 index b4a8882..0000000 --- a/schema.surql +++ /dev/null @@ -1,2 +0,0 @@ -DEFINE TABLE website SCHEMALESS; - DEFINE FIELD accessed_at ON TABLE website VALUE time::now(); diff --git a/src/db.rs b/src/db.rs index 1656c4e..b5e1230 100644 --- a/src/db.rs +++ b/src/db.rs @@ -8,12 +8,12 @@ use surrealdb::{ use tracing::{error, instrument, trace, warn}; use url::Url; -use crate::Timer; +use crate::{Config, Timer}; #[derive(Debug, Serialize, Deserialize, Clone)] pub struct Website { /// The url that this data is found at - site: Url, + pub site: Url, /// Wether or not this link has been crawled yet pub crawled: bool, #[serde(skip_serializing)] @@ -39,10 +39,6 @@ impl Website { self.crawled = true } - pub fn mut_url(&mut self) -> &mut Url { - &mut self.site - } - #[instrument(skip_all)] pub async fn links_to(&self, other: Vec, db: &Surreal) { let len = other.len(); @@ -149,19 +145,30 @@ pub struct Record { pub id: Thing, } -pub async fn connect() -> surrealdb::Result> { +#[instrument(skip_all, name = "SurrealDB")] +pub async fn connect(config: &Config<'_>) -> surrealdb::Result> { + trace!("Establishing connection to surreal..."); // Connect to the server - let db = Surreal::new::("127.0.0.1:8000").await?; + let db = Surreal::new::(config.surreal_url).await?; + trace!("Logging in..."); // Signin as a namespace, database, or root user db.signin(Root { - username: "root", - password: "root", + username: config.surreal_username, + password: config.surreal_password, }) .await?; // Select a specific namespace / database - db.use_ns("test").use_db("v1.2").await?; + db + .use_ns(config.surreal_ns) + .use_db(config.surreal_db) + .await?; + + let setup = include_bytes!("setup.surql"); + let file = setup.iter().map(|c| *c as char).collect::(); + + db.query(file).await.expect("Failed to setup surreal tables."); Ok(db) } diff --git a/src/main.rs b/src/main.rs index 751eea9..56dc1b8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,12 +6,27 @@ use html5ever::{ local_name, parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts, }; use rcdom::RcDom; +use s3::S3; use std::time::Instant; use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal}; use tracing::{debug, info, instrument, trace, trace_span}; use tracing_subscriber::EnvFilter; mod db; +mod s3; + +struct Config<'a> { + surreal_ns: &'a str, + surreal_db: &'a str, + surreal_url: &'a str, + surreal_username: &'a str, + surreal_password: &'a str, + + s3_url: &'a str, + s3_bucket: &'a str, + s3_access_key: &'a str, + s3_secret_key: &'a str, +} #[tokio::main] async fn main() { @@ -21,16 +36,28 @@ async fn main() { .without_time() .init(); debug!("Starting..."); + + let config = Config { + surreal_ns: "test", + surreal_db: "v1.5", + surreal_url: "localhost:8000", + surreal_username: "root", + surreal_password: "root", + s3_url: "http://localhost:9000", + s3_bucket: "v1.5", + s3_access_key: "8tUJn7e1paMFZQr0PKIT", + s3_secret_key: "uSMvYxNOeCejCUgXVqgTfYlUEcmiZY0xcZ91M9E0", + }; // Would probably take these in as parameters from a cli - let url = "https://oliveratkinson.net/"; - // let url = "http://localhost:5500"; - let budget = 1000; + let starting_url = "https://oliveratkinson.net/"; + let budget = 200; let mut crawled = 0; - let db = connect().await.expect("Failed to connect to db, aborting."); - - let client = reqwest::Client::builder() + let s3 = S3::connect(&config).await.expect("Failed to connect to minio, aborting."); + let db = connect(&config).await.expect("Failed to connect to surreal, aborting."); + + let reqwest = reqwest::Client::builder() // .use_rustls_tls() .build() .unwrap(); @@ -40,8 +67,8 @@ async fn main() { let span = trace_span!("Pre-Loop"); let pre_loop_span = span.enter(); // Download the site - let mut site = Website::new(&url, false); - get(&mut site, &db, &client, &mut crawled).await; + let mut site = Website::new(&starting_url, false); + get(&mut site, &db, &reqwest, &s3, &mut crawled).await; drop(pre_loop_span); @@ -65,7 +92,7 @@ async fn main() { let _ = span.enter(); for mut site in uncrawled { - get(&mut site, &db, &client, &mut crawled).await; + get(&mut site, &db, &reqwest, &s3, &mut crawled).await; let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32); info!("Crawled {crawled} out of {budget} pages. ({percent})"); } @@ -80,13 +107,14 @@ async fn main() { async fn get( site: &mut Website, db: &Surreal, - request_client: &reqwest::Client, + reqwest: &reqwest::Client, + s3: &S3, count: &mut usize, ) { trace!("Get: {}", site.to_string()); let timer = Timer::start("Got page"); - if let Ok(response) = request_client.get(site.to_string()).send().await { + if let Ok(response) = reqwest.get(site.to_string()).send().await { timer.stop(); // Get body @@ -98,6 +126,8 @@ async fn get( }, ..Default::default() }; + s3.store(&data, &site.site).await; + // Get DOM let dom = parse_document(RcDom::default(), opts) .from_utf8() @@ -156,10 +186,10 @@ async fn walk( let mut web = site.clone(); // Set url - let url = web.mut_url(); + let mut url = web.site; url.set_fragment(None); // removes #xyz let joined = url.join(&attr.value).unwrap(); - *url = joined; + web.site = joined; // Set other attributes web.crawled = false; diff --git a/src/s3.rs b/src/s3.rs new file mode 100644 index 0000000..ddcc263 --- /dev/null +++ b/src/s3.rs @@ -0,0 +1,63 @@ +use minio::s3::{ + args::{BucketExistsArgs, MakeBucketArgs}, + client::ClientBuilder, + creds::StaticProvider, + error::Error, + http::BaseUrl, + Client, +}; +use tracing::{instrument, trace}; +use url::Url; + +use crate::Config; + +pub struct S3 { + bucket_name: String, + client: Client, +} + +impl S3 { + #[instrument(skip_all, name = "S3")] + pub async fn connect(config: &Config<'_>) -> Result { + let base_url = config.s3_url.parse::().unwrap(); + + let static_provider = + StaticProvider::new(&config.s3_access_key, &config.s3_secret_key, None); + + let client = ClientBuilder::new(base_url) + .provider(Some(Box::new(static_provider))) + .build()?; + + trace!("Checking bucket..."); + let exists = client + .bucket_exists(&BucketExistsArgs::new(&config.s3_bucket).unwrap()) + .await?; + + if !exists { + trace!("Creating bucket..."); + client + .make_bucket(&MakeBucketArgs::new(&config.s3_bucket).unwrap()) + .await?; + } + + trace!("Connection successfull"); + + Ok(Self { + bucket_name: config.s3_bucket.to_owned(), + client: client, + }) + } + + pub async fn store(&self, data: &str, name: &Url) { + if let Some(domain) = name.domain() { + let filename = domain.to_string() + name.path(); + + let _ = &self + .client + .put_object_content(&self.bucket_name, &filename, data.to_owned()) + .send() + .await + .unwrap(); + } + } +} diff --git a/src/setup.surql b/src/setup.surql new file mode 100644 index 0000000..4271c79 --- /dev/null +++ b/src/setup.surql @@ -0,0 +1,2 @@ +DEFINE TABLE IF NOT EXISTS website SCHEMALESS; +DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now();