diff --git a/Cargo.lock b/Cargo.lock index 238e64f..cbb76f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3673,6 +3673,7 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" name = "surreal_spider" version = "0.1.0" dependencies = [ + "base64 0.22.1", "html5ever 0.29.0", "minio", "reqwest", diff --git a/Cargo.toml b/Cargo.toml index 72c21f7..ed51242 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] +base64 = "0.22.1" html5ever = "0.29" # minio = "0.1.0" minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"} diff --git a/src/main.rs b/src/main.rs index 6db0563..7ae82fd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -41,17 +41,17 @@ async fn main() { surreal_username: "root", surreal_password: "root", surreal_ns: "test", - surreal_db: "custom-engine-v2", - s3_bucket: "custom-engine-v2", + surreal_db: "b64v1", + s3_bucket: "b64v1", s3_url: "http://localhost:9000", - s3_access_key: "0zv7GbLQsw4ZI8TclMps", - s3_secret_key: "5dB7QkGFw7fYbUJ5LpHk2GbWR7Bl710HlRz4NbzB", + s3_access_key: "8UO76z8wCs9DnpxSbQUY", + s3_secret_key: "xwKVMpf2jzgprsdo85Dvo74UmO84y0aRrAUorYY5", }; // Would probably take these in as parameters from a cli let starting_url = "https://en.wikipedia.org/"; // When getting uncrawled pages, name must contain this variable. "" will effectively get ignored. 
- let crawl_filter = "https://en.wikipedia.org/"; + let crawl_filter = "wikipedia.org/"; let budget = 50; let mut crawled = 0; diff --git a/src/s3.rs b/src/s3.rs index f17b7d1..6ed33cb 100644 --- a/src/s3.rs +++ b/src/s3.rs @@ -43,9 +43,14 @@ impl S3 { }) } + #[instrument(skip_all)] pub async fn store(&self, data: &str, name: &Url) { - if let Some(domain) = name.domain() { - let filename = domain.to_string() + name.path(); + if let Some(domain) = name.to_string().split('#').collect::<Vec<_>>().get(0) { + use base64::prelude::*; + // FIXME can still get unsupported characters, _ I think + let filename = BASE64_URL_SAFE.encode(domain); + + trace!("Filename: {filename} from {domain}"); let _ = &self .client