From b750d88d48b74fbf9373c8ed46150c13e358dabe Mon Sep 17 00:00:00 2001 From: Rushmore75 Date: Fri, 21 Mar 2025 11:42:43 -0600 Subject: [PATCH] working filesystem storage --- Crawler.toml | 2 +- docker/compose.yml | 16 -------- docker/prometheus.yaml | 8 +--- src/filesystem.rs | 4 +- src/parser.rs | 90 ++++++++++++++++++++++++++---------------- 5 files changed, 63 insertions(+), 57 deletions(-) diff --git a/Crawler.toml b/Crawler.toml index a23d0f4..8940c1b 100644 --- a/Crawler.toml +++ b/Crawler.toml @@ -3,7 +3,7 @@ surreal_url = "localhost:8000" surreal_username = "root" surreal_password = "root" surreal_ns = "test" -surreal_db = "v1.19.1" +surreal_db = "v1.19.2" # Crawler config crawl_filter = "en.wikipedia.com" diff --git a/docker/compose.yml b/docker/compose.yml index 7cb6dcc..747afa0 100644 --- a/docker/compose.yml +++ b/docker/compose.yml @@ -14,22 +14,6 @@ services: - --pass - root - rocksdb:/mydata/database.db - minio: - image: quay.io/minio/minio - ports: - - 9000:9000 - - 9001:9001 - environment: - - MINIO_ROOT_USER=root - - MINIO_ROOT_PASSWORD=an8charpassword - - MINIO_PROMETHEUS_AUTH_TYPE=public - volumes: - - minio_storage:/data - command: - - server - - /data - - --console-address - - ":9001" alloy: image: grafana/alloy:latest diff --git a/docker/prometheus.yaml b/docker/prometheus.yaml index 9133e15..ffc1e24 100644 --- a/docker/prometheus.yaml +++ b/docker/prometheus.yaml @@ -7,15 +7,11 @@ scrape_configs: static_configs: # change this your machine's ip, localhost won't work # because localhost refers to the docker container. 
- # - targets: ['172.20.239.48:2500'] - - targets: ['192.168.8.209:2500'] + - targets: ['172.20.239.48:2500'] + #- targets: ['192.168.8.209:2500'] - job_name: loki static_configs: - targets: ['loki:3100'] - job_name: prometheus static_configs: - targets: ['localhost:9090'] - - job_name: minio - metrics_path: /minio/v2/metrics/cluster - static_configs: - - targets: ['minio:9000'] diff --git a/src/filesystem.rs b/src/filesystem.rs index e92b9aa..85fcf30 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,7 +1,7 @@ use std::{ffi::OsStr, path::PathBuf}; use tokio::fs; -use tracing::{error, instrument, trace, warn}; +use tracing::{debug, error, info, instrument, trace, warn}; use url::Url; #[instrument(skip(data))] @@ -21,6 +21,8 @@ pub async fn store(data: &str, url: &Url) { (url_path.clone(), "index.html".into()) }; + info!("Writing at: {:?} {:?}", basepath, filename); + // create the folders if let Err(err) = fs::create_dir_all(&basepath).await { error!("Dir creation: {err} {:?}", basepath); diff --git a/src/parser.rs b/src/parser.rs index ac55bf0..c1e87e2 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -19,8 +19,7 @@ impl TokenSink for Website { TagToken(tag) => { if tag.kind == StartTag { match tag.name { - // this should be all the html - // elements that have links + // this should be all the html elements that have links local_name!("a") | local_name!("audio") | local_name!("area") @@ -35,37 +34,9 @@ impl TokenSink for Website { let attr_name = attr.name.local.to_string(); if attr_name == "src" || attr_name == "href" || attr_name == "data" { - let url: Option = match Url::parse(&attr.value) { - Ok(ok) => { - trace!("Found `{}` in the html on `{}` tag", ok.to_string(), tag.name); - Some(ok) - }, - Err(e) => { - if attr.value.starts_with('#') { - trace!("Rejecting # url"); - None - } else { - match e { - url::ParseError::RelativeUrlWithoutBase => { - let origin = self.site.origin().ascii_serialization(); - let url = origin.clone() + &attr.value; - 
trace!("Built `{url}` from `{origin} + {}`", &attr.value.to_string()); - if let Ok(url) = Url::parse(&url) { - trace!("Saved relative url `{}` AS: `{}`", &attr.value, url); - Some(url) - } else { - error!("Failed to reconstruct a url from relative url: `{}` on site: `{}`", &attr.value, self.site.to_string()); - None - } - }, - _ => { - error!("MISC error: {:?} {:?}", e, &attr.value); - None - }, - } - } - }, - }; + trace!("Found `{}` in html `{}` tag", &attr.value, tag.name); + let url = try_get_url(&self.site, &attr.value); + if let Some(mut parsed) = url { parsed.set_query(None); parsed.set_fragment(None); @@ -119,3 +90,56 @@ pub async fn parse(site: &Website, data: &str) -> Vec { other_sites } +#[instrument] +fn try_get_url(parent: &Url, link: &str) -> Option { + match Url::parse(link) { + Ok(ok) => Some(ok), + Err(e) => { + if link.starts_with('#') { + trace!("Rejecting # url"); + None + } else if link.starts_with("//") { + // if a url starts with "//" it is assumed that it will adopt + // the same scheme as its parent + // https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute + let scheme = parent.scheme(); + + match Url::parse(&format!("{scheme}://{}", link)) { + Ok(url) => Some(url), + Err(err) => { + error!("Failed parsing realative scheme url: {}", err); + None + } + } + } else { + // # This is some sort of relative url, gonna try patching it up into an absolute + // url + match e { + url::ParseError::RelativeUrlWithoutBase => { + // Is: scheme://host:port + let origin = parent.origin().ascii_serialization(); + let url = origin.clone() + link; + + trace!("Built `{url}` from `{origin} + {}`", link.to_string()); + + if let Ok(url) = Url::parse(&url) { + trace!("Saved relative url `{}` AS: `{}`", link, url); + Some(url) + } else { + error!( + "Failed to reconstruct a url from relative url: `{}` on site: `{}`", + link, + parent.to_string() + ); + None + } + } + _ => { + error!("MISC error: {:?} {:?}", e, link); + None + } + 
} + } + } + } +}