Compare commits

...

4 Commits
main...minio

SHA1        Message                       Date
0f8a3d7215  using a custom parser now :)  2024-11-12 23:08:09 -07:00
574a370f30  readme updates                2024-11-12 21:24:57 -07:00
eaa79b749e  prepare get function for s3   2024-11-12 21:19:05 -07:00
2c28d69d55  add s3 support                2024-11-12 21:03:58 -07:00
12 changed files with 514 additions and 170 deletions

.gitignore (vendored): 1 change

@@ -1,5 +1,6 @@
 /target
 /.surrealdb
+/.minio
 perf.data
 flamegraph.svg
 perf.data.old

.vscode/launch.json (vendored): 2 changes

@@ -9,7 +9,7 @@
             "request": "launch",
             "name": "Debug executable 'surreal_spider'",
             "env": {
-                "RUST_LOG": "surreal_spider=debug,reqwest=info",
+                "RUST_LOG": "surreal_spider=trace,reqwest=info",
             },
             "cargo": {
                 "args": [

Cargo.lock (generated): 268 changes

@@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4
 [[package]]
 name = "Inflector"
@@ -103,6 +103,55 @@ dependencies = [
  "libc",
 ]
+[[package]]
+name = "anstream"
+version = "0.6.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+[[package]]
+name = "anstyle"
+version = "1.0.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
+[[package]]
+name = "anstyle-parse"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
+dependencies = [
+ "utf8parse",
+]
+[[package]]
+name = "anstyle-query"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
+dependencies = [
+ "windows-sys 0.59.0",
+]
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
+dependencies = [
+ "anstyle",
+ "windows-sys 0.59.0",
+]
 [[package]]
 name = "any_ascii"
 version = "0.3.2"
@@ -262,6 +311,17 @@ dependencies = [
  "serde_json",
 ]
+[[package]]
+name = "async-recursion"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.85",
+]
 [[package]]
 name = "async-stream"
 version = "0.3.6"
@@ -665,6 +725,12 @@ dependencies = [
  "inout",
 ]
+[[package]]
+name = "colorchoice"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
 [[package]]
 name = "concurrent-queue"
 version = "2.5.0"
@@ -705,6 +771,21 @@ dependencies = [
  "libc",
 ]
+[[package]]
+name = "crc"
+version = "3.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636"
+dependencies = [
+ "crc-catalog",
+]
+[[package]]
+name = "crc-catalog"
+version = "2.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
 [[package]]
 name = "crossbeam-utils"
 version = "0.8.20"
@@ -775,6 +856,20 @@ dependencies = [
  "parking_lot_core",
 ]
+[[package]]
+name = "dashmap"
+version = "6.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
+dependencies = [
+ "cfg-if",
+ "crossbeam-utils",
+ "hashbrown 0.14.5",
+ "lock_api",
+ "once_cell",
+ "parking_lot_core",
+]
 [[package]]
 name = "data-encoding"
 version = "2.6.0"
@@ -791,6 +886,17 @@ dependencies = [
  "serde",
 ]
+[[package]]
+name = "derivative"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
 [[package]]
 name = "deunicode"
 version = "1.6.0"
@@ -895,6 +1001,29 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
+[[package]]
+name = "env_filter"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab"
+dependencies = [
+ "log",
+ "regex",
+]
+[[package]]
+name = "env_logger"
+version = "0.11.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "env_filter",
+ "humantime",
+ "log",
+]
 [[package]]
 name = "equivalent"
 version = "1.0.1"
@@ -1279,6 +1408,15 @@ dependencies = [
  "digest",
 ]
+[[package]]
+name = "home"
+version = "0.5.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5"
+dependencies = [
+ "windows-sys 0.52.0",
+]
 [[package]]
 name = "html5ever"
 version = "0.27.0"
@@ -1347,6 +1485,12 @@ version = "1.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946"
+[[package]]
+name = "httpdate"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
 [[package]]
 name = "humantime"
 version = "2.1.0"
@@ -1366,6 +1510,7 @@ dependencies = [
  "http",
  "http-body",
  "httparse",
+ "httpdate",
  "itoa",
  "pin-project-lite",
  "smallvec",
@@ -1631,6 +1776,12 @@ version = "2.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708"
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
 [[package]]
 name = "itertools"
 version = "0.10.5"
@@ -1836,18 +1987,6 @@ dependencies = [
  "tendril",
 ]
-[[package]]
-name = "markup5ever_rcdom"
-version = "0.5.0-unofficial"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9cb12459c4cab18dcc580159590f404ad78c0a9c5435ace80288ed43abdce31"
-dependencies = [
- "html5ever 0.29.0",
- "markup5ever 0.14.0",
- "tendril",
- "xml5ever",
-]
 [[package]]
 name = "matchers"
 version = "0.1.0"
@@ -1877,6 +2016,12 @@ dependencies = [
  "digest",
 ]
+[[package]]
+name = "md5"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 [[package]]
 name = "memchr"
 version = "2.7.4"
@@ -1922,6 +2067,46 @@ dependencies = [
  "unicase",
 ]
+[[package]]
+name = "minio"
+version = "0.2.0-alpha"
+source = "git+https://github.com/minio/minio-rs.git?rev=c28f576#c28f576cb8f8cf47fb941bb9db62b2cbd6f080c1"
+dependencies = [
+ "async-recursion",
+ "async-trait",
+ "base64 0.22.1",
+ "byteorder",
+ "bytes",
+ "chrono",
+ "crc",
+ "dashmap 6.1.0",
+ "derivative",
+ "env_logger",
+ "futures-util",
+ "hex",
+ "hmac",
+ "home",
+ "http",
+ "hyper",
+ "lazy_static",
+ "log",
+ "md5",
+ "multimap",
+ "os_info",
+ "percent-encoding",
+ "rand",
+ "regex",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "sha2",
+ "tokio",
+ "tokio-stream",
+ "tokio-util",
+ "urlencoding",
+ "xmltree",
+]
 [[package]]
 name = "miniz_oxide"
 version = "0.8.0"
@@ -1960,6 +2145,15 @@ dependencies = [
  "version_check",
 ]
+[[package]]
+name = "multimap"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03"
+dependencies = [
+ "serde",
+]
 [[package]]
 name = "nanoid"
 version = "0.4.0"
@@ -2183,6 +2377,17 @@ dependencies = [
  "vcpkg",
 ]
+[[package]]
+name = "os_info"
+version = "3.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae99c7fa6dd38c7cafe1ec085e804f8f555a2f8659b0dbe03f1f9963a9b51092"
+dependencies = [
+ "log",
+ "serde",
+ "windows-sys 0.52.0",
+]
 [[package]]
 name = "overload"
 version = "0.1.1"
@@ -3436,7 +3641,7 @@ name = "surreal_spider"
 version = "0.1.0"
 dependencies = [
  "html5ever 0.29.0",
- "markup5ever_rcdom",
+ "minio",
  "reqwest",
  "serde",
  "surrealdb",
@@ -3510,7 +3715,7 @@ dependencies = [
  "cedar-policy",
  "chrono",
  "ciborium",
- "dashmap",
+ "dashmap 5.5.3",
  "deunicode",
  "dmp",
  "fst",
@@ -3840,6 +4045,17 @@ dependencies = [
  "tokio",
 ]
+[[package]]
+name = "tokio-stream"
+version = "0.1.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f4e6ce100d0eb49a2734f8c0812bcd324cf357d21810932c5df6b96ef2b86f1"
+dependencies = [
+ "futures-core",
+ "pin-project-lite",
+ "tokio",
+]
 [[package]]
 name = "tokio-tungstenite"
 version = "0.23.1"
@@ -4107,6 +4323,12 @@ version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
 [[package]]
 name = "uuid"
 version = "1.11.0"
@@ -4485,14 +4707,18 @@ dependencies = [
 ]
 [[package]]
-name = "xml5ever"
-version = "0.20.0"
+name = "xml-rs"
+version = "0.8.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2278b4bf33071ba8e30368a59436c65eec8e01c49d5c29b3dfeb0cdc45331383"
+checksum = "af310deaae937e48a26602b730250b4949e125f468f11e6990be3e5304ddd96f"
+[[package]]
+name = "xmltree"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b619f8c85654798007fb10afa5125590b43b088c225a25fc2fec100a9fad0fc6"
 dependencies = [
- "log",
- "mac",
- "markup5ever 0.14.0",
+ "xml-rs",
 ]
 [[package]]

Cargo.toml

@@ -5,7 +5,8 @@ edition = "2021"
 [dependencies]
 html5ever = "0.29.0"
-markup5ever_rcdom = "0.5.0-unofficial"
+# minio = "0.1.0"
+minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"}
 reqwest = "0.12.9"
 serde = { version = "1.0.214", features = ["derive"] }
 surrealdb = "2.0.4"

README.md

@@ -1,23 +1,11 @@
 # Surreal Crawler
-Mapping with a budget of 1000 (crawl 1000 sites, so many more links are actually discovered), on [my webiste](https://oliveratkinson.net) on 8/26/2024 took 1m9s.
-This is including the crawl and loading into the database and linking sites. (Locally hosted surreal db instance)
-This run created 4299 site links with 23286 links between the sites. (It found my this git site which really bolsters those numbers.)
-## Install / Build
-* You will need rust to compile the crawler [rustup.rs](https://rustup.rs)
-* You need python3 (will come installed on most linux distros) and poetry for dependancy management.
-* Install `pipx`, `python3`
-* Then: `pipx install poetry`
-* Then: `poetry install` to install the project dependancies
-* You need to install [surrealdb](https://surrealdb.com)
-## Use
-Just run `./crawl.sh {url}` and it will start crawling. You can tweak the budget inside [crawl.sh](https://git.oliveratkinson.net/Oliver/internet_mapper/src/branch/main/crawl.sh) if you want.
-You can also prefix the command with `time` to benchmark the system, such as: `time ./crawl.sh https://discord.com`.
+Crawls sites saving all the found links to a surrealdb database. It then proceeds to take batches of 100 uncrawled links untill the crawl budget is reached. It saves the data of each site in a minio database.
+### TODO
+- [ ] Domain filtering - prevent the crawler from going on alternate versions of wikipedia.
+- [ ] Conditionally save content - based on filename or file contents
+- [ ] GUI / TUI ?
+- [ ] Better asynchronous getting of the sites. Currently it all happens serially.
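The first TODO item in the README above (domain filtering) could come down to an allow-list check on each discovered link before it is stored. A minimal sketch, assuming the url crate the project already uses; the function name and the list are made up, not part of this change set:

```rust
use url::Url;

// Sketch only: keep a link if its host is one of the allowed domains or a
// subdomain of one, e.g. "en.wikipedia.org" matches "wikipedia.org".
fn domain_allowed(url: &Url, allowed: &[&str]) -> bool {
    match url.domain() {
        Some(host) => allowed
            .iter()
            .any(|d| host == *d || host.ends_with(&format!(".{d}"))),
        None => false,
    }
}
```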

Docker Compose file

@@ -1,5 +1,5 @@
 services:
-  db:
+  surreal:
     image: surrealdb/surrealdb:latest-dev
     ports:
      - 8000:8000
@@ -14,3 +14,18 @@ services:
      - --pass
      - root
      - rocksdb:/mydata/database.db
+  minio:
+    image: quay.io/minio/minio
+    ports:
+     - 9000:9000
+     - 9001:9001
+    environment:
+     - MINIO_ROOT_USER=root
+     - MINIO_ROOT_PASSWORD=an8charpassword
+    volumes:
+     - ./.minio/:/data
+    command:
+     - server
+     - /data
+     - --console-address
+     - ":9001"

Deleted SurrealQL schema file

@@ -1,2 +0,0 @@
-DEFINE TABLE website SCHEMALESS;
-DEFINE FIELD accessed_at ON TABLE website VALUE time::now();

src/db.rs

@@ -8,12 +8,12 @@ use surrealdb::{
 use tracing::{error, instrument, trace, warn};
 use url::Url;
-use crate::Timer;
+use crate::{Config, Timer};
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct Website {
     /// The url that this data is found at
-    site: Url,
+    pub site: Url,
     /// Wether or not this link has been crawled yet
     pub crawled: bool,
     #[serde(skip_serializing)]
@@ -39,10 +39,6 @@
         self.crawled = true
     }
-    pub fn mut_url(&mut self) -> &mut Url {
-        &mut self.site
-    }
     #[instrument(skip_all)]
     pub async fn links_to(&self, other: Vec<Thing>, db: &Surreal<Client>) {
         let len = other.len();
@@ -149,19 +145,30 @@ pub struct Record {
     pub id: Thing,
 }
-pub async fn connect() -> surrealdb::Result<Surreal<Client>> {
+#[instrument(skip_all, name = "SurrealDB")]
+pub async fn connect(config: &Config<'_>) -> surrealdb::Result<Surreal<Client>> {
+    trace!("Establishing connection to surreal...");
     // Connect to the server
-    let db = Surreal::new::<Ws>("127.0.0.1:8000").await?;
+    let db = Surreal::new::<Ws>(config.surreal_url).await?;
+    trace!("Logging in...");
     // Signin as a namespace, database, or root user
     db.signin(Root {
-        username: "root",
-        password: "root",
+        username: config.surreal_username,
+        password: config.surreal_password,
     })
     .await?;
     // Select a specific namespace / database
-    db.use_ns("test").use_db("v1.2").await?;
+    db
+        .use_ns(config.surreal_ns)
+        .use_db(config.surreal_db)
+        .await?;
+    let setup = include_bytes!("setup.surql");
+    let file = setup.iter().map(|c| *c as char).collect::<String>();
+    db.query(file).await.expect("Failed to setup surreal tables.");
     Ok(db)
 }
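The new `connect` decodes the embedded `setup.surql` by mapping each byte to a `char`, which only round-trips cleanly for ASCII. A minimal alternative sketch (an assumption, not part of this change set) that embeds the script as UTF-8 text directly:

```rust
use surrealdb::{engine::remote::ws::Client, Surreal};

// Sketch only: include_str! embeds the file as a &'static str at compile time,
// so no byte-to-char conversion is needed and non-ASCII text is preserved.
async fn run_setup(db: &Surreal<Client>) -> surrealdb::Result<()> {
    let setup: &str = include_str!("setup.surql");
    db.query(setup).await?;
    Ok(())
}
```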

src/main.rs

@@ -1,17 +1,29 @@
 extern crate html5ever;
-extern crate markup5ever_rcdom as rcdom;
+use std::time::Instant;
 use db::{connect, Website};
-use html5ever::{
-    local_name, parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts,
-};
-use rcdom::RcDom;
-use std::time::Instant;
-use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};
+use s3::S3;
+use surrealdb::{engine::remote::ws::Client, Surreal};
 use tracing::{debug, info, instrument, trace, trace_span};
 use tracing_subscriber::EnvFilter;
 mod db;
+mod s3;
+mod parser;
+struct Config<'a> {
+    surreal_ns: &'a str,
+    surreal_db: &'a str,
+    surreal_url: &'a str,
+    surreal_username: &'a str,
+    surreal_password: &'a str,
+    s3_url: &'a str,
+    s3_bucket: &'a str,
+    s3_access_key: &'a str,
+    s3_secret_key: &'a str,
+}
 #[tokio::main]
 async fn main() {
@@ -22,15 +34,27 @@
         .init();
     debug!("Starting...");
+    let config = Config {
+        surreal_ns: "test",
+        surreal_db: "v1.7",
+        surreal_url: "localhost:8000",
+        surreal_username: "root",
+        surreal_password: "root",
+        s3_url: "http://localhost:9000",
+        s3_bucket: "v1.7",
+        s3_access_key: "8tUJn7e1paMFZQr0PKIT",
+        s3_secret_key: "uSMvYxNOeCejCUgXVqgTfYlUEcmiZY0xcZ91M9E0",
+    };
     // Would probably take these in as parameters from a cli
-    let url = "https://oliveratkinson.net/";
-    // let url = "http://localhost:5500";
-    let budget = 1000;
+    let starting_url = "https://oliveratkinson.net/";
+    let budget = 15;
     let mut crawled = 0;
-    let db = connect().await.expect("Failed to connect to db, aborting.");
-    let client = reqwest::Client::builder()
+    let s3 = S3::connect(&config).await.expect("Failed to connect to minio, aborting.");
+    let db = connect(&config).await.expect("Failed to connect to surreal, aborting.");
+    let reqwest = reqwest::Client::builder()
         // .use_rustls_tls()
         .build()
         .unwrap();
@@ -40,19 +64,15 @@
     let span = trace_span!("Pre-Loop");
     let pre_loop_span = span.enter();
     // Download the site
-    let mut site = Website::new(&url, false);
-    get(&mut site, &db, &client, &mut crawled).await;
+    let mut site = Website::new(&starting_url, false);
+    get(&mut site, &db, &reqwest, &s3, &mut crawled).await;
     drop(pre_loop_span);
     let span = trace_span!("Loop");
     let span = span.enter();
     while crawled < budget {
-        let get_num = if budget - crawled < 100 {
-            budget - crawled
-        } else {
-            100
-        };
+        let get_num = if budget - crawled < 100 { budget - crawled } else { 100 };
         let uncrawled = get_uncrawled_links(&db, get_num).await;
         if uncrawled.len() == 0 {
@@ -65,7 +85,8 @@
         let _ = span.enter();
         for mut site in uncrawled {
-            get(&mut site, &db, &client, &mut crawled).await;
+            get(&mut site, &db, &reqwest, &s3, &mut crawled).await;
             let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32);
             info!("Crawled {crawled} out of {budget} pages. ({percent})");
         }
@@ -76,116 +97,30 @@
 }
 #[instrument(skip_all)]
-/// A quick helper function for downloading a url
+/// Downloads and crawls and stores a webpage.
 async fn get(
     site: &mut Website,
     db: &Surreal<Client>,
-    request_client: &reqwest::Client,
+    reqwest: &reqwest::Client,
+    s3: &S3,
     count: &mut usize,
 ) {
     trace!("Get: {}", site.to_string());
     let timer = Timer::start("Got page");
-    if let Ok(response) = request_client.get(site.to_string()).send().await {
+    if let Ok(response) = reqwest.get(site.to_string()).send().await {
         timer.stop();
         // Get body
         let data = response.text().await.unwrap();
-        let opts = ParseOpts {
-            tree_builder: TreeBuilderOpts {
-                drop_doctype: true,
-                ..Default::default()
-            },
-            ..Default::default()
-        };
-        // Get DOM
-        let dom = parse_document(RcDom::default(), opts)
-            .from_utf8()
-            .read_from(&mut data.as_bytes())
-            .unwrap();
-        // TODO save the dom to minio if a flag is set
-        // Modify record in database
-        site.set_crawled();
-        site.store(db).await;
-        trace!("Got: {}", site.to_string());
-        // Walk all the children nodes, searching for links to other pages.
-        let mut buffer = Vec::new();
-        let timer = Timer::start("Walked");
-        walk(&dom.document, &db, &site, &mut buffer).await;
-        timer.stop();
-        // Put all the found links into the database.
-        site.links_to(buffer, &db).await;
+        // Store document
+        s3.store(&data, &site.site).await;
+        // Parse document and store relationships
+        parser::parse(db, site, data).await;
         *count += 1;
     }
     trace!("Failed to get: {}", site.to_string());
 }
-/// Walks the givin site, placing it's findings in the database
-async fn walk(
-    node: &rcdom::Handle,
-    db: &Surreal<Client>,
-    site: &Website,
-    links_to: &mut Vec<Thing>,
-) {
-    let span = trace_span!("Walk");
-    let span = span.enter();
-    // Match each node - node basically means element.
-    match &node.data {
-        rcdom::NodeData::Element { name, attrs, .. } => {
-            for attr in attrs.borrow().clone() {
-                match name.local {
-                    local_name!("a")
-                    | local_name!("audio")
-                    | local_name!("area")
-                    | local_name!("img")
-                    | local_name!("link")
-                    | local_name!("object")
-                    | local_name!("source")
-                    | local_name!("base")
-                    | local_name!("video") => {
-                        let attribute_name = attr.name.local.to_string();
-                        if attribute_name == "src"
-                            || attribute_name == "href"
-                            || attribute_name == "data"
-                        {
-                            // Get clone of the current site object
-                            let mut web = site.clone();
-                            // Set url
-                            let url = web.mut_url();
-                            url.set_fragment(None); // removes #xyz
-                            let joined = url.join(&attr.value).unwrap();
-                            *url = joined;
-                            // Set other attributes
-                            web.crawled = false;
-                            // TODO set element name
-                            // let element_name = name.local.to_string();
-                            if let Some(id) = web.store(db).await {
-                                links_to.push(id);
-                            }
-                        }
-                    }
-                    local_name!("button") | local_name!("meta") | local_name!("iframe") => {
-                        // dbg!(attrs);
-                    }
-                    _ => {}
-                };
-            }
-        }
-        _ => {}
-    };
-    drop(span);
-    for child in node.children.borrow().iter() {
-        Box::pin(walk(child, db, site, links_to)).await;
-    }
-}
 /// Returns uncrawled links
 async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Website> {
     if count > 100 {
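The crawl loop above fetches each uncrawled site one at a time, which is also the last TODO item in the README. A minimal sketch of one way to overlap the fetches (an assumption, not part of this change set): have a variant of `get` report success instead of bumping the shared counter, then drive a bounded number of fetches concurrently with the futures crate.

```rust
use futures::stream::{self, StreamExt};

// Sketch only: `get_counted` is a hypothetical variant of `get` that returns
// 1 on success and 0 on failure instead of mutating `&mut count`.
async fn crawl_batch(
    uncrawled: Vec<Website>,
    db: &Surreal<Client>,
    reqwest: &reqwest::Client,
    s3: &S3,
) -> usize {
    stream::iter(uncrawled)
        .map(|mut site| async move { get_counted(&mut site, db, reqwest, s3).await })
        .buffer_unordered(8) // at most 8 in-flight requests
        .fold(0, |done, n| async move { done + n })
        .await
}
```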

src/parser.rs (new file, 95 lines)

@@ -0,0 +1,95 @@
use std::default::Default;
use std::str::FromStr;

use html5ever::tokenizer::{BufferQueue, TokenizerResult};
use html5ever::tokenizer::{StartTag, TagToken};
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use html5ever::{local_name, tendril::*};
use surrealdb::engine::remote::ws::Client;
use surrealdb::Surreal;

use crate::db::Website;

#[derive(Clone)]
struct LinkParser<'a> {
    site: &'a Website,
}

impl TokenSink for LinkParser<'_> {
    type Handle = Vec<Website>;

    fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
        match token {
            TagToken(tag) => {
                if tag.kind == StartTag {
                    match tag.name {
                        local_name!("a")
                        | local_name!("audio")
                        | local_name!("area")
                        | local_name!("img")
                        | local_name!("link")
                        | local_name!("object")
                        | local_name!("source")
                        | local_name!("base")
                        | local_name!("video") => {
                            let mut links = Vec::new();
                            for attr in &tag.attrs {
                                let attr_name = attr.name.local.to_string();
                                if attr_name == "src" || attr_name == "href" || attr_name == "data"
                                {
                                    // Get clone of the current site object
                                    let mut web = self.site.clone();

                                    // Set url
                                    let mut url = web.site;
                                    url.set_fragment(None); // removes #xyz
                                    let joined = url.join(&attr.value).unwrap();
                                    web.site = joined;

                                    web.crawled = false;

                                    links.push(web);
                                }
                            }
                            return TokenSinkResult::Script(links);
                        }
                        local_name!("button") | local_name!("meta") | local_name!("iframe") => {
                            // dbg!(attrs);
                        }
                        _ => {}
                    }
                }
            }
            _ => {}
        }
        TokenSinkResult::Continue
    }
}

pub async fn parse(db: &Surreal<Client>, site: &mut Website, data: String) {
    site.set_crawled();
    site.store(db).await;

    let sink = LinkParser { site };
    let chunk = Tendril::from_str(&data).unwrap();
    let mut input = BufferQueue::default();
    input.push_back(chunk.try_reinterpret::<fmt::UTF8>().unwrap());

    let token = Tokenizer::new(sink.clone(), TokenizerOpts::default());
    let mut links_to = Vec::new();

    while !input.is_empty() {
        if let TokenizerResult::Script(s) = token.feed(&mut input) {
            for mut web in s {
                if let Some(id) = web.store(db).await {
                    links_to.push(id);
                }
            }
        }
    }
    sink.site.links_to(links_to, db).await;

    assert!(input.is_empty());
    token.end();
}
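A hypothetical usage sketch of the new parser; the HTML snippet and URL are made up, and it assumes a connected SurrealDB handle plus a `Website::new` that takes a URL string, as it is called in src/main.rs:

```rust
// Hypothetical example: run the tokenizer-based parser over a small in-memory page.
// `parse` marks the site crawled, stores it, and records every link found in the
// href/src/data attributes of a, img, link, video, ... tags as an uncrawled Website.
let mut site = Website::new("https://example.com/", false);
let html = r#"<a href="/about">About</a> <img src="/logo.png">"#.to_string();
parser::parse(&db, &mut site, html).await;
```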

src/s3.rs (new file, 76 lines)

@@ -0,0 +1,76 @@
use minio::s3::{
    args::{BucketExistsArgs, MakeBucketArgs}, client::ClientBuilder, creds::StaticProvider, error::Error, http::BaseUrl, types::S3Api, Client
};
use tracing::{instrument, trace};
use url::Url;

use crate::Config;

pub struct S3 {
    bucket_name: String,
    client: Client,
}

impl S3 {
    #[instrument(skip_all, name = "S3")]
    pub async fn connect(config: &Config<'_>) -> Result<Self, Error> {
        let base_url = config.s3_url.parse::<BaseUrl>().unwrap();

        let static_provider =
            StaticProvider::new(&config.s3_access_key, &config.s3_secret_key, None);

        let client = ClientBuilder::new(base_url)
            .provider(Some(Box::new(static_provider)))
            .build()?;

        trace!("Checking bucket...");
        let exists = client
            .bucket_exists(&BucketExistsArgs::new(&config.s3_bucket).unwrap())
            .await?;

        if !exists {
            trace!("Creating bucket...");
            client
                .make_bucket(&MakeBucketArgs::new(&config.s3_bucket).unwrap())
                .await?;
        }

        trace!("Connection successfull");

        Ok(Self {
            bucket_name: config.s3_bucket.to_owned(),
            client: client,
        })
    }

    pub async fn store(&self, data: &str, name: &Url) {
        if let Some(domain) = name.domain() {
            let filename = domain.to_string() + name.path();

            let _ = &self
                .client
                .put_object_content(&self.bucket_name, &filename, data.to_owned())
                .send()
                .await
                .unwrap();
        }
    }

    pub async fn _get(&self, name: &Url) -> Option<String> {
        if let Some(domain) = name.domain() {
            let filename = domain.to_string() + name.path();

            let data = self
                .client
                .get_object(&self.bucket_name, &filename)
                .send()
                .await
                .unwrap();

            if let Ok(segments) = data.content.to_segmented_bytes().await {
                return Some(segments.to_bytes().iter().map(|c| *c as char).collect::<String>());
            }
        }
        None
    }
}
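A hypothetical usage sketch of the object-key scheme used by `store` and `_get` above; the URL is made up, and it assumes the `Config` values from src/main.rs and a reachable MinIO instance:

```rust
// Hypothetical usage: the object key is derived as "<domain><path>", so the
// same Url round-trips between store() and _get().
let s3 = S3::connect(&config).await.expect("Failed to connect to minio");
let url = Url::parse("https://oliveratkinson.net/some/page").unwrap();
s3.store("<html>...</html>", &url).await; // stored under "oliveratkinson.net/some/page"
let body = s3._get(&url).await;           // Some("<html>...</html>")
```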

src/setup.surql (new file, 2 lines)

@@ -0,0 +1,2 @@
DEFINE TABLE IF NOT EXISTS website SCHEMALESS;
DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now();