custom_engine #1
3
.vscode/launch.json
vendored
3
.vscode/launch.json
vendored
@ -8,6 +8,9 @@
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "Debug executable 'surreal_spider'",
|
||||
"env": {
|
||||
"RUST_LOG": "surreal_spider=trace",
|
||||
},
|
||||
"cargo": {
|
||||
"args": [
|
||||
"build",
|
||||
|
316
Cargo.lock
generated
316
Cargo.lock
generated
@ -829,6 +829,17 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "displaydoc"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.85",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dmp"
|
||||
version = "0.2.0"
|
||||
@ -1438,6 +1449,124 @@ dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_collections"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_locid"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"litemap",
|
||||
"tinystr",
|
||||
"writeable",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_locid_transform"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_locid",
|
||||
"icu_locid_transform_data",
|
||||
"icu_provider",
|
||||
"tinystr",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_locid_transform_data"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
|
||||
|
||||
[[package]]
|
||||
name = "icu_normalizer"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_collections",
|
||||
"icu_normalizer_data",
|
||||
"icu_properties",
|
||||
"icu_provider",
|
||||
"smallvec",
|
||||
"utf16_iter",
|
||||
"utf8_iter",
|
||||
"write16",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_normalizer_data"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
|
||||
|
||||
[[package]]
|
||||
name = "icu_properties"
|
||||
version = "1.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_collections",
|
||||
"icu_locid_transform",
|
||||
"icu_properties_data",
|
||||
"icu_provider",
|
||||
"tinystr",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_properties_data"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
|
||||
|
||||
[[package]]
|
||||
name = "icu_provider"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_locid",
|
||||
"icu_provider_macros",
|
||||
"stable_deref_trait",
|
||||
"tinystr",
|
||||
"writeable",
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_provider_macros"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.85",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ident_case"
|
||||
version = "1.0.1"
|
||||
@ -1446,12 +1575,23 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "0.5.0"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
|
||||
checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e"
|
||||
dependencies = [
|
||||
"unicode-bidi",
|
||||
"unicode-normalization",
|
||||
"idna_adapter",
|
||||
"smallvec",
|
||||
"utf8_iter",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna_adapter"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71"
|
||||
dependencies = [
|
||||
"icu_normalizer",
|
||||
"icu_properties",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1562,7 +1702,7 @@ dependencies = [
|
||||
"petgraph",
|
||||
"pico-args",
|
||||
"regex",
|
||||
"regex-syntax",
|
||||
"regex-syntax 0.8.5",
|
||||
"string_cache",
|
||||
"term",
|
||||
"tiny-keccak",
|
||||
@ -1576,7 +1716,7 @@ version = "0.20.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "507460a910eb7b32ee961886ff48539633b788a36b65692b95f225b844c82553"
|
||||
dependencies = [
|
||||
"regex-automata",
|
||||
"regex-automata 0.4.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1634,6 +1774,12 @@ version = "0.4.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
|
||||
|
||||
[[package]]
|
||||
name = "litemap"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.12"
|
||||
@ -1702,6 +1848,15 @@ dependencies = [
|
||||
"xml5ever",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
|
||||
dependencies = [
|
||||
"regex-automata 0.1.10",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matrixmultiply"
|
||||
version = "0.3.9"
|
||||
@ -2516,8 +2671,17 @@ checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
"regex-automata 0.4.8",
|
||||
"regex-syntax 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
|
||||
dependencies = [
|
||||
"regex-syntax 0.6.29",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -2528,9 +2692,15 @@ checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
"regex-syntax 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.5"
|
||||
@ -3273,6 +3443,7 @@ dependencies = [
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -3451,6 +3622,17 @@ dependencies = [
|
||||
"futures-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "synstructure"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.85",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "system-configuration"
|
||||
version = "0.6.1"
|
||||
@ -3583,6 +3765,16 @@ dependencies = [
|
||||
"crunchy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinystr"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.8.0"
|
||||
@ -3750,10 +3942,14 @@ version = "0.3.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"sharded-slab",
|
||||
"smallvec",
|
||||
"thread_local",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
]
|
||||
@ -3826,12 +4022,6 @@ version = "2.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-bidi"
|
||||
version = "0.3.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.13"
|
||||
@ -3883,13 +4073,14 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.5.2"
|
||||
version = "2.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c"
|
||||
checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada"
|
||||
dependencies = [
|
||||
"form_urlencoded",
|
||||
"idna",
|
||||
"percent-encoding",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -3904,6 +4095,18 @@ version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||
|
||||
[[package]]
|
||||
name = "utf16_iter"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
|
||||
|
||||
[[package]]
|
||||
name = "utf8_iter"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
|
||||
|
||||
[[package]]
|
||||
name = "uuid"
|
||||
version = "1.11.0"
|
||||
@ -4241,6 +4444,18 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "write16"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
|
||||
|
||||
[[package]]
|
||||
name = "writeable"
|
||||
version = "0.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
|
||||
|
||||
[[package]]
|
||||
name = "ws_stream_wasm"
|
||||
version = "0.7.4"
|
||||
@ -4280,6 +4495,30 @@ dependencies = [
|
||||
"markup5ever 0.14.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "yoke"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"stable_deref_trait",
|
||||
"yoke-derive",
|
||||
"zerofrom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "yoke-derive"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.85",
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.7.35"
|
||||
@ -4301,8 +4540,51 @@ dependencies = [
|
||||
"syn 2.0.85",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerofrom"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55"
|
||||
dependencies = [
|
||||
"zerofrom-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerofrom-derive"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.85",
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zeroize"
|
||||
version = "1.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
|
||||
|
||||
[[package]]
|
||||
name = "zerovec"
|
||||
version = "0.10.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
|
||||
dependencies = [
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerovec-derive"
|
||||
version = "0.10.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.85",
|
||||
]
|
||||
|
@ -11,4 +11,5 @@ serde = { version = "1.0.214", features = ["derive"] }
|
||||
surrealdb = "2.0.4"
|
||||
tokio = { version="1.41.0", features = ["full"] }
|
||||
tracing = "0.1.40"
|
||||
tracing-subscriber = "0.3.18"
|
||||
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
|
||||
url = { version = "2.5.3", features = ["serde"] }
|
||||
|
132
src/db.rs
132
src/db.rs
@ -1,22 +1,140 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use surrealdb::{engine::remote::ws::{Client, Ws}, opt::auth::Root, sql::Thing, Surreal};
|
||||
use surrealdb::{
|
||||
engine::remote::ws::{Client, Ws},
|
||||
opt::auth::Root,
|
||||
sql::Thing,
|
||||
Surreal,
|
||||
};
|
||||
use tracing::{debug, error, info, instrument};
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct Website {
|
||||
pub site: String,
|
||||
pub href: String,
|
||||
pub crawled: bool
|
||||
/// The url that this data is found at
|
||||
site: Url,
|
||||
/// The url as defined in the <a> tag
|
||||
href: Url,
|
||||
/// Whether or not this link has been crawled yet
|
||||
crawled: bool,
|
||||
/// Whether or not the href was doctored
|
||||
doctored_href: bool,
|
||||
original_href: Option<String>,
|
||||
}
|
||||
|
||||
impl Website {
|
||||
/// Creates a blank site (assumes that url param is site's root)
|
||||
pub fn new(url: &str, href: &str, crawled: bool) -> Self {
|
||||
let mut new = Self::from(url);
|
||||
new.crawled = crawled;
|
||||
new.original_href = Some(href.to_string());
|
||||
new.href =
|
||||
match Url::parse(href) {
|
||||
Ok(e) => e,
|
||||
Err(e) => {
|
||||
match e {
|
||||
url::ParseError::RelativeUrlWithoutBase => {
|
||||
// Try to combine the scheme_host and href to get a useable domain
|
||||
new.doctored_href = true;
|
||||
|
||||
let url = if !url.ends_with('/') && !href.starts_with('/') {
|
||||
format!("{url}/{href}")
|
||||
} else {
|
||||
format!("{url}{href}")
|
||||
};
|
||||
|
||||
// paste the domain onto the beginning of the href
|
||||
Url::parse(&url).map_or_else(|err| {
|
||||
debug!("Parsing {url} with {href}");
|
||||
error!("{err} Failed to parse href into url on second try. Aborting");
|
||||
panic!("See error logs for more info.");
|
||||
}, |ok| ok)
|
||||
}
|
||||
_ => {
|
||||
error!("{e}");
|
||||
panic!("See error logs for more info.");
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
new
|
||||
}
|
||||
pub fn crawled(&mut self) {
|
||||
self.crawled = true
|
||||
}
|
||||
pub fn href_str(&self) -> &str {
|
||||
self.href.as_str()
|
||||
}
|
||||
pub fn site(&self) -> String {
|
||||
self.site.to_string()
|
||||
}
|
||||
pub fn domain_str(&self) -> &str {
|
||||
self.site.as_str()
|
||||
}
|
||||
#[instrument(skip_all)]
|
||||
pub async fn store(&mut self, db: &Surreal<Client>) {
|
||||
// is root record?
|
||||
if self.href.path() == "/" {
|
||||
// Upsert is create or update
|
||||
// Whereas Update is just update
|
||||
let record = ("website", &self.href.to_string());
|
||||
|
||||
let crawled = if let Some(old) = db.select(record).await.unwrap() {
|
||||
let old: Website = old; // infer type
|
||||
old.crawled
|
||||
} else {false};
|
||||
|
||||
if !self.crawled {self.crawled = crawled};
|
||||
|
||||
match db.upsert(record).content(self.clone()).await {
|
||||
Ok(e) => {
|
||||
if let Some(a) = &e {
|
||||
let _: &Record = a;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!("{}", e);
|
||||
},
|
||||
};
|
||||
} else {
|
||||
let _: Option<Record> = match db.create("website").content(self.clone()).await {
|
||||
Ok(e) => {
|
||||
if let Some(a) = &e {
|
||||
let _: &Record = a;
|
||||
}
|
||||
e
|
||||
}
|
||||
Err(_) => todo!(),
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&str> for Website {
|
||||
/// site == href, crawled = false
|
||||
fn from(value: &str) -> Self {
|
||||
let site = match Url::parse(value) {
|
||||
Ok(a) => a,
|
||||
Err(_) => todo!(),
|
||||
};
|
||||
Self {
|
||||
href: site.clone(),
|
||||
crawled: false,
|
||||
site,
|
||||
doctored_href: false,
|
||||
original_href: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Email {
|
||||
pub email: String
|
||||
pub email: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct Record {
|
||||
#[allow(dead_code)]
|
||||
id: Thing,
|
||||
pub id: Thing,
|
||||
}
|
||||
|
||||
pub async fn connect() -> surrealdb::Result<Surreal<Client>> {
|
||||
|
121
src/main.rs
121
src/main.rs
@ -2,87 +2,97 @@ extern crate markup5ever_rcdom as rcdom;
|
||||
extern crate html5ever;
|
||||
|
||||
use std::rc::Rc;
|
||||
use db::connect;
|
||||
use db::{connect, Website};
|
||||
use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
|
||||
use rcdom::{Node, RcDom};
|
||||
use surrealdb::{engine::remote::ws::Client, Surreal};
|
||||
use tracing::{debug, error, info, warn};
|
||||
use tracing::{debug, info, instrument};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
mod db;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
tracing_subscriber::fmt::init();
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(EnvFilter::from_default_env())
|
||||
.with_line_number(true)
|
||||
.without_time()
|
||||
.init();
|
||||
debug!("Starting...");
|
||||
|
||||
let url = "https://oliveratkinson.net";
|
||||
|
||||
let db = connect().await.expect("Failed to connect to db, aborting.");
|
||||
let dom = get(url).await;
|
||||
// Would probably take these in as parameters from a cli
|
||||
let url = "https://oliveratkinson.net/";
|
||||
let budget = 50;
|
||||
let mut crawled = 0;
|
||||
|
||||
walk(&dom, &db, url).await;
|
||||
let db = connect().await.expect("Failed to connect to db, aborting.");
|
||||
|
||||
// Kick off the whole machine - This Website object doesn't matter, it's just to allow for
|
||||
// get() to work.
|
||||
let mut site = Website::from(url);
|
||||
let dom = get(&mut site, &db).await.expect("Inital page returned None.");
|
||||
crawled += 1;
|
||||
walk(&dom, &db, &site).await;
|
||||
|
||||
while crawled < budget {
|
||||
let uncrawled = get_uncrawled_links(&db).await;
|
||||
debug!("Crawling {} pages...", uncrawled.len());
|
||||
|
||||
for mut site in uncrawled {
|
||||
if let Some(dom) = get(&mut site, &db).await {
|
||||
walk(&dom, &db, &site).await;
|
||||
crawled += 1;
|
||||
let percent = format!("{:.2}%", (crawled as f32/budget as f32) * 100f32);
|
||||
info!("Crawled {crawled} out of {budget} pages. ({percent})");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("Done");
|
||||
}
|
||||
|
||||
async fn get(url: &str) -> Rc<Node> {
|
||||
let response = reqwest::get(url).await.unwrap();
|
||||
let data = response.text().await.unwrap();
|
||||
|
||||
let opts = ParseOpts {
|
||||
tree_builder: TreeBuilderOpts {
|
||||
drop_doctype: true,
|
||||
#[instrument(skip_all)]
|
||||
/// A quick helper function for downloading a url
|
||||
async fn get(site: &mut Website, db: &Surreal<Client>) -> Option<Rc<Node>> {
|
||||
if let Ok(response) = reqwest::get(site.href_str()).await {
|
||||
let data = response.text().await.unwrap();
|
||||
let opts = ParseOpts {
|
||||
tree_builder: TreeBuilderOpts {
|
||||
drop_doctype: true,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
};
|
||||
|
||||
let dom = parse_document(RcDom::default(), opts)
|
||||
.from_utf8()
|
||||
.read_from(&mut data.as_bytes())
|
||||
.unwrap();
|
||||
|
||||
let dom = parse_document(RcDom::default(), opts)
|
||||
.from_utf8()
|
||||
.read_from(&mut data.as_bytes())
|
||||
.unwrap();
|
||||
dom.document
|
||||
site.crawled();
|
||||
site.store(db).await;
|
||||
return Some(dom.document);
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
|
||||
/// Walks the given site, placing its findings in the database
|
||||
async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &Website) {
|
||||
// Insert Or Update
|
||||
let _: Option<Vec<db::Record>> = match db.upsert(("website", site_name)).content(db::Website { href: String::from("/"), crawled: true, site: site_name.to_string() } ).await {
|
||||
Ok(e) => {
|
||||
// Return this for type coercion
|
||||
e
|
||||
},
|
||||
Err(e) => {
|
||||
// error!("{}", e);
|
||||
None
|
||||
}
|
||||
};
|
||||
// create_root(site_name, db).await;
|
||||
|
||||
match &node.data {
|
||||
rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
|
||||
for attr in attrs.borrow().clone() {
|
||||
let name = name.local.to_string();
|
||||
if name == "a" {
|
||||
if name.local.to_string() == "a" {
|
||||
if attr.value.starts_with("mailto") {
|
||||
// mailto link, lol
|
||||
let created: Option<db::Record> = db.create("email").content(db::Email {
|
||||
let _created: Option<db::Record> = db.create("email").content(db::Email {
|
||||
email: attr.value.to_string()
|
||||
}).await.unwrap();
|
||||
warn!("{:?}", created)
|
||||
} else {
|
||||
// FIXME this isn't actually creating records...?
|
||||
let _: Option<db::Record> = match db.create("website").content(db::Website {
|
||||
href: attr.value.to_string(),
|
||||
crawled: false,
|
||||
site: site_name.to_string()
|
||||
}).await {
|
||||
Ok(e) => {
|
||||
if let Some(a) = &e {
|
||||
debug!("{:?}", a);
|
||||
}
|
||||
e
|
||||
},
|
||||
Err(_) => todo!(),
|
||||
};
|
||||
let mut web = Website::new(&site_name.site(), &attr.value, false);
|
||||
web.store(db).await;
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -94,3 +104,10 @@ async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site_name: &str) {
|
||||
Box::pin(walk(child, db, site_name)).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns 0-50 uncrawled links (LIMIT = 50)
|
||||
async fn get_uncrawled_links(db: &Surreal<Client>) -> Vec<Website> {
|
||||
let mut response = db.query("SELECT * FROM website WHERE crawled = false LIMIT 50").await.expect("Hard-coded query failed..?");
|
||||
response.take(0).expect("Returned websites couldn't be parsed")
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user