This commit is contained in:
oliver 2024-08-23 05:22:49 -06:00
parent bfed9a6ca6
commit e66131b411
7 changed files with 5143 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
# Cargo build output directory at the repository root.
/target

45
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,45 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
// Builds the `surreal_spider` binary with cargo and launches it under LLDB
// from the workspace root, with no program arguments.
{
"type": "lldb",
"request": "launch",
"name": "Debug executable 'surreal_spider'",
"cargo": {
"args": [
"build",
"--bin=surreal_spider",
"--package=surreal_spider"
],
"filter": {
"name": "surreal_spider",
"kind": "bin"
}
},
"args": [],
"cwd": "${workspaceFolder}"
},
// Builds the binary's unit-test harness (`cargo test --no-run`) and launches
// that test executable under LLDB from the workspace root.
{
"type": "lldb",
"request": "launch",
"name": "Debug unit tests in executable 'surreal_spider'",
"cargo": {
"args": [
"test",
"--no-run",
"--bin=surreal_spider",
"--package=surreal_spider"
],
"filter": {
"name": "surreal_spider",
"kind": "bin"
}
},
"args": [],
"cwd": "${workspaceFolder}"
}
]
}

4986
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

10
Cargo.toml Normal file
View File

@ -0,0 +1,10 @@
[package]
name = "surreal_spider"
version = "0.1.0"
edition = "2021"

[dependencies]
serde = { version = "1.0.208", features = ["derive"] }
# Resolved from a sibling checkout of the spider repository rather than from
# crates.io; the `version` field only applies if the `path` entry is removed.
spider = { version = "2.0.9", features = [], path = "../spider/spider/" }
surrealdb = "1.5.4"
# `sync` is listed explicitly because src/main.rs uses `tokio::sync::broadcast`;
# without it the build only works while another dependency happens to enable it.
tokio = { version = "1.39.3", features = ["macros", "rt-multi-thread", "sync"] }

13
compose.yml Normal file
View File

@ -0,0 +1,13 @@
# Local SurrealDB instance for development.
services:
  db:
    # NOTE(review): `latest-dev` is a floating tag, while Cargo.toml pins the
    # client crate to surrealdb 1.5.4 - confirm the two stay compatible.
    image: surrealdb/surrealdb:latest-dev
    ports:
      # Host port 8000 is the one src/main.rs connects to (127.0.0.1:8000).
      - 8000:8000
    # Arguments passed to the container; the root/root credentials match the
    # ones src/main.rs signs in with and are only suitable for local use.
    command:
      - start
      - --log
      - debug
      - --user
      - root
      - --pass
      - root

10
schema.surql Normal file
View File

@ -0,0 +1,10 @@
-- `website`: one record per URL. SCHEMAFULL, so only the fields defined below are part of the schema.
DEFINE TABLE website SCHEMAFULL;
-- Timestamp that defaults to the current time when no value is supplied.
DEFINE FIELD accessed_at ON TABLE website TYPE datetime DEFAULT time::now();
-- Crawl flag; defaults to false when no value is supplied.
DEFINE FIELD crawled ON TABLE website TYPE bool DEFAULT false;
-- The URL string; `domain` and `path` below are computed from it.
DEFINE FIELD url ON TABLE website TYPE string;
-- Computed from `url` via parse::url::domain; the ASSERT rejects the record when the result is NONE.
DEFINE FIELD domain ON TABLE website VALUE parse::url::domain($this.url) ASSERT !type::is::none($value);
-- Computed from `url` via parse::url::path; the ASSERT rejects the record when the result is NONE.
DEFINE FIELD path ON TABLE website VALUE parse::url::path($this.url) ASSERT !type::is::none($value);
-- `links_to`: relation table restricted to edges from one `website` record to another.
DEFINE TABLE links_to SCHEMAFULL TYPE RELATION FROM website TO website;
-- Timestamp on the edge that defaults to the current time when no value is supplied.
DEFINE FIELD discovered_at ON TABLE links_to TYPE datetime DEFAULT time::now();

78
src/main.rs Normal file
View File

@ -0,0 +1,78 @@
use std::cell::LazyCell;
use std::sync::{Arc, RwLock};
use serde::{Deserialize, Serialize};
use spider::hashbrown::HashMap;
use surrealdb::engine::remote::ws::{Ws, Client};
use surrealdb::opt::auth::Root;
use surrealdb::sql::Thing;
use surrealdb::Surreal;
use tokio::spawn;
use tokio::sync::broadcast::{self, Sender, Receiver};
/// Deserialization target that captures only a record's `id`.
#[derive(Debug, Deserialize)]
struct Record {
    /// Identifier of the record. Nothing reads it directly, hence the
    /// `dead_code` allowance; it is kept so the value is still deserialized.
    #[allow(dead_code)]
    id: Thing,
}
/// Serializable payload describing a page: its URL and whether it was crawled.
///
/// The field names match the `url` and `crawled` fields that `schema.surql`
/// defines on the `website` table.
#[derive(Debug, Serialize)]
struct Website<'a> {
    /// Page URL, borrowed from the caller for the lifetime `'a`.
    url: &'a str,
    /// Whether the page has been crawled.
    crawled: bool,
}
#[tokio::main]
async fn main() -> surrealdb::Result<()> {
let db_handle = tokio::spawn(async move {
let db = Surreal::new::<Ws>("127.0.0.1:8000").await.unwrap();
db.signin(Root {
username: "root",
password: "root",
})
.await.unwrap();
db.use_ns("test").use_db("test").await.unwrap();
let mut sub = PIPE.subscribe();
loop {
match sub.recv().await {
Ok((from, to)) => {
// let f: Vec<Record> = db.create("website").content(Website {url: &from, crawled: true}).await.unwrap();
// let t: Vec<Record> = db.create("website").content(Website {url: &to, crawled: false}).await.unwrap();
// println!("{:?} {:?}", f , t);
},
Err(e) => {
eprintln!("Error: {}", e);
},
}
}
});
let mut site = spider::website::Website::new("https://oliveratkinson.net")
.with_budget(Some(HashMap::from([
("*", 25),
])))
.with_tld(true)
.with_on_link_find_callback(Some(|from, to| {
let from = from.as_ref().to_string();
let to = to.as_ref().to_string();
match PIPE.send((from.clone(), to.clone())) {
Ok(_) => {},
Err(e) => {
eprintln!("{:?}", e);
},
};
println!("{from} -> {to}");
}))
.build()
.unwrap();
site.crawl().await;
let _ = db_handle.await;
Ok(())
}
/// Process-wide sender half of the broadcast channel that carries `(from, to)`
/// link pairs; receivers are obtained with `PIPE.subscribe()`.
///
/// This must be a `static`, not a `const`: a `const` is instantiated afresh at
/// every use site, so each `PIPE.send(..)` / `PIPE.subscribe()` would build its
/// own channel and senders and receivers would never be connected.
/// `LazyLock` is the thread-safe lazy initializer, which a `static` requires.
///
/// The channel retains up to 100 messages. Because this sender lives for the
/// whole program, receivers never observe the channel as closed.
static PIPE: std::sync::LazyLock<Sender<(String, String)>> =
    std::sync::LazyLock::new(|| broadcast::channel(100).0);