add
This commit is contained in:
parent
bfed9a6ca6
commit
e66131b411
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
/target
|
45
.vscode/launch.json
vendored
Normal file
45
.vscode/launch.json
vendored
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
{
|
||||||
|
// Use IntelliSense to learn about possible attributes.
|
||||||
|
// Hover to view descriptions of existing attributes.
|
||||||
|
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"type": "lldb",
|
||||||
|
"request": "launch",
|
||||||
|
"name": "Debug executable 'surreal_spider'",
|
||||||
|
"cargo": {
|
||||||
|
"args": [
|
||||||
|
"build",
|
||||||
|
"--bin=surreal_spider",
|
||||||
|
"--package=surreal_spider"
|
||||||
|
],
|
||||||
|
"filter": {
|
||||||
|
"name": "surreal_spider",
|
||||||
|
"kind": "bin"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"args": [],
|
||||||
|
"cwd": "${workspaceFolder}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "lldb",
|
||||||
|
"request": "launch",
|
||||||
|
"name": "Debug unit tests in executable 'surreal_spider'",
|
||||||
|
"cargo": {
|
||||||
|
"args": [
|
||||||
|
"test",
|
||||||
|
"--no-run",
|
||||||
|
"--bin=surreal_spider",
|
||||||
|
"--package=surreal_spider"
|
||||||
|
],
|
||||||
|
"filter": {
|
||||||
|
"name": "surreal_spider",
|
||||||
|
"kind": "bin"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"args": [],
|
||||||
|
"cwd": "${workspaceFolder}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
4986
Cargo.lock
generated
Normal file
4986
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
10
Cargo.toml
Normal file
10
Cargo.toml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
[package]
|
||||||
|
name = "surreal_spider"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
serde = { version = "1.0.208", features = ["derive"] }
|
||||||
|
spider = { version = "2.0.9", features = [], path="../spider/spider/" }
|
||||||
|
surrealdb = "1.5.4"
|
||||||
|
tokio = { version = "1.39.3", features = ["macros", "rt-multi-thread"] }
|
13
compose.yml
Normal file
13
compose.yml
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
services:
|
||||||
|
db:
|
||||||
|
image: surrealdb/surrealdb:latest-dev
|
||||||
|
ports:
|
||||||
|
- 8000:8000
|
||||||
|
command:
|
||||||
|
- start
|
||||||
|
- --log
|
||||||
|
- debug
|
||||||
|
- --user
|
||||||
|
- root
|
||||||
|
- --pass
|
||||||
|
- root
|
10
schema.surql
Normal file
10
schema.surql
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
DEFINE TABLE website SCHEMAFULL;
|
||||||
|
DEFINE FIELD accessed_at ON TABLE website TYPE datetime DEFAULT time::now();
|
||||||
|
DEFINE FIELD crawled ON TABLE website TYPE bool DEFAULT false;
|
||||||
|
DEFINE FIELD url ON TABLE website TYPE string;
|
||||||
|
DEFINE FIELD domain ON TABLE website VALUE parse::url::domain($this.url) ASSERT !type::is::none($value);
|
||||||
|
DEFINE FIELD path ON TABLE website VALUE parse::url::path($this.url) ASSERT !type::is::none($value);
|
||||||
|
|
||||||
|
DEFINE TABLE links_to SCHEMAFULL TYPE RELATION FROM website TO website;
|
||||||
|
DEFINE FIELD discovered_at ON TABLE links_to TYPE datetime DEFAULT time::now();
|
||||||
|
|
78
src/main.rs
Normal file
78
src/main.rs
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
use std::cell::LazyCell;
|
||||||
|
use std::sync::{Arc, RwLock};
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use spider::hashbrown::HashMap;
|
||||||
|
use surrealdb::engine::remote::ws::{Ws, Client};
|
||||||
|
use surrealdb::opt::auth::Root;
|
||||||
|
use surrealdb::sql::Thing;
|
||||||
|
use surrealdb::Surreal;
|
||||||
|
use tokio::spawn;
|
||||||
|
use tokio::sync::broadcast::{self, Sender, Receiver};
|
||||||
|
|
||||||
|
/// Minimal deserialization target for a row returned by SurrealDB: only the
/// record identifier is captured.
#[derive(Debug, Deserialize)]
struct Record {
    // The field is never read directly in this file; it is only surfaced
    // through the derived `Debug` output, hence the lint suppression.
    #[allow(dead_code)]
    id: Thing,
}
|
||||||
|
|
||||||
|
/// Serializable payload describing one `website` row.
///
/// Not to be confused with `spider::website::Website`, the crawler type that
/// `main` refers to by its full path. In this file the struct is only
/// referenced from the commented-out `db.create("website")` calls.
#[derive(Debug, Serialize)]
struct Website<'a> {
    /// Borrowed URL of the page.
    url: &'a str,
    /// Whether the page has been crawled.
    crawled: bool
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> surrealdb::Result<()> {
|
||||||
|
|
||||||
|
let db_handle = tokio::spawn(async move {
|
||||||
|
let db = Surreal::new::<Ws>("127.0.0.1:8000").await.unwrap();
|
||||||
|
db.signin(Root {
|
||||||
|
username: "root",
|
||||||
|
password: "root",
|
||||||
|
})
|
||||||
|
.await.unwrap();
|
||||||
|
db.use_ns("test").use_db("test").await.unwrap();
|
||||||
|
|
||||||
|
let mut sub = PIPE.subscribe();
|
||||||
|
loop {
|
||||||
|
match sub.recv().await {
|
||||||
|
Ok((from, to)) => {
|
||||||
|
// let f: Vec<Record> = db.create("website").content(Website {url: &from, crawled: true}).await.unwrap();
|
||||||
|
// let t: Vec<Record> = db.create("website").content(Website {url: &to, crawled: false}).await.unwrap();
|
||||||
|
// println!("{:?} {:?}", f , t);
|
||||||
|
},
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Error: {}", e);
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut site = spider::website::Website::new("https://oliveratkinson.net")
|
||||||
|
.with_budget(Some(HashMap::from([
|
||||||
|
("*", 25),
|
||||||
|
])))
|
||||||
|
.with_tld(true)
|
||||||
|
.with_on_link_find_callback(Some(|from, to| {
|
||||||
|
let from = from.as_ref().to_string();
|
||||||
|
let to = to.as_ref().to_string();
|
||||||
|
match PIPE.send((from.clone(), to.clone())) {
|
||||||
|
Ok(_) => {},
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("{:?}", e);
|
||||||
|
},
|
||||||
|
};
|
||||||
|
println!("{from} -> {to}");
|
||||||
|
}))
|
||||||
|
.build()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
site.crawl().await;
|
||||||
|
let _ = db_handle.await;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
const PIPE: LazyCell<Sender<(String, String)>> = LazyCell::new(|| {
|
||||||
|
broadcast::channel(100).0
|
||||||
|
});
|
Loading…
Reference in New Issue
Block a user