add
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1 @@
 | 
			
		||||
/target
 | 
			
		||||
							
								
								
									
										45
									
								
								.vscode/launch.json
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								.vscode/launch.json
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1,45 @@
 | 
			
		||||
{
 | 
			
		||||
    // Use IntelliSense to learn about possible attributes.
 | 
			
		||||
    // Hover to view descriptions of existing attributes.
 | 
			
		||||
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
 | 
			
		||||
    "version": "0.2.0",
 | 
			
		||||
    "configurations": [
 | 
			
		||||
        {
 | 
			
		||||
            "type": "lldb",
 | 
			
		||||
            "request": "launch",
 | 
			
		||||
            "name": "Debug executable 'surreal_spider'",
 | 
			
		||||
            "cargo": {
 | 
			
		||||
                "args": [
 | 
			
		||||
                    "build",
 | 
			
		||||
                    "--bin=surreal_spider",
 | 
			
		||||
                    "--package=surreal_spider"
 | 
			
		||||
                ],
 | 
			
		||||
                "filter": {
 | 
			
		||||
                    "name": "surreal_spider",
 | 
			
		||||
                    "kind": "bin"
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "args": [],
 | 
			
		||||
            "cwd": "${workspaceFolder}"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "type": "lldb",
 | 
			
		||||
            "request": "launch",
 | 
			
		||||
            "name": "Debug unit tests in executable 'surreal_spider'",
 | 
			
		||||
            "cargo": {
 | 
			
		||||
                "args": [
 | 
			
		||||
                    "test",
 | 
			
		||||
                    "--no-run",
 | 
			
		||||
                    "--bin=surreal_spider",
 | 
			
		||||
                    "--package=surreal_spider"
 | 
			
		||||
                ],
 | 
			
		||||
                "filter": {
 | 
			
		||||
                    "name": "surreal_spider",
 | 
			
		||||
                    "kind": "bin"
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "args": [],
 | 
			
		||||
            "cwd": "${workspaceFolder}"
 | 
			
		||||
        }
 | 
			
		||||
    ]
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										4986
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
										Normal file
									
								
							
							
						
						
									
										4986
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										10
									
								
								Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								Cargo.toml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,10 @@
 | 
			
		||||
[package]
 | 
			
		||||
name = "surreal_spider"
 | 
			
		||||
version = "0.1.0"
 | 
			
		||||
edition = "2021"
 | 
			
		||||
 | 
			
		||||
[dependencies]
 | 
			
		||||
serde = { version = "1.0.208", features = ["derive"] }
 | 
			
		||||
spider = { version = "2.0.9", features = [], path="../spider/spider/" }
 | 
			
		||||
surrealdb = "1.5.4"
 | 
			
		||||
tokio = { version = "1.39.3", features = ["macros", "rt-multi-thread"] }
 | 
			
		||||
							
								
								
									
										13
									
								
								compose.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								compose.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,13 @@
 | 
			
		||||
services:
 | 
			
		||||
  db:
 | 
			
		||||
    image: surrealdb/surrealdb:latest-dev
 | 
			
		||||
    ports:
 | 
			
		||||
    - 8000:8000
 | 
			
		||||
    command:
 | 
			
		||||
      - start
 | 
			
		||||
      - --log
 | 
			
		||||
      - debug
 | 
			
		||||
      - --user
 | 
			
		||||
      - root
 | 
			
		||||
      - --pass
 | 
			
		||||
      - root
 | 
			
		||||
							
								
								
									
										10
									
								
								schema.surql
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								schema.surql
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,10 @@
 | 
			
		||||
-- A crawled (or yet-to-be-crawled) page, keyed by its URL.
DEFINE TABLE website SCHEMAFULL;
    -- Timestamp field; falls back to the current time when not supplied.
    DEFINE FIELD accessed_at ON TABLE website TYPE datetime DEFAULT time::now();
    -- Crawl flag; false unless the writer sets it.
    DEFINE FIELD crawled     ON TABLE website TYPE bool DEFAULT false;
    -- Full URL of the page as a string.
    DEFINE FIELD url         ON TABLE website TYPE string;
    -- Derived from `url`; the assertion rejects records whose URL yields no domain.
    DEFINE FIELD domain      ON TABLE website VALUE parse::url::domain($this.url) ASSERT !type::is::none($value);
    -- Derived from `url`; the assertion rejects records whose URL yields no path.
    DEFINE FIELD path        ON TABLE website VALUE parse::url::path($this.url) ASSERT !type::is::none($value);

-- Relation table: an edge from one website record to another.
DEFINE TABLE links_to SCHEMAFULL TYPE RELATION FROM website TO website;
    -- Timestamp field; falls back to the current time when not supplied.
    DEFINE FIELD discovered_at ON TABLE links_to TYPE datetime DEFAULT time::now();
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										78
									
								
								src/main.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										78
									
								
								src/main.rs
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,78 @@
 | 
			
		||||
use std::cell::LazyCell;
 | 
			
		||||
use std::sync::{Arc, RwLock};
 | 
			
		||||
 | 
			
		||||
use serde::{Deserialize, Serialize};
 | 
			
		||||
use spider::hashbrown::HashMap;
 | 
			
		||||
use surrealdb::engine::remote::ws::{Ws, Client};
 | 
			
		||||
use surrealdb::opt::auth::Root;
 | 
			
		||||
use surrealdb::sql::Thing;
 | 
			
		||||
use surrealdb::Surreal;
 | 
			
		||||
use tokio::spawn;
 | 
			
		||||
use tokio::sync::broadcast::{self, Sender, Receiver};
 | 
			
		||||
 | 
			
		||||
#[derive(Debug, Deserialize)]
 | 
			
		||||
struct Record {
 | 
			
		||||
    #[allow(dead_code)]
 | 
			
		||||
    id: Thing,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#[derive(Debug, Serialize)]
 | 
			
		||||
struct Website<'a> {
 | 
			
		||||
    url: &'a str,
 | 
			
		||||
    crawled: bool
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#[tokio::main]
 | 
			
		||||
async fn main() -> surrealdb::Result<()> {
 | 
			
		||||
    
 | 
			
		||||
    let db_handle = tokio::spawn(async move {
 | 
			
		||||
        let db = Surreal::new::<Ws>("127.0.0.1:8000").await.unwrap();
 | 
			
		||||
        db.signin(Root {
 | 
			
		||||
            username: "root",
 | 
			
		||||
            password: "root",
 | 
			
		||||
        })
 | 
			
		||||
        .await.unwrap();
 | 
			
		||||
        db.use_ns("test").use_db("test").await.unwrap(); 
 | 
			
		||||
 | 
			
		||||
        let mut sub = PIPE.subscribe();
 | 
			
		||||
        loop {
 | 
			
		||||
            match sub.recv().await {
 | 
			
		||||
                Ok((from, to)) => {
 | 
			
		||||
                    // let f: Vec<Record> = db.create("website").content(Website {url: &from, crawled: true}).await.unwrap();
 | 
			
		||||
                    // let t: Vec<Record> = db.create("website").content(Website {url: &to, crawled: false}).await.unwrap();
 | 
			
		||||
                    // println!("{:?} {:?}", f , t); 
 | 
			
		||||
                },
 | 
			
		||||
                Err(e) => {
 | 
			
		||||
                    eprintln!("Error: {}", e);
 | 
			
		||||
                },
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    let mut site = spider::website::Website::new("https://oliveratkinson.net")
 | 
			
		||||
        .with_budget(Some(HashMap::from([
 | 
			
		||||
            ("*", 25),
 | 
			
		||||
        ])))
 | 
			
		||||
        .with_tld(true)
 | 
			
		||||
        .with_on_link_find_callback(Some(|from, to| {
 | 
			
		||||
            let from = from.as_ref().to_string();
 | 
			
		||||
            let to = to.as_ref().to_string();
 | 
			
		||||
            match PIPE.send((from.clone(), to.clone())) {
 | 
			
		||||
                Ok(_) => {},
 | 
			
		||||
                Err(e) => {
 | 
			
		||||
                    eprintln!("{:?}", e);
 | 
			
		||||
                },
 | 
			
		||||
            };
 | 
			
		||||
            println!("{from} -> {to}"); 
 | 
			
		||||
        }))
 | 
			
		||||
        .build()
 | 
			
		||||
        .unwrap();
 | 
			
		||||
 | 
			
		||||
    site.crawl().await;
 | 
			
		||||
    let _ = db_handle.await;
 | 
			
		||||
    Ok(())
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
const PIPE: LazyCell<Sender<(String, String)>> = LazyCell::new(|| {
 | 
			
		||||
    broadcast::channel(100).0
 | 
			
		||||
});
 | 
			
		||||
		Reference in New Issue
	
	Block a user