add
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					/target
 | 
				
			||||||
							
								
								
									
										45
									
								
								.vscode/launch.json
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								.vscode/launch.json
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1,45 @@
 | 
				
			|||||||
 | 
					{
 | 
				
			||||||
 | 
					    // Use IntelliSense to learn about possible attributes.
 | 
				
			||||||
 | 
					    // Hover to view descriptions of existing attributes.
 | 
				
			||||||
 | 
					    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
 | 
				
			||||||
 | 
					    "version": "0.2.0",
 | 
				
			||||||
 | 
					    "configurations": [
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "type": "lldb",
 | 
				
			||||||
 | 
					            "request": "launch",
 | 
				
			||||||
 | 
					            "name": "Debug executable 'surreal_spider'",
 | 
				
			||||||
 | 
					            "cargo": {
 | 
				
			||||||
 | 
					                "args": [
 | 
				
			||||||
 | 
					                    "build",
 | 
				
			||||||
 | 
					                    "--bin=surreal_spider",
 | 
				
			||||||
 | 
					                    "--package=surreal_spider"
 | 
				
			||||||
 | 
					                ],
 | 
				
			||||||
 | 
					                "filter": {
 | 
				
			||||||
 | 
					                    "name": "surreal_spider",
 | 
				
			||||||
 | 
					                    "kind": "bin"
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            "args": [],
 | 
				
			||||||
 | 
					            "cwd": "${workspaceFolder}"
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "type": "lldb",
 | 
				
			||||||
 | 
					            "request": "launch",
 | 
				
			||||||
 | 
					            "name": "Debug unit tests in executable 'surreal_spider'",
 | 
				
			||||||
 | 
					            "cargo": {
 | 
				
			||||||
 | 
					                "args": [
 | 
				
			||||||
 | 
					                    "test",
 | 
				
			||||||
 | 
					                    "--no-run",
 | 
				
			||||||
 | 
					                    "--bin=surreal_spider",
 | 
				
			||||||
 | 
					                    "--package=surreal_spider"
 | 
				
			||||||
 | 
					                ],
 | 
				
			||||||
 | 
					                "filter": {
 | 
				
			||||||
 | 
					                    "name": "surreal_spider",
 | 
				
			||||||
 | 
					                    "kind": "bin"
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            "args": [],
 | 
				
			||||||
 | 
					            "cwd": "${workspaceFolder}"
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										4986
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
										Normal file
									
								
							
							
						
						
									
										4986
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										10
									
								
								Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								Cargo.toml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,10 @@
 | 
				
			|||||||
 | 
					[package]
 | 
				
			||||||
 | 
					name = "surreal_spider"
 | 
				
			||||||
 | 
					version = "0.1.0"
 | 
				
			||||||
 | 
					edition = "2021"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					[dependencies]
 | 
				
			||||||
 | 
					serde = { version = "1.0.208", features = ["derive"] }
 | 
				
			||||||
 | 
					spider = { version = "2.0.9", features = [], path="../spider/spider/" }
 | 
				
			||||||
 | 
					surrealdb = "1.5.4"
 | 
				
			||||||
 | 
					tokio = { version = "1.39.3", features = ["macros", "rt-multi-thread"] }
 | 
				
			||||||
							
								
								
									
										13
									
								
								compose.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								compose.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,13 @@
 | 
				
			|||||||
 | 
					services:
 | 
				
			||||||
 | 
					  db:
 | 
				
			||||||
 | 
					    image: surrealdb/surrealdb:latest-dev
 | 
				
			||||||
 | 
					    ports:
 | 
				
			||||||
 | 
					    - 8000:8000
 | 
				
			||||||
 | 
					    command:
 | 
				
			||||||
 | 
					      - start
 | 
				
			||||||
 | 
					      - --log
 | 
				
			||||||
 | 
					      - debug
 | 
				
			||||||
 | 
					      - --user
 | 
				
			||||||
 | 
					      - root
 | 
				
			||||||
 | 
					      - --pass
 | 
				
			||||||
 | 
					      - root
 | 
				
			||||||
							
								
								
									
										10
									
								
								schema.surql
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								schema.surql
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,10 @@
 | 
				
			|||||||
 | 
					DEFINE TABLE website SCHEMAFULL;
 | 
				
			||||||
 | 
					    DEFINE FIELD accessed_at ON TABLE website TYPE datetime DEFAULT time::now();
 | 
				
			||||||
 | 
					    DEFINE FIELD crawled     ON TABLE website TYPE bool DEFAULT false;
 | 
				
			||||||
 | 
					    DEFINE FIELD url         ON TABLE website TYPE string;
 | 
				
			||||||
 | 
					    DEFINE FIELD domain      ON TABLE website VALUE parse::url::domain($this.url) ASSERT !type::is::none($value);
 | 
				
			||||||
 | 
					    DEFINE FIELD path        ON TABLE website VALUE parse::url::path($this.url) ASSERT !type::is::none($value);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					DEFINE TABLE links_to SCHEMAFULL TYPE RELATION FROM website TO website;
 | 
				
			||||||
 | 
					    DEFINE FIELD discovered_at ON TABLE links_to TYPE datetime DEFAULT time::now();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										78
									
								
								src/main.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										78
									
								
								src/main.rs
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,78 @@
 | 
				
			|||||||
 | 
					use std::cell::LazyCell;
 | 
				
			||||||
 | 
					use std::sync::{Arc, RwLock};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					use serde::{Deserialize, Serialize};
 | 
				
			||||||
 | 
					use spider::hashbrown::HashMap;
 | 
				
			||||||
 | 
					use surrealdb::engine::remote::ws::{Ws, Client};
 | 
				
			||||||
 | 
					use surrealdb::opt::auth::Root;
 | 
				
			||||||
 | 
					use surrealdb::sql::Thing;
 | 
				
			||||||
 | 
					use surrealdb::Surreal;
 | 
				
			||||||
 | 
					use tokio::spawn;
 | 
				
			||||||
 | 
					use tokio::sync::broadcast::{self, Sender, Receiver};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#[derive(Debug, Deserialize)]
 | 
				
			||||||
 | 
					struct Record {
 | 
				
			||||||
 | 
					    #[allow(dead_code)]
 | 
				
			||||||
 | 
					    id: Thing,
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#[derive(Debug, Serialize)]
 | 
				
			||||||
 | 
					struct Website<'a> {
 | 
				
			||||||
 | 
					    url: &'a str,
 | 
				
			||||||
 | 
					    crawled: bool
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#[tokio::main]
 | 
				
			||||||
 | 
					async fn main() -> surrealdb::Result<()> {
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    let db_handle = tokio::spawn(async move {
 | 
				
			||||||
 | 
					        let db = Surreal::new::<Ws>("127.0.0.1:8000").await.unwrap();
 | 
				
			||||||
 | 
					        db.signin(Root {
 | 
				
			||||||
 | 
					            username: "root",
 | 
				
			||||||
 | 
					            password: "root",
 | 
				
			||||||
 | 
					        })
 | 
				
			||||||
 | 
					        .await.unwrap();
 | 
				
			||||||
 | 
					        db.use_ns("test").use_db("test").await.unwrap(); 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        let mut sub = PIPE.subscribe();
 | 
				
			||||||
 | 
					        loop {
 | 
				
			||||||
 | 
					            match sub.recv().await {
 | 
				
			||||||
 | 
					                Ok((from, to)) => {
 | 
				
			||||||
 | 
					                    // let f: Vec<Record> = db.create("website").content(Website {url: &from, crawled: true}).await.unwrap();
 | 
				
			||||||
 | 
					                    // let t: Vec<Record> = db.create("website").content(Website {url: &to, crawled: false}).await.unwrap();
 | 
				
			||||||
 | 
					                    // println!("{:?} {:?}", f , t); 
 | 
				
			||||||
 | 
					                },
 | 
				
			||||||
 | 
					                Err(e) => {
 | 
				
			||||||
 | 
					                    eprintln!("Error: {}", e);
 | 
				
			||||||
 | 
					                },
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    });
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let mut site = spider::website::Website::new("https://oliveratkinson.net")
 | 
				
			||||||
 | 
					        .with_budget(Some(HashMap::from([
 | 
				
			||||||
 | 
					            ("*", 25),
 | 
				
			||||||
 | 
					        ])))
 | 
				
			||||||
 | 
					        .with_tld(true)
 | 
				
			||||||
 | 
					        .with_on_link_find_callback(Some(|from, to| {
 | 
				
			||||||
 | 
					            let from = from.as_ref().to_string();
 | 
				
			||||||
 | 
					            let to = to.as_ref().to_string();
 | 
				
			||||||
 | 
					            match PIPE.send((from.clone(), to.clone())) {
 | 
				
			||||||
 | 
					                Ok(_) => {},
 | 
				
			||||||
 | 
					                Err(e) => {
 | 
				
			||||||
 | 
					                    eprintln!("{:?}", e);
 | 
				
			||||||
 | 
					                },
 | 
				
			||||||
 | 
					            };
 | 
				
			||||||
 | 
					            println!("{from} -> {to}"); 
 | 
				
			||||||
 | 
					        }))
 | 
				
			||||||
 | 
					        .build()
 | 
				
			||||||
 | 
					        .unwrap();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    site.crawl().await;
 | 
				
			||||||
 | 
					    let _ = db_handle.await;
 | 
				
			||||||
 | 
					    Ok(())
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					const PIPE: LazyCell<Sender<(String, String)>> = LazyCell::new(|| {
 | 
				
			||||||
 | 
					    broadcast::channel(100).0
 | 
				
			||||||
 | 
					});
 | 
				
			||||||
		Reference in New Issue
	
	Block a user