Compare commits

...

6 Commits

Author           SHA1        Message                        Date
Oliver Atkinson  e4bf320ecd  unifed settings for testing    2024-12-12 11:41:39 -07:00
Oliver Atkinson  fd71a8bc13  back to spider-rs              2024-12-12 11:04:45 -07:00
                 0f8a3d7215  using a custom parser now :)   2024-11-12 23:08:09 -07:00
                 574a370f30  readme updates                 2024-11-12 21:24:57 -07:00
                 eaa79b749e  prepare get function for s3    2024-11-12 21:19:05 -07:00
                 2c28d69d55  add s3 support                 2024-11-12 21:03:58 -07:00
12 changed files with 1767 additions and 543 deletions

.gitignore (vendored, 1 change)

@@ -1,5 +1,6 @@
 /target
 /.surrealdb
+/.minio
 perf.data
 flamegraph.svg
 perf.data.old

.vscode/launch.json (vendored, 2 changes)

@@ -9,7 +9,7 @@
             "request": "launch",
             "name": "Debug executable 'surreal_spider'",
             "env": {
-                "RUST_LOG": "surreal_spider=debug,reqwest=info",
+                "RUST_LOG": "surreal_spider=trace,reqwest=info",
             },
             "cargo": {
                 "args": [

Cargo.lock (generated, 1794 changes)
File diff suppressed because it is too large.


@@ -5,11 +5,13 @@ edition = "2021"
 [dependencies]
 html5ever = "0.29.0"
-markup5ever_rcdom = "0.5.0-unofficial"
+# minio = "0.1.0"
+minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"}
 reqwest = "0.12.9"
-serde = { version = "1.0.214", features = ["derive"] }
-surrealdb = "2.0.4"
-tokio = { version="1.41.0", features = ["full"] }
-tracing = "0.1.40"
-tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
+serde = { version = "1.0", features = ["derive"] }
+spider = { version = "2.21", features = ["sync"] }
+surrealdb = "2.0"
+tokio = { version="1.41", features = ["full"] }
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 url = { version = "2.5.3", features = ["serde"] }


@@ -1,23 +1,11 @@
 # Surreal Crawler
-Mapping with a budget of 1000 (crawl 1000 sites, so many more links are actually discovered), on [my webiste](https://oliveratkinson.net) on 8/26/2024 took 1m9s.
-This is including the crawl and loading into the database and linking sites. (Locally hosted surreal db instance)
-This run created 4299 site links with 23286 links between the sites. (It found my this git site which really bolsters those numbers.)
-## Install / Build
-* You will need rust to compile the crawler [rustup.rs](https://rustup.rs)
-* You need python3 (will come installed on most linux distros) and poetry for dependancy management.
-    * Install `pipx`, `python3`
-    * Then: `pipx install poetry`
-    * Then: `poetry install` to install the project dependancies
-* You need to install [surrealdb](https://surrealdb.com)
-## Use
-Just run `./crawl.sh {url}` and it will start crawling. You can tweak the budget inside [crawl.sh](https://git.oliveratkinson.net/Oliver/internet_mapper/src/branch/main/crawl.sh) if you want.
-You can also prefix the command with `time` to benchmark the system, such as: `time ./crawl.sh https://discord.com`.
+Crawls sites, saving all the found links to a SurrealDB database. It then takes batches of 100 uncrawled links until the crawl budget is reached. It saves the data of each site in MinIO.
+
+### TODO
+- [ ] Domain filtering - prevent the crawler from going on alternate versions of wikipedia.
+- [ ] Conditionally save content - based on filename or file contents
+- [ ] GUI / TUI ?
+- [ ] Better asynchronous getting of the sites. Currently it all happens serially.
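
For orientation, here is a minimal sketch of what the pipeline described above does for a single page, assuming the `Website`, `parse`, and `S3` items introduced in the diffs below (src/db.rs, src/parser.rs, src/s3.rs); the real entry point is src/main.rs.

```rust
use surrealdb::{engine::remote::ws::Client, Surreal};
use url::Url;
// Website, parse, and S3 come from src/db.rs, src/parser.rs, and src/s3.rs.

// Hypothetical single-page pass: fetch a page, archive its body in MinIO,
// then record the page and every link it points to in SurrealDB.
async fn crawl_one(db: &Surreal<Client>, s3: &S3, url: Url) -> reqwest::Result<()> {
    let body = reqwest::get(url.as_str()).await?.text().await?;

    // Archive the raw document under its domain + path.
    s3.store(&body, &url).await;

    // Record the page itself as crawled.
    let mut page = Website::new(url, true);
    page.store(db).await;

    // Extract links (see src/parser.rs) and relate them to this page.
    let found_links = parse(&page, body).await;
    let mut ids = Vec::new();
    for mut link in found_links {
        if let Some(id) = link.store(db).await {
            ids.push(id);
        }
    }
    page.links_to(ids, db).await;
    Ok(())
}
```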


@@ -1,5 +1,5 @@
 services:
-  db:
+  surreal:
     image: surrealdb/surrealdb:latest-dev
     ports:
       - 8000:8000
@@ -14,3 +14,18 @@ services:
       - --pass
       - root
       - rocksdb:/mydata/database.db
+  minio:
+    image: quay.io/minio/minio
+    ports:
+      - 9000:9000
+      - 9001:9001
+    environment:
+      - MINIO_ROOT_USER=root
+      - MINIO_ROOT_PASSWORD=an8charpassword
+    volumes:
+      - ./.minio/:/data
+    command:
+      - server
+      - /data
+      - --console-address
+      - ":9001"


@@ -1,2 +0,0 @@
-DEFINE TABLE website SCHEMALESS;
-DEFINE FIELD accessed_at ON TABLE website VALUE time::now();


@@ -8,12 +8,12 @@ use surrealdb::{
 use tracing::{error, instrument, trace, warn};
 use url::Url;
-use crate::Timer;
+use crate::{Config, Timer};
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct Website {
     /// The url that this data is found at
-    site: Url,
+    pub site: Url,
     /// Wether or not this link has been crawled yet
     pub crawled: bool,
     #[serde(skip_serializing)]
@@ -22,35 +22,20 @@ pub struct Website {
 impl Website {
     /// Creates a blank site (assumes that url param is site's root)
-    pub fn new(url: &str, crawled: bool) -> Self {
-        let site = match Url::parse(url) {
-            Ok(a) => a,
-            Err(_) => todo!(),
-        };
+    pub fn new(url: Url, crawled: bool) -> Self {
         Self {
             id: None,
             crawled,
-            site,
+            site: url,
         }
     }
-    pub fn set_crawled(&mut self) {
-        trace!("Set crawled to true");
-        self.crawled = true
-    }
-    pub fn mut_url(&mut self) -> &mut Url {
-        &mut self.site
-    }
     #[instrument(skip_all)]
     pub async fn links_to(&self, other: Vec<Thing>, db: &Surreal<Client>) {
         let len = other.len();
         if len == 0 {return}
         let from = self.site.to_string();
-        // let to = other.site.to_string();
-        trace!("Linking {from} to {} other pages.", other.len());
         let msg = format!("Linked {len} pages");
         let timer = Timer::start(&msg);
         // prevent the timer from being dropped instantly.
@@ -69,7 +54,6 @@ impl Website {
         let _: Vec<usize> = vec;
         if let Some(num) = vec.get(0) {
             if *num == len {
-                trace!("Link OK");
                 return;
             } else {
                 warn!("Didn't link all the records. {num}/{len}");
@@ -149,19 +133,30 @@ pub struct Record {
     pub id: Thing,
 }
-pub async fn connect() -> surrealdb::Result<Surreal<Client>> {
+#[instrument(skip_all, name = "SurrealDB")]
+pub async fn connect(config: &Config<'_>) -> surrealdb::Result<Surreal<Client>> {
+    trace!("Establishing connection to surreal...");
     // Connect to the server
-    let db = Surreal::new::<Ws>("127.0.0.1:8000").await?;
+    let db = Surreal::new::<Ws>(config.surreal_url).await?;
+    trace!("Logging in...");
     // Signin as a namespace, database, or root user
     db.signin(Root {
-        username: "root",
-        password: "root",
+        username: config.surreal_username,
+        password: config.surreal_password,
     })
     .await?;
     // Select a specific namespace / database
-    db.use_ns("test").use_db("v1.2").await?;
+    db
+        .use_ns(config.surreal_ns)
+        .use_db(config.surreal_db)
+        .await?;
+    let setup = include_bytes!("setup.surql");
+    let file = setup.iter().map(|c| *c as char).collect::<String>();
+    db.query(file).await.expect("Failed to setup surreal tables.");
     Ok(db)
 }
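
A minimal usage sketch of the reworked API, assuming the `Config` struct added in src/main.rs: `connect()` now takes its endpoint and credentials from `Config` and applies src/setup.surql before returning, and `Website::new()` takes an already-parsed `Url`.

```rust
use surrealdb::{engine::remote::ws::Client, Surreal};
use url::Url;
// connect, Website, and Config come from src/db.rs and src/main.rs.

async fn example(config: &Config<'_>) -> surrealdb::Result<()> {
    // Reads surreal_url / credentials / ns / db from Config, then runs setup.surql.
    let db: Surreal<Client> = connect(config).await?;

    // Website::new() now takes a parsed Url instead of a &str.
    let url = Url::parse("https://oliveratkinson.net/").unwrap();
    let mut page = Website::new(url, false);
    page.store(&db).await;
    Ok(())
}
```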


@@ -1,189 +1,116 @@
 extern crate html5ever;
-extern crate markup5ever_rcdom as rcdom;
+use std::time::Instant;
 use db::{connect, Website};
-use html5ever::{
-    local_name, parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts,
-};
-use rcdom::RcDom;
-use std::time::Instant;
-use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};
-use tracing::{debug, info, instrument, trace, trace_span};
+use parser::parse;
+use s3::S3;
+use surrealdb::{engine::remote::ws::Client, Surreal};
+use tokio::sync::broadcast::Receiver;
+use tracing::{debug, info, trace, trace_span};
 use tracing_subscriber::EnvFilter;
 use url::Url;
 mod db;
+mod parser;
+mod s3;
+struct Config<'a> {
+    surreal_ns: &'a str,
+    surreal_db: &'a str,
+    surreal_url: &'a str,
+    surreal_username: &'a str,
+    surreal_password: &'a str,
+    s3_url: &'a str,
+    s3_bucket: &'a str,
+    s3_access_key: &'a str,
+    s3_secret_key: &'a str,
+}
 #[tokio::main]
 async fn main() {
+    let total_runtime = Timer::start("Completed");
     tracing_subscriber::fmt()
         .with_env_filter(EnvFilter::from_default_env())
         .with_line_number(true)
-        .without_time()
+        // .without_time()
         .init();
     debug!("Starting...");
+    let config = Config {
+        surreal_url: "localhost:8000",
+        surreal_username: "root",
+        surreal_password: "root",
+        surreal_ns: "test",
+        surreal_db: "spider",
+        s3_bucket: "spider",
+        s3_url: "http://localhost:9000",
+        s3_access_key: "0zv7GbLQsw4ZI8TclMps",
+        s3_secret_key: "5dB7QkGFw7fYbUJ5LpHk2GbWR7Bl710HlRz4NbzB",
+    };
     // Would probably take these in as parameters from a cli
-    let url = "https://oliveratkinson.net/";
-    // let url = "http://localhost:5500";
-    let budget = 1000;
-    let mut crawled = 0;
-    let db = connect().await.expect("Failed to connect to db, aborting.");
-    let client = reqwest::Client::builder()
-        // .use_rustls_tls()
+    let starting_url = "https://oliveratkinson.net/";
+    let s3 = S3::connect(&config)
+        .await
+        .expect("Failed to connect to minio, aborting.");
+    let db = connect(&config)
+        .await
+        .expect("Failed to connect to surreal, aborting.");
+    let mut site = spider::website::Website::new(&starting_url)
+        .with_limit(5)
+        .with_depth(0)
         .build()
         .unwrap();
-    // Kick off the whole machine - This Website object doesn't matter, it's just to allow for
-    // get() to work.
-    let span = trace_span!("Pre-Loop");
-    let pre_loop_span = span.enter();
-    // Download the site
-    let mut site = Website::new(&url, false);
-    get(&mut site, &db, &client, &mut crawled).await;
-    drop(pre_loop_span);
-    let span = trace_span!("Loop");
-    let span = span.enter();
-    while crawled < budget {
-        let get_num = if budget - crawled < 100 {
-            budget - crawled
-        } else {
-            100
-        };
-        let uncrawled = get_uncrawled_links(&db, get_num).await;
-        if uncrawled.len() == 0 {
-            info!("Had more budget but finished crawling everything.");
-            return;
+    let mut rx: Receiver<spider::page::Page> = site.subscribe(0).unwrap();
+    let subscriber = tokio::spawn(async move {
+        let span = trace_span!("Sub");
+        let span = span.enter();
+        while let Ok(res) = rx.recv().await {
+            // Get body
+            let data = res.get_html();
+            let url = Url::parse(res.get_url()).unwrap();
+            trace!("Got '{}'", url.to_string());
+            // Store document
+            s3.store(&data, &url).await;
+            // Parse document and store relationships
+            let mut page = Website::new(url, true);
+            page.store(&db).await;
+            // Relate this page to all the pages it links to
+            let span = trace_span!("Linking");
+            let span = span.enter();
+            let found_links = parse(&page, data).await;
+            let mut stored_links = Vec::new();
+            for mut link in found_links {
+                if let Some(id) = link.store(&db).await {
+                    stored_links.push(id);
+                }
+            }
+            page.links_to(stored_links, &db).await;
+            drop(span);
         }
-        debug!("Crawling {} pages...", uncrawled.len());
-        let span = trace_span!("Crawling");
-        let _ = span.enter();
-        for mut site in uncrawled {
-            get(&mut site, &db, &client, &mut crawled).await;
-            let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32);
-            info!("Crawled {crawled} out of {budget} pages. ({percent})");
-        }
-    }
-    drop(span);
+        drop(span);
+    });
+    let timer = Timer::start("Crawled");
+    site.crawl().await;
+    site.unsubscribe();
+    drop(timer);
+    subscriber.await.unwrap();
     info!("Done");
+    drop(total_runtime);
 }
-#[instrument(skip_all)]
-/// A quick helper function for downloading a url
-async fn get(
-    site: &mut Website,
-    db: &Surreal<Client>,
-    request_client: &reqwest::Client,
-    count: &mut usize,
-) {
-    trace!("Get: {}", site.to_string());
-    let timer = Timer::start("Got page");
-    if let Ok(response) = request_client.get(site.to_string()).send().await {
-        timer.stop();
-        // Get body
-        let data = response.text().await.unwrap();
-        let opts = ParseOpts {
-            tree_builder: TreeBuilderOpts {
-                drop_doctype: true,
-                ..Default::default()
-            },
-            ..Default::default()
-        };
-        // Get DOM
-        let dom = parse_document(RcDom::default(), opts)
-            .from_utf8()
-            .read_from(&mut data.as_bytes())
-            .unwrap();
-        // TODO save the dom to minio if a flag is set
-        // Modify record in database
-        site.set_crawled();
-        site.store(db).await;
-        trace!("Got: {}", site.to_string());
-        // Walk all the children nodes, searching for links to other pages.
-        let mut buffer = Vec::new();
-        let timer = Timer::start("Walked");
-        walk(&dom.document, &db, &site, &mut buffer).await;
-        timer.stop();
-        // Put all the found links into the database.
-        site.links_to(buffer, &db).await;
-        *count += 1;
-    }
-    trace!("Failed to get: {}", site.to_string());
-}
-/// Walks the givin site, placing it's findings in the database
-async fn walk(
-    node: &rcdom::Handle,
-    db: &Surreal<Client>,
-    site: &Website,
-    links_to: &mut Vec<Thing>,
-) {
-    let span = trace_span!("Walk");
-    let span = span.enter();
-    // Match each node - node basically means element.
-    match &node.data {
-        rcdom::NodeData::Element { name, attrs, .. } => {
-            for attr in attrs.borrow().clone() {
-                match name.local {
-                    local_name!("a")
-                    | local_name!("audio")
-                    | local_name!("area")
-                    | local_name!("img")
-                    | local_name!("link")
-                    | local_name!("object")
-                    | local_name!("source")
-                    | local_name!("base")
-                    | local_name!("video") => {
-                        let attribute_name = attr.name.local.to_string();
-                        if attribute_name == "src"
-                            || attribute_name == "href"
-                            || attribute_name == "data"
-                        {
-                            // Get clone of the current site object
-                            let mut web = site.clone();
-                            // Set url
-                            let url = web.mut_url();
-                            url.set_fragment(None); // removes #xyz
-                            let joined = url.join(&attr.value).unwrap();
-                            *url = joined;
-                            // Set other attributes
-                            web.crawled = false;
-                            // TODO set element name
-                            // let element_name = name.local.to_string();
-                            if let Some(id) = web.store(db).await {
-                                links_to.push(id);
-                            }
-                        }
-                    }
-                    local_name!("button") | local_name!("meta") | local_name!("iframe") => {
-                        // dbg!(attrs);
-                    }
-                    _ => {}
-                };
-            }
-        }
-        _ => {}
-    };
-    drop(span);
-    for child in node.children.borrow().iter() {
-        Box::pin(walk(child, db, site, links_to)).await;
-    }
-}
 /// Returns uncrawled links
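
A condensed sketch of the spider-rs pattern the new `main()` is built around, using only the calls visible in the diff above: subscribe to crawled pages on a broadcast channel, handle each page in a spawned task, and drive the crawl from the main task. Storage and parsing are stubbed out with a `println!` here.

```rust
#[tokio::main]
async fn main() {
    // Build the crawler: stop after 5 pages, no depth restriction.
    let mut site = spider::website::Website::new("https://oliveratkinson.net/")
        .with_limit(5)
        .with_depth(0)
        .build()
        .unwrap();

    // Requires spider's "sync" feature (enabled in Cargo.toml above).
    let mut rx = site.subscribe(0).unwrap();

    // Handle each crawled page as it arrives on the broadcast channel.
    let subscriber = tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            println!("got {} ({} bytes)", page.get_url(), page.get_html().len());
        }
    });

    site.crawl().await; // runs until the page limit is reached
    site.unsubscribe(); // closes the channel so the subscriber task finishes
    subscriber.await.unwrap();
}
```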

src/parser.rs (new file, 92 lines)

@@ -0,0 +1,92 @@
+use std::default::Default;
+use std::str::FromStr;
+
+use html5ever::tokenizer::{BufferQueue, TokenizerResult};
+use html5ever::tokenizer::{StartTag, TagToken};
+use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
+use html5ever::{local_name, tendril::*};
+use tracing::{instrument, trace};
+
+use crate::db::Website;
+
+#[derive(Clone)]
+struct LinkParser<'a> {
+    site: &'a Website,
+}
+
+impl TokenSink for LinkParser<'_> {
+    type Handle = Vec<Website>;
+
+    fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
+        match token {
+            TagToken(tag) => {
+                if tag.kind == StartTag {
+                    match tag.name {
+                        local_name!("a")
+                        | local_name!("audio")
+                        | local_name!("area")
+                        | local_name!("img")
+                        | local_name!("link")
+                        | local_name!("object")
+                        | local_name!("source")
+                        | local_name!("base")
+                        | local_name!("video") => {
+                            let mut links = Vec::new();
+                            for attr in &tag.attrs {
+                                let attr_name = attr.name.local.to_string();
+                                if attr_name == "src" || attr_name == "href" || attr_name == "data"
+                                {
+                                    // Get clone of the current site object
+                                    let mut web = self.site.clone();
+
+                                    // Set url
+                                    let mut url = web.site;
+                                    url.set_fragment(None); // removes #xyz
+                                    let joined = url.join(&attr.value).unwrap();
+                                    web.site = joined;
+
+                                    web.crawled = false;
+
+                                    links.push(web);
+                                }
+                            }
+                            return TokenSinkResult::Script(links);
+                        }
+                        local_name!("button") | local_name!("meta") | local_name!("iframe") => {
+                            // dbg!(attrs);
+                        }
+                        _ => {}
+                    }
+                }
+            }
+            _ => {}
+        }
+        TokenSinkResult::Continue
+    }
+}
+
+#[instrument(skip_all)]
+pub async fn parse(site: &Website, data: String) -> Vec<Website> {
+    let sink = LinkParser { site };
+    let chunk = Tendril::from_str(&data).unwrap();
+    let mut input = BufferQueue::default();
+    input.push_back(chunk.try_reinterpret::<fmt::UTF8>().unwrap());
+
+    let token = Tokenizer::new(sink.clone(), TokenizerOpts::default());
+    // let mut links_to = Vec::new();
+    let mut res = Vec::new();
+    while !input.is_empty() {
+        if let TokenizerResult::Script(mut s) = token.feed(&mut input) {
+            res.append(&mut s);
+        }
+    }
+    trace!("Found {} links.", res.len());
+
+    assert!(input.is_empty());
+    token.end();
+
+    res
+}
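
A hedged usage sketch for the new parser: `parse()` tokenizes the page with html5ever and returns one uncrawled `Website` per `src`, `href`, or `data` attribute found on the link-carrying tags, resolved against the page's URL. The HTML literal below is illustrative only.

```rust
use url::Url;
// Website and parse come from src/db.rs and this module.

async fn example() {
    let url = Url::parse("https://example.com/index.html").unwrap();
    let page = Website::new(url, true);

    // In the crawler this is the body that spider fetched.
    let html = r#"<a href="/about">About</a> <img src="/logo.png">"#.to_string();
    let found = parse(&page, html).await;

    for link in &found {
        // Relative references are joined onto the page URL, fragments dropped.
        println!("{}", link.site); // e.g. https://example.com/about
    }
    assert!(found.iter().all(|w| !w.crawled));
}
```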

src/s3.rs (new file, 76 lines)

@ -0,0 +1,76 @@
use minio::s3::{
args::{BucketExistsArgs, MakeBucketArgs}, client::ClientBuilder, creds::StaticProvider, error::Error, http::BaseUrl, types::S3Api, Client
};
use tracing::{instrument, trace};
use url::Url;
use crate::Config;
pub struct S3 {
bucket_name: String,
client: Client,
}
impl S3 {
#[instrument(skip_all, name = "S3")]
pub async fn connect(config: &Config<'_>) -> Result<Self, Error> {
let base_url = config.s3_url.parse::<BaseUrl>().unwrap();
let static_provider =
StaticProvider::new(&config.s3_access_key, &config.s3_secret_key, None);
let client = ClientBuilder::new(base_url)
.provider(Some(Box::new(static_provider)))
.build()?;
trace!("Checking bucket...");
let exists = client
.bucket_exists(&BucketExistsArgs::new(&config.s3_bucket).unwrap())
.await?;
if !exists {
trace!("Creating bucket...");
client
.make_bucket(&MakeBucketArgs::new(&config.s3_bucket).unwrap())
.await?;
}
trace!("Connection successfull");
Ok(Self {
bucket_name: config.s3_bucket.to_owned(),
client: client,
})
}
pub async fn store(&self, data: &str, name: &Url) {
if let Some(domain) = name.domain() {
let filename = domain.to_string() + name.path();
let _ = &self
.client
.put_object_content(&self.bucket_name, &filename, data.to_owned())
.send()
.await
.unwrap();
}
}
pub async fn _get(&self, name: &Url) -> Option<String> {
if let Some(domain) = name.domain() {
let filename = domain.to_string() + name.path();
let data = self
.client
.get_object(&self.bucket_name, &filename)
.send()
.await
.unwrap();
if let Ok(segments )= data.content.to_segmented_bytes().await {
return Some(segments.to_bytes().iter().map(|c| *c as char).collect::<String>())
}
}
None
}
}
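
A hedged usage sketch for the S3 wrapper, assuming the `Config` from src/main.rs and the MinIO service from the compose file: `connect()` ensures the bucket exists, `store()` writes the document under a domain-plus-path key, and `_get()` (currently unused) reads it back.

```rust
use url::Url;
// S3 and Config come from this module and src/main.rs.

async fn example(config: &Config<'_>) {
    let s3 = S3::connect(config)
        .await
        .expect("Failed to connect to minio, aborting.");

    let url = Url::parse("https://oliveratkinson.net/index.html").unwrap();

    // Stored under the object key "oliveratkinson.net/index.html".
    s3.store("<!doctype html>...", &url).await;

    // Round-trip it back out.
    if let Some(body) = s3._get(&url).await {
        println!("retrieved {} bytes", body.len());
    }
}
```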

src/setup.surql (new file, 2 lines)

@@ -0,0 +1,2 @@
+DEFINE TABLE IF NOT EXISTS website SCHEMALESS;
+DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now();
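
These definitions are applied by `connect()` in src/db.rs via `include_bytes!`, so the schema ships with the binary. As a hedged sketch of how the table is meant to be queried, this is roughly the shape the `get_uncrawled_links` helper referenced in src/main.rs might take; its actual body is not part of this diff.

```rust
use surrealdb::{engine::remote::ws::Client, Surreal};
// Website comes from src/db.rs.

async fn get_uncrawled_links_sketch(db: &Surreal<Client>) -> Vec<Website> {
    let mut response = db
        .query("SELECT * FROM website WHERE crawled = false LIMIT 100")
        .await
        .expect("query failed");
    response.take(0).expect("malformed response")
}
```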