Compare commits

...

6 Commits

Author SHA1 Message Date
Oliver Atkinson e4bf320ecd unifed settings for testing 2024-12-12 11:41:39 -07:00
Oliver Atkinson fd71a8bc13 back to spider-rs 2024-12-12 11:04:45 -07:00
0f8a3d7215 using a custom parser now :) 2024-11-12 23:08:09 -07:00
574a370f30 readme updates 2024-11-12 21:24:57 -07:00
eaa79b749e prepare get function for s3 2024-11-12 21:19:05 -07:00
2c28d69d55 add s3 support 2024-11-12 21:03:58 -07:00
12 changed files with 1767 additions and 543 deletions

.gitignore vendored (1 line changed)

@@ -1,5 +1,6 @@
/target
/.surrealdb
/.minio
perf.data
flamegraph.svg
perf.data.old

.vscode/launch.json vendored (2 lines changed)

@@ -9,7 +9,7 @@
"request": "launch",
"name": "Debug executable 'surreal_spider'",
"env": {
"RUST_LOG": "surreal_spider=debug,reqwest=info",
"RUST_LOG": "surreal_spider=trace,reqwest=info",
},
"cargo": {
"args": [

Cargo.lock generated (1794 lines changed): file diff suppressed because it is too large

Cargo.toml

@@ -5,11 +5,13 @@ edition = "2021"
[dependencies]
html5ever = "0.29.0"
markup5ever_rcdom = "0.5.0-unofficial"
# minio = "0.1.0"
minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"}
reqwest = "0.12.9"
serde = { version = "1.0.214", features = ["derive"] }
surrealdb = "2.0.4"
tokio = { version="1.41.0", features = ["full"] }
tracing = "0.1.40"
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
serde = { version = "1.0", features = ["derive"] }
spider = { version = "2.21", features = ["sync"] }
surrealdb = "2.0"
tokio = { version="1.41", features = ["full"] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
url = { version = "2.5.3", features = ["serde"] }

README.md

@@ -1,23 +1,11 @@
# Surreal Crawler
Mapping with a budget of 1000 (crawl 1000 sites, so many more links are actually discovered), on [my website](https://oliveratkinson.net) on 8/26/2024 took 1m9s.
Crawls sites, saving all the found links to a SurrealDB database. It then takes batches of 100 uncrawled links until the crawl budget is reached. It saves the data of each site in a MinIO bucket.
This includes the crawl, loading into the database, and linking sites. (Locally hosted SurrealDB instance.)
This run created 4299 site links with 23286 links between the sites. (It found this Git site, which really bolsters those numbers.)
## Install / Build
* You will need rust to compile the crawler [rustup.rs](https://rustup.rs)
* You need python3 (it comes preinstalled on most Linux distros) and poetry for dependency management.
* Install `pipx`, `python3`
* Then: `pipx install poetry`
* Then: `poetry install` to install the project dependencies
* You need to install [surrealdb](https://surrealdb.com)
## Use
Just run `./crawl.sh {url}` and it will start crawling. You can tweak the budget inside [crawl.sh](https://git.oliveratkinson.net/Oliver/internet_mapper/src/branch/main/crawl.sh) if you want.
You can also prefix the command with `time` to benchmark the system, such as: `time ./crawl.sh https://discord.com`.
### TODO
- [ ] Domain filtering - prevent the crawler from wandering onto alternate versions of Wikipedia.
- [ ] Conditionally save content - based on filename or file contents
- [ ] GUI / TUI ?
- [ ] Better asynchronous getting of the sites. Currently it all happens serially.

Docker Compose file

@@ -1,5 +1,5 @@
services:
db:
surreal:
image: surrealdb/surrealdb:latest-dev
ports:
- 8000:8000
@@ -14,3 +14,18 @@ services:
- --pass
- root
- rocksdb:/mydata/database.db
minio:
image: quay.io/minio/minio
ports:
- 9000:9000
- 9001:9001
environment:
- MINIO_ROOT_USER=root
- MINIO_ROOT_PASSWORD=an8charpassword
volumes:
- ./.minio/:/data
command:
- server
- /data
- --console-address
- ":9001"

SurrealDB schema file (deleted; replaced by src/setup.surql)

@@ -1,2 +0,0 @@
DEFINE TABLE website SCHEMALESS;
DEFINE FIELD accessed_at ON TABLE website VALUE time::now();

src/db.rs

@@ -8,12 +8,12 @@ use surrealdb::{
use tracing::{error, instrument, trace, warn};
use url::Url;
use crate::Timer;
use crate::{Config, Timer};
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Website {
/// The url that this data is found at
site: Url,
pub site: Url,
/// Whether or not this link has been crawled yet
pub crawled: bool,
#[serde(skip_serializing)]
@@ -22,35 +22,20 @@ pub struct Website {
impl Website {
/// Creates a blank site (assumes that url param is site's root)
pub fn new(url: &str, crawled: bool) -> Self {
let site = match Url::parse(url) {
Ok(a) => a,
Err(_) => todo!(),
};
pub fn new(url: Url, crawled: bool) -> Self {
Self {
id: None,
crawled,
site,
site: url,
}
}
pub fn set_crawled(&mut self) {
trace!("Set crawled to true");
self.crawled = true
}
pub fn mut_url(&mut self) -> &mut Url {
&mut self.site
}
#[instrument(skip_all)]
pub async fn links_to(&self, other: Vec<Thing>, db: &Surreal<Client>) {
let len = other.len();
if len == 0 {return}
let from = self.site.to_string();
// let to = other.site.to_string();
trace!("Linking {from} to {} other pages.", other.len());
let msg = format!("Linked {len} pages");
let timer = Timer::start(&msg);
// prevent the timer from being dropped instantly.
@@ -69,7 +54,6 @@ impl Website {
let _: Vec<usize> = vec;
if let Some(num) = vec.get(0) {
if *num == len {
trace!("Link OK");
return;
} else {
warn!("Didn't link all the records. {num}/{len}");
@@ -149,19 +133,30 @@ pub struct Record {
pub id: Thing,
}
pub async fn connect() -> surrealdb::Result<Surreal<Client>> {
#[instrument(skip_all, name = "SurrealDB")]
pub async fn connect(config: &Config<'_>) -> surrealdb::Result<Surreal<Client>> {
trace!("Establishing connection to surreal...");
// Connect to the server
let db = Surreal::new::<Ws>("127.0.0.1:8000").await?;
let db = Surreal::new::<Ws>(config.surreal_url).await?;
trace!("Logging in...");
// Signin as a namespace, database, or root user
db.signin(Root {
username: "root",
password: "root",
username: config.surreal_username,
password: config.surreal_password,
})
.await?;
// Select a specific namespace / database
db.use_ns("test").use_db("v1.2").await?;
db
.use_ns(config.surreal_ns)
.use_db(config.surreal_db)
.await?;
let setup = include_bytes!("setup.surql");
let file = setup.iter().map(|c| *c as char).collect::<String>();
db.query(file).await.expect("Failed to setup surreal tables.");
Ok(db)
}
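The new connect() embeds setup.surql with include_bytes! and converts it to a String byte by byte. A minimal alternative sketch, assuming setup.surql stays valid UTF-8, is to embed it with include_str! instead:

```rust
// Sketch only: include_str! yields a &'static str directly,
// so the per-byte char conversion above is unnecessary.
let setup = include_str!("setup.surql");
db.query(setup)
    .await
    .expect("Failed to setup surreal tables.");
```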

src/main.rs

@@ -1,189 +1,116 @@
extern crate html5ever;
extern crate markup5ever_rcdom as rcdom;
use std::time::Instant;
use db::{connect, Website};
use html5ever::{
local_name, parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts,
};
use rcdom::RcDom;
use std::time::Instant;
use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};
use tracing::{debug, info, instrument, trace, trace_span};
use parser::parse;
use s3::S3;
use surrealdb::{engine::remote::ws::Client, Surreal};
use tokio::sync::broadcast::Receiver;
use tracing::{debug, info, trace, trace_span};
use tracing_subscriber::EnvFilter;
use url::Url;
mod db;
mod parser;
mod s3;
struct Config<'a> {
surreal_ns: &'a str,
surreal_db: &'a str,
surreal_url: &'a str,
surreal_username: &'a str,
surreal_password: &'a str,
s3_url: &'a str,
s3_bucket: &'a str,
s3_access_key: &'a str,
s3_secret_key: &'a str,
}
#[tokio::main]
async fn main() {
let total_runtime = Timer::start("Completed");
tracing_subscriber::fmt()
.with_env_filter(EnvFilter::from_default_env())
.with_line_number(true)
.without_time()
// .without_time()
.init();
debug!("Starting...");
let config = Config {
surreal_url: "localhost:8000",
surreal_username: "root",
surreal_password: "root",
surreal_ns: "test",
surreal_db: "spider",
s3_bucket: "spider",
s3_url: "http://localhost:9000",
s3_access_key: "0zv7GbLQsw4ZI8TclMps",
s3_secret_key: "5dB7QkGFw7fYbUJ5LpHk2GbWR7Bl710HlRz4NbzB",
};
// Would probably take these in as parameters from a cli
let url = "https://oliveratkinson.net/";
// let url = "http://localhost:5500";
let budget = 1000;
let mut crawled = 0;
let starting_url = "https://oliveratkinson.net/";
let db = connect().await.expect("Failed to connect to db, aborting.");
let s3 = S3::connect(&config)
.await
.expect("Failed to connect to minio, aborting.");
let db = connect(&config)
.await
.expect("Failed to connect to surreal, aborting.");
let client = reqwest::Client::builder()
// .use_rustls_tls()
let mut site = spider::website::Website::new(&starting_url)
.with_limit(5)
.with_depth(0)
.build()
.unwrap();
// Kick off the whole machine - This Website object doesn't matter, it's just to allow for
// get() to work.
let span = trace_span!("Pre-Loop");
let pre_loop_span = span.enter();
// Download the site
let mut site = Website::new(&url, false);
get(&mut site, &db, &client, &mut crawled).await;
let mut rx: Receiver<spider::page::Page> = site.subscribe(0).unwrap();
drop(pre_loop_span);
let span = trace_span!("Loop");
let subscriber = tokio::spawn(async move {
let span = trace_span!("Sub");
let span = span.enter();
while crawled < budget {
let get_num = if budget - crawled < 100 {
budget - crawled
} else {
100
};
while let Ok(res) = rx.recv().await {
// Get body
let data = res.get_html();
let url = Url::parse(res.get_url()).unwrap();
let uncrawled = get_uncrawled_links(&db, get_num).await;
if uncrawled.len() == 0 {
info!("Had more budget but finished crawling everything.");
return;
trace!("Got '{}'", url.to_string());
// Store document
s3.store(&data, &url).await;
// Parse document and store relationships
let mut page = Website::new(url, true);
page.store(&db).await;
// Relate this page to all the pages it links to
let span = trace_span!("Linking");
let span = span.enter();
let found_links = parse( &page, data).await;
let mut stored_links = Vec::new();
for mut link in found_links {
if let Some(id) = link.store(&db).await {
stored_links.push(id);
}
debug!("Crawling {} pages...", uncrawled.len());
let span = trace_span!("Crawling");
let _ = span.enter();
for mut site in uncrawled {
get(&mut site, &db, &client, &mut crawled).await;
let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32);
info!("Crawled {crawled} out of {budget} pages. ({percent})");
}
page.links_to(stored_links, &db).await;
drop(span);
}
drop(span);
});
let timer = Timer::start("Crawled");
site.crawl().await;
site.unsubscribe();
drop(timer);
subscriber.await.unwrap();
info!("Done");
}
#[instrument(skip_all)]
/// A quick helper function for downloading a url
async fn get(
site: &mut Website,
db: &Surreal<Client>,
request_client: &reqwest::Client,
count: &mut usize,
) {
trace!("Get: {}", site.to_string());
let timer = Timer::start("Got page");
if let Ok(response) = request_client.get(site.to_string()).send().await {
timer.stop();
// Get body
let data = response.text().await.unwrap();
let opts = ParseOpts {
tree_builder: TreeBuilderOpts {
drop_doctype: true,
..Default::default()
},
..Default::default()
};
// Get DOM
let dom = parse_document(RcDom::default(), opts)
.from_utf8()
.read_from(&mut data.as_bytes())
.unwrap();
// TODO save the dom to minio if a flag is set
// Modify record in database
site.set_crawled();
site.store(db).await;
trace!("Got: {}", site.to_string());
// Walk all the children nodes, searching for links to other pages.
let mut buffer = Vec::new();
let timer = Timer::start("Walked");
walk(&dom.document, &db, &site, &mut buffer).await;
timer.stop();
// Put all the found links into the database.
site.links_to(buffer, &db).await;
*count += 1;
}
trace!("Failed to get: {}", site.to_string());
}
/// Walks the given site, placing its findings in the database
async fn walk(
node: &rcdom::Handle,
db: &Surreal<Client>,
site: &Website,
links_to: &mut Vec<Thing>,
) {
let span = trace_span!("Walk");
let span = span.enter();
// Match each node - node basically means element.
match &node.data {
rcdom::NodeData::Element { name, attrs, .. } => {
for attr in attrs.borrow().clone() {
match name.local {
local_name!("a")
| local_name!("audio")
| local_name!("area")
| local_name!("img")
| local_name!("link")
| local_name!("object")
| local_name!("source")
| local_name!("base")
| local_name!("video") => {
let attribute_name = attr.name.local.to_string();
if attribute_name == "src"
|| attribute_name == "href"
|| attribute_name == "data"
{
// Get clone of the current site object
let mut web = site.clone();
// Set url
let url = web.mut_url();
url.set_fragment(None); // removes #xyz
let joined = url.join(&attr.value).unwrap();
*url = joined;
// Set other attributes
web.crawled = false;
// TODO set element name
// let element_name = name.local.to_string();
if let Some(id) = web.store(db).await {
links_to.push(id);
}
}
}
local_name!("button") | local_name!("meta") | local_name!("iframe") => {
// dbg!(attrs);
}
_ => {}
};
}
}
_ => {}
};
drop(span);
for child in node.children.borrow().iter() {
Box::pin(walk(child, db, site, links_to)).await;
}
drop(total_runtime);
}
/// Returns uncrawled links
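The rewritten main.rs revolves around spider-rs's subscribe/crawl pattern: a broadcast receiver yields every fetched page while crawl() runs, and the spawned task stores, parses, and links each page. A stripped-down sketch of that flow, mirroring the calls shown in the diff above (the URL and limit are placeholders):

```rust
// Minimal sketch of the subscribe/crawl flow in the new main.rs.
// Assumes spider with the "sync" feature and tokio with "full", as pinned in Cargo.toml.
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut site = Website::new("https://oliveratkinson.net/")
        .with_limit(5)
        .build()
        .unwrap();

    // Subscribe before crawling so no pages are missed.
    let mut rx = site.subscribe(0).unwrap();
    let handle = tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            // The real code stores the body in MinIO, parses it,
            // and records the links in SurrealDB here.
            println!("fetched {}", page.get_url());
        }
    });

    site.crawl().await;
    site.unsubscribe();
    handle.await.unwrap();
}
```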

src/parser.rs new file (92 lines)

@@ -0,0 +1,92 @@
use std::default::Default;
use std::str::FromStr;
use html5ever::tokenizer::{BufferQueue, TokenizerResult};
use html5ever::tokenizer::{StartTag, TagToken};
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use html5ever::{local_name, tendril::*};
use tracing::{instrument, trace};
use crate::db::Website;
#[derive(Clone)]
struct LinkParser<'a> {
site: &'a Website,
}
impl TokenSink for LinkParser<'_> {
type Handle = Vec<Website>;
fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
match token {
TagToken(tag) => {
if tag.kind == StartTag {
match tag.name {
local_name!("a")
| local_name!("audio")
| local_name!("area")
| local_name!("img")
| local_name!("link")
| local_name!("object")
| local_name!("source")
| local_name!("base")
| local_name!("video") => {
let mut links = Vec::new();
for attr in &tag.attrs {
let attr_name = attr.name.local.to_string();
if attr_name == "src" || attr_name == "href" || attr_name == "data"
{
// Get clone of the current site object
let mut web = self.site.clone();
// Set url
let mut url = web.site;
url.set_fragment(None); // removes #xyz
let joined = url.join(&attr.value).unwrap();
web.site = joined;
web.crawled = false;
links.push(web);
}
}
return TokenSinkResult::Script(links);
}
local_name!("button") | local_name!("meta") | local_name!("iframe") => {
// dbg!(attrs);
}
_ => {}
}
}
}
_ => {}
}
TokenSinkResult::Continue
}
}
#[instrument(skip_all)]
pub async fn parse(site: &Website, data: String) -> Vec<Website> {
let sink = LinkParser { site };
let chunk = Tendril::from_str(&data).unwrap();
let mut input = BufferQueue::default();
input.push_back(chunk.try_reinterpret::<fmt::UTF8>().unwrap());
let token = Tokenizer::new(sink.clone(), TokenizerOpts::default());
// let mut links_to = Vec::new();
let mut res = Vec::new();
while !input.is_empty() {
if let TokenizerResult::Script(mut s) = token.feed(&mut input) {
res.append(&mut s);
}
}
trace!("Found {} links.", res.len());
assert!(input.is_empty());
token.end();
res
}
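The core of LinkParser is the URL normalisation: each src/href/data value is resolved against the page URL after the fragment is stripped. A small illustration of what that join does (the URLs are made up for the example):

```rust
// Illustration only: the fragment strip + join used in LinkParser above.
fn main() {
    let mut base = url::Url::parse("https://oliveratkinson.net/blog/?page=2#top").unwrap();
    base.set_fragment(None); // drops "#top"

    // Absolute and relative hrefs both resolve against the page URL.
    assert_eq!(
        base.join("/about.html").unwrap().as_str(),
        "https://oliveratkinson.net/about.html"
    );
    assert_eq!(
        base.join("img/cat.png").unwrap().as_str(),
        "https://oliveratkinson.net/blog/img/cat.png"
    );
}
```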

src/s3.rs new file (76 lines)

@@ -0,0 +1,76 @@
use minio::s3::{
args::{BucketExistsArgs, MakeBucketArgs}, client::ClientBuilder, creds::StaticProvider, error::Error, http::BaseUrl, types::S3Api, Client
};
use tracing::{instrument, trace};
use url::Url;
use crate::Config;
pub struct S3 {
bucket_name: String,
client: Client,
}
impl S3 {
#[instrument(skip_all, name = "S3")]
pub async fn connect(config: &Config<'_>) -> Result<Self, Error> {
let base_url = config.s3_url.parse::<BaseUrl>().unwrap();
let static_provider =
StaticProvider::new(&config.s3_access_key, &config.s3_secret_key, None);
let client = ClientBuilder::new(base_url)
.provider(Some(Box::new(static_provider)))
.build()?;
trace!("Checking bucket...");
let exists = client
.bucket_exists(&BucketExistsArgs::new(&config.s3_bucket).unwrap())
.await?;
if !exists {
trace!("Creating bucket...");
client
.make_bucket(&MakeBucketArgs::new(&config.s3_bucket).unwrap())
.await?;
}
trace!("Connection successfull");
Ok(Self {
bucket_name: config.s3_bucket.to_owned(),
client: client,
})
}
pub async fn store(&self, data: &str, name: &Url) {
if let Some(domain) = name.domain() {
let filename = domain.to_string() + name.path();
let _ = &self
.client
.put_object_content(&self.bucket_name, &filename, data.to_owned())
.send()
.await
.unwrap();
}
}
pub async fn _get(&self, name: &Url) -> Option<String> {
if let Some(domain) = name.domain() {
let filename = domain.to_string() + name.path();
let data = self
.client
.get_object(&self.bucket_name, &filename)
.send()
.await
.unwrap();
if let Ok(segments) = data.content.to_segmented_bytes().await {
return Some(segments.to_bytes().iter().map(|c| *c as char).collect::<String>())
}
}
None
}
}
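store() derives the object key from the URL's domain plus path, so each site's pages group under one prefix in the bucket. Roughly (example values only):

```rust
// Illustration of the key scheme used by S3::store above (values are made up).
fn main() {
    let url = url::Url::parse("https://oliveratkinson.net/blog/post").unwrap();
    let key = url.domain().unwrap().to_string() + url.path();
    assert_eq!(key, "oliveratkinson.net/blog/post");
}
```

Note that query strings are not part of the key, so URLs that differ only by query overwrite each other, and a root URL maps to a key ending in "/".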

src/setup.surql new file (2 lines)

@@ -0,0 +1,2 @@
DEFINE TABLE IF NOT EXISTS website SCHEMALESS;
DEFINE FIELD IF NOT EXISTS accessed_at ON TABLE website VALUE time::now();
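Because the table stays SCHEMALESS with only accessed_at auto-filled, pulling pages back out (for example, the uncrawled batches the old main loop worked through) stays a plain SELECT. A hedged sketch, assuming it sits alongside the db module; the query text itself is not part of this diff:

```rust
use surrealdb::{engine::remote::ws::Client, Surreal};

use crate::db::Website;

// Sketch only: fetch a batch of not-yet-crawled pages from the website table.
async fn get_uncrawled(db: &Surreal<Client>) -> surrealdb::Result<Vec<Website>> {
    let mut response = db
        .query("SELECT * FROM website WHERE crawled = false LIMIT 100")
        .await?;
    response.take(0)
}
```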