updating deps

This commit is contained in:
Oliver Atkinson 2024-12-12 14:14:38 -07:00
parent c1c8cf07bb
commit 22be3b2f61
3 changed files with 338 additions and 284 deletions

591
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -4,13 +4,13 @@ version = "0.1.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
html5ever = "0.29.0" html5ever = "0.29"
# minio = "0.1.0" # minio = "0.1.0"
minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"} minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"}
reqwest = "0.12.9" reqwest = "0.12"
serde = { version = "1.0.214", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
surrealdb = "2.0.4" surrealdb = "2.1"
tokio = { version="1.41.0", features = ["full"] } tokio = { version="1.41.0", features = ["full"] }
tracing = "0.1.40" tracing = "0.1"
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } tracing-subscriber = { version = "0.3", features = ["env-filter"] }
url = { version = "2.5.3", features = ["serde"] } url = { version = "2.5", features = ["serde"] }

View File

@@ -31,8 +31,8 @@ async fn main() {
tracing_subscriber::fmt() tracing_subscriber::fmt()
.with_env_filter(EnvFilter::from_default_env()) .with_env_filter(EnvFilter::from_default_env())
.with_line_number(true) .with_line_number(false)
// .without_time() .without_time()
.init(); .init();
debug!("Starting..."); debug!("Starting...");
@@ -49,7 +49,9 @@ async fn main() {
}; };
// Would probably take these in as parameters from a cli // Would probably take these in as parameters from a cli
let starting_url = "https://oliveratkinson.net/"; let starting_url = "https://en.wikipedia.org/";
// When fetching uncrawled pages, only rows whose `site` matches (`~`) this value are returned. An empty string ("") effectively disables the filter.
let crawl_like = "wikipedia";
let budget = 5; let budget = 5;
let mut crawled = 0; let mut crawled = 0;
@@ -76,7 +78,7 @@ async fn main() {
while crawled < budget { while crawled < budget {
let get_num = if budget - crawled < 100 { budget - crawled } else { 100 }; let get_num = if budget - crawled < 100 { budget - crawled } else { 100 };
let uncrawled = get_uncrawled_links(&db, get_num).await; let uncrawled = get_uncrawled_links(&db, get_num, crawl_like.to_string()).await;
if uncrawled.len() == 0 { if uncrawled.len() == 0 {
info!("Had more budget but finished crawling everything."); info!("Had more budget but finished crawling everything.");
return; return;
@@ -126,13 +128,16 @@ async fn get(
} }
/// Returns uncrawled links /// Returns uncrawled links
async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Website> { #[instrument(skip(db))]
async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize, param: String) -> Vec<Website> {
if count > 100 { if count > 100 {
count = 100 count = 100
} }
trace!("Getting uncrawled links");
let mut response = db let mut response = db
.query("SELECT * FROM website WHERE crawled = false LIMIT $count") .query("SELECT * FROM website WHERE crawled = false AND site ~ type::string($format) LIMIT $count;")
.bind(("format", param))
.bind(("count", count)) .bind(("count", count))
.await .await
.expect("Hard-coded query failed..?"); .expect("Hard-coded query failed..?");