updating deps
This commit is contained in:
parent
c1c8cf07bb
commit
22be3b2f61
591
Cargo.lock
generated
591
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
14
Cargo.toml
14
Cargo.toml
@ -4,13 +4,13 @@ version = "0.1.0"
|
|||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
html5ever = "0.29.0"
|
html5ever = "0.29"
|
||||||
# minio = "0.1.0"
|
# minio = "0.1.0"
|
||||||
minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"}
|
minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"}
|
||||||
reqwest = "0.12.9"
|
reqwest = "0.12"
|
||||||
serde = { version = "1.0.214", features = ["derive"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
surrealdb = "2.0.4"
|
surrealdb = "2.1"
|
||||||
tokio = { version="1.41.0", features = ["full"] }
|
tokio = { version="1.41.0", features = ["full"] }
|
||||||
tracing = "0.1.40"
|
tracing = "0.1"
|
||||||
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||||
url = { version = "2.5.3", features = ["serde"] }
|
url = { version = "2.5", features = ["serde"] }
|
||||||
|
17
src/main.rs
17
src/main.rs
@ -31,8 +31,8 @@ async fn main() {
|
|||||||
|
|
||||||
tracing_subscriber::fmt()
|
tracing_subscriber::fmt()
|
||||||
.with_env_filter(EnvFilter::from_default_env())
|
.with_env_filter(EnvFilter::from_default_env())
|
||||||
.with_line_number(true)
|
.with_line_number(false)
|
||||||
// .without_time()
|
.without_time()
|
||||||
.init();
|
.init();
|
||||||
debug!("Starting...");
|
debug!("Starting...");
|
||||||
|
|
||||||
@ -49,7 +49,9 @@ async fn main() {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Would probably take these in as parameters from a cli
|
// Would probably take these in as parameters from a cli
|
||||||
let starting_url = "https://oliveratkinson.net/";
|
let starting_url = "https://en.wikipedia.org/";
|
||||||
|
// When getting uncrawled pages, name must be LIKE this variable. "" will effectively get ignored.
|
||||||
|
let crawl_like = "wikipedia";
|
||||||
let budget = 5;
|
let budget = 5;
|
||||||
let mut crawled = 0;
|
let mut crawled = 0;
|
||||||
|
|
||||||
@ -76,7 +78,7 @@ async fn main() {
|
|||||||
while crawled < budget {
|
while crawled < budget {
|
||||||
let get_num = if budget - crawled < 100 { budget - crawled } else { 100 };
|
let get_num = if budget - crawled < 100 { budget - crawled } else { 100 };
|
||||||
|
|
||||||
let uncrawled = get_uncrawled_links(&db, get_num).await;
|
let uncrawled = get_uncrawled_links(&db, get_num, crawl_like.to_string()).await;
|
||||||
if uncrawled.len() == 0 {
|
if uncrawled.len() == 0 {
|
||||||
info!("Had more budget but finished crawling everything.");
|
info!("Had more budget but finished crawling everything.");
|
||||||
return;
|
return;
|
||||||
@ -126,13 +128,16 @@ async fn get(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Returns uncrawled links
|
/// Returns uncrawled links
|
||||||
async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Website> {
|
#[instrument(skip(db))]
|
||||||
|
async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize, param: String) -> Vec<Website> {
|
||||||
if count > 100 {
|
if count > 100 {
|
||||||
count = 100
|
count = 100
|
||||||
}
|
}
|
||||||
|
trace!("Getting uncrawled links");
|
||||||
|
|
||||||
let mut response = db
|
let mut response = db
|
||||||
.query("SELECT * FROM website WHERE crawled = false LIMIT $count")
|
.query("SELECT * FROM website WHERE crawled = false AND site ~ type::string($format) LIMIT $count;")
|
||||||
|
.bind(("format", param))
|
||||||
.bind(("count", count))
|
.bind(("count", count))
|
||||||
.await
|
.await
|
||||||
.expect("Hard-coded query failed..?");
|
.expect("Hard-coded query failed..?");
|
||||||
|
Loading…
Reference in New Issue
Block a user