This commit is contained in:
Oliver Atkinson 2024-12-12 14:59:54 -07:00
parent 215056e493
commit 298ad39a79

View File

@@ -50,9 +50,9 @@ async fn main() {
// Would probably take these in as parameters from a cli // Would probably take these in as parameters from a cli
let starting_url = "https://en.wikipedia.org/"; let starting_url = "https://en.wikipedia.org/";
// When getting uncrawled pages, name must be LIKE this variable. "" will effectively get ignored. // When getting uncrawled pages, name must contain this variable. "" will effectively get ignored.
let crawl_like = "wikipedia"; let crawl_filter = "https://en.wikipedia.org/";
let budget = 5; let budget = 50;
let mut crawled = 0; let mut crawled = 0;
let s3 = S3::connect(&config).await.expect("Failed to connect to minio, aborting."); let s3 = S3::connect(&config).await.expect("Failed to connect to minio, aborting.");
@@ -78,7 +78,7 @@ async fn main() {
while crawled < budget { while crawled < budget {
let get_num = if budget - crawled < 100 { budget - crawled } else { 100 }; let get_num = if budget - crawled < 100 { budget - crawled } else { 100 };
let uncrawled = get_uncrawled_links(&db, get_num, crawl_like.to_string()).await; let uncrawled = get_uncrawled_links(&db, get_num, crawl_filter.to_string()).await;
if uncrawled.len() == 0 { if uncrawled.len() == 0 {
info!("Had more budget but finished crawling everything."); info!("Had more budget but finished crawling everything.");
return; return;