updating deps
This commit is contained in:
		
							
								
								
									
										591
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										591
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										14
									
								
								Cargo.toml
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								Cargo.toml
									
									
									
									
									
								
							@@ -4,13 +4,13 @@ version = "0.1.0"
 | 
				
			|||||||
edition = "2021"
 | 
					edition = "2021"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[dependencies]
 | 
					[dependencies]
 | 
				
			||||||
html5ever = "0.29.0"
 | 
					html5ever = "0.29"
 | 
				
			||||||
# minio = "0.1.0"
 | 
					# minio = "0.1.0"
 | 
				
			||||||
minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"}
 | 
					minio = {git="https://github.com/minio/minio-rs.git", rev = "c28f576"}
 | 
				
			||||||
reqwest = "0.12.9"
 | 
					reqwest = "0.12"
 | 
				
			||||||
serde = { version = "1.0.214", features = ["derive"] }
 | 
					serde = { version = "1.0", features = ["derive"] }
 | 
				
			||||||
surrealdb = "2.0.4"
 | 
					surrealdb = "2.1"
 | 
				
			||||||
tokio = { version="1.41.0", features = ["full"] }
 | 
					tokio = { version="1.41.0", features = ["full"] }
 | 
				
			||||||
tracing = "0.1.40"
 | 
					tracing = "0.1"
 | 
				
			||||||
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
 | 
					tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 | 
				
			||||||
url = { version = "2.5.3", features = ["serde"] }
 | 
					url = { version = "2.5", features = ["serde"] }
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										17
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										17
									
								
								src/main.rs
									
									
									
									
									
								
							@@ -31,8 +31,8 @@ async fn main() {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    tracing_subscriber::fmt()
 | 
					    tracing_subscriber::fmt()
 | 
				
			||||||
        .with_env_filter(EnvFilter::from_default_env())
 | 
					        .with_env_filter(EnvFilter::from_default_env())
 | 
				
			||||||
        .with_line_number(true)
 | 
					        .with_line_number(false)
 | 
				
			||||||
        // .without_time()
 | 
					        .without_time()
 | 
				
			||||||
        .init();
 | 
					        .init();
 | 
				
			||||||
    debug!("Starting...");
 | 
					    debug!("Starting...");
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
@@ -49,7 +49,9 @@ async fn main() {
 | 
				
			|||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Would probably take these in as parameters from a cli
 | 
					    // Would probably take these in as parameters from a cli
 | 
				
			||||||
    let starting_url = "https://oliveratkinson.net/";
 | 
					    let starting_url = "https://en.wikipedia.org/";
 | 
				
			||||||
 | 
					    // When fetching uncrawled pages, only those whose `site` field matches (`~`) this value are returned. An empty string "" is effectively ignored.
 | 
				
			||||||
 | 
					    let crawl_like = "wikipedia";
 | 
				
			||||||
    let budget = 5;
 | 
					    let budget = 5;
 | 
				
			||||||
    let mut crawled = 0;
 | 
					    let mut crawled = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -76,7 +78,7 @@ async fn main() {
 | 
				
			|||||||
    while crawled < budget {
 | 
					    while crawled < budget {
 | 
				
			||||||
        let get_num = if budget - crawled < 100 { budget - crawled } else { 100 };
 | 
					        let get_num = if budget - crawled < 100 { budget - crawled } else { 100 };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        let uncrawled = get_uncrawled_links(&db, get_num).await;
 | 
					        let uncrawled = get_uncrawled_links(&db, get_num, crawl_like.to_string()).await;
 | 
				
			||||||
        if uncrawled.len() == 0 {
 | 
					        if uncrawled.len() == 0 {
 | 
				
			||||||
            info!("Had more budget but finished crawling everything.");
 | 
					            info!("Had more budget but finished crawling everything.");
 | 
				
			||||||
            return;
 | 
					            return;
 | 
				
			||||||
@@ -126,13 +128,16 @@ async fn get(
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/// Returns uncrawled links
 | 
					/// Returns uncrawled links
 | 
				
			||||||
async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Website> {
 | 
					#[instrument(skip(db))]
 | 
				
			||||||
 | 
					async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize, param: String) -> Vec<Website> {
 | 
				
			||||||
    if count > 100 {
 | 
					    if count > 100 {
 | 
				
			||||||
        count = 100
 | 
					        count = 100
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					    trace!("Getting uncrawled links");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    let mut response = db
 | 
					    let mut response = db
 | 
				
			||||||
        .query("SELECT * FROM website WHERE crawled = false LIMIT $count")
 | 
					        .query("SELECT * FROM website WHERE crawled = false AND site ~ type::string($format) LIMIT $count;")
 | 
				
			||||||
 | 
					        .bind(("format", param))
 | 
				
			||||||
        .bind(("count", count))
 | 
					        .bind(("count", count))
 | 
				
			||||||
        .await
 | 
					        .await
 | 
				
			||||||
        .expect("Hard-coded query failed..?");
 | 
					        .expect("Hard-coded query failed..?");
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user