Compare commits

4 Commits: 5b728bacd6 ... main

| Author | SHA1 | Date |
|---|---|---|
| | 2c339a36f9 | |
| | 73216f7003 | |
| | 1e59ebd5c4 | |
| | 52d5e101d0 | |

README.md (12 changed lines)
@@ -40,14 +40,18 @@ $EDITOR Crawler.toml
 - [x] Allow for storing asynchronously - dropping the "links to" logic fixes this need
 - [x] Control crawler via config file (no recompilation needed)
 
-3/17/25: Took >1hr to crawl 100 pages
+### Feats
 
-3/19/25: Took 20min to crawl 1000 pages
+3/17/25: Took >1hr to crawl 100 pages.
 
-3/20/25: Took 5min to crawl 1000 pages
+3/19/25: Took 20min to crawl 1000 pages.
+This meant we stored 1000 pages, 142,997 urls, and 1,425,798 links between the two.
 
-3/21/25: Took 3min to crawl 1000 pages
+3/20/25: Took 5min to crawl 1000 pages.
 
+3/21/25: Took 3min to crawl 1000 pages.
 
+7/.../25: Downloaded just shy of 12TB of data from a remote server.
+
 # About
 
@@ -52,7 +52,10 @@ pub async fn check_file_length(file: &PathBuf) -> Option<u64> {
             }
         },
         Err(err) => {
-            error!("Failed to open file for testing... {}", err);
+            match err.kind() {
+                ErrorKind::NotFound => {/* ignore */},
+                _ => warn!("Failed to open file to check length. {:?} {}", file, err),
+            }
         },
     }
     None
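
The file this hunk belongs to did not survive the page export, but the hunk header names `check_file_length`, and the change downgrades a missing file from an `error!` to a silent skip. As a minimal sketch only, assuming the helper opens the file with `tokio::fs::File` and reads the length from its metadata, and that `warn!` comes from the `tracing` crate (everything outside the visible `Err` arm is an assumption):

```rust
use std::{io::ErrorKind, path::PathBuf};
use tokio::fs::File;
use tracing::warn;

/// Sketch: return the on-disk length of `file`, or None when it cannot be read.
pub async fn check_file_length(file: &PathBuf) -> Option<u64> {
    match File::open(file).await {
        Ok(f) => match f.metadata().await {
            Ok(meta) => return Some(meta.len()),
            Err(err) => {
                warn!("Failed to read metadata of {:?} {}", file, err);
            }
        },
        Err(err) => {
            match err.kind() {
                // A page that has never been crawled is expected to be missing.
                ErrorKind::NotFound => { /* ignore */ }
                _ => warn!("Failed to open file to check length. {:?} {}", file, err),
            }
        }
    }
    None
}
```

Ignoring `ErrorKind::NotFound` lines up with the new `else` branch in `process` below, which just notes that the program will continue on its way, downloading content, when nothing usable is on disk.
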
src/main.rs (38 changed lines)
@@ -22,7 +22,7 @@ use opentelemetry_sdk::{metrics::SdkMeterProvider, trace::SdkTracerProvider};
 use serde::Deserialize;
 use surrealdb::{engine::remote::ws::Client, Surreal};
 use tokio::{
-    io::{AsyncWriteExt, BufWriter},
+    io::{AsyncReadExt, AsyncWriteExt, BufWriter},
     sync::RwLock,
     task::JoinSet,
 };
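
The only change here is the extra `AsyncReadExt` import. Tokio provides `read_to_end` through that extension trait, so the read-back-from-disk path added further down would not compile without it. A small illustration (the function and variable names are made up for the example):

```rust
use std::path::Path;
use tokio::io::AsyncReadExt; // brings `read_to_end` into scope for tokio::fs::File

// Illustrative only: read a previously downloaded page back into memory.
async fn read_cached_page(real_path: &Path) -> std::io::Result<Vec<u8>> {
    let mut buf = Vec::new();
    let mut file = tokio::fs::OpenOptions::new()
        .read(true)
        .open(real_path)
        .await?;
    file.read_to_end(&mut buf).await?; // provided by AsyncReadExt
    Ok(buf)
}
```
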
@@ -246,15 +246,14 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
                         if disk_len == len {
                             skip_download = true;
                         }
-                    }
-                }
-            }
-        }
-
-        if skip_download {
-            trace!("Skipping download...");
+                    } else {
+                        // File not found (or other error).
+                        // Program will continue on its way, downloading content.
+                    }
+                }
+            }
         }
 
         // make sure that the file is good to go
         if let Some(file) = filesystem::init(&tmp_path).await {
             // Get body from response
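
Taken together with the `check_file_length` change, the deleted and added lines imply that `skip_download` is set by comparing the server-reported length against the file already on disk. A sketch of that decision, with two assumptions: `len` comes from reqwest's `Response::content_length()`, and `check_file_length` is the helper sketched earlier (the real code does this inline inside `process`, not in a separate function):

```rust
use std::path::PathBuf;

// Invented wrapper around logic that lives inline in `process`.
async fn should_skip_download(response: &reqwest::Response, real_path: &PathBuf) -> bool {
    let mut skip_download = false;
    if let Some(len) = response.content_length() {
        if let Some(disk_len) = check_file_length(real_path).await {
            if disk_len == len {
                skip_download = true;
            }
        } else {
            // File not found (or other error).
            // Program will continue on its way, downloading content.
        }
    }
    skip_download
}
```
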
@@ -262,9 +261,26 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
             let mut stream = response.bytes_stream();
 
             let should_parse = real_path.to_string_lossy().ends_with(".html");
-                let mut writer = BufWriter::new(file);
+
+            let mut buf: Vec<u8> = Vec::new();
+
+            if skip_download && should_parse {
+                // since we are skipping the download we will just read the file off the disk to
+                // parse it
+                if let Ok(mut file) = tokio::fs::OpenOptions::new()
+                    .read(true)
+                    .open(&real_path).await
+                {
+                    if let Err(err) = file.read_to_end(&mut buf).await {
+                        warn!("Failed to read file off disk for parsing, {}", err);
+                    }
+                }
+            }
 
+            // !!!DOWNLOADING TIME!!!
+            if !skip_download {
+                let mut writer = BufWriter::new(file);
 
                 // Write file to disk
                 trace!("Writing at: {:?}", tmp_path);
                 BEING_STREAMED.add(1, &[]);
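
The actual download loop falls between the hunks, so it is not visible here. Purely as an illustration of the shape such a loop usually takes (not the project's code), the sketch below streams the response body into the `BufWriter` while keeping a copy in `buf` for the parser; it assumes reqwest's `stream` feature and `futures_util::StreamExt`:

```rust
use futures_util::StreamExt; // for `.next()` on the byte stream
use tokio::io::{AsyncWriteExt, BufWriter};

// Illustrative download loop: write each chunk to disk and, when the page
// will be parsed, keep the bytes in memory as well.
async fn stream_to_disk(
    response: reqwest::Response,
    file: tokio::fs::File,
    buf: &mut Vec<u8>,
    should_parse: bool,
) -> std::io::Result<()> {
    let mut stream = response.bytes_stream();
    let mut writer = BufWriter::new(file);

    while let Some(chunk) = stream.next().await {
        match chunk {
            Ok(bytes) => {
                if should_parse {
                    buf.extend_from_slice(&bytes);
                }
                writer.write_all(&bytes).await?;
            }
            Err(err) => {
                // Stand-in for the crawler's real error handling.
                eprintln!("stream error: {err}");
                break;
            }
        }
    }
    writer.flush().await?;
    Ok(())
}
```
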
@@ -299,6 +315,7 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
 
                 // stream_span.end();
                 BEING_STREAMED.add(-1, &[]);
+            }
 
             // (If needed) Parse the file
             if should_parse {
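
BEING_STREAMED is bumped before the download and decremented after it, the usual up-down-counter pattern for tracking in-flight work. Its definition is not part of this compare, so the following is only a guess at how such an instrument can be declared with the `opentelemetry` crate; the meter and instrument names are invented, `LazyLock` needs Rust 1.80+, and older opentelemetry releases finalize the builder with `.init()` rather than `.build()`:

```rust
use std::sync::LazyLock;
use opentelemetry::{global, metrics::UpDownCounter};

// Guessed declaration for an in-flight counter like BEING_STREAMED.
static BEING_STREAMED: LazyLock<UpDownCounter<i64>> = LazyLock::new(|| {
    global::meter("crawler")
        .i64_up_down_counter("pages_being_streamed")
        .build()
});

fn example() {
    BEING_STREAMED.add(1, &[]);  // entering the download section
    // ... stream the body to disk ...
    BEING_STREAMED.add(-1, &[]); // leaving it
}
```
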
@@ -330,7 +347,6 @@ async fn process(mut site: Website, db: Surreal<Client>, reqwest: reqwest::Clien
             site.status_code = code.as_u16();
             Website::store_all(vec![site.clone()], &db).await;
         }
-        }
     } else {
         error!(url = site.site.as_str(), "Failed to get: {}", &site.site);
     }
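
The last hunk records the HTTP status on the `Website` record and hands it to `Website::store_all`, whose body is not part of this compare. As a rough sketch only, assuming surrealdb 2.x (where `create(..).content(..)` resolves to an `Option<T>`) and a trimmed-down `Website` carrying just the fields visible in the diff:

```rust
use serde::{Deserialize, Serialize};
use surrealdb::{engine::remote::ws::Client, Surreal};

// Trimmed stand-in for the crawler's Website record: only the fields visible
// in this diff are included.
#[derive(Clone, Serialize, Deserialize)]
struct Website {
    site: String,
    status_code: u16,
}

impl Website {
    // Rough sketch of a bulk store; the real store_all is not shown in this
    // compare, so this simply creates one "website" record per entry.
    async fn store_all(all: Vec<Website>, db: &Surreal<Client>) {
        for site in all {
            let stored: Result<Option<Website>, surrealdb::Error> =
                db.create("website").content(site).await;
            if let Err(err) = stored {
                eprintln!("Failed to store website: {err}");
            }
        }
    }
}
```

The call site in the diff, `Website::store_all(vec![site.clone()], &db).await`, matches this shape: a batch of records plus a shared `Surreal<Client>` handle.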