no longer using spider, just writing my own crawler
This commit is contained in:
		
							
								
								
									
Cargo.lock (generated): 1005 changes. File diff suppressed because it is too large.

Cargo.toml:
@@ -4,4 +4,9 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-spider = { features = [], git="https://github.com/Rushmore75/spider.git", rev="ff91646973ad04ff423010f36206f550e37c4278" }
+html5ever = "0.29.0"
+markup5ever_rcdom = "0.5.0-unofficial"
+reqwest = "0.12.8"
+tokio = { version="1.40.0", features = ["full"] }
+tracing = "0.1.40"
+tracing-subscriber = "0.3.18"
							
								
								
									
src/main.rs: 71 changes
@@ -1,35 +1,58 @@
-use spider::{hashbrown::HashMap, tokio};
+extern crate markup5ever_rcdom as rcdom;
+extern crate html5ever;
+
+use std::env;
+use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
+use rcdom::RcDom;
+use tracing::{debug, info, trace, warn};
+
 
 #[tokio::main]
 async fn main() {
+    tracing_subscriber::fmt::init();
+    debug!("Starting...");
+
     let args = std::env::args().collect::<Vec<String>>();
     let url = "https://oliveratkinson.net";
     let budget = "10";
     
     let url = &args[1];
     let budget = &args[2];
     let budget = match budget.parse::<u32>() {
         Ok(x) => x,
         Err(_) => panic!("Second arg must be a int"),
     };
+
+    let response = reqwest::get(url).await.unwrap();
+    let data = response.text().await.unwrap();
+
+    let opts = ParseOpts {
+        tree_builder: TreeBuilderOpts {
+            drop_doctype: true,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
 
-    let mut site = spider::website::Website::new(url)
-        .with_budget(Some(HashMap::from([
-            ("*", budget),
-        ])))
-        .with_tld(true)
-        .with_on_link_find_callback(Some(|from, to| {
-            let from = from.as_ref().to_string();
-            let to = to.as_ref().to_string();
-
-            let from = from.trim();
-            let to= to.trim();
-
-            println!("{from};->;{to}");
-        }))
-        .build()
+    let dom = parse_document(RcDom::default(), opts)
+        .from_utf8()
+        .read_from(&mut data.as_bytes())
+        .unwrap();
 
-    site.crawl().await;
+    let a = &dom.document;
+    warn!("Walking...");
+
+    walk(a);
 }
+
+fn walk(node: &rcdom::Handle) {
+    match &node.data {
+        rcdom::NodeData::Document => (),
+        rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"),
+        rcdom::NodeData::Text { contents } => {},
+        rcdom::NodeData::Comment { contents } => debug!("comment"),
+        rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
+            attrs.borrow().iter().for_each(|attr| {
+                let name = name.local.to_string();
+                let internal = &*attr.value;
+                debug!("element: {name}, attr: {internal}");
+            });
+        },
+        rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
+    };
+
+    node.children.borrow().iter().for_each(|n| walk(n));
+}
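For reference, the new binary takes the target URL as its first argument and a numeric budget as its second, so a typical invocation would look like: cargo run -- https://oliveratkinson.net 10 (the parsed budget does not appear to be consumed by the new fetch-and-parse path in this hunk). Below is a minimal, self-contained sketch of the same reqwest + html5ever pipeline, with a hypothetical collect_links helper showing where link extraction for the crawler could hook in; the helper and the hard-coded URL are illustrative assumptions, not part of this commit.

extern crate html5ever;
extern crate markup5ever_rcdom as rcdom;

use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
use rcdom::{Handle, NodeData, RcDom};

#[tokio::main]
async fn main() {
    // Illustrative hard-coded URL; the committed code reads it from args[1].
    let url = "https://oliveratkinson.net";
    let data = reqwest::get(url).await.unwrap().text().await.unwrap();

    let opts = ParseOpts {
        tree_builder: TreeBuilderOpts { drop_doctype: true, ..Default::default() },
        ..Default::default()
    };
    let dom = parse_document(RcDom::default(), opts)
        .from_utf8()
        .read_from(&mut data.as_bytes())
        .unwrap();

    // Same recursive walk as the committed `walk`, but collecting href values
    // instead of logging every attribute.
    let mut links = Vec::new();
    collect_links(&dom.document, &mut links);
    for link in links {
        println!("{link}");
    }
}

// Hypothetical helper (not in this commit): gather every href value in the DOM.
fn collect_links(node: &Handle, out: &mut Vec<String>) {
    if let NodeData::Element { attrs, .. } = &node.data {
        for attr in attrs.borrow().iter() {
            if &*attr.name.local == "href" {
                out.push(attr.value.to_string());
            }
        }
    }
    for child in node.children.borrow().iter() {
        collect_links(child, out);
    }
}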