no longer using spider, just writing my own crawler
This commit is contained in:
		
							
								
								
									
										1005
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										1005
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -4,4 +4,9 @@ version = "0.1.0"
 | 
				
			|||||||
edition = "2021"
 | 
					edition = "2021"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[dependencies]
 | 
					[dependencies]
 | 
				
			||||||
spider = { features = [], git="https://github.com/Rushmore75/spider.git", rev="ff91646973ad04ff423010f36206f550e37c4278" }
 | 
					html5ever = "0.29.0"
 | 
				
			||||||
 | 
					markup5ever_rcdom = "0.5.0-unofficial"
 | 
				
			||||||
 | 
					reqwest = "0.12.8"
 | 
				
			||||||
 | 
					tokio = { version="1.40.0", features = ["full"] }
 | 
				
			||||||
 | 
					tracing = "0.1.40"
 | 
				
			||||||
 | 
					tracing-subscriber = "0.3.18"
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										71
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										71
									
								
								src/main.rs
									
									
									
									
									
								
							@@ -1,35 +1,58 @@
 | 
				
			|||||||
use spider::{hashbrown::HashMap, tokio};
 | 
					extern crate markup5ever_rcdom as rcdom;
 | 
				
			||||||
 | 
					extern crate html5ever;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					use std::env;
 | 
				
			||||||
 | 
					use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
 | 
				
			||||||
 | 
					use rcdom::RcDom;
 | 
				
			||||||
 | 
					use tracing::{debug, info, trace, warn};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#[tokio::main]
 | 
					#[tokio::main]
 | 
				
			||||||
async fn main() {
 | 
					async fn main() {
 | 
				
			||||||
 | 
					    tracing_subscriber::fmt::init();
 | 
				
			||||||
 | 
					    debug!("Starting...");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    let args = std::env::args().collect::<Vec<String>>();
 | 
					    let url = "https://oliveratkinson.net";
 | 
				
			||||||
 | 
					    let budget = "10";
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    let url = &args[1];
 | 
					    let response = reqwest::get(url).await.unwrap();
 | 
				
			||||||
    let budget = &args[2];
 | 
					    let data = response.text().await.unwrap();
 | 
				
			||||||
    let budget = match budget.parse::<u32>() {
 | 
					
 | 
				
			||||||
        Ok(x) => x,
 | 
					
 | 
				
			||||||
        Err(_) => panic!("Second arg must be a int"),
 | 
					    let opts = ParseOpts {
 | 
				
			||||||
 | 
					        tree_builder: TreeBuilderOpts {
 | 
				
			||||||
 | 
					            drop_doctype: true,
 | 
				
			||||||
 | 
					            ..Default::default()
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        ..Default::default()
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let dom = parse_document(RcDom::default(), opts)
 | 
				
			||||||
    let mut site = spider::website::Website::new(url)
 | 
					        .from_utf8()
 | 
				
			||||||
        .with_budget(Some(HashMap::from([
 | 
					        .read_from(&mut data.as_bytes())
 | 
				
			||||||
            ("*", budget),
 | 
					 | 
				
			||||||
        ])))
 | 
					 | 
				
			||||||
        .with_tld(true)
 | 
					 | 
				
			||||||
        .with_on_link_find_callback(Some(|from, to| {
 | 
					 | 
				
			||||||
            let from = from.as_ref().to_string();
 | 
					 | 
				
			||||||
            let to = to.as_ref().to_string();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            let from = from.trim();
 | 
					 | 
				
			||||||
            let to= to.trim();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            println!("{from};->;{to}"); 
 | 
					 | 
				
			||||||
        }))
 | 
					 | 
				
			||||||
        .build()
 | 
					 | 
				
			||||||
        .unwrap();
 | 
					        .unwrap();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    site.crawl().await;
 | 
					    let a = &dom.document;
 | 
				
			||||||
 | 
					    warn!("Walking...");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    walk(a);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					fn walk(node: &rcdom::Handle) {
 | 
				
			||||||
 | 
					    match &node.data {
 | 
				
			||||||
 | 
					        rcdom::NodeData::Document => (),
 | 
				
			||||||
 | 
					        rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"),
 | 
				
			||||||
 | 
					        rcdom::NodeData::Text { contents } => {},
 | 
				
			||||||
 | 
					        rcdom::NodeData::Comment { contents } => debug!("comment"),
 | 
				
			||||||
 | 
					        rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
 | 
				
			||||||
 | 
					            attrs.borrow().iter().for_each(|attr| {
 | 
				
			||||||
 | 
					                let name = name.local.to_string();
 | 
				
			||||||
 | 
					                let internal = &*attr.value;
 | 
				
			||||||
 | 
					                debug!("element: {name}, attr: {internal}"); 
 | 
				
			||||||
 | 
					            });
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    node.children.borrow().iter().for_each(|n| walk(n));
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user