no longer using spider, just writing my own crawler

This commit is contained in:
Oliver Atkinson 2024-10-04 13:52:34 -06:00
parent 2d2b09116e
commit 974bccc457
3 changed files with 293 additions and 790 deletions

1005
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -4,4 +4,9 @@ version = "0.1.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
spider = { features = [], git="https://github.com/Rushmore75/spider.git", rev="ff91646973ad04ff423010f36206f550e37c4278" } html5ever = "0.29.0"
markup5ever_rcdom = "0.5.0-unofficial"
reqwest = "0.12.8"
tokio = { version="1.40.0", features = ["full"] }
tracing = "0.1.40"
tracing-subscriber = "0.3.18"

View File

@ -1,35 +1,58 @@
use spider::{hashbrown::HashMap, tokio}; extern crate markup5ever_rcdom as rcdom;
extern crate html5ever;
use std::env;
use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
use rcdom::RcDom;
use tracing::{debug, info, trace, warn};
#[tokio::main] #[tokio::main]
async fn main() { async fn main() {
tracing_subscriber::fmt::init();
debug!("Starting...");
let args = std::env::args().collect::<Vec<String>>(); let url = "https://oliveratkinson.net";
let budget = "10";
let response = reqwest::get(url).await.unwrap();
let data = response.text().await.unwrap();
let url = &args[1];
let budget = &args[2]; let opts = ParseOpts {
let budget = match budget.parse::<u32>() { tree_builder: TreeBuilderOpts {
Ok(x) => x, drop_doctype: true,
Err(_) => panic!("Second arg must be a int"), ..Default::default()
},
..Default::default()
}; };
let dom = parse_document(RcDom::default(), opts)
let mut site = spider::website::Website::new(url) .from_utf8()
.with_budget(Some(HashMap::from([ .read_from(&mut data.as_bytes())
("*", budget),
])))
.with_tld(true)
.with_on_link_find_callback(Some(|from, to| {
let from = from.as_ref().to_string();
let to = to.as_ref().to_string();
let from = from.trim();
let to= to.trim();
println!("{from};->;{to}");
}))
.build()
.unwrap(); .unwrap();
site.crawl().await; let a = &dom.document;
warn!("Walking...");
walk(a);
} }
/// Recursively walks the DOM tree rooted at `node` (depth-first),
/// logging what it finds at debug level.
///
/// Element nodes have each attribute value logged together with the tag
/// name; doctype, comment, and processing-instruction nodes are noted;
/// document and text nodes are currently skipped.
fn walk(node: &rcdom::Handle) {
    match &node.data {
        rcdom::NodeData::Document => (),
        // Field contents are not needed for logging; `..` avoids
        // unused-variable warnings on every bound field.
        rcdom::NodeData::Doctype { .. } => debug!("doctype"),
        // Text content is intentionally ignored for now.
        rcdom::NodeData::Text { .. } => {}
        rcdom::NodeData::Comment { .. } => debug!("comment"),
        rcdom::NodeData::Element { name, attrs, .. } => {
            // Convert the tag name once per element instead of once per
            // attribute (the original rebuilt the String inside the loop).
            let name = name.local.to_string();
            attrs.borrow().iter().for_each(|attr| {
                let internal = &*attr.value;
                debug!("element: {name}, attr: {internal}");
            });
        }
        rcdom::NodeData::ProcessingInstruction { .. } => debug!("ProcessingInstruction"),
    };
    // Recurse into all children.
    node.children.borrow().iter().for_each(|n| walk(n));
}