No longer using spider, just writing my own crawler
This commit is contained in:
parent
2d2b09116e
commit
974bccc457
1005
Cargo.lock
generated
1005
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -4,4 +4,9 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
spider = { features = [], git="https://github.com/Rushmore75/spider.git", rev="ff91646973ad04ff423010f36206f550e37c4278" }
|
||||
html5ever = "0.29.0"
|
||||
markup5ever_rcdom = "0.5.0-unofficial"
|
||||
reqwest = "0.12.8"
|
||||
tokio = { version="1.40.0", features = ["full"] }
|
||||
tracing = "0.1.40"
|
||||
tracing-subscriber = "0.3.18"
|
||||
|
71
src/main.rs
71
src/main.rs
@ -1,35 +1,58 @@
|
||||
use spider::{hashbrown::HashMap, tokio};
|
||||
extern crate markup5ever_rcdom as rcdom;
|
||||
extern crate html5ever;
|
||||
|
||||
use std::env;
|
||||
use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
|
||||
use rcdom::RcDom;
|
||||
use tracing::{debug, info, trace, warn};
|
||||
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
tracing_subscriber::fmt::init();
|
||||
debug!("Starting...");
|
||||
|
||||
let args = std::env::args().collect::<Vec<String>>();
|
||||
let url = "https://oliveratkinson.net";
|
||||
let budget = "10";
|
||||
|
||||
let response = reqwest::get(url).await.unwrap();
|
||||
let data = response.text().await.unwrap();
|
||||
|
||||
let url = &args[1];
|
||||
let budget = &args[2];
|
||||
let budget = match budget.parse::<u32>() {
|
||||
Ok(x) => x,
|
||||
Err(_) => panic!("Second arg must be a int"),
|
||||
|
||||
let opts = ParseOpts {
|
||||
tree_builder: TreeBuilderOpts {
|
||||
drop_doctype: true,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
|
||||
let mut site = spider::website::Website::new(url)
|
||||
.with_budget(Some(HashMap::from([
|
||||
("*", budget),
|
||||
])))
|
||||
.with_tld(true)
|
||||
.with_on_link_find_callback(Some(|from, to| {
|
||||
let from = from.as_ref().to_string();
|
||||
let to = to.as_ref().to_string();
|
||||
|
||||
let from = from.trim();
|
||||
let to= to.trim();
|
||||
|
||||
println!("{from};->;{to}");
|
||||
}))
|
||||
.build()
|
||||
let dom = parse_document(RcDom::default(), opts)
|
||||
.from_utf8()
|
||||
.read_from(&mut data.as_bytes())
|
||||
.unwrap();
|
||||
|
||||
site.crawl().await;
|
||||
let a = &dom.document;
|
||||
warn!("Walking...");
|
||||
|
||||
walk(a);
|
||||
}
|
||||
|
||||
fn walk(node: &rcdom::Handle) {
|
||||
match &node.data {
|
||||
rcdom::NodeData::Document => (),
|
||||
rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"),
|
||||
rcdom::NodeData::Text { contents } => {},
|
||||
rcdom::NodeData::Comment { contents } => debug!("comment"),
|
||||
rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
|
||||
attrs.borrow().iter().for_each(|attr| {
|
||||
let name = name.local.to_string();
|
||||
let internal = &*attr.value;
|
||||
debug!("element: {name}, attr: {internal}");
|
||||
});
|
||||
},
|
||||
rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
|
||||
};
|
||||
|
||||
node.children.borrow().iter().for_each(|n| walk(n));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user