No longer using spider, just writing my own crawler
This commit is contained in:
parent
2d2b09116e
commit
974bccc457
1005
Cargo.lock
generated
1005
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -4,4 +4,9 @@ version = "0.1.0"
|
|||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
spider = { features = [], git="https://github.com/Rushmore75/spider.git", rev="ff91646973ad04ff423010f36206f550e37c4278" }
|
html5ever = "0.29.0"
|
||||||
|
markup5ever_rcdom = "0.5.0-unofficial"
|
||||||
|
reqwest = "0.12.8"
|
||||||
|
tokio = { version="1.40.0", features = ["full"] }
|
||||||
|
tracing = "0.1.40"
|
||||||
|
tracing-subscriber = "0.3.18"
|
||||||
|
71
src/main.rs
71
src/main.rs
@ -1,35 +1,58 @@
|
|||||||
use spider::{hashbrown::HashMap, tokio};
|
extern crate markup5ever_rcdom as rcdom;
|
||||||
|
extern crate html5ever;
|
||||||
|
|
||||||
|
use std::env;
|
||||||
|
use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
|
||||||
|
use rcdom::RcDom;
|
||||||
|
use tracing::{debug, info, trace, warn};
|
||||||
|
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() {
|
async fn main() {
|
||||||
|
tracing_subscriber::fmt::init();
|
||||||
|
debug!("Starting...");
|
||||||
|
|
||||||
let args = std::env::args().collect::<Vec<String>>();
|
let url = "https://oliveratkinson.net";
|
||||||
|
let budget = "10";
|
||||||
|
|
||||||
|
let response = reqwest::get(url).await.unwrap();
|
||||||
|
let data = response.text().await.unwrap();
|
||||||
|
|
||||||
let url = &args[1];
|
|
||||||
let budget = &args[2];
|
let opts = ParseOpts {
|
||||||
let budget = match budget.parse::<u32>() {
|
tree_builder: TreeBuilderOpts {
|
||||||
Ok(x) => x,
|
drop_doctype: true,
|
||||||
Err(_) => panic!("Second arg must be a int"),
|
..Default::default()
|
||||||
|
},
|
||||||
|
..Default::default()
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let dom = parse_document(RcDom::default(), opts)
|
||||||
let mut site = spider::website::Website::new(url)
|
.from_utf8()
|
||||||
.with_budget(Some(HashMap::from([
|
.read_from(&mut data.as_bytes())
|
||||||
("*", budget),
|
|
||||||
])))
|
|
||||||
.with_tld(true)
|
|
||||||
.with_on_link_find_callback(Some(|from, to| {
|
|
||||||
let from = from.as_ref().to_string();
|
|
||||||
let to = to.as_ref().to_string();
|
|
||||||
|
|
||||||
let from = from.trim();
|
|
||||||
let to= to.trim();
|
|
||||||
|
|
||||||
println!("{from};->;{to}");
|
|
||||||
}))
|
|
||||||
.build()
|
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
site.crawl().await;
|
let a = &dom.document;
|
||||||
|
warn!("Walking...");
|
||||||
|
|
||||||
|
walk(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Recursively visits `node` and then each of its children, emitting
/// `debug!` log lines that describe what was found.
///
/// - `Document` and `Text` nodes produce no output.
/// - `Doctype`, `Comment` and `ProcessingInstruction` nodes log a fixed marker string.
/// - `Element` nodes log one line per attribute, containing the element's
///   local name and the attribute's value (the attribute's own name is not logged).
///
/// The node itself is handled before its children. The function calls itself
/// once per child, so the call depth follows the nesting depth of the tree.
// NOTE(review): unbounded recursion on a very deeply nested document could
// exhaust the stack — confirm whether input depth needs to be limited.
fn walk(node: &rcdom::Handle) {
|
||||||
|
match &node.data {
|
||||||
|
rcdom::NodeData::Document => (),
|
||||||
|
rcdom::NodeData::Doctype { name, public_id, system_id } => debug!("doctype"),
|
||||||
|
// Text content is currently ignored.
rcdom::NodeData::Text { contents } => {},
|
||||||
|
rcdom::NodeData::Comment { contents } => debug!("comment"),
|
||||||
|
rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
|
||||||
|
// One log line per attribute; an element without attributes logs nothing.
attrs.borrow().iter().for_each(|attr| {
|
||||||
|
// Shadows the element's qualified name with just its local part as a String.
let name = name.local.to_string();
|
||||||
|
// Borrow the attribute value as a plain string slice for formatting.
let internal = &*attr.value;
|
||||||
|
debug!("element: {name}, attr: {internal}");
|
||||||
|
});
|
||||||
|
},
|
||||||
|
rcdom::NodeData::ProcessingInstruction { target, contents } => debug!("ProcessingInstruction"),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Descend into every child after the current node has been handled.
node.children.borrow().iter().for_each(|n| walk(n));
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user