use std::default::Default;
use std::str::FromStr;

use html5ever::tokenizer::{BufferQueue, TokenizerResult};
use html5ever::tokenizer::{StartTag, TagToken};
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use html5ever::{local_name, tendril::*};
use surrealdb::engine::remote::ws::Client;
use surrealdb::Surreal;
use tracing::instrument;

use crate::db::Website;
use crate::Timer;

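// The `Website` itself acts as the token sink: html5ever calls `process_token`
// for every token it produces. Start tags that can reference another resource
// are mined for URLs, and each batch of discovered links is handed back to the
// caller of `Tokenizer::feed` via `TokenSinkResult::Script`.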
impl TokenSink for Website {
    // One batch of links discovered on the page.
    type Handle = Vec<Website>;

    fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
        match token {
            TagToken(tag) => {
                if tag.kind == StartTag {
                    match tag.name {
                        // Elements whose `src`, `href`, or `data` attributes
                        // can point at another resource.
                        local_name!("a")
                        | local_name!("audio")
                        | local_name!("area")
                        | local_name!("img")
                        | local_name!("link")
                        | local_name!("object")
                        | local_name!("source")
                        | local_name!("base")
                        | local_name!("video") => {
                            let mut links = Vec::new();
                            for attr in &tag.attrs {
                                let attr_name = attr.name.local.to_string();
                                if attr_name == "src" || attr_name == "href" || attr_name == "data"
                                {
                                    // Work on a clone of the current site object.
                                    let mut web = self.clone();

                                    // Resolve the attribute against the page's URL,
                                    // stripping any `#fragment` first so links that
                                    // differ only by fragment collapse to one page.
                                    let mut url = web.site;
                                    url.set_fragment(None);
                                    let joined = url
                                        .join(&attr.value)
                                        .expect("Failed to join url during parsing!");
                                    web.site = joined;

                                    web.crawled = false;

                                    links.push(web);
                                }
                            }

                            return TokenSinkResult::Script(links);
                        }
local_name!("button") | local_name!("meta") | local_name!("iframe") => {
|
|
// dbg!(attrs);
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
TokenSinkResult::Continue
|
|
}
|
|
}
|
|
|
|
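/// Parses the raw HTML in `data`, which should be the body of `site`'s page.
///
/// Marks `site` as crawled and stores it, tokenizes the page to collect every
/// outbound link, stores each linked `Website`, and finally records the
/// `links_to` edges in the database.
///
/// A minimal usage sketch. The SurrealDB connection is the crate's documented
/// pattern; `Website::new` and the reqwest fetch are hypothetical stand-ins
/// for however this crate actually builds sites and downloads pages:
///
/// ```ignore
/// use surrealdb::engine::remote::ws::Ws;
///
/// let db = Surreal::new::<Ws>("127.0.0.1:8000").await?;
/// db.use_ns("crawler").use_db("crawler").await?;
///
/// let mut site = Website::new("https://example.com/"); // hypothetical constructor
/// let html = reqwest::get("https://example.com/").await?.text().await?;
/// parse(&db, &mut site, &html).await;
/// ```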
#[instrument(skip_all)]
pub async fn parse(db: &Surreal<Client>, site: &mut Website, data: &str) {
    // Update self in the db: this page has now been crawled.
    site.set_crawled();
    site.store(db).await;

    // Prep work.
    let mut other_sites: Vec<Website> = Vec::new();
    {
        // A block, so the tokenizer's non-Send state is dropped before the
        // database calls below; otherwise the compiler complains about
        // holding it across an `.await`.
        let _t = Timer::start("Parsed page");

        // Change the data into something that can be tokenized.
        let chunk = Tendril::from_str(data).expect("Failed to parse string into Tendril!");
        // Create a buffer of tokens and push our input into it.
        let mut token_buffer = BufferQueue::default();
        token_buffer.push_back(
            chunk
                .try_reinterpret::<fmt::UTF8>()
                .expect("Failed to reinterpret chunk!"),
        );
        // Create the tokenizer, using a clone of this site as the sink.
        let tokenizer = Tokenizer::new(site.clone(), TokenizerOpts::default());

        // Work through the buffer: each `Script` result carries the links
        // found so far, so keep feeding until the input is exhausted.
        while let TokenizerResult::Script(mut sites) = tokenizer.feed(&mut token_buffer) {
            other_sites.append(&mut sites);
        }

        assert!(token_buffer.is_empty());
        tokenizer.end();
    }

    {
        // Store every discovered site, keeping the ids that come back so the
        // outbound edges from this page can be recorded.
        let mut links_to = Vec::with_capacity(other_sites.len());

        for a in other_sites {
            let other = a.store(db).await;
            if let Some(o) = other {
                links_to.push(o);
            }
        }

        site.links_to(links_to, db).await;
    }
}