internet_mapper/src/main.rs

extern crate html5ever;
extern crate markup5ever_rcdom as rcdom;

use db::{connect, Website};
use html5ever::{
    local_name, parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts,
};
use rcdom::RcDom;
use std::time::Instant;
use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};
use tracing::{debug, info, instrument, trace, trace_span};
use tracing_subscriber::EnvFilter;

mod db;

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt()
        .with_env_filter(EnvFilter::from_default_env())
        .with_line_number(true)
        .without_time()
        .init();
    debug!("Starting...");

    // Would probably take these in as parameters from a cli
    let url = "https://oliveratkinson.net/";
    // let url = "http://localhost:5500";
    let budget = 1000;
    let mut crawled = 0;

    let db = connect().await.expect("Failed to connect to db, aborting.");

    let client = reqwest::Client::builder()
        // .use_rustls_tls()
        .build()
        .unwrap();

    // Kick off the whole machine - This Website object doesn't matter, it's just to allow for
    // get() to work.
    let span = trace_span!("Pre-Loop");
    let pre_loop_span = span.enter();
    // Download the site
    let mut site = Website::new(&url, false);
    get(&mut site, &db, &client, &mut crawled).await;

    drop(pre_loop_span);

    let span = trace_span!("Loop");
    let span = span.enter();
    while crawled < budget {
        let get_num = if budget - crawled < 100 {
            budget - crawled
        } else {
            100
        };

        let uncrawled = get_uncrawled_links(&db, get_num).await;
        if uncrawled.len() == 0 {
            info!("Had more budget but finished crawling everything.");
            return;
        }
        debug!("Crawling {} pages...", uncrawled.len());

        let span = trace_span!("Crawling");
        let _ = span.enter();

        for mut site in uncrawled {
            get(&mut site, &db, &client, &mut crawled).await;
            let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32);
            info!("Crawled {crawled} out of {budget} pages. ({percent})");
        }
    }
    drop(span);

    info!("Done");
}

#[instrument(skip_all)]
/// A quick helper function for downloading a url
async fn get(
    site: &mut Website,
    db: &Surreal<Client>,
    request_client: &reqwest::Client,
    count: &mut usize,
) {
    trace!("Get: {}", site.to_string());
    let timer = Timer::start("Got page");

    if let Ok(response) = request_client.get(site.to_string()).send().await {
        timer.stop();

        // Get body
        let data = response.text().await.unwrap();
        let opts = ParseOpts {
            tree_builder: TreeBuilderOpts {
                drop_doctype: true,
                ..Default::default()
            },
            ..Default::default()
        };
        // Get DOM
        let dom = parse_document(RcDom::default(), opts)
            .from_utf8()
            .read_from(&mut data.as_bytes())
            .unwrap();

        // TODO save the dom to minio if a flag is set

        // Modify record in database
        site.set_crawled();
        site.store(db).await;
        trace!("Got: {}", site.to_string());

        // Walk all the children nodes, searching for links to other pages.
        let mut buffer = Vec::new();
        let timer = Timer::start("Walked");
        walk(&dom.document, &db, &site, &mut buffer).await;
        timer.stop();

        // Put all the found links into the database.
        site.links_to(buffer, &db).await;
        *count += 1;
    }
    trace!("Failed to get: {}", site.to_string());
}

/// Walks the givin site, placing it's findings in the database
async fn walk(
    node: &rcdom::Handle,
    db: &Surreal<Client>,
    site: &Website,
    links_to: &mut Vec<Thing>,
) {
    let span = trace_span!("Walk");
    let span = span.enter();
    // Match each node - node basically means element.
    match &node.data {
        rcdom::NodeData::Element { name, attrs, .. } => {
            for attr in attrs.borrow().clone() {
                match name.local {
                    local_name!("a")
                    | local_name!("audio")
                    | local_name!("area")
                    | local_name!("img")
                    | local_name!("link")
                    | local_name!("object")
                    | local_name!("source")
                    | local_name!("base")
                    | local_name!("video") => {
                        let attribute_name = attr.name.local.to_string();
                        if attribute_name == "src"
                            || attribute_name == "href"
                            || attribute_name == "data"
                        {
                            // Get clone of the current site object
                            let mut web = site.clone();

                            // Set url
                            let url = web.mut_url();
                            url.set_fragment(None); // removes #xyz
                            let joined = url.join(&attr.value).unwrap();
                            *url = joined;

                            // Set other attributes
                            web.crawled = false;
                            // TODO set element name
                            // let element_name = name.local.to_string();

                            if let Some(id) = web.store(db).await {
                                links_to.push(id);
                            }
                        }
                    }
                    local_name!("button") | local_name!("meta") | local_name!("iframe") => {
                        // dbg!(attrs);
                    }
                    _ => {}
                };
            }
        }
        _ => {}
    };
    drop(span);
    for child in node.children.borrow().iter() {
        Box::pin(walk(child, db, site, links_to)).await;
    }
}

/// Fetches `website` records whose `crawled` flag is false.
///
/// At most `count` records are requested, and never more than 100 regardless of `count`.
///
/// # Panics
///
/// Panics if the query fails or if the returned rows cannot be parsed into [`Website`]s.
async fn get_uncrawled_links(db: &Surreal<Client>, count: usize) -> Vec<Website> {
    // Cap the batch size no matter what the caller asked for.
    let limit = count.min(100);

    let pending = db
        .query("SELECT * FROM website WHERE crawled = false LIMIT $count")
        .bind(("count", limit));
    let mut response = pending.await.expect("Hard-coded query failed..?");

    // The query has a single statement, so its rows are at index 0.
    let websites: Vec<Website> = response
        .take(0)
        .expect("Returned websites couldn't be parsed");
    websites
}

/// Measures the time since it was created and reports it through `debug!`.
///
/// The elapsed time is reported by [`Timer::stop`]. A timer that was never stopped reports
/// once when it is dropped; a timer that was stopped explicitly stays silent on drop, so a
/// measurement is not logged a second time with a later, misleading duration.
pub struct Timer<'a> {
    /// Moment the timer was created.
    start: Instant,
    /// Label written in front of the elapsed time.
    msg: &'a str,
    /// Whether `stop` has already reported. An atomic (rather than a `Cell`) keeps the
    /// type `Sync`, as it was before this flag existed.
    stopped: std::sync::atomic::AtomicBool,
}

impl<'a> Timer<'a> {
    /// Starts a new timer labelled with `msg`.
    #[inline]
    pub fn start(msg: &'a str) -> Self {
        Self {
            start: Instant::now(),
            msg,
            stopped: std::sync::atomic::AtomicBool::new(false),
        }
    }

    /// Logs `"<msg> in <elapsed>ms"` at debug level and returns the elapsed milliseconds.
    ///
    /// The timer keeps running; every explicit call logs and returns the time elapsed since
    /// the timer was started.
    pub fn stop(&self) -> f64 {
        let micros = self.start.elapsed().as_micros();
        let ms = micros as f64 / 1000.;
        debug!("{} in {:.3}ms", self.msg, ms);
        self.stopped
            .store(true, std::sync::atomic::Ordering::Relaxed);
        ms
    }
}

impl Drop for Timer<'_> {
    /// Reports the elapsed time unless `stop` already did.
    fn drop(&mut self) {
        if !*self.stopped.get_mut() {
            self.stop();
        }
    }
}
no longer using spider, just wiritng my own crawler 2024-10-04 19:52:34 +00:00			`extern crate html5ever;`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`extern crate markup5ever_rcdom as rcdom;`
add 2024-08-23 11:22:49 +00:00
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`use db::{connect, Website};`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`use html5ever::{`
			`local_name, parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts,`
			`};`
			`use rcdom::RcDom;`
			`use std::time::Instant;`
it works :party: 2024-11-10 06:30:57 +00:00			`use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`use tracing::{debug, info, instrument, trace, trace_span};`
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`use tracing_subscriber::EnvFilter;`
add 2024-08-23 11:22:49 +00:00
crawling :spider: 2024-10-07 17:14:56 +00:00			`mod db;`
add 2024-08-23 11:22:49 +00:00
no longer using spider, just wiritng my own crawler 2024-10-04 19:52:34 +00:00			`#[tokio::main]`
			`async fn main() {`
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`tracing_subscriber::fmt()`
			`.with_env_filter(EnvFilter::from_default_env())`
			`.with_line_number(true)`
			`.without_time()`
			`.init();`
no longer using spider, just wiritng my own crawler 2024-10-04 19:52:34 +00:00			`debug!("Starting...");`

works more, but still not all the way 2024-11-09 18:30:32 +00:00			`// Would probably take these in as parameters from a cli`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`let url = "https://oliveratkinson.net/";`
			`// let url = "http://localhost:5500";`
			`let budget = 1000;`
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`let mut crawled = 0;`

crawling :spider: 2024-10-07 17:14:56 +00:00			`let db = connect().await.expect("Failed to connect to db, aborting.");`

use reqwest client for epic speedup 2024-11-11 03:37:00 +00:00			`let client = reqwest::Client::builder()`
			`// .use_rustls_tls()`
			`.build()`
			`.unwrap();`

works more, but still not all the way 2024-11-09 18:30:32 +00:00			`// Kick off the whole machine - This Website object doesn't matter, it's just to allow for`
			`// get() to work.`
it works now 2024-11-09 22:28:10 +00:00			`let span = trace_span!("Pre-Loop");`
			`let pre_loop_span = span.enter();`
added support for nearly all html tags that can have a link 2024-11-13 00:50:06 +00:00			`// Download the site`
it works now 2024-11-09 22:28:10 +00:00			`let mut site = Website::new(&url, false);`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`get(&mut site, &db, &client, &mut crawled).await;`
added support for nearly all html tags that can have a link 2024-11-13 00:50:06 +00:00
it works now 2024-11-09 22:28:10 +00:00			`drop(pre_loop_span);`
works more, but still not all the way 2024-11-09 18:30:32 +00:00
it works now 2024-11-09 22:28:10 +00:00			`let span = trace_span!("Loop");`
			`let span = span.enter();`
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`while crawled < budget {`
it works :party: 2024-11-10 06:30:57 +00:00			`let get_num = if budget - crawled < 100 {`
			`budget - crawled`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`} else {`
			`100`
			`};`
it works :party: 2024-11-10 06:30:57 +00:00
			`let uncrawled = get_uncrawled_links(&db, get_num).await;`
added support for nearly all html tags that can have a link 2024-11-13 00:50:06 +00:00			`if uncrawled.len() == 0 {`
			`info!("Had more budget but finished crawling everything.");`
			`return;`
			`}`
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`debug!("Crawling {} pages...", uncrawled.len());`

it works now 2024-11-09 22:28:10 +00:00			`let span = trace_span!("Crawling");`
			`let _ = span.enter();`

works more, but still not all the way 2024-11-09 18:30:32 +00:00			`for mut site in uncrawled {`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`get(&mut site, &db, &client, &mut crawled).await;`
			`let percent = format!("{:.2}%", (crawled as f32 / budget as f32) * 100f32);`
			`info!("Crawled {crawled} out of {budget} pages. ({percent})");`
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`}`
			`}`
it works now 2024-11-09 22:28:10 +00:00			`drop(span);`
updates 2024-10-31 21:09:48 +00:00
			`info!("Done");`
crawling :spider: 2024-10-07 17:14:56 +00:00			`}`

works more, but still not all the way 2024-11-09 18:30:32 +00:00			`#[instrument(skip_all)]`
			`/// A quick helper function for downloading a url`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`async fn get(`
			`site: &mut Website,`
			`db: &Surreal<Client>,`
			`request_client: &reqwest::Client,`
			`count: &mut usize,`
			`) {`
it works now 2024-11-09 22:28:10 +00:00			`trace!("Get: {}", site.to_string());`
working, now onto speeding it up 2024-11-11 03:24:04 +00:00			`let timer = Timer::start("Got page");`
use reqwest client for epic speedup 2024-11-11 03:37:00 +00:00
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`if let Ok(response) = request_client.get(site.to_string()).send().await {`
			`timer.stop();`
it works :party: 2024-11-10 06:30:57 +00:00
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`// Get body`
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`let data = response.text().await.unwrap();`
			`let opts = ParseOpts {`
			`tree_builder: TreeBuilderOpts {`
			`drop_doctype: true,`
			`..Default::default()`
			`},`
no longer using spider, just wiritng my own crawler 2024-10-04 19:52:34 +00:00			`..Default::default()`
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`};`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`// Get DOM`
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`let dom = parse_document(RcDom::default(), opts)`
			`.from_utf8()`
			`.read_from(&mut data.as_bytes())`
			`.unwrap();`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00
added support for nearly all html tags that can have a link 2024-11-13 00:50:06 +00:00			`// TODO save the dom to minio if a flag is set`
add 2024-08-23 11:22:49 +00:00
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`// Modify record in database`
it works now 2024-11-09 22:28:10 +00:00			`site.set_crawled();`
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`site.store(db).await;`
it works now 2024-11-09 22:28:10 +00:00			`trace!("Got: {}", site.to_string());`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00
			`// Walk all the children nodes, searching for links to other pages.`
			`let mut buffer = Vec::new();`
			`let timer = Timer::start("Walked");`
			`walk(&dom.document, &db, &site, &mut buffer).await;`
			`timer.stop();`

			`// Put all the found links into the database.`
			`site.links_to(buffer, &db).await;`
			`*count += 1;`
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`}`
it works now 2024-11-09 22:28:10 +00:00			`trace!("Failed to get: {}", site.to_string());`
crawling :spider: 2024-10-07 17:14:56 +00:00			`}`
add 2024-08-23 11:22:49 +00:00
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`/// Walks the givin site, placing it's findings in the database`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`async fn walk(`
			`node: &rcdom::Handle,`
			`db: &Surreal<Client>,`
			`site: &Website,`
			`links_to: &mut Vec<Thing>,`
			`) {`
it works now 2024-11-09 22:28:10 +00:00			`let span = trace_span!("Walk");`
			`let span = span.enter();`
added support for nearly all html tags that can have a link 2024-11-13 00:50:06 +00:00			`// Match each node - node basically means element.`
no longer using spider, just wiritng my own crawler 2024-10-04 19:52:34 +00:00			`match &node.data {`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`rcdom::NodeData::Element { name, attrs, .. } => {`
clean up walk() 2024-10-31 20:10:14 +00:00			`for attr in attrs.borrow().clone() {`
added support for nearly all html tags that can have a link 2024-11-13 00:50:06 +00:00			`match name.local {`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`local_name!("a")`
			`\| local_name!("audio")`
			`\| local_name!("area")`
			`\| local_name!("img")`
			`\| local_name!("link")`
			`\| local_name!("object")`
			`\| local_name!("source")`
			`\| local_name!("base")`
			`\| local_name!("video") => {`
added support for nearly all html tags that can have a link 2024-11-13 00:50:06 +00:00			`let attribute_name = attr.name.local.to_string();`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`if attribute_name == "src"`
			`\|\| attribute_name == "href"`
			`\|\| attribute_name == "data"`
			`{`
added support for nearly all html tags that can have a link 2024-11-13 00:50:06 +00:00			`// Get clone of the current site object`
			`let mut web = site.clone();`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00
added support for nearly all html tags that can have a link 2024-11-13 00:50:06 +00:00			`// Set url`
			`let url = web.mut_url();`
			`url.set_fragment(None); // removes #xyz`
			`let joined = url.join(&attr.value).unwrap();`
			`*url = joined;`

			`// Set other attributes`
			`web.crawled = false;`
			`// TODO set element name`
			`// let element_name = name.local.to_string();`

			`if let Some(id) = web.store(db).await {`
			`links_to.push(id);`
			`}`
it works :party: 2024-11-10 06:30:57 +00:00			`}`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`}`
			`local_name!("button") \| local_name!("meta") \| local_name!("iframe") => {`
added support for nearly all html tags that can have a link 2024-11-13 00:50:06 +00:00			`// dbg!(attrs);`
crawling :spider: 2024-10-07 17:14:56 +00:00			`}`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`_ => {}`
added support for nearly all html tags that can have a link 2024-11-13 00:50:06 +00:00			`};`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`}`
			`}`
			`_ => {}`
no longer using spider, just wiritng my own crawler 2024-10-04 19:52:34 +00:00			`};`
it works now 2024-11-09 22:28:10 +00:00			`drop(span);`
clean up walk() 2024-10-31 20:10:14 +00:00			`for child in node.children.borrow().iter() {`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`Box::pin(walk(child, db, site, links_to)).await;`
crawling :spider: 2024-10-07 17:14:56 +00:00			`}`
add 2024-08-23 11:22:49 +00:00			`}`
works more, but still not all the way 2024-11-09 18:30:32 +00:00
it works now 2024-11-09 22:28:10 +00:00			`/// Returns uncrawled links`
			`async fn get_uncrawled_links(db: &Surreal<Client>, mut count: usize) -> Vec<Website> {`
			`if count > 100 {`
			`count = 100`
			`}`

			`let mut response = db`
			`.query("SELECT * FROM website WHERE crawled = false LIMIT $count")`
			`.bind(("count", count))`
			`.await`
			`.expect("Hard-coded query failed..?");`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`response`
			`.take(0)`
			`.expect("Returned websites couldn't be parsed")`
works more, but still not all the way 2024-11-09 18:30:32 +00:00			`}`

working, now onto speeding it up 2024-11-11 03:24:04 +00:00			`pub struct Timer<'a> {`
			`start: Instant,`
			`msg: &'a str,`
			`}`

			`impl<'a> Timer<'a> {`
			`#[inline]`
			`pub fn start(msg: &'a str) -> Self {`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`Self {`
			`start: Instant::now(),`
			`msg,`
			`}`
			`}`
			`pub fn stop(&self) -> f64 {`
			`let dif = self.start.elapsed().as_micros();`
			`let ms = dif as f64 / 1000.;`
			`debug!("{}", format!("{} in {:.3}ms", self.msg, ms));`
			`ms`
working, now onto speeding it up 2024-11-11 03:24:04 +00:00			`}`
			`}`

formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`impl Drop for Timer<'_> {`
working, now onto speeding it up 2024-11-11 03:24:04 +00:00			`fn drop(&mut self) {`
formatting and timer changes, consolidated functions 2024-11-13 01:40:10 +00:00			`self.stop();`
working, now onto speeding it up 2024-11-11 03:24:04 +00:00			`}`
added support for nearly all html tags that can have a link 2024-11-13 00:50:06 +00:00			`}`