added support for nearly all html tags that can have a link

2024-11-12 17:50:06 -07:00 · 2024-11-12 17:50:06 -07:00 · 720adaa552
commit 720adaa552
parent 7c32600694
3 changed files with 52 additions and 31 deletions
--- a/src/db.rs
+++ b/src/db.rs
@ -15,7 +15,7 @@ pub struct Website {
    /// The url that this data is found at
    site: Url,
    /// Whether or not this link has been crawled yet
-    crawled: bool,
+    pub crawled: bool,
    #[serde(skip_serializing)]
    id: Option<Thing>,
 }
@ -39,10 +39,6 @@ impl Website {
        self.crawled = true
    }

-    pub fn crawled(&mut self) -> &mut bool {
-        &mut self.crawled
-    }
-
    pub fn mut_url(&mut self) -> &mut Url {
        &mut self.site
    }
@ -105,9 +101,11 @@ impl Website {
        if let Some(old) = response.take::<Option<Website>>(0).unwrap() {
            // site exists already
            if let Some(id) = old.id {
+                // make sure to preserve the "crawled status"
                let mut new = self.clone();
                new.crawled = old.crawled | new.crawled;

+                // update the record
                match db.upsert((id.tb, id.id.to_string())).content(new).await {
                    Ok(e) => {
                        if let Some(a) = e {
--- a/src/main.rs
+++ b/src/main.rs
@ -1,9 +1,9 @@
 extern crate markup5ever_rcdom as rcdom;
 extern crate html5ever;

-use std::{rc::Rc, time::Instant};
+use std::{path::is_separator, rc::Rc, time::Instant};
 use db::{connect, Website};
-use html5ever::{parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
+use html5ever::{local_name, parse_document, tendril::TendrilSink, tree_builder::TreeBuilderOpts, ParseOpts};
 use rcdom::{Node, RcDom};
 use surrealdb::{engine::remote::ws::Client, sql::Thing, Surreal};
 use tracing::{debug, info, instrument, trace, trace_span, warn};
@ -21,7 +21,8 @@ async fn main() {
    debug!("Starting...");

    // Would probably take these in as parameters from a cli
-    let url = "https://oliveratkinson.net/";
+    // let url = "https://oliveratkinson.net/";
+    let url = "http://localhost:5500";
    let budget = 50; 
    let mut crawled = 0;

@ -36,9 +37,12 @@ async fn main() {
    // get() to work.
    let span = trace_span!("Pre-Loop");
    let pre_loop_span = span.enter();
+    // Download the site
    let mut site = Website::new(&url, false);
    let dom = get(&mut site, &db, &client).await.expect("Inital page returned None.");
+
    crawl_wrapper(&dom, &db, &site, &mut crawled).await;
+
    drop(pre_loop_span);

    let span = trace_span!("Loop");
@ -49,6 +53,10 @@ async fn main() {
        } else {100};

        let uncrawled = get_uncrawled_links(&db, get_num).await;
+        if uncrawled.len() == 0 {
+            info!("Had more budget but finished crawling everything.");
+            return;
+        }
        debug!("Crawling {} pages...", uncrawled.len());

        let span = trace_span!("Crawling");
@ -101,6 +109,8 @@ async fn get(site: &mut Website, db: &Surreal<Client>, getter: &reqwest::Client)
            .read_from(&mut data.as_bytes())
            .unwrap();
        
+        // TODO save the dom to minio if a flag is set
+
        site.set_crawled();
        site.store(db).await;
        trace!("Got: {}", site.to_string());
@ -114,34 +124,48 @@ async fn get(site: &mut Website, db: &Surreal<Client>, getter: &reqwest::Client)
 async fn walk(node: &rcdom::Handle, db: &Surreal<Client> , site: &Website, links_to: &mut Vec<Thing>) {
    let span = trace_span!("Walk");
    let span = span.enter();
-
+    // Match each node - node basically means element.
    match &node.data {
        rcdom::NodeData::Element { name, attrs, template_contents, mathml_annotation_xml_integration_point } => {
            for attr in attrs.borrow().clone() {
-                if name.local.to_string() == "a" {
-                    if attr.value.starts_with("mailto") {
-                        trace!("Is mailto");
-                        // mailto link, lol
-                        let _created: Option<db::Record> = db.create("email").content(db::Email {
-                            email: attr.value.to_string(),
-                            on: site.domain_str().to_owned(),
-                        }).await.unwrap();
-                    } else {
-                        let mut web = site.clone();
-                        let url = web.mut_url();
+                match name.local {
+                    local_name!("a") |
+                    local_name!("audio") |
+                    local_name!("area") |
+                    local_name!("img") |
+                    local_name!("link") |
+                    local_name!("object") |
+                    local_name!("source") |
+                    local_name!("base") |
+                    local_name!("video") => {
+                        let attribute_name = attr.name.local.to_string();
+                        if attribute_name == "src" || attribute_name == "href" || attribute_name == "data" {
+                            // Get clone of the current site object
+                            let mut web = site.clone();
                            
-                        // TODO remove #xyz
-                        let joined = url.join(&attr.value).unwrap();
-                        *url = joined;
+                            // Set url
+                            let url = web.mut_url();
+                            url.set_fragment(None); // removes #xyz
+                            let joined = url.join(&attr.value).unwrap();
+                            *url = joined;

-                        let crawled = web.crawled();
-                        *crawled = false;
+                            // Set other attributes
+                            web.crawled = false;
+                            // TODO set element name
+                            // let element_name = name.local.to_string();

-                        if let Some(id) = web.store(db).await {
-                            links_to.push(id);
+                            if let Some(id) = web.store(db).await {
+                                links_to.push(id);
+                            }
                        }
+                    },
+                    local_name!("button") |
+                    local_name!("meta") |
+                    local_name!("iframe") => {
+                        // dbg!(attrs);
                    }
-                }
+                    _ => {/**/}
+                };
            };
        },
        _ => {},
--- a/1
+++ b/1
@ -1 +0,0 @@
-[{"result":[{"accessed_at":"2024-08-25T20:07:25.969525156Z","crawled":false,"domain":"google.com","id":"website:fd46b0cr5f5y3d57eje8","path":"/","url":"https://google.com"}],"status":"OK","time":"205.7
				`@ -1 +0,0 @@`
				`[{"result":[{"accessed_at":"2024-08-25T20:07:25.969525156Z","crawled":false,"domain":"google.com","id":"website:fd46b0cr5f5y3d57eje8","path":"/","url":"https://google.com"}],"status":"OK","time":"205.7`