Skip to content

Commit

Permalink
Add scraper module (#9)
Browse files Browse the repository at this point in the history
working scraper module POC
  • Loading branch information
baduker authored Sep 29, 2024
1 parent 089d70c commit 133dfff
Show file tree
Hide file tree
Showing 8 changed files with 417 additions and 25 deletions.
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
[package]
name = "untitled"
version = "0.1.0"
version = "0.2.1"
edition = "2021"

[dependencies]
clap = { version = "4.5.17", features = ["derive"] }
dirs = "5.0.1"
serde = { version = "1.0", features = ["derive"] }
toml = "0.8.19"
reqwest = { version = "0.12.7", features = ["blocking"] }
scraper = {version = "0.20.0"}
32 changes: 27 additions & 5 deletions src/cli.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,36 @@
use clap::Parser;
use clap::{Parser, Subcommand};
use std::path::PathBuf;

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
#[command(
author = "baduker",
version = env!("CARGO_PKG_VERSION"),
about = "A modest untitled CLI tool to fetch titled images from the web(site).",
long_about = "This is a simple CLI tool to interact, so to speak, with \
the girls from the kindgirls.com website. It's a work in progress, \
so don't expect too much from it. It's just an odd way to learn Rust."
)]
pub struct Cli {
/// Path to the configuration file
#[arg(short, long, value_name = "FILE", default_value = "untitled.toml")]
pub config: PathBuf,

/// Print the configuration
#[arg(short, long)]
pub print: bool,
#[command(subcommand)]
pub command: Option<Commands>,
}

// Top-level subcommands for the CLI.
// NOTE: plain `//` comments are used deliberately — clap's derive macro turns
// `///` doc comments into help text, which would change `--help` output.
#[derive(Subcommand, Debug)]
pub enum Commands {
    // Configuration-related actions (inspect the loaded config file).
    #[command(about = "Configuration options")]
    Config {
        /// Print the configuration file
        #[arg(short, long)]
        print: bool,
    },
    // Scraping actions; with no URL the scraper falls back to the
    // configured base URL (see the `scrape` collector function).
    #[command(about = "Get some girls")]
    Scrape {
        /// The URL to scrape from
        #[arg(short, long, value_name = "URL")]
        url: Option<String>,
    },
}
12 changes: 6 additions & 6 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ pub trait Config: Default {
}
fn base_url(&self) -> &str;
fn download_dir(&self) -> &str;
fn is_active(&self) -> bool;
}

impl Config for MyConfig {
Expand All @@ -25,10 +24,6 @@ impl Config for MyConfig {
fn download_dir(&self) -> &str {
&self.download_dir
}

fn is_active(&self) -> bool {
self.is_active
}
}
#[derive(Debug, Deserialize, Serialize)]
pub struct MyConfig {
Expand All @@ -42,7 +37,7 @@ impl Default for MyConfig {
fn default() -> Self {
MyConfig {
app_version: MyConfig::app_version().to_string(),
base_url: "https://kindgirls.com/old".to_string(),
base_url: "https://kindgirls.com/".to_string(),
download_dir: "kindgirls".to_string(),
is_active: true,
}
Expand All @@ -60,6 +55,11 @@ pub fn read_or_create_config<T: DeserializeOwned + Serialize + Config>(
Ok(config)
}

/// Dumps the loaded configuration to stdout: the whole struct via its
/// `Debug` representation, followed by the base URL on its own line.
pub fn print_config<T: std::fmt::Debug + Config>(config: &T) {
    let report = format!("Config: {:?}\nbase_url: {}", config, config.base_url());
    println!("{}", report);
}

fn create_default_config<T: Serialize + Config>(
path: &PathBuf,
) -> Result<(), Box<dyn std::error::Error>> {
Expand Down
29 changes: 16 additions & 13 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,27 +1,30 @@
mod cli;
mod config;
mod scraper;
mod utilities;

use crate::scraper::collector::scrape;
use clap::Parser;
use cli::Cli;
use config::{read_or_create_config, Config, MyConfig};
use cli::{Cli, Commands};
use config::{print_config, read_or_create_config, MyConfig};

fn main() {
let cli = Cli::parse();
match read_or_create_config::<MyConfig>() {
Ok(config) => {
if cli.print {
print_config(config);
} else {
println!("Config loaded successfully!");
Ok(config) => match cli.command {
Some(Commands::Config { print }) => {
if print {
print_config(&config);
}
}
}
Some(Commands::Scrape { url }) => match url {
Some(url) => scrape(&config, Some(&url)),
None => scrape(&config, None),
},
None => eprintln!("No command specified! Use --help to see available commands."),
},
Err(e) => {
eprintln!("Error: {}", e);
}
}
}

fn print_config<T: std::fmt::Debug + Config>(config: T) {
println!("Config: {:?}", config);
println!("base_url: {:?}", config.base_url());
}
149 changes: 149 additions & 0 deletions src/scraper/collector.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
use super::structs::{Bio, Gallery, Girl, Selectors, Stats, Video, Visuals};
use crate::config::Config;
use crate::utilities::{build_video_src_url, parse_video_duration, splitter};
use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use reqwest::Error;
use scraper::{Html, Selector};

const DEFAULT_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0";
const DEFAULT_BASE_URL: &str = "https://kindgirls.com";

/// Entry point of the scraping workflow.
///
/// With `Some(url)` it fetches the page and extracts the model data;
/// with `None` it only reports the configured defaults (base URL and
/// download directory). Single-gallery URLs are detected up front and
/// rejected, since that path is not implemented yet.
pub fn scrape<T: Config>(config: &T, url: Option<&str>) {
    match url {
        Some(url) => {
            println!("Scraping from: {}", url);

            if Girl::is_single_gallery(url) {
                println!("Scraping for single gallery is NOT implemented yet. :(");
                return;
            }

            match fetch(url) {
                Ok(content) => {
                    let document = Html::parse_document(&content);
                    // FIX: unwrap the Result instead of debug-printing the
                    // raw `Ok(...)`/`Err(...)` wrapper, and route extraction
                    // failures to stderr.
                    match collect_girl(url, &document) {
                        Ok(girl) => println!("{:?}", girl),
                        Err(e) => eprintln!("Error collecting data: {}", e),
                    }
                }
                // FIX: errors belong on stderr, not stdout.
                Err(e) => eprintln!("Error fetching URL: {}", e),
            }
        }
        None => {
            println!("Scraping from: {}", config.base_url());
            println!("Download directory: {}", config.download_dir());
        }
    }
}

/// Performs a blocking GET request against `url`, sending a browser-like
/// User-Agent header, and returns the response body as text.
///
/// # Errors
/// Propagates any `reqwest::Error` from the request or body decoding.
fn fetch(url: &str) -> Result<String, Error> {
    let mut headers = HeaderMap::new();
    headers.insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT));
    // Chained builder form: send the request and decode the body in one
    // expression; `?` propagates a failed request before `.text()` runs.
    Client::new().get(url).headers(headers).send()?.text()
}

/// Extracts the model bio from the first element matching the
/// `Selectors::MODEL_INFO` CSS selector: every non-empty, trimmed text
/// fragment of that element is collected in document order.
///
/// Panics (via `unwrap`) if the selector string is invalid or the page
/// contains no matching element — acceptable for this POC.
fn collect_bio(document: &Html) -> Bio {
    let info_selector = Selector::parse(Selectors::MODEL_INFO).unwrap();
    let info_node = document.select(&info_selector).next().unwrap();

    let mut fragments: Vec<String> = Vec::new();
    for piece in info_node.text() {
        let trimmed = piece.trim();
        if !trimmed.is_empty() {
            fragments.push(trimmed.to_string());
        }
    }
    Bio::new(fragments)
}

/// Builds the list of galleries from every anchor matching
/// `Selectors::MODEL_GALLERIES`.
///
/// Per anchor: the gallery id is the part of `href` after the last `=`;
/// the full link is `DEFAULT_BASE_URL` + `href`; the photo count is
/// parsed from the first whitespace-separated token of the link text
/// (None if it isn't an integer); the date is the last `, `-separated
/// piece of the `title` attribute (assumes `splitter` returns the split
/// parts in order — TODO confirm against utilities.rs).
///
/// Panics (via `unwrap`) on anchors missing `href`/`title` attributes.
fn collect_gallery(document: &Html) -> Vec<Gallery> {
    let gallery_selector = Selector::parse(Selectors::MODEL_GALLERIES).unwrap();

    let mut galleries = Vec::new();
    for anchor in document.select(&gallery_selector) {
        let href = anchor.value().attr("href").unwrap();
        let link = format!("{}{}", DEFAULT_BASE_URL, href);
        let id = splitter(href, "=").last().unwrap().to_string();

        let label = anchor.text().collect::<Vec<_>>().join(" ");
        let photo_count = label.split_whitespace().next().unwrap().parse::<i32>().ok();

        let title_parts = splitter(anchor.value().attr("title").unwrap(), ", ");
        let date = title_parts.last().unwrap().to_string();

        galleries.push(Gallery {
            id: Some(id),
            date: Some(date),
            link: Some(link),
            total_photos: photo_count,
        });
    }
    galleries
}

/// Collects all videos linked from the model page.
///
/// Returns `None` when the page has no matching video links, so callers
/// can distinguish "no videos" from an empty list.
///
/// Panics (via `unwrap`) if a matched element lacks an `href`, contains
/// no `<img>`, or its `<img>` lacks a `src` — acceptable for this POC.
fn collect_videos(document: &Html) -> Option<Vec<Video>> {
    let video_selector = Selector::parse(Selectors::MODEL_VIDEOS).unwrap();
    // FIX: parse the <img> selector once, instead of re-parsing it inside
    // the closure for every matched video element.
    let img_selector = Selector::parse("img").unwrap();

    let videos: Vec<Video> = document
        .select(&video_selector)
        .map(|video_element| {
            let video_href = video_element.value().attr("href").unwrap().to_string();
            let video_full_url = format!("{}{}", DEFAULT_BASE_URL, video_href);

            // Thumbnail <img> nested in the link; its src is transformed
            // into the actual video source URL by `build_video_src_url`.
            let img_element = video_element.select(&img_selector).next().unwrap();
            let video_source_url = img_element.value().attr("src").unwrap().to_string();

            // The link's text is fed to `parse_video_duration` — presumably
            // a duration string; confirm the format against utilities.rs.
            let video_length = video_element.text().collect::<Vec<_>>().join(" ");

            Video {
                link: Some(video_full_url),
                source: Some(build_video_src_url(video_source_url)),
                duration: Some(parse_video_duration(&video_length)),
            }
        })
        .collect();

    if videos.is_empty() {
        None
    } else {
        Some(videos)
    }
}

/// Bundles the collected galleries and videos into a single `Visuals`
/// value, taking ownership of both.
fn collect_visuals(gallery_list: Vec<Gallery>, video_list: Option<Vec<Video>>) -> Visuals {
    Visuals {
        galleries: gallery_list,
        videos: video_list,
    }
}

/// Derives summary statistics from the collected visuals: the gallery
/// count, the sum of all known photo counts (galleries whose count could
/// not be parsed contribute nothing), and the video count when videos
/// are present (`None` mirrors "no videos found").
fn collect_stats(visuals: &Visuals) -> Stats {
    let mut photo_total: i32 = 0;
    for gallery in &visuals.galleries {
        if let Some(count) = gallery.total_photos {
            photo_total += count;
        }
    }

    let video_count = match visuals.videos.as_ref() {
        Some(videos) => Some(videos.len()),
        None => None,
    };

    Stats {
        total_galleries: visuals.galleries.len(),
        total_photos: photo_total,
        total_videos: video_count,
    }
}

/// Assembles a full `Girl` record from a parsed model page: bio,
/// galleries, videos, and the derived statistics.
///
/// The current extraction steps cannot fail (they panic on malformed
/// pages instead), so this always returns `Ok`; the `Result` signature
/// leaves room for graceful error handling later.
fn collect_girl(url: &str, document: &Html) -> Result<Girl, String> {
    let single = Girl::is_single_gallery(url);
    let bio = collect_bio(document);
    let visuals = collect_visuals(collect_gallery(document), collect_videos(document));
    let stats = collect_stats(&visuals);

    Ok(Girl {
        is_single_gallery: single,
        bio,
        content: visuals,
        stats,
    })
}
3 changes: 3 additions & 0 deletions src/scraper/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
//! Scraper subsystem: `collector` does the page fetching and data
//! extraction; `structs` (crate-private) holds the data model it builds.
pub mod collector;

mod structs;
Loading

0 comments on commit 133dfff

Please sign in to comment.