Skip to content

Commit

Permalink
Add scraper module (#9)
Browse files Browse the repository at this point in the history
working scraper module POC
  • Loading branch information
baduker authored Sep 29, 2024
1 parent 089d70c commit 133dfff
Show file tree
Hide file tree
Showing 8 changed files with 417 additions and 25 deletions.
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
[package]
name = "untitled"
version = "0.1.0"
version = "0.2.1"
edition = "2021"

[dependencies]
clap = { version = "4.5.17", features = ["derive"] }
dirs = "5.0.1"
serde = { version = "1.0", features = ["derive"] }
toml = "0.8.19"
reqwest = { version = "0.12.7", features = ["blocking"] }
scraper = {version = "0.20.0"}
32 changes: 27 additions & 5 deletions src/cli.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,36 @@
use clap::Parser;
use clap::{Parser, Subcommand};
use std::path::PathBuf;

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
#[command(
author = "baduker",
version = env!("CARGO_PKG_VERSION"),
about = "A modest untitled CLI tool to fetch titled images from the web(site).",
long_about = "This is a simple CLI tool to interact, so to speak, with \
the girls from the kindgirls.com website. It's a work in progress, \
so don't expect too much from it. It's just an odd way to learn Rust."
)]
pub struct Cli {
/// Path to the configuration file
#[arg(short, long, value_name = "FILE", default_value = "untitled.toml")]
pub config: PathBuf,

/// Print the configuration
#[arg(short, long)]
pub print: bool,
#[command(subcommand)]
pub command: Option<Commands>,
}

// Top-level subcommands for the CLI.
// NOTE: plain `//` comments are used deliberately — clap's derive macro turns
// `///` doc comments into help text, which would change `--help` output.
#[derive(Subcommand, Debug)]
pub enum Commands {
    // Configuration-related actions (inspect the loaded config file).
    #[command(about = "Configuration options")]
    Config {
        /// Print the configuration file
        #[arg(short, long)]
        print: bool,
    },
    // Scraping actions; with no URL the scraper falls back to the
    // configured base URL (see the `scrape` collector function).
    #[command(about = "Get some girls")]
    Scrape {
        /// The URL to scrape from
        #[arg(short, long, value_name = "URL")]
        url: Option<String>,
    },
}
12 changes: 6 additions & 6 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ pub trait Config: Default {
}
fn base_url(&self) -> &str;
fn download_dir(&self) -> &str;
fn is_active(&self) -> bool;
}

impl Config for MyConfig {
Expand All @@ -25,10 +24,6 @@ impl Config for MyConfig {
fn download_dir(&self) -> &str {
&self.download_dir
}

fn is_active(&self) -> bool {
self.is_active
}
}
#[derive(Debug, Deserialize, Serialize)]
pub struct MyConfig {
Expand All @@ -42,7 +37,7 @@ impl Default for MyConfig {
fn default() -> Self {
MyConfig {
app_version: MyConfig::app_version().to_string(),
base_url: "https://kindgirls.com/old".to_string(),
base_url: "https://kindgirls.com/".to_string(),
download_dir: "kindgirls".to_string(),
is_active: true,
}
Expand All @@ -60,6 +55,11 @@ pub fn read_or_create_config<T: DeserializeOwned + Serialize + Config>(
Ok(config)
}

/// Dumps the loaded configuration to stdout: the whole struct via its
/// `Debug` representation, followed by the base URL on its own line.
pub fn print_config<T: std::fmt::Debug + Config>(config: &T) {
    let report = format!("Config: {:?}\nbase_url: {}", config, config.base_url());
    println!("{}", report);
}

fn create_default_config<T: Serialize + Config>(
path: &PathBuf,
) -> Result<(), Box<dyn std::error::Error>> {
Expand Down
29 changes: 16 additions & 13 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,27 +1,30 @@
mod cli;
mod config;
mod scraper;
mod utilities;

use crate::scraper::collector::scrape;
use clap::Parser;
use cli::Cli;
use config::{read_or_create_config, Config, MyConfig};
use cli::{Cli, Commands};
use config::{print_config, read_or_create_config, MyConfig};

fn main() {
let cli = Cli::parse();
match read_or_create_config::<MyConfig>() {
Ok(config) => {
if cli.print {
print_config(config);
} else {
println!("Config loaded successfully!");
Ok(config) => match cli.command {
Some(Commands::Config { print }) => {
if print {
print_config(&config);
}
}
}
Some(Commands::Scrape { url }) => match url {
Some(url) => scrape(&config, Some(&url)),
None => scrape(&config, None),
},
None => eprintln!("No command specified! Use --help to see available commands."),
},
Err(e) => {
eprintln!("Error: {}", e);
}
}
}

fn print_config<T: std::fmt::Debug + Config>(config: T) {
println!("Config: {:?}", config);
println!("base_url: {:?}", config.base_url());
}
149 changes: 149 additions & 0 deletions src/scraper/collector.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
use super::structs::{Bio, Gallery, Girl, Selectors, Stats, Video, Visuals};
use crate::config::Config;
use crate::utilities::{build_video_src_url, parse_video_duration, splitter};
use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use reqwest::Error;
use scraper::{Html, Selector};

const DEFAULT_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0";
const DEFAULT_BASE_URL: &str = "https://kindgirls.com";

/// Entry point of the scraping workflow.
///
/// With `Some(url)` it fetches the page and extracts the model data;
/// with `None` it only reports the configured defaults (base URL and
/// download directory). Single-gallery URLs are detected up front and
/// rejected, since that path is not implemented yet.
pub fn scrape<T: Config>(config: &T, url: Option<&str>) {
    match url {
        Some(url) => {
            println!("Scraping from: {}", url);

            if Girl::is_single_gallery(url) {
                println!("Scraping for single gallery is NOT implemented yet. :(");
                return;
            }

            match fetch(url) {
                Ok(content) => {
                    let document = Html::parse_document(&content);
                    // FIX: unwrap the Result instead of debug-printing the
                    // raw `Ok(...)`/`Err(...)` wrapper, and route extraction
                    // failures to stderr.
                    match collect_girl(url, &document) {
                        Ok(girl) => println!("{:?}", girl),
                        Err(e) => eprintln!("Error collecting data: {}", e),
                    }
                }
                // FIX: errors belong on stderr, not stdout.
                Err(e) => eprintln!("Error fetching URL: {}", e),
            }
        }
        None => {
            println!("Scraping from: {}", config.base_url());
            println!("Download directory: {}", config.download_dir());
        }
    }
}

/// Performs a blocking GET request against `url`, sending a browser-like
/// User-Agent header, and returns the response body as text.
///
/// # Errors
/// Propagates any `reqwest::Error` from the request or body decoding.
fn fetch(url: &str) -> Result<String, Error> {
    let mut headers = HeaderMap::new();
    headers.insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT));
    // Chained builder form: send the request and decode the body in one
    // expression; `?` propagates a failed request before `.text()` runs.
    Client::new().get(url).headers(headers).send()?.text()
}

/// Extracts the model bio from the first element matching the
/// `Selectors::MODEL_INFO` CSS selector: every non-empty, trimmed text
/// fragment of that element is collected in document order.
///
/// Panics (via `unwrap`) if the selector string is invalid or the page
/// contains no matching element — acceptable for this POC.
fn collect_bio(document: &Html) -> Bio {
    let info_selector = Selector::parse(Selectors::MODEL_INFO).unwrap();
    let info_node = document.select(&info_selector).next().unwrap();

    let mut fragments: Vec<String> = Vec::new();
    for piece in info_node.text() {
        let trimmed = piece.trim();
        if !trimmed.is_empty() {
            fragments.push(trimmed.to_string());
        }
    }
    Bio::new(fragments)
}

/// Builds the list of galleries from every anchor matching
/// `Selectors::MODEL_GALLERIES`.
///
/// Per anchor: the gallery id is the part of `href` after the last `=`;
/// the full link is `DEFAULT_BASE_URL` + `href`; the photo count is
/// parsed from the first whitespace-separated token of the link text
/// (None if it isn't an integer); the date is the last `, `-separated
/// piece of the `title` attribute (assumes `splitter` returns the split
/// parts in order — TODO confirm against utilities.rs).
///
/// Panics (via `unwrap`) on anchors missing `href`/`title` attributes.
fn collect_gallery(document: &Html) -> Vec<Gallery> {
    let gallery_selector = Selector::parse(Selectors::MODEL_GALLERIES).unwrap();

    let mut galleries = Vec::new();
    for anchor in document.select(&gallery_selector) {
        let href = anchor.value().attr("href").unwrap();
        let link = format!("{}{}", DEFAULT_BASE_URL, href);
        let id = splitter(href, "=").last().unwrap().to_string();

        let label = anchor.text().collect::<Vec<_>>().join(" ");
        let photo_count = label.split_whitespace().next().unwrap().parse::<i32>().ok();

        let title_parts = splitter(anchor.value().attr("title").unwrap(), ", ");
        let date = title_parts.last().unwrap().to_string();

        galleries.push(Gallery {
            id: Some(id),
            date: Some(date),
            link: Some(link),
            total_photos: photo_count,
        });
    }
    galleries
}

/// Collects all videos linked from the model page.
///
/// Returns `None` when the page has no matching video links, so callers
/// can distinguish "no videos" from an empty list.
///
/// Panics (via `unwrap`) if a matched element lacks an `href`, contains
/// no `<img>`, or its `<img>` lacks a `src` — acceptable for this POC.
fn collect_videos(document: &Html) -> Option<Vec<Video>> {
    let video_selector = Selector::parse(Selectors::MODEL_VIDEOS).unwrap();
    // FIX: parse the <img> selector once, instead of re-parsing it inside
    // the closure for every matched video element.
    let img_selector = Selector::parse("img").unwrap();

    let videos: Vec<Video> = document
        .select(&video_selector)
        .map(|video_element| {
            let video_href = video_element.value().attr("href").unwrap().to_string();
            let video_full_url = format!("{}{}", DEFAULT_BASE_URL, video_href);

            // Thumbnail <img> nested in the link; its src is transformed
            // into the actual video source URL by `build_video_src_url`.
            let img_element = video_element.select(&img_selector).next().unwrap();
            let video_source_url = img_element.value().attr("src").unwrap().to_string();

            // The link's text is fed to `parse_video_duration` — presumably
            // a duration string; confirm the format against utilities.rs.
            let video_length = video_element.text().collect::<Vec<_>>().join(" ");

            Video {
                link: Some(video_full_url),
                source: Some(build_video_src_url(video_source_url)),
                duration: Some(parse_video_duration(&video_length)),
            }
        })
        .collect();

    if videos.is_empty() {
        None
    } else {
        Some(videos)
    }
}

/// Bundles the collected galleries and videos into a single `Visuals`
/// value, taking ownership of both.
fn collect_visuals(gallery_list: Vec<Gallery>, video_list: Option<Vec<Video>>) -> Visuals {
    Visuals {
        galleries: gallery_list,
        videos: video_list,
    }
}

/// Derives summary statistics from the collected visuals: the gallery
/// count, the sum of all known photo counts (galleries whose count could
/// not be parsed contribute nothing), and the video count when videos
/// are present (`None` mirrors "no videos found").
fn collect_stats(visuals: &Visuals) -> Stats {
    let mut photo_total: i32 = 0;
    for gallery in &visuals.galleries {
        if let Some(count) = gallery.total_photos {
            photo_total += count;
        }
    }

    let video_count = match visuals.videos.as_ref() {
        Some(videos) => Some(videos.len()),
        None => None,
    };

    Stats {
        total_galleries: visuals.galleries.len(),
        total_photos: photo_total,
        total_videos: video_count,
    }
}

/// Assembles a full `Girl` record from a parsed model page: bio,
/// galleries, videos, and the derived statistics.
///
/// The current extraction steps cannot fail (they panic on malformed
/// pages instead), so this always returns `Ok`; the `Result` signature
/// leaves room for graceful error handling later.
fn collect_girl(url: &str, document: &Html) -> Result<Girl, String> {
    let single = Girl::is_single_gallery(url);
    let bio = collect_bio(document);
    let visuals = collect_visuals(collect_gallery(document), collect_videos(document));
    let stats = collect_stats(&visuals);

    Ok(Girl {
        is_single_gallery: single,
        bio,
        content: visuals,
        stats,
    })
}
3 changes: 3 additions & 0 deletions src/scraper/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
//! Scraper subsystem: `collector` does the page fetching and data
//! extraction; `structs` (crate-private) holds the data model it builds.
pub mod collector;

mod structs;
Loading

0 comments on commit 133dfff

Please sign in to comment.