-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
working scraper module POC
- Loading branch information
Showing
8 changed files
with
417 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,12 @@ | ||
[package] | ||
name = "untitled" | ||
version = "0.1.0" | ||
version = "0.2.1" | ||
edition = "2021" | ||
|
||
[dependencies] | ||
clap = { version = "4.5.17", features = ["derive"] } | ||
dirs = "5.0.1" | ||
serde = { version = "1.0", features = ["derive"] } | ||
toml = "0.8.19" | ||
reqwest = { version = "0.12.7", features = ["blocking"] } | ||
scraper = {version = "0.20.0"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,36 @@ | ||
use clap::Parser; | ||
use clap::{Parser, Subcommand}; | ||
use std::path::PathBuf; | ||
|
||
#[derive(Parser, Debug)] | ||
#[command(author, version, about, long_about = None)] | ||
#[command( | ||
author = "baduker", | ||
version = env!("CARGO_PKG_VERSION"), | ||
about = "A modest untitled CLI tool to fetch titled images from the web(site).", | ||
long_about = "This is a simple CLI tool to interact, so to speak, with \ | ||
the girls from the kindgirls.com website. It's a work in progress, \ | ||
so don't expect too much from it. It's just an odd way to learn Rust." | ||
)] | ||
pub struct Cli { | ||
/// Path to the configuration file | ||
#[arg(short, long, value_name = "FILE", default_value = "untitled.toml")] | ||
pub config: PathBuf, | ||
|
||
/// Print the configuration | ||
#[arg(short, long)] | ||
pub print: bool, | ||
#[command(subcommand)] | ||
pub command: Option<Commands>, | ||
} | ||
|
||
#[derive(Subcommand, Debug)] | ||
pub enum Commands { | ||
#[command(about = "Configuration options")] | ||
Config { | ||
/// Print the configuration file | ||
#[arg(short, long)] | ||
print: bool, | ||
}, | ||
#[command(about = "Get some girls")] | ||
Scrape { | ||
/// The URL to scrape from | ||
#[arg(short, long, value_name = "URL")] | ||
url: Option<String>, | ||
}, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,27 +1,30 @@ | ||
mod cli; | ||
mod config; | ||
mod scraper; | ||
mod utilities; | ||
|
||
use crate::scraper::collector::scrape; | ||
use clap::Parser; | ||
use cli::Cli; | ||
use config::{read_or_create_config, Config, MyConfig}; | ||
use cli::{Cli, Commands}; | ||
use config::{print_config, read_or_create_config, MyConfig}; | ||
|
||
fn main() { | ||
let cli = Cli::parse(); | ||
match read_or_create_config::<MyConfig>() { | ||
Ok(config) => { | ||
if cli.print { | ||
print_config(config); | ||
} else { | ||
println!("Config loaded successfully!"); | ||
Ok(config) => match cli.command { | ||
Some(Commands::Config { print }) => { | ||
if print { | ||
print_config(&config); | ||
} | ||
} | ||
} | ||
Some(Commands::Scrape { url }) => match url { | ||
Some(url) => scrape(&config, Some(&url)), | ||
None => scrape(&config, None), | ||
}, | ||
None => eprintln!("No command specified! Use --help to see available commands."), | ||
}, | ||
Err(e) => { | ||
eprintln!("Error: {}", e); | ||
} | ||
} | ||
} | ||
|
||
fn print_config<T: std::fmt::Debug + Config>(config: T) { | ||
println!("Config: {:?}", config); | ||
println!("base_url: {:?}", config.base_url()); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
use super::structs::{Bio, Gallery, Girl, Selectors, Stats, Video, Visuals}; | ||
use crate::config::Config; | ||
use crate::utilities::{build_video_src_url, parse_video_duration, splitter}; | ||
use reqwest::blocking::Client; | ||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; | ||
use reqwest::Error; | ||
use scraper::{Html, Selector}; | ||
|
||
const DEFAULT_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"; | ||
const DEFAULT_BASE_URL: &str = "https://kindgirls.com"; | ||
|
||
pub fn scrape<T: Config>(config: &T, url: Option<&str>) { | ||
match url { | ||
Some(url) => { | ||
println!("Scraping from: {}", url); | ||
|
||
if Girl::is_single_gallery(url) { | ||
println!("Scraping for single gallery is NOT implemented yet. :("); | ||
return; | ||
} | ||
|
||
let body = fetch(url); | ||
match body { | ||
Ok(content) => { | ||
let document = Html::parse_document(&content); | ||
let girl = collect_girl(url, &document); | ||
println!("{:?}", girl); | ||
} | ||
Err(e) => { | ||
println!("Error fetching URL: {}", e); | ||
} | ||
} | ||
} | ||
None => { | ||
println!("Scraping from: {}", config.base_url()); | ||
println!("Download directory: {}", config.download_dir()); | ||
} | ||
} | ||
} | ||
|
||
fn fetch(url: &str) -> Result<String, Error> { | ||
let client = Client::new(); | ||
let mut headers = HeaderMap::new(); | ||
headers.insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); | ||
let response = client.get(url).headers(headers).send(); | ||
let body = response?.text()?; | ||
Ok(body) | ||
} | ||
|
||
fn collect_bio(document: &Html) -> Bio { | ||
let selector = Selector::parse(Selectors::MODEL_INFO).unwrap(); | ||
let model_info = document.select(&selector).next().unwrap(); | ||
let info_text: Vec<String> = model_info | ||
.text() | ||
.map(|s| s.trim().to_string()) | ||
.filter(|s| !s.is_empty()) | ||
.collect(); | ||
Bio::new(info_text) | ||
} | ||
|
||
fn collect_gallery(document: &Html) -> Vec<Gallery> { | ||
let selector = Selector::parse(Selectors::MODEL_GALLERIES).unwrap(); | ||
document | ||
.select(&selector) | ||
.map(|element| { | ||
let href = element.value().attr("href").unwrap(); | ||
let full_url = format!("{}{}", DEFAULT_BASE_URL, href); | ||
let gallery_id = splitter(href, "=").last().unwrap().to_string(); | ||
let text = element.text().collect::<Vec<_>>().join(" "); | ||
let total_photos = text.split_whitespace().next().unwrap().parse::<i32>().ok(); | ||
let title = splitter(element.value().attr("title").unwrap(), ", "); | ||
let date = title.last().unwrap().to_string(); | ||
|
||
Gallery { | ||
id: Some(gallery_id), | ||
date: Some(date), | ||
link: Some(full_url), | ||
total_photos, | ||
} | ||
}) | ||
.collect() | ||
} | ||
|
||
fn collect_videos(document: &Html) -> Option<Vec<Video>> { | ||
let selector = Selector::parse(Selectors::MODEL_VIDEOS).unwrap(); | ||
let model_videos = document.select(&selector); | ||
|
||
let videos: Vec<Video> = model_videos | ||
.map(|video_element| { | ||
let video_href = video_element.value().attr("href").unwrap().to_string(); | ||
let video_full_url = format!("{}{}", DEFAULT_BASE_URL, video_href); | ||
|
||
// Create a new selector for the img tag within the video link | ||
let img_selector = Selector::parse("img").unwrap(); | ||
let img_element = video_element.select(&img_selector).next().unwrap(); | ||
let video_source_url = img_element.value().attr("src").unwrap().to_string(); | ||
|
||
let video_length = video_element.text().collect::<Vec<_>>().join(" "); | ||
|
||
Video { | ||
link: Some(video_full_url), | ||
source: Some(build_video_src_url(video_source_url)), | ||
duration: Some(parse_video_duration(&video_length)), | ||
} | ||
}) | ||
.collect(); | ||
|
||
if videos.is_empty() { | ||
None | ||
} else { | ||
Some(videos) | ||
} | ||
} | ||
|
||
fn collect_visuals(galleries: Vec<Gallery>, videos: Option<Vec<Video>>) -> Visuals { | ||
Visuals { galleries, videos } | ||
} | ||
|
||
fn collect_stats(visuals: &Visuals) -> Stats { | ||
let total_images: i32 = visuals | ||
.galleries | ||
.iter() | ||
.filter_map(|g| g.total_photos) | ||
.sum(); | ||
|
||
let total_videos = visuals.videos.as_ref().map(|videos| videos.len()); | ||
|
||
Stats { | ||
total_galleries: visuals.galleries.len(), | ||
total_photos: total_images, | ||
total_videos, | ||
} | ||
} | ||
|
||
fn collect_girl(url: &str, document: &Html) -> Result<Girl, String> { | ||
let is_single_gallery = Girl::is_single_gallery(url); | ||
let bio = collect_bio(document); | ||
let galleries = collect_gallery(document); | ||
let videos = collect_videos(document); | ||
let visuals = collect_visuals(galleries, videos); | ||
let stats = collect_stats(&visuals); | ||
|
||
Ok(Girl { | ||
is_single_gallery, | ||
bio, | ||
content: visuals, | ||
stats, | ||
}) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
pub mod collector; | ||
|
||
mod structs; |
Oops, something went wrong.