Skip to content

Commit

Permalink
perf(conf): remove box indirection for proxies, whitelist, and blacklist
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Feb 26, 2025
1 parent bcd45f2 commit 022b6bd
Show file tree
Hide file tree
Showing 10 changed files with 119 additions and 153 deletions.
206 changes: 87 additions & 119 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.30.24"
version = "2.31.1"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
32 changes: 15 additions & 17 deletions spider/src/configuration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ pub enum RedirectPolicy {

#[cfg(not(feature = "regex"))]
/// Allow list normal matching paths.
pub type AllowList = Box<Vec<CompactString>>;
pub type AllowList = Vec<CompactString>;

#[cfg(feature = "regex")]
/// Allow list regex.
Expand Down Expand Up @@ -96,9 +96,9 @@ pub struct Configuration {
/// Preserve the HTTP host header from being included.
pub preserve_host_header: bool,
/// List of pages to not crawl. [optional: regex pattern matching]
pub blacklist_url: Option<Box<Vec<CompactString>>>,
pub blacklist_url: Option<Vec<CompactString>>,
/// List of pages to only crawl. [optional: regex pattern matching]
pub whitelist_url: Option<Box<Vec<CompactString>>>,
pub whitelist_url: Option<Vec<CompactString>>,
/// User-Agent for request.
pub user_agent: Option<Box<CompactString>>,
/// Polite crawling delay in milli seconds.
Expand All @@ -108,7 +108,7 @@ pub struct Configuration {
/// Use HTTP2 for connection. Enable if you know the website has http2 support.
pub http2_prior_knowledge: bool,
/// Use proxy list for performing network request.
pub proxies: Option<Box<Vec<RequestProxy>>>,
pub proxies: Option<Vec<RequestProxy>>,
/// Headers to include with request.
pub headers: Option<Box<SerializableHeaderMap>>,
#[cfg(feature = "sitemap")]
Expand Down Expand Up @@ -401,7 +401,7 @@ impl Configuration {

#[cfg(not(feature = "regex"))]
/// Handle the blacklist options.
pub fn get_blacklist(&self) -> Box<Vec<CompactString>> {
pub fn get_blacklist(&self) -> Vec<CompactString> {
match &self.blacklist_url {
Some(blacklist) => blacklist.to_owned(),
_ => Default::default(),
Expand Down Expand Up @@ -453,7 +453,7 @@ impl Configuration {

#[cfg(not(feature = "regex"))]
/// Handle the whitelist options.
pub fn get_whitelist(&self) -> Box<Vec<CompactString>> {
pub fn get_whitelist(&self) -> Vec<CompactString> {
match &self.whitelist_url {
Some(whitelist) => whitelist.to_owned(),
_ => Default::default(),
Expand Down Expand Up @@ -591,20 +591,18 @@ impl Configuration {
/// Use proxies for request.
pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
self.proxies = proxies.map(|p| {
Box::new(
p.iter()
.map(|addr| RequestProxy {
addr: addr.to_owned(),
..Default::default()
})
.collect::<Vec<RequestProxy>>(),
)
p.iter()
.map(|addr| RequestProxy {
addr: addr.to_owned(),
..Default::default()
})
.collect::<Vec<RequestProxy>>()
});
self
}

/// Use proxies for request with control between chrome and http.
pub fn with_proxies_direct(&mut self, proxies: Option<Box<Vec<RequestProxy>>>) -> &mut Self {
pub fn with_proxies_direct(&mut self, proxies: Option<Vec<RequestProxy>>) -> &mut Self {
self.proxies = proxies;
self
}
Expand All @@ -621,7 +619,7 @@ impl Configuration {
Vec<CompactString>: From<Vec<T>>,
{
match blacklist_url {
Some(p) => self.blacklist_url = Some(Box::new(p.into())),
Some(p) => self.blacklist_url = Some(p.into()),
_ => self.blacklist_url = None,
};
self
Expand All @@ -633,7 +631,7 @@ impl Configuration {
Vec<CompactString>: From<Vec<T>>,
{
match whitelist_url {
Some(p) => self.whitelist_url = Some(Box::new(p.into())),
Some(p) => self.whitelist_url = Some(p.into()),
_ => self.whitelist_url = None,
};
self
Expand Down
4 changes: 2 additions & 2 deletions spider/src/features/chrome.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ pub fn parse_cookies_with_jar(cookie_str: &str, url: &Url) -> Result<Vec<CookieP
/// get chrome configuration
#[cfg(not(feature = "chrome_headed"))]
pub fn get_browser_config(
proxies: &Option<Box<Vec<crate::configuration::RequestProxy>>>,
proxies: &Option<Vec<crate::configuration::RequestProxy>>,
intercept: bool,
cache_enabled: bool,
viewport: impl Into<Option<chromiumoxide::handler::viewport::Viewport>>,
Expand Down Expand Up @@ -142,7 +142,7 @@ pub fn get_browser_config(
/// get chrome configuration headful
#[cfg(feature = "chrome_headed")]
pub fn get_browser_config(
proxies: &Option<Box<Vec<crate::configuration::RequestProxy>>>,
proxies: &Option<Vec<crate::configuration::RequestProxy>>,
intercept: bool,
cache_enabled: bool,
viewport: impl Into<Option<chromiumoxide::handler::viewport::Viewport>>,
Expand Down
18 changes: 9 additions & 9 deletions spider/src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ pub struct Website {
/// Extra links to crawl.
extra_links: Box<HashSet<CaseInsensitiveString>>,
/// Pages visited.
pages: Option<Box<Vec<Page>>>,
pages: Option<Vec<Page>>,
/// Robot.txt parser.
robot_file_parser: Option<Box<RobotFileParser>>,
/// Base url of the crawl.
Expand Down Expand Up @@ -944,7 +944,7 @@ impl Website {
}

/// Page getter.
pub fn get_pages(&self) -> Option<&Box<Vec<Page>>> {
pub fn get_pages(&self) -> Option<&Vec<Page>> {
self.pages.as_ref()
}

Expand Down Expand Up @@ -2703,7 +2703,7 @@ impl Website {
let mut rx2 = w.subscribe(0).expect("receiver enabled");

if self.pages.is_none() {
self.pages = Some(Box::new(Vec::new()));
self.pages = Some(Vec::new());
}

let crawl = async move {
Expand Down Expand Up @@ -2734,7 +2734,7 @@ impl Website {
let mut rx2 = w.subscribe(0).expect("receiver enabled");

if self.pages.is_none() {
self.pages = Some(Box::new(Vec::new()));
self.pages = Some(Vec::new());
}
let crawl = async move {
w.crawl_raw().await;
Expand Down Expand Up @@ -2764,7 +2764,7 @@ impl Website {
let mut rx2 = w.subscribe(0).expect("receiver enabled");

if self.pages.is_none() {
self.pages = Some(Box::new(Vec::new()));
self.pages = Some(Vec::new());
}

let crawl = async move {
Expand Down Expand Up @@ -2795,7 +2795,7 @@ impl Website {
let mut rx2 = w.subscribe(0).expect("receiver enabled");

if self.pages.is_none() {
self.pages = Some(Box::new(Vec::new()));
self.pages = Some(Vec::new());
}

let crawl = async move {
Expand Down Expand Up @@ -4685,7 +4685,7 @@ impl Website {
/// Use proxies for request with control between chrome and http.
pub fn with_proxies_direct(
&mut self,
proxies: Option<Box<Vec<crate::configuration::RequestProxy>>>,
proxies: Option<Vec<crate::configuration::RequestProxy>>,
) -> &mut Self {
self.configuration.with_proxies_direct(proxies);
self
Expand Down Expand Up @@ -5507,9 +5507,9 @@ async fn crawl_invalid() {
#[tokio::test]
async fn not_crawl_blacklist() {
let mut website: Website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url = Some(Box::new(Vec::from([CompactString::from(
website.configuration.blacklist_url = Some(Vec::from([CompactString::from(
"https://choosealicense.com/licenses/",
)])));
)]));

website.crawl().await;
assert!(
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.30.24"
version = "2.31.1"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.30.24"
version = "2.31.1"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.30.24"
version = "2.31.1"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.30.24"
version = "2.31.1"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.30.24"
version = "2.31.1"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 022b6bd

Please sign in to comment.