From 6cdd6ea94adbae063e7fb6d9da71dac0c86adc12 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 27 Oct 2024 07:35:26 -0400 Subject: [PATCH] `stats`: perf refactoring - optimize init_date_inference helper fn - use fast_float crate when converting bytes to float - removed unnecessary from_bytes helper as fast-float can convert directly from bytes --- src/cmd/stats.rs | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 0b17d1dce..e845ade91 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -255,8 +255,7 @@ use itertools::Itertools; use qsv_dateparser::parse_with_preference; use serde::{Deserialize, Serialize}; use simd_json::{prelude::ValueAsScalar, OwnedValue}; -use simdutf8::basic::from_utf8; -use smallvec::{smallvec, SmallVec}; +use smallvec::SmallVec; use stats::{merge_all, Commute, MinMax, OnlineStats, Unsorted}; use tempfile::NamedTempFile; use threadpool::ThreadPool; @@ -1175,28 +1174,27 @@ fn init_date_inference( if !infer_dates { // we're not inferring dates, set INFER_DATE_FLAGS to all false INFER_DATE_FLAGS - .set(smallvec![false; headers.len()]) - .map_err(|e| format!("Cannot init empty date inference flags: {e:?}"))?; + .set(SmallVec::from_elem(false, headers.len())) + .map_err(|_| "Cannot init empty date inference flags".to_string())?; return Ok(()); } let infer_date_flags = if flag_whitelist.eq_ignore_ascii_case("all") { log::info!("inferring dates for ALL fields"); - smallvec![true; headers.len()] + SmallVec::from_elem(true, headers.len()) } else { let mut header_str = String::new(); let whitelist_lower = flag_whitelist.to_lowercase(); - // log::info!("inferring dates with date-whitelist: {whitelist_lower}"); + log::info!("inferring dates with date-whitelist: {whitelist_lower}"); - let whitelist = whitelist_lower - .split(',') - .map(str::trim) - .collect::>(); + let whitelist: SmallVec<[&str; 8]> = whitelist_lower.split(',').map(str::trim).collect(); headers .iter() .map(|header| { - // safety: we know the header is a valid String, so we can use unwrap - util::to_lowercase_into(&from_bytes::(header).unwrap(), &mut header_str); + util::to_lowercase_into( + simdutf8::basic::from_utf8(header).unwrap_or_default(), + &mut header_str, + ); whitelist .iter() .any(|whitelist_item| header_str.contains(whitelist_item)) @@ -1346,7 +1344,7 @@ impl Stats { } } else { // safety: we know the sample is a valid f64, so we can use unwrap - let n = from_bytes::(sample).unwrap(); + let n = fast_float::parse(sample).unwrap(); if let Some(v) = self.median.as_mut() { v.add(n); } @@ -1562,7 +1560,7 @@ impl Stats { let stotlen = if let Some((stotlen_work, sum)) = self.sum.as_ref().and_then(|sum| sum.show(typ)) { if typ == FieldType::TFloat { - if let Ok(f64_val) = sum.parse::() { + if let Ok(f64_val) = fast_float::parse::(sum.as_bytes()) { pieces.push(util::round_num(f64_val, round_places)); } else { pieces.push(format!("ERROR: Cannot convert {sum} to a float.")); @@ -1865,7 +1863,7 @@ impl FieldType { } // Check for float - if s.parse::().is_ok() { + if fast_float::parse::(s.as_bytes()).is_ok() { return (FieldType::TFloat, None); } @@ -1965,7 +1963,7 @@ impl TypedSum { match typ { TFloat => { self.stotlen = self.stotlen.saturating_add(sample.len() as u64); - if let Some(float_sample) = from_bytes::(sample) { + if let Ok(float_sample) = fast_float::parse::(sample) { if let Some(ref mut f) = self.float { *f += float_sample; } else { @@ -1977,7 +1975,7 @@ impl TypedSum { self.stotlen = self.stotlen.saturating_add(sample.len() as u64); if let Some(ref mut float) = self.float { // safety: we know that the sample is a valid f64 - *float += from_bytes::(sample).unwrap(); + *float += fast_float::parse::(sample).unwrap(); } else { // so we don't panic on overflow/underflow, use saturating_add self.integer = self @@ -2060,7 +2058,7 @@ impl TypedMinMax { match typ { TString | TNull => {}, TFloat => { - let n = from_utf8(sample).unwrap().parse::().unwrap(); + let n = fast_float::parse::(sample).unwrap(); self.floats.add(n); self.integers.add(n as i64); @@ -2175,13 +2173,3 @@ impl Commute for TypedMinMax { self.dates.merge(other.dates); } } - -#[allow(clippy::inline_always)] -#[inline(always)] -fn from_bytes(bytes: &[u8]) -> Option { - if let Ok(x) = simdutf8::basic::from_utf8(bytes) { - x.parse().ok() - } else { - None - } -}