From 705dd0e209629d9d9202bc15ec9ae381a5521d4f Mon Sep 17 00:00:00 2001 From: Dima <111751109+Dimchikkk@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:21:38 +0000 Subject: [PATCH] improve performance of regexp_count (#13364) * improve performance of regexp_count * fix clippy * collect with Int64Array to eliminate one temp Vec --------- Co-authored-by: Dima --- datafusion/functions/src/regex/regexpcount.rs | 82 +++++++++---------- 1 file changed, 39 insertions(+), 43 deletions(-) diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs index 7c4313effffb..1286c6b5b1bc 100644 --- a/datafusion/functions/src/regex/regexpcount.rs +++ b/datafusion/functions/src/regex/regexpcount.rs @@ -30,7 +30,6 @@ use datafusion_expr::{ }; use itertools::izip; use regex::Regex; -use std::collections::hash_map::Entry; use std::collections::HashMap; use std::sync::{Arc, OnceLock}; @@ -312,12 +311,12 @@ where let pattern = compile_regex(regex, flags_scalar)?; - Ok(Arc::new(Int64Array::from_iter_values( + Ok(Arc::new( values .iter() .map(|value| count_matches(value, &pattern, start_scalar)) - .collect::, ArrowError>>()?, - ))) + .collect::>()?, + )) } (true, true, false) => { let regex = match regex_scalar { @@ -336,17 +335,17 @@ where ))); } - Ok(Arc::new(Int64Array::from_iter_values( + Ok(Arc::new( values .iter() .zip(flags_array.iter()) .map(|(value, flags)| { let pattern = compile_and_cache_regex(regex, flags, &mut regex_cache)?; - count_matches(value, &pattern, start_scalar) + count_matches(value, pattern, start_scalar) }) - .collect::, ArrowError>>()?, - ))) + .collect::>()?, + )) } (true, false, true) => { let regex = match regex_scalar { @@ -360,13 +359,13 @@ where let start_array = start_array.unwrap(); - Ok(Arc::new(Int64Array::from_iter_values( + Ok(Arc::new( values .iter() .zip(start_array.iter()) .map(|(value, start)| count_matches(value, &pattern, start)) - .collect::, ArrowError>>()?, - ))) + .collect::>()?, + )) } (true, false, false) => { let regex = match regex_scalar { @@ -385,7 +384,7 @@ where ))); } - Ok(Arc::new(Int64Array::from_iter_values( + Ok(Arc::new( izip!( values.iter(), start_array.unwrap().iter(), @@ -395,10 +394,10 @@ where let pattern = compile_and_cache_regex(regex, flags, &mut regex_cache)?; - count_matches(value, &pattern, start) + count_matches(value, pattern, start) }) - .collect::, ArrowError>>()?, - ))) + .collect::>()?, + )) } (false, true, true) => { if values.len() != regex_array.len() { @@ -409,7 +408,7 @@ where ))); } - Ok(Arc::new(Int64Array::from_iter_values( + Ok(Arc::new( values .iter() .zip(regex_array.iter()) @@ -424,10 +423,10 @@ where flags_scalar, &mut regex_cache, )?; - count_matches(value, &pattern, start_scalar) + count_matches(value, pattern, start_scalar) }) - .collect::, ArrowError>>()?, - ))) + .collect::>()?, + )) } (false, true, false) => { if values.len() != regex_array.len() { @@ -447,7 +446,7 @@ where ))); } - Ok(Arc::new(Int64Array::from_iter_values( + Ok(Arc::new( izip!(values.iter(), regex_array.iter(), flags_array.iter()) .map(|(value, regex, flags)| { let regex = match regex { @@ -458,10 +457,10 @@ where let pattern = compile_and_cache_regex(regex, flags, &mut regex_cache)?; - count_matches(value, &pattern, start_scalar) + count_matches(value, pattern, start_scalar) }) - .collect::, ArrowError>>()?, - ))) + .collect::>()?, + )) } (false, false, true) => { if values.len() != regex_array.len() { @@ -481,7 +480,7 @@ where ))); } - Ok(Arc::new(Int64Array::from_iter_values( + Ok(Arc::new( izip!(values.iter(), regex_array.iter(), start_array.iter()) .map(|(value, regex, start)| { let regex = match regex { @@ -494,10 +493,10 @@ where flags_scalar, &mut regex_cache, )?; - count_matches(value, &pattern, start) + count_matches(value, pattern, start) }) - .collect::, ArrowError>>()?, - ))) + .collect::>()?, + )) } (false, false, false) => { if values.len() != regex_array.len() { @@ -526,7 +525,7 @@ where ))); } - Ok(Arc::new(Int64Array::from_iter_values( + Ok(Arc::new( izip!( values.iter(), regex_array.iter(), @@ -541,27 +540,24 @@ where let pattern = compile_and_cache_regex(regex, flags, &mut regex_cache)?; - count_matches(value, &pattern, start) + count_matches(value, pattern, start) }) - .collect::, ArrowError>>()?, - ))) + .collect::>()?, + )) } } } -fn compile_and_cache_regex( - regex: &str, - flags: Option<&str>, - regex_cache: &mut HashMap, -) -> Result { - match regex_cache.entry(regex.to_string()) { - Entry::Vacant(entry) => { - let compiled = compile_regex(regex, flags)?; - entry.insert(compiled.clone()); - Ok(compiled) - } - Entry::Occupied(entry) => Ok(entry.get().to_owned()), +fn compile_and_cache_regex<'a>( + regex: &'a str, + flags: Option<&'a str>, + regex_cache: &'a mut HashMap, +) -> Result<&'a Regex, ArrowError> { + if !regex_cache.contains_key(regex) { + let compiled = compile_regex(regex, flags)?; + regex_cache.insert(regex.to_string(), compiled); } + Ok(regex_cache.get(regex).unwrap()) } fn compile_regex(regex: &str, flags: Option<&str>) -> Result {