Skip to content

Commit

Permalink
feat: Add 'expression.max_compiled_regexes' Query Config (#11850)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #11850

Adding 'expression.max_compiled_regexes' Query Config property, so it
can be adjusted per query.
Also increasing the default value to 100 from 20.

Reviewed By: yuandagits

Differential Revision: D67183811

fbshipit-source-id: 5ccc6defe2dbd8829d8627a931944d4da203297c
  • Loading branch information
Sergey Pershin authored and facebook-github-bot committed Dec 14, 2024
1 parent e27c8f3 commit 1779351
Show file tree
Hide file tree
Showing 10 changed files with 185 additions and 74 deletions.
9 changes: 9 additions & 0 deletions velox/core/QueryConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ class QueryConfig {
static constexpr const char* kExprMaxArraySizeInReduce =
"expression.max_array_size_in_reduce";

/// Controls maximum number of compiled regular expression patterns per
/// function instance per thread of execution.
static constexpr const char* kExprMaxCompiledRegexes =
"expression.max_compiled_regexes";

/// Used for backpressure to block local exchange producers when the local
/// exchange buffer reaches or exceeds this size.
static constexpr const char* kMaxLocalExchangeBufferSize =
Expand Down Expand Up @@ -617,6 +622,10 @@ class QueryConfig {
return get<uint64_t>(kExprMaxArraySizeInReduce, 100'000);
}

/// Returns the value of the 'expression.max_compiled_regexes' property,
/// falling back to 100 when the property is not set.
uint64_t exprMaxCompiledRegexes() const {
  constexpr uint64_t kDefaultMaxCompiledRegexes{100};
  return get<uint64_t>(kExprMaxCompiledRegexes, kDefaultMaxCompiledRegexes);
}

bool adjustTimestampToTimezone() const {
return get<bool>(kAdjustTimestampToTimezone, false);
}
Expand Down
4 changes: 4 additions & 0 deletions velox/docs/configs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ Expression Evaluation Configuration
- integer
- 100000
- ``Reduce`` function will throw an error if it encounters an array of size greater than this.
* - expression.max_compiled_regexes
- integer
- 100
- Controls maximum number of compiled regular expression patterns per function instance per thread of execution.
* - debug_disable_expression_with_peeling
- bool
- false
Expand Down
6 changes: 3 additions & 3 deletions velox/docs/functions/spark/regexp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ See https://github.com/google/re2/wiki/Syntax for more information.
Note: The wildcard '%' represents 0, 1 or multiple characters and the
wildcard '_' represents exactly one character.

Note: Each function instance allow for a maximum of 20 regular expressions to
be compiled per thread of execution. Not all patterns require
compilation of regular expressions. Patterns 'hello', 'hello%', '_hello__%',
Note: Each function instance allows for a maximum of ``expression.max_compiled_regexes``
(default 100) regular expressions to be compiled per thread of execution. Not all patterns
require compilation of regular expressions. Patterns 'hello', 'hello%', '_hello__%',
'%hello', '%__hello_', '%hello%', where 'hello', 'velox'
contains only regular characters and '_' wildcards are evaluated without
using regular expressions. Only those patterns that require the compilation of
Expand Down
67 changes: 43 additions & 24 deletions velox/functions/lib/Re2Functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ Expected<RE2*> ReCache::tryFindOrCompile(const StringView& pattern) {
return reIt->second.get();
}

if (cache_.size() >= kMaxCompiledRegexes) {
if (cache_.size() >= maxCompiledRegexes_) {
return folly::makeUnexpected(
Status::UserError("Max number of regex reached"));
}
Expand Down Expand Up @@ -239,6 +239,8 @@ class Re2MatchConstantPattern final : public exec::VectorFunction {
template <bool (*Fn)(StringView, const RE2&)>
class Re2Match final : public exec::VectorFunction {
public:
// 'maxCompiledRegexes' is forwarded unchanged to the regex cache of this
// function instance. The parameter is unsigned to match the type returned by
// QueryConfig::exprMaxCompiledRegexes(), avoiding an unsigned -> signed ->
// unsigned round trip.
explicit Re2Match(uint64_t maxCompiledRegexes) : cache_(maxCompiledRegexes) {}

void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
Expand Down Expand Up @@ -359,8 +361,8 @@ class Re2SearchAndExtractConstantPattern final : public exec::VectorFunction {
template <typename T>
class Re2SearchAndExtract final : public exec::VectorFunction {
public:
explicit Re2SearchAndExtract(bool emptyNoMatch)
: emptyNoMatch_(emptyNoMatch) {}
// 'emptyNoMatch' is stored as is; 'maxCompiledRegexes' is forwarded unchanged
// to the regex cache of this function instance. The limit is unsigned to match
// the type returned by QueryConfig::exprMaxCompiledRegexes().
explicit Re2SearchAndExtract(bool emptyNoMatch, uint64_t maxCompiledRegexes)
    : emptyNoMatch_(emptyNoMatch), cache_(maxCompiledRegexes) {}
void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
Expand Down Expand Up @@ -886,11 +888,15 @@ class LikeWithRe2 final : public exec::VectorFunction {
};

// This function is constructed when pattern or escape are not constants.
// It allows up to kMaxCompiledRegexes different regular expressions to be
// compiled throughout the query lifetime per expression and thread of
// execution, note that optimized regular expressions that are not compiled are
// not counted.
// It allows up to 'expression.max_compiled_regexes' different regular
// expressions to be compiled throughout the query lifetime per expression and
// thread of execution, note that optimized regular expressions that are not
// compiled are not counted.
class LikeGeneric final : public exec::VectorFunction {
public:
// Stores the limit that is later checked against the number of entries in
// 'compiledRegularExpressions_' before another regular expression is added.
// NOTE(review): the limit is kept as int64_t, while
// QueryConfig::exprMaxCompiledRegexes() returns uint64_t and the check
// compares it with a container size - consider switching the parameter and
// the member to uint64_t.
explicit LikeGeneric(int64_t maxCompiledRegexes)
: maxCompiledRegexes_(maxCompiledRegexes) {}

void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
Expand Down Expand Up @@ -1008,7 +1014,7 @@ class LikeGeneric final : public exec::VectorFunction {

VELOX_USER_CHECK_LT(
compiledRegularExpressions_.size(),
kMaxCompiledRegexes,
maxCompiledRegexes_,
"Max number of regex reached");

bool validEscapeUsage;
Expand All @@ -1033,6 +1039,7 @@ class LikeGeneric final : public exec::VectorFunction {
std::pair<std::string, std::optional<char>>,
std::unique_ptr<RE2>>
compiledRegularExpressions_;
int64_t maxCompiledRegexes_;
};

void re2ExtractAll(
Expand Down Expand Up @@ -1145,6 +1152,9 @@ class Re2ExtractAllConstantPattern final : public exec::VectorFunction {
template <typename T>
class Re2ExtractAll final : public exec::VectorFunction {
public:
// 'maxCompiledRegexes' is forwarded unchanged to the regex cache of this
// function instance. The parameter is unsigned to match the type returned by
// QueryConfig::exprMaxCompiledRegexes().
explicit Re2ExtractAll(uint64_t maxCompiledRegexes)
    : cache_(maxCompiledRegexes) {}

void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
Expand Down Expand Up @@ -1204,7 +1214,8 @@ class Re2ExtractAll final : public exec::VectorFunction {
template <bool (*Fn)(StringView, const RE2&)>
std::shared_ptr<exec::VectorFunction> makeRe2MatchImpl(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs) {
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& config) {
if (inputArgs.size() != 2 || !inputArgs[0].type->isVarchar() ||
!inputArgs[1].type->isVarchar()) {
VELOX_UNSUPPORTED(
Expand All @@ -1220,11 +1231,14 @@ std::shared_ptr<exec::VectorFunction> makeRe2MatchImpl(
constantPattern->as<ConstantVector<StringView>>()->valueAt(0));
}

return std::make_shared<Re2Match<Fn>>();
return std::make_shared<Re2Match<Fn>>(config.exprMaxCompiledRegexes());
}

class RegexpReplaceWithLambdaFunction : public exec::VectorFunction {
public:
// 'maxCompiledRegexes' is forwarded unchanged to the regex cache of this
// function instance. The parameter is unsigned to match the type returned by
// QueryConfig::exprMaxCompiledRegexes().
explicit RegexpReplaceWithLambdaFunction(uint64_t maxCompiledRegexes)
    : cache_(maxCompiledRegexes) {}

void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
Expand Down Expand Up @@ -1592,8 +1606,8 @@ class RegexpReplaceWithLambdaFunction : public exec::VectorFunction {
std::shared_ptr<exec::VectorFunction> makeRe2Match(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& /*config*/) {
return makeRe2MatchImpl<re2FullMatch>(name, inputArgs);
const core::QueryConfig& config) {
return makeRe2MatchImpl<re2FullMatch>(name, inputArgs, config);
}

std::vector<std::shared_ptr<exec::FunctionSignature>> re2MatchSignatures() {
Expand All @@ -1608,8 +1622,8 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> re2MatchSignatures() {
std::shared_ptr<exec::VectorFunction> makeRe2Search(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& /*config*/) {
return makeRe2MatchImpl<re2PartialMatch>(name, inputArgs);
const core::QueryConfig& config) {
return makeRe2MatchImpl<re2PartialMatch>(name, inputArgs, config);
}

std::vector<std::shared_ptr<exec::FunctionSignature>> re2SearchSignatures() {
Expand All @@ -1624,7 +1638,7 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> re2SearchSignatures() {
std::shared_ptr<exec::VectorFunction> makeRe2Extract(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& /*config*/,
const core::QueryConfig& config,
const bool emptyNoMatch) {
auto numArgs = inputArgs.size();
VELOX_USER_CHECK(
Expand Down Expand Up @@ -1673,11 +1687,14 @@ std::shared_ptr<exec::VectorFunction> makeRe2Extract(
}
}

const auto maxCompiledRegexes = config.exprMaxCompiledRegexes();
switch (groupIdTypeKind) {
case TypeKind::INTEGER:
return std::make_shared<Re2SearchAndExtract<int32_t>>(emptyNoMatch);
return std::make_shared<Re2SearchAndExtract<int32_t>>(
emptyNoMatch, maxCompiledRegexes);
case TypeKind::BIGINT:
return std::make_shared<Re2SearchAndExtract<int64_t>>(emptyNoMatch);
return std::make_shared<Re2SearchAndExtract<int64_t>>(
emptyNoMatch, maxCompiledRegexes);
default:
VELOX_UNREACHABLE();
}
Expand Down Expand Up @@ -2158,14 +2175,14 @@ PatternMetadata determinePatternKind(
std::shared_ptr<exec::VectorFunction> makeLike(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& /*config*/) {
const core::QueryConfig& config) {
auto numArgs = inputArgs.size();

std::optional<char> escapeChar;
if (numArgs == 3) {
BaseVector* escape = inputArgs[2].constantValue.get();
if (!escape) {
return std::make_shared<LikeGeneric>();
return std::make_shared<LikeGeneric>(config.exprMaxCompiledRegexes());
}

auto constantEscape = escape->as<ConstantVector<StringView>>();
Expand All @@ -2191,7 +2208,7 @@ std::shared_ptr<exec::VectorFunction> makeLike(

BaseVector* constantPattern = inputArgs[1].constantValue.get();
if (!constantPattern) {
return std::make_shared<LikeGeneric>();
return std::make_shared<LikeGeneric>(config.exprMaxCompiledRegexes());
}

if (constantPattern->isNullAt(0)) {
Expand Down Expand Up @@ -2273,7 +2290,7 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> likeSignatures() {
std::shared_ptr<exec::VectorFunction> makeRe2ExtractAll(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& /*config*/) {
const core::QueryConfig& config) {
auto numArgs = inputArgs.size();
VELOX_USER_CHECK(
numArgs == 2 || numArgs == 3,
Expand Down Expand Up @@ -2318,11 +2335,12 @@ std::shared_ptr<exec::VectorFunction> makeRe2ExtractAll(
}
}

const auto maxCompiledRegexes = config.exprMaxCompiledRegexes();
switch (groupIdTypeKind) {
case TypeKind::INTEGER:
return std::make_shared<Re2ExtractAll<int32_t>>();
return std::make_shared<Re2ExtractAll<int32_t>>(maxCompiledRegexes);
case TypeKind::BIGINT:
return std::make_shared<Re2ExtractAll<int64_t>>();
return std::make_shared<Re2ExtractAll<int64_t>>(maxCompiledRegexes);
default:
VELOX_UNREACHABLE();
}
Expand Down Expand Up @@ -2357,7 +2375,8 @@ std::shared_ptr<exec::VectorFunction> makeRegexpReplaceWithLambda(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& config) {
return std::make_shared<RegexpReplaceWithLambdaFunction>();
return std::make_shared<RegexpReplaceWithLambdaFunction>(
config.exprMaxCompiledRegexes());
}

std::vector<std::shared_ptr<exec::FunctionSignature>>
Expand Down
25 changes: 22 additions & 3 deletions velox/functions/lib/Re2Functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,6 @@ class PatternMetadata {
std::vector<std::string> substrings_;
};

inline const int kMaxCompiledRegexes = 20;

/// The functions in this file use RE2 as the regex engine. RE2 is fast, but
/// supports only a subset of PCRE syntax and in particular does not support
/// backtracking and associated features (e.g. backreferences).
Expand Down Expand Up @@ -255,18 +253,26 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> re2ExtractAllSignatures();
namespace detail {

// A cache of compiled regular expressions (RE2 instances). Allows up to
// 'kMaxCompiledRegexes' different expressions.
// 'expression.max_compiled_regexes' different expressions.
//
// Compiling regular expressions is expensive. It can take up to 200 times
// more CPU time to compile a regex vs. evaluate it.
class ReCache {
 public:
  // Creates an empty cache limited to 'maxCompiledRegexes' entries.
  explicit ReCache(uint64_t maxCompiledRegexes)
      : maxCompiledRegexes_{maxCompiledRegexes} {}

  // Looks up 'pattern' in the cache.
  // NOTE(review): presumably the throwing counterpart of tryFindOrCompile();
  // confirm against the definition in Re2Functions.cpp.
  RE2* findOrCompile(const StringView& pattern);

  // Returns the cached RE2 for 'pattern' if there is one. Returns a user
  // error ("Max number of regex reached") if 'pattern' is not cached and the
  // cache already holds 'maxCompiledRegexes_' or more entries.
  Expected<RE2*> tryFindOrCompile(const StringView& pattern);

  // Replaces the entry limit. Only the stored limit is updated; entries
  // already in the cache are not touched by this call.
  void setMaxCompiledRegexes(uint64_t maxCompiledRegexes) {
    maxCompiledRegexes_ = maxCompiledRegexes;
  }

 private:
  // Compiled regular expressions keyed by pattern string.
  folly::F14FastMap<std::string, std::unique_ptr<RE2>> cache_;
  // Maximum number of entries 'cache_' may hold before lookups of new
  // patterns start failing.
  uint64_t maxCompiledRegexes_;
};

} // namespace detail
Expand All @@ -287,6 +293,8 @@ template <
std::string (*prepareRegexpPattern)(const StringView&),
std::string (*prepareRegexpReplacement)(const RE2&, const StringView&)>
struct Re2RegexpReplace {
// Starts with a regex cache limit of 0; initialize() replaces it with the
// value of 'expression.max_compiled_regexes' taken from the query config.
Re2RegexpReplace() : cache_(0) {}

VELOX_DEFINE_FUNCTION_TYPES(T);

FOLLY_ALWAYS_INLINE void initialize(
Expand All @@ -304,6 +312,7 @@ struct Re2RegexpReplace {
processedPattern,
re_->error());
}
cache_.setMaxCompiledRegexes(config.exprMaxCompiledRegexes());

if (replacement != nullptr) {
// Constant 'replacement' with non-constant 'pattern' needs to be
Expand Down Expand Up @@ -377,8 +386,18 @@ struct Re2RegexpReplace {

template <typename TExec>
struct Re2RegexpSplit {
// Starts with a regex cache limit of 0; initialize() replaces it with the
// value of 'expression.max_compiled_regexes' taken from the query config.
Re2RegexpSplit() : cache_(0) {}

VELOX_DEFINE_FUNCTION_TYPES(TExec);

// Applies the 'expression.max_compiled_regexes' query config value as the
// limit of this function's regex cache. The input types and the string and
// pattern arguments are not used.
FOLLY_ALWAYS_INLINE void initialize(
    const std::vector<TypePtr>& /*inputTypes*/,
    const core::QueryConfig& config,
    const arg_type<Varchar>* /*string*/,
    const arg_type<Varchar>* /*pattern*/) {
  const uint64_t maxCompiledRegexes = config.exprMaxCompiledRegexes();
  cache_.setMaxCompiledRegexes(maxCompiledRegexes);
}

static constexpr int32_t reuse_strings_from_arg = 0;

void call(
Expand Down
Loading

0 comments on commit 1779351

Please sign in to comment.