Skip to content

Commit

Permalink
ARROW-18235: [C++][Gandiva] Fix the like function implementation for …
Browse files Browse the repository at this point in the history
…escape chars (apache#14579)

 The current optimisation of the like function removes every occurrence
 of the escape char from the pattern. This causes matching errors when the
 escape char is one of the PCRE special chars or when the escape char is
 followed by itself.

 Fix this by only removing the '\\' escape char during optimisation.

Authored-by: Siddhant Rao <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
  • Loading branch information
siddhantrao23 committed Nov 22, 2022
1 parent c3c98b8 commit c39b8a6
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 10 deletions.
9 changes: 1 addition & 8 deletions cpp/src/gandiva/regex_functions_holder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,7 @@ RE2 LikeHolder::ends_with_regex_(R"(\.\*([^\.\*])*)");
RE2 LikeHolder::is_substr_regex_(R"(\.\*([^\.\*])*\.\*)");

// Strips escape characters from `pattern` in place and returns a reference to
// that same string.
//
// NOTE(review): this is a flattened diff view with the +/- markers lost. The
// header for this file reports "1 addition & 8 deletions", so the eight-line
// if/else below is presumably the removed code and the single unconditional
// erase after it the added line -- confirm against the commit.
std::string& RemovePatternEscapeChars(const FunctionNode& node, std::string& pattern) {
// Presumably the removed version: when the node does not have exactly two
// children, child index 2 is read as a LiteralNode and the first character of
// its string value is erased from every position in the pattern.
if (node.children().size() != 2) {
// The dynamic_cast result is dereferenced below without a null check.
auto escape_char = dynamic_cast<LiteralNode*>(node.children().at(2).get());
pattern.erase(std::remove(pattern.begin(), pattern.end(),
arrow::util::get<std::string>(escape_char->holder()).at(0)),
pattern.end()); // remove escape chars
} else {
// Exactly two children: every backslash is erased from the pattern.
pattern.erase(std::remove(pattern.begin(), pattern.end(), '\\'), pattern.end());
}
// Presumably the added line: every backslash is erased regardless of how many
// children the node has; `node` is not referenced by this statement.
pattern.erase(std::remove(pattern.begin(), pattern.end(), '\\'), pattern.end());
return pattern;
}

Expand Down
13 changes: 11 additions & 2 deletions cpp/src/gandiva/regex_functions_holder_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class TestLikeHolder : public ::testing::Test {
auto pattern_node =
std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
auto escape_char_node = std::make_shared<LiteralNode>(
arrow::int8(), LiteralHolder((int8_t)escape_char), false);
arrow::utf8(), LiteralHolder(std::string(1, escape_char)), false);
return FunctionNode("like", {field, pattern_node, escape_char_node},
arrow::boolean());
}
Expand Down Expand Up @@ -177,7 +177,16 @@ TEST_F(TestLikeHolder, TestOptimise) {
fnode = LikeHolder::TryOptimize(BuildLike("\\%xyz", '\\'));
EXPECT_EQ(fnode.descriptor()->name(), "like");
EXPECT_EQ(fnode.ToString(),
"bool like((string) in, (const string) '\\%xyz', (const int8) \\)");
"bool like((string) in, (const string) '\\%xyz', (const string) '\\')");

// optimised for escape pattern that are pcre special chars.
fnode = LikeHolder::TryOptimize(BuildLike("%ab^_cd^_de%", '^'));
EXPECT_EQ(fnode.descriptor()->name(), "is_substr");
EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) 'ab_cd_de')");

fnode = LikeHolder::TryOptimize(BuildLike("%ab^^cd^^de%", '^'));
EXPECT_EQ(fnode.descriptor()->name(), "is_substr");
EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) 'ab^cd^de')");
}

TEST_F(TestLikeHolder, TestMatchOneEscape) {
Expand Down
33 changes: 33 additions & 0 deletions cpp/src/gandiva/tests/filter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,39 @@ TEST_F(TestFilter, TestLike) {

// Validate results
EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());

auto literal_escape_pattern =
TreeExprBuilder::MakeStringLiteral("%tu^_security^_freeze%");
auto escape_char = TreeExprBuilder::MakeStringLiteral("^");
like_func = TreeExprBuilder::MakeFunction(
"like", {node_f0, literal_escape_pattern, escape_char}, boolean());

condition = TreeExprBuilder::MakeCondition(like_func);

status = Filter::Make(schema, condition, TestConfiguration(), &filter);
EXPECT_TRUE(status.ok());

// Create a row-batch with some sample data
num_records = 5;
array0 = MakeArrowArrayUtf8(
{"AAAtu_security_freezeBBB", "hello", "bye", "abc-x", "AAAtusecurityfreezeBBB"},
{true, true, true, true, true});

// expected output (indices for which condition matches)
exp = MakeArrowArrayUint16({0});

// prepare input record batch
in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});

status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
EXPECT_TRUE(status.ok());

// Evaluate expression
status = filter->Evaluate(*in_batch, selection_vector);
EXPECT_TRUE(status.ok());

// Validate results
EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
}

} // namespace gandiva

0 comments on commit c39b8a6

Please sign in to comment.