Skip to content

Commit

Permalink
Optimize LIKE for more relaxed patterns (facebookincubator#8594)
Browse files Browse the repository at this point in the history
Summary:
This PR optimizes LIKE operations for what I call
kRelaxed[Prefix|Suffix] patterns, e.g.

- kRelaxedPrefix: _a_bc%%
- kRelaxedSuffix: %%_a_bc

'Relaxed' here means these patterns have fewer restrictions than their
non-relaxed counterparts. The algorithm for recognizing these relaxed patterns
is best explained by an example. Say we have the pattern '___hello___%%'; it is
split into 4 sub-patterns:

- [0] kSingleCharWildcard: ___
- [1] kLiteralString: hello
- [2] kSingleCharWildcard: ___
- [3] kAnyCharsWildcard: %%

Since kAnyCharsWildcard occurs only at the end of the pattern, we can
determine that it is a kRelaxedPrefix pattern, and then use the first 3
fixed-length sub-patterns to do the matching.

The benchmark result:

Before(kGeneric):

```
============================================================================
[...]hmarks/ExpressionBenchmarkBuilder.cpp     relative  time/iter   iters/s
============================================================================
like_generic##like_generic                                   1.34s   747.38m
----------------------------------------------------------------------------
----------------------------------------------------------------------------
like_prefix##like_prefix                                  340.30ms      2.94
like_prefix##like_relaxed_prefix_1                        334.77ms      2.99
like_prefix##like_relaxed_prefix_2                        350.70ms      2.85
like_prefix##starts_with                                    5.35ms    187.05
like_substring##like_substring                               1.26s   790.87m
like_substring##strpos                                     20.55ms     48.67
like_suffix##like_suffix                                  957.06ms      1.04
like_suffix##like_relaxed_suffix_1                        935.90ms      1.07
like_suffix##like_relaxed_suffix_2                           1.08s   926.79m
like_suffix##ends_with                                      5.35ms    187.07
```

After(kRelaxedPrefix, kRelaxedSuffix):

```
============================================================================
[...]hmarks/ExpressionBenchmarkBuilder.cpp     relative  time/iter   iters/s
============================================================================
like_generic##like_generic                                   1.48s   674.92m
----------------------------------------------------------------------------
----------------------------------------------------------------------------
like_prefix##like_prefix                                    7.05ms    141.80
like_prefix##like_relaxed_prefix_1                          9.06ms    110.36
like_prefix##like_relaxed_prefix_2                          8.55ms    116.94
like_prefix##starts_with                                    5.34ms    187.22
like_substring##like_substring                             22.47ms     44.50
like_substring##strpos                                     20.72ms     48.27
like_suffix##like_suffix                                    7.05ms    141.82
like_suffix##like_relaxed_suffix_1                          9.08ms    110.16
like_suffix##like_relaxed_suffix_2                          8.52ms    117.30
like_suffix##ends_with                                      5.35ms    187.07
```

The speedup for kRelaxedPrefix is about 40x; the speedup for kRelaxedSuffix is about 100x.

Pull Request resolved: facebookincubator#8594

Reviewed By: Yuhta

Differential Revision: D53264233

Pulled By: mbasmanova

fbshipit-source-id: 5d5b3b639dbaee83194c840aba0ba7de8f978e77
  • Loading branch information
xumingming authored and facebook-github-bot committed Feb 1, 2024
1 parent 790d6bd commit fc06455
Show file tree
Hide file tree
Showing 4 changed files with 970 additions and 305 deletions.
88 changes: 58 additions & 30 deletions velox/benchmarks/basic/LikeBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,59 +35,87 @@ int main(int argc, char** argv) {
// Register the scalar functions.
prestosql::registerAllScalarFunctions("");

// exec::register
ExpressionBenchmarkBuilder benchmarkBuilder;
const vector_size_t vectorSize = 1000;
auto vectorMaker = benchmarkBuilder.vectorMaker();

auto makeInput =
[&](vector_size_t vectorSize, bool padAtHead, bool padAtTail) {
return vectorMaker.flatVector<std::string>(vectorSize, [&](auto row) {
// Strings in even rows contain/start with/end with a_b_c depends on
// value of padAtHead && padAtTail.
if (row % 2 == 0) {
auto padding = std::string(row / 2 + 1, 'x');
if (padAtHead && padAtTail) {
return fmt::format("{}a_b_c{}", padding, padding);
} else if (padAtHead) {
return fmt::format("{}a_b_c", padding);
} else if (padAtTail) {
return fmt::format("a_b_c{}", padding);
} else {
return std::string("a_b_c");
}
} else {
return std::string(row, 'x');
}
});
};
auto makeInput = [&](vector_size_t vectorSize,
bool padAtHead,
bool padAtTail,
std::string content = "a_b_c",
std::string paddingStr = "xxx") {
return vectorMaker.flatVector<std::string>(vectorSize, [&](auto row) {
// Strings in even rows contain/start with/end with a_b_c depends on
// value of padAtHead && padAtTail.

// Calculates the padding.
std::ostringstream os;
for (auto i = 0; i < row / 2 + 1; ++i) {
os << paddingStr;
}
auto padding = os.str();

if (row % 2 == 0) {
if (padAtHead && padAtTail) {
return fmt::format("{}{}{}", padding, content, padding);
} else if (padAtHead) {
return fmt::format("{}{}", padding, content);
} else if (padAtTail) {
return fmt::format("{}{}", content, padding);
} else {
return content;
}
} else {
// Yes, two padding concatenated, since we have a '/2' above.
return padding + padding;
}
});
};

auto substringInput = makeInput(vectorSize, true, true);
auto prefixInput = makeInput(vectorSize, false, true);
auto prefixUnicodeInput = makeInput(vectorSize, false, true, "你_好_啊");
auto suffixInput = makeInput(vectorSize, true, false);
auto suffixUnicodeInput = makeInput(vectorSize, true, false, "你_好_啊");

benchmarkBuilder
.addBenchmarkSet(
"like_substring", vectorMaker.rowVector({"col0"}, {substringInput}))
.addExpression("like_substring", R"(like(col0, '%a\_b\_c%', '\'))")
"substring", vectorMaker.rowVector({"col0"}, {substringInput}))
.addExpression("substring", R"(like(col0, '%a\_b\_c%', '\'))")
.addExpression("strpos", R"(strpos(col0, 'a_b_c') > 0)");

benchmarkBuilder
.addBenchmarkSet(
"like_prefix", vectorMaker.rowVector({"col0"}, {prefixInput}))
.addExpression("like_prefix", R"(like(col0, 'a\_b\_c%', '\'))")
"prefix",
vectorMaker.rowVector(
{"col0", "col1"}, {prefixInput, prefixUnicodeInput}))
.addExpression("prefix", R"(like(col0, 'a\_b\_c%', '\'))")
.addExpression("relaxed_prefix_1", R"(like(col0, 'a\__\_c%', '\'))")
.addExpression("relaxed_prefix_2", R"(like(col0, '_\__\_c%', '\'))")
.addExpression(
"relaxed_prefix_unicode_1", R"(like(col1, '你\__\_啊%', '\'))")
.addExpression(
"relaxed_prefix_unicode_2", R"(like(col1, '_\__\_啊%', '\'))")
.addExpression("starts_with", R"(starts_with(col0, 'a_b_c'))");

benchmarkBuilder
.addBenchmarkSet(
"like_suffix", vectorMaker.rowVector({"col0"}, {suffixInput}))
.addExpression("like_suffix", R"(like(col0, '%a\_b\_c', '\'))")
"suffix",
vectorMaker.rowVector(
{"col0", "col1"}, {suffixInput, suffixUnicodeInput}))
.addExpression("suffix", R"(like(col0, '%a\_b\_c', '\'))")
.addExpression("relaxed_suffix_1", R"(like(col0, '%a\__\_c', '\'))")
.addExpression("relaxed_suffix_2", R"(like(col0, '%_\__\_c', '\'))")
.addExpression(
"relaxed_suffix_unicode_1", R"(like(col1, '%你\__\_啊', '\'))")
.addExpression(
"relaxed_suffix_unicode_2", R"(like(col1, '%_\__\_啊', '\'))")
.addExpression("ends_with", R"(ends_with(col0, 'a_b_c'))");

benchmarkBuilder
.addBenchmarkSet(
"like_generic", vectorMaker.rowVector({"col0"}, {substringInput}))
.addExpression("like_generic", R"(like(col0, '%a%b%c'))");
"generic", vectorMaker.rowVector({"col0"}, {substringInput}))
.addExpression("generic", R"(like(col0, '%a%b%c'))");

benchmarkBuilder.registerBenchmarks();
benchmarkBuilder.testBenchmarks();
Expand Down
Loading

0 comments on commit fc06455

Please sign in to comment.