Skip to content

Commit fc06455

Browse files
xumingmingfacebook-github-bot
authored andcommitted
Optimize LIKE for more relaxed patterns (apache#8594)
Summary: In this PR we optimize LIKE operations for patterns which I call them kRelaxed[Prefix|Suffix] patterns, e.g. - kRelaxedPrefix: _a_bc%% - kRelaxedSuffix: %%_a_bc 'Relaxed' here means there is less restrictions than their counterparts. The algorithm of recognizing these relaxed patterns can be explained by an example, say we have a pattern '___hello___%%', it is split into 4 sub patterns: - [0] kSingleCharWildcard: ___ - [1] kLiteralString: hello - [2] kSingleCharWildcard: ___ - [3] kAnyCharsWildcard: %% Since the 'kAnyCharsWildcard' only occurs at the end of the pattern, we can determine it is a kRelaxedPrefix pattern, and then use the first 3 fixed sub-patterns to do the matching. The benchmark result: Before(kGeneric): ``` ============================================================================ [...]hmarks/ExpressionBenchmarkBuilder.cpp relative time/iter iters/s ============================================================================ like_generic##like_generic 1.34s 747.38m ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- like_prefix##like_prefix 340.30ms 2.94 like_prefix##like_relaxed_prefix_1 334.77ms 2.99 like_prefix##like_relaxed_prefix_2 350.70ms 2.85 like_prefix##starts_with 5.35ms 187.05 like_substring##like_substring 1.26s 790.87m like_substring##strpos 20.55ms 48.67 like_suffix##like_suffix 957.06ms 1.04 like_suffix##like_relaxed_suffix_1 935.90ms 1.07 like_suffix##like_relaxed_suffix_2 1.08s 926.79m like_suffix##ends_with 5.35ms 187.07 ``` After(kRelaxedPrefix, kRelaxedSuffix): ``` ============================================================================ [...]hmarks/ExpressionBenchmarkBuilder.cpp relative time/iter iters/s ============================================================================ like_generic##like_generic 1.48s 674.92m ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- like_prefix##like_prefix 7.05ms 141.80 like_prefix##like_relaxed_prefix_1 9.06ms 110.36 like_prefix##like_relaxed_prefix_2 8.55ms 116.94 like_prefix##starts_with 5.34ms 187.22 like_substring##like_substring 22.47ms 44.50 like_substring##strpos 20.72ms 48.27 like_suffix##like_suffix 7.05ms 141.82 like_suffix##like_relaxed_suffix_1 9.08ms 110.16 like_suffix##like_relaxed_suffix_2 8.52ms 117.30 like_suffix##ends_with 5.35ms 187.07 ``` The speedup for kRelaxedPrefix is about 40x, speedup for kRelaxedSuffix is about 100x. Pull Request resolved: facebookincubator/velox#8594 Reviewed By: Yuhta Differential Revision: D53264233 Pulled By: mbasmanova fbshipit-source-id: 5d5b3b639dbaee83194c840aba0ba7de8f978e77
1 parent 790d6bd commit fc06455

File tree

4 files changed

+970
-305
lines changed

4 files changed

+970
-305
lines changed

velox/benchmarks/basic/LikeBenchmark.cpp

Lines changed: 58 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -35,59 +35,87 @@ int main(int argc, char** argv) {
3535
// Register the scalar functions.
3636
prestosql::registerAllScalarFunctions("");
3737

38-
// exec::register
3938
ExpressionBenchmarkBuilder benchmarkBuilder;
4039
const vector_size_t vectorSize = 1000;
4140
auto vectorMaker = benchmarkBuilder.vectorMaker();
4241

43-
auto makeInput =
44-
[&](vector_size_t vectorSize, bool padAtHead, bool padAtTail) {
45-
return vectorMaker.flatVector<std::string>(vectorSize, [&](auto row) {
46-
// Strings in even rows contain/start with/end with a_b_c depends on
47-
// value of padAtHead && padAtTail.
48-
if (row % 2 == 0) {
49-
auto padding = std::string(row / 2 + 1, 'x');
50-
if (padAtHead && padAtTail) {
51-
return fmt::format("{}a_b_c{}", padding, padding);
52-
} else if (padAtHead) {
53-
return fmt::format("{}a_b_c", padding);
54-
} else if (padAtTail) {
55-
return fmt::format("a_b_c{}", padding);
56-
} else {
57-
return std::string("a_b_c");
58-
}
59-
} else {
60-
return std::string(row, 'x');
61-
}
62-
});
63-
};
42+
auto makeInput = [&](vector_size_t vectorSize,
43+
bool padAtHead,
44+
bool padAtTail,
45+
std::string content = "a_b_c",
46+
std::string paddingStr = "xxx") {
47+
return vectorMaker.flatVector<std::string>(vectorSize, [&](auto row) {
48+
// Strings in even rows contain/start with/end with a_b_c depends on
49+
// value of padAtHead && padAtTail.
50+
51+
// Calculates the padding.
52+
std::ostringstream os;
53+
for (auto i = 0; i < row / 2 + 1; ++i) {
54+
os << paddingStr;
55+
}
56+
auto padding = os.str();
57+
58+
if (row % 2 == 0) {
59+
if (padAtHead && padAtTail) {
60+
return fmt::format("{}{}{}", padding, content, padding);
61+
} else if (padAtHead) {
62+
return fmt::format("{}{}", padding, content);
63+
} else if (padAtTail) {
64+
return fmt::format("{}{}", content, padding);
65+
} else {
66+
return content;
67+
}
68+
} else {
69+
// Yes, two padding concatenated, since we have a '/2' above.
70+
return padding + padding;
71+
}
72+
});
73+
};
6474

6575
auto substringInput = makeInput(vectorSize, true, true);
6676
auto prefixInput = makeInput(vectorSize, false, true);
77+
auto prefixUnicodeInput = makeInput(vectorSize, false, true, "你_好_啊");
6778
auto suffixInput = makeInput(vectorSize, true, false);
79+
auto suffixUnicodeInput = makeInput(vectorSize, true, false, "你_好_啊");
6880

6981
benchmarkBuilder
7082
.addBenchmarkSet(
71-
"like_substring", vectorMaker.rowVector({"col0"}, {substringInput}))
72-
.addExpression("like_substring", R"(like(col0, '%a\_b\_c%', '\'))")
83+
"substring", vectorMaker.rowVector({"col0"}, {substringInput}))
84+
.addExpression("substring", R"(like(col0, '%a\_b\_c%', '\'))")
7385
.addExpression("strpos", R"(strpos(col0, 'a_b_c') > 0)");
7486

7587
benchmarkBuilder
7688
.addBenchmarkSet(
77-
"like_prefix", vectorMaker.rowVector({"col0"}, {prefixInput}))
78-
.addExpression("like_prefix", R"(like(col0, 'a\_b\_c%', '\'))")
89+
"prefix",
90+
vectorMaker.rowVector(
91+
{"col0", "col1"}, {prefixInput, prefixUnicodeInput}))
92+
.addExpression("prefix", R"(like(col0, 'a\_b\_c%', '\'))")
93+
.addExpression("relaxed_prefix_1", R"(like(col0, 'a\__\_c%', '\'))")
94+
.addExpression("relaxed_prefix_2", R"(like(col0, '_\__\_c%', '\'))")
95+
.addExpression(
96+
"relaxed_prefix_unicode_1", R"(like(col1, '你\__\_啊%', '\'))")
97+
.addExpression(
98+
"relaxed_prefix_unicode_2", R"(like(col1, '_\__\_啊%', '\'))")
7999
.addExpression("starts_with", R"(starts_with(col0, 'a_b_c'))");
80100

81101
benchmarkBuilder
82102
.addBenchmarkSet(
83-
"like_suffix", vectorMaker.rowVector({"col0"}, {suffixInput}))
84-
.addExpression("like_suffix", R"(like(col0, '%a\_b\_c', '\'))")
103+
"suffix",
104+
vectorMaker.rowVector(
105+
{"col0", "col1"}, {suffixInput, suffixUnicodeInput}))
106+
.addExpression("suffix", R"(like(col0, '%a\_b\_c', '\'))")
107+
.addExpression("relaxed_suffix_1", R"(like(col0, '%a\__\_c', '\'))")
108+
.addExpression("relaxed_suffix_2", R"(like(col0, '%_\__\_c', '\'))")
109+
.addExpression(
110+
"relaxed_suffix_unicode_1", R"(like(col1, '%你\__\_啊', '\'))")
111+
.addExpression(
112+
"relaxed_suffix_unicode_2", R"(like(col1, '%_\__\_啊', '\'))")
85113
.addExpression("ends_with", R"(ends_with(col0, 'a_b_c'))");
86114

87115
benchmarkBuilder
88116
.addBenchmarkSet(
89-
"like_generic", vectorMaker.rowVector({"col0"}, {substringInput}))
90-
.addExpression("like_generic", R"(like(col0, '%a%b%c'))");
117+
"generic", vectorMaker.rowVector({"col0"}, {substringInput}))
118+
.addExpression("generic", R"(like(col0, '%a%b%c'))");
91119

92120
benchmarkBuilder.registerBenchmarks();
93121
benchmarkBuilder.testBenchmarks();

0 commit comments

Comments
 (0)