Skip to content

Commit a7729d6

Browse files
committed
fix: function filters with token-based text indexes
1 parent 3f58fcb commit a7729d6

12 files changed

+446
-62
lines changed

src/Interpreters/ITokenExtractor.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,4 +240,34 @@ bool SplitTokenExtractor::nextInStringLike(const char * data, size_t length, siz
240240
return !bad_token && !token.empty();
241241
}
242242

243+
void SplitTokenExtractor::substringToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter, bool is_prefix, bool is_suffix) const
244+
{
245+
size_t cur = 0;
246+
size_t token_start = 0;
247+
size_t token_len = 0;
248+
249+
while (cur < length && nextInString(data, length, &cur, &token_start, &token_len))
250+
// In order to avoid filter updates with incomplete tokens,
251+
// first token is ignored, unless substring is prefix and
252+
// last token is ignored, unless substring is suffix
253+
if ((token_start > 0 || is_prefix) && (token_start + token_len < length || is_suffix))
254+
bloom_filter.add(data + token_start, token_len);
255+
}
256+
257+
void SplitTokenExtractor::substringToGinFilter(const char * data, size_t length, GinFilter & gin_filter, bool is_prefix, bool is_suffix) const
258+
{
259+
gin_filter.setQueryString(data, length);
260+
261+
size_t cur = 0;
262+
size_t token_start = 0;
263+
size_t token_len = 0;
264+
265+
while (cur < length && nextInString(data, length, &cur, &token_start, &token_len))
266+
// In order to avoid filter updates with incomplete tokens,
267+
// first token is ignored, unless substring is prefix and
268+
// last token is ignored, unless substring is suffix
269+
if ((token_start > 0 || is_prefix) && (token_start + token_len < length || is_suffix))
270+
gin_filter.addTerm(data + token_start, token_len);
271+
}
272+
243273
}

src/Interpreters/ITokenExtractor.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,45 @@ struct ITokenExtractor
2828
/// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight.
2929
virtual bool nextInStringLike(const char * data, size_t length, size_t * pos, String & out) const = 0;
3030

31+
/// Updates Bloom filter from exact-match string filter value
3132
virtual void stringToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter) const = 0;
3233

34+
/// Updates Bloom filter from substring-match string filter value.
35+
/// An `ITokenExtractor` implementation may decide to skip certain
36+
/// tokens depending on whether the substring is a prefix or a suffix.
37+
virtual void substringToBloomFilter(
38+
const char * data,
39+
size_t length,
40+
BloomFilter & bloom_filter,
41+
bool is_prefix [[maybe_unused]],
42+
bool is_suffix [[maybe_unused]]) const
43+
{
44+
stringToBloomFilter(data, length, bloom_filter);
45+
}
46+
3347
virtual void stringPaddedToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter) const
3448
{
3549
stringToBloomFilter(data, length, bloom_filter);
3650
}
3751

3852
virtual void stringLikeToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter) const = 0;
3953

54+
/// Updates GIN filter from exact-match string filter value
4055
virtual void stringToGinFilter(const char * data, size_t length, GinFilter & gin_filter) const = 0;
4156

57+
/// Updates GIN filter from substring-match string filter value.
58+
/// An `ITokenExtractor` implementation may decide to skip certain
59+
/// tokens depending on whether the substring is a prefix or a suffix.
60+
virtual void substringToGinFilter(
61+
const char * data,
62+
size_t length,
63+
GinFilter & gin_filter,
64+
bool is_prefix [[maybe_unused]],
65+
bool is_suffix [[maybe_unused]]) const
66+
{
67+
stringToGinFilter(data, length, gin_filter);
68+
}
69+
4270
virtual void stringPaddedToGinFilter(const char * data, size_t length, GinFilter & gin_filter) const
4371
{
4472
stringToGinFilter(data, length, gin_filter);
@@ -148,6 +176,11 @@ struct SplitTokenExtractor final : public ITokenExtractorHelper<SplitTokenExtrac
148176

149177
bool nextInStringLike(const char * data, size_t length, size_t * __restrict pos, String & token) const override;
150178

179+
void substringToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter, bool is_prefix, bool is_suffix) const override;
180+
181+
void substringToGinFilter(const char * data, size_t length, GinFilter & gin_filter, bool is_prefix, bool is_suffix) const override;
182+
183+
151184
};
152185

153186
}

src/Storages/MergeTree/MergeTreeIndexBloomFilterText.cpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,7 @@ bool MergeTreeConditionBloomFilterText::traverseTreeEquals(
566566
out.function = RPNElement::FUNCTION_EQUALS;
567567
out.bloom_filter = std::make_unique<BloomFilter>(params);
568568
const auto & value = const_value.get<String>();
569-
token_extractor->stringToBloomFilter(value.data(), value.size(), *out.bloom_filter);
569+
token_extractor->substringToBloomFilter(value.data(), value.size(), *out.bloom_filter, true, false);
570570
return true;
571571
}
572572
else if (function_name == "endsWith")
@@ -575,7 +575,7 @@ bool MergeTreeConditionBloomFilterText::traverseTreeEquals(
575575
out.function = RPNElement::FUNCTION_EQUALS;
576576
out.bloom_filter = std::make_unique<BloomFilter>(params);
577577
const auto & value = const_value.get<String>();
578-
token_extractor->stringToBloomFilter(value.data(), value.size(), *out.bloom_filter);
578+
token_extractor->substringToBloomFilter(value.data(), value.size(), *out.bloom_filter, false, true);
579579
return true;
580580
}
581581
else if (function_name == "multiSearchAny"
@@ -596,7 +596,15 @@ bool MergeTreeConditionBloomFilterText::traverseTreeEquals(
596596

597597
bloom_filters.back().emplace_back(params);
598598
const auto & value = element.get<String>();
599-
token_extractor->stringToBloomFilter(value.data(), value.size(), bloom_filters.back().back());
599+
600+
if (function_name == "multiSearchAny")
601+
{
602+
token_extractor->substringToBloomFilter(value.data(), value.size(), bloom_filters.back().back(), false, false);
603+
}
604+
else
605+
{
606+
token_extractor->stringToBloomFilter(value.data(), value.size(), bloom_filters.back().back());
607+
}
600608
}
601609
out.set_bloom_filters = std::move(bloom_filters);
602610
return true;
@@ -625,12 +633,12 @@ bool MergeTreeConditionBloomFilterText::traverseTreeEquals(
625633
for (const auto & alternative : alternatives)
626634
{
627635
bloom_filters.back().emplace_back(params);
628-
token_extractor->stringToBloomFilter(alternative.data(), alternative.size(), bloom_filters.back().back());
636+
token_extractor->substringToBloomFilter(alternative.data(), alternative.size(), bloom_filters.back().back(), false, false);
629637
}
630638
out.set_bloom_filters = std::move(bloom_filters);
631639
}
632640
else
633-
token_extractor->stringToBloomFilter(required_substring.data(), required_substring.size(), *out.bloom_filter);
641+
token_extractor->substringToBloomFilter(required_substring.data(), required_substring.size(), *out.bloom_filter, false, false);
634642

635643
return true;
636644
}

src/Storages/MergeTree/MergeTreeIndexFullText.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,7 @@ bool MergeTreeConditionFullText::traverseASTEquals(
594594
out.function = RPNElement::FUNCTION_EQUALS;
595595
out.gin_filter = std::make_unique<GinFilter>(params);
596596
const auto & value = const_value.get<String>();
597-
token_extractor->stringToGinFilter(value.data(), value.size(), *out.gin_filter);
597+
token_extractor->substringToGinFilter(value.data(), value.size(), *out.gin_filter, true, false);
598598
return true;
599599
}
600600
else if (function_name == "endsWith")
@@ -603,7 +603,7 @@ bool MergeTreeConditionFullText::traverseASTEquals(
603603
out.function = RPNElement::FUNCTION_EQUALS;
604604
out.gin_filter = std::make_unique<GinFilter>(params);
605605
const auto & value = const_value.get<String>();
606-
token_extractor->stringToGinFilter(value.data(), value.size(), *out.gin_filter);
606+
token_extractor->substringToGinFilter(value.data(), value.size(), *out.gin_filter, false, true);
607607
return true;
608608
}
609609
else if (function_name == "multiSearchAny")
@@ -621,7 +621,7 @@ bool MergeTreeConditionFullText::traverseASTEquals(
621621

622622
gin_filters.back().emplace_back(params);
623623
const auto & value = element.get<String>();
624-
token_extractor->stringToGinFilter(value.data(), value.size(), gin_filters.back().back());
624+
token_extractor->substringToGinFilter(value.data(), value.size(), gin_filters.back().back(), false, false);
625625
}
626626
out.set_gin_filters = std::move(gin_filters);
627627
return true;
@@ -649,14 +649,14 @@ bool MergeTreeConditionFullText::traverseASTEquals(
649649
for (const auto & alternative : alternatives)
650650
{
651651
gin_filters.back().emplace_back(params);
652-
token_extractor->stringToGinFilter(alternative.data(), alternative.size(), gin_filters.back().back());
652+
token_extractor->substringToGinFilter(alternative.data(), alternative.size(), gin_filters.back().back(), false, false);
653653
}
654654
out.set_gin_filters = std::move(gin_filters);
655655
}
656656
else
657657
{
658658
out.gin_filter = std::make_unique<GinFilter>(params);
659-
token_extractor->stringToGinFilter(required_substring.data(), required_substring.size(), *out.gin_filter);
659+
token_extractor->substringToGinFilter(required_substring.data(), required_substring.size(), *out.gin_filter, false, false);
660660
}
661661

662662
return true;

tests/queries/0_stateless/02346_fulltext_index_match_predicate.reference

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
1 Hello ClickHouse
2-
2 Hello World
1+
1 Well, Hello ClickHouse !
2+
2 Well, Hello World !
33
Granules: 6/6
44
Granules: 2/6
55
Granules: 6/6
66
Granules: 2/6
77
---
8-
1 Hello ClickHouse
9-
2 Hello World
10-
6 World Champion
8+
1 Well, Hello ClickHouse !
9+
2 Well, Hello World !
10+
6 True World Champion
1111
Granules: 6/6
1212
Granules: 3/6
1313
Granules: 6/6
1414
Granules: 3/6
1515
---
16-
5 OLAP Database
16+
5 Its An OLAP Database
1717
Granules: 6/6
1818
Granules: 1/6
1919
Granules: 6/6

tests/queries/0_stateless/02346_fulltext_index_match_predicate.sql

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,19 @@ ENGINE = MergeTree
1414
ORDER BY id
1515
SETTINGS index_granularity = 1;
1616

17-
INSERT INTO tab VALUES (1, 'Hello ClickHouse'), (2, 'Hello World'), (3, 'Good Weather'), (4, 'Say Hello'), (5, 'OLAP Database'), (6, 'World Champion');
17+
INSERT INTO tab VALUES (1, 'Well, Hello ClickHouse !'), (2, 'Well, Hello World !'), (3, 'Good Weather !'), (4, 'Say Hello !'), (5, 'Its An OLAP Database'), (6, 'True World Champion');
1818

19-
SELECT * FROM tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id;
19+
SELECT * FROM tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id;
2020

2121
-- Read 2/6 granules
22-
-- Required string: 'Hello '
23-
-- Alternatives: 'Hello ClickHouse', 'Hello World'
22+
-- Required string: ' Hello '
23+
-- Alternatives: ' Hello ClickHouse ', ' Hello World '
2424

2525
SELECT *
2626
FROM
2727
(
2828
EXPLAIN PLAN indexes=1
29-
SELECT * FROM tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id
29+
SELECT * FROM tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id
3030
)
3131
WHERE
3232
explain LIKE '%Granules: %'
@@ -37,7 +37,7 @@ SELECT *
3737
FROM
3838
(
3939
EXPLAIN PLAN indexes=1
40-
SELECT * FROM tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id
40+
SELECT * FROM tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id
4141
)
4242
WHERE
4343
explain LIKE '%Granules: %'
@@ -46,17 +46,17 @@ SETTINGS
4646

4747
SELECT '---';
4848

49-
SELECT * FROM tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id;
49+
SELECT * FROM tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id;
5050

5151
-- Read 3/6 granules
5252
-- Required string: -
53-
-- Alternatives: 'ClickHouse', 'World'
53+
-- Alternatives: ' ClickHouse ', ' World '
5454

5555
SELECT *
5656
FROM
5757
(
5858
EXPLAIN PLAN indexes = 1
59-
SELECT * FROM tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id
59+
SELECT * FROM tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id
6060
)
6161
WHERE
6262
explain LIKE '%Granules: %'
@@ -67,7 +67,7 @@ SELECT *
6767
FROM
6868
(
6969
EXPLAIN PLAN indexes = 1
70-
SELECT * FROM tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id
70+
SELECT * FROM tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id
7171
)
7272
WHERE
7373
explain LIKE '%Granules: %'
@@ -76,17 +76,17 @@ SETTINGS
7676

7777
SELECT '---';
7878

79-
SELECT * FROM tab WHERE match(str, 'OLAP.*') ORDER BY id;
79+
SELECT * FROM tab WHERE match(str, ' OLAP .*') ORDER BY id;
8080

8181
-- Read 1/6 granules
82-
-- Required string: 'OLAP'
82+
-- Required string: ' OLAP '
8383
-- Alternatives: -
8484

8585
SELECT *
8686
FROM
8787
(
8888
EXPLAIN PLAN indexes = 1
89-
SELECT * FROM tab WHERE match(str, 'OLAP (.*?)*') ORDER BY id
89+
SELECT * FROM tab WHERE match(str, ' OLAP (.*?)*') ORDER BY id
9090
)
9191
WHERE
9292
explain LIKE '%Granules: %'
@@ -97,7 +97,7 @@ SELECT *
9797
FROM
9898
(
9999
EXPLAIN PLAN indexes = 1
100-
SELECT * FROM tab WHERE match(str, 'OLAP (.*?)*') ORDER BY id
100+
SELECT * FROM tab WHERE match(str, ' OLAP (.*?)*') ORDER BY id
101101
)
102102
WHERE
103103
explain LIKE '%Granules: %'

tests/queries/0_stateless/02346_fulltext_index_search.reference

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,19 @@ af full_text
1313
1
1414
Test full_text()
1515
af full_text
16-
101 Alick a01
17-
106 Alick a06
18-
111 Alick b01
19-
116 Alick b06
20-
101 Alick a01
21-
106 Alick a06
16+
101 x Alick a01 y
17+
106 x Alick a06 y
18+
111 x Alick b01 y
19+
116 x Alick b06 y
20+
101 x Alick a01 y
21+
106 x Alick a06 y
2222
1
23-
101 Alick a01
24-
111 Alick b01
23+
101 x Alick a01 y
24+
111 x Alick b01 y
2525
1
2626
Test on array columns
2727
af full_text
28-
3 ['Click a03','Click b03']
28+
3 ['x Click a03 y','x Click b03 y']
2929
1
3030
Test on map columns
3131
af full_text

tests/queries/0_stateless/02346_fulltext_index_search.sql

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ CREATE TABLE tab_x(k UInt64, s String, INDEX af(s) TYPE full_text())
6767
ENGINE = MergeTree() ORDER BY k
6868
SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi';
6969

70-
INSERT INTO tab_x VALUES (101, 'Alick a01'), (102, 'Blick a02'), (103, 'Click a03'), (104, 'Dlick a04'), (105, 'Elick a05'), (106, 'Alick a06'), (107, 'Blick a07'), (108, 'Click a08'), (109, 'Dlick a09'), (110, 'Elick a10'), (111, 'Alick b01'), (112, 'Blick b02'), (113, 'Click b03'), (114, 'Dlick b04'), (115, 'Elick b05'), (116, 'Alick b06'), (117, 'Blick b07'), (118, 'Click b08'), (119, 'Dlick b09'), (120, 'Elick b10');
70+
INSERT INTO tab_x VALUES (101, 'x Alick a01 y'), (102, 'x Blick a02 y'), (103, 'x Click a03 y'), (104, 'x Dlick a04 y'), (105, 'x Elick a05 y'), (106, 'x Alick a06 y'), (107, 'x Blick a07 y'), (108, 'x Click a08 y'), (109, 'x Dlick a09 y'), (110, 'x Elick a10 y'), (111, 'x Alick b01 y'), (112, 'x Blick b02 y'), (113, 'x Click b03 y'), (114, 'x Dlick b04 y'), (115, 'x Elick b05 y'), (116, 'x Alick b06 y'), (117, 'x Blick b07 y'), (118, 'x Click b08 y'), (119, 'x Dlick b09 y'), (120, 'x Elick b10 y');
7171

7272
-- check full_text index was created
7373
SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab_x' AND database = currentDatabase() LIMIT 1;
@@ -86,27 +86,27 @@ SELECT read_rows==8 from system.query_log
8686
LIMIT 1;
8787

8888
-- search full_text index with IN operator
89-
SELECT * FROM tab_x WHERE s IN ('Alick a01', 'Alick a06') ORDER BY k;
89+
SELECT * FROM tab_x WHERE s IN ('x Alick a01 y', 'x Alick a06 y') ORDER BY k;
9090

9191
-- check the query only read 2 granules (4 rows total; each granule has 2 rows)
9292
SYSTEM FLUSH LOGS;
9393
SELECT read_rows==4 from system.query_log
9494
WHERE query_kind ='Select'
9595
AND current_database = currentDatabase()
96-
AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE s IN (\'Alick a01\', \'Alick a06\') ORDER BY k;')
96+
AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE s IN (\'x Alick a01 y\', \'x Alick a06 y\') ORDER BY k;')
9797
AND type='QueryFinish'
9898
AND result_rows==2
9999
LIMIT 1;
100100

101101
-- search full_text index with multiSearch
102-
SELECT * FROM tab_x WHERE multiSearchAny(s, ['a01', 'b01']) ORDER BY k;
102+
SELECT * FROM tab_x WHERE multiSearchAny(s, [' a01 ', ' b01 ']) ORDER BY k;
103103

104104
-- check the query only read 2 granules (4 rows total; each granule has 2 rows)
105105
SYSTEM FLUSH LOGS;
106106
SELECT read_rows==4 from system.query_log
107107
WHERE query_kind ='Select'
108108
AND current_database = currentDatabase()
109-
AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE multiSearchAny(s, [\'a01\', \'b01\']) ORDER BY k;')
109+
AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE multiSearchAny(s, [\' a01 \', \' b01 \']) ORDER BY k;')
110110
AND type='QueryFinish'
111111
AND result_rows==2
112112
LIMIT 1;
@@ -126,14 +126,14 @@ INSERT INTO tab SELECT rowNumberInBlock(), groupArray(s) FROM tab_x GROUP BY k%1
126126
SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;
127127

128128
-- search full_text index with has
129-
SELECT * FROM tab WHERE has(s, 'Click a03') ORDER BY k;
129+
SELECT * FROM tab WHERE has(s, 'x Click a03 y') ORDER BY k;
130130

131131
-- check the query must read all 10 granules (20 rows total; each granule has 2 rows)
132132
SYSTEM FLUSH LOGS;
133133
SELECT read_rows==2 from system.query_log
134134
WHERE query_kind ='Select'
135135
AND current_database = currentDatabase()
136-
AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE has(s, \'Click a03\') ORDER BY k;')
136+
AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE has(s, \'x Click a03 y\') ORDER BY k;')
137137
AND type='QueryFinish'
138138
AND result_rows==1
139139
LIMIT 1;

0 commit comments

Comments
 (0)