Skip to content

Commit 424cd1b

Browse files
committed
Emit FUNCTION_UNKNOWN when the RPN element's monotonic_functions_chain is not empty
1 parent 1e7f5b0 commit 424cd1b

File tree

5 files changed

+24
-54
lines changed

5 files changed

+24
-54
lines changed

src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,14 @@ std::vector<ParquetBloomFilterCondition::ConditionElement> keyConditionRPNToParq
376376

377377
for (const auto & rpn_element : rpn)
378378
{
379+
// Searching the bloom filter with a transformed key would be a problem for e.g. `where negate(x) = -58`.
380+
// It would perform a bloom filter search on `-58`, and possibly miss row groups containing this data.
381+
if (!rpn_element.monotonic_functions_chain.empty())
382+
{
383+
condition_elements.emplace_back(Function::FUNCTION_UNKNOWN);
384+
continue;
385+
}
386+
379387
ParquetBloomFilterCondition::ConditionElement::HashesForColumns hashes;
380388

381389
if (rpn_element.function == RPNElement::FUNCTION_IN_RANGE

tests/queries/0_stateless/03036_test_parquet_bloom_filter_push_down.reference

Lines changed: 13 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -227,32 +227,6 @@ Bloom filter for ipv4 column. BF is on. Specified in the schema
227227
"bytes_read": 4005
228228
}
229229
}
230-
Bloom filter for ipv4 column. Even though column is transformed, we know it is a safe transformation.
231-
{
232-
"data": [
233-
{
234-
"json": "{\"key\":38, \"value\":\"NXONM\"}"
235-
}
236-
],
237-
"rows": 1,
238-
"statistics": {
239-
"rows_read": 72,
240-
"bytes_read": 4005
241-
}
242-
}
243-
Bloom filter for ipv4 column. Even though column is transformed, we know it is a safe transformation.
244-
{
245-
"data": [
246-
{
247-
"json": "{\"key\":38, \"value\":\"NXONM\"}"
248-
}
249-
],
250-
"rows": 1,
251-
"statistics": {
252-
"rows_read": 72,
253-
"bytes_read": 4005
254-
}
255-
}
256230
Bloom filter on 64 bit column read as ipv4. We explicitly deny it, should read all rg
257231
{
258232
"data": [
@@ -358,3 +332,16 @@ Invalid column conversion with in operation. String type can not be hashed again
358332
"bytes_read": 47419
359333
}
360334
}
335+
Transformations on key column shall not be allowed. Should read everything
336+
{
337+
"data": [
338+
{
339+
"uint64_logical": "7711695863945021976"
340+
}
341+
],
342+
"rows": 1,
343+
"statistics": {
344+
"rows_read": 1000,
345+
"bytes_read": 47419
346+
}
347+
}

tests/queries/0_stateless/03036_test_parquet_bloom_filter_push_down.sh

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,6 @@ ${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Pa
6969
echo "Bloom filter for ipv4 column. BF is on. Specified in the schema"
7070
${CLICKHOUSE_CLIENT} --query="select ipv4 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv4 IPv4') where ipv4 = toIPv4('0.0.1.143') order by ipv4 asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
7171

72-
echo "Bloom filter for ipv4 column. Even though column is transformed, we know it is a safe transformation."
73-
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where toIPv4(ipv4) = toIPv4('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
74-
75-
echo "Bloom filter for ipv4 column. Even though column is transformed, we know it is a safe transformation."
76-
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where (toIPv4(ipv4)) in (toIPv4('0.0.1.143')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
77-
7872
echo "Bloom filter on 64 bit column read as ipv4. We explicitly deny it, should read all rg"
7973
${CLICKHOUSE_CLIENT} --query="select uint64_logical from file ('${DATA_FILE_USER_PATH}', Parquet, 'uint64_logical IPv4') where uint64_logical = toIPv4(5552715629697883300) order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
8074

@@ -96,4 +90,7 @@ ${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Pa
9690
echo "Invalid column conversion with in operation. String type can not be hashed against parquet int64 physical type. Should read everything"
9791
${CLICKHOUSE_CLIENT} --query="select uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet, 'uint64_logical String') where uint64_logical in ('5') order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
9892

93+
echo "Transformations on key column shall not be allowed. Should read everything"
94+
${CLICKHOUSE_CLIENT} --query="select uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) where negate(uint64_logical) = -7711695863945021976 order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
95+
9996
rm -rf ${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME:?}/*

tests/queries/0_stateless/03036_test_parquet_bloom_filter_push_down_ipv6.reference

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -49,18 +49,6 @@ bloom filter is on for ipv6, row groups should also be read since there is only
4949
"bytes_read": 128
5050
}
5151
}
52-
{
53-
"data": [
54-
{
55-
"toIPv6(ipv6)": "7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995"
56-
}
57-
],
58-
"rows": 1,
59-
"statistics": {
60-
"rows_read": 5,
61-
"bytes_read": 128
62-
}
63-
}
6452
non existent ipv6, row group should be skipped
6553
{
6654
"data": [],
@@ -86,11 +74,3 @@ non existent ipv6, row group should be skipped
8674
"bytes_read": 128
8775
}
8876
}
89-
{
90-
"data": [],
91-
"rows": 0,
92-
"statistics": {
93-
"rows_read": 0,
94-
"bytes_read": 0
95-
}
96-
}

tests/queries/0_stateless/03036_test_parquet_bloom_filter_push_down_ipv6.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,8 @@ echo "bloom filter is on for ipv6, row groups should also be read since there is
2626
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = '7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995' Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
2727
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = toIPv6('7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995') Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
2828
${CLICKHOUSE_CLIENT} --query="select toIPv6(ipv6) from file('${DATA_FILE_USER_PATH}', Parquet) where ipv6 = toIPv6('7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995') Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
29-
${CLICKHOUSE_CLIENT} --query="select toIPv6(ipv6) from file('${DATA_FILE_USER_PATH}', Parquet) where toIPv6(ipv6) = '7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995' Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
3029

3130
echo "non existent ipv6, row group should be skipped"
3231
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = 'fafe:b9d4:e754:4e78:8783:37f5:b2ea:9995' Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
3332
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = toIPv6('fafe:b9d4:e754:4e78:8783:37f5:b2ea:9995') Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
3433
${CLICKHOUSE_CLIENT} --query="select toIPv6(ipv6) from file('${DATA_FILE_USER_PATH}', Parquet) where ipv6 = toIPv6('fafe:b9d4:e754:4e78:8783:37f5:b2ea:9995') Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
35-
${CLICKHOUSE_CLIENT} --query="select toIPv6(ipv6) from file('${DATA_FILE_USER_PATH}', Parquet) where toIPv6(ipv6) = 'fafe:b9d4:e754:4e78:8783:37f5:b2ea:9995' Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

0 commit comments

Comments
 (0)