Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
128 commits
Select commit Hold shift + click to select a range
fb454a0
wip, fix hashing
arthurpassos Apr 21, 2024
119f625
shitty serialize impl, missing arrays
arthurpassos Apr 24, 2024
8966747
wip
arthurpassos Apr 25, 2024
6a7b087
undo something
arthurpassos Apr 25, 2024
0c368e5
delete unnecessary stuff
arthurpassos Apr 25, 2024
b080a94
fix tests
arthurpassos Apr 25, 2024
67b3241
trigger ci
arthurpassos Apr 25, 2024
7a6a60b
remove unused variable
arthurpassos Apr 25, 2024
c873e65
progress
arthurpassos Apr 25, 2024
a9c3c10
style
arthurpassos Apr 25, 2024
d6834bd
try to fix defines
arthurpassos Apr 25, 2024
6d4fe75
add if use parquet
arthurpassos Apr 25, 2024
2df232f
more complex test
arthurpassos Apr 25, 2024
49b1b55
update docs
arthurpassos Apr 25, 2024
03f812c
add new setting to settingshistory
arthurpassos Apr 26, 2024
b1ca8b2
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Apr 26, 2024
b20181a
fix
arthurpassos Apr 26, 2024
8fec659
add suport for in check
arthurpassos Apr 26, 2024
335d499
add some tests for in cehck
arthurpassos Apr 26, 2024
558d5b0
try chassert
arthurpassos Apr 26, 2024
15cc4f8
fix dumb shit
arthurpassos Apr 26, 2024
6166d48
make test file reproducible, add bloom filter to the array column and…
arthurpassos Apr 27, 2024
3d56e79
fix existing tests
arthurpassos Apr 29, 2024
b8069bb
add hasall support
arthurpassos Apr 29, 2024
8ef8a7a
update test files so array column has higher cardinality
arthurpassos Apr 30, 2024
fb17353
simplify tests and add tests for has array operations
arthurpassos Apr 30, 2024
bdd05cc
simplify code by reducing the scope for now. Also add test for equali…
arthurpassos May 1, 2024
d1a8308
trigger ci
arthurpassos May 1, 2024
4eb7781
add test for nested function
arthurpassos May 2, 2024
7510786
optimize things a bit
arthurpassos May 2, 2024
668d154
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos May 9, 2024
e6bf72a
hash only once approach
arthurpassos May 10, 2024
6cf0fdf
minor update
arthurpassos May 22, 2024
4441476
Merge branch 'ClickHouse:master' into add_parquet_bloom_filters_support
arthurpassos May 23, 2024
41c28d7
extract rpn build process into separate class
arthurpassos May 23, 2024
d26cb2f
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos May 24, 2024
653a0d9
do not get parquet reader from arrow
arthurpassos May 27, 2024
7c2f975
early return for no row groups
arthurpassos May 28, 2024
281b3af
remove a.ref
arthurpassos May 28, 2024
30d2509
re-trigger ci
arthurpassos May 29, 2024
56098f1
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Jun 5, 2024
576c151
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Jun 6, 2024
e931b44
specify a few types instead of auto
arthurpassos Jun 10, 2024
6995b43
add missing file
arthurpassos Jun 10, 2024
49a9506
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Jul 2, 2024
f47fc50
settings history..
arthurpassos Jul 5, 2024
a3a6f40
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Jul 15, 2024
8d0979e
try to use keycondition instead
arthurpassos Jul 18, 2024
56fb310
rmv debug var
arthurpassos Jul 18, 2024
28d0859
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Jul 18, 2024
e9ffc68
extern bad arguments
arthurpassos Jul 19, 2024
5fc6b54
style check
arthurpassos Jul 23, 2024
93d2b57
minor typo
arthurpassos Jul 23, 2024
0441a7b
minor fix
arthurpassos Jul 23, 2024
964cdda
drop support for array types
arthurpassos Jul 23, 2024
8438262
move parquet bf stuff out of keycondition
arthurpassos Jul 23, 2024
8ff8970
error codes
arthurpassos Jul 23, 2024
fcd2f6e
few fixes
arthurpassos Jul 24, 2024
84e8548
remove unnecessary code
arthurpassos Jul 24, 2024
4342129
delete unused files
arthurpassos Jul 24, 2024
43d5269
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Jul 30, 2024
9d0772b
progress on re-using keycondition and implementing in_set
arthurpassos Aug 2, 2024
6c29a27
trigger ci
arthurpassos Aug 3, 2024
602d6a8
minor stuff
arthurpassos Aug 3, 2024
3fdf82c
evaluate rpn only once
arthurpassos Aug 3, 2024
eb91f47
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Aug 3, 2024
a1851d0
fix issue when in_set columns do not have bf
arthurpassos Aug 12, 2024
15e0cba
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Aug 15, 2024
0f56a75
only get bloom filters for filtering columns
arthurpassos Aug 17, 2024
96175f3
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Aug 17, 2024
74f7618
get bf reader in non optimal way
arthurpassos Aug 17, 2024
3048487
only calculate filtering columns if key_condition and bloom fliter is…
arthurpassos Aug 19, 2024
5084f3e
get bf reader in optimal way
arthurpassos Aug 19, 2024
b4112d0
Delete
arthurpassos Aug 19, 2024
00108ab
some improvements, altho code looks odd
arthurpassos Aug 21, 2024
5801698
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Aug 22, 2024
7f2f196
Update SettingsChangesHistory.cpp
arthurpassos Aug 22, 2024
c7e087b
function_unknown instead of function_true in some cases
arthurpassos Aug 22, 2024
5250f73
put reader properties in a variable
arthurpassos Aug 22, 2024
60cfe30
process in_set even if one of the columns do not have bf
arthurpassos Aug 22, 2024
7c970c1
simplify index mapping a bit
arthurpassos Aug 23, 2024
a6eb0c5
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Aug 28, 2024
6dc5681
vector instead of map.. need to remember order of insertion is important
arthurpassos Aug 28, 2024
7ef5aaa
safe guard around bf across row groups
arthurpassos Aug 28, 2024
1fee8a7
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Aug 29, 2024
4fcc642
progress
arthurpassos Aug 29, 2024
6cabcc0
simplify getfilteringcolumns
arthurpassos Aug 29, 2024
1fce06a
remove unused type alias
arthurpassos Aug 30, 2024
9cef890
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Aug 30, 2024
dd81cbe
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Sep 13, 2024
e55584c
support only basic types with no special encodingwq
arthurpassos Sep 13, 2024
39a5bea
small fixes, I need to improve testing
arthurpassos Sep 15, 2024
18c65fb
tmp
arthurpassos Sep 17, 2024
10718eb
handle nullable on tuple
arthurpassos Sep 18, 2024
d017738
progress
arthurpassos Sep 19, 2024
10b3a0e
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Sep 19, 2024
da5304a
fix conflicts
arthurpassos Sep 20, 2024
7f85481
fix steyle
arthurpassos Sep 20, 2024
2f81b63
tryconvertfieldtotype
arthurpassos Sep 20, 2024
0fccff2
update tests to new dataset
arthurpassos Sep 23, 2024
938ef1c
grab Field from ordered set columns and rely on convertfieldtotype
arthurpassos Sep 25, 2024
3788b23
tmp
arthurpassos Sep 25, 2024
5c0afda
update tests
arthurpassos Sep 25, 2024
87e14ec
add proper uint64 tests
arthurpassos Sep 26, 2024
533c842
add uint8 and uuid tests
arthurpassos Sep 26, 2024
90bfd36
simplify logic to use vector of uint64 instead of columnptr. Fixes ms…
arthurpassos Sep 27, 2024
07d1672
undo hashstring modification
arthurpassos Sep 27, 2024
a8f6db5
address some coments
arthurpassos Oct 1, 2024
ffac264
simplify maybeTrueOnBloomFilter
arthurpassos Oct 1, 2024
f9f12fe
progress
arthurpassos Oct 4, 2024
1e7bb89
safe guards around index mapping
arthurpassos Oct 4, 2024
b2809a8
some structural changes
arthurpassos Oct 4, 2024
ed77901
progress
arthurpassos Oct 7, 2024
ce74e31
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Oct 7, 2024
37206d8
add setting lost during merge
arthurpassos Oct 7, 2024
2b5eef3
move setting to 24.10
arthurpassos Oct 8, 2024
73cba7f
add test for ipv6
arthurpassos Oct 8, 2024
e5188d4
rmv ssh key
arthurpassos Oct 8, 2024
514fa6f
add new test file
arthurpassos Oct 8, 2024
fadaef4
trigger ci
arthurpassos Oct 8, 2024
710a93f
minor improvements
arthurpassos Oct 9, 2024
99786b1
few comments adressed
arthurpassos Oct 11, 2024
a72a534
assert parquet column size is 16 bytes when hashing ipv6
arthurpassos Oct 11, 2024
9b6979b
Merge branch 'master' into add_parquet_bloom_filters_support
arthurpassos Oct 11, 2024
d76cea6
simplify parquet bf rpn by removing function_equals
arthurpassos Oct 11, 2024
fd78ef6
fix in with invalid conversion
arthurpassos Oct 15, 2024
1e7f5b0
docs change
arthurpassos Oct 15, 2024
424cd1b
f_unknown if monotonic_functions not empty
arthurpassos Oct 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/Core/FormatFactorySettingsDeclaration.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ Avoid reordering rows when reading from Parquet files. Usually makes it much slo
)", 0) \
M(Bool, input_format_parquet_filter_push_down, true, R"(
When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and min/max statistics in the Parquet metadata.
)", 0) \
M(Bool, input_format_parquet_bloom_filter_push_down, false, R"(
When reading Parquet files, skip whole row groups based on the WHERE expressions and bloom filter in the Parquet metadata.
)", 0) \
M(Bool, input_format_parquet_use_native_reader, false, R"(
When reading Parquet files, to use native reader instead of arrow reader.
Expand Down
1 change: 1 addition & 0 deletions src/Core/SettingsChangesHistory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"distributed_cache_max_unacked_inflight_packets", 10, 10, "A setting for ClickHouse Cloud"},
{"distributed_cache_data_packet_ack_window", 5, 5, "A setting for ClickHouse Cloud"},
{"max_parts_to_move", 1000, 1000, "New setting"},
/// New setting: both columns carry the declared default (`false`), so the history does not claim a default change that never happened.
{"input_format_parquet_bloom_filter_push_down", false, false, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and bloom filter in the Parquet metadata."},
}
},
{"24.9",
Expand Down
4 changes: 4 additions & 0 deletions src/DataTypes/IDataType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,13 +321,17 @@ bool isUInt8(TYPE data_type) { return WhichDataType(data_type).isUInt8(); } \
bool isUInt16(TYPE data_type) { return WhichDataType(data_type).isUInt16(); } \
bool isUInt32(TYPE data_type) { return WhichDataType(data_type).isUInt32(); } \
bool isUInt64(TYPE data_type) { return WhichDataType(data_type).isUInt64(); } \
bool isUInt128(TYPE data_type) { return WhichDataType(data_type).isUInt128(); } \
bool isUInt256(TYPE data_type) { return WhichDataType(data_type).isUInt256(); } \
bool isNativeUInt(TYPE data_type) { return WhichDataType(data_type).isNativeUInt(); } \
bool isUInt(TYPE data_type) { return WhichDataType(data_type).isUInt(); } \
\
bool isInt8(TYPE data_type) { return WhichDataType(data_type).isInt8(); } \
bool isInt16(TYPE data_type) { return WhichDataType(data_type).isInt16(); } \
bool isInt32(TYPE data_type) { return WhichDataType(data_type).isInt32(); } \
bool isInt64(TYPE data_type) { return WhichDataType(data_type).isInt64(); } \
bool isInt128(TYPE data_type) { return WhichDataType(data_type).isInt128(); } \
bool isInt256(TYPE data_type) { return WhichDataType(data_type).isInt256(); } \
bool isNativeInt(TYPE data_type) { return WhichDataType(data_type).isNativeInt(); } \
bool isInt(TYPE data_type) { return WhichDataType(data_type).isInt(); } \
\
Expand Down
6 changes: 5 additions & 1 deletion src/DataTypes/IDataType.h
Original file line number Diff line number Diff line change
Expand Up @@ -457,14 +457,18 @@ struct WhichDataType
bool isUInt8(TYPE data_type); \
bool isUInt16(TYPE data_type); \
bool isUInt32(TYPE data_type); \
bool isUInt64(TYPE data_type); \
bool isUInt64(TYPE data_type);\
bool isUInt128(TYPE data_type);\
bool isUInt256(TYPE data_type); \
bool isNativeUInt(TYPE data_type); \
bool isUInt(TYPE data_type); \
\
bool isInt8(TYPE data_type); \
bool isInt16(TYPE data_type); \
bool isInt32(TYPE data_type); \
bool isInt64(TYPE data_type); \
bool isInt128(TYPE data_type); \
bool isInt256(TYPE data_type); \
bool isNativeInt(TYPE data_type); \
bool isInt(TYPE data_type); \
\
Expand Down
1 change: 1 addition & 0 deletions src/Formats/FormatFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.parquet.case_insensitive_column_matching = settings[Setting::input_format_parquet_case_insensitive_column_matching];
format_settings.parquet.preserve_order = settings[Setting::input_format_parquet_preserve_order];
format_settings.parquet.filter_push_down = settings[Setting::input_format_parquet_filter_push_down];
format_settings.parquet.bloom_filter_push_down = settings[Setting::input_format_parquet_bloom_filter_push_down];
format_settings.parquet.use_native_reader = settings[Setting::input_format_parquet_use_native_reader];
format_settings.parquet.allow_missing_columns = settings[Setting::input_format_parquet_allow_missing_columns];
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings[Setting::input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference];
Expand Down
1 change: 1 addition & 0 deletions src/Formats/FormatSettings.h
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ struct FormatSettings
bool skip_columns_with_unsupported_types_in_schema_inference = false;
bool case_insensitive_column_matching = false;
bool filter_push_down = true;
bool bloom_filter_push_down = true;
bool use_native_reader = false;
std::unordered_set<int> skip_row_groups = {};
bool output_string_as_string = false;
Expand Down
2 changes: 2 additions & 0 deletions src/Interpreters/Set.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,8 @@ class MergeTreeSetIndex

/// Returns a read-only reference to the `ordered_set` columns held by this index.
const Columns & getOrderedSet() const { return ordered_set; }

/// Returns a read-only reference to `indexes_mapping`, the KeyTuplePositionMapping entries held by this index.
const std::vector<KeyTuplePositionMapping> & getIndexesMapping() const { return indexes_mapping; }

private:
// If all arguments in tuple are key columns, we can optimize NOT IN when there is only one element.
bool has_all_keys;
Expand Down
36 changes: 27 additions & 9 deletions src/Processors/Formats/Impl/ArrowFieldIndexUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <arrow/type_fwd.h>
#include <boost/algorithm/string/case_conv.hpp>
#include <Common/Exception.h>
#include <parquet/metadata.h>


namespace arrow
Expand Down Expand Up @@ -65,11 +66,22 @@ class ArrowFieldIndexUtil
return result;
}

// Pairs one ClickHouse header column with the Parquet indexes collected for it.
// For a parquet schema {x: {i: int, j: int}}, this should be populated as follows:
// clickhouse_index = 0, parquet_indexes = {0, 1}
struct ClickHouseIndexToParquetIndex
{
// Index of the column on the ClickHouse side; findRequiredIndices fills it from its `header_index` argument.
std::size_t clickhouse_index;
// Parquet-side indexes that belong to that column (several for a nested type, as in the example above).
std::vector<int> parquet_indexes;
};

/// Only collect the indices of the required fields. E.g. when reading just one field of a struct,
/// there is no need to collect the indices of every field in that struct.
std::vector<int> findRequiredIndices(const Block & header, const arrow::Schema & schema)
std::vector<ClickHouseIndexToParquetIndex> findRequiredIndices(
const Block & header,
const arrow::Schema & schema,
const parquet::FileMetaData & file)
{
std::vector<int> required_indices;
std::vector<ClickHouseIndexToParquetIndex> required_indices;
std::unordered_set<int> added_indices;
/// Flatten all named fields' index information into a map.
auto fields_indices = calculateFieldIndices(schema);
Expand All @@ -79,7 +91,7 @@ class ArrowFieldIndexUtil
std::string col_name = named_col.name;
if (ignore_case)
boost::to_lower(col_name);
findRequiredIndices(col_name, named_col.type, fields_indices, added_indices, required_indices);
findRequiredIndices(col_name, i, named_col.type, fields_indices, added_indices, required_indices, file);
}
return required_indices;
}
Expand Down Expand Up @@ -169,10 +181,12 @@ class ArrowFieldIndexUtil

void findRequiredIndices(
const String & name,
std::size_t header_index,
DataTypePtr data_type,
const std::unordered_map<std::string, std::pair<int, int>> & field_indices,
std::unordered_set<int> & added_indices,
std::vector<int> & required_indices)
std::vector<ClickHouseIndexToParquetIndex> & required_indices,
const parquet::FileMetaData & file)
{
auto nested_type = removeNullable(data_type);
if (const DB::DataTypeTuple * type_tuple = typeid_cast<const DB::DataTypeTuple *>(nested_type.get()))
Expand All @@ -187,20 +201,20 @@ class ArrowFieldIndexUtil
if (ignore_case)
boost::to_lower(field_name);
const auto & field_type = field_types[i];
findRequiredIndices(Nested::concatenateName(name, field_name), field_type, field_indices, added_indices, required_indices);
findRequiredIndices(Nested::concatenateName(name, field_name), header_index, field_type, field_indices, added_indices, required_indices, file);
}
return;
}
}
else if (const auto * type_array = typeid_cast<const DB::DataTypeArray *>(nested_type.get()))
{
findRequiredIndices(name, type_array->getNestedType(), field_indices, added_indices, required_indices);
findRequiredIndices(name, header_index, type_array->getNestedType(), field_indices, added_indices, required_indices, file);
return;
}
else if (const auto * type_map = typeid_cast<const DB::DataTypeMap *>(nested_type.get()))
{
findRequiredIndices(name, type_map->getKeyType(), field_indices, added_indices, required_indices);
findRequiredIndices(name, type_map->getValueType(), field_indices, added_indices, required_indices);
findRequiredIndices(name, header_index, type_map->getKeyType(), field_indices, added_indices, required_indices, file);
findRequiredIndices(name, header_index, type_map->getValueType(), field_indices, added_indices, required_indices, file);
return;
}
auto it = field_indices.find(name);
Expand All @@ -211,14 +225,18 @@ class ArrowFieldIndexUtil
}
else
{
ClickHouseIndexToParquetIndex index_mapping;
index_mapping.clickhouse_index = header_index;
for (int j = 0; j < it->second.second; ++j)
{
auto index = it->second.first + j;
if (added_indices.insert(index).second)
{
required_indices.emplace_back(index);
index_mapping.parquet_indexes.emplace_back(index);
}
}

required_indices.emplace_back(index_mapping);
}
}
};
Expand Down
Loading