Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
1b6c602
draft / poc
arthurpassos Nov 1, 2024
55e387d
add a test
arthurpassos Nov 1, 2024
679cb6e
merge minmax and bf eval
arthurpassos Nov 5, 2024
f66be67
trigger ci
arthurpassos Nov 6, 2024
a7c78bd
extern logical_error
arthurpassos Nov 6, 2024
b5d7f78
update test
arthurpassos Nov 6, 2024
49923bb
update test
arthurpassos Nov 6, 2024
f559037
explicit in constructor
arthurpassos Nov 6, 2024
649cf0e
update tests
arthurpassos Dec 3, 2024
0155d62
update comment
arthurpassos Dec 3, 2024
ed0ba4c
address some comments
arthurpassos Dec 3, 2024
940ddaa
address some comments
arthurpassos Dec 3, 2024
18bc6dc
small fix
arthurpassos Dec 3, 2024
6eb7176
refactor
arthurpassos Dec 4, 2024
b76fa9d
remove commented out code
arthurpassos Dec 4, 2024
e8d6eed
define err code
arthurpassos Dec 4, 2024
c4abc09
check element.monotonic_functions_chain.empty()
arthurpassos Dec 4, 2024
87e35ef
avoid multiple virtual f calls
arthurpassos Dec 4, 2024
67ebbb7
style changes
arthurpassos Dec 5, 2024
f03b3f3
hehe
arthurpassos Dec 5, 2024
bd54e8d
workaround darwin uint64 issue
arthurpassos Dec 5, 2024
b57aa78
final adjustments
arthurpassos Dec 6, 2024
1760057
Update ArrowColumnToCHColumn.cpp
arthurpassos Dec 6, 2024
7ec6a56
forgot to include this file
arthurpassos Dec 6, 2024
8f57c5a
perhaps this will wokr
arthurpassos Dec 6, 2024
e8c8977
...
arthurpassos Dec 7, 2024
a6f4077
lol
arthurpassos Dec 11, 2024
b138072
re-trigger ci
arthurpassos Dec 12, 2024
5ed182c
add missing columndescriptor check
arthurpassos Dec 12, 2024
42635b7
Merge remote-tracking branch 'origin/master' into merge_parquet_minma…
al13n321 Dec 17, 2024
fdee80d
Merge remote-tracking branch 'origin/master' into merge_parquet_minma…
al13n321 Dec 20, 2024
6bc1933
Merge branch 'master' into merge_parquet_minmax_bloom_filter_evaluation
arthurpassos Jan 8, 2025
0d87fcd
Merge branch 'master' into merge_parquet_minmax_bloom_filter_evaluation
arthurpassos Jan 17, 2025
f0c4cee
Merge branch 'master' into merge_parquet_minmax_bloom_filter_evaluation
arthurpassos Jan 20, 2025
07cb54f
Merge branch 'master' into merge_parquet_minmax_bloom_filter_evaluation
arthurpassos Jan 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
525 changes: 0 additions & 525 deletions src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.cpp

This file was deleted.

73 changes: 0 additions & 73 deletions src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.h

This file was deleted.

191 changes: 191 additions & 0 deletions src/Processors/Formats/Impl/Parquet/parquetBloomFilterHash.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#include <Processors/Formats/Impl/Parquet/parquetBloomFilterHash.h>

#if USE_PARQUET

#include <parquet/metadata.h>
#include <parquet/xxhasher.h>

namespace DB
{

/*
 * Tells whether a Parquet column's annotations are compatible with hashing its values as strings.
 *
 * An absent or NONE logical type places no restriction; otherwise it has to be string, BSON or JSON.
 * Likewise, the converted type has to be NONE, UTF8, JSON or BSON.
 * */
bool isParquetStringTypeSupportedForBloomFilters(
    const std::shared_ptr<const parquet::LogicalType> & logical_type,
    parquet::ConvertedType::type converted_type)
{
    const bool has_logical_annotation = logical_type && !logical_type->is_none();

    if (has_logical_annotation)
    {
        const bool is_string_like = logical_type->is_string() || logical_type->is_BSON() || logical_type->is_JSON();

        if (!is_string_like)
            return false;
    }

    switch (converted_type)
    {
        case parquet::ConvertedType::NONE:
        case parquet::ConvertedType::UTF8:
        case parquet::ConvertedType::JSON:
        case parquet::ConvertedType::BSON:
            return true;
        default:
            return false;
    }
}

/*
 * Tells whether a Parquet column's annotations are compatible with hashing its values as integers.
 *
 * An absent or NONE logical type places no restriction; otherwise it has to be an integer logical type.
 * Likewise, the converted type has to be NONE or one of the signed/unsigned 8/16/32/64 bit integer types.
 * */
bool isParquetIntegerTypeSupportedForBloomFilters(const std::shared_ptr<const parquet::LogicalType> & logical_type, parquet::ConvertedType::type converted_type)
{
    const bool has_logical_annotation = logical_type && !logical_type->is_none();

    if (has_logical_annotation && !logical_type->is_int())
        return false;

    switch (converted_type)
    {
        case parquet::ConvertedType::NONE:
        case parquet::ConvertedType::INT_8:
        case parquet::ConvertedType::INT_16:
        case parquet::ConvertedType::INT_32:
        case parquet::ConvertedType::INT_64:
        case parquet::ConvertedType::UINT_8:
        case parquet::ConvertedType::UINT_16:
        case parquet::ConvertedType::UINT_32:
        case parquet::ConvertedType::UINT_64:
            return true;
        default:
            return false;
    }
}

/*
 * Hash the in-memory bytes of a field holding a value of type T as a fixed length byte array of sizeof(T) bytes.
 *
 * `field.safeGet<T>()` is used to access the value, so the field is expected to actually hold a T.
 * */
template <typename T>
uint64_t hashSpecialFLBATypes(const Field & field)
{
    const T & value = field.safeGet<T>();

    /// FLBA is only a view over the bytes of `value`; `value` outlives it.
    parquet::FLBA flba(reinterpret_cast<const uint8_t *>(&value));

    parquet::XxHasher hasher;

    /// The hasher takes a 32 bit length; make the narrowing from size_t explicit.
    return hasher.Hash(&flba, static_cast<uint32_t>(sizeof(T)));
}

std::optional<uint64_t> tryHashStringWithoutCompatibilityCheck(const Field & field)
{
const auto field_type = field.getType();

if (field_type != Field::Types::Which::String)
{
return std::nullopt;
}

parquet::XxHasher hasher;
parquet::ByteArray ba { field.safeGet<std::string>() };

return hasher.Hash(&ba);
}

/*
 * Hash a String field for a BYTE_ARRAY column.
 * Returns std::nullopt if the column annotations are not string compatible or the field is not a String.
 * */
std::optional<uint64_t> tryHashString(
    const Field & field,
    const std::shared_ptr<const parquet::LogicalType> & logical_type,
    parquet::ConvertedType::type converted_type)
{
    const bool column_is_string_like = isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type);

    if (column_is_string_like)
        return tryHashStringWithoutCompatibilityCheck(field);

    return std::nullopt;
}

/*
 * Hash a field for a FIXED_LEN_BYTE_ARRAY column of `parquet_column_length` bytes.
 *
 * An IPv6 field is hashed from its raw bytes when the column length equals sizeof(IPv6).
 * Anything else goes through the String path, which yields std::nullopt for non String fields.
 * Returns std::nullopt as well if the column annotations are not string compatible.
 * */
std::optional<uint64_t> tryHashFLBA(
    const Field & field,
    const std::shared_ptr<const parquet::LogicalType> & logical_type,
    parquet::ConvertedType::type converted_type,
    std::size_t parquet_column_length)
{
    const bool column_is_string_like = isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type);

    if (!column_is_string_like)
        return std::nullopt;

    const bool holds_ipv6 = field.getType() == Field::Types::Which::IPv6;

    if (holds_ipv6 && parquet_column_length == sizeof(IPv6))
        return hashSpecialFLBATypes<IPv6>(field);

    return tryHashStringWithoutCompatibilityCheck(field);
}

/*
 * Hash an Int64, UInt64 or IPv4 field for an integer column whose physical type is `ParquetPhysicalType`.
 *
 * The stored value is converted to `ParquetPhysicalType` with static_cast before being hashed.
 * Returns std::nullopt if the column annotations are not integer compatible or the field type is not handled.
 * */
template <typename ParquetPhysicalType>
std::optional<uint64_t> tryHashInt(const Field & field, const std::shared_ptr<const parquet::LogicalType> & logical_type, parquet::ConvertedType::type converted_type)
{
    const bool column_is_integer = isParquetIntegerTypeSupportedForBloomFilters(logical_type, converted_type);

    if (!column_is_integer)
        return std::nullopt;

    parquet::XxHasher xx_hasher;
    const auto field_type = field.getType();

    if (field_type == Field::Types::Which::Int64)
    {
        const auto physical_value = static_cast<ParquetPhysicalType>(field.safeGet<int64_t>());
        return xx_hasher.Hash(physical_value);
    }

    if (field_type == Field::Types::Which::UInt64)
    {
        const auto physical_value = static_cast<ParquetPhysicalType>(field.safeGet<uint64_t>());
        return xx_hasher.Hash(physical_value);
    }

    /*
     * In theory, IPv4 could also be accepted for 64 bits columns. It would only be a problem if the value had been hashed
     * through the byte array api out of a zero-ed buffer with a 32 bits variable copied into it.
     *
     * To be on the safe side, IPv4 is accepted only when the physical type is 32 bits.
     * */
    if constexpr (std::is_same_v<int32_t, ParquetPhysicalType>)
    {
        if (field_type == Field::Types::IPv4)
        {
            const auto physical_value = static_cast<ParquetPhysicalType>(field.safeGet<IPv4>());
            return xx_hasher.Hash(physical_value);
        }
    }

    return std::nullopt;
}

/*
 * Hash a ClickHouse field according to the physical type of the given Parquet column.
 *
 * INT32 and INT64 go through the integer path, BYTE_ARRAY through the string path and FIXED_LEN_BYTE_ARRAY
 * through the fixed length path. Any other physical type yields std::nullopt.
 * `parquet_column_descriptor` is dereferenced without a null check.
 * */
std::optional<uint64_t> parquetTryHashField(const Field & field, const parquet::ColumnDescriptor * parquet_column_descriptor)
{
    const auto & descriptor = *parquet_column_descriptor;

    const auto physical = descriptor.physical_type();
    const auto & logical = descriptor.logical_type();
    const auto converted = descriptor.converted_type();

    switch (physical)
    {
        case parquet::Type::INT32:
            return tryHashInt<int32_t>(field, logical, converted);
        case parquet::Type::INT64:
            return tryHashInt<int64_t>(field, logical, converted);
        case parquet::Type::BYTE_ARRAY:
            return tryHashString(field, logical, converted);
        case parquet::Type::FIXED_LEN_BYTE_ARRAY:
            return tryHashFLBA(field, logical, converted, descriptor.type_length());
        default:
            return std::nullopt;
    }
}

/*
 * Hash every element of a ClickHouse column with parquetTryHashField, preserving the element order.
 *
 * Returns std::nullopt as soon as one element can't be hashed. An empty column yields an empty vector.
 * Neither pointer is checked for null.
 * */
std::optional<std::vector<uint64_t>> parquetTryHashColumn(const IColumn * data_column, const parquet::ColumnDescriptor * parquet_column_descriptor)
{
    /// The number of rows is loop invariant: read it once and use it to size the result up front.
    const size_t rows = data_column->size();

    std::vector<uint64_t> hashes;
    hashes.reserve(rows);

    for (size_t i = 0; i < rows; ++i)
    {
        Field f;
        data_column->get(i, f);

        const auto hashed_value = parquetTryHashField(f, parquet_column_descriptor);

        if (!hashed_value)
        {
            return std::nullopt;
        }

        hashes.emplace_back(*hashed_value);
    }

    return hashes;
}

}

#endif
25 changes: 25 additions & 0 deletions src/Processors/Formats/Impl/Parquet/parquetBloomFilterHash.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#pragma once

#include <config.h>

#if USE_PARQUET

#include <Processors/Formats/Impl/ArrowFieldIndexUtil.h>

namespace DB
{

/*
 * Try to hash a ClickHouse field according to the physical type of the given Parquet column.
 *
 * Only INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY columns are handled.
 * Returns std::nullopt in case it can't be done, i.e. the column type or the field type is not supported.
 * `parquet_column_descriptor` must not be null.
 * */
std::optional<uint64_t> parquetTryHashField(const Field & field, const parquet::ColumnDescriptor * parquet_column_descriptor);


/*
 * Try to hash all elements in a ClickHouse column, one hash per element, in column order.
 *
 * Returns std::nullopt in case any one of them can't be hashed (see parquetTryHashField).
 * `data_column` and `parquet_column_descriptor` must not be null.
 * */
std::optional<std::vector<uint64_t>> parquetTryHashColumn(const IColumn * data_column, const parquet::ColumnDescriptor * parquet_column_descriptor);

}

#endif
Loading