Skip to content

Commit 9d79bd9

Browse files
authored
Merge pull request #65792 from rschu1ze/stats-refactoring
Refactor statistics interface
2 parents 77aec8a + 6ccb26b commit 9d79bd9

18 files changed

+353
-336
lines changed

docs/en/engines/table-engines/mergetree-family/mergetree.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -993,11 +993,11 @@ They can be used for prewhere optimization only if we enable `set allow_statisti
993993

994994
- `TDigest`
995995

996-
Stores distribution of values from numeric columns in [TDigest](https://github.com/tdunning/t-digest) sketch.
996+
[TDigest](https://github.com/tdunning/t-digest) sketches which allow computing approximate percentiles (e.g. the 90th percentile) for numeric columns.
997997

998998
- `Uniq`
999-
1000-
Estimate the number of distinct values of a column by HyperLogLog.
999+
1000+
[HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) sketches which provide an estimate of how many distinct values a column contains.
10011001

10021002
## Column-level Settings {#column-level-settings}
10031003

docs/en/sql-reference/statements/alter/statistics.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,6 @@ There is an example adding two statistics types to two columns:
2828
ALTER TABLE t1 MODIFY STATISTICS c, d TYPE TDigest, Uniq;
2929
```
3030

31-
:::note
31+
:::note
3232
Statistic manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants).
3333
:::

src/Storages/Statistics/Statistics.cpp

Lines changed: 55 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
1-
#include <optional>
2-
#include <numeric>
3-
41
#include <Storages/Statistics/Statistics.h>
52
#include <Storages/Statistics/ConditionSelectivityEstimator.h>
6-
#include <Storages/Statistics/TDigestStatistics.h>
7-
#include <Storages/Statistics/UniqStatistics.h>
3+
#include <Storages/Statistics/StatisticsTDigest.h>
4+
#include <Storages/Statistics/StatisticsUniq.h>
85
#include <Storages/StatisticsDescription.h>
96
#include <Storages/ColumnsDescription.h>
107
#include <IO/ReadHelpers.h>
118
#include <IO/WriteHelpers.h>
129
#include <Common/Exception.h>
10+
#include <Common/logger_useful.h>
11+
1312

1413
namespace DB
1514
{
@@ -20,32 +19,57 @@ namespace ErrorCodes
2019
extern const int INCORRECT_QUERY;
2120
}
2221

23-
/// Version / bitmask of statistics / data of statistics /
2422
enum StatisticsFileVersion : UInt16
2523
{
2624
V0 = 0,
2725
};
2826

29-
IStatistics::IStatistics(const SingleStatisticsDescription & stat_) : stat(stat_) {}
27+
IStatistics::IStatistics(const SingleStatisticsDescription & stat_)
28+
: stat(stat_)
29+
{
30+
}
3031

3132
ColumnStatistics::ColumnStatistics(const ColumnStatisticsDescription & stats_desc_)
32-
: stats_desc(stats_desc_), rows(0)
33+
: stats_desc(stats_desc_)
3334
{
3435
}
3536

3637
void ColumnStatistics::update(const ColumnPtr & column)
3738
{
3839
rows += column->size();
39-
for (const auto & iter : stats)
40-
{
41-
iter.second->update(column);
42-
}
40+
for (const auto & stat : stats)
41+
stat.second->update(column);
42+
}
43+
44+
UInt64 IStatistics::estimateCardinality() const
45+
{
46+
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cardinality estimation is not implemented for this type of statistics");
4347
}
4448

49+
Float64 IStatistics::estimateEqual(Float64 /*val*/) const
50+
{
51+
throw Exception(ErrorCodes::LOGICAL_ERROR, "Equality estimation is not implemented for this type of statistics");
52+
}
53+
54+
Float64 IStatistics::estimateLess(Float64 /*val*/) const
55+
{
56+
throw Exception(ErrorCodes::LOGICAL_ERROR, "Less-than estimation is not implemented for this type of statistics");
57+
}
58+
59+
/// -------------------------------------
60+
/// Implementation of the estimation:
61+
/// Note: Each statistics object supports certain types of predicates natively, e.g.
62+
/// - TDigest: '< X' (less-than predicates)
63+
/// - Count-min sketches: '= X' (equal predicates)
64+
/// - Uniq (HyperLogLog): 'count distinct(*)' (column cardinality)
65+
/// If multiple statistics objects are available per column, it is sometimes also possible to combine them in a clever way.
66+
/// For that reason, all estimations are performed in a central place (here), and we don't simply pass the predicate to the first statistics
67+
/// object that supports it natively.
68+
4569
Float64 ColumnStatistics::estimateLess(Float64 val) const
4670
{
4771
if (stats.contains(StatisticsType::TDigest))
48-
return std::static_pointer_cast<TDigestStatistics>(stats.at(StatisticsType::TDigest))->estimateLess(val);
72+
return stats.at(StatisticsType::TDigest)->estimateLess(val);
4973
return rows * ConditionSelectivityEstimator::default_normal_cond_factor;
5074
}
5175

@@ -58,32 +82,32 @@ Float64 ColumnStatistics::estimateEqual(Float64 val) const
5882
{
5983
if (stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest))
6084
{
61-
auto uniq_static = std::static_pointer_cast<UniqStatistics>(stats.at(StatisticsType::Uniq));
62-
/// 2048 is the default number of buckets in TDigest. In this case, TDigest stores exactly one value (with many rows)
63-
/// for every bucket.
64-
if (uniq_static->getCardinality() < 2048)
65-
{
66-
auto tdigest_static = std::static_pointer_cast<TDigestStatistics>(stats.at(StatisticsType::TDigest));
67-
return tdigest_static->estimateEqual(val);
68-
}
85+
/// 2048 is the default number of buckets in TDigest. In this case, TDigest stores exactly one value (with many rows) for every bucket.
86+
if (stats.at(StatisticsType::Uniq)->estimateCardinality() < 2048)
87+
return stats.at(StatisticsType::TDigest)->estimateEqual(val);
6988
}
7089
if (val < - ConditionSelectivityEstimator::threshold || val > ConditionSelectivityEstimator::threshold)
7190
return rows * ConditionSelectivityEstimator::default_normal_cond_factor;
7291
else
7392
return rows * ConditionSelectivityEstimator::default_good_cond_factor;
7493
}
7594

95+
/// -------------------------------------
96+
7697
void ColumnStatistics::serialize(WriteBuffer & buf)
7798
{
7899
writeIntBinary(V0, buf);
100+
79101
UInt64 stat_types_mask = 0;
80102
for (const auto & [type, _]: stats)
81103
stat_types_mask |= 1 << UInt8(type);
82104
writeIntBinary(stat_types_mask, buf);
83-
/// We write some basic statistics
105+
106+
/// as the column row count is always useful, save it in any case
84107
writeIntBinary(rows, buf);
85-
/// We write complex statistics
86-
for (const auto & [type, stat_ptr]: stats)
108+
109+
/// write the actual statistics objects
110+
for (const auto & [type, stat_ptr] : stats)
87111
stat_ptr->serialize(buf);
88112
}
89113

@@ -96,7 +120,9 @@ void ColumnStatistics::deserialize(ReadBuffer &buf)
96120

97121
UInt64 stat_types_mask = 0;
98122
readIntBinary(stat_types_mask, buf);
123+
99124
readIntBinary(rows, buf);
125+
100126
for (auto it = stats.begin(); it != stats.end();)
101127
{
102128
if (!(stat_types_mask & 1 << UInt8(it->first)))
@@ -136,15 +162,15 @@ void MergeTreeStatisticsFactory::registerValidator(StatisticsType stats_type, Va
136162
{
137163
if (!validators.emplace(stats_type, std::move(validator)).second)
138164
throw Exception(ErrorCodes::LOGICAL_ERROR, "MergeTreeStatisticsFactory: the statistics validator type {} is not unique", stats_type);
139-
140165
}
141166

142167
MergeTreeStatisticsFactory::MergeTreeStatisticsFactory()
143168
{
144-
registerCreator(StatisticsType::TDigest, TDigestCreator);
145-
registerCreator(StatisticsType::Uniq, UniqCreator);
146169
registerValidator(StatisticsType::TDigest, TDigestValidator);
170+
registerCreator(StatisticsType::TDigest, TDigestCreator);
171+
147172
registerValidator(StatisticsType::Uniq, UniqValidator);
173+
registerCreator(StatisticsType::Uniq, UniqCreator);
148174
}
149175

150176
MergeTreeStatisticsFactory & MergeTreeStatisticsFactory::instance()
@@ -159,9 +185,7 @@ void MergeTreeStatisticsFactory::validate(const ColumnStatisticsDescription & st
159185
{
160186
auto it = validators.find(type);
161187
if (it == validators.end())
162-
{
163-
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown Statistic type '{}'", type);
164-
}
188+
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown statistic type '{}'", type);
165189
it->second(desc, data_type);
166190
}
167191
}
@@ -173,10 +197,7 @@ ColumnStatisticsPtr MergeTreeStatisticsFactory::get(const ColumnStatisticsDescri
173197
{
174198
auto it = creators.find(type);
175199
if (it == creators.end())
176-
{
177-
throw Exception(ErrorCodes::INCORRECT_QUERY,
178-
"Unknown Statistic type '{}'. Available types: tdigest, uniq", type);
179-
}
200+
throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type '{}'. Available types: 'tdigest' 'uniq'", type);
180201
auto stat_ptr = (it->second)(desc, stats.data_type);
181202
column_stat->stats[type] = stat_ptr;
182203
}
Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,15 @@
11
#pragma once
22

3-
#include <memory>
4-
#include <boost/core/noncopyable.hpp>
5-
63
#include <Core/Block.h>
7-
#include <Common/logger_useful.h>
84
#include <IO/ReadBuffer.h>
95
#include <IO/WriteBuffer.h>
106
#include <Storages/StatisticsDescription.h>
117

8+
#include <boost/core/noncopyable.hpp>
129

1310
namespace DB
1411
{
1512

16-
/// this is for user-defined statistic.
1713
constexpr auto STATS_FILE_PREFIX = "statistics_";
1814
constexpr auto STATS_FILE_SUFFIX = ".stats";
1915

@@ -25,14 +21,21 @@ class IStatistics
2521
{
2622
public:
2723
explicit IStatistics(const SingleStatisticsDescription & stat_);
28-
2924
virtual ~IStatistics() = default;
3025

31-
virtual void serialize(WriteBuffer & buf) = 0;
26+
virtual void update(const ColumnPtr & column) = 0;
3227

28+
virtual void serialize(WriteBuffer & buf) = 0;
3329
virtual void deserialize(ReadBuffer & buf) = 0;
3430

35-
virtual void update(const ColumnPtr & column) = 0;
31+
/// Estimate the cardinality of the column.
32+
/// Throws if the statistics object is not able to do a meaningful estimation.
33+
virtual UInt64 estimateCardinality() const;
34+
35+
/// Per-value estimations.
36+
/// Throws if the statistics object is not able to do a meaningful estimation.
37+
virtual Float64 estimateEqual(Float64 val) const; /// cardinality of val in the column
38+
virtual Float64 estimateLess(Float64 val) const; /// summarized cardinality of values < val in the column
3639

3740
protected:
3841
SingleStatisticsDescription stat;
@@ -43,29 +46,27 @@ using StatisticsPtr = std::shared_ptr<IStatistics>;
4346
class ColumnStatistics
4447
{
4548
public:
46-
explicit ColumnStatistics(const ColumnStatisticsDescription & stats_);
49+
explicit ColumnStatistics(const ColumnStatisticsDescription & stats_desc_);
50+
4751
void serialize(WriteBuffer & buf);
4852
void deserialize(ReadBuffer & buf);
49-
String getFileName() const;
5053

54+
String getFileName() const;
5155
const String & columnName() const;
5256

5357
UInt64 rowCount() const;
5458

5559
void update(const ColumnPtr & column);
5660

5761
Float64 estimateLess(Float64 val) const;
58-
5962
Float64 estimateGreater(Float64 val) const;
60-
6163
Float64 estimateEqual(Float64 val) const;
6264

6365
private:
64-
6566
friend class MergeTreeStatisticsFactory;
6667
ColumnStatisticsDescription stats_desc;
6768
std::map<StatisticsType, StatisticsPtr> stats;
68-
UInt64 rows; /// the number of rows of the column
69+
UInt64 rows = 0; /// the number of rows in the column
6970
};
7071

7172
class ColumnsDescription;
@@ -79,25 +80,23 @@ class MergeTreeStatisticsFactory : private boost::noncopyable
7980

8081
void validate(const ColumnStatisticsDescription & stats, DataTypePtr data_type) const;
8182

82-
using Creator = std::function<StatisticsPtr(const SingleStatisticsDescription & stats, DataTypePtr data_type)>;
83-
8483
using Validator = std::function<void(const SingleStatisticsDescription & stats, DataTypePtr data_type)>;
84+
using Creator = std::function<StatisticsPtr(const SingleStatisticsDescription & stats, DataTypePtr data_type)>;
8585

8686
ColumnStatisticsPtr get(const ColumnStatisticsDescription & stats) const;
87-
8887
ColumnsStatistics getMany(const ColumnsDescription & columns) const;
8988

90-
void registerCreator(StatisticsType type, Creator creator);
9189
void registerValidator(StatisticsType type, Validator validator);
90+
void registerCreator(StatisticsType type, Creator creator);
9291

9392
protected:
9493
MergeTreeStatisticsFactory();
9594

9695
private:
97-
using Creators = std::unordered_map<StatisticsType, Creator>;
9896
using Validators = std::unordered_map<StatisticsType, Validator>;
99-
Creators creators;
97+
using Creators = std::unordered_map<StatisticsType, Creator>;
10098
Validators validators;
99+
Creators creators;
101100
};
102101

103102
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#include <Storages/Statistics/StatisticsTDigest.h>
2+
#include <DataTypes/DataTypeNullable.h>
3+
4+
namespace DB
5+
{
6+
namespace ErrorCodes
7+
{
8+
extern const int ILLEGAL_STATISTICS;
9+
}
10+
11+
StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & stat_)
12+
: IStatistics(stat_)
13+
{
14+
}
15+
16+
void StatisticsTDigest::update(const ColumnPtr & column)
17+
{
18+
size_t rows = column->size();
19+
20+
for (size_t row = 0; row < rows; ++row)
21+
{
22+
/// TODO: support more types.
23+
Float64 value = column->getFloat64(row);
24+
t_digest.add(value, 1);
25+
}
26+
}
27+
28+
void StatisticsTDigest::serialize(WriteBuffer & buf)
29+
{
30+
t_digest.serialize(buf);
31+
}
32+
33+
void StatisticsTDigest::deserialize(ReadBuffer & buf)
34+
{
35+
t_digest.deserialize(buf);
36+
}
37+
38+
Float64 StatisticsTDigest::estimateLess(Float64 val) const
39+
{
40+
return t_digest.getCountLessThan(val);
41+
}
42+
43+
Float64 StatisticsTDigest::estimateEqual(Float64 val) const
44+
{
45+
return t_digest.getCountEqual(val);
46+
}
47+
48+
void TDigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type)
49+
{
50+
data_type = removeNullable(data_type);
51+
if (!data_type->isValueRepresentedByNumber())
52+
throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'tdigest' do not support type {}", data_type->getName());
53+
}
54+
55+
StatisticsPtr TDigestCreator(const SingleStatisticsDescription & stat, DataTypePtr)
56+
{
57+
return std::make_shared<StatisticsTDigest>(stat);
58+
}
59+
60+
}

0 commit comments

Comments
 (0)