Data Skipping Indices#4143
Conversation
Nikvas0/index creation
Nikvas0/index stream
Nikvas0/unique index
| const auto one = std::make_shared<ASTLiteral>(Field(1)); | ||
| const auto two = std::make_shared<ASTLiteral>(Field(2)); | ||
|
|
||
| node = makeASTFunction( |
There was a problem hiding this comment.
Would a special function for swapping the last two bits be better in this case?
There was a problem hiding this comment.
Yes, but this function would not be usable for anything else.
There was a problem hiding this comment.
Let's make this function nevertheless.
| else if (select.prewhere_expression) | ||
| new_expression = select.prewhere_expression->clone(); | ||
| else | ||
| /// 11_2 -- can be true and false at the same time |
There was a problem hiding this comment.
By `11_2` I guess you mean "11 in binary".
If you write `0b11`, it will be more obvious.
alesapin
left a comment
There was a problem hiding this comment.
My current remarks mostly about style. Code is really cool. I'll try to look one more time later.
|
|
||
| String MergeTreeUniqueGranule::toString() const | ||
| { | ||
| String res = ""; |
There was a problem hiding this comment.
some stream (like ostringstream) would be better
|
|
||
|
|
||
| /// Structure for storing basic index info like columns, expression, arguments, ... | ||
| class MergeTreeIndex |
| using MutableMergeTreeIndexPtr = std::shared_ptr<MergeTreeIndex>; | ||
|
|
||
|
|
||
| struct MergeTreeIndexGranule |
| IndexConditionPtr MergeTreeMinMaxIndex::createIndexCondition( | ||
| const SelectQueryInfo & query, const Context & context) const | ||
| { | ||
| return std::make_shared<MinMaxCondition>(query, context, *this); |
|
|
||
| }; | ||
|
|
||
| std::unique_ptr<MergeTreeIndex> MergeTreeMinMaxIndexCreator( |
There was a problem hiding this comment.
Move definition of this function to .cpp and add declaration to factory .cpp file. Also function name starts from lowercase.
| { | ||
| stream.assertMark(); | ||
| } | ||
| catch (Exception &e) |
| /// If there are no marks after the end of range, just use max_read_buffer_size | ||
| if (right >= marks_count | ||
| || (right + 1 == marks_count | ||
| && getMark(right).offset_in_compressed_file |
There was a problem hiding this comment.
Hard to read, maybe save into several tmp variables?
| virtual String toString() const = 0; | ||
| virtual bool empty() const = 0; | ||
|
|
||
| virtual void update(const Block & block, size_t * pos, size_t limit) = 0; |
|
|
||
| void MergeTreeMinMaxGranule::update(const Block & block, size_t * pos, size_t limit) | ||
| { | ||
| size_t rows_read = std::min(limit, block.rows() - *pos); |
There was a problem hiding this comment.
Maybe add a check that *pos is less than the number of rows? I know from the outer code that it's always true.
| auto granule = std::dynamic_pointer_cast<MergeTreeUniqueGranule>(idx_granule); | ||
| if (!granule) | ||
| throw Exception( | ||
| "Unique index condition got wrong granule", ErrorCodes::LOGICAL_ERROR); |
| INSERT INTO test.minmax_idx VALUES (1, 5, 6.9, 1.57, 'bac', 'c', '2014-07-11'); | ||
|
|
||
| /* simple select */ | ||
| SELECT * FROM test.minmax_idx WHERE i32 = 5 AND i32 + f64 < 12 AND 3 < d AND d < 7 AND (s = 'bac' OR s = 'cba') ORDER BY dt; |
There was a problem hiding this comment.
Maybe it would be better to append FORMAT JSON to the end of the query. That way we can see how many rows were read:
{
"meta":
[
{
"name": "u64",
"type": "UInt64"
},
{
"name": "i32",
"type": "Int32"
},
{
"name": "f64",
"type": "Float64"
},
{
"name": "d",
"type": "Decimal(10, 2)"
},
{
"name": "s",
"type": "String"
},
{
"name": "e",
"type": "Enum8('a' = 1, 'b' = 2, 'c' = 3)"
},
{
"name": "dt",
"type": "Date"
}
],
"data":
[
{
"u64": "0",
"i32": 5,
"f64": 4.7,
"d": 6.50,
"s": "cba",
"e": "b",
"dt": "2014-01-04"
}
],
"rows": 1,
"statistics":
{
"elapsed": 0.000601825,
"rows_read": 1,
"bytes_read": 43
}
}
|
|
||
| * `ALTER ADD INDEX name expression TYPE type GRANULARITY value AFTER name [AFTER name2]` - Adds index description to tables metadata. | ||
|
|
||
| * `ALTER DROP INDEX name` - Removes index description from tables metadata and deletes index files from disk. |
| namespace DB | ||
| { | ||
|
|
||
| using IndicesAsts = std::vector<std::shared_ptr<ASTIndexDeclaration>>; |
| String name; | ||
| IAST * expr; | ||
| ASTFunction * type; | ||
| Field granularity; |
There was a problem hiding this comment.
Why we store granularity as Field if it's unsigned integer?
| extern const int KEEPER_EXCEPTION; | ||
| extern const int ALL_REPLICAS_LOST; | ||
| extern const int REPLICA_STATUS_CHANGED; | ||
| extern const int INCORRECT_QUERY; |
| MergeTreeReaderStream( | ||
| const String &path_prefix_, const String &extension_, size_t marks_count_, | ||
| const MarkRanges &all_mark_ranges, | ||
| MarkCache *mark_cache, bool save_marks_in_cache, |
| * `minmax` | ||
| Хранит минимум и максимум выражения (если выражение - `tuple`, то для каждого элемента `tuple`), используя их для пропуска блоков аналогично первичному ключу. | ||
|
|
||
| * `unique(max_rows)` |
There was a problem hiding this comment.
It can easily be confused with a unique constraint. Let's think about a better name for it.
|
Fixed in #4286 |
I hereby agree to the terms of the CLA available at: https://yandex.ru/legal/cla/?lang=en
For changelog. Remove if this is non-significant change.
Category (leave one):
Short description (up to few sentences):
Data Skipping Indices for ClickHouse.
These indices aggregate some information about large blocks of data during writing/merging. Afterwards, they are used to skip reading data that cannot satisfy the query.
Detailed description (optional):
Indices are declared in the `CREATE TABLE` query. They can be added or removed with `ALTER TABLE ... ADD/DROP INDEX` queries.