-
Notifications
You must be signed in to change notification settings - Fork 8.3k
Expand file tree
/
Copy pathMergeTreeSettings.cpp
More file actions
2804 lines (2496 loc) · 131 KB
/
MergeTreeSettings.cpp
File metadata and controls
2804 lines (2496 loc) · 131 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include <Storages/MergeTree/MergeTreeSettings.h>
#include <Columns/IColumn.h>
#include <Core/BaseSettings.h>
#include <Core/BaseSettingsFwdMacrosImpl.h>
#include <Core/BaseSettingsProgramOptions.h>
#include <Core/MergeSelectorAlgorithm.h>
#include <Core/SettingsChangesHistory.h>
#include <Disks/DiskFromAST.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTSetQuery.h>
#include <Parsers/FieldFromAST.h>
#include <Parsers/isDiskFunction.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/System/MutableColumnsAndConstraints.h>
#include <Common/Exception.h>
#include <Common/NamePrompter.h>
#include <Common/logger_useful.h>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Interpreters/Context.h>
#include <Disks/DiskObjectStorage/DiskObjectStorage.h>
#include <cmath>
#include <boost/program_options.hpp>
#include <fmt/ranges.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <Poco/Util/Application.h>
#include "config.h"
#if !CLICKHOUSE_CLOUD
/// Open-source builds: parts of 10 MiB and above default to the Wide format.
constexpr UInt64 default_min_bytes_for_wide_part = 10ul * 1024ul * 1024ul;
#else
/// ClickHouse Cloud builds: a much higher 1 GiB threshold, so most parts stay below
/// the Wide-format cutoff (see the `min_bytes_for_wide_part` setting declared below).
constexpr UInt64 default_min_bytes_for_wide_part = 1073741824lu;
#endif
namespace DB
{
/// Error codes referenced by this translation unit. The `extern` declarations mean the
/// definitions live elsewhere (the shared error-code registry); only the codes used here
/// are declared. NOTE(review): presumably thrown by the settings parsing/validation code
/// later in this file — confirm against the actual throw sites below.
namespace ErrorCodes
{
extern const int UNKNOWN_SETTING;
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
extern const int READONLY;
}
// clang-format off
/** These settings are fine-tuning parameters for the internal details of MergeTree
 * storages and should not be changed by the user without a good reason.
 */
#define MERGE_TREE_SETTINGS(DECLARE, DECLARE_WITH_ALIAS) \
DECLARE(UInt64, min_compress_block_size, 0, R"(
Minimum size of blocks of uncompressed data required for compression when
writing the next mark. You can also specify this setting in the global settings
(see [min_compress_block_size](/operations/settings/merge-tree-settings#min_compress_block_size)
setting). The value specified when the table is created overrides the global value
for this setting.
)", 0) \
DECLARE(UInt64, max_compress_block_size, 0, R"(
The maximum size of blocks of uncompressed data before compressing for writing
to a table. You can also specify this setting in the global settings
(see [max_compress_block_size](/operations/settings/merge-tree-settings#max_compress_block_size)
setting). The value specified when the table is created overrides the global
value for this setting.
)", 0) \
DECLARE(UInt64, index_granularity, 8192, R"(
Maximum number of data rows between the marks of an index. I.e how many rows
correspond to one primary key value.
)", 0) \
\
/** Data storing format settings. */ \
DECLARE(UInt64, min_bytes_for_wide_part, default_min_bytes_for_wide_part, R"(
Minimum number of bytes/rows in a data part that can be stored in `Wide`
format. You can set one, both or none of these settings.
)", 0) \
DECLARE(UInt32, min_level_for_wide_part, 0, R"(
Minimal part level to create a data part in `Wide` format instead of `Compact`.
)", 0) \
DECLARE(UInt64, min_rows_for_wide_part, 0, R"(
Minimal number of rows to create a data part in `Wide` format instead of `Compact`.
)", 0) \
DECLARE(UInt64, max_merge_delayed_streams_for_parallel_write, 40, R"(
The maximum number of streams (columns) that can be flushed in parallel
(analog of max_insert_delayed_streams_for_parallel_write for merges). Works
only for Vertical merges.
)", 0) \
DECLARE(Float, ratio_of_defaults_for_sparse_serialization, 0.9375f, R"(
Minimal ratio of the number of _default_ values to the number of _all_ values
in a column. Setting this value causes the column to be stored using sparse
serializations.
If a column is sparse (contains mostly zeros), ClickHouse can encode it in
a sparse format and automatically optimize calculations - the data does not
require full decompression during queries. To enable this sparse
serialization, define the `ratio_of_defaults_for_sparse_serialization`
setting to be less than 1.0. If the value is greater than or equal to 1.0,
then the columns will be always written using the normal full serialization.
Possible values:
- Float between `0` and `1` to enable sparse serialization
- `1.0` (or greater) if you do not want to use sparse serialization
**Example**
Notice the `s` column in the following table is an empty string for 95% of
the rows. In `my_regular_table` we do not use sparse serialization, and in
`my_sparse_table` we set `ratio_of_defaults_for_sparse_serialization` to
0.95:
```sql
CREATE TABLE my_regular_table
(
`id` UInt64,
`s` String
)
ENGINE = MergeTree
ORDER BY id;
INSERT INTO my_regular_table
SELECT
number AS id,
number % 20 = 0 ? toString(number): '' AS s
FROM
numbers(10000000);
CREATE TABLE my_sparse_table
(
`id` UInt64,
`s` String
)
ENGINE = MergeTree
ORDER BY id
SETTINGS ratio_of_defaults_for_sparse_serialization = 0.95;
INSERT INTO my_sparse_table
SELECT
number,
number % 20 = 0 ? toString(number): ''
FROM
numbers(10000000);
```
Notice the `s` column in `my_sparse_table` uses less storage space on disk:
```sql
SELECT table, name, data_compressed_bytes, data_uncompressed_bytes FROM system.columns
WHERE table LIKE 'my_%_table';
```
```response
┌─table────────────┬─name─┬─data_compressed_bytes─┬─data_uncompressed_bytes─┐
│ my_regular_table │ id │ 37790741 │ 75488328 │
│ my_regular_table │ s │ 2451377 │ 12683106 │
│ my_sparse_table │ id │ 37790741 │ 75488328 │
│ my_sparse_table │ s │ 2283454 │ 9855751 │
└──────────────────┴──────┴───────────────────────┴─────────────────────────┘
```
You can verify if a column is using the sparse encoding by viewing the
`serialization_kind` column of the `system.parts_columns` table:
```sql
SELECT column, serialization_kind FROM system.parts_columns
WHERE table LIKE 'my_sparse_table';
```
You can see which parts of `s` were stored using the sparse serialization:
```response
┌─column─┬─serialization_kind─┐
│ id │ Default │
│ s │ Default │
│ id │ Default │
│ s │ Default │
│ id │ Default │
│ s │ Sparse │
│ id │ Default │
│ s │ Sparse │
│ id │ Default │
│ s │ Sparse │
│ id │ Default │
│ s │ Sparse │
│ id │ Default │
│ s │ Sparse │
│ id │ Default │
│ s │ Sparse │
│ id │ Default │
│ s │ Sparse │
│ id │ Default │
│ s │ Sparse │
│ id │ Default │
│ s │ Sparse │
└────────┴────────────────────┘
```
)", 0) \
DECLARE(Bool, replace_long_file_name_to_hash, true, R"(
If the file name for column is too long (more than 'max_file_name_length'
bytes) replace it to SipHash128
)", 0) \
DECLARE(Bool, escape_index_filenames, true, R"(
Prior to 26.1 we didn't escape special symbols in filenames created for secondary indices, which could lead to issues with some
characters in index names producing broken parts. This is added purely for compatibility reasons. It should not be changed unless you
are reading old parts with indices using non-ascii characters in their names.
)", 0) \
DECLARE(UInt64, max_file_name_length, 127, R"(
The maximal length of the file name to keep it as is without hashing.
Takes effect only if setting `replace_long_file_name_to_hash` is enabled.
The value of this setting does not include the length of file extension. So,
it is recommended to set it below the maximum filename length (usually 255
bytes) with some gap to avoid filesystem errors.
)", 0) \
DECLARE(UInt64, min_bytes_for_full_part_storage, 0, R"(
Only available in ClickHouse Cloud. Minimal uncompressed size in bytes to
use full type of storage for data part instead of packed
)", 0) \
DECLARE(UInt32, min_level_for_full_part_storage, 0, R"(
Only available in ClickHouse Cloud. Minimal part level to
use full type of storage for data part instead of packed
)", 0) \
DECLARE(UInt64, min_rows_for_full_part_storage, 0, R"(
Only available in ClickHouse Cloud. Minimal number of rows to use full type
of storage for data part instead of packed
)", 0) \
DECLARE(UInt64, compact_parts_max_bytes_to_buffer, 128 * 1024 * 1024, R"(
Only available in ClickHouse Cloud. Maximal number of bytes to write in a
single stripe in compact parts
)", 0) \
DECLARE(NonZeroUInt64, compact_parts_max_granules_to_buffer, 128, R"(
Only available in ClickHouse Cloud. Maximal number of granules to write in a
single stripe in compact parts
)", 0) \
DECLARE(UInt64, compact_parts_merge_max_bytes_to_prefetch_part, 16 * 1024 * 1024, R"(
Only available in ClickHouse Cloud. Maximal size of compact part to read it
in a whole to memory during merge.
)", 0) \
DECLARE(UInt64, merge_max_bytes_to_prewarm_cache, 1ULL * 1024 * 1024 * 1024, R"(
Only available in ClickHouse Cloud. Maximal size of part (compact or packed)
to prewarm cache during merge.
)", 0) \
DECLARE(UInt64, merge_total_max_bytes_to_prewarm_cache, 15ULL * 1024 * 1024 * 1024, R"(
Only available in ClickHouse Cloud. Maximal size of parts in total to prewarm
cache during merge.
)", 0) \
DECLARE(Bool, load_existing_rows_count_for_old_parts, false, R"(
If enabled along with [exclude_deleted_rows_for_part_size_in_merge](#exclude_deleted_rows_for_part_size_in_merge),
deleted rows count for existing data parts will be calculated during table
starting up. Note that it may slow down start up table loading.
Possible values:
- `true`
- `false`
**See Also**
- [exclude_deleted_rows_for_part_size_in_merge](#exclude_deleted_rows_for_part_size_in_merge) setting
)", 0) \
DECLARE(Bool, use_compact_variant_discriminators_serialization, true, R"(
Enables compact mode for binary serialization of discriminators in Variant
data type.
This mode allows to use significantly less memory for storing discriminators
in parts when there is mostly one variant or a lot of NULL values.
)", 0) \
DECLARE(Bool, escape_variant_subcolumn_filenames, true, R"(
Escape special symbols in filenames created for subcolumns of Variant data type in Wide parts of MergeTree table. Needed for compatibility.
)", 0) \
DECLARE(MergeTreeSerializationInfoVersion, serialization_info_version, "with_types", R"(
Serialization info version used when writing `serialization.json`.
This setting is required for compatibility during cluster upgrades.
Possible values:
- `basic` - Basic format.
- `with_types` - Format with additional `types_serialization_versions` field, allowing per-type serialization versions.
This makes settings like `string_serialization_version` effective.
During rolling upgrades, set this to `basic` so that new servers produce
data parts compatible with old servers. After the upgrade completes,
switch to `WITH_TYPES` to enable per-type serialization versions.
)", 0) \
DECLARE(MergeTreeStringSerializationVersion, string_serialization_version, "with_size_stream", R"(
Controls the serialization format for top-level `String` columns.
This setting is only effective when `serialization_info_version` is set to "with_types".
When set to `with_size_stream`, top-level `String` columns are serialized with a separate
`.size` subcolumn storing string lengths, rather than inline. This allows real `.size`
subcolumns and can improve compression efficiency.
Nested `String` types (e.g., inside `Nullable`, `LowCardinality`, `Array`, or `Map`)
are not affected, except when they appear in a `Tuple`.
Possible values:
- `single_stream` — Use the standard serialization format with inline sizes.
- `with_size_stream` — Use a separate size stream for top-level `String` columns.
)", 0) \
DECLARE(MergeTreeNullableSerializationVersion, nullable_serialization_version, "basic", R"(
Controls the serialization method used for `Nullable(T)` columns.
Possible values:
- basic — Use the standard serialization for `Nullable(T)`.
- allow_sparse — Permit `Nullable(T)` to use sparse encoding.
)", 0) \
DECLARE(MergeTreeObjectSerializationVersion, object_serialization_version, "v3", R"(
Serialization version for JSON data type. Required for compatibility.
Possible values:
- `v1`
- `v2`
- `v3`
Only version `v3` supports changing the shared data serialization version.
)", 0) \
DECLARE(MergeTreeObjectSharedDataSerializationVersion, object_shared_data_serialization_version, "advanced", R"(
Serialization version for shared data inside JSON data type.
Possible values:
- `map` - store shared data as `Map(String, String)`
- `map_with_buckets` - store shared data as several separate `Map(String, String)` columns. Using buckets improves reading individual paths from shared data.
- `advanced` - special serialization of shared data designed to significantly improve reading of individual paths from shared data.
Note that this serialization increases the shared data storage size on disk because we store a lot of additional information.
The number of buckets for `map_with_buckets` and `advanced` serializations is determined by settings
[object_shared_data_buckets_for_compact_part](#object_shared_data_buckets_for_compact_part)/[object_shared_data_buckets_for_wide_part](#object_shared_data_buckets_for_wide_part).
)", 0) \
DECLARE(MergeTreeObjectSharedDataSerializationVersion, object_shared_data_serialization_version_for_zero_level_parts, "map_with_buckets", R"(
This setting allows to specify different serialization version of the
shared data inside JSON type for zero level parts that are created during inserts.
It's recommended not to use `advanced` shared data serialization for zero level parts because it can increase
the insertion time significantly.
)", 0) \
DECLARE(NonZeroUInt64, object_shared_data_buckets_for_compact_part, 8, R"(
The number of buckets for JSON shared data serialization in Compact parts. Works with `map_with_buckets` and `advanced` shared data serializations.
The maximum allowed value is 256.
)", 0) \
DECLARE(NonZeroUInt64, object_shared_data_buckets_for_wide_part, 32, R"(
The number of buckets for JSON shared data serialization in Wide parts. Works with `map_with_buckets` and `advanced` shared data serializations.
The maximum allowed value is 256.
)", 0) \
DECLARE(MergeTreeDynamicSerializationVersion, dynamic_serialization_version, "v3", R"(
Serialization version for Dynamic data type. Required for compatibility.
Possible values:
- `v1`
- `v2`
- `v3`
)", 0) \
DECLARE(Bool, propagate_types_serialization_versions_to_nested_types, true, R"(
If true, serialization versions like string_serialization_version will be propagated inside nested types like Array/Map/Nullable/JSON/etc. If disabled, the serialization version will take affect only to top-level columns of this type and Tuple el
)", 0)\
DECLARE(MergeTreeMapSerializationVersion, map_serialization_version, "basic", R"(
Controls the serialization method used for `Map` columns.
Possible values:
- basic — Use the standard serialization for `Map`.
- with_buckets — Split keys into buckets during serialization. Using buckets improves reading individual keys from the Map.
The number of buckets in `with_buckets` serialization is determined by [max_buckets_in_map](#max_buckets_in_map) and [map_buckets_strategy](#map_buckets_strategy).
)", 0) \
DECLARE(MergeTreeMapSerializationVersion, map_serialization_version_for_zero_level_parts, "basic", R"(
This setting allows to specify a different serialization version of
`Map` columns for zero level parts that are created during inserts.
It can be useful to keep `basic` serialization for zero level parts to avoid
performance degradation during inserts, while using `with_buckets` for merged parts.
)", 0) \
DECLARE(NonZeroUInt64, max_buckets_in_map, 32, R"(
The maximum number of buckets for `Map` serialization. Works with `with_buckets` `Map` serialization.
The actual number of buckets is determined by [map_buckets_strategy](#map_buckets_strategy).
The maximum allowed value is 256.
)", 0) \
DECLARE(MergeTreeMapBucketsStrategy, map_buckets_strategy, "sqrt", R"(
Controls the strategy for choosing the number of buckets in `with_buckets` `Map` serialization based on the average map size.
Possible values:
- constant — Always use [max_buckets_in_map](#max_buckets_in_map) as the number of buckets, regardless of the average map size.
- sqrt — Use `round(map_buckets_coefficient * sqrt(avg_map_size))` as the number of buckets, clamped to `[1, max_buckets_in_map]`.
- linear — Use `round(map_buckets_coefficient * avg_map_size)` as the number of buckets, clamped to `[1, max_buckets_in_map]`.
)", 0) \
DECLARE(Float, map_buckets_coefficient, 1.0, R"(
The coefficient used in `sqrt` and `linear` [map_buckets_strategy](#map_buckets_strategy) to calculate the number of buckets from the average map size.
For `sqrt` strategy: `round(map_buckets_coefficient * sqrt(avg_map_size))`.
For `linear` strategy: `round(map_buckets_coefficient * avg_map_size)`.
Ignored when `map_buckets_strategy` is `constant`.
)", 0) \
DECLARE(UInt64, map_buckets_min_avg_size, 32, R"(
The minimum average map size (number of keys per row) required to apply `with_buckets` serialization.
If the average map size is less than this value, a single bucket is used regardless of other bucket settings.
A value of `0` disables the threshold and always applies the bucketing strategy.
This setting is useful to avoid the overhead of bucketed serialization for small maps where the benefit is negligible.
)", 0) \
DECLARE(Bool, write_marks_for_substreams_in_compact_parts, true, R"(
Enables writing marks per each substream instead of per each column in Compact parts.
It allows to read individual subcolumns from the data part efficiently.
For example, column `t Tuple(a String, b UInt32, c Array(Nullable(UInt32)))` is serialized in the next substreams:
- `t.a` for String data of tuple element `a`
- `t.b` for UInt32 data of tuple element `b`
- `t.c.size0` for array sizes of tuple element `c`
- `t.c.null` for null map of nested array elements of tuple element `c`
- `t.c` for UInt32 data pf nested array elements of tuple element `c`
When this setting is enabled, we will write a mark for each of these 5 substreams, which means that we will be able to read
the data of each individual substream from the granule separately if needed. For example, if we want to read the subcolumn `t.c` we will read only data of
substreams `t.c.size0`, `t.c.null` and `t.c` and won't read data from substreams `t.a` and `t.b`. When this setting is disabled,
we will write a mark only for top-level column `t`, which means that we will always read the whole column data from the granule, even if we need only data of some substreams.
)", 0) \
DECLARE(UInt64Auto, merge_max_dynamic_subcolumns_in_wide_part, Field("auto"), R"(
The maximum number of dynamic subcolumns that can be created in every column in the Wide data part after merge.
It allows to reduce number of files created in Wide data part regardless of dynamic parameters specified in the data type.
For example, if the table has a column with the JSON(max_dynamic_paths=1024) type and the setting merge_max_dynamic_subcolumns_in_wide_part is set to 128,
after merge into the Wide data part number of dynamic paths will be decreased to 128 in this part and only 128 paths will be written as dynamic subcolumns.
)", 0) \
\
DECLARE(UInt64Auto, merge_max_dynamic_subcolumns_in_compact_part, Field("auto"), R"(
The maximum number of dynamic subcolumns that can be created in every column in the Compact data part after merge.
It allows to control the number of dynamic subcolumns in Compact part regardless of dynamic parameters specified in the data type.
For example, if the table has a column with the JSON(max_dynamic_paths=1024) type and the setting merge_max_dynamic_subcolumns_in_compact_part is set to 128,
after merge into the Compact data part number of dynamic paths will be decreased to 128 in this part and only 128 paths will be written as dynamic subcolumns.
)", 0) \
\
/** Merge selector settings. */ \
DECLARE(UInt64, merge_selector_blurry_base_scale_factor, 0, R"(
Controls when the logic kicks in relatively to the number of parts in
partition. The bigger the factor the more belated reaction will be.
)", 0) \
DECLARE(UInt64, merge_selector_window_size, 1000, R"(
How many parts to look at once.
)", 0) \
\
/** Merge settings. */ \
DECLARE(NonZeroUInt64, merge_max_block_size, 8192, R"(
The number of rows that are read from the merged parts into memory.
Possible values:
- Any positive integer.
Merge reads rows from parts in blocks of `merge_max_block_size` rows, then
merges and writes the result into a new part. The read block is placed in RAM,
so `merge_max_block_size` affects the size of the RAM required for the merge.
Thus, merges can consume a large amount of RAM for tables with very wide rows
(if the average row size is 100kb, then when merging 10 parts,
(100kb * 10 * 8192) = ~ 8GB of RAM). By decreasing `merge_max_block_size`,
you can reduce the amount of RAM required for a merge but slow down a merge.
)", 0) \
DECLARE(UInt64, merge_max_block_size_bytes, 10 * 1024 * 1024, R"(
How many bytes in blocks should be formed for merge operations. By default
has the same value as `index_granularity_bytes`.
)", 0) \
DECLARE(UInt64, max_bytes_to_merge_at_max_space_in_pool, 150ULL * 1024 * 1024 * 1024, R"(
The maximum total parts size (in bytes) to be merged into one part, if there
are enough resources available. Corresponds roughly to the maximum possible
part size created by an automatic background merge. (0 means merges will be disabled)
Possible values:
- Any non-negative integer.
The merge scheduler periodically analyzes the sizes and number of parts in
partitions, and if there are enough free resources in the pool, it starts
background merges. Merges occur until the total size of the source parts is
larger than `max_bytes_to_merge_at_max_space_in_pool`.
Merges initiated by [OPTIMIZE FINAL](/sql-reference/statements/optimize)
ignore `max_bytes_to_merge_at_max_space_in_pool` (only the free disk space
is taken into account).
)", 0) \
DECLARE(UInt64, max_bytes_to_merge_at_min_space_in_pool, 1024 * 1024, R"(
The maximum total part size (in bytes) to be merged into one part, with the
minimum available resources in the background pool.
Possible values:
- Any positive integer.
`max_bytes_to_merge_at_min_space_in_pool` defines the maximum total size of
parts which can be merged despite the lack of available disk space (in pool).
This is necessary to reduce the number of small parts and the chance of
`Too many parts` errors.
Merges book disk space by doubling the total merged parts sizes.
Thus, with a small amount of free disk space, a situation may occur in which
there is free space, but this space is already booked by ongoing large merges,
so other merges are unable to start, and the number of small parts grows
with every insert.
)", 0) \
DECLARE(UInt64, max_replicated_merges_in_queue, 1000, R"(
How many tasks of merging and mutating parts are allowed simultaneously in
ReplicatedMergeTree queue.
)", 0) \
DECLARE(UInt64, max_replicated_mutations_in_queue, 8, R"(
How many tasks of mutating parts are allowed simultaneously in
ReplicatedMergeTree queue.
)", 0) \
DECLARE(UInt64, max_replicated_merges_with_ttl_in_queue, 1, R"(
How many tasks of merging parts with TTL are allowed simultaneously in
ReplicatedMergeTree queue.
)", 0) \
DECLARE(UInt64, number_of_free_entries_in_pool_to_lower_max_size_of_merge, 8, R"(
When there is less than the specified number of free entries in pool
(or replicated queue), start to lower maximum size of merge to process
(or to put in queue).
This is to allow small merges to process - not filling the pool with long
running merges.
Possible values:
- Any positive integer.
)", 0) \
DECLARE(UInt64, number_of_free_entries_in_pool_to_execute_mutation, 20, R"(
When there is less than specified number of free entries in pool, do not
execute part mutations. This is to leave free threads for regular merges and
to avoid "Too many parts" errors.
Possible values:
- Any positive integer.
**Usage**
The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting
should be less than the value of the [background_pool_size](/operations/server-configuration-parameters/settings.md/#background_pool_size)
* [background_merges_mutations_concurrency_ratio](/operations/server-configuration-parameters/settings.md/#background_merges_mutations_concurrency_ratio).
Otherwise, ClickHouse will throw an exception.
)", 0) \
DECLARE(UInt64, max_number_of_mutations_for_replica, 0, R"(
Limit the number of part mutations per replica to the specified amount.
Zero means no limit on the number of mutations per replica (the execution can
still be constrained by other settings).
)", 0) \
DECLARE(UInt64, max_number_of_merges_with_ttl_in_pool, 2, R"(When there is
more than specified number of merges with TTL entries in pool, do not assign
new merge with TTL. This is to leave free threads for regular merges and
avoid \"Too many parts\"
)", 0) \
DECLARE(Seconds, old_parts_lifetime, 8 * 60, R"(
The time (in seconds) of storing inactive parts to protect against data loss
during spontaneous server reboots.
Possible values:
- Any positive integer.
After merging several parts into a new part, ClickHouse marks the original
parts as inactive and deletes them only after `old_parts_lifetime` seconds.
Inactive parts are removed if they are not used by current queries, i.e. if
the `refcount` of the part is 1.
`fsync` is not called for new parts, so for some time new parts exist only
in the server's RAM (OS cache). If the server is rebooted spontaneously, new
parts can be lost or damaged. To protect data inactive parts are not deleted
immediately.
During startup ClickHouse checks the integrity of the parts. If the merged
part is damaged ClickHouse returns the inactive parts to the active list,
and later merges them again. Then the damaged part is renamed (the `broken_`
prefix is added) and moved to the `detached` folder. If the merged part is
not damaged, then the original inactive parts are renamed (the `ignored_`
prefix is added) and moved to the `detached` folder.
The default `dirty_expire_centisecs` value (a Linux kernel setting) is 30
seconds (the maximum time that written data is stored only in RAM), but under
heavy loads on the disk system data can be written much later. Experimentally,
a value of 480 seconds was chosen for `old_parts_lifetime`, during which a
new part is guaranteed to be written to disk.
)", 0) \
DECLARE(Seconds, temporary_directories_lifetime, 86400, R"(
How many seconds to keep tmp_-directories. You should not lower this value
because merges and mutations may not be able to work with low value of this
setting.
)", 0) \
DECLARE(Seconds, lock_acquire_timeout_for_background_operations, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, R"(
For background operations like merges, mutations etc. How many seconds before
failing to acquire table locks.
)", 0) \
DECLARE(UInt64, min_rows_to_fsync_after_merge, 0, R"(
Minimal number of rows to do fsync for part after merge (0 - disabled)
)", 0) \
DECLARE(UInt64, min_compressed_bytes_to_fsync_after_merge, 0, R"(
Minimal number of compressed bytes to do fsync for part after merge (0 - disabled)
)", 0) \
DECLARE(UInt64, min_compressed_bytes_to_fsync_after_fetch, 0, R"(
Minimal number of compressed bytes to do fsync for part after fetch (0 - disabled)
)", 0) \
DECLARE(UInt64, replicated_fetches_min_part_level, 0, R"(
Minimum part level to fetch from other replicas. Parts with level below this threshold are postponed
(kept in the replication queue and re-evaluated each scheduling cycle, not permanently skipped).
Use 1 to postpone fetching level-0 (unmerged) parts, reducing replication overhead during heavy ingestion.
Default: 0 (fetch all parts regardless of level).
)", 0) \
DECLARE(UInt64, replicated_fetches_min_part_level_timeout_seconds, 300, R"(
Timeout in seconds after which a part below replicated_fetches_min_part_level will be fetched anyway.
Use 0 to disable the timeout (parts below the minimum level are postponed indefinitely until merged).
Default: 300 (force fetch after 5 minutes).
)", 0) \
DECLARE(Bool, fsync_after_insert, false, R"(
Do fsync for every inserted part. Significantly decreases performance of
inserts, not recommended to use with wide parts.
)", 0) \
DECLARE(Bool, fsync_part_directory, false, R"(
Do fsync for part directory after all part operations (writes, renames, etc.).
)", 0) \
DECLARE(UInt64, non_replicated_deduplication_window, 0, R"(
The number of the most recently inserted blocks in the non-replicated
[MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table
for which hash sums are stored to check for duplicates.
Possible values:
- Any positive integer.
- `0` (disable deduplication).
A deduplication mechanism is used, similar to replicated tables (see
[replicated_deduplication_window](#replicated_deduplication_window) setting).
The hash sums of the created parts are written to a local file on a disk.
)", 0) \
DECLARE(UInt64, max_parts_to_merge_at_once, 100, R"(
Max amount of parts which can be merged at once (0 - disabled). Doesn't affect
OPTIMIZE FINAL query.
)", 0) \
DECLARE(Bool, materialize_statistics_on_merge, true, R"(When enabled, merges will build and store statistics for new parts.
Otherwise they can be created/stored by explicit [MATERIALIZE STATISTICS](/sql-reference/statements/alter/statistics.md)
or [during INSERTs](/operations/settings/settings.md#materialize_statistics_on_insert))", 0) \
DECLARE(Bool, materialize_skip_indexes_on_merge, true, R"(
When enabled, merges build and store skip indices for new parts.
Otherwise they can be created/stored by explicit [MATERIALIZE INDEX](/sql-reference/statements/alter/skipping-index.md/#materialize-index)
or [during INSERTs](/operations/settings/settings.md/#materialize_skip_indexes_on_insert).
See also [exclude_materialize_skip_indexes_on_merge](#exclude_materialize_skip_indexes_on_merge) for more fine-grained control.
)", 0) \
DECLARE(String, exclude_materialize_skip_indexes_on_merge, "", R"(
Excludes provided comma delimited list of skip indexes from being built and stored during merges. Has no effect if
[materialize_skip_indexes_on_merge](#materialize_skip_indexes_on_merge) is false.
The excluded skip indexes will still be built and stored by an explicit
[MATERIALIZE INDEX](/sql-reference/statements/alter/skipping-index.md/#materialize-index) query or during INSERTs depending on
the [materialize_skip_indexes_on_insert](/operations/settings/settings.md/#materialize_skip_indexes_on_insert)
session setting.
Example:
```sql
CREATE TABLE tab
(
a UInt64,
b UInt64,
INDEX idx_a a TYPE minmax,
INDEX idx_b b TYPE set(3)
)
ENGINE = MergeTree ORDER BY tuple() SETTINGS exclude_materialize_skip_indexes_on_merge = 'idx_a';
INSERT INTO tab SELECT number, number / 50 FROM numbers(100); -- setting has no effect on INSERTs
-- idx_a will be excluded from update during background or explicit merge via OPTIMIZE TABLE FINAL
-- can exclude multiple indexes by providing a list
ALTER TABLE tab MODIFY SETTING exclude_materialize_skip_indexes_on_merge = 'idx_a, idx_b';
-- default setting, no indexes excluded from being updated during merge
ALTER TABLE tab MODIFY SETTING exclude_materialize_skip_indexes_on_merge = '';
```
)", 0) \
DECLARE(UInt64, merge_selecting_sleep_ms, 5000, R"(
Minimum time to wait before trying to select parts to merge again after no
parts were selected. A lower setting will trigger selecting tasks in
background_schedule_pool frequently, which results in a large amount of
requests to ZooKeeper in large-scale clusters
)", 0) \
DECLARE(UInt64, max_merge_selecting_sleep_ms, 60000, R"(
Maximum time to wait before trying to select parts to merge again after no
parts were selected. A lower setting will trigger selecting tasks in
background_schedule_pool frequently, which results in a large amount of
requests to ZooKeeper in large-scale clusters
)", 0) \
DECLARE(Float, merge_selecting_sleep_slowdown_factor, 1.2f, R"(
The sleep time for merge selecting task is multiplied by this factor when
there's nothing to merge and divided when a merge was assigned
)", 0) \
DECLARE(UInt64, merge_tree_clear_old_temporary_directories_interval_seconds, 60, R"(
Sets the interval in seconds for ClickHouse to execute the cleanup of old
temporary directories.
Possible values:
- Any positive integer.
)", 0) \
DECLARE(UInt64, merge_tree_clear_old_parts_interval_seconds, 1, R"(
Sets the interval in seconds for ClickHouse to execute the cleanup of old
parts, WALs, and mutations.
Possible values:
- Any positive integer.
)", 0) \
DECLARE(UInt64, min_age_to_force_merge_seconds, 0, R"(
Merge parts if every part in the range is older than the value of
`min_age_to_force_merge_seconds`.
By default, ignores setting `max_bytes_to_merge_at_max_space_in_pool`
(see `enable_max_bytes_limit_for_min_age_to_force_merge`).
Possible values:
- Positive integer.
)", 0) \
DECLARE(Bool, min_age_to_force_merge_on_partition_only, false, R"(
Whether `min_age_to_force_merge_seconds` should be applied only on the entire
partition and not on a subset.
By default, ignores setting `max_bytes_to_merge_at_max_space_in_pool` (see
`enable_max_bytes_limit_for_min_age_to_force_merge`).
Possible values:
- true, false
)", false) \
DECLARE(Bool, enable_max_bytes_limit_for_min_age_to_force_merge, true, R"(
Whether settings `min_age_to_force_merge_seconds` and
`min_age_to_force_merge_on_partition_only` should respect the setting
`max_bytes_to_merge_at_max_space_in_pool`.
Possible values:
- `true`
- `false`
)", false) \
DECLARE(UInt64, number_of_free_entries_in_pool_to_execute_optimize_entire_partition, 25, R"(
When there are fewer than the specified number of free entries in the pool, do not
execute the task of optimizing an entire partition in the background (this task is
generated when `min_age_to_force_merge_seconds` is set and
`min_age_to_force_merge_on_partition_only` is enabled). This leaves free threads
for regular merges and avoids "Too many parts".
Possible values:
- Positive integer.
The value of the `number_of_free_entries_in_pool_to_execute_optimize_entire_partition`
setting should be less than the value of the
[background_pool_size](/operations/server-configuration-parameters/settings.md/#background_pool_size)
* [background_merges_mutations_concurrency_ratio](/operations/server-configuration-parameters/settings.md/#background_merges_mutations_concurrency_ratio).
Otherwise, ClickHouse throws an exception.
)", 0) \
DECLARE(Bool, remove_rolled_back_parts_immediately, 1, R"(
Setting for an incomplete experimental feature.
)", EXPERIMENTAL) \
DECLARE(UInt64, replicated_max_mutations_in_one_entry, 10000, R"(
Max number of mutation commands that can be merged together and executed in
one MUTATE_PART entry (0 means unlimited)
)", 0) \
DECLARE(UInt64, number_of_mutations_to_delay, 500, R"(If table has at least
that many unfinished mutations, artificially slow down mutations of the table.
Disabled if set to 0
)", 0) \
DECLARE(UInt64, number_of_mutations_to_throw, 1000, R"(
If the table has at least that many unfinished mutations, throw a 'Too many mutations'
exception. Disabled if set to 0
)", 0) \
DECLARE(UInt64, min_delay_to_mutate_ms, 10, R"(
Min delay of mutating MergeTree table in milliseconds, if there are a lot of
unfinished mutations
)", 0) \
DECLARE(UInt64, max_delay_to_mutate_ms, 1000, R"(
Max delay of mutating MergeTree table in milliseconds, if there are a lot of
unfinished mutations
)", 0) \
DECLARE(Bool, exclude_deleted_rows_for_part_size_in_merge, false, R"(
If enabled, estimated actual size of data parts (i.e., excluding those rows
that have been deleted through `DELETE FROM`) will be used when selecting
parts to merge. Note that this behavior is only triggered for data parts
affected by `DELETE FROM` executed after this setting is enabled.
Possible values:
- `true`
- `false`
**See Also**
- [load_existing_rows_count_for_old_parts](#load_existing_rows_count_for_old_parts)
setting
)", 0) \
DECLARE(String, merge_workload, "", R"(
Used to regulate how resources are utilized and shared between merges and
other workloads. Specified value is used as `workload` setting value for
background merges of this table. If not specified (empty string), then
server setting `merge_workload` is used instead.
**See Also**
- [Workload Scheduling](/operations/workload-scheduling.md)
)", 0) \
DECLARE(String, mutation_workload, "", R"(
Used to regulate how resources are utilized and shared between mutations and
other workloads. Specified value is used as `workload` setting value for
background mutations of this table. If not specified (empty string), then
server setting `mutation_workload` is used instead.
**See Also**
- [Workload Scheduling](/operations/workload-scheduling.md)
)", 0) \
DECLARE(Milliseconds, background_task_preferred_step_execution_time_ms, 50, R"(
Target time to execution of one step of merge or mutation. Can be exceeded if
one step takes longer time
)", 0) \
DECLARE(Bool, enforce_index_structure_match_on_partition_manipulation, false, R"(
If this setting is enabled for destination table of a partition manipulation
query (`ATTACH/MOVE/REPLACE PARTITION`), the indices and projections must be
identical between the source and destination tables. Otherwise, the destination
table can have a superset of the source table's indices and projections.
)", 0) \
DECLARE(MergeSelectorAlgorithm, merge_selector_algorithm, MergeSelectorAlgorithm::SIMPLE, R"(
The algorithm to select parts for merges assignment
)", EXPERIMENTAL) \
DECLARE(Bool, merge_selector_enable_heuristic_to_lower_max_parts_to_merge_at_once, false, R"(
Enable heuristic for simple merge selector which will lower maximum limit for merge choice.
By doing so number of concurrent merges will increase which can help with TOO_MANY_PARTS
errors but at the same time this will increase the write amplification.
)", EXPERIMENTAL) \
DECLARE(UInt64, merge_selector_heuristic_to_lower_max_parts_to_merge_at_once_exponent, 5, R"(
Controls the exponent value used in formulae building lowering curve. Lowering exponent will
lower merge widths which will trigger increase in write amplification. The reverse is also true.
)", EXPERIMENTAL) \
DECLARE(Bool, merge_selector_enable_heuristic_to_remove_small_parts_at_right, true, R"(
Enable heuristic for selecting parts for merge which removes parts from right
side of range, if their size is less than specified ratio (0.01) of sum_size.
Works for Simple and StochasticSimple merge selectors
)", 0) \
DECLARE(Float, merge_selector_base, 5.0, R"(Affects write amplification of
assigned merges (expert level setting, don't change if you don't understand
what it is doing). Works for Simple and StochasticSimple merge selectors
)", 0) \
DECLARE(UInt64, min_parts_to_merge_at_once, 0, R"(
Minimal amount of data parts which merge selector can pick to merge at once
(expert level setting, don't change if you don't understand what it is doing).
0 - disabled. Works for Simple and StochasticSimple merge selectors.
)", 0) \
DECLARE(Bool, apply_patches_on_merge, true, R"(
If true, patch parts are applied during merges
)", 0) \
\
DECLARE(UInt64, max_uncompressed_bytes_in_patches, 30ULL * 1024 * 1024 * 1024, R"(
The maximum uncompressed size of data in all patch parts in bytes.
If amount of data in all patch parts exceeds this value, lightweight updates will be rejected.
0 - unlimited.
)", 0) \
/** Inserts settings. */ \
DECLARE(UInt64, parts_to_delay_insert, 1000, R"(
If the number of active parts in a single partition exceeds the
`parts_to_delay_insert` value, an `INSERT` is artificially slowed down.
Possible values:
- Any positive integer.
ClickHouse artificially executes `INSERT` longer (adds 'sleep') so that the
background merge process can merge parts faster than they are added.
)", 0) \
DECLARE(UInt64, inactive_parts_to_delay_insert, 0, R"(
If the number of inactive parts in a single partition in the table exceeds
the `inactive_parts_to_delay_insert` value, an `INSERT` is artificially
slowed down.
:::tip
It is useful when a server fails to clean up parts quickly enough.
:::
Possible values:
- Any positive integer.
)", 0) \
DECLARE(UInt64, parts_to_throw_insert, 3000, R"(
If the number of active parts in a single partition exceeds the
`parts_to_throw_insert` value, `INSERT` is interrupted with the `Too many
parts (N). Merges are processing significantly slower than inserts`
exception.
Possible values:
- Any positive integer.
To achieve maximum performance of `SELECT` queries, it is necessary to
minimize the number of parts processed, see [Merge Tree](/development/architecture#merge-tree).
Prior to version 23.6 this setting was set to 300. You can set a different,
higher value; it will reduce the probability of the `Too many parts`
error, but at the same time `SELECT` performance might degrade. Also in case
of a merge issue (for example, due to insufficient disk space) you will
notice it later than you would with the original 300.
)", 0) \
DECLARE(UInt64, inactive_parts_to_throw_insert, 0, R"(
If the number of inactive parts in a single partition is more than the
`inactive_parts_to_throw_insert` value, `INSERT` is interrupted with the
following exception:
> "Too many inactive parts (N). Parts cleaning are processing significantly
slower than inserts"
Possible values:
- Any positive integer.
)", 0) \
DECLARE(UInt64, max_avg_part_size_for_too_many_parts, 1ULL * 1024 * 1024 * 1024, R"(
The 'too many parts' check according to 'parts_to_delay_insert' and
'parts_to_throw_insert' will be active only if the average part size (in the
relevant partition) is not larger than the specified threshold. If it is
larger than the specified threshold, the INSERTs will be neither delayed nor
rejected. This makes it possible to have hundreds of terabytes in a single table
on a single server if the parts are successfully merged to larger parts. This
does not affect the thresholds on inactive parts or total parts.
)", 0) \
DECLARE(UInt64, max_delay_to_insert, 1, R"(
The value in seconds, which is used to calculate the `INSERT` delay, if the
number of active parts in a single partition exceeds the
[parts_to_delay_insert](#parts_to_delay_insert) value.
Possible values:
- Any positive integer.
The delay (in milliseconds) for `INSERT` is calculated by the formula:
```code
max_k = parts_to_throw_insert - parts_to_delay_insert
k = 1 + parts_count_in_partition - parts_to_delay_insert
delay_milliseconds = pow(max_delay_to_insert * 1000, k / max_k)
```
For example, if a partition has 299 active parts and parts_to_throw_insert
= 300, parts_to_delay_insert = 150, max_delay_to_insert = 1, `INSERT` is
delayed for `pow( 1 * 1000, (1 + 299 - 150) / (300 - 150) ) = 1000`
milliseconds.
Starting from version 23.1 formula has been changed to:
```code
allowed_parts_over_threshold = parts_to_throw_insert - parts_to_delay_insert
parts_over_threshold = parts_count_in_partition - parts_to_delay_insert + 1
delay_milliseconds = max(min_delay_to_insert_ms, (max_delay_to_insert * 1000)
* parts_over_threshold / allowed_parts_over_threshold)
```
For example, if a partition has 224 active parts and parts_to_throw_insert
= 300, parts_to_delay_insert = 150, max_delay_to_insert = 1,
min_delay_to_insert_ms = 10, `INSERT` is delayed for `max( 10, 1 * 1000 *
(224 - 150 + 1) / (300 - 150) ) = 500` milliseconds.
)", 0) \
DECLARE(UInt64, min_delay_to_insert_ms, 10, R"(
Min delay of inserting data into MergeTree table in milliseconds, if there
are a lot of unmerged parts in single partition.
)", 0) \
DECLARE(UInt64, max_parts_in_total, 100000, R"(
If the total number of active parts in all partitions of a table exceeds the
`max_parts_in_total` value `INSERT` is interrupted with the `Too many parts
(N)` exception.
Possible values:
- Any positive integer.
A large number of parts in a table reduces performance of ClickHouse queries
and increases ClickHouse boot time. Most often this is a consequence of an
incorrect design (mistakes when choosing a partitioning strategy - too small
partitions).
)", 0) \
DECLARE(Bool, async_insert, false, R"(
If true, data from INSERT query is stored in queue and later flushed to
table in background.
)", 0) \
DECLARE(Bool, add_implicit_sign_column_constraint_for_collapsing_engine, false, R"(
If true, adds an implicit constraint for the `sign` column of a CollapsingMergeTree
or VersionedCollapsingMergeTree table to allow only valid values (`1` and `-1`).
)", 0) \
DECLARE(Milliseconds, sleep_before_commit_local_part_in_replicated_table_ms, 0, R"(
For testing. Do not change it.
)", 0) \
DECLARE(Bool, optimize_row_order, false, R"(
Controls if the row order should be optimized during inserts to improve the
compressability of the newly inserted table part.
Only has an effect for ordinary MergeTree-engine tables. Does nothing for
specialized MergeTree engine tables (e.g. CollapsingMergeTree).
MergeTree tables are (optionally) compressed using [compression codecs](/sql-reference/statements/create/table#column_compression_codec).
Generic compression codecs such as LZ4 and ZSTD achieve maximum compression
rates if the data exposes patterns. Long runs of the same value typically
compress very well.
If this setting is enabled, ClickHouse attempts to store the data in newly
inserted parts in a row order that minimizes the number of equal-value runs
across the columns of the new table part.
In other words, a small number of equal-value runs mean that individual runs
are long and compress well.
Finding the optimal row order is computationally infeasible (NP hard).
Therefore, ClickHouse uses a heuristic to quickly find a row order which
still improves compression rates over the original row order.
<details markdown="1">
<summary>Heuristics for finding a row order</summary>
It is generally possible to shuffle the rows of a table (or table part)
freely as SQL considers the same table (table part) in different row order
equivalent.
This freedom of shuffling rows is restricted when a primary key is defined
for the table. In ClickHouse, a primary key `C1, C2, ..., CN` enforces that
the table rows are sorted by columns `C1`, `C2`, ... `Cn` ([clustered index](https://en.wikipedia.org/wiki/Database_index#Clustered)).
As a result, rows can only be shuffled within "equivalence classes" of rows,
i.e. rows which have the same values in their primary key columns.
The intuition is that primary keys with high-cardinality, e.g. primary keys
involving a `DateTime64` timestamp column, lead to many small equivalence
classes. Likewise, tables with a low-cardinality primary key create few and