Skip to content

Commit 4794dc2

Browse files
committed
Merge remote-tracking branch 'ClickHouse/master' into dist/config-settings
2 parents 1751d76 + 8b314a3 commit 4794dc2

File tree

241 files changed

+7618
-2088
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

241 files changed

+7618
-2088
lines changed

base/base/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ set (CMAKE_CXX_STANDARD 20)
1010

1111
set (SRCS
1212
argsToConfig.cpp
13+
cgroupsv2.cpp
1314
coverage.cpp
1415
demangle.cpp
1516
getAvailableMemoryAmount.cpp

base/base/cgroupsv2.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#include <base/cgroupsv2.h>
2+
3+
#include <base/defines.h>
4+
5+
#include <fstream>
6+
#include <sstream>
7+
8+
9+
/// Reports whether cgroups v2 is available on this host.
/// On Linux builds (OS_LINUX defined) the result is true iff the file
/// 'cgroup.controllers' exists directly under default_cgroups_mount.
/// On all other builds the function always returns false.
bool cgroupsV2Enabled()
10+
{
11+
#if defined(OS_LINUX)
12+
/// This file exists iff the host has cgroups v2 enabled.
/// Only its existence is checked; the file is not opened or read here.
13+
auto controllers_file = default_cgroups_mount / "cgroup.controllers";
14+
if (!std::filesystem::exists(controllers_file))
15+
return false;
16+
return true;
17+
#else
/// cgroups are only probed on Linux builds.
18+
return false;
19+
#endif
20+
}
21+
22+
/// Reports whether the cgroups v2 memory controller is enabled at the top-level group.
/// On Linux builds the result is true iff the first line of
/// 'cgroup.subtree_control' under default_cgroups_mount contains the text "memory".
/// Returns false if that file cannot be opened, and always returns false on non-Linux builds.
bool cgroupsV2MemoryControllerEnabled()
23+
{
24+
#if defined(OS_LINUX)
/// Precondition: the caller has already established that cgroups v2 is enabled.
25+
chassert(cgroupsV2Enabled());
26+
/// According to https://docs.kernel.org/admin-guide/cgroup-v2.html:
27+
/// - file 'cgroup.controllers' defines which controllers *can* be enabled
28+
/// - file 'cgroup.subtree_control' defines which controllers *are* enabled
29+
/// Caveat: nested groups may disable controllers. For simplicity, check only the top-level group.
30+
std::ifstream subtree_control_file(default_cgroups_mount / "cgroup.subtree_control");
31+
if (!subtree_control_file.is_open())
32+
return false;
/// Only the first line of the file is read; it is expected to hold the list of enabled controllers.
33+
std::string controllers;
34+
std::getline(subtree_control_file, controllers);
/// NOTE(review): this is a substring search, not a whole-token match, so any
/// controller name containing "memory" would also satisfy it - confirm this is acceptable.
35+
if (controllers.find("memory") == std::string::npos)
36+
return false;
37+
return true;
38+
#else
39+
return false;
40+
#endif
41+
}
42+
43+
/// Returns the cgroup v2 path of the current process, relative to the cgroup mount
/// (i.e. the first line of /proc/self/cgroup with the leading "0::/" removed).
/// Returns an empty string if /proc/self/cgroup cannot be opened, if its first line
/// does not start with "0::/", or on non-Linux builds.
/// Note that an empty string is also the result when the line is exactly "0::/".
std::string cgroupV2OfProcess()
44+
{
45+
#if defined(OS_LINUX)
/// Precondition: the caller has already established that cgroups v2 is enabled.
46+
chassert(cgroupsV2Enabled());
47+
/// All PIDs assigned to a cgroup are in /sys/fs/cgroup/{cgroup_name}/cgroup.procs
48+
/// A simpler way to get the membership is:
49+
std::ifstream cgroup_name_file("/proc/self/cgroup");
50+
if (!cgroup_name_file.is_open())
51+
return "";
52+
/// With cgroups v2, there will be a *single* line with prefix "0::/"
53+
/// (see https://docs.kernel.org/admin-guide/cgroup-v2.html)
/// Only the first line is read; std::getline drops the trailing newline.
54+
std::string cgroup;
55+
std::getline(cgroup_name_file, cgroup);
56+
static const std::string v2_prefix = "0::/";
57+
if (!cgroup.starts_with(v2_prefix))
58+
return "";
/// Strip the prefix so that the result can be appended to the cgroup mount path.
59+
cgroup = cgroup.substr(v2_prefix.length());
60+
return cgroup;
61+
#else
62+
return "";
63+
#endif
64+
}

base/base/cgroupsv2.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#pragma once
2+
3+
#include <filesystem>
4+
#include <string>
5+
6+
#if defined(OS_LINUX)
7+
/// I think it is possible to mount the cgroups hierarchy somewhere else (e.g. when in containers).
8+
/// /sys/fs/cgroup was still symlinked to the actual mount in the cases that I have seen.
9+
static inline const std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup";
10+
#endif
11+
12+
/// Is cgroups v2 enabled on the system?
13+
bool cgroupsV2Enabled();
14+
15+
/// Is the memory controller of cgroups v2 enabled on the system?
16+
/// Assumes that cgroupsV2Enabled() returned true.
17+
bool cgroupsV2MemoryControllerEnabled();
18+
19+
/// Which cgroup does the process belong to?
20+
/// Returns an empty string if the cgroup cannot be determined.
21+
/// Assumes that cgroupsV2Enabled() returned true.
22+
std::string cgroupV2OfProcess();

base/base/getMemoryAmount.cpp

Lines changed: 5 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
11
#include <base/getMemoryAmount.h>
22

3+
#include <base/cgroupsv2.h>
34
#include <base/getPageSize.h>
45

56
#include <fstream>
6-
#include <sstream>
77
#include <stdexcept>
88

99
#include <unistd.h>
1010
#include <sys/types.h>
1111
#include <sys/param.h>
12-
#if defined(BSD)
13-
#include <sys/sysctl.h>
14-
#endif
1512

1613

1714
namespace
@@ -20,49 +17,14 @@ namespace
2017
std::optional<uint64_t> getCgroupsV2MemoryLimit()
2118
{
2219
#if defined(OS_LINUX)
23-
const std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup";
24-
25-
/// This file exists iff the host has cgroups v2 enabled.
26-
std::ifstream controllers_file(default_cgroups_mount / "cgroup.controllers");
27-
if (!controllers_file.is_open())
28-
return {};
29-
30-
/// Make sure that the memory controller is enabled.
31-
/// - cgroup.controllers defines which controllers *can* be enabled.
32-
/// - cgroup.subtree_control defines which controllers *are* enabled.
33-
/// (see https://docs.kernel.org/admin-guide/cgroup-v2.html)
34-
/// Caveat: nested groups may disable controllers. For simplicity, check only the top-level group.
35-
/// ReadBufferFromFile subtree_control_file(default_cgroups_mount / "cgroup.subtree_control");
36-
/// std::string subtree_control;
37-
/// readString(subtree_control, subtree_control_file);
38-
/// if (subtree_control.find("memory") == std::string::npos)
39-
/// return {};
40-
std::ifstream subtree_control_file(default_cgroups_mount / "cgroup.subtree_control");
41-
std::stringstream subtree_control_buf;
42-
subtree_control_buf << subtree_control_file.rdbuf();
43-
std::string subtree_control = subtree_control_buf.str();
44-
if (subtree_control.find("memory") == std::string::npos)
45-
return {};
46-
47-
/// Identify the cgroup the process belongs to
48-
/// All PIDs assigned to a cgroup are in /sys/fs/cgroups/{cgroup_name}/cgroup.procs
49-
/// A simpler way to get the membership is:
50-
std::ifstream cgroup_name_file("/proc/self/cgroup");
51-
if (!cgroup_name_file.is_open())
20+
if (!cgroupsV2Enabled())
5221
return {};
5322

54-
std::stringstream cgroup_name_buf;
55-
cgroup_name_buf << cgroup_name_file.rdbuf();
56-
std::string cgroup_name = cgroup_name_buf.str();
57-
if (!cgroup_name.empty() && cgroup_name.back() == '\n')
58-
cgroup_name.pop_back(); /// remove trailing newline, if any
59-
/// With cgroups v2, there will be a *single* line with prefix "0::/"
60-
const std::string v2_prefix = "0::/";
61-
if (!cgroup_name.starts_with(v2_prefix))
23+
if (!cgroupsV2MemoryControllerEnabled())
6224
return {};
63-
cgroup_name = cgroup_name.substr(v2_prefix.length());
6425

65-
std::filesystem::path current_cgroup = cgroup_name.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup_name);
26+
std::string cgroup = cgroupV2OfProcess();
27+
auto current_cgroup = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup);
6628

6729
/// Open the bottom-most nested memory limit setting file. If there is no such file at the current
6830
/// level, try again at the parent level as memory settings are inherited.

docker/test/fuzzer/run-fuzzer.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ function download
8686

8787
chmod +x clickhouse
8888
# clickhouse may be compressed - run once to decompress
89-
./clickhouse ||:
89+
./clickhouse --query "SELECT 1" ||:
9090
ln -s ./clickhouse ./clickhouse-server
9191
ln -s ./clickhouse ./clickhouse-client
9292
ln -s ./clickhouse ./clickhouse-local

docs/en/engines/table-engines/mergetree-family/mergetree.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,11 @@ Tags:
870870
- `load_balancing` - Policy for disk balancing, `round_robin` or `least_used`.
871871
- `least_used_ttl_ms` - Configure timeout (in milliseconds) for the updating available space on all disks (`0` - update always, `-1` - never update, default is `60000`). Note, if the disk can be used by ClickHouse only and is not subject to a online filesystem resize/shrink you can use `-1`, in all other cases it is not recommended, since eventually it will lead to incorrect space distribution.
872872
- `prefer_not_to_merge` — You should not use this setting. Disables merging of data parts on this volume (this is harmful and leads to performance degradation). When this setting is enabled (don't do it), merging data on this volume is not allowed (which is bad). This allows (but you don't need it) controlling (if you want to control something, you're making a mistake) how ClickHouse works with slow disks (but ClickHouse knows better, so please don't use this setting).
873+
- `volume_priority` — Defines the priority (order) in which volumes are filled. Lower value means higher priority. The parameter values should be natural numbers and collectively cover the range from 1 to N (lowest priority given) without skipping any numbers.
874+
* If _all_ volumes are tagged, they are prioritized in given order.
875+
* If only _some_ volumes are tagged, those without the tag have the lowest priority, and they are prioritized in the order they are defined in config.
876+
* If _no_ volumes are tagged, their priority is set according to the order in which they are declared in the configuration.
877+
* Two volumes cannot have the same priority value.
873878

874879
Configuration examples:
875880

@@ -919,7 +924,8 @@ In given example, the `hdd_in_order` policy implements the [round-robin](https:/
919924
If there are different kinds of disks available in the system, `moving_from_ssd_to_hdd` policy can be used instead. The volume `hot` consists of an SSD disk (`fast_ssd`), and the maximum size of a part that can be stored on this volume is 1GB. All the parts with the size larger than 1GB will be stored directly on the `cold` volume, which contains an HDD disk `disk1`.
920925
Also, once the disk `fast_ssd` gets filled by more than 80%, data will be transferred to the `disk1` by a background process.
921926

922-
The order of volume enumeration within a storage policy is important. Once a volume is overfilled, data are moved to the next one. The order of disk enumeration is important as well because data are stored on them in turns.
927+
The order of volume enumeration within a storage policy is important in case at least one of the volumes listed has no explicit `volume_priority` parameter.
928+
Once a volume is overfilled, data are moved to the next one. The order of disk enumeration is important as well because data are stored on them in turns.
923929

924930
When creating a table, one can apply one of the configured storage policies to it:
925931

docs/en/operations/server-configuration-parameters/settings.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,20 @@ Type: Bool
199199

200200
Default: 0
201201

202+
203+
## dns_cache_max_size
204+
205+
Internal DNS cache max size in bytes.
206+
207+
:::note
208+
ClickHouse also has a reverse cache, so the actual memory usage could be twice as much.
209+
:::
210+
211+
Type: UInt64
212+
213+
Default: 1024
214+
215+
202216
## dns_cache_update_period
203217

204218
Internal DNS cache update period in seconds.
@@ -458,6 +472,38 @@ Type: Double
458472

459473
Default: 0.9
460474

475+
## cgroups_memory_usage_observer_wait_time
476+
477+
Interval in seconds at which the server's maximum allowed memory consumption is adjusted according to the corresponding threshold in cgroups (see
478+
settings `cgroup_memory_watcher_hard_limit_ratio` and `cgroup_memory_watcher_soft_limit_ratio`).
479+
480+
Type: UInt64
481+
482+
Default: 15
483+
484+
## cgroup_memory_watcher_hard_limit_ratio
485+
486+
Specifies the "hard" threshold with regards to the memory consumption of the server process according to cgroups after which the server's
487+
maximum memory consumption is adjusted to the threshold value.
488+
489+
See settings `cgroups_memory_usage_observer_wait_time` and `cgroup_memory_watcher_soft_limit_ratio`
490+
491+
Type: Double
492+
493+
Default: 0.95
494+
495+
## cgroup_memory_watcher_soft_limit_ratio
496+
497+
Specifies the "soft" threshold with regards to the memory consumption of the server process according to cgroups after which arenas in
498+
jemalloc are purged.
499+
500+
501+
See settings `cgroups_memory_usage_observer_wait_time` and `cgroup_memory_watcher_hard_limit_ratio`
502+
503+
Type: Double
504+
505+
Default: 0.95
506+
461507
## max_table_size_to_drop
462508

463509
Restriction on deleting tables.

docs/en/operations/settings/settings.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -755,7 +755,7 @@ By default: 1,000,000. It only works when reading from MergeTree engines.
755755

756756
## max_concurrent_queries_for_user {#max-concurrent-queries-for-user}
757757

758-
The maximum number of simultaneously processed queries related to MergeTree table per user.
758+
The maximum number of simultaneously processed queries per user.
759759

760760
Possible values:
761761

@@ -1776,7 +1776,7 @@ Default value: 0 (no restriction).
17761776
## insert_quorum {#insert_quorum}
17771777

17781778
:::note
1779-
`insert_quorum` does not apply when using the [`SharedMergeTree` table engine](/en/cloud/reference/shared-merge-tree) in ClickHouse Cloud as all inserts are quorum inserted.
1779+
This setting is not applicable to SharedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information.
17801780
:::
17811781

17821782
Enables the quorum writes.
@@ -1819,7 +1819,7 @@ See also:
18191819
## insert_quorum_parallel {#insert_quorum_parallel}
18201820

18211821
:::note
1822-
`insert_quorum_parallel` does not apply when using the [`SharedMergeTree` table engine](/en/cloud/reference/shared-merge-tree) in ClickHouse Cloud as all inserts are quorum inserted.
1822+
This setting is not applicable to SharedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information.
18231823
:::
18241824

18251825
Enables or disables parallelism for quorum `INSERT` queries. If enabled, additional `INSERT` queries can be sent while previous queries have not yet finished. If disabled, additional writes to the same table will be rejected.
@@ -1839,6 +1839,10 @@ See also:
18391839

18401840
## select_sequential_consistency {#select_sequential_consistency}
18411841

1842+
:::note
1843+
This setting differs in behavior between SharedMergeTree and ReplicatedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information about the behavior of `select_sequential_consistency` in SharedMergeTree.
1844+
:::
1845+
18421846
Enables or disables sequential consistency for `SELECT` queries. Requires `insert_quorum_parallel` to be disabled (enabled by default).
18431847

18441848
Possible values:
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
---
2+
slug: /en/operations/system-tables/dns_cache
3+
---
4+
# dns_cache
5+
6+
Contains information about cached DNS records.
7+
8+
Columns:
9+
10+
- `hostname` ([String](../../sql-reference/data-types/string.md)) — cached hostname
11+
- `ip_address` ([String](../../sql-reference/data-types/string.md)) — IP address for the hostname
12+
- `ip_family` ([Enum](../../sql-reference/data-types/enum.md)) — family of the IP address, possible values:
13+
- 'IPv4'
14+
- 'IPv6'
15+
- 'UNIX_LOCAL'
16+
- `cached_at` ([DateTime](../../sql-reference/data-types/datetime.md)) — when the record was cached
17+
18+
**Example**
19+
20+
Query:
21+
22+
```sql
23+
SELECT * FROM system.dns_cache;
24+
```
25+
26+
Result:
27+
28+
| hostname | ip\_address | ip\_family | cached\_at |
29+
| :--- | :--- | :--- | :--- |
30+
| localhost | ::1 | IPv6 | 2024-02-11 17:04:40 |
31+
| localhost | 127.0.0.1 | IPv4 | 2024-02-11 17:04:40 |
32+
33+
**See also**
34+
35+
- [disable_internal_dns_cache setting](../../operations/server-configuration-parameters/settings.md#disable_internal_dns_cache)
36+
- [dns_cache_max_size setting](../../operations/server-configuration-parameters/settings.md#dns_cache_max_size)
37+
- [dns_cache_update_period setting](../../operations/server-configuration-parameters/settings.md#dns_cache_update_period)
38+
- [dns_max_consecutive_failures setting](../../operations/server-configuration-parameters/settings.md#dns_max_consecutive_failures)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
---
2+
slug: /en/operations/system-tables/settings_changes
3+
---
4+
# settings_changes
5+
6+
Contains information about setting changes in previous ClickHouse versions.
7+
8+
Columns:
9+
10+
- `version` ([String](../../sql-reference/data-types/string.md)) — The ClickHouse version in which settings were changed
11+
- `changes` ([Array](../../sql-reference/data-types/array.md) of [Tuple](../../sql-reference/data-types/tuple.md)) — A description of the setting changes: (setting name, previous value, new value, reason for the change)
12+
13+
**Example**
14+
15+
``` sql
16+
SELECT *
17+
FROM system.settings_changes
18+
WHERE version = '23.5'
19+
FORMAT Vertical
20+
```
21+
22+
``` text
23+
Row 1:
24+
──────
25+
version: 23.5
26+
changes: [('input_format_parquet_preserve_order','1','0','Allow Parquet reader to reorder rows for better parallelism.'),('parallelize_output_from_storages','0','1','Allow parallelism when executing queries that read from file/url/s3/etc. This may reorder rows.'),('use_with_fill_by_sorting_prefix','0','1','Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. Rows with different values in sorting prefix are filled independently'),('output_format_parquet_compliant_nested_types','0','1','Change an internal field name in output Parquet file schema.')]
27+
```
28+
29+
**See also**
30+
31+
- [Settings](../../operations/settings/index.md#session-settings-intro)
32+
- [system.settings](settings.md)

0 commit comments

Comments
 (0)