ClickHouse
diff --git a/‎src/Common/AsynchronousMetrics.cpp‎
Lines changed: 6 additions & 24 deletions b/‎src/Common/AsynchronousMetrics.cpp‎
Lines changed: 6 additions & 24 deletions
diff --git a/‎src/Common/AsynchronousMetrics.h‎
Lines changed: 0 additions & 6 deletions b/‎src/Common/AsynchronousMetrics.h‎
Lines changed: 0 additions & 6 deletions
diff --git a/‎src/Common/ErrorCodes.cpp‎
Lines changed: 1 addition & 0 deletions b/‎src/Common/ErrorCodes.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/Common/ProfileEvents.cpp‎
Lines changed: 48 additions & 23 deletions b/‎src/Common/ProfileEvents.cpp‎
Lines changed: 48 additions & 23 deletions
diff --git a/‎src/Common/ProfileEvents.h‎
Lines changed: 5 additions & 0 deletions b/‎src/Common/ProfileEvents.h‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/Core/ServerSettings.cpp‎
Lines changed: 2 additions & 1 deletion b/‎src/Core/ServerSettings.cpp‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/Core/Settings.cpp‎
Lines changed: 2 additions & 0 deletions b/‎src/Core/Settings.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/Core/SettingsChangesHistory.cpp‎
Lines changed: 2 additions & 0 deletions b/‎src/Core/SettingsChangesHistory.cpp‎
Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,7 @@
 #include <Common/formatReadable.h>
 #include <Common/logger_useful.h>
 #include <Common/setThreadName.h>
+#include <Core/ServerSettings.h>
 
 #include <boost/locale/date_time_facet.hpp>
 
@@ -23,14 +24,13 @@
 #endif
 
 
-namespace ProfileEvents
+namespace DB
 {
-    extern const Event OSCPUWaitMicroseconds;
-    extern const Event OSCPUVirtualTimeMicroseconds;
-}
 
-namespace DB
+namespace ServerSetting
 {
+    extern const ServerSettingsUInt64 os_cpu_busy_time_threshold;
+}
 
 namespace ErrorCodes
 {
@@ -1826,7 +1826,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
         }
     }
 
-    new_values["OSCPUOverload"] = { getCPUOverloadMetric(), "Relative CPU deficit, calculated as: how many threads are waiting for CPU relative to the number of threads, using CPU. If it is greater than zero, the server would benefit from more CPU. If it is significantly greater than zero, the server could become unresponsive. The metric is accumulated between the updates of asynchronous metrics." };
+    new_values["OSCPUOverload"] = { ProfileEvents::global_counters.getCPUOverload(context->getServerSettings()[ServerSetting::os_cpu_busy_time_threshold], /*reset*/ true), "Relative CPU deficit, calculated as: how many threads are waiting for CPU relative to the number of threads, using CPU. If it is greater than zero, the server would benefit from more CPU. If it is significantly greater than zero, the server could become unresponsive. The metric is accumulated between the updates of asynchronous metrics." };
 
     /// Add more metrics as you wish.
 
@@ -1849,22 +1849,4 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
     }
 }
 
-double AsynchronousMetrics::getCPUOverloadMetric()
-{
-    Int64 curr_cpu_wait_microseconds = ProfileEvents::global_counters[ProfileEvents::OSCPUWaitMicroseconds];
-    Int64 curr_cpu_virtual_time_microseconds = ProfileEvents::global_counters[ProfileEvents::OSCPUVirtualTimeMicroseconds];
-
-    Int64 os_cpu_wait_microseconds = curr_cpu_wait_microseconds - prev_cpu_wait_microseconds;
-    Int64 os_cpu_virtual_time_microseconds = curr_cpu_virtual_time_microseconds - prev_cpu_virtual_time_microseconds;
-
-    prev_cpu_wait_microseconds = curr_cpu_wait_microseconds;
-    prev_cpu_virtual_time_microseconds = curr_cpu_virtual_time_microseconds;
-
-    /// If we used less than one CPU core, we cannot detect overload.
-    if (os_cpu_virtual_time_microseconds < 1'000'000 || os_cpu_wait_microseconds <= 0)
-        return 0;
-
-    return static_cast<double>(os_cpu_wait_microseconds) / os_cpu_virtual_time_microseconds;
-}
-
 }
@@ -124,12 +124,6 @@ class AsynchronousMetrics
     [[maybe_unused]] const bool update_rss;
     ContextPtr context;
 
-
-    Int64 prev_cpu_wait_microseconds = 0;
-    Int64 prev_cpu_virtual_time_microseconds = 0;
-
-    double getCPUOverloadMetric();
-
 #if defined(OS_LINUX)
     std::optional<ReadBufferFromFilePRead> meminfo TSA_GUARDED_BY(data_mutex);
     std::optional<ReadBufferFromFilePRead> loadavg TSA_GUARDED_BY(data_mutex);
 
@@ -624,6 +624,7 @@
     M(742, DELTA_KERNEL_ERROR) \
     M(743, ICEBERG_SPECIFICATION_VIOLATION) \
     M(744, SESSION_ID_EMPTY) \
+    M(745, SERVER_OVERLOADED) \
 \
     M(900, DISTRIBUTED_CACHE_ERROR) \
     M(901, CANNOT_USE_DISTRIBUTED_CACHE) \
 
@@ -1,6 +1,7 @@
 #include <Common/ProfileEvents.h>
 #include <Common/CurrentThread.h>
 #include <Common/TraceSender.h>
+#include <Interpreters/Context.h>
 
 
 // clang-format off
@@ -11,7 +12,7 @@
     M(Query, "Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries.", ValueType::Number) \
     M(SelectQuery, "Same as Query, but only for SELECT queries.", ValueType::Number) \
     M(InsertQuery, "Same as Query, but only for INSERT queries.", ValueType::Number) \
-    M(InitialQuery, "Same as Query, but only counts initial queries (see is_initial_query).", ValueType::Number)\
+    M(InitialQuery, "Same as Query, but only counts initial queries (see is_initial_query).", ValueType::Number) \
     M(QueriesWithSubqueries, "Count queries with all subqueries", ValueType::Number) \
     M(SelectQueriesWithSubqueries, "Count SELECT queries with all subqueries", ValueType::Number) \
     M(InsertQueriesWithSubqueries, "Count INSERT queries with all subqueries", ValueType::Number) \
@@ -712,7 +713,7 @@ The server successfully detected this situation and will download merged part fr
     M(KeeperLatency, "Keeper latency", ValueType::Milliseconds) \
     M(KeeperTotalElapsedMicroseconds, "Keeper total latency for a single request", ValueType::Microseconds) \
     M(KeeperProcessElapsedMicroseconds, "Keeper commit latency for a single request", ValueType::Microseconds) \
-    M(KeeperPreprocessElapsedMicroseconds, "Keeper preprocessing latency for a single reuquest", ValueType::Microseconds)\
+    M(KeeperPreprocessElapsedMicroseconds, "Keeper preprocessing latency for a single reuquest", ValueType::Microseconds) \
     M(KeeperStorageLockWaitMicroseconds, "Time spent waiting for acquiring Keeper storage lock", ValueType::Microseconds) \
     M(KeeperCommitWaitElapsedMicroseconds, "Time spent waiting for certain log to be committed", ValueType::Microseconds) \
     M(KeeperBatchMaxCount, "Number of times the size of batch was limited by the amount", ValueType::Number) \
@@ -743,29 +744,29 @@ The server successfully detected this situation and will download merged part fr
     M(S3QueueSetFileProcessingMicroseconds, "Time spent to set file as processing", ValueType::Microseconds) \
     M(S3QueueSetFileProcessedMicroseconds, "Time spent to set file as processed", ValueType::Microseconds) \
     M(S3QueueSetFileFailedMicroseconds, "Time spent to set file as failed", ValueType::Microseconds) \
-    M(ObjectStorageQueueFailedFiles, "Number of files which failed to be processed", ValueType::Number)\
-    M(ObjectStorageQueueProcessedFiles, "Number of files which were processed", ValueType::Number)\
+    M(ObjectStorageQueueFailedFiles, "Number of files which failed to be processed", ValueType::Number) \
+    M(ObjectStorageQueueProcessedFiles, "Number of files which were processed", ValueType::Number) \
     M(ObjectStorageQueueCleanupMaxSetSizeOrTTLMicroseconds, "Time spent to set file as failed", ValueType::Microseconds) \
     M(ObjectStorageQueuePullMicroseconds, "Time spent to read file data", ValueType::Microseconds) \
     M(ObjectStorageQueueLockLocalFileStatusesMicroseconds, "Time spent to lock local file statuses", ValueType::Microseconds) \
-    M(ObjectStorageQueueFailedToBatchSetProcessing, "Number of times batched set processing request failed", ValueType::Number)\
-    M(ObjectStorageQueueTrySetProcessingRequests, "The number of times we tried to make set processing request", ValueType::Number)\
-    M(ObjectStorageQueueTrySetProcessingSucceeded, "The number of times we successfully set file as processing", ValueType::Number)\
-    M(ObjectStorageQueueTrySetProcessingFailed, "The number of times we unsuccessfully set file as processing", ValueType::Number)\
-    M(ObjectStorageQueueListedFiles, "Number of listed files in StorageS3(Azure)Queue", ValueType::Number)\
-    M(ObjectStorageQueueFilteredFiles, "Number of filtered files in StorageS3(Azure)Queue", ValueType::Number)\
-    M(ObjectStorageQueueReadFiles, "Number of read files (not equal to the number of actually inserted files)", ValueType::Number)\
-    M(ObjectStorageQueueReadRows, "Number of read rows (not equal to the number of actually inserted rows)", ValueType::Number)\
-    M(ObjectStorageQueueReadBytes, "Number of read bytes (not equal to the number of actually inserted bytes)", ValueType::Number)\
-    M(ObjectStorageQueueExceptionsDuringRead, "Number of exceptions during read in S3(Azure)Queue", ValueType::Number)\
-    M(ObjectStorageQueueExceptionsDuringInsert, "Number of exceptions during insert in S3(Azure)Queue", ValueType::Number)\
-    M(ObjectStorageQueueRemovedObjects, "Number of objects removed as part of after_processing = delete", ValueType::Number)\
-    M(ObjectStorageQueueInsertIterations, "Number of insert iterations", ValueType::Number)\
-    M(ObjectStorageQueueCommitRequests, "Number of keeper requests to commit files as either failed or processed", ValueType::Number)\
-    M(ObjectStorageQueueSuccessfulCommits, "Number of successful keeper commits", ValueType::Number)\
-    M(ObjectStorageQueueUnsuccessfulCommits, "Number of unsuccessful keeper commits", ValueType::Number)\
-    M(ObjectStorageQueueCancelledFiles, "Number cancelled files in StorageS3(Azure)Queue", ValueType::Number)\
-    M(ObjectStorageQueueProcessedRows, "Number of processed rows in StorageS3(Azure)Queue", ValueType::Number)\
+    M(ObjectStorageQueueFailedToBatchSetProcessing, "Number of times batched set processing request failed", ValueType::Number) \
+    M(ObjectStorageQueueTrySetProcessingRequests, "The number of times we tried to make set processing request", ValueType::Number) \
+    M(ObjectStorageQueueTrySetProcessingSucceeded, "The number of times we successfully set file as processing", ValueType::Number) \
+    M(ObjectStorageQueueTrySetProcessingFailed, "The number of times we unsuccessfully set file as processing", ValueType::Number) \
+    M(ObjectStorageQueueListedFiles, "Number of listed files in StorageS3(Azure)Queue", ValueType::Number) \
+    M(ObjectStorageQueueFilteredFiles, "Number of filtered files in StorageS3(Azure)Queue", ValueType::Number) \
+    M(ObjectStorageQueueReadFiles, "Number of read files (not equal to the number of actually inserted files)", ValueType::Number) \
+    M(ObjectStorageQueueReadRows, "Number of read rows (not equal to the number of actually inserted rows)", ValueType::Number) \
+    M(ObjectStorageQueueReadBytes, "Number of read bytes (not equal to the number of actually inserted bytes)", ValueType::Number) \
+    M(ObjectStorageQueueExceptionsDuringRead, "Number of exceptions during read in S3(Azure)Queue", ValueType::Number) \
+    M(ObjectStorageQueueExceptionsDuringInsert, "Number of exceptions during insert in S3(Azure)Queue", ValueType::Number) \
+    M(ObjectStorageQueueRemovedObjects, "Number of objects removed as part of after_processing = delete", ValueType::Number) \
+    M(ObjectStorageQueueInsertIterations, "Number of insert iterations", ValueType::Number) \
+    M(ObjectStorageQueueCommitRequests, "Number of keeper requests to commit files as either failed or processed", ValueType::Number) \
+    M(ObjectStorageQueueSuccessfulCommits, "Number of successful keeper commits", ValueType::Number) \
+    M(ObjectStorageQueueUnsuccessfulCommits, "Number of unsuccessful keeper commits", ValueType::Number) \
+    M(ObjectStorageQueueCancelledFiles, "Number cancelled files in StorageS3(Azure)Queue", ValueType::Number) \
+    M(ObjectStorageQueueProcessedRows, "Number of processed rows in StorageS3(Azure)Queue", ValueType::Number) \
     \
     M(ServerStartupMilliseconds, "Time elapsed from starting server to listening to sockets in milliseconds", ValueType::Milliseconds) \
     M(IOUringSQEsSubmitted, "Total number of io_uring SQEs submitted", ValueType::Number) \
@@ -938,7 +939,7 @@ The server successfully detected this situation and will download merged part fr
     M(StorageConnectionsPreserved, "Number of preserved connections for storages", ValueType::Number) \
     M(StorageConnectionsExpired, "Number of expired connections for storages", ValueType::Number) \
     M(StorageConnectionsErrors, "Number of cases when creation of a connection for storage is failed", ValueType::Number) \
-    M(StorageConnectionsElapsedMicroseconds, "Total time spend on creating connections for storages", ValueType::Microseconds)                                                                                                                                                                                                                                               \
+    M(StorageConnectionsElapsedMicroseconds, "Total time spend on creating connections for storages", ValueType::Microseconds) \
     \
     M(DiskConnectionsCreated, "Number of created connections for disk", ValueType::Number) \
     M(DiskConnectionsReused, "Number of reused connections for disk", ValueType::Number) \
@@ -1122,6 +1123,30 @@ void incrementNoTrace(Event event, Count amount)
     DB::CurrentThread::getProfileEvents().incrementNoTrace(event, amount);
 }
 
+/// Returns the CPU overload ratio: OS CPU wait time divided by OS CPU busy (virtual) time, both measured since the previously stored baseline.
+/// Returns 0 if the busy time does not exceed `os_cpu_busy_time_threshold` (microseconds) or there was no wait; `reset` makes the current counters the new baseline.
+double Counters::getCPUOverload(Int64 os_cpu_busy_time_threshold, bool reset)
+{
+    /// Wait time is sampled first: the two counters are not read atomically together, so in the worst case the overload value is slightly lower than it should be, which is fine.
+    Int64 curr_cpu_wait_microseconds = counters[OSCPUWaitMicroseconds];
+    Int64 curr_cpu_virtual_time_microseconds = counters[OSCPUVirtualTimeMicroseconds];
+    Int64 os_cpu_wait_microseconds = curr_cpu_wait_microseconds - prev_cpu_wait_microseconds.load(std::memory_order_acquire);
+    Int64 os_cpu_virtual_time_microseconds = curr_cpu_virtual_time_microseconds - prev_cpu_virtual_time_microseconds.load(std::memory_order_acquire);
+
+    if (reset)
+    {
+        /// Update wait time first for the same reason: both baselines cannot be updated atomically, so the worst case is an underestimated wait time and a lower overload value.
+        prev_cpu_wait_microseconds.store(curr_cpu_wait_microseconds, std::memory_order_release);
+        prev_cpu_virtual_time_microseconds.store(curr_cpu_virtual_time_microseconds, std::memory_order_release);
+    }
+
+    /// A non-positive busy time must never reach the division below; the threshold alone does not guarantee it when it is negative (e.g. a huge unsigned setting value converted to Int64).
+    if (os_cpu_virtual_time_microseconds <= 0 || os_cpu_virtual_time_microseconds <= os_cpu_busy_time_threshold || os_cpu_wait_microseconds <= 0)
+        return 0;
+
+    return static_cast<double>(os_cpu_wait_microseconds) / os_cpu_virtual_time_microseconds;
+}
+
 void Counters::increment(Event event, Count amount)
 {
     Counters * current = this;
 
@@ -2,6 +2,7 @@
 
 #include <Common/VariableContext.h>
 #include <Common/Stopwatch.h>
+#include <Interpreters/Context_fwd.h>
 #include <base/types.h>
 #include <base/strong_typedef.h>
 #include <Poco/Message.h>
@@ -62,6 +63,8 @@ namespace ProfileEvents
         /// Used to propagate increments
         std::atomic<Counters *> parent = {};
         bool trace_profile_events = false;
+        Counter prev_cpu_wait_microseconds = 0;
+        Counter prev_cpu_virtual_time_microseconds = 0;
 
     public:
 
@@ -86,6 +89,8 @@ namespace ProfileEvents
             return counters[event];
         }
 
+        double getCPUOverload(Int64 os_cpu_busy_time_threshold, bool reset = false);
+
         void increment(Event event, Count amount = 1);
         void incrementNoTrace(Event event, Count amount = 1);
 
 
@@ -1035,7 +1035,8 @@ The policy on how to perform a scheduling of CPU slots specified by `concurrent_
     <wait_dictionaries_load_at_startup>true</wait_dictionaries_load_at_startup>
     ```
     )", 0) \
-    DECLARE(Bool, storage_shared_set_join_use_inner_uuid, true, "If enabled, an inner UUID is generated during the creation of SharedSet and SharedJoin. ClickHouse Cloud only", 0)
+    DECLARE(Bool, storage_shared_set_join_use_inner_uuid, true, "If enabled, an inner UUID is generated during the creation of SharedSet and SharedJoin. ClickHouse Cloud only", 0) \
+    DECLARE(UInt64, os_cpu_busy_time_threshold, 1'000'000, "Threshold of OS CPU busy time in microseconds (OSCPUVirtualTimeMicroseconds metric) to consider CPU doing some useful work, no CPU overload would be considered if busy time was below this value.", 0) \
 
 
 // clang-format on
 
@@ -6420,6 +6420,8 @@ Note that initially (24.12) there was a server setting (`send_settings_to_client
     DECLARE(Milliseconds, low_priority_query_wait_time_ms, 1000, R"(
 Wait time in milliseconds when lower priority query meets higher priority query.
 )", BETA) \
+    DECLARE(Float, min_os_cpu_wait_time_ratio_to_throw, 2.0, "Min ratio between OS CPU wait (OSCPUWaitMicroseconds metric) and busy (OSCPUVirtualTimeMicroseconds metric) times to consider rejecting queries. Linear interpolation between min and max ratio is used to calculate the probability, the probability is 0 at this point.", 0) \
+    DECLARE(Float, max_os_cpu_wait_time_ratio_to_throw, 6.0, "Max ratio between OS CPU wait (OSCPUWaitMicroseconds metric) and busy (OSCPUVirtualTimeMicroseconds metric) times to consider rejecting queries. Linear interpolation between min and max ratio is used to calculate the probability, the probability is 1 at this point.", 0) \
     \
     /* ####################################################### */ \
     /* ########### START OF EXPERIMENTAL FEATURES ############ */ \
 
@@ -76,6 +76,8 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()
             {"low_priority_query_wait_time_ms", 1000, 1000, "New setting."},
             {"allow_experimental_shared_set_join", 0, 1, "A setting for ClickHouse Cloud to enable SharedSet and SharedJoin"},
             {"distributed_cache_read_request_max_tries", 20, 20, "New setting"},
+            {"min_os_cpu_wait_time_ratio_to_throw", 0, 2, "New setting"},
+            {"max_os_cpu_wait_time_ratio_to_throw", 0, 6, "New setting"},
         });
         addSettingsChanges(settings_changes_history, "25.3",
         {
Original file line number	Diff line number	Diff line change
`@@ -624,6 +624,7 @@`
`624`	`624`	`M(742, DELTA_KERNEL_ERROR) \`
`625`	`625`	`M(743, ICEBERG_SPECIFICATION_VIOLATION) \`
`626`	`626`	`M(744, SESSION_ID_EMPTY) \`
	`627`	`+ M(745, SERVER_OVERLOADED) \`
`627`	`628`	`\`
`628`	`629`	`M(900, DISTRIBUTED_CACHE_ERROR) \`
`629`	`630`	`M(901, CANNOT_USE_DISTRIBUTED_CACHE) \`
Original file line number	Diff line number	Diff line change
@@ -6420,6 +6420,8 @@ Note that initially (24.12) there was a server setting (`send_settings_to_client
`6420`	`6420`	`DECLARE(Milliseconds, low_priority_query_wait_time_ms, 1000, R"(`
`6421`	`6421`	`Wait time in milliseconds when lower priority query meets higher priority query.`
`6422`	`6422`	`)", BETA) \`
	`6423`	`+ DECLARE(Float, min_os_cpu_wait_time_ratio_to_throw, 2.0, "Min ratio between OS CPU wait (OSCPUWaitMicroseconds metric) and busy (OSCPUVirtualTimeMicroseconds metric) times to consider rejecting queries. Linear interpolation between min and max ratio is used to calculate the probability, the probability is 0 at this point.", 0) \`
	`6424`	`+ DECLARE(Float, max_os_cpu_wait_time_ratio_to_throw, 6.0, "Max ratio between OS CPU wait (OSCPUWaitMicroseconds metric) and busy (OSCPUVirtualTimeMicroseconds metric) times to consider rejecting queries. Linear interpolation between min and max ratio is used to calculate the probability, the probability is 1 at this point.", 0) \`
`6423`	`6425`	`\`
`6424`	`6426`	`/* ####################################################### */ \`
`6425`	`6427`	`/* ########### START OF EXPERIMENTAL FEATURES ############ */ \`
Original file line number	Diff line number	Diff line change
`@@ -76,6 +76,8 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()`
`76`	`76`	`{"low_priority_query_wait_time_ms", 1000, 1000, "New setting."},`
`77`	`77`	`{"allow_experimental_shared_set_join", 0, 1, "A setting for ClickHouse Cloud to enable SharedSet and SharedJoin"},`
`78`	`78`	`{"distributed_cache_read_request_max_tries", 20, 20, "New setting"},`
	`79`	`+ {"min_os_cpu_wait_time_ratio_to_throw", 0, 2, "New setting"},`
	`80`	`+ {"max_os_cpu_wait_time_ratio_to_throw", 0, 6, "New setting"},`
`79`	`81`	`});`
`80`	`82`	`addSettingsChanges(settings_changes_history, "25.3",`
`81`	`83`	`{`