Slow down all backup threads after S3 retryable errors, including ‘SlowDown’ #84854
```diff
@@ -110,12 +110,8 @@ bool Client::RetryStrategy::useGCSRewrite(const Aws::Client::AWSError<Aws::Clien
 /// NOLINTNEXTLINE(google-runtime-int)
 long Client::RetryStrategy::CalculateDelayBeforeNextRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>&, long attemptedRetries) const
 {
-    if (attemptedRetries == 0)
-    {
-        return 0;
-    }
-
-    uint64_t backoffLimitedPow = 1ul << std::min(attemptedRetries, 31l);
+    chassert(attemptedRetries >= 0);
+    uint64_t backoffLimitedPow = 1ul << std::clamp(attemptedRetries, 0l, 31l);
     return std::min<uint64_t>(scaleFactor * backoffLimitedPow, maxDelayMs);
 }
```
```diff
@@ -668,13 +664,13 @@ Client::doRequestWithRetryNetworkErrors(RequestType & request, RequestFn request
     auto with_retries = [this, request_fn_ = std::move(request_fn)] (const RequestType & request_)
     {
         chassert(client_configuration.retryStrategy);
-        const Int64 max_attempts = client_configuration.retryStrategy->GetMaxAttempts();
+        const Int64 max_attempts = client_configuration.s3_retry_attempts + 1;
         chassert(max_attempts > 0);
         std::exception_ptr last_exception = nullptr;
         for (Int64 attempt_no = 0; attempt_no < max_attempts; ++attempt_no)
         {
-            /// Sometimes we need to slow down because other requests failed with network errors to free the S3 server a bit.
-            slowDownAfterNetworkError();
+            /// Slowing down due to a previously encountered retryable error, possibly from another thread.
+            slowDownAfterRetryableError();

             try
             {
@@ -688,7 +684,19 @@ Client::doRequestWithRetryNetworkErrors(RequestType & request, RequestFn request
                 /// Not all requests can be retried in that way.
                 /// Requests that read out response body to build the result are possible to retry.
                 /// Requests that expose the response stream as an answer are not retried with that code. E.g. GetObject.
-                return request_fn_(request_);
+                auto outcome = request_fn_(request_);
+
+                if (!outcome.IsSuccess()
+                    /// AWS SDK's built-in per-thread retry logic is disabled.
+                    && client_configuration.s3_slow_all_threads_after_retryable_error
+                    && attempt_no + 1 < max_attempts
+                    /// Retry attempts are managed by the outer loop, so the attemptedRetries argument can be ignored.
+                    && client_configuration.retryStrategy->ShouldRetry(outcome.GetError(), /*attemptedRetries*/ -1))
+                {
+                    updateNextTimeToRetryAfterRetryableError(outcome.GetError(), attempt_no);
+                    continue;
+                }
+                return outcome;
             }
             catch (Poco::Net::NetException &)
             {
@@ -714,11 +722,12 @@ Client::doRequestWithRetryNetworkErrors(RequestType & request, RequestFn request
             auto error = Aws::Client::AWSError<Aws::Client::CoreErrors>(Aws::Client::CoreErrors::NETWORK_CONNECTION, /*retry*/ true);

-            /// Check if query is canceled
-            if (!client_configuration.retryStrategy->ShouldRetry(error, attempt_no))
+            /// Check if query is canceled.
+            /// Retry attempts are managed by the outer loop, so the attemptedRetries argument can be ignored.
+            if (!client_configuration.retryStrategy->ShouldRetry(error, /*attemptedRetries*/ -1))
                 break;

-            sleepAfterNetworkError(error, attempt_no);
+            updateNextTimeToRetryAfterRetryableError(error, attempt_no);
         }
     }
```

> **Author** (commenting on lines +687 to +699): This is the essential part.
```diff
@@ -749,36 +758,33 @@ RequestResult Client::processRequestResult(RequestResult && outcome) const
     return RequestResult(error);
 }

-void Client::sleepAfterNetworkError(Aws::Client::AWSError<Aws::Client::CoreErrors> error, Int64 attempt_no) const
+void Client::updateNextTimeToRetryAfterRetryableError(Aws::Client::AWSError<Aws::Client::CoreErrors> error, Int64 attempt_no) const
 {
-    auto sleep_ms = client_configuration.retryStrategy->CalculateDelayBeforeNextRetry(error, attempt_no);
-    if (!client_configuration.s3_slow_all_threads_after_network_error)
-    {
-        LOG_WARNING(log, "Request failed, now waiting {} ms before attempting again", sleep_ms);
-        sleepForMilliseconds(sleep_ms);
-        return;
-    }
+    if (!client_configuration.s3_slow_all_threads_after_network_error || !client_configuration.s3_slow_all_threads_after_retryable_error)
+        return;

-    /// Set the time other s3 requests must wait until.
+    auto sleep_ms = client_configuration.retryStrategy->CalculateDelayBeforeNextRetry(error, attempt_no);
+    /// Other S3 requests must wait until this time.
     UInt64 current_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now().time_since_epoch()).count();
     UInt64 next_time_ms = current_time_ms + sleep_ms;
-    /// next_time_to_retry_after_network_error = std::max(next_time_to_retry_after_network_error, next_time_ms)
-    for (UInt64 stored_next_time = next_time_to_retry_after_network_error;
-        (stored_next_time < next_time_ms) && !next_time_to_retry_after_network_error.compare_exchange_weak(stored_next_time, next_time_ms);)
+    UInt64 stored_next_time = next_time_to_retry_after_retryable_error;
+    while (stored_next_time < next_time_ms
+        && !next_time_to_retry_after_retryable_error.compare_exchange_weak(stored_next_time, next_time_ms))
     {
+        /// Atomically update to a later retry time, but only if it's further in the future.
     }
 }

-void Client::slowDownAfterNetworkError() const
+void Client::slowDownAfterRetryableError() const
 {
-    if (!client_configuration.s3_slow_all_threads_after_network_error)
+    if (!client_configuration.s3_slow_all_threads_after_network_error && !client_configuration.s3_slow_all_threads_after_retryable_error)
        return;

-    /// Wait until `next_time_to_retry_after_network_error`.
+    /// Wait until `next_time_to_retry_after_retryable_error`.
     for (;;)
     {
         UInt64 current_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now().time_since_epoch()).count();
-        UInt64 next_time_ms = next_time_to_retry_after_network_error.load();
+        UInt64 next_time_ms = next_time_to_retry_after_retryable_error.load();
         if (current_time_ms >= next_time_ms)
             break;
         UInt64 sleep_ms = next_time_ms - current_time_ms;
```

> **Author** (commenting on lines -757 to -758): I think this is incorrect because it changes the behavior of the old versions with
```diff
@@ -1057,7 +1063,9 @@ std::unique_ptr<S3::Client> ClientFactory::create( // NOLINT
         credentials_configuration.role_session_name, credentials_configuration.expiration_window_seconds,
         std::move(credentials_provider), client_configuration, credentials_configuration.sts_endpoint_override);

-    client_configuration.retryStrategy = std::make_shared<Client::RetryStrategy>(client_configuration.s3_retry_attempts);
+    /// Disable per-thread retry loops if global retry coordination is in use.
+    uint32_t retry_attempts = client_configuration.s3_slow_all_threads_after_retryable_error ? 0 : client_configuration.s3_retry_attempts;
+    client_configuration.retryStrategy = std::make_shared<Client::RetryStrategy>(retry_attempts);

     /// Use virtual addressing if endpoint is not specified.
     if (client_configuration.endpointOverride.empty())
@@ -1079,6 +1087,7 @@ PocoHTTPClientConfiguration ClientFactory::createClientConfiguration( // NOLINT
     unsigned int s3_max_redirects,
     unsigned int s3_retry_attempts,
     bool s3_slow_all_threads_after_network_error,
+    bool s3_slow_all_threads_after_retryable_error,
     bool enable_s3_requests_logging,
     bool for_disk_s3,
     const ThrottlerPtr & get_request_throttler,
@@ -1099,6 +1108,7 @@ PocoHTTPClientConfiguration ClientFactory::createClientConfiguration( // NOLINT
         s3_max_redirects,
         s3_retry_attempts,
         s3_slow_all_threads_after_network_error,
+        s3_slow_all_threads_after_retryable_error,
         enable_s3_requests_logging,
         for_disk_s3,
         context->getGlobalContext()->getSettingsRef()[Setting::s3_use_adaptive_timeouts],
```
> **Reviewer:** Do we really need two separate settings? Couldn't you just modify the logic of `s3_slow_all_threads_after_network_error` instead of adding a new setting?

> **Author:** I enabled the new setting only for backups. In other places, wherever the S3 client is configured, the value of this setting is `false`, unlike `s3_slow_all_threads_after_network_error`, which is `true`. For example: `src/Coordination/KeeperSnapshotManagerS3.cpp`. The reason for this is that I prefer to avoid any unexpected delays outside of the backup ecosystem, and the setting `s3_slow_all_threads_after_network_error` is too broad for this use case.

> **Reviewer:** The setting `s3_slow_all_threads_after_network_error` was added only to resolve the backup issue, despite the fact that it doesn't have the word "backup" in its name. Since you've found a better solution, let's remove the previous setting, or at least make it obsolete and make these settings work in exactly the same way, just to avoid creating unnecessary complexity.

> **Author:** It applies not solely to backups; this is why I introduced a new backup-specific setting. I am fine with deprecating `s3_slow_all_threads_after_network_error` in favor of the new setting, but IMO its value for non-backup cases should be set to `false`.

> **Reviewer:** Yes, but I didn't have any intention to do anything special for `KeeperSnapshotManagerS3` or S3 object storage. My idea in #80035 was to provide a general fix for the S3 client. Since you've modified that fix so it works only for backups now, it seems better not to apply the previous insufficient fix #80035 to `KeeperSnapshotManagerS3` or S3 object storage.