Skip to content

Commit 74cd25a

Browse files
Backport #86089 to 25.8: Fix deadlock in PipelineExecutor downscaling logic
1 parent 290cc14 commit 74cd25a

File tree

8 files changed

+240
-18
lines changed

8 files changed

+240
-18
lines changed

src/Common/ISlotControl.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ class ISlotLease : public IAcquiredSlot
6262

6363
/// This method is for CPU consumption only.
6464
/// It should be called from a thread that started using the slot.
65-
/// Required for obtainting CPU time for the thread, because ctor is called in another thread.
65+
/// Required for obtaining CPU time for the thread, because ctor is called in another thread.
6666
virtual void startConsumption() = 0;
6767

6868
/// Renew the slot. This method should be called periodically.
@@ -81,6 +81,9 @@ class ISlotAllocation : public std::enable_shared_from_this<ISlotAllocation>, bo
8181
public:
8282
virtual ~ISlotAllocation() = default;
8383

84+
/// Free the allocated slots, cancel slot requests and wake up preempted threads.
85+
virtual void free() {}
86+
8487
/// Take one already granted slot if available.
8588
[[nodiscard]] virtual AcquiredSlotPtr tryAcquire() = 0;
8689

src/Common/Scheduler/CPULeaseAllocation.cpp

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -207,11 +207,29 @@ CPULeaseAllocation::CPULeaseAllocation(SlotCount max_threads_, ResourceLink mast
207207
}
208208

209209
CPULeaseAllocation::~CPULeaseAllocation()
210+
{
211+
free();
212+
}
213+
214+
void CPULeaseAllocation::free()
210215
{
211216
std::unique_lock lock{mutex};
217+
218+
if (shutdown)
219+
return;
220+
212221
shutdown = true;
213222
acquirable.store(false, std::memory_order_relaxed);
214223

224+
// Wake up all preempted threads
225+
while (true)
226+
{
227+
if (size_t thread_num = threads.preempted.find_first(); thread_num != boost::dynamic_bitset<>::npos)
228+
resetPreempted(thread_num);
229+
else
230+
break; // No preempted threads, we are done
231+
}
232+
215233
// Properly cancel pending resource request (if any)
216234
requests.cancel(lock);
217235

@@ -433,13 +451,21 @@ bool CPULeaseAllocation::renew(Lease & lease)
433451
}
434452

435453
std::unique_lock lock{mutex};
454+
436455
if (exception)
437456
throw Exception(ErrorCodes::RESOURCE_ACCESS_DENIED, "CPU Resource request failed: {}", getExceptionMessage(exception, /* with_stacktrace = */ false));
438457

439458
consume(lock, delta_ns);
440459

441460
report_span.reset();
442461

462+
if (shutdown) // Allocation is being destroyed, worker thread should stop
463+
{
464+
downscale(lease.slot_id);
465+
lease.reset();
466+
return false;
467+
}
468+
443469
// Check if we need to decrease number of running threads (i.e. `acquired`).
444470
// We want number of `acquired` slots to be less than number of `allocated` slots.
445471
// Difference `allocated - acquired` equals `granted`. But we allow `granted == -1` for two reasons:
@@ -477,14 +503,17 @@ bool CPULeaseAllocation::renew(Lease & lease)
477503
CurrentMetrics::Increment preempted_increment(CurrentMetrics::ConcurrencyControlPreempted);
478504
acquired_increment.sub(1);
479505

480-
if (!waitForGrant(lock, thread_num))
506+
if (!waitForGrant(lock, thread_num) || shutdown)
481507
{
482-
// Timeout - worker thread should stop, but query continues
508+
// Timeout or exception or shutdown - worker thread should stop
483509
downscale(thread_num);
484510
lease.reset();
485511
return false;
486512
}
487513

514+
if (settings.on_resume)
515+
settings.on_resume(thread_num);
516+
488517
if (exception) // Stop the query
489518
throw Exception(ErrorCodes::RESOURCE_ACCESS_DENIED, "CPU Resource request failed: {}", getExceptionMessage(exception, /* with_stacktrace = */ false));
490519

@@ -503,9 +532,26 @@ bool CPULeaseAllocation::waitForGrant(std::unique_lock<std::mutex> & lock, size_
503532

504533
auto predicate = [this, thread_num]
505534
{
506-
return !threads.preempted[thread_num] || exception;
535+
return !threads.preempted[thread_num] || exception || shutdown;
507536
};
508537

538+
// It is important to call on_preempt w/o lock to avoid deadlock due to recursive locking:
539+
// renew() -> ExecutorTasks::preempt() -> ExecutorTasks::finish() -> free()
540+
if (settings.on_preempt)
541+
{
542+
lock.unlock();
543+
try
544+
{
545+
settings.on_preempt(thread_num);
546+
}
547+
catch (...)
548+
{
549+
lock.lock();
550+
throw;
551+
}
552+
lock.lock();
553+
}
554+
509555
if (timeout == std::chrono::milliseconds::max())
510556
{
511557
threads.wake[thread_num].wait(lock, predicate);

src/Common/Scheduler/CPULeaseAllocation.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <condition_variable>
1616
#include <mutex>
1717
#include <chrono>
18+
#include <functional>
1819

1920
namespace DB
2021
{
@@ -34,6 +35,12 @@ struct CPULeaseSettings
3435
/// Timeout after which preempted thread should exit
3536
std::chrono::milliseconds preemption_timeout = default_preemption_timeout;
3637

38+
/// Callback to be invoked when a thread is preempted
39+
std::function<void(size_t slot_id)> on_preempt;
40+
41+
/// Callback to be invoked when a thread is resumed
42+
std::function<void(size_t slot_id)> on_resume;
43+
3744
/// For debugging purposes, not used in production
3845
String workload;
3946

@@ -149,6 +156,9 @@ class CPULeaseAllocation final : public ISlotAllocation
149156
CPULeaseSettings settings = {});
150157
~CPULeaseAllocation() override;
151158

159+
/// Free all resources held by this allocation.
160+
void free() override;
161+
152162
/// Take one already granted slot if available. Never blocks or waits for slots.
153163
/// Should be used before spawning worker threads for a query.
154164
[[nodiscard]] AcquiredSlotPtr tryAcquire() override;

src/Processors/Executors/ExecutorTasks.cpp

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,26 @@ void ExecutorTasks::finish()
1818
async_task_queue.finish();
1919
}
2020

21+
freeCPU();
22+
2123
std::lock_guard guard(executor_contexts_mutex);
2224

2325
for (auto & context : executor_contexts)
2426
context->wakeUp();
2527
}
2628

29+
void ExecutorTasks::freeCPU()
30+
{
31+
SlotAllocationPtr slots;
32+
{
33+
std::lock_guard lock(mutex);
34+
slots = std::exchange(cpu_slots, nullptr);
35+
}
36+
if (!slots)
37+
return;
38+
slots->free();
39+
}
40+
2741
void ExecutorTasks::rethrowFirstThreadException()
2842
{
2943
for (auto & executor_context : executor_contexts)
@@ -177,14 +191,19 @@ ExecutorTasks::SpawnStatus ExecutorTasks::pushTasks(Queue & queue, Queue & async
177191
return DO_NOT_SPAWN; // No new tasks -- no need for new threads
178192
}
179193

180-
void ExecutorTasks::init(size_t num_threads_, size_t use_threads_, bool profile_processors, bool trace_processors, ReadProgressCallback * callback)
194+
void ExecutorTasks::init(size_t num_threads_, size_t use_threads_, const SlotAllocationPtr & cpu_slots_, bool profile_processors, bool trace_processors, ReadProgressCallback * callback)
181195
{
182196
num_threads = num_threads_;
183197
use_threads = use_threads_;
184198
threads_queue.init(num_threads);
185199
task_queue.init(num_threads);
186200
fast_task_queue.init(num_threads);
187201

202+
{
203+
std::lock_guard lock(mutex); // In case finish() is executed concurrently with init() due to exception
204+
cpu_slots = cpu_slots_;
205+
}
206+
188207
// Initialize slot counters with zeros up to max_threads
189208
slot_count.resize(num_threads, 0);
190209

@@ -246,12 +265,11 @@ ExecutorTasks::SpawnStatus ExecutorTasks::upscale(size_t slot_id)
246265

247266
void ExecutorTasks::downscale(size_t slot_id)
248267
{
249-
std::unique_lock lock(mutex);
268+
std::lock_guard lock(mutex);
250269

251270
if (slot_id >= slot_count.size() || slot_count[slot_id] == 0)
252271
return;
253272
--slot_count[slot_id];
254-
--total_slots;
255273

256274
if (slot_id + 1 == use_threads)
257275
{
@@ -265,16 +283,34 @@ void ExecutorTasks::downscale(size_t slot_id)
265283
}
266284
}
267285
}
286+
}
268287

269-
// We should make sure that downscaled thread has no local task inside context.
270-
// It is allowed to have tasks in `task_queue` or `fast_task_queue` because they can be stolen by other threads.
288+
void ExecutorTasks::preempt(size_t slot_id)
289+
{
290+
std::unique_lock lock(mutex);
291+
--total_slots;
292+
293+
/// We should make sure that preempted thread has no local task inside context.
294+
/// It is allowed to have tasks in `task_queue` or `fast_task_queue` because they can be stolen by other threads.
271295
auto & context = executor_contexts[slot_id];
272296
if (auto * task = context->popTask())
273297
{
274298
task_queue.push(task, slot_id);
275299
/// Wake up at least one thread to avoid deadlocks (all other threads maybe idle)
276-
tryWakeUpAnyOtherThreadWithTasks(*context, lock);
300+
tryWakeUpAnyOtherThreadWithTasks(*context, lock); // this releases the lock if it wakes up a thread
277301
}
302+
else if (task_queue.empty() && fast_task_queue.empty() && async_task_queue.empty() && threads_queue.size() == total_slots)
303+
{
304+
/// Finish the pipeline if the preempted thread was the last non-idle thread and executed the last task of the whole pipeline
305+
lock.unlock();
306+
finish();
307+
}
308+
}
309+
310+
void ExecutorTasks::resume(size_t)
311+
{
312+
std::lock_guard lock(mutex);
313+
++total_slots;
278314
}
279315

280316
void ExecutorTasks::processAsyncTasks()

src/Processors/Executors/ExecutorTasks.h

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <Processors/Executors/PollingQueue.h>
55
#include <Processors/Executors/ThreadsQueue.h>
66
#include <Processors/Executors/TasksQueue.h>
7+
#include <Common/ISlotControl.h>
78
#include <stack>
89

910
namespace DB
@@ -46,12 +47,15 @@ class ExecutorTasks
4647
/// Reference counters for thread CPU slots to handle race conditions between upscale/downscale.
4748
std::vector<size_t> slot_count;
4849

49-
/// Total number of slots (sum of all slot_count).
50+
/// Total number of non-preempted slots.
5051
size_t total_slots = 0;
5152

5253
/// A set of currently waiting threads.
5354
ThreadsQueue threads_queue;
5455

56+
/// CPU slots for each thread.
57+
SlotAllocationPtr cpu_slots;
58+
5559
/// Threshold found by rolling dice.
5660
const static size_t TOO_MANY_IDLE_THRESHOLD = 4;
5761

@@ -94,18 +98,27 @@ class ExecutorTasks
9498
// If non-local tasks were added, wake up one thread to process them.
9599
SpawnStatus pushTasks(Queue & queue, Queue & async_queue, ExecutionThreadContext & context);
96100

97-
void init(size_t num_threads_, size_t use_threads_, bool profile_processors, bool trace_processors, ReadProgressCallback * callback);
101+
void init(size_t num_threads_, size_t use_threads_, const SlotAllocationPtr & cpu_slots_, bool profile_processors, bool trace_processors, ReadProgressCallback * callback);
98102
void fill(Queue & queue, Queue & async_queue);
99103

104+
/// Release CPU slots
105+
void freeCPU();
106+
100107
/// Upscale to include slot_id. Updates use_threads to max(use_threads, slot_id + 1)
101108
/// Returns spawn status indicating if more threads should be spawned
102109
SpawnStatus upscale(size_t slot_id);
103110

104-
void processAsyncTasks();
105-
106111
/// Downscale by removing slot_id from active slots. Updates use_threads to highest active slot + 1
107112
void downscale(size_t slot_id);
108113

114+
/// Temporarily release slot_id without downscale. Later either downscale() or resume() is called.
115+
void preempt(size_t slot_id);
116+
117+
/// Resume execution of a previously preempted slot.
118+
void resume(size_t slot_id);
119+
120+
void processAsyncTasks();
121+
109122
ExecutionThreadContext & getThreadContext(size_t thread_num) { return *executor_contexts[thread_num]; }
110123

111124
String dump();

src/Processors/Executors/PipelineExecutor.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ void PipelineExecutor::setReadProgressCallback(ReadProgressCallbackPtr callback)
226226
void PipelineExecutor::finalizeExecution()
227227
{
228228
single_thread_cpu_slot.reset();
229+
tasks.freeCPU();
229230
{
230231
std::lock_guard lock(spawn_mutex);
231232
cpu_slots.reset();
@@ -408,6 +409,7 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, IAcquiredSlot * cpu_sl
408409
try
409410
{
410411
// Preemption point. Renewal could block execution due to CPU overload.
412+
// It may trigger callbacks to tasks.preempt() and tasks.resume()
411413
if (!cpu_helper.renew())
412414
{
413415
tasks.downscale(cpu_helper.id());
@@ -437,7 +439,7 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, IAcquiredSlot * cpu_sl
437439
}
438440

439441
/// Properly allocate CPU slots or lease for the thread pool
440-
static SlotAllocationPtr allocateCPU(size_t num_threads, bool concurrency_control, bool trace_cpu_scheduling)
442+
SlotAllocationPtr PipelineExecutor::allocateCPU(size_t num_threads, bool concurrency_control)
441443
{
442444
// The first thread is called master thread.
443445
// It is NOT the thread that handles async tasks (unless query has max_threads=1).
@@ -494,6 +496,8 @@ static SlotAllocationPtr allocateCPU(size_t num_threads, bool concurrency_contro
494496
.quantum_ns = static_cast<ResourceCost>(quantum_ns),
495497
.report_ns = static_cast<ResourceCost>(quantum_ns / 10),
496498
.preemption_timeout = std::chrono::milliseconds(query_context->getCPUSlotPreemptionTimeout()),
499+
.on_preempt = [this](size_t slot_id) { tasks.preempt(slot_id); },
500+
.on_resume = [this](size_t slot_id) { tasks.resume(slot_id); },
497501
.workload = query_context->getSettingsRef()[Setting::workload],
498502
.trace_cpu_scheduling = trace_cpu_scheduling,
499503
});
@@ -521,7 +525,7 @@ void PipelineExecutor::initializeExecution(size_t num_threads, bool concurrency_
521525
is_execution_initialized = true;
522526
tryUpdateExecutionStatus(ExecutionStatus::NotStarted, ExecutionStatus::Executing);
523527

524-
cpu_slots = allocateCPU(num_threads, concurrency_control, trace_cpu_scheduling);
528+
cpu_slots = allocateCPU(num_threads, concurrency_control);
525529

526530
Queue queue;
527531
Queue async_queue;
@@ -530,7 +534,7 @@ void PipelineExecutor::initializeExecution(size_t num_threads, bool concurrency_
530534
/// use_threads should reflect number of thread spawned and can grow with tasks.upscale(...).
531535
/// Starting from 1 instead of 0 is to tackle the single thread scenario, where no upscale() will
532536
/// be invoked but actually 1 thread used.
533-
tasks.init(num_threads, 1, profile_processors, trace_processors, read_progress_callback.get());
537+
tasks.init(num_threads, 1, cpu_slots, profile_processors, trace_processors, read_progress_callback.get());
534538
tasks.fill(queue, async_queue);
535539

536540
if (num_threads > 1)

src/Processors/Executors/PipelineExecutor.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,6 @@ class PipelineExecutor
111111

112112
void initializeExecution(size_t num_threads, bool concurrency_control); /// Initialize executor contexts and task_queue.
113113
void finalizeExecution(); /// Check all processors are finished.
114-
void spawnThreads(AcquiredSlotPtr slot) TSA_REQUIRES(spawn_mutex);
115114

116115
/// Methods connected to execution.
117116
void executeImpl(size_t num_threads, bool concurrency_control);
@@ -120,6 +119,10 @@ class PipelineExecutor
120119
void finish();
121120
void cancel(ExecutionStatus reason);
122121

122+
// Methods for CPU scheduling
123+
SlotAllocationPtr allocateCPU(size_t num_threads, bool concurrency_control);
124+
void spawnThreads(AcquiredSlotPtr slot) TSA_REQUIRES(spawn_mutex);
125+
123126
/// If execution_status == from, change it to desired.
124127
bool tryUpdateExecutionStatus(ExecutionStatus expected, ExecutionStatus desired);
125128

0 commit comments

Comments
 (0)