Skip to content

Commit 2e45181

Browse files
authored
fix: use Recreate strategy for GPU workloads to prevent rolling update deadlock (#196)
Signed-off-by: Christopher Maher <[email protected]>
1 parent 8e6d968 commit 2e45181

File tree

2 files changed

+14
-0
lines changed

2 files changed

+14
-0
lines changed

internal/controller/inferenceservice_controller.go

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -839,6 +839,13 @@ func (r *InferenceServiceReconciler) constructDeployment(
839839
}
840840

841841
if gpuCount > 0 {
842+
// Use Recreate strategy for GPU workloads to prevent deadlock:
843+
// RollingUpdate requires the new pod to be Ready before terminating the old,
844+
// but the new pod cannot schedule if the old pod holds the only available GPU(s).
845+
deployment.Spec.Strategy = appsv1.DeploymentStrategy{
846+
Type: appsv1.RecreateDeploymentStrategyType,
847+
}
848+
842849
tolerations := []corev1.Toleration{
843850
{
844851
Key: "nvidia.com/gpu",

internal/controller/inferenceservice_controller_test.go

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -255,6 +255,9 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
255255
By("verifying GPU resource limits")
256256
gpuLimit := container.Resources.Limits["nvidia.com/gpu"]
257257
Expect(gpuLimit).To(Equal(resource.MustParse("2")))
258+
259+
By("verifying Recreate strategy is set to avoid GPU deadlock")
260+
Expect(deployment.Spec.Strategy.Type).To(Equal(appsv1.RecreateDeploymentStrategyType))
258261
})
259262

260263
It("should include multi-GPU args for 4 GPU model", func() {
@@ -579,6 +582,9 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
579582
}
580583
}
581584
Expect(hasNvidiaToleration).To(BeTrue())
585+
586+
By("verifying Recreate strategy is set for GPU workloads")
587+
Expect(deployment.Spec.Strategy.Type).To(Equal(appsv1.RecreateDeploymentStrategyType))
582588
})
583589

584590
It("should apply custom node selector from InferenceService spec", func() {
@@ -1840,6 +1846,7 @@ var _ = Describe("constructDeployment additional cases", func() {
18401846
deployment := reconciler.constructDeployment(isvc, model, 1)
18411847
Expect(deployment.Spec.Template.Spec.Tolerations).To(BeEmpty())
18421848
Expect(deployment.Spec.Template.Spec.NodeSelector).To(BeEmpty())
1849+
Expect(deployment.Spec.Strategy.Type).To(Equal(appsv1.DeploymentStrategyType("")))
18431850
})
18441851

18451852
It("should use explicit GPU layers from Model spec", func() {

0 commit comments

Comments
 (0)