Skip to content

Commit 7a4b855

Browse files
authored
feat: add KV cache type configuration and extraArgs escape hatch (#256)
Signed-off-by: Christopher Maher <[email protected]>
1 parent 149c582 commit 7a4b855

File tree

6 files changed

+355
-27
lines changed

6 files changed

+355
-27
lines changed

api/v1alpha1/inferenceservice_types.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,26 @@ type InferenceServiceSpec struct {
8585
// +optional
8686
Jinja *bool `json:"jinja,omitempty"`
8787

88+
// CacheTypeK sets the KV cache quantization type for keys.
89+
// Supported values depend on the llama.cpp build version.
90+
// Maps to llama.cpp --cache-type-k flag. Default: f16 (llama.cpp default).
91+
// +kubebuilder:validation:Enum=f16;f32;q8_0;q4_0;q4_1;q5_0;q5_1;iq4_nl
92+
// +optional
93+
CacheTypeK string `json:"cacheTypeK,omitempty"`
94+
95+
// CacheTypeV sets the KV cache quantization type for values.
96+
// Maps to llama.cpp --cache-type-v flag. Default: f16 (llama.cpp default).
97+
// +kubebuilder:validation:Enum=f16;f32;q8_0;q4_0;q4_1;q5_0;q5_1;iq4_nl
98+
// +optional
99+
CacheTypeV string `json:"cacheTypeV,omitempty"`
100+
101+
// ExtraArgs provides additional command-line arguments passed directly to the
102+
// llama-server process. Use for flags not yet supported as typed CRD fields.
103+
// Arguments are appended after all other configured flags.
104+
// Example: ["--seed", "42", "--batch-size", "2048"]
105+
// +optional
106+
ExtraArgs []string `json:"extraArgs,omitempty"`
107+
88108
// Priority determines scheduling priority for GPU allocation.
89109
// Higher priority services can preempt lower priority ones when GPUs are scarce.
90110
// +kubebuilder:validation:Enum=critical;high;normal;low;batch

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/inference.llmkube.dev_inferenceservices.yaml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,35 @@ spec:
6969
spec:
7070
description: spec defines the desired state of InferenceService
7171
properties:
72+
cacheTypeK:
73+
description: |-
74+
CacheTypeK sets the KV cache quantization type for keys.
75+
Supported values depend on the llama.cpp build version.
76+
Maps to llama.cpp --cache-type-k flag. Default: f16 (llama.cpp default).
77+
enum:
78+
- f16
79+
- f32
80+
- q8_0
81+
- q4_0
82+
- q4_1
83+
- q5_0
84+
- q5_1
85+
- iq4_nl
86+
type: string
87+
cacheTypeV:
88+
description: |-
89+
CacheTypeV sets the KV cache quantization type for values.
90+
Maps to llama.cpp --cache-type-v flag. Default: f16 (llama.cpp default).
91+
enum:
92+
- f16
93+
- f32
94+
- q8_0
95+
- q4_0
96+
- q4_1
97+
- q5_0
98+
- q5_1
99+
- iq4_nl
100+
type: string
72101
contextSize:
73102
description: |-
74103
ContextSize sets the context window size for the llama.cpp server (-c flag).
@@ -102,6 +131,15 @@ spec:
102131
- LoadBalancer
103132
type: string
104133
type: object
134+
extraArgs:
135+
description: |-
136+
ExtraArgs provides additional command-line arguments passed directly to the
137+
llama-server process. Use for flags not yet supported as typed CRD fields.
138+
Arguments are appended after all other configured flags.
139+
Example: ["--seed", "42", "--batch-size", "2048"]
140+
items:
141+
type: string
142+
type: array
105143
flashAttention:
106144
description: |-
107145
FlashAttention enables flash attention for faster prompt processing and reduced

internal/controller/inferenceservice_controller.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -736,6 +736,16 @@ func appendJinjaArgs(args []string, jinja *bool) []string {
736736
return args
737737
}
738738

739+
// appendCacheTypeArgs appends the llama.cpp KV-cache quantization flags
// (--cache-type-k / --cache-type-v) for each type that has been configured.
// An empty value is skipped entirely, letting llama.cpp fall back to its
// own default (f16). The (possibly grown) argument slice is returned.
func appendCacheTypeArgs(args []string, cacheTypeK, cacheTypeV string) []string {
	cacheFlags := []struct {
		flag  string
		value string
	}{
		{"--cache-type-k", cacheTypeK},
		{"--cache-type-v", cacheTypeV},
	}
	for _, cf := range cacheFlags {
		if cf.value == "" {
			continue
		}
		args = append(args, cf.flag, cf.value)
	}
	return args
}
748+
739749
func (r *InferenceServiceReconciler) constructDeployment(
740750
isvc *inferencev1alpha1.InferenceService,
741751
model *inferencev1alpha1.Model,
@@ -800,6 +810,10 @@ func (r *InferenceServiceReconciler) constructDeployment(
800810
args = appendParallelSlotsArgs(args, isvc.Spec.ParallelSlots)
801811
args = appendFlashAttentionArgs(args, isvc.Spec.FlashAttention, gpuCount)
802812
args = appendJinjaArgs(args, isvc.Spec.Jinja)
813+
args = appendCacheTypeArgs(args, isvc.Spec.CacheTypeK, isvc.Spec.CacheTypeV)
814+
if len(isvc.Spec.ExtraArgs) > 0 {
815+
args = append(args, isvc.Spec.ExtraArgs...)
816+
}
803817

804818
// Enable Prometheus metrics endpoint on llama.cpp
805819
args = append(args, "--metrics")

internal/controller/inferenceservice_controller_test.go

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1154,6 +1154,223 @@ var _ = Describe("Context Size Configuration", func() {
11541154
Expect(args).NotTo(ContainElement("--jinja"))
11551155
})
11561156
})
1157+
1158+
Context("when cache type is configured", func() {
1159+
var (
1160+
reconciler *InferenceServiceReconciler
1161+
model *inferencev1alpha1.Model
1162+
)
1163+
1164+
BeforeEach(func() {
1165+
reconciler = &InferenceServiceReconciler{
1166+
ModelCachePath: "/tmp/llmkube/models",
1167+
InitContainerImage: "docker.io/curlimages/curl:8.18.0",
1168+
}
1169+
1170+
model = &inferencev1alpha1.Model{
1171+
ObjectMeta: metav1.ObjectMeta{
1172+
Name: "cache-type-model",
1173+
Namespace: "default",
1174+
},
1175+
Spec: inferencev1alpha1.ModelSpec{
1176+
Source: "https://example.com/model.gguf",
1177+
Hardware: &inferencev1alpha1.HardwareSpec{
1178+
GPU: &inferencev1alpha1.GPUSpec{
1179+
Count: 1,
1180+
Layers: 64,
1181+
},
1182+
},
1183+
},
1184+
Status: inferencev1alpha1.ModelStatus{
1185+
Phase: "Ready",
1186+
CacheKey: "test-cache-key",
1187+
Path: "/tmp/llmkube/models/test-model.gguf",
1188+
},
1189+
}
1190+
})
1191+
1192+
It("should include --cache-type-k flag when cacheTypeK is set", func() {
1193+
replicas := int32(1)
1194+
isvc := &inferencev1alpha1.InferenceService{
1195+
ObjectMeta: metav1.ObjectMeta{
1196+
Name: "cache-k-service",
1197+
Namespace: "default",
1198+
},
1199+
Spec: inferencev1alpha1.InferenceServiceSpec{
1200+
ModelRef: "cache-type-model",
1201+
Replicas: &replicas,
1202+
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
1203+
CacheTypeK: "q4_0",
1204+
Resources: &inferencev1alpha1.InferenceResourceRequirements{
1205+
GPU: 1,
1206+
},
1207+
},
1208+
}
1209+
1210+
deployment := reconciler.constructDeployment(isvc, model, 1)
1211+
1212+
args := deployment.Spec.Template.Spec.Containers[0].Args
1213+
Expect(args).To(ContainElements("--cache-type-k", "q4_0"))
1214+
})
1215+
1216+
It("should include --cache-type-v flag when cacheTypeV is set", func() {
1217+
replicas := int32(1)
1218+
isvc := &inferencev1alpha1.InferenceService{
1219+
ObjectMeta: metav1.ObjectMeta{
1220+
Name: "cache-v-service",
1221+
Namespace: "default",
1222+
},
1223+
Spec: inferencev1alpha1.InferenceServiceSpec{
1224+
ModelRef: "cache-type-model",
1225+
Replicas: &replicas,
1226+
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
1227+
CacheTypeV: "q8_0",
1228+
Resources: &inferencev1alpha1.InferenceResourceRequirements{
1229+
GPU: 1,
1230+
},
1231+
},
1232+
}
1233+
1234+
deployment := reconciler.constructDeployment(isvc, model, 1)
1235+
1236+
args := deployment.Spec.Template.Spec.Containers[0].Args
1237+
Expect(args).To(ContainElements("--cache-type-v", "q8_0"))
1238+
})
1239+
1240+
It("should include both cache type flags when both are set", func() {
1241+
replicas := int32(1)
1242+
isvc := &inferencev1alpha1.InferenceService{
1243+
ObjectMeta: metav1.ObjectMeta{
1244+
Name: "cache-both-service",
1245+
Namespace: "default",
1246+
},
1247+
Spec: inferencev1alpha1.InferenceServiceSpec{
1248+
ModelRef: "cache-type-model",
1249+
Replicas: &replicas,
1250+
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
1251+
CacheTypeK: "q4_0",
1252+
CacheTypeV: "q8_0",
1253+
Resources: &inferencev1alpha1.InferenceResourceRequirements{
1254+
GPU: 1,
1255+
},
1256+
},
1257+
}
1258+
1259+
deployment := reconciler.constructDeployment(isvc, model, 1)
1260+
1261+
args := deployment.Spec.Template.Spec.Containers[0].Args
1262+
Expect(args).To(ContainElements("--cache-type-k", "q4_0"))
1263+
Expect(args).To(ContainElements("--cache-type-v", "q8_0"))
1264+
})
1265+
1266+
It("should NOT include cache type flags when neither is set", func() {
1267+
replicas := int32(1)
1268+
isvc := &inferencev1alpha1.InferenceService{
1269+
ObjectMeta: metav1.ObjectMeta{
1270+
Name: "no-cache-type-service",
1271+
Namespace: "default",
1272+
},
1273+
Spec: inferencev1alpha1.InferenceServiceSpec{
1274+
ModelRef: "cache-type-model",
1275+
Replicas: &replicas,
1276+
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
1277+
Resources: &inferencev1alpha1.InferenceResourceRequirements{
1278+
GPU: 1,
1279+
},
1280+
},
1281+
}
1282+
1283+
deployment := reconciler.constructDeployment(isvc, model, 1)
1284+
1285+
args := deployment.Spec.Template.Spec.Containers[0].Args
1286+
Expect(args).NotTo(ContainElement("--cache-type-k"))
1287+
Expect(args).NotTo(ContainElement("--cache-type-v"))
1288+
})
1289+
})
1290+
1291+
Context("when extraArgs is configured", func() {
1292+
var (
1293+
reconciler *InferenceServiceReconciler
1294+
model *inferencev1alpha1.Model
1295+
)
1296+
1297+
BeforeEach(func() {
1298+
reconciler = &InferenceServiceReconciler{
1299+
ModelCachePath: "/tmp/llmkube/models",
1300+
InitContainerImage: "docker.io/curlimages/curl:8.18.0",
1301+
}
1302+
1303+
model = &inferencev1alpha1.Model{
1304+
ObjectMeta: metav1.ObjectMeta{
1305+
Name: "extra-args-model",
1306+
Namespace: "default",
1307+
},
1308+
Spec: inferencev1alpha1.ModelSpec{
1309+
Source: "https://example.com/model.gguf",
1310+
Hardware: &inferencev1alpha1.HardwareSpec{
1311+
GPU: &inferencev1alpha1.GPUSpec{
1312+
Count: 1,
1313+
Layers: 64,
1314+
},
1315+
},
1316+
},
1317+
Status: inferencev1alpha1.ModelStatus{
1318+
Phase: "Ready",
1319+
CacheKey: "test-cache-key",
1320+
Path: "/tmp/llmkube/models/test-model.gguf",
1321+
},
1322+
}
1323+
})
1324+
1325+
It("should append all extra args in order", func() {
1326+
replicas := int32(1)
1327+
isvc := &inferencev1alpha1.InferenceService{
1328+
ObjectMeta: metav1.ObjectMeta{
1329+
Name: "extra-args-service",
1330+
Namespace: "default",
1331+
},
1332+
Spec: inferencev1alpha1.InferenceServiceSpec{
1333+
ModelRef: "extra-args-model",
1334+
Replicas: &replicas,
1335+
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
1336+
ExtraArgs: []string{"--seed", "42", "--batch-size", "2048"},
1337+
Resources: &inferencev1alpha1.InferenceResourceRequirements{
1338+
GPU: 1,
1339+
},
1340+
},
1341+
}
1342+
1343+
deployment := reconciler.constructDeployment(isvc, model, 1)
1344+
1345+
args := deployment.Spec.Template.Spec.Containers[0].Args
1346+
Expect(args).To(ContainElements("--seed", "42"))
1347+
Expect(args).To(ContainElements("--batch-size", "2048"))
1348+
})
1349+
1350+
It("should NOT append anything extra when extraArgs is empty", func() {
1351+
replicas := int32(1)
1352+
isvc := &inferencev1alpha1.InferenceService{
1353+
ObjectMeta: metav1.ObjectMeta{
1354+
Name: "no-extra-args-service",
1355+
Namespace: "default",
1356+
},
1357+
Spec: inferencev1alpha1.InferenceServiceSpec{
1358+
ModelRef: "extra-args-model",
1359+
Replicas: &replicas,
1360+
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
1361+
Resources: &inferencev1alpha1.InferenceResourceRequirements{
1362+
GPU: 1,
1363+
},
1364+
},
1365+
}
1366+
1367+
deployment := reconciler.constructDeployment(isvc, model, 1)
1368+
1369+
args := deployment.Spec.Template.Spec.Containers[0].Args
1370+
Expect(args).NotTo(ContainElement("--seed"))
1371+
Expect(args).NotTo(ContainElement("--batch-size"))
1372+
})
1373+
})
11571374
})
11581375

11591376
var _ = Describe("Multi-GPU End-to-End Reconciliation", func() {

0 commit comments

Comments
 (0)