Skip to content

Commit 7a4b855

Browse files
authored
feat: add KV cache type configuration and extraArgs escape hatch (#256)
Signed-off-by: Christopher Maher <[email protected]>
1 parent 149c582 commit 7a4b855

File tree

6 files changed

+355
-27
lines changed

6 files changed

+355
-27
lines changed

api/v1alpha1/inferenceservice_types.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,26 @@ type InferenceServiceSpec struct {
8585
// +optional
8686
Jinja *bool `json:"jinja,omitempty"`
8787

88+
// CacheTypeK sets the KV cache quantization type for keys.
89+
// Supported values depend on the llama.cpp build version.
90+
// Maps to llama.cpp --cache-type-k flag. Default: f16 (llama.cpp default).
91+
// +kubebuilder:validation:Enum=f16;f32;q8_0;q4_0;q4_1;q5_0;q5_1;iq4_nl
92+
// +optional
93+
CacheTypeK string `json:"cacheTypeK,omitempty"`
94+
95+
// CacheTypeV sets the KV cache quantization type for values.
96+
// Maps to llama.cpp --cache-type-v flag. Default: f16 (llama.cpp default).
97+
// +kubebuilder:validation:Enum=f16;f32;q8_0;q4_0;q4_1;q5_0;q5_1;iq4_nl
98+
// +optional
99+
CacheTypeV string `json:"cacheTypeV,omitempty"`
100+
101+
// ExtraArgs provides additional command-line arguments passed directly to the
102+
// llama-server process. Use for flags not yet supported as typed CRD fields.
103+
// Arguments are appended after all other configured flags.
104+
// Example: ["--seed", "42", "--batch-size", "2048"]
105+
// +optional
106+
ExtraArgs []string `json:"extraArgs,omitempty"`
107+
88108
// Priority determines scheduling priority for GPU allocation.
89109
// Higher priority services can preempt lower priority ones when GPUs are scarce.
90110
// +kubebuilder:validation:Enum=critical;high;normal;low;batch

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/inference.llmkube.dev_inferenceservices.yaml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,35 @@ spec:
6969
spec:
7070
description: spec defines the desired state of InferenceService
7171
properties:
72+
cacheTypeK:
73+
description: |-
74+
CacheTypeK sets the KV cache quantization type for keys.
75+
Supported values depend on the llama.cpp build version.
76+
Maps to llama.cpp --cache-type-k flag. Default: f16 (llama.cpp default).
77+
enum:
78+
- f16
79+
- f32
80+
- q8_0
81+
- q4_0
82+
- q4_1
83+
- q5_0
84+
- q5_1
85+
- iq4_nl
86+
type: string
87+
cacheTypeV:
88+
description: |-
89+
CacheTypeV sets the KV cache quantization type for values.
90+
Maps to llama.cpp --cache-type-v flag. Default: f16 (llama.cpp default).
91+
enum:
92+
- f16
93+
- f32
94+
- q8_0
95+
- q4_0
96+
- q4_1
97+
- q5_0
98+
- q5_1
99+
- iq4_nl
100+
type: string
72101
contextSize:
73102
description: |-
74103
ContextSize sets the context window size for the llama.cpp server (-c flag).
@@ -102,6 +131,15 @@ spec:
102131
- LoadBalancer
103132
type: string
104133
type: object
134+
extraArgs:
135+
description: |-
136+
ExtraArgs provides additional command-line arguments passed directly to the
137+
llama-server process. Use for flags not yet supported as typed CRD fields.
138+
Arguments are appended after all other configured flags.
139+
Example: ["--seed", "42", "--batch-size", "2048"]
140+
items:
141+
type: string
142+
type: array
105143
flashAttention:
106144
description: |-
107145
FlashAttention enables flash attention for faster prompt processing and reduced

internal/controller/inferenceservice_controller.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -736,6 +736,16 @@ func appendJinjaArgs(args []string, jinja *bool) []string {
736736
return args
737737
}
738738

739+
// appendCacheTypeArgs appends the llama.cpp KV-cache quantization flags
// (--cache-type-k / --cache-type-v) for each type that has been configured.
// An empty value is skipped entirely, letting llama.cpp fall back to its
// own default (f16). The (possibly grown) argument slice is returned.
func appendCacheTypeArgs(args []string, cacheTypeK, cacheTypeV string) []string {
	cacheFlags := []struct {
		flag  string
		value string
	}{
		{"--cache-type-k", cacheTypeK},
		{"--cache-type-v", cacheTypeV},
	}
	for _, cf := range cacheFlags {
		if cf.value == "" {
			continue
		}
		args = append(args, cf.flag, cf.value)
	}
	return args
}
748+
739749
func (r *InferenceServiceReconciler) constructDeployment(
740750
isvc *inferencev1alpha1.InferenceService,
741751
model *inferencev1alpha1.Model,
@@ -800,6 +810,10 @@ func (r *InferenceServiceReconciler) constructDeployment(
800810
args = appendParallelSlotsArgs(args, isvc.Spec.ParallelSlots)
801811
args = appendFlashAttentionArgs(args, isvc.Spec.FlashAttention, gpuCount)
802812
args = appendJinjaArgs(args, isvc.Spec.Jinja)
813+
args = appendCacheTypeArgs(args, isvc.Spec.CacheTypeK, isvc.Spec.CacheTypeV)
814+
if len(isvc.Spec.ExtraArgs) > 0 {
815+
args = append(args, isvc.Spec.ExtraArgs...)
816+
}
803817

804818
// Enable Prometheus metrics endpoint on llama.cpp
805819
args = append(args, "--metrics")

internal/controller/inferenceservice_controller_test.go

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1154,6 +1154,223 @@ var _ = Describe("Context Size Configuration", func() {
11541154
Expect(args).NotTo(ContainElement("--jinja"))
11551155
})
11561156
})
1157+
1158+
Context("when cache type is configured", func() {
1159+
var (
1160+
reconciler *InferenceServiceReconciler
1161+
model *inferencev1alpha1.Model
1162+
)
1163+
1164+
BeforeEach(func() {
1165+
reconciler = &InferenceServiceReconciler{
1166+
ModelCachePath: "/tmp/llmkube/models",
1167+
InitContainerImage: "docker.io/curlimages/curl:8.18.0",
1168+
}
1169+
1170+
model = &inferencev1alpha1.Model{
1171+
ObjectMeta: metav1.ObjectMeta{
1172+
Name: "cache-type-model",
1173+
Namespace: "default",
1174+
},
1175+
Spec: inferencev1alpha1.ModelSpec{
1176+
Source: "https://example.com/model.gguf",
1177+
Hardware: &inferencev1alpha1.HardwareSpec{
1178+
GPU: &inferencev1alpha1.GPUSpec{
1179+
Count: 1,
1180+
Layers: 64,
1181+
},
1182+
},
1183+
},
1184+
Status: inferencev1alpha1.ModelStatus{
1185+
Phase: "Ready",
1186+
CacheKey: "test-cache-key",
1187+
Path: "/tmp/llmkube/models/test-model.gguf",
1188+
},
1189+
}
1190+
})
1191+
1192+
It("should include --cache-type-k flag when cacheTypeK is set", func() {
1193+
replicas := int32(1)
1194+
isvc := &inferencev1alpha1.InferenceService{
1195+
ObjectMeta: metav1.ObjectMeta{
1196+
Name: "cache-k-service",
1197+
Namespace: "default",
1198+
},
1199+
Spec: inferencev1alpha1.InferenceServiceSpec{
1200+
ModelRef: "cache-type-model",
1201+
Replicas: &replicas,
1202+
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
1203+
CacheTypeK: "q4_0",
1204+
Resources: &inferencev1alpha1.InferenceResourceRequirements{
1205+
GPU: 1,
1206+
},
1207+
},
1208+
}
1209+
1210+
deployment := reconciler.constructDeployment(isvc, model, 1)
1211+
1212+
args := deployment.Spec.Template.Spec.Containers[0].Args
1213+
Expect(args).To(ContainElements("--cache-type-k", "q4_0"))
1214+
})
1215+
1216+
It("should include --cache-type-v flag when cacheTypeV is set", func() {
1217+
replicas := int32(1)
1218+
isvc := &inferencev1alpha1.InferenceService{
1219+
ObjectMeta: metav1.ObjectMeta{
1220+
Name: "cache-v-service",
1221+
Namespace: "default",
1222+
},
1223+
Spec: inferencev1alpha1.InferenceServiceSpec{
1224+
ModelRef: "cache-type-model",
1225+
Replicas: &replicas,
1226+
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
1227+
CacheTypeV: "q8_0",
1228+
Resources: &inferencev1alpha1.InferenceResourceRequirements{
1229+
GPU: 1,
1230+
},
1231+
},
1232+
}
1233+
1234+
deployment := reconciler.constructDeployment(isvc, model, 1)
1235+
1236+
args := deployment.Spec.Template.Spec.Containers[0].Args
1237+
Expect(args).To(ContainElements("--cache-type-v", "q8_0"))
1238+
})
1239+
1240+
It("should include both cache type flags when both are set", func() {
1241+
replicas := int32(1)
1242+
isvc := &inferencev1alpha1.InferenceService{
1243+
ObjectMeta: metav1.ObjectMeta{
1244+
Name: "cache-both-service",
1245+
Namespace: "default",
1246+
},
1247+
Spec: inferencev1alpha1.InferenceServiceSpec{
1248+
ModelRef: "cache-type-model",
1249+
Replicas: &replicas,
1250+
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
1251+
CacheTypeK: "q4_0",
1252+
CacheTypeV: "q8_0",
1253+
Resources: &inferencev1alpha1.InferenceResourceRequirements{
1254+
GPU: 1,
1255+
},
1256+
},
1257+
}
1258+
1259+
deployment := reconciler.constructDeployment(isvc, model, 1)
1260+
1261+
args := deployment.Spec.Template.Spec.Containers[0].Args
1262+
Expect(args).To(ContainElements("--cache-type-k", "q4_0"))
1263+
Expect(args).To(ContainElements("--cache-type-v", "q8_0"))
1264+
})
1265+
1266+
It("should NOT include cache type flags when neither is set", func() {
1267+
replicas := int32(1)
1268+
isvc := &inferencev1alpha1.InferenceService{
1269+
ObjectMeta: metav1.ObjectMeta{
1270+
Name: "no-cache-type-service",
1271+
Namespace: "default",
1272+
},
1273+
Spec: inferencev1alpha1.InferenceServiceSpec{
1274+
ModelRef: "cache-type-model",
1275+
Replicas: &replicas,
1276+
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
1277+
Resources: &inferencev1alpha1.InferenceResourceRequirements{
1278+
GPU: 1,
1279+
},
1280+
},
1281+
}
1282+
1283+
deployment := reconciler.constructDeployment(isvc, model, 1)
1284+
1285+
args := deployment.Spec.Template.Spec.Containers[0].Args
1286+
Expect(args).NotTo(ContainElement("--cache-type-k"))
1287+
Expect(args).NotTo(ContainElement("--cache-type-v"))
1288+
})
1289+
})
1290+
1291+
Context("when extraArgs is configured", func() {
1292+
var (
1293+
reconciler *InferenceServiceReconciler
1294+
model *inferencev1alpha1.Model
1295+
)
1296+
1297+
BeforeEach(func() {
1298+
reconciler = &InferenceServiceReconciler{
1299+
ModelCachePath: "/tmp/llmkube/models",
1300+
InitContainerImage: "docker.io/curlimages/curl:8.18.0",
1301+
}
1302+
1303+
model = &inferencev1alpha1.Model{
1304+
ObjectMeta: metav1.ObjectMeta{
1305+
Name: "extra-args-model",
1306+
Namespace: "default",
1307+
},
1308+
Spec: inferencev1alpha1.ModelSpec{
1309+
Source: "https://example.com/model.gguf",
1310+
Hardware: &inferencev1alpha1.HardwareSpec{
1311+
GPU: &inferencev1alpha1.GPUSpec{
1312+
Count: 1,
1313+
Layers: 64,
1314+
},
1315+
},
1316+
},
1317+
Status: inferencev1alpha1.ModelStatus{
1318+
Phase: "Ready",
1319+
CacheKey: "test-cache-key",
1320+
Path: "/tmp/llmkube/models/test-model.gguf",
1321+
},
1322+
}
1323+
})
1324+
1325+
It("should append all extra args in order", func() {
1326+
replicas := int32(1)
1327+
isvc := &inferencev1alpha1.InferenceService{
1328+
ObjectMeta: metav1.ObjectMeta{
1329+
Name: "extra-args-service",
1330+
Namespace: "default",
1331+
},
1332+
Spec: inferencev1alpha1.InferenceServiceSpec{
1333+
ModelRef: "extra-args-model",
1334+
Replicas: &replicas,
1335+
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
1336+
ExtraArgs: []string{"--seed", "42", "--batch-size", "2048"},
1337+
Resources: &inferencev1alpha1.InferenceResourceRequirements{
1338+
GPU: 1,
1339+
},
1340+
},
1341+
}
1342+
1343+
deployment := reconciler.constructDeployment(isvc, model, 1)
1344+
1345+
args := deployment.Spec.Template.Spec.Containers[0].Args
1346+
Expect(args).To(ContainElements("--seed", "42"))
1347+
Expect(args).To(ContainElements("--batch-size", "2048"))
1348+
})
1349+
1350+
It("should NOT append anything extra when extraArgs is empty", func() {
1351+
replicas := int32(1)
1352+
isvc := &inferencev1alpha1.InferenceService{
1353+
ObjectMeta: metav1.ObjectMeta{
1354+
Name: "no-extra-args-service",
1355+
Namespace: "default",
1356+
},
1357+
Spec: inferencev1alpha1.InferenceServiceSpec{
1358+
ModelRef: "extra-args-model",
1359+
Replicas: &replicas,
1360+
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
1361+
Resources: &inferencev1alpha1.InferenceResourceRequirements{
1362+
GPU: 1,
1363+
},
1364+
},
1365+
}
1366+
1367+
deployment := reconciler.constructDeployment(isvc, model, 1)
1368+
1369+
args := deployment.Spec.Template.Spec.Containers[0].Args
1370+
Expect(args).NotTo(ContainElement("--seed"))
1371+
Expect(args).NotTo(ContainElement("--batch-size"))
1372+
})
1373+
})
11571374
})
11581375

11591376
var _ = Describe("Multi-GPU End-to-End Reconciliation", func() {

0 commit comments

Comments
 (0)