@@ -1154,6 +1154,223 @@ var _ = Describe("Context Size Configuration", func() {
11541154 Expect (args ).NotTo (ContainElement ("--jinja" ))
11551155 })
11561156 })
1157+
1158+ Context ("when cache type is configured" , func () {
1159+ var (
1160+ reconciler * InferenceServiceReconciler
1161+ model * inferencev1alpha1.Model
1162+ )
1163+
1164+ BeforeEach (func () {
1165+ reconciler = & InferenceServiceReconciler {
1166+ ModelCachePath : "/tmp/llmkube/models" ,
1167+ InitContainerImage : "docker.io/curlimages/curl:8.18.0" ,
1168+ }
1169+
1170+ model = & inferencev1alpha1.Model {
1171+ ObjectMeta : metav1.ObjectMeta {
1172+ Name : "cache-type-model" ,
1173+ Namespace : "default" ,
1174+ },
1175+ Spec : inferencev1alpha1.ModelSpec {
1176+ Source : "https://example.com/model.gguf" ,
1177+ Hardware : & inferencev1alpha1.HardwareSpec {
1178+ GPU : & inferencev1alpha1.GPUSpec {
1179+ Count : 1 ,
1180+ Layers : 64 ,
1181+ },
1182+ },
1183+ },
1184+ Status : inferencev1alpha1.ModelStatus {
1185+ Phase : "Ready" ,
1186+ CacheKey : "test-cache-key" ,
1187+ Path : "/tmp/llmkube/models/test-model.gguf" ,
1188+ },
1189+ }
1190+ })
1191+
1192+ It ("should include --cache-type-k flag when cacheTypeK is set" , func () {
1193+ replicas := int32 (1 )
1194+ isvc := & inferencev1alpha1.InferenceService {
1195+ ObjectMeta : metav1.ObjectMeta {
1196+ Name : "cache-k-service" ,
1197+ Namespace : "default" ,
1198+ },
1199+ Spec : inferencev1alpha1.InferenceServiceSpec {
1200+ ModelRef : "cache-type-model" ,
1201+ Replicas : & replicas ,
1202+ Image : "ghcr.io/ggml-org/llama.cpp:server-cuda" ,
1203+ CacheTypeK : "q4_0" ,
1204+ Resources : & inferencev1alpha1.InferenceResourceRequirements {
1205+ GPU : 1 ,
1206+ },
1207+ },
1208+ }
1209+
1210+ deployment := reconciler .constructDeployment (isvc , model , 1 )
1211+
1212+ args := deployment .Spec .Template .Spec .Containers [0 ].Args
1213+ Expect (args ).To (ContainElements ("--cache-type-k" , "q4_0" ))
1214+ })
1215+
1216+ It ("should include --cache-type-v flag when cacheTypeV is set" , func () {
1217+ replicas := int32 (1 )
1218+ isvc := & inferencev1alpha1.InferenceService {
1219+ ObjectMeta : metav1.ObjectMeta {
1220+ Name : "cache-v-service" ,
1221+ Namespace : "default" ,
1222+ },
1223+ Spec : inferencev1alpha1.InferenceServiceSpec {
1224+ ModelRef : "cache-type-model" ,
1225+ Replicas : & replicas ,
1226+ Image : "ghcr.io/ggml-org/llama.cpp:server-cuda" ,
1227+ CacheTypeV : "q8_0" ,
1228+ Resources : & inferencev1alpha1.InferenceResourceRequirements {
1229+ GPU : 1 ,
1230+ },
1231+ },
1232+ }
1233+
1234+ deployment := reconciler .constructDeployment (isvc , model , 1 )
1235+
1236+ args := deployment .Spec .Template .Spec .Containers [0 ].Args
1237+ Expect (args ).To (ContainElements ("--cache-type-v" , "q8_0" ))
1238+ })
1239+
1240+ It ("should include both cache type flags when both are set" , func () {
1241+ replicas := int32 (1 )
1242+ isvc := & inferencev1alpha1.InferenceService {
1243+ ObjectMeta : metav1.ObjectMeta {
1244+ Name : "cache-both-service" ,
1245+ Namespace : "default" ,
1246+ },
1247+ Spec : inferencev1alpha1.InferenceServiceSpec {
1248+ ModelRef : "cache-type-model" ,
1249+ Replicas : & replicas ,
1250+ Image : "ghcr.io/ggml-org/llama.cpp:server-cuda" ,
1251+ CacheTypeK : "q4_0" ,
1252+ CacheTypeV : "q8_0" ,
1253+ Resources : & inferencev1alpha1.InferenceResourceRequirements {
1254+ GPU : 1 ,
1255+ },
1256+ },
1257+ }
1258+
1259+ deployment := reconciler .constructDeployment (isvc , model , 1 )
1260+
1261+ args := deployment .Spec .Template .Spec .Containers [0 ].Args
1262+ Expect (args ).To (ContainElements ("--cache-type-k" , "q4_0" ))
1263+ Expect (args ).To (ContainElements ("--cache-type-v" , "q8_0" ))
1264+ })
1265+
1266+ It ("should NOT include cache type flags when neither is set" , func () {
1267+ replicas := int32 (1 )
1268+ isvc := & inferencev1alpha1.InferenceService {
1269+ ObjectMeta : metav1.ObjectMeta {
1270+ Name : "no-cache-type-service" ,
1271+ Namespace : "default" ,
1272+ },
1273+ Spec : inferencev1alpha1.InferenceServiceSpec {
1274+ ModelRef : "cache-type-model" ,
1275+ Replicas : & replicas ,
1276+ Image : "ghcr.io/ggml-org/llama.cpp:server-cuda" ,
1277+ Resources : & inferencev1alpha1.InferenceResourceRequirements {
1278+ GPU : 1 ,
1279+ },
1280+ },
1281+ }
1282+
1283+ deployment := reconciler .constructDeployment (isvc , model , 1 )
1284+
1285+ args := deployment .Spec .Template .Spec .Containers [0 ].Args
1286+ Expect (args ).NotTo (ContainElement ("--cache-type-k" ))
1287+ Expect (args ).NotTo (ContainElement ("--cache-type-v" ))
1288+ })
1289+ })
1290+
1291+ Context ("when extraArgs is configured" , func () {
1292+ var (
1293+ reconciler * InferenceServiceReconciler
1294+ model * inferencev1alpha1.Model
1295+ )
1296+
1297+ BeforeEach (func () {
1298+ reconciler = & InferenceServiceReconciler {
1299+ ModelCachePath : "/tmp/llmkube/models" ,
1300+ InitContainerImage : "docker.io/curlimages/curl:8.18.0" ,
1301+ }
1302+
1303+ model = & inferencev1alpha1.Model {
1304+ ObjectMeta : metav1.ObjectMeta {
1305+ Name : "extra-args-model" ,
1306+ Namespace : "default" ,
1307+ },
1308+ Spec : inferencev1alpha1.ModelSpec {
1309+ Source : "https://example.com/model.gguf" ,
1310+ Hardware : & inferencev1alpha1.HardwareSpec {
1311+ GPU : & inferencev1alpha1.GPUSpec {
1312+ Count : 1 ,
1313+ Layers : 64 ,
1314+ },
1315+ },
1316+ },
1317+ Status : inferencev1alpha1.ModelStatus {
1318+ Phase : "Ready" ,
1319+ CacheKey : "test-cache-key" ,
1320+ Path : "/tmp/llmkube/models/test-model.gguf" ,
1321+ },
1322+ }
1323+ })
1324+
1325+ It ("should append all extra args in order" , func () {
1326+ replicas := int32 (1 )
1327+ isvc := & inferencev1alpha1.InferenceService {
1328+ ObjectMeta : metav1.ObjectMeta {
1329+ Name : "extra-args-service" ,
1330+ Namespace : "default" ,
1331+ },
1332+ Spec : inferencev1alpha1.InferenceServiceSpec {
1333+ ModelRef : "extra-args-model" ,
1334+ Replicas : & replicas ,
1335+ Image : "ghcr.io/ggml-org/llama.cpp:server-cuda" ,
1336+ ExtraArgs : []string {"--seed" , "42" , "--batch-size" , "2048" },
1337+ Resources : & inferencev1alpha1.InferenceResourceRequirements {
1338+ GPU : 1 ,
1339+ },
1340+ },
1341+ }
1342+
1343+ deployment := reconciler .constructDeployment (isvc , model , 1 )
1344+
1345+ args := deployment .Spec .Template .Spec .Containers [0 ].Args
1346+ Expect (args ).To (ContainElements ("--seed" , "42" ))
1347+ Expect (args ).To (ContainElements ("--batch-size" , "2048" ))
1348+ })
1349+
1350+ It ("should NOT append anything extra when extraArgs is empty" , func () {
1351+ replicas := int32 (1 )
1352+ isvc := & inferencev1alpha1.InferenceService {
1353+ ObjectMeta : metav1.ObjectMeta {
1354+ Name : "no-extra-args-service" ,
1355+ Namespace : "default" ,
1356+ },
1357+ Spec : inferencev1alpha1.InferenceServiceSpec {
1358+ ModelRef : "extra-args-model" ,
1359+ Replicas : & replicas ,
1360+ Image : "ghcr.io/ggml-org/llama.cpp:server-cuda" ,
1361+ Resources : & inferencev1alpha1.InferenceResourceRequirements {
1362+ GPU : 1 ,
1363+ },
1364+ },
1365+ }
1366+
1367+ deployment := reconciler .constructDeployment (isvc , model , 1 )
1368+
1369+ args := deployment .Spec .Template .Spec .Containers [0 ].Args
1370+ Expect (args ).NotTo (ContainElement ("--seed" ))
1371+ Expect (args ).NotTo (ContainElement ("--batch-size" ))
1372+ })
1373+ })
11571374})
11581375
11591376var _ = Describe ("Multi-GPU End-to-End Reconciliation" , func () {
0 commit comments