Commit 5b6aeff

Found a bug in the codellama vllm model_len logic. (#380)
* Found a bug in the codellama vllm model_len logic. Also, let's just avoid the vLLM error by making sure max_num_batched_tokens >= max_model_len. (Never mind: I realized that if statement will never happen here.)
1 parent 5e4d662 commit 5b6aeff
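The commit message refers to a vLLM constraint: vLLM rejects configurations where max_num_batched_tokens is smaller than max_model_len, since a single sequence of max_model_len tokens must fit within one batch. Below is a minimal sketch of the guard the author considered (and then dropped as unreachable here). The function name `clamp_batched_tokens` is hypothetical; the actual code is not shown in this commit.

```python
def clamp_batched_tokens(max_model_len, max_num_batched_tokens):
    """Raise the batch token budget to at least the model length.

    Hypothetical helper illustrating the guard mentioned in the commit
    message; vLLM errors out when max_num_batched_tokens < max_model_len.
    A max_model_len of None means "use the model's default", so no clamp
    is applied in that case.
    """
    if max_model_len is not None and max_num_batched_tokens < max_model_len:
        return max_model_len
    return max_num_batched_tokens
```

For example, a config of `{"max_model_len": 16384, "max_num_batched_tokens": 4096}` would be clamped to a batch budget of 16384 rather than triggering the vLLM error.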

File tree

1 file changed: +2 −2 lines changed


model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 2 additions & 2 deletions
@@ -174,9 +174,9 @@
     "mammoth-coder": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
     # Based on config here: https://huggingface.co/TIGER-Lab/MAmmoTH-Coder-7B/blob/main/config.json#L12
     # Can also see 13B, 34B there too
-    "code-llama": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
+    "codellama": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
     # Based on config here: https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json#L12
-    # Can also see 13B, 34B there too
+    # Can also see 13B, 34B there too. Note, codellama is one word.
     "llama-2": {"max_model_len": None, "max_num_batched_tokens": 4096},
     "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000},
 }
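The renamed key matters because these per-model defaults are presumably selected by matching the requested model name against the dict keys, so a key of "code-llama" would never match model names like "codellama-7b". A minimal sketch of that kind of prefix lookup follows; the dict values come from the diff, but the function name `get_vllm_params`, the default fallback, and the prefix-match logic are assumptions, since the surrounding code in llm_model_endpoint_use_cases.py is not shown.

```python
# Per-model vLLM defaults, as in the diff after this commit.
_VLLM_MODEL_PARAMS = {
    "mammoth-coder": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
    "codellama": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
    "llama-2": {"max_model_len": None, "max_num_batched_tokens": 4096},
    "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000},
}


def get_vllm_params(model_name: str) -> dict:
    """Return the params whose key prefixes model_name, else a fallback.

    Hypothetical lookup: the real selection logic in the use-case file may
    differ, but a prefix match like this is why the one-word "codellama"
    key fixes the bug while "code-llama" silently never matched.
    """
    for prefix, params in _VLLM_MODEL_PARAMS.items():
        if model_name.startswith(prefix):
            return params
    # Assumed fallback when no key matches.
    return {"max_model_len": None, "max_num_batched_tokens": 4096}
```

With the old "code-llama" key, `get_vllm_params("codellama-7b")` would have fallen through to the fallback and capped batched tokens at 4096 instead of 16384.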

0 commit comments
