File tree Expand file tree Collapse file tree 2 files changed +7
-0
lines changed
Expand file tree Collapse file tree 2 files changed +7
-0
lines changed Original file line number Diff line number Diff line change @@ -752,6 +752,7 @@ def __init__(
752752 mul_mat_q : bool = True ,
753753 logits_all : bool = False ,
754754 embedding : bool = False ,
755+ offload_kqv : bool = False ,
755756 # Sampling Params
756757 last_n_tokens_size : int = 64 ,
757758 # LoRA Params
@@ -817,6 +818,7 @@ def __init__(
817818 yarn_orig_ctx: YaRN original context size
818819 logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
819820 embedding: Embedding mode only.
821+ offload_kqv: Offload K, Q, V to the GPU.
820822 last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
821823 lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
822824 lora_path: Path to a LoRA file to apply to the model.
@@ -903,6 +905,7 @@ def __init__(
903905 self .context_params .mul_mat_q = mul_mat_q
904906 self .context_params .logits_all = logits_all
905907 self .context_params .embedding = embedding
908+ self .context_params .offload_kqv = offload_kqv
906909
907910 # Sampling Params
908911 self .last_n_tokens_size = last_n_tokens_size
Original file line number Diff line number Diff line change @@ -100,6 +100,9 @@ class Settings(BaseSettings):
100100 )
101101 logits_all : bool = Field (default = True , description = "Whether to return logits." )
102102 embedding : bool = Field (default = True , description = "Whether to use embeddings." )
103+ offload_kqv : bool = Field (
104+ default = False , description = "Whether to offload kqv to the GPU."
105+ )
103106 # Sampling Params
104107 last_n_tokens_size : int = Field (
105108 default = 64 ,
@@ -409,6 +412,7 @@ def create_app(settings: Optional[Settings] = None):
409412 mul_mat_q = settings .mul_mat_q ,
410413 logits_all = settings .logits_all ,
411414 embedding = settings .embedding ,
415+ offload_kqv = settings .offload_kqv ,
412416 # Sampling Params
413417 last_n_tokens_size = settings .last_n_tokens_size ,
414418 # LoRA Params
You can’t perform that action at this time.
0 commit comments