Skip to content

Commit 8ae42f3

Browse files
Google APIs authored and copybara-github committed
feat: Add speculative decoding spec to DeployedModel proto
PiperOrigin-RevId: 719114015
1 parent 08e6c4e commit 8ae42f3

1 file changed

Lines changed: 46 additions & 4 deletions

File tree

google/cloud/aiplatform/v1/endpoint.proto

Lines changed: 46 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -321,6 +321,10 @@ message DeployedModel {
321321
// System labels to apply to Model Garden deployments.
322322
// System labels are managed by Google for internal use only.
323323
map<string, string> system_labels = 28;
324+
325+
// Optional. Spec for configuring speculative decoding.
326+
SpeculativeDecodingSpec speculative_decoding_spec = 30
327+
[(google.api.field_behavior) = OPTIONAL];
324328
}
325329

326330
// PrivateEndpoints proto is used to provide paths for users to send
@@ -361,14 +365,52 @@ message PredictRequestResponseLoggingConfig {
361365
BigQueryDestination bigquery_destination = 3;
362366
}
363367

368+
// Configurations (e.g. inference timeout) that are applied on your endpoints.
369+
message ClientConnectionConfig {
370+
// Customizable online prediction request timeout.
371+
google.protobuf.Duration inference_timeout = 1;
372+
}
373+
364374
// Configuration for faster model deployment.
365375
message FasterDeploymentConfig {
366376
// If true, enable fast tryout feature for this deployed model.
367377
bool fast_tryout_enabled = 2;
368378
}
369379

370-
// Configurations (e.g. inference timeout) that are applied on your endpoints.
371-
message ClientConnectionConfig {
372-
// Customizable online prediction request timeout.
373-
google.protobuf.Duration inference_timeout = 1;
380+
// Configuration for Speculative Decoding.
381+
message SpeculativeDecodingSpec {
382+
// Draft model speculation works by using a smaller draft model to generate
383+
// candidate tokens for speculative decoding.
384+
message DraftModelSpeculation {
385+
// Required. The resource name of the draft model.
386+
string draft_model = 1 [
387+
(google.api.field_behavior) = REQUIRED,
388+
(google.api.resource_reference) = {
389+
type: "aiplatform.googleapis.com/Model"
390+
}
391+
];
392+
}
393+
394+
// N-Gram speculation works by trying to find matching tokens in the
395+
// previous prompt sequence and using those as speculation for generating
396+
// new tokens.
397+
message NgramSpeculation {
398+
// The number of most recent input tokens (N) used as the n-gram to search/match
399+
// against the previous prompt sequence.
400+
// This is equal to the N in N-Gram.
401+
// The default value is 3 if not specified.
402+
int32 ngram_size = 1;
403+
}
404+
405+
// The type of speculation method to use.
406+
oneof speculation {
407+
// Draft model speculation.
408+
DraftModelSpeculation draft_model_speculation = 2;
409+
410+
// N-Gram speculation.
411+
NgramSpeculation ngram_speculation = 3;
412+
}
413+
414+
// The number of speculative tokens to generate at each step.
415+
int32 speculative_token_count = 1;
374416
}

0 commit comments

Comments
 (0)