feat: Add speculative decoding spec to DeployedModel proto

Google APIs · copybara-github · commit 8ae42f34e12c · 2025-01-23T19:56:53.000-08:00
PiperOrigin-RevId: 719114015
diff --git a/google/cloud/aiplatform/v1/endpoint.proto b/google/cloud/aiplatform/v1/endpoint.proto
@@ -321,6 +321,10 @@ message DeployedModel {
   // System labels to apply to Model Garden deployments.
   // System labels are managed by Google for internal use only.
   map<string, string> system_labels = 28;
+
+  // Optional. Spec for configuring speculative decoding.
+  SpeculativeDecodingSpec speculative_decoding_spec = 30
+      [(google.api.field_behavior) = OPTIONAL];
 }
 
 // PrivateEndpoints proto is used to provide paths for users to send
@@ -361,14 +365,52 @@ message PredictRequestResponseLoggingConfig {
   BigQueryDestination bigquery_destination = 3;
 }
 
+// Configurations (e.g. inference timeout) that are applied on your endpoints.
+message ClientConnectionConfig {
+  // Customizable online prediction request timeout.
+  google.protobuf.Duration inference_timeout = 1;
+}
+
 // Configuration for faster model deployment.
 message FasterDeploymentConfig {
   // If true, enable fast tryout feature for this deployed model.
   bool fast_tryout_enabled = 2;
 }
 
-// Configurations (e.g. inference timeout) that are applied on your endpoints.
-message ClientConnectionConfig {
-  // Customizable online prediction request timeout.
-  google.protobuf.Duration inference_timeout = 1;
+// Configuration for Speculative Decoding.
+message SpeculativeDecodingSpec {
+  // Draft model speculation works by using a smaller draft model to generate
+  // candidate tokens for speculative decoding.
+  message DraftModelSpeculation {
+    // Required. The resource name of the draft model.
+    string draft_model = 1 [
+      (google.api.field_behavior) = REQUIRED,
+      (google.api.resource_reference) = {
+        type: "aiplatform.googleapis.com/Model"
+      }
+    ];
+  }
+
+  // N-Gram speculation works by trying to find matching tokens in the
+  // previous prompt sequence and using those as speculation for generating
+  // new tokens.
+  message NgramSpeculation {
+    // The number of most recent input tokens used as the n-gram to
+    // search/match against the previous prompt sequence.
+    // This is equal to the N in N-Gram.
+    // The default value is 3 if not specified.
+    int32 ngram_size = 1;
+  }
+
+  // The type of speculation method to use.
+  oneof speculation {
+    // Draft model speculation.
+    DraftModelSpeculation draft_model_speculation = 2;
+
+    // N-Gram speculation.
+    NgramSpeculation ngram_speculation = 3;
+  }
+
+  // The number of speculative tokens to generate at each step.
+  int32 speculative_token_count = 1;
 }