@@ -321,6 +321,10 @@ message DeployedModel {
321321 // System labels to apply to Model Garden deployments.
322322 // System labels are managed by Google for internal use only.
323323 map <string , string > system_labels = 28 ;
324+
325+ // Optional. Spec for configuring speculative decoding.
326+ SpeculativeDecodingSpec speculative_decoding_spec = 30
327+ [(google.api.field_behavior ) = OPTIONAL ];
324328}
325329
326330// PrivateEndpoints proto is used to provide paths for users to send
@@ -361,14 +365,52 @@ message PredictRequestResponseLoggingConfig {
361365 BigQueryDestination bigquery_destination = 3 ;
362366}
363367
// Endpoint-level client connection settings (e.g. inference timeout) that
// are applied on your endpoints.
message ClientConnectionConfig {
  // Customizable online prediction request timeout.
  google.protobuf.Duration inference_timeout = 1;
}
373+
// Configuration for faster model deployment.
message FasterDeploymentConfig {
  // If true, enable fast tryout feature for this deployed model.
  // NOTE(review): field number 1 is skipped — presumably reserved or removed
  // upstream; confirm against the full file before reusing it.
  bool fast_tryout_enabled = 2;
}
369379
370- // Configurations (e.g. inference timeout) that are applied on your endpoints.
371- message ClientConnectionConfig {
372- // Customizable online prediction request timeout.
373- google.protobuf.Duration inference_timeout = 1 ;
// Configuration for Speculative Decoding.
message SpeculativeDecodingSpec {
  // Draft model speculation works by using the smaller model to generate
  // candidate tokens for speculative decoding.
  message DraftModelSpeculation {
    // Required. The resource name of the draft model.
    string draft_model = 1 [
      (google.api.field_behavior) = REQUIRED,
      (google.api.resource_reference) = {
        type: "aiplatform.googleapis.com/Model"
      }
    ];
  }

  // N-Gram speculation works by trying to find matching tokens in the
  // previous prompt sequence and use those as speculation for generating
  // new tokens.
  message NgramSpeculation {
    // The number of last N input tokens used as ngram to search/match
    // against the previous prompt sequence.
    // This is equal to the N in N-Gram.
    // The default value is 3 if not specified.
    int32 ngram_size = 1;
  }

  // The type of speculation method to use.
  oneof speculation {
    // Draft model speculation.
    DraftModelSpeculation draft_model_speculation = 2;

    // N-Gram speculation.
    NgramSpeculation ngram_speculation = 3;
  }

  // The number of speculative tokens to generate at each step.
  // NOTE(review): no default is documented here, unlike ngram_size — confirm
  // the service-side default with the API owners.
  // (Field 1 intentionally follows the oneof members 2-3; do not renumber —
  // field numbers are the wire contract.)
  int32 speculative_token_count = 1;
}
0 commit comments