feat: add Gemini TTS Multispeaker API fields

Google APIs · copybara-github · commit c82133531c78 · 2025-09-23T09:32:56.000-07:00
PiperOrigin-RevId: 810470893
diff --git a/google/cloud/texttospeech/v1beta1/cloud_tts.proto b/google/cloud/texttospeech/v1beta1/cloud_tts.proto
@@ -270,6 +270,28 @@ message MultiSpeakerMarkup {
   repeated Turn turns = 1 [(google.api.field_behavior) = REQUIRED];
 }
 
+// Configuration for a single speaker in a Gemini TTS multi-speaker setup.
+// Maps a user-chosen speaker alias to the speaker ID of its voice.
+message MultispeakerPrebuiltVoice {
+  // Required. The speaker alias of the voice. This is the user-chosen speaker
+  // name that is used in the multi-speaker text input, such as "Speaker1".
+  string speaker_alias = 1 [(google.api.field_behavior) = REQUIRED];
+
+  // Required. The speaker ID of the voice. See
+  // https://cloud.google.com/text-to-speech/docs/gemini-tts#voice_options
+  // for available values.
+  string speaker_id = 2 [(google.api.field_behavior) = REQUIRED];
+}
+
+// Configuration for a multi-speaker text-to-speech setup. Enables the use of
+// two distinct voices in a single synthesis request.
+message MultiSpeakerVoiceConfig {
+  // Required. A list of configurations for the voices of the speakers. Exactly
+  // two speaker voice configurations must be provided.
+  repeated MultispeakerPrebuiltVoice speaker_voice_configs = 2
+      [(google.api.field_behavior) = REQUIRED];
+}
+
 // Contains text input to be synthesized. Either `text` or `ssml` must be
 // supplied. Supplying both or neither returns
 // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. The
@@ -356,6 +378,12 @@ message VoiceSelectionParams {
   // Optional. The name of the model. If set, the service will choose the model
   // matching the specified configuration.
   string model_name = 6 [(google.api.field_behavior) = OPTIONAL];
+
+  // Optional. The configuration for a Gemini multi-speaker text-to-speech
+  // setup. Enables the use of two distinct voices in a single synthesis
+  // request.
+  MultiSpeakerVoiceConfig multi_speaker_voice_config = 7
+      [(google.api.field_behavior) = OPTIONAL];
 }
 
 // Description of audio data to be synthesized.
@@ -530,6 +558,10 @@ message StreamingSynthesisInput {
     // Markup for HD voices specifically. This field may not be used with any
     // other voices.
     string markup = 5;
+
+    // Multi-speaker markup for Gemini TTS. This field may not
+    // be used with any other voices.
+    MultiSpeakerMarkup multi_speaker_markup = 7;
   }
 
   // This is system instruction supported only for controllable voice models.
diff --git a/google/cloud/texttospeech/v1beta1/texttospeech_v1beta1.yaml b/google/cloud/texttospeech/v1beta1/texttospeech_v1beta1.yaml
@@ -15,10 +15,6 @@ documentation:
   summary: |-
     Synthesizes natural-sounding speech by applying powerful neural network
     models.
-  overview: |-
-    # Introduction
-
-    Google Cloud Text-to-Speech API provides speech synthesis as a service.
 
 http:
   rules: