@@ -255,6 +255,28 @@ message MultiSpeakerMarkup {
255255 repeated Turn turns = 1 [(google.api.field_behavior ) = REQUIRED ];
256256}
257257
// NOTE(review): this message is spelled "Multispeaker" while the sibling
// messages MultiSpeakerVoiceConfig and MultiSpeakerMarkup use "MultiSpeaker".
// Confirm the casing is intentional; renaming after release breaks generated
// code.

258+ // Configuration for a single speaker in a Gemini TTS multi-speaker setup.
259+ // Enables dialogue between two speakers.
260+ message MultispeakerPrebuiltVoice {
261+ // Required. The speaker alias of the voice. This is the user-chosen speaker
262+ // name that is used in the multi-speaker text input, such as "Speaker1".
263+ string speaker_alias = 1 [(google.api.field_behavior ) = REQUIRED ];
264+
265+ // Required. The speaker ID of the voice. See
266+ // https://cloud.google.com/text-to-speech/docs/gemini-tts#voice_options
267+ // for available values.
268+ string speaker_id = 2 [(google.api.field_behavior ) = REQUIRED ];
269+ }
270+
271+ // Configuration for a multi-speaker text-to-speech setup. Enables the use of
272+ // two distinct voices in a single synthesis request.
273+ message MultiSpeakerVoiceConfig {
// NOTE(review): the only field visible in this message uses number 2.
// Confirm that number 1 is intentionally skipped, and reserve it if it was
// ever used.

274+ // Required. A list of configurations for the voices of the speakers. Exactly
275+ // two speaker voice configurations must be provided.
276+ repeated MultispeakerPrebuiltVoice speaker_voice_configs = 2
277+ [(google.api.field_behavior ) = REQUIRED ];
278+ }
279+
258280// Contains text input to be synthesized. Either `text` or `ssml` must be
259281// supplied. Supplying both or neither returns
260282// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. The
@@ -341,6 +363,12 @@ message VoiceSelectionParams {
341363 // Optional. The name of the model. If set, the service will choose the model
342364 // matching the specified configuration.
343365 string model_name = 6 [(google.api.field_behavior ) = OPTIONAL ];
366+
367+ // Optional. The configuration for a Gemini multi-speaker text-to-speech
368+ // setup. Enables the use of two distinct voices in a single synthesis
369+ // request. The MultiSpeakerVoiceConfig message requires exactly two speaker
// voice configurations.
370+ MultiSpeakerVoiceConfig multi_speaker_voice_config = 7
371+ [(google.api.field_behavior ) = OPTIONAL ];
344372}
345373
346374// Description of audio data to be synthesized.
@@ -498,6 +526,10 @@ message StreamingSynthesisInput {
498526 // Markup for HD voices specifically. This field may not be used with any
499527 // other voices.
500528 string markup = 5 ;
529+
530+ // Multi-speaker markup for Gemini TTS. This field may not be used with any
531+ // other voices.
532+ MultiSpeakerMarkup multi_speaker_markup = 7 ;
501533 }
502534
503535 // This is system instruction supported only for controllable voice models.
0 commit comments