@@ -270,6 +270,28 @@ message MultiSpeakerMarkup {
270270 repeated Turn turns = 1 [(google.api.field_behavior ) = REQUIRED ];
271271}
272272
273+ // Configuration for a single speaker in a Gemini TTS multi-speaker setup.
274+ // Two such configurations together enable dialogue between two speakers.
275+ message MultispeakerPrebuiltVoice {
276+ // Required. The speaker alias of the voice. This is the user-chosen speaker
277+ // name that is used in the multi-speaker text input, such as "Speaker1".
278+ string speaker_alias = 1 [(google.api.field_behavior ) = REQUIRED ];
279+
280+ // Required. The speaker ID of the voice. See
281+ // https://cloud.google.com/text-to-speech/docs/gemini-tts#voice_options
282+ // for available values.
283+ string speaker_id = 2 [(google.api.field_behavior ) = REQUIRED ];
284+ }
285+
286+ // Configuration for a multi-speaker text-to-speech setup. Enables the use of
287+ // two distinct voices in a single synthesis request.
288+ message MultiSpeakerVoiceConfig {
289+ // Required. A list of configurations for the voices of the speakers. Exactly
290+ // two speaker voice configurations must be provided.
291+ repeated MultispeakerPrebuiltVoice speaker_voice_configs = 2
292+ [(google.api.field_behavior ) = REQUIRED ];
293+ }
294+
273295// Contains text input to be synthesized. Either `text` or `ssml` must be
274296// supplied. Supplying both or neither returns
275297// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. The
@@ -356,6 +378,12 @@ message VoiceSelectionParams {
356378 // Optional. The name of the model. If set, the service will choose the model
357379 // matching the specified configuration.
358380 string model_name = 6 [(google.api.field_behavior ) = OPTIONAL ];
381+
382+ // Optional. The configuration for a Gemini multi-speaker text-to-speech
383+ // setup. Enables the use of two distinct voices in a single synthesis
384+ // request.
385+ MultiSpeakerVoiceConfig multi_speaker_voice_config = 7
386+ [(google.api.field_behavior ) = OPTIONAL ];
359387}
360388
361389// Description of audio data to be synthesized.
@@ -530,6 +558,10 @@ message StreamingSynthesisInput {
530558 // Markup for HD voices specifically. This field may not be used with any
531559 // other voices.
532560 string markup = 5 ;
561+
562+ // Multi-speaker markup for Gemini TTS. This field may not
563+ // be used with any other voices.
564+ MultiSpeakerMarkup multi_speaker_markup = 7 ;
533565 }
534566
535567 // This is system instruction supported only for controllable voice models.
0 commit comments