feat: add multi-speaker markup, which allows generating dialogue between multiple speakers

Google APIs · copybara-github · commit 56f5fa4555e6 · 2024-10-24T11:03:55.000-07:00
PiperOrigin-RevId: 689444598
diff --git a/google/cloud/texttospeech/v1beta1/cloud_tts.proto b/google/cloud/texttospeech/v1beta1/cloud_tts.proto
@@ -222,6 +222,22 @@ message CustomPronunciations {
   repeated CustomPronunciationParams pronunciations = 1;
 }
 
+// A dialogue for multi-speaker synthesis, expressed as a list of turns.
+message MultiSpeakerMarkup {
+  // A single turn of the dialogue: one speaker and the text to speak.
+  message Turn {
+    // Required. Identifier of the speaker for this turn, for example, 'O' or
+    // 'Q'. See the documentation for the list of available speakers.
+    string speaker = 1 [(google.api.field_behavior) = REQUIRED];
+
+    // Required. The text to be spoken in this turn.
+    string text = 2 [(google.api.field_behavior) = REQUIRED];
+  }
+
+  // Required. The speaker turns that make up the multi-speaker input.
+  repeated Turn turns = 1 [(google.api.field_behavior) = REQUIRED];
+}
+
 // Contains text input to be synthesized. Either `text` or `ssml` must be
 // supplied. Supplying both or neither returns
 // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. The
@@ -238,6 +254,10 @@ message SynthesisInput {
     // more information, see
     // [SSML](https://cloud.google.com/text-to-speech/docs/ssml).
     string ssml = 2;
+
+    // The multi-speaker input to be synthesized. Only applicable for
+    // multi-speaker synthesis.
+    MultiSpeakerMarkup multi_speaker_markup = 4;
   }
 
   // Optional. The pronunciation customizations to be applied to the input. If