@@ -59,7 +59,7 @@ service TextToSpeech {
5959 option (google.api.method_signature ) = "input,voice,audio_config" ;
6060 }
6161
62- // Performs bidirectional streaming speech synthesis: receive audio while
62+ // Performs bidirectional streaming speech synthesis: receives audio while
6363 // sending text.
6464 rpc StreamingSynthesize (stream StreamingSynthesizeRequest )
6565 returns (stream StreamingSynthesizeResponse ) {}
@@ -88,7 +88,8 @@ enum SsmlVoiceGender {
8888// Configuration to set up audio encoder. The encoding determines the output
8989// audio format that we'd like.
9090enum AudioEncoding {
91- // Not specified. Will return result
91+ // Not specified. Only used by GenerateVoiceCloningKey. Otherwise, will return
92+ // result
9293 // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
9394 AUDIO_ENCODING_UNSPECIFIED = 0 ;
9495
@@ -102,7 +103,7 @@ enum AudioEncoding {
102103 // MP3 at 64kbps.
103104 MP3_64_KBPS = 4 ;
104105
105- // Opus encoded audio wrapped in an ogg container. The result will be a
106+ // Opus encoded audio wrapped in an ogg container. The result is a
106107 // file which can be played natively on Android, and in browsers (at least
107108 // Chrome and Firefox). The quality of the encoding is considerably higher
108109 // than MP3 while using approximately the same bitrate.
@@ -117,9 +118,12 @@ enum AudioEncoding {
117118 ALAW = 6 ;
118119
119120 // Uncompressed 16-bit signed little-endian samples (Linear PCM).
120- // Note that as opposed to LINEAR16, audio will not be wrapped in a WAV (or
121+ // Note that as opposed to LINEAR16, audio won't be wrapped in a WAV (or
121122 // any other) header.
122123 PCM = 7 ;
124+
125+ // M4A audio.
126+ M4A = 8 ;
123127}
124128
125129// The top-level message sent by the client for the `ListVoices` method.
@@ -160,8 +164,8 @@ message Voice {
160164
161165// Used for advanced voice options.
162166message AdvancedVoiceOptions {
163- // Only for Journey voices. If false, the synthesis will be context aware
164- // and have higher latency.
167+ // Only for Journey voices. If false, the synthesis is context aware
168+ // and has a higher latency.
165169 optional bool low_latency_journey_synthesis = 1 ;
166170}
167171
@@ -199,18 +203,41 @@ message CustomPronunciationParams {
199203 // Not specified.
200204 PHONETIC_ENCODING_UNSPECIFIED = 0 ;
201205
202- // IPA. (e.g. apple -> ˈæpəl )
206+ // IPA, such as apple -> ˈæpəl.
203207 // https://en.wikipedia.org/wiki/International_Phonetic_Alphabet
204208 PHONETIC_ENCODING_IPA = 1 ;
205209
206- // X-SAMPA (e.g. apple -> "{p@l" )
210+ // X-SAMPA, such as apple -> "{p@l".
207211 // https://en.wikipedia.org/wiki/X-SAMPA
208212 PHONETIC_ENCODING_X_SAMPA = 2 ;
213+
214+ // For reading-to-pron conversion to work well, the `pronunciation` field
215+ // should only contain Kanji, Hiragana, and Katakana.
216+ //
217+ // The pronunciation can also contain pitch accents.
218+ // The start of a pitch phrase is specified with `^` and the down-pitch
219+ // position is specified with `!`, for example:
220+ //
221+ // phrase:端 pronunciation:^はし
222+ // phrase:箸 pronunciation:^は!し
223+ // phrase:橋 pronunciation:^はし!
224+ //
225+ // We currently only support the Tokyo dialect, which allows at most one
226+ // down-pitch per phrase (i.e. at most one `!` between `^`).
227+ PHONETIC_ENCODING_JAPANESE_YOMIGANA = 3 ;
228+
229+ // Used to specify pronunciations for Mandarin words. See
230+ // https://en.wikipedia.org/wiki/Pinyin.
231+ //
232+ // For example: 朝阳, the pronunciation is "chao2 yang2". The number
233+ // represents the tone, and there is a space between syllables. Neutral
234+ // tones are represented by 5, for example 孩子 "hai2 zi5".
235+ PHONETIC_ENCODING_PINYIN = 4 ;
209236 }
210237
211- // The phrase to which the customization will be applied.
212- // The phrase can be multiple words (in the case of proper nouns etc) , but
213- // should not span to a whole sentence.
238+ // The phrase to which the customization is applied.
239+ // The phrase can be multiple words, such as proper nouns, but shouldn't span
240+ // the length of the sentence.
214241 optional string phrase = 1 ;
215242
216243 // The phonetic encoding of the phrase.
@@ -223,13 +250,13 @@ message CustomPronunciationParams {
223250
224251// A collection of pronunciation customizations.
225252message CustomPronunciations {
226- // The pronunciation customizations to be applied.
253+ // The pronunciation customizations are applied.
227254 repeated CustomPronunciationParams pronunciations = 1 ;
228255}
229256
230257// A collection of turns for multi-speaker synthesis.
231258message MultiSpeakerMarkup {
232- // A Multi-speaker turn.
259+ // A multi-speaker turn.
233260 message Turn {
234261 // Required. The speaker of the turn, for example, 'O' or 'Q'. Please refer
235262 // to documentation for available speakers.
@@ -253,6 +280,10 @@ message SynthesisInput {
253280 // The raw text to be synthesized.
254281 string text = 1 ;
255282
283+ // Markup for HD voices specifically. This field may not be used with any
284+ // other voices.
285+ string markup = 5 ;
286+
256287 // The SSML document to be synthesized. The SSML document must be valid
257288 // and well-formed. Otherwise the RPC will fail and return
258289 // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. For
@@ -265,18 +296,16 @@ message SynthesisInput {
265296 MultiSpeakerMarkup multi_speaker_markup = 4 ;
266297 }
267298
268- // Optional. The pronunciation customizations to be applied to the input. If
269- // this is set, the input will be synthesized using the given pronunciation
299+ // Optional. The pronunciation customizations are applied to the input. If
300+ // this is set, the input is synthesized using the given pronunciation
270301 // customizations.
271302 //
272- // The initial support will be for EFIGS (English, French,
273- // Italian, German, Spanish) languages, as provided in
274- // VoiceSelectionParams. Journey and Instant Clone voices are
275- // not supported yet.
303+ // The initial support is for en-us, with plans to expand to other locales in
304+ // the future. Instant Clone voices aren't supported.
276305 //
277306 // In order to customize the pronunciation of a phrase, there must be an exact
278307 // match of the phrase in the input types. If using SSML, the phrase must not
279- // be inside a phoneme tag (entirely or partially) .
308+ // be inside a phoneme tag.
280309 CustomPronunciations custom_pronunciations = 3
281310 [(google.api.field_behavior ) = OPTIONAL ];
282311}
@@ -314,20 +343,24 @@ message VoiceSelectionParams {
314343 CustomVoiceParams custom_voice = 4 ;
315344
316345 // Optional. The configuration for a voice clone. If
317- // [VoiceCloneParams.voice_clone_key] is set, the service will choose the
318- // voice clone matching the specified configuration.
346+ // [VoiceCloneParams.voice_clone_key] is set, the service chooses the voice
347+ // clone matching the specified configuration.
319348 VoiceCloneParams voice_clone = 5 [(google.api.field_behavior ) = OPTIONAL ];
349+
350+ // Optional. The name of the model. If set, the service will choose the model
351+ // matching the specified configuration.
352+ string model_name = 6 [(google.api.field_behavior ) = OPTIONAL ];
320353}
321354
322355// Description of audio data to be synthesized.
323356message AudioConfig {
324357 // Required. The format of the audio byte stream.
325358 AudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
326359
327- // Optional. Input only. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is
360+ // Optional. Input only. Speaking rate/speed, in the range [0.25, 2.0]. 1.0 is
328361 // the normal native speed supported by the specific voice. 2.0 is twice as
329362 // fast, and 0.5 is half as fast. If unset(0.0), defaults to the native 1.0
330- // speed. Any other values < 0.25 or > 4.0 will return an error.
363+ // speed. Any other values < 0.25 or > 2.0 will return an error.
331364 double speaking_rate = 2 [
332365 (google.api.field_behavior ) = INPUT_ONLY ,
333366 (google.api.field_behavior ) = OPTIONAL
@@ -440,12 +473,21 @@ message Timepoint {
440473// Description of the desired output audio data.
441474message StreamingAudioConfig {
442475 // Required. The format of the audio byte stream.
443- // For now, streaming only supports PCM and OGG_OPUS. All other encodings
444- // will return an error.
476+ // Streaming supports PCM, ALAW, MULAW and OGG_OPUS. All other encodings
477+ // return an error.
445478 AudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
446479
447480 // Optional. The synthesis sample rate (in hertz) for this audio.
448481 int32 sample_rate_hertz = 2 [(google.api.field_behavior ) = OPTIONAL ];
482+
483+ // Optional. Input only. Speaking rate/speed, in the range [0.25, 2.0]. 1.0 is
484+ // the normal native speed supported by the specific voice. 2.0 is twice as
485+ // fast, and 0.5 is half as fast. If unset(0.0), defaults to the native 1.0
486+ // speed. Any other values < 0.25 or > 2.0 will return an error.
487+ double speaking_rate = 3 [
488+ (google.api.field_behavior ) = INPUT_ONLY ,
489+ (google.api.field_behavior ) = OPTIONAL
490+ ];
449491}
450492
451493// Provides configuration information for the StreamingSynthesize request.
@@ -456,17 +498,36 @@ message StreamingSynthesizeConfig {
456498 // Optional. The configuration of the synthesized audio.
457499 StreamingAudioConfig streaming_audio_config = 4
458500 [(google.api.field_behavior ) = OPTIONAL ];
501+
502+ // Optional. The pronunciation customizations are applied to the input. If
503+ // this is set, the input is synthesized using the given pronunciation
504+ // customizations.
505+ //
506+ // The initial support is for en-us, with plans to expand to other locales in
507+ // the future. Instant Clone voices aren't supported.
508+ //
509+ // In order to customize the pronunciation of a phrase, there must be an exact
510+ // match of the phrase in the input types. If using SSML, the phrase must not
511+ // be inside a phoneme tag.
512+ CustomPronunciations custom_pronunciations = 5
513+ [(google.api.field_behavior ) = OPTIONAL ];
459514}
460515
461516// Input to be synthesized.
462517message StreamingSynthesisInput {
463518 oneof input_source {
464519 // The raw text to be synthesized. It is recommended that each input
465- // contains complete, terminating sentences, as this will likely result in
466- // better prosody in the output audio. That being said, users are free to
467- // input text however they please.
520+ // contains complete, terminating sentences, which results in better prosody
521+ // in the output audio.
468522 string text = 1 ;
523+
524+ // Markup for HD voices specifically. This field may not be used with any
525+ // other voices.
526+ string markup = 5 ;
469527 }
528+
529+ // This is a system instruction supported only for controllable voice
530+ // models.
470531}
471532
472533// Request message for the `StreamingSynthesize` method. Multiple
0 commit comments