@@ -59,7 +59,7 @@ service TextToSpeech {
5959 option (google.api.method_signature ) = "input,voice,audio_config" ;
6060 }
6161
62- // Performs bidirectional streaming speech synthesis: receive audio while
62+ // Performs bidirectional streaming speech synthesis: receives audio while
6363 // sending text.
6464 rpc StreamingSynthesize (stream StreamingSynthesizeRequest )
6565 returns (stream StreamingSynthesizeResponse ) {}
@@ -88,7 +88,8 @@ enum SsmlVoiceGender {
8888// Configuration to set up audio encoder. The encoding determines the output
8989// audio format that we'd like.
9090enum AudioEncoding {
91- // Not specified. Will return result
91+ // Not specified. Only used by GenerateVoiceCloningKey. Otherwise, will return
92+ // result
9293 // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
9394 AUDIO_ENCODING_UNSPECIFIED = 0 ;
9495
@@ -102,7 +103,7 @@ enum AudioEncoding {
102103 // MP3 at 64kbps.
103104 MP3_64_KBPS = 4 ;
104105
105- // Opus encoded audio wrapped in an ogg container. The result will be a
106+ // Opus encoded audio wrapped in an ogg container. The result is a
106107 // file which can be played natively on Android, and in browsers (at least
107108 // Chrome and Firefox). The quality of the encoding is considerably higher
108109 // than MP3 while using approximately the same bitrate.
@@ -117,9 +118,12 @@ enum AudioEncoding {
117118 ALAW = 6 ;
118119
119120 // Uncompressed 16-bit signed little-endian samples (Linear PCM).
120- // Note that as opposed to LINEAR16, audio will not be wrapped in a WAV (or
121+ // Note that as opposed to LINEAR16, audio won't be wrapped in a WAV (or
121122 // any other) header.
122123 PCM = 7 ;
124+
125+ // M4A audio.
126+ M4A = 8 ;
123127}
124128
125129// The top-level message sent by the client for the `ListVoices` method.
@@ -160,8 +164,8 @@ message Voice {
160164
161165// Used for advanced voice options.
162166message AdvancedVoiceOptions {
163- // Only for Journey voices. If false, the synthesis will be context aware
164- // and have higher latency.
167+ // Only for Journey voices. If false, the synthesis is context aware
168+ // and has a higher latency.
165169 optional bool low_latency_journey_synthesis = 1 ;
166170}
167171
@@ -199,18 +203,41 @@ message CustomPronunciationParams {
199203 // Not specified.
200204 PHONETIC_ENCODING_UNSPECIFIED = 0 ;
201205
202- // IPA. (e.g. apple -> ˈæpəl )
206+ // IPA, such as apple -> ˈæpəl.
203207 // https://en.wikipedia.org/wiki/International_Phonetic_Alphabet
204208 PHONETIC_ENCODING_IPA = 1 ;
205209
206- // X-SAMPA (e.g. apple -> "{p@l" )
210+ // X-SAMPA, such as apple -> "{p@l".
207211 // https://en.wikipedia.org/wiki/X-SAMPA
208212 PHONETIC_ENCODING_X_SAMPA = 2 ;
213+
214+ // For reading-to-pron conversion to work well, the `pronunciation` field
215+ // should only contain Kanji, Hiragana, and Katakana.
216+ //
217+ // The pronunciation can also contain pitch accents.
218+ // The start of a pitch phrase is specified with `^` and the down-pitch
219+ // position is specified with `!`, for example:
220+ //
221+ // phrase:端 pronunciation:^はし
222+ // phrase:箸 pronunciation:^は!し
223+ // phrase:橋 pronunciation:^はし!
224+ //
225+ // We currently only support the Tokyo dialect, which allows at most one
226+ // down-pitch per phrase (i.e. at most one `!` between `^`).
227+ PHONETIC_ENCODING_JAPANESE_YOMIGANA = 3 ;
228+
229+ // Used to specify pronunciations for Mandarin words. See
230+ // https://en.wikipedia.org/wiki/Pinyin.
231+ //
232+ // For example: 朝阳, the pronunciation is "chao2 yang2". The number
233+ // represents the tone, and there is a space between syllables. Neutral
234+ // tones are represented by 5, for example 孩子 "hai2 zi5".
235+ PHONETIC_ENCODING_PINYIN = 4 ;
209236 }
210237
211- // The phrase to which the customization will be applied.
212- // The phrase can be multiple words (in the case of proper nouns etc) , but
213- // should not span to a whole sentence.
238+ // The phrase to which the customization is applied.
239+ // The phrase can be multiple words, such as proper nouns, but shouldn't span
240+ // the length of the sentence.
214241 optional string phrase = 1 ;
215242
216243 // The phonetic encoding of the phrase.
@@ -223,13 +250,13 @@ message CustomPronunciationParams {
223250
224251// A collection of pronunciation customizations.
225252message CustomPronunciations {
226- // The pronunciation customizations to be applied.
253+ // The pronunciation customizations are applied.
227254 repeated CustomPronunciationParams pronunciations = 1 ;
228255}
229256
230257// A collection of turns for multi-speaker synthesis.
231258message MultiSpeakerMarkup {
232- // A Multi-speaker turn.
259+ // A multi-speaker turn.
233260 message Turn {
234261 // Required. The speaker of the turn, for example, 'O' or 'Q'. Please refer
235262 // to documentation for available speakers.
@@ -253,6 +280,10 @@ message SynthesisInput {
253280 // The raw text to be synthesized.
254281 string text = 1 ;
255282
283+ // Markup for HD voices specifically. This field may not be used with any
284+ // other voices.
285+ string markup = 5 ;
286+
256287 // The SSML document to be synthesized. The SSML document must be valid
257288 // and well-formed. Otherwise the RPC will fail and return
258289 // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. For
@@ -265,18 +296,16 @@ message SynthesisInput {
265296 MultiSpeakerMarkup multi_speaker_markup = 4 ;
266297 }
267298
268- // Optional. The pronunciation customizations to be applied to the input. If
269- // this is set, the input will be synthesized using the given pronunciation
299+ // Optional. The pronunciation customizations are applied to the input. If
300+ // this is set, the input is synthesized using the given pronunciation
270301 // customizations.
271302 //
272- // The initial support will be for EFIGS (English, French,
273- // Italian, German, Spanish) languages, as provided in
274- // VoiceSelectionParams. Journey and Instant Clone voices are
275- // not supported yet.
303+ // The initial support is for en-us, with plans to expand to other locales in
304+ // the future. Instant Clone voices aren't supported.
276305 //
277306 // In order to customize the pronunciation of a phrase, there must be an exact
278307 // match of the phrase in the input types. If using SSML, the phrase must not
279- // be inside a phoneme tag (entirely or partially) .
308+ // be inside a phoneme tag.
280309 CustomPronunciations custom_pronunciations = 3
281310 [(google.api.field_behavior ) = OPTIONAL ];
282311}
@@ -314,20 +343,24 @@ message VoiceSelectionParams {
314343 CustomVoiceParams custom_voice = 4 ;
315344
316345 // Optional. The configuration for a voice clone. If
317- // [VoiceCloneParams.voice_clone_key] is set, the service will choose the
318- // voice clone matching the specified configuration.
346+ // [VoiceCloneParams.voice_clone_key] is set, the service chooses the voice
347+ // clone matching the specified configuration.
319348 VoiceCloneParams voice_clone = 5 [(google.api.field_behavior ) = OPTIONAL ];
349+
350+ // Optional. The name of the model. If set, the service will choose the model
351+ // matching the specified configuration.
352+ string model_name = 6 [(google.api.field_behavior ) = OPTIONAL ];
320353}
321354
322355// Description of audio data to be synthesized.
323356message AudioConfig {
324357 // Required. The format of the audio byte stream.
325358 AudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
326359
327- // Optional. Input only. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is
360+ // Optional. Input only. Speaking rate/speed, in the range [0.25, 2.0]. 1.0 is
328361 // the normal native speed supported by the specific voice. 2.0 is twice as
329362 // fast, and 0.5 is half as fast. If unset(0.0), defaults to the native 1.0
330- // speed. Any other values < 0.25 or > 4.0 will return an error.
363+ // speed. Any other values < 0.25 or > 2.0 will return an error.
331364 double speaking_rate = 2 [
332365 (google.api.field_behavior ) = INPUT_ONLY ,
333366 (google.api.field_behavior ) = OPTIONAL
@@ -440,12 +473,21 @@ message Timepoint {
440473// Description of the desired output audio data.
441474message StreamingAudioConfig {
442475 // Required. The format of the audio byte stream.
443- // For now, streaming only supports PCM and OGG_OPUS. All other encodings
444- // will return an error.
476+ // Streaming supports PCM, ALAW, MULAW and OGG_OPUS. All other encodings
477+ // return an error.
445478 AudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
446479
447480 // Optional. The synthesis sample rate (in hertz) for this audio.
448481 int32 sample_rate_hertz = 2 [(google.api.field_behavior ) = OPTIONAL ];
482+
483+ // Optional. Input only. Speaking rate/speed, in the range [0.25, 2.0]. 1.0 is
484+ // the normal native speed supported by the specific voice. 2.0 is twice as
485+ // fast, and 0.5 is half as fast. If unset(0.0), defaults to the native 1.0
486+ // speed. Any other values < 0.25 or > 2.0 will return an error.
487+ double speaking_rate = 3 [
488+ (google.api.field_behavior ) = INPUT_ONLY ,
489+ (google.api.field_behavior ) = OPTIONAL
490+ ];
449491}
450492
451493// Provides configuration information for the StreamingSynthesize request.
@@ -456,17 +498,36 @@ message StreamingSynthesizeConfig {
456498 // Optional. The configuration of the synthesized audio.
457499 StreamingAudioConfig streaming_audio_config = 4
458500 [(google.api.field_behavior ) = OPTIONAL ];
501+
502+ // Optional. The pronunciation customizations are applied to the input. If
503+ // this is set, the input is synthesized using the given pronunciation
504+ // customizations.
505+ //
506+ // The initial support is for en-us, with plans to expand to other locales in
507+ // the future. Instant Clone voices aren't supported.
508+ //
509+ // In order to customize the pronunciation of a phrase, there must be an exact
510+ // match of the phrase in the input types. If using SSML, the phrase must not
511+ // be inside a phoneme tag.
512+ CustomPronunciations custom_pronunciations = 5
513+ [(google.api.field_behavior ) = OPTIONAL ];
459514}
460515
461516// Input to be synthesized.
462517message StreamingSynthesisInput {
463518 oneof input_source {
464519 // The raw text to be synthesized. It is recommended that each input
465- // contains complete, terminating sentences, as this will likely result in
466- // better prosody in the output audio. That being said, users are free to
467- // input text however they please.
520+ // contains complete, terminating sentences, which results in better prosody
521+ // in the output audio.
468522 string text = 1 ;
523+
524+ // Markup for HD voices specifically. This field may not be used with any
525+ // other voices.
526+ string markup = 5 ;
469527 }
528+
529+ // This is a system instruction supported only for controllable voice
530+ // models.
470531}
471532
472533// Request message for the `StreamingSynthesize` method. Multiple
0 commit comments