Skip to content

Commit b738e78

Browse files
Google APIs copybara-github
authored and committed
feat: Support promptable voices by specifying a model name and a prompt
feat: Add enum value M4A to enum AudioEncoding docs: A comment for method `StreamingSynthesize` in service `TextToSpeech` is changed docs: A comment for enum value `AUDIO_ENCODING_UNSPECIFIED` in enum `AudioEncoding` is changed docs: A comment for enum value `OGG_OPUS` in enum `AudioEncoding` is changed docs: A comment for enum value `PCM` in enum `AudioEncoding` is changed docs: A comment for field `low_latency_journey_synthesis` in message `.google.cloud.texttospeech.v1beta1.AdvancedVoiceOptions` is changed docs: A comment for enum value `PHONETIC_ENCODING_IPA` in enum `PhoneticEncoding` is changed docs: A comment for enum value `PHONETIC_ENCODING_X_SAMPA` in enum `PhoneticEncoding` is changed docs: A comment for field `phrase` in message `.google.cloud.texttospeech.v1beta1.CustomPronunciationParams` is changed docs: A comment for field `pronunciations` in message `.google.cloud.texttospeech.v1beta1.CustomPronunciations` is changed docs: A comment for message `MultiSpeakerMarkup` is changed docs: A comment for field `custom_pronunciations` in message `.google.cloud.texttospeech.v1beta1.SynthesisInput` is changed docs: A comment for field `voice_clone` in message `.google.cloud.texttospeech.v1beta1.VoiceSelectionParams` is changed docs: A comment for field `speaking_rate` in message `.google.cloud.texttospeech.v1beta1.AudioConfig` is changed docs: A comment for field `audio_encoding` in message `.google.cloud.texttospeech.v1beta1.StreamingAudioConfig` is changed docs: A comment for field `text` in message `.google.cloud.texttospeech.v1beta1.StreamingSynthesisInput` is changed PiperOrigin-RevId: 799242210
1 parent f956e53 commit b738e78

1 file changed

Lines changed: 90 additions & 29 deletions

File tree

google/cloud/texttospeech/v1beta1/cloud_tts.proto

Lines changed: 90 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ service TextToSpeech {
5959
option (google.api.method_signature) = "input,voice,audio_config";
6060
}
6161

62-
// Performs bidirectional streaming speech synthesis: receive audio while
62+
// Performs bidirectional streaming speech synthesis: receives audio while
6363
// sending text.
6464
rpc StreamingSynthesize(stream StreamingSynthesizeRequest)
6565
returns (stream StreamingSynthesizeResponse) {}
@@ -88,7 +88,8 @@ enum SsmlVoiceGender {
8888
// Configuration to set up audio encoder. The encoding determines the output
8989
// audio format that we'd like.
9090
enum AudioEncoding {
91-
// Not specified. Will return result
91+
// Not specified. Only used by GenerateVoiceCloningKey. Otherwise, will return
92+
// result
9293
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
9394
AUDIO_ENCODING_UNSPECIFIED = 0;
9495

@@ -102,7 +103,7 @@ enum AudioEncoding {
102103
// MP3 at 64kbps.
103104
MP3_64_KBPS = 4;
104105

105-
// Opus encoded audio wrapped in an ogg container. The result will be a
106+
// Opus encoded audio wrapped in an ogg container. The result is a
106107
// file which can be played natively on Android, and in browsers (at least
107108
// Chrome and Firefox). The quality of the encoding is considerably higher
108109
// than MP3 while using approximately the same bitrate.
@@ -117,9 +118,12 @@ enum AudioEncoding {
117118
ALAW = 6;
118119

119120
// Uncompressed 16-bit signed little-endian samples (Linear PCM).
120-
// Note that as opposed to LINEAR16, audio will not be wrapped in a WAV (or
121+
// Note that as opposed to LINEAR16, audio won't be wrapped in a WAV (or
121122
// any other) header.
122123
PCM = 7;
124+
125+
// M4A audio.
126+
M4A = 8;
123127
}
124128

125129
// The top-level message sent by the client for the `ListVoices` method.
@@ -160,8 +164,8 @@ message Voice {
160164

161165
// Used for advanced voice options.
162166
message AdvancedVoiceOptions {
163-
// Only for Journey voices. If false, the synthesis will be context aware
164-
// and have higher latency.
167+
// Only for Journey voices. If false, the synthesis is context aware
168+
// and has a higher latency.
165169
optional bool low_latency_journey_synthesis = 1;
166170
}
167171

@@ -199,18 +203,41 @@ message CustomPronunciationParams {
199203
// Not specified.
200204
PHONETIC_ENCODING_UNSPECIFIED = 0;
201205

202-
// IPA. (e.g. apple -> ˈæpəl )
206+
// IPA, such as apple -> ˈæpəl.
203207
// https://en.wikipedia.org/wiki/International_Phonetic_Alphabet
204208
PHONETIC_ENCODING_IPA = 1;
205209

206-
// X-SAMPA (e.g. apple -> "{p@l" )
210+
// X-SAMPA, such as apple -> "{p@l".
207211
// https://en.wikipedia.org/wiki/X-SAMPA
208212
PHONETIC_ENCODING_X_SAMPA = 2;
213+
214+
// For reading-to-pron conversion to work well, the `pronunciation` field
215+
// should only contain Kanji, Hiragana, and Katakana.
216+
//
217+
// The pronunciation can also contain pitch accents.
218+
// The start of a pitch phrase is specified with `^` and the down-pitch
219+
// position is specified with `!`, for example:
220+
//
221+
// phrase:端 pronunciation:^はし
222+
// phrase:箸 pronunciation:^は!し
223+
// phrase:橋 pronunciation:^はし!
224+
//
225+
// We currently only support the Tokyo dialect, which allows at most one
226+
// down-pitch per phrase (i.e. at most one `!` between `^`).
227+
PHONETIC_ENCODING_JAPANESE_YOMIGANA = 3;
228+
229+
// Used to specify pronunciations for Mandarin words. See
230+
// https://en.wikipedia.org/wiki/Pinyin.
231+
//
232+
// For example: 朝阳, the pronunciation is "chao2 yang2". The number
233+
// represents the tone, and there is a space between syllables. Neutral
234+
// tones are represented by 5, for example 孩子 "hai2 zi5".
235+
PHONETIC_ENCODING_PINYIN = 4;
209236
}
210237

211-
// The phrase to which the customization will be applied.
212-
// The phrase can be multiple words (in the case of proper nouns etc), but
213-
// should not span to a whole sentence.
238+
// The phrase to which the customization is applied.
239+
// The phrase can be multiple words, such as proper nouns, but shouldn't span
240+
// the length of the sentence.
214241
optional string phrase = 1;
215242

216243
// The phonetic encoding of the phrase.
@@ -223,13 +250,13 @@ message CustomPronunciationParams {
223250

224251
// A collection of pronunciation customizations.
225252
message CustomPronunciations {
226-
// The pronunciation customizations to be applied.
253+
// The pronunciation customizations are applied.
227254
repeated CustomPronunciationParams pronunciations = 1;
228255
}
229256

230257
// A collection of turns for multi-speaker synthesis.
231258
message MultiSpeakerMarkup {
232-
// A Multi-speaker turn.
259+
// A multi-speaker turn.
233260
message Turn {
234261
// Required. The speaker of the turn, for example, 'O' or 'Q'. Please refer
235262
// to documentation for available speakers.
@@ -253,6 +280,10 @@ message SynthesisInput {
253280
// The raw text to be synthesized.
254281
string text = 1;
255282

283+
// Markup for HD voices specifically. This field may not be used with any
284+
// other voices.
285+
string markup = 5;
286+
256287
// The SSML document to be synthesized. The SSML document must be valid
257288
// and well-formed. Otherwise the RPC will fail and return
258289
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. For
@@ -265,18 +296,16 @@ message SynthesisInput {
265296
MultiSpeakerMarkup multi_speaker_markup = 4;
266297
}
267298

268-
// Optional. The pronunciation customizations to be applied to the input. If
269-
// this is set, the input will be synthesized using the given pronunciation
299+
// Optional. The pronunciation customizations are applied to the input. If
300+
// this is set, the input is synthesized using the given pronunciation
270301
// customizations.
271302
//
272-
// The initial support will be for EFIGS (English, French,
273-
// Italian, German, Spanish) languages, as provided in
274-
// VoiceSelectionParams. Journey and Instant Clone voices are
275-
// not supported yet.
303+
// The initial support is for en-us, with plans to expand to other locales in
304+
// the future. Instant Clone voices aren't supported.
276305
//
277306
// In order to customize the pronunciation of a phrase, there must be an exact
278307
// match of the phrase in the input types. If using SSML, the phrase must not
279-
// be inside a phoneme tag (entirely or partially).
308+
// be inside a phoneme tag.
280309
CustomPronunciations custom_pronunciations = 3
281310
[(google.api.field_behavior) = OPTIONAL];
282311
}
@@ -314,20 +343,24 @@ message VoiceSelectionParams {
314343
CustomVoiceParams custom_voice = 4;
315344

316345
// Optional. The configuration for a voice clone. If
317-
// [VoiceCloneParams.voice_clone_key] is set, the service will choose the
318-
// voice clone matching the specified configuration.
346+
// [VoiceCloneParams.voice_clone_key] is set, the service chooses the voice
347+
// clone matching the specified configuration.
319348
VoiceCloneParams voice_clone = 5 [(google.api.field_behavior) = OPTIONAL];
349+
350+
// Optional. The name of the model. If set, the service will choose the model
351+
// matching the specified configuration.
352+
string model_name = 6 [(google.api.field_behavior) = OPTIONAL];
320353
}
321354

322355
// Description of audio data to be synthesized.
323356
message AudioConfig {
324357
// Required. The format of the audio byte stream.
325358
AudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];
326359

327-
// Optional. Input only. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is
360+
// Optional. Input only. Speaking rate/speed, in the range [0.25, 2.0]. 1.0 is
328361
// the normal native speed supported by the specific voice. 2.0 is twice as
329362
// fast, and 0.5 is half as fast. If unset(0.0), defaults to the native 1.0
330-
// speed. Any other values < 0.25 or > 4.0 will return an error.
363+
// speed. Any other values < 0.25 or > 2.0 will return an error.
331364
double speaking_rate = 2 [
332365
(google.api.field_behavior) = INPUT_ONLY,
333366
(google.api.field_behavior) = OPTIONAL
@@ -440,12 +473,21 @@ message Timepoint {
440473
// Description of the desired output audio data.
441474
message StreamingAudioConfig {
442475
// Required. The format of the audio byte stream.
443-
// For now, streaming only supports PCM and OGG_OPUS. All other encodings
444-
// will return an error.
476+
// Streaming supports PCM, ALAW, MULAW and OGG_OPUS. All other encodings
477+
// return an error.
445478
AudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];
446479

447480
// Optional. The synthesis sample rate (in hertz) for this audio.
448481
int32 sample_rate_hertz = 2 [(google.api.field_behavior) = OPTIONAL];
482+
483+
// Optional. Input only. Speaking rate/speed, in the range [0.25, 2.0]. 1.0 is
484+
// the normal native speed supported by the specific voice. 2.0 is twice as
485+
// fast, and 0.5 is half as fast. If unset(0.0), defaults to the native 1.0
486+
// speed. Any other values < 0.25 or > 2.0 will return an error.
487+
double speaking_rate = 3 [
488+
(google.api.field_behavior) = INPUT_ONLY,
489+
(google.api.field_behavior) = OPTIONAL
490+
];
449491
}
450492

451493
// Provides configuration information for the StreamingSynthesize request.
@@ -456,17 +498,36 @@ message StreamingSynthesizeConfig {
456498
// Optional. The configuration of the synthesized audio.
457499
StreamingAudioConfig streaming_audio_config = 4
458500
[(google.api.field_behavior) = OPTIONAL];
501+
502+
// Optional. The pronunciation customizations are applied to the input. If
503+
// this is set, the input is synthesized using the given pronunciation
504+
// customizations.
505+
//
506+
// The initial support is for en-us, with plans to expand to other locales in
507+
// the future. Instant Clone voices aren't supported.
508+
//
509+
// In order to customize the pronunciation of a phrase, there must be an exact
510+
// match of the phrase in the input types. If using SSML, the phrase must not
511+
// be inside a phoneme tag.
512+
CustomPronunciations custom_pronunciations = 5
513+
[(google.api.field_behavior) = OPTIONAL];
459514
}
460515

461516
// Input to be synthesized.
462517
message StreamingSynthesisInput {
463518
oneof input_source {
464519
// The raw text to be synthesized. It is recommended that each input
465-
// contains complete, terminating sentences, as this will likely result in
466-
// better prosody in the output audio. That being said, users are free to
467-
// input text however they please.
520+
// contains complete, terminating sentences, which results in better prosody
521+
// in the output audio.
468522
string text = 1;
523+
524+
// Markup for HD voices specifically. This field may not be used with any
525+
// other voices.
526+
string markup = 5;
469527
}
528+
529+
// This is system instruction supported only for controllable voice models.
530+
optional string prompt = 6;
470531
}
471532

472533
// Request message for the `StreamingSynthesize` method. Multiple

0 commit comments

Comments
 (0)