Skip to content

Commit 8f7ef1c

Browse files
Google APIs and copybara-github
authored and committed
feat: Support markup input for Cloud TTS Chirp 3: HD voice synthesis
feat: Support pinyin/yomigana custom pronunciation encodings for cmn-cn/ja-jp
PiperOrigin-RevId: 754921874
1 parent 7ab5d0f commit 8f7ef1c

1 file changed

Lines changed: 31 additions & 0 deletions

File tree

google/cloud/texttospeech/v1/cloud_tts.proto

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,29 @@ message CustomPronunciationParams {
191191
// X-SAMPA, such as apple -> "{p@l".
192192
// https://en.wikipedia.org/wiki/X-SAMPA
193193
PHONETIC_ENCODING_X_SAMPA = 2;
194+
195+
// For reading-to-pronunciation conversion to work well, the
196+
// `pronunciation` field should only contain Kanji, Hiragana, and Katakana.
197+
//
198+
// The pronunciation can also contain pitch accents.
199+
// The start of a pitch phrase is specified with `^` and the down-pitch
200+
// position is specified with `!`, for example:
201+
//
202+
// phrase:端 pronunciation:^はし
203+
// phrase:箸 pronunciation:^は!し
204+
// phrase:橋 pronunciation:^はし!
205+
//
206+
// We currently only support the Tokyo dialect, which allows at most one
207+
// down-pitch per phrase (i.e. at most one `!` between successive `^`).
208+
PHONETIC_ENCODING_JAPANESE_YOMIGANA = 3;
209+
210+
// Used to specify pronunciations for Mandarin words. See
211+
// https://en.wikipedia.org/wiki/Pinyin.
212+
//
213+
// For example, the pronunciation of 朝阳 is "chao2 yang2". The number
214+
// represents the tone, and there is a space between syllables. Neutral
215+
// tones are represented by 5, for example 孩子 "hai2 zi5".
216+
PHONETIC_ENCODING_PINYIN = 4;
194217
}
195218

196219
// The phrase to which the customization is applied.
@@ -238,6 +261,10 @@ message SynthesisInput {
238261
// The raw text to be synthesized.
239262
string text = 1;
240263

264+
// Markup for HD voices specifically. This field must not be used with any
265+
// other voices.
266+
string markup = 5;
267+
241268
// The SSML document to be synthesized. The SSML document must be valid
242269
// and well-formed. Otherwise the RPC will fail and return
243270
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. For
@@ -453,6 +480,10 @@ message StreamingSynthesisInput {
453480
// contains complete, terminating sentences, which results in better prosody
454481
// in the output audio.
455482
string text = 1;
483+
484+
// Markup for HD voices specifically. This field must not be used with any
485+
// other voices.
486+
string markup = 5;
456487
}
457488
}
458489

0 commit comments

Comments
 (0)