@@ -191,6 +191,29 @@ message CustomPronunciationParams {
191191 // X-SAMPA, such as apple -> "{p@l".
192192 // https://en.wikipedia.org/wiki/X-SAMPA
193193 PHONETIC_ENCODING_X_SAMPA = 2 ;
194+
195+ // For reading-to-pronunciation conversion to work well, the
196+ // `pronunciation` field should only contain Kanji, Hiragana, and Katakana.
197+ //
198+ // The pronunciation can also contain pitch accents.
199+ // The start of a pitch phrase is specified with `^` and the down-pitch
200+ // position is specified with `!`, for example:
201+ //
202+ // phrase:端 pronunciation:^はし
203+ // phrase:箸 pronunciation:^は!し
204+ // phrase:橋 pronunciation:^はし!
205+ //
206+ // We currently only support the Tokyo dialect, which allows at most one
207+ // down-pitch per phrase (i.e. at most one `!` between two consecutive `^`).
208+ PHONETIC_ENCODING_JAPANESE_YOMIGANA = 3 ;
209+
210+ // Used to specify pronunciations for Mandarin words. See
211+ // https://en.wikipedia.org/wiki/Pinyin.
212+ //
213+ // For example, the pronunciation of 朝阳 is "chao2 yang2". The number after
214+ // each syllable represents its tone, and syllables are separated by a
215+ // space. Neutral tones are represented by 5, for example 孩子 "hai2 zi5".
216+ PHONETIC_ENCODING_PINYIN = 4 ;
194217 }
195218
196219 // The phrase to which the customization is applied.
@@ -238,6 +261,10 @@ message SynthesisInput {
238261 // The raw text to be synthesized.
239262 string text = 1 ;
240263
264+ // Markup for HD voices specifically. This field must not be used with any
265+ // other voices.
266+ string markup = 5 ;
267+
241268 // The SSML document to be synthesized. The SSML document must be valid
242269 // and well-formed. Otherwise the RPC will fail and return
243270 // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. For
@@ -453,6 +480,10 @@ message StreamingSynthesisInput {
453480 // contains complete, terminating sentences, which results in better prosody
454481 // in the output audio.
455482 string text = 1 ;
483+
483+ // Markup for HD voices specifically. This field must not be used with any
484+ // other voices.
486+ string markup = 5 ;
456487 }
457488}
458489
0 commit comments