1111// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212// See the License for the specific language governing permissions and
1313// limitations under the License.
14- //
1514
1615syntax = "proto3" ;
1716
@@ -83,7 +82,7 @@ enum SsmlVoiceGender {
8382 // A female voice.
8483 FEMALE = 2 ;
8584
86- // A gender-neutral voice.
85+ // A gender-neutral voice. This voice is not yet supported.
8786 NEUTRAL = 3 ;
8887}
8988
@@ -100,11 +99,18 @@ enum AudioEncoding {
10099 // MP3 audio at 32kbps.
101100 MP3 = 2 ;
102101
102+ // MP3 audio at 64kbps.
103+ MP3_64_KBPS = 4 ;
104+
103105 // Opus encoded audio wrapped in an ogg container. The result will be a
104106 // file which can be played natively on Android, and in browsers (at least
105107 // Chrome and Firefox). The quality of the encoding is considerably higher
106108 // than MP3 while using approximately the same bitrate.
107109 OGG_OPUS = 3 ;
110+
111+ // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
112+ // Audio content returned as MULAW also contains a WAV header.
113+ MULAW = 5 ;
108114}
109115
110116// The message returned to the client by the `ListVoices` method.
@@ -132,6 +138,15 @@ message Voice {
132138
133139// The top-level message sent by the client for the `SynthesizeSpeech` method.
134140message SynthesizeSpeechRequest {
141+ // The type of timepoint information to return in the response.
142+ enum TimepointType {
143+ // Not specified. No timepoint information will be returned.
144+ TIMEPOINT_TYPE_UNSPECIFIED = 0 ;
145+
146+ // Timepoints for `<mark>` tags in the SSML input will be returned.
147+ SSML_MARK = 1 ;
148+ }
149+
135150 // Required. The Synthesizer requires either plain text or SSML as input.
136151 SynthesisInput input = 1 [(google.api.field_behavior ) = REQUIRED ];
137152
@@ -140,6 +155,9 @@ message SynthesizeSpeechRequest {
140155
141156 // Required. The configuration of the synthesized audio.
142157 AudioConfig audio_config = 3 [(google.api.field_behavior ) = REQUIRED ];
158+
159+ // Which types of timepoints, if any, should be returned in the response.
160+ repeated TimepointType enable_time_pointing = 4 ;
143161}
144162
145163// Contains text input to be synthesized. Either `text` or `ssml` must be
@@ -251,4 +269,21 @@ message SynthesizeSpeechResponse {
251269 // with all bytes fields, protobuffers use a pure binary representation,
252270 // whereas JSON representations use base64.
253271 bytes audio_content = 1 ;
272+
273+ // Links between positions in the original request input and the
274+ // corresponding times in the output audio. Only `<mark>` tags in SSML input are supported.
275+ repeated Timepoint timepoints = 2 ;
276+
277+ // The audio metadata of `audio_content`.
278+ AudioConfig audio_config = 4 ;
279+ }
280+
281+ // Maps a certain point in the input text to the corresponding time in the
282+ // output audio.
283+ message Timepoint {
284+ // Timepoint name, as received from the client within the `<mark>` tag.
285+ string mark_name = 4 ;
286+
287+ // Time offset, in seconds, from the start of the synthesized audio.
288+ double time_seconds = 3 ;
254289}
0 commit comments