@@ -16,11 +16,11 @@ syntax = "proto3";
1616
1717package google.cloud.dialogflow.v2 ;
1818
19+ import "google/api/annotations.proto" ;
1920import "google/api/field_behavior.proto" ;
2021import "google/api/resource.proto" ;
2122import "google/protobuf/duration.proto" ;
2223import "google/protobuf/timestamp.proto" ;
23- import "google/api/annotations.proto" ;
2424
2525option cc_enable_arenas = true ;
2626option csharp_namespace = "Google.Cloud.Dialogflow.V2" ;
@@ -109,31 +109,6 @@ message SpeechContext {
109109 float boost = 2 ;
110110}
111111
112- // Information for a word recognized by the speech recognizer.
113- message SpeechWordInfo {
114- // The word this info is for.
115- string word = 3 ;
116-
117- // Time offset relative to the beginning of the audio that corresponds to the
118- // start of the spoken word. This is an experimental feature and the accuracy
119- // of the time offset can vary.
120- google.protobuf.Duration start_offset = 1 ;
121-
122- // Time offset relative to the beginning of the audio that corresponds to the
123- // end of the spoken word. This is an experimental feature and the accuracy of
124- // the time offset can vary.
125- google.protobuf.Duration end_offset = 2 ;
126-
127- // The Speech confidence between 0.0 and 1.0 for this word. A higher number
128- // indicates an estimated greater likelihood that the recognized word is
129- // correct. The default of 0.0 is a sentinel value indicating that confidence
130- // was not set.
131- //
132- // This field is not guaranteed to be fully stable over time for the same
133- // audio input. Users should also not rely on it to always be provided.
134- float confidence = 4 ;
135- }
136-
137112// Variant of the specified [Speech model][google.cloud.dialogflow.v2.InputAudioConfig.model] to use.
138113//
139114// See the [Cloud Speech
@@ -177,6 +152,31 @@ enum SpeechModelVariant {
177152 USE_ENHANCED = 3 ;
178153}
179154
155+ // Information for a word recognized by the speech recognizer.
156+ message SpeechWordInfo {
157+ // The word this info is for.
158+ string word = 3 ;
159+
160+ // Time offset relative to the beginning of the audio that corresponds to the
161+ // start of the spoken word. This is an experimental feature and the accuracy
162+ // of the time offset can vary.
163+ google.protobuf.Duration start_offset = 1 ;
164+
165+ // Time offset relative to the beginning of the audio that corresponds to the
166+ // end of the spoken word. This is an experimental feature and the accuracy of
167+ // the time offset can vary.
168+ google.protobuf.Duration end_offset = 2 ;
169+
170+ // The Speech confidence between 0.0 and 1.0 for this word. A higher number
171+ // indicates an estimated greater likelihood that the recognized word is
172+ // correct. The default of 0.0 is a sentinel value indicating that confidence
173+ // was not set.
174+ //
175+ // This field is not guaranteed to be fully stable over time for the same
176+ // audio input. Users should also not rely on it to always be provided.
177+ float confidence = 4 ;
178+ }
179+
180180// Instructs the speech recognizer how to process the audio content.
181181message InputAudioConfig {
182182 // Required. Audio encoding of the audio content to process.
@@ -256,6 +256,23 @@ message InputAudioConfig {
256256 bool disable_no_speech_recognized_event = 14 ;
257257}
258258
259+ // Gender of the voice as described in
260+ // [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
261+ enum SsmlVoiceGender {
262+ // An unspecified gender, which means that the client doesn't care which
263+ // gender the selected voice will have.
264+ SSML_VOICE_GENDER_UNSPECIFIED = 0 ;
265+
266+ // A male voice.
267+ SSML_VOICE_GENDER_MALE = 1 ;
268+
269+ // A female voice.
270+ SSML_VOICE_GENDER_FEMALE = 2 ;
271+
272+ // A gender-neutral voice.
273+ SSML_VOICE_GENDER_NEUTRAL = 3 ;
274+ }
275+
259276// Description of which voice to use for speech synthesis.
260277message VoiceSelectionParams {
261278 // Optional. The name of the voice. If not set, the service will choose a
@@ -303,21 +320,29 @@ message SynthesizeSpeechConfig {
303320 VoiceSelectionParams voice = 4 ;
304321}
305322
306- // Gender of the voice as described in
307- // [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
308- enum SsmlVoiceGender {
309- // An unspecified gender, which means that the client doesn't care which
310- // gender the selected voice will have.
311- SSML_VOICE_GENDER_UNSPECIFIED = 0 ;
323+ // Audio encoding of the output audio format in Text-To-Speech.
324+ enum OutputAudioEncoding {
325+ // Not specified.
326+ OUTPUT_AUDIO_ENCODING_UNSPECIFIED = 0 ;
312327
313- // A male voice.
314- SSML_VOICE_GENDER_MALE = 1 ;
328+ // Uncompressed 16-bit signed little-endian samples (Linear PCM).
329+ // Audio content returned as LINEAR16 also contains a WAV header.
330+ OUTPUT_AUDIO_ENCODING_LINEAR_16 = 1 ;
315331
316- // A female voice.
317- SSML_VOICE_GENDER_FEMALE = 2 ;
332+ // MP3 audio at 32kbps.
333+ OUTPUT_AUDIO_ENCODING_MP3 = 2 ;
318334
319- // A gender-neutral voice.
320- SSML_VOICE_GENDER_NEUTRAL = 3 ;
335+ // MP3 audio at 64kbps.
336+ OUTPUT_AUDIO_ENCODING_MP3_64_KBPS = 4 ;
337+
338+ // Opus encoded audio wrapped in an ogg container. The result will be a
339+ // file which can be played natively on Android, and in browsers (at least
340+ // Chrome and Firefox). The quality of the encoding is considerably higher
341+ // than MP3 while using approximately the same bitrate.
342+ OUTPUT_AUDIO_ENCODING_OGG_OPUS = 3 ;
343+
344+ // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
345+ OUTPUT_AUDIO_ENCODING_MULAW = 5 ;
321346}
322347
323348// Instructs the speech synthesizer on how to generate the output audio content.
@@ -349,28 +374,3 @@ message SpeechToTextConfig {
349374 // would emit an error.
350375 SpeechModelVariant speech_model_variant = 1 ;
351376}
352-
353- // Audio encoding of the output audio format in Text-To-Speech.
354- enum OutputAudioEncoding {
355- // Not specified.
356- OUTPUT_AUDIO_ENCODING_UNSPECIFIED = 0 ;
357-
358- // Uncompressed 16-bit signed little-endian samples (Linear PCM).
359- // Audio content returned as LINEAR16 also contains a WAV header.
360- OUTPUT_AUDIO_ENCODING_LINEAR_16 = 1 ;
361-
362- // MP3 audio at 32kbps.
363- OUTPUT_AUDIO_ENCODING_MP3 = 2 ;
364-
365- // MP3 audio at 64kbps.
366- OUTPUT_AUDIO_ENCODING_MP3_64_KBPS = 4 ;
367-
368- // Opus encoded audio wrapped in an ogg container. The result will be a
369- // file which can be played natively on Android, and in browsers (at least
370- // Chrome and Firefox). The quality of the encoding is considerably higher
371- // than MP3 while using approximately the same bitrate.
372- OUTPUT_AUDIO_ENCODING_OGG_OPUS = 3 ;
373-
374- // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
375- OUTPUT_AUDIO_ENCODING_MULAW = 5 ;
376- }
0 commit comments