@@ -16,12 +16,12 @@ syntax = "proto3";
1616
1717package google.cloud.dialogflow.v2beta1 ;
1818
19+ import "google/api/annotations.proto" ;
1920import "google/api/field_behavior.proto" ;
2021import "google/api/resource.proto" ;
2122import "google/protobuf/duration.proto" ;
2223import "google/protobuf/field_mask.proto" ;
2324import "google/protobuf/timestamp.proto" ;
24- import "google/api/annotations.proto" ;
2525
2626option cc_enable_arenas = true ;
2727option csharp_namespace = "Google.Cloud.Dialogflow.V2beta1" ;
@@ -31,36 +31,6 @@ option java_outer_classname = "AudioConfigProto";
3131option java_package = "com.google.cloud.dialogflow.v2beta1" ;
3232option objc_class_prefix = "DF" ;
3333
34- // Hints for the speech recognizer to help with recognition in a specific
35- // conversation state.
36- message SpeechContext {
37- // Optional. A list of strings containing words and phrases that the speech
38- // recognizer should recognize with higher likelihood.
39- //
40- // This list can be used to:
41- //
42- // * improve accuracy for words and phrases you expect the user to say,
43- // e.g. typical commands for your Dialogflow agent
44- // * add additional words to the speech recognizer vocabulary
45- // * ...
46- //
47- // See the [Cloud Speech
48- // documentation](https://cloud.google.com/speech-to-text/quotas) for usage
49- // limits.
50- repeated string phrases = 1 ;
51-
52- // Optional. Boost for this context compared to other contexts:
53- //
54- // * If the boost is positive, Dialogflow will increase the probability that
55- // the phrases in this context are recognized over similar sounding phrases.
56- // * If the boost is unspecified or non-positive, Dialogflow will not apply
57- // any boost.
58- //
59- // Dialogflow recommends that you use boosts in the range (0, 20] and that you
60- // find a value that fits your use case with binary search.
61- float boost = 2 ;
62- }
63-
6434// Audio encoding of the audio content sent in the conversational query request.
6535// Refer to the
6636// [Cloud Speech API
@@ -110,29 +80,34 @@ enum AudioEncoding {
11080 AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7 ;
11181}
11282
113- // Information for a word recognized by the speech recognizer.
114- message SpeechWordInfo {
115- // The word this info is for.
116- string word = 3 ;
117-
118- // Time offset relative to the beginning of the audio that corresponds to the
119- // start of the spoken word. This is an experimental feature and the accuracy
120- // of the time offset can vary.
121- google.protobuf.Duration start_offset = 1 ;
122-
123- // Time offset relative to the beginning of the audio that corresponds to the
124- // end of the spoken word. This is an experimental feature and the accuracy of
125- // the time offset can vary.
126- google.protobuf.Duration end_offset = 2 ;
83+ // Hints for the speech recognizer to help with recognition in a specific
84+ // conversation state.
85+ message SpeechContext {
86+ // Optional. A list of strings containing words and phrases that the speech
87+ // recognizer should recognize with higher likelihood.
88+ //
89+ // This list can be used to:
90+ //
91+ // * improve accuracy for words and phrases you expect the user to say,
92+ // e.g. typical commands for your Dialogflow agent
93+ // * add additional words to the speech recognizer vocabulary
94+ // * ...
95+ //
96+ // See the [Cloud Speech
97+ // documentation](https://cloud.google.com/speech-to-text/quotas) for usage
98+ // limits.
99+ repeated string phrases = 1 ;
127100
128- // The Speech confidence between 0.0 and 1.0 for this word. A higher number
129- // indicates an estimated greater likelihood that the recognized word is
130- // correct. The default of 0.0 is a sentinel value indicating that confidence
131- // was not set.
101+ // Optional. Boost for this context compared to other contexts:
132102 //
133- // This field is not guaranteed to be fully stable over time for the same
134- // audio input. Users should also not rely on it to always be provided.
135- float confidence = 4 ;
103+ // * If the boost is positive, Dialogflow will increase the probability that
104+ // the phrases in this context are recognized over similar sounding phrases.
105+ // * If the boost is unspecified or non-positive, Dialogflow will not apply
106+ // any boost.
107+ //
108+ // Dialogflow recommends that you use boosts in the range (0, 20] and that you
109+ // find a value that fits your use case with binary search.
110+ float boost = 2 ;
136111}
137112
138113// Variant of the specified [Speech model][google.cloud.dialogflow.v2beta1.InputAudioConfig.model] to use.
@@ -178,6 +153,31 @@ enum SpeechModelVariant {
178153 USE_ENHANCED = 3 ;
179154}
180155
156+ // Information for a word recognized by the speech recognizer.
157+ message SpeechWordInfo {
158+ // The word this info is for.
159+ string word = 3 ;
160+
161+ // Time offset relative to the beginning of the audio that corresponds to the
162+ // start of the spoken word. This is an experimental feature and the accuracy
163+ // of the time offset can vary.
164+ google.protobuf.Duration start_offset = 1 ;
165+
166+ // Time offset relative to the beginning of the audio that corresponds to the
167+ // end of the spoken word. This is an experimental feature and the accuracy of
168+ // the time offset can vary.
169+ google.protobuf.Duration end_offset = 2 ;
170+
171+ // The Speech confidence between 0.0 and 1.0 for this word. A higher number
172+ // indicates an estimated greater likelihood that the recognized word is
173+ // correct. The default of 0.0 is a sentinel value indicating that confidence
174+ // was not set.
175+ //
176+ // This field is not guaranteed to be fully stable over time for the same
177+ // audio input. Users should also not rely on it to always be provided.
178+ float confidence = 4 ;
179+ }
180+
181181// Instructs the speech recognizer on how to process the audio content.
182182message InputAudioConfig {
183183 // Required. Audio encoding of the audio content to process.
@@ -257,6 +257,23 @@ message InputAudioConfig {
257257 bool disable_no_speech_recognized_event = 14 ;
258258}
259259
260+ // Gender of the voice as described in
261+ // [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
262+ enum SsmlVoiceGender {
263+ // An unspecified gender, which means that the client doesn't care which
264+ // gender the selected voice will have.
265+ SSML_VOICE_GENDER_UNSPECIFIED = 0 ;
266+
267+ // A male voice.
268+ SSML_VOICE_GENDER_MALE = 1 ;
269+
270+ // A female voice.
271+ SSML_VOICE_GENDER_FEMALE = 2 ;
272+
273+ // A gender-neutral voice.
274+ SSML_VOICE_GENDER_NEUTRAL = 3 ;
275+ }
276+
260277// Description of which voice to use for speech synthesis.
261278message VoiceSelectionParams {
262279 // Optional. The name of the voice. If not set, the service will choose a
@@ -307,47 +324,6 @@ message SynthesizeSpeechConfig {
307324 VoiceSelectionParams voice = 4 ;
308325}
309326
310- // Gender of the voice as described in
311- // [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
312- enum SsmlVoiceGender {
313- // An unspecified gender, which means that the client doesn't care which
314- // gender the selected voice will have.
315- SSML_VOICE_GENDER_UNSPECIFIED = 0 ;
316-
317- // A male voice.
318- SSML_VOICE_GENDER_MALE = 1 ;
319-
320- // A female voice.
321- SSML_VOICE_GENDER_FEMALE = 2 ;
322-
323- // A gender-neutral voice.
324- SSML_VOICE_GENDER_NEUTRAL = 3 ;
325- }
326-
327- // Instructs the speech synthesizer how to generate the output audio content.
328- // If this audio config is supplied in a request, it overrides all existing
329- // text-to-speech settings applied to the agent.
330- message OutputAudioConfig {
331- // Required. Audio encoding of the synthesized audio content.
332- OutputAudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
333-
334- // The synthesis sample rate (in hertz) for this audio. If not
335- // provided, then the synthesizer will use the default sample rate based on
336- // the audio encoding. If this is different from the voice's natural sample
337- // rate, then the synthesizer will honor this request by converting to the
338- // desired sample rate (which might result in worse audio quality).
339- int32 sample_rate_hertz = 2 ;
340-
341- // Configuration of how speech should be synthesized.
342- SynthesizeSpeechConfig synthesize_speech_config = 3 ;
343- }
344-
345- // A wrapper of repeated TelephonyDtmf digits.
346- message TelephonyDtmfEvents {
347- // A sequence of TelephonyDtmf digits.
348- repeated TelephonyDtmf dtmf_events = 1 ;
349- }
350-
351327// Audio encoding of the output audio format in Text-To-Speech.
352328enum OutputAudioEncoding {
353329 // Not specified.
@@ -373,16 +349,22 @@ enum OutputAudioEncoding {
373349 OUTPUT_AUDIO_ENCODING_MULAW = 5 ;
374350}
375351
376- // Configures speech transcription for [ConversationProfile][google.cloud.dialogflow.v2beta1.ConversationProfile].
377- message SpeechToTextConfig {
378- // The speech model used in speech to text.
379- // `SPEECH_MODEL_VARIANT_UNSPECIFIED`, `USE_BEST_AVAILABLE` will be treated as
380- // `USE_ENHANCED`. It can be overridden in [AnalyzeContentRequest][google.cloud.dialogflow.v2beta1.AnalyzeContentRequest] and
381- // [StreamingAnalyzeContentRequest][google.cloud.dialogflow.v2beta1.StreamingAnalyzeContentRequest] request.
382- // If enhanced model variant is specified and an enhanced
383- // version of the specified model for the language does not exist, then it
384- // would emit an error.
385- SpeechModelVariant speech_model_variant = 1 ;
352+ // Instructs the speech synthesizer how to generate the output audio content.
353+ // If this audio config is supplied in a request, it overrides all existing
354+ // text-to-speech settings applied to the agent.
355+ message OutputAudioConfig {
356+ // Required. Audio encoding of the synthesized audio content.
357+ OutputAudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
358+
359+ // The synthesis sample rate (in hertz) for this audio. If not
360+ // provided, then the synthesizer will use the default sample rate based on
361+ // the audio encoding. If this is different from the voice's natural sample
362+ // rate, then the synthesizer will honor this request by converting to the
363+ // desired sample rate (which might result in worse audio quality).
364+ int32 sample_rate_hertz = 2 ;
365+
366+ // Configuration of how speech should be synthesized.
367+ SynthesizeSpeechConfig synthesize_speech_config = 3 ;
386368}
387369
388370// [DTMF](https://en.wikipedia.org/wiki/Dual-tone_multi-frequency_signaling)
@@ -439,3 +421,21 @@ enum TelephonyDtmf {
439421 // Pound/diamond/hash/square/gate/octothorpe: '#'.
440422 DTMF_POUND = 16 ;
441423}
424+
425+ // A wrapper of repeated TelephonyDtmf digits.
426+ message TelephonyDtmfEvents {
427+ // A sequence of TelephonyDtmf digits.
428+ repeated TelephonyDtmf dtmf_events = 1 ;
429+ }
430+
431+ // Configures speech transcription for [ConversationProfile][google.cloud.dialogflow.v2beta1.ConversationProfile].
432+ message SpeechToTextConfig {
433+ // The speech model used in speech to text.
434+ // `SPEECH_MODEL_VARIANT_UNSPECIFIED`, `USE_BEST_AVAILABLE` will be treated as
435+ // `USE_ENHANCED`. It can be overridden in [AnalyzeContentRequest][google.cloud.dialogflow.v2beta1.AnalyzeContentRequest] and
436+ // [StreamingAnalyzeContentRequest][google.cloud.dialogflow.v2beta1.StreamingAnalyzeContentRequest] request.
437+ // If enhanced model variant is specified and an enhanced
438+ // version of the specified model for the language does not exist, then it
439+ // would emit an error.
440+ SpeechModelVariant speech_model_variant = 1 ;
441+ }
0 commit comments