@@ -16,11 +16,11 @@ syntax = "proto3";
1616
1717package google.cloud.dialogflow.cx.v3beta1 ;
1818
19+ import "google/api/annotations.proto" ;
1920import "google/api/field_behavior.proto" ;
2021import "google/api/resource.proto" ;
2122import "google/protobuf/duration.proto" ;
2223import "google/protobuf/timestamp.proto" ;
23- import "google/api/annotations.proto" ;
2424
2525option cc_enable_arenas = true ;
2626option csharp_namespace = "Google.Cloud.Dialogflow.Cx.V3Beta1" ;
@@ -31,31 +31,6 @@ option java_package = "com.google.cloud.dialogflow.cx.v3beta1";
3131option objc_class_prefix = "DF" ;
3232option ruby_package = "Google::Cloud::Dialogflow::CX::V3beta1" ;
3333
34- // Information for a word recognized by the speech recognizer.
35- message SpeechWordInfo {
36- // The word this info is for.
37- string word = 3 ;
38-
39- // Time offset relative to the beginning of the audio that corresponds to the
40- // start of the spoken word. This is an experimental feature and the accuracy
41- // of the time offset can vary.
42- google.protobuf.Duration start_offset = 1 ;
43-
44- // Time offset relative to the beginning of the audio that corresponds to the
45- // end of the spoken word. This is an experimental feature and the accuracy of
46- // the time offset can vary.
47- google.protobuf.Duration end_offset = 2 ;
48-
49- // The Speech confidence between 0.0 and 1.0 for this word. A higher number
50- // indicates an estimated greater likelihood that the recognized word is
51- // correct. The default of 0.0 is a sentinel value indicating that confidence
52- // was not set.
53- //
54- // This field is not guaranteed to be fully stable over time for the same
55- // audio input. Users should also not rely on it to always be provided.
56- float confidence = 4 ;
57- }
58-
5934// Audio encoding of the audio content sent in the conversational query request.
6035// Refer to the
6136// [Cloud Speech API
@@ -105,6 +80,74 @@ enum AudioEncoding {
10580 AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7 ;
10681}
10782
83+ // Variant of the specified [Speech model][google.cloud.dialogflow.cx.v3beta1.InputAudioConfig.model] to use.
84+ //
85+ // See the [Cloud Speech
86+ // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
87+ // for which models have different variants. For example, the "phone_call" model
88+ // has both a standard and an enhanced variant. When you use an enhanced model,
89+ // you will generally receive higher quality results than for a standard model.
90+ enum SpeechModelVariant {
91+ // No model variant specified. In this case Dialogflow defaults to
92+ // USE_BEST_AVAILABLE.
93+ SPEECH_MODEL_VARIANT_UNSPECIFIED = 0 ;
94+
95+ // Use the best available variant of the [Speech
96+ // model][InputAudioConfig.model] that the caller is eligible for.
97+ //
98+ // Please see the [Dialogflow
99+ // docs](https://cloud.google.com/dialogflow/docs/data-logging) for
100+ // how to make your project eligible for enhanced models.
101+ USE_BEST_AVAILABLE = 1 ;
102+
103+ // Use standard model variant even if an enhanced model is available. See the
104+ // [Cloud Speech
105+ // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
106+ // for details about enhanced models.
107+ USE_STANDARD = 2 ;
108+
109+ // Use an enhanced model variant:
110+ //
111+ // * If an enhanced variant does not exist for the given
112+ // [model][google.cloud.dialogflow.cx.v3beta1.InputAudioConfig.model] and request language, Dialogflow falls
113+ // back to the standard variant.
114+ //
115+ // The [Cloud Speech
116+ // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
117+ // describes which models have enhanced variants.
118+ //
119+ // * If the API caller isn't eligible for enhanced models, Dialogflow returns
120+ // an error. Please see the [Dialogflow
121+ // docs](https://cloud.google.com/dialogflow/docs/data-logging)
122+ // for how to make your project eligible.
123+ USE_ENHANCED = 3 ;
124+ }
125+
126+ // Information for a word recognized by the speech recognizer.
127+ message SpeechWordInfo {
128+ // The word this info is for.
129+ string word = 3 ;
130+
131+ // Time offset relative to the beginning of the audio that corresponds to the
132+ // start of the spoken word. This is an experimental feature and the accuracy
133+ // of the time offset can vary.
134+ google.protobuf.Duration start_offset = 1 ;
135+
136+ // Time offset relative to the beginning of the audio that corresponds to the
137+ // end of the spoken word. This is an experimental feature and the accuracy of
138+ // the time offset can vary.
139+ google.protobuf.Duration end_offset = 2 ;
140+
141+ // The Speech confidence between 0.0 and 1.0 for this word. A higher number
142+ // indicates an estimated greater likelihood that the recognized word is
143+ // correct. The default of 0.0 is a sentinel value indicating that confidence
144+ // was not set.
145+ //
146+ // This field is not guaranteed to be fully stable over time for the same
147+ // audio input. Users should also not rely on it to always be provided.
148+ float confidence = 4 ;
149+ }
150+
108151// Instructs the speech recognizer on how to process the audio content.
109152message InputAudioConfig {
110153 // Required. Audio encoding of the audio content to process.
@@ -158,47 +201,21 @@ message InputAudioConfig {
158201 bool single_utterance = 8 ;
159202}
160203
161- // Variant of the specified [Speech model][google.cloud.dialogflow.cx.v3beta1.InputAudioConfig.model] to use.
162- //
163- // See the [Cloud Speech
164- // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
165- // for which models have different variants. For example, the "phone_call" model
166- // has both a standard and an enhanced variant. When you use an enhanced model,
167- // you will generally receive higher quality results than for a standard model.
168- enum SpeechModelVariant {
169- // No model variant specified. In this case Dialogflow defaults to
170- // USE_BEST_AVAILABLE.
171- SPEECH_MODEL_VARIANT_UNSPECIFIED = 0 ;
204+ // Gender of the voice as described in
205+ // [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
206+ enum SsmlVoiceGender {
207+ // An unspecified gender, which means that the client doesn't care which
208+ // gender the selected voice will have.
209+ SSML_VOICE_GENDER_UNSPECIFIED = 0 ;
172210
173- // Use the best available variant of the [Speech
174- // model][InputAudioConfig.model] that the caller is eligible for.
175- //
176- // Please see the [Dialogflow
177- // docs](https://cloud.google.com/dialogflow/docs/data-logging) for
178- // how to make your project eligible for enhanced models.
179- USE_BEST_AVAILABLE = 1 ;
211+ // A male voice.
212+ SSML_VOICE_GENDER_MALE = 1 ;
180213
181- // Use standard model variant even if an enhanced model is available. See the
182- // [Cloud Speech
183- // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
184- // for details about enhanced models.
185- USE_STANDARD = 2 ;
214+ // A female voice.
215+ SSML_VOICE_GENDER_FEMALE = 2 ;
186216
187- // Use an enhanced model variant:
188- //
189- // * If an enhanced variant does not exist for the given
190- // [model][google.cloud.dialogflow.cx.v3beta1.InputAudioConfig.model] and request language, Dialogflow falls
191- // back to the standard variant.
192- //
193- // The [Cloud Speech
194- // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
195- // describes which models have enhanced variants.
196- //
197- // * If the API caller isn't eligible for enhanced models, Dialogflow returns
198- // an error. Please see the [Dialogflow
199- // docs](https://cloud.google.com/dialogflow/docs/data-logging)
200- // for how to make your project eligible.
201- USE_ENHANCED = 3 ;
217+ // A gender-neutral voice.
218+ SSML_VOICE_GENDER_NEUTRAL = 3 ;
202219}
203220
204221// Description of which voice to use for speech synthesis.
@@ -251,39 +268,6 @@ message SynthesizeSpeechConfig {
251268 VoiceSelectionParams voice = 4 ;
252269}
253270
254- // Instructs the speech synthesizer how to generate the output audio content.
255- message OutputAudioConfig {
256- // Required. Audio encoding of the synthesized audio content.
257- OutputAudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
258-
259- // Optional. The synthesis sample rate (in hertz) for this audio. If not
260- // provided, then the synthesizer will use the default sample rate based on
261- // the audio encoding. If this is different from the voice's natural sample
262- // rate, then the synthesizer will honor this request by converting to the
263- // desired sample rate (which might result in worse audio quality).
264- int32 sample_rate_hertz = 2 ;
265-
266- // Optional. Configuration of how speech should be synthesized.
267- SynthesizeSpeechConfig synthesize_speech_config = 3 ;
268- }
269-
270- // Gender of the voice as described in
271- // [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
272- enum SsmlVoiceGender {
273- // An unspecified gender, which means that the client doesn't care which
274- // gender the selected voice will have.
275- SSML_VOICE_GENDER_UNSPECIFIED = 0 ;
276-
277- // A male voice.
278- SSML_VOICE_GENDER_MALE = 1 ;
279-
280- // A female voice.
281- SSML_VOICE_GENDER_FEMALE = 2 ;
282-
283- // A gender-neutral voice.
284- SSML_VOICE_GENDER_NEUTRAL = 3 ;
285- }
286-
287271// Audio encoding of the output audio format in Text-To-Speech.
288272enum OutputAudioEncoding {
289273 // Not specified.
@@ -308,3 +292,19 @@ enum OutputAudioEncoding {
308292 // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
309293 OUTPUT_AUDIO_ENCODING_MULAW = 5 ;
310294}
295+
296+ // Instructs the speech synthesizer how to generate the output audio content.
297+ message OutputAudioConfig {
298+ // Required. Audio encoding of the synthesized audio content.
299+ OutputAudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
300+
301+ // Optional. The synthesis sample rate (in hertz) for this audio. If not
302+ // provided, then the synthesizer will use the default sample rate based on
303+ // the audio encoding. If this is different from the voice's natural sample
304+ // rate, then the synthesizer will honor this request by converting to the
305+ // desired sample rate (which might result in worse audio quality).
306+ int32 sample_rate_hertz = 2 ;
307+
308+ // Optional. Configuration of how speech should be synthesized.
309+ SynthesizeSpeechConfig synthesize_speech_config = 3 ;
310+ }
0 commit comments