@@ -29,6 +29,31 @@ option java_outer_classname = "AudioConfigProto";
2929option java_package = "com.google.cloud.dialogflow.cx.v3beta1" ;
3030option objc_class_prefix = "DF" ;
3131
32+ // Recognition details for a single word produced by the speech recognizer.
33+ message SpeechWordInfo {
34+ // The recognized word that this info describes.
35+ string word = 3 ;
36+
37+ // Offset from the beginning of the audio to the point where the spoken
38+ // word starts. This is an experimental feature and the accuracy of the
39+ // time offset can vary.
40+ google.protobuf.Duration start_offset = 1 ;
41+
42+ // Offset from the beginning of the audio to the point where the spoken
43+ // word ends. This is an experimental feature and the accuracy of the
44+ // time offset can vary.
45+ google.protobuf.Duration end_offset = 2 ;
46+
47+ // The speech recognition confidence for this word, between 0.0 and 1.0. A
48+ // higher number indicates a greater estimated likelihood that the word was
49+ // recognized correctly. The default of 0.0 is a sentinel value meaning that
50+ // confidence was not set, so 0.0 does not mean the word is wrong.
51+ //
52+ // This value is not guaranteed to be stable over time for the same audio
53+ // input, and callers should not rely on it always being provided.
54+ float confidence = 4 ;
55+ }
56+
3257// Audio encoding of the audio content sent in the conversational query request.
3358// Refer to the
3459// [Cloud Speech API
@@ -78,31 +103,6 @@ enum AudioEncoding {
78103 AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7 ;
79104}
80105
81- // Recognition details for a single word produced by the speech recognizer.
82- message SpeechWordInfo {
83- // The recognized word that this info describes.
84- string word = 3 ;
85-
86- // Offset from the beginning of the audio to the point where the spoken
87- // word starts. This is an experimental feature and the accuracy of the
88- // time offset can vary.
89- google.protobuf.Duration start_offset = 1 ;
90-
91- // Offset from the beginning of the audio to the point where the spoken
92- // word ends. This is an experimental feature and the accuracy of the
93- // time offset can vary.
94- google.protobuf.Duration end_offset = 2 ;
95-
96- // The speech recognition confidence for this word, between 0.0 and 1.0. A
97- // higher number indicates a greater estimated likelihood that the word was
98- // recognized correctly. The default of 0.0 is a sentinel value meaning that
99- // confidence was not set, so 0.0 does not mean the word is wrong.
100- //
101- // This value is not guaranteed to be stable over time for the same audio
102- // input, and callers should not rely on it always being provided.
103- float confidence = 4 ;
104- }
105-
106106// Instructs the speech recognizer on how to process the audio content.
107107message InputAudioConfig {
108108 // Required. Audio encoding of the audio content to process.
@@ -249,6 +249,22 @@ message SynthesizeSpeechConfig {
249249 VoiceSelectionParams voice = 4 ;
250250}
251251
252+ // Tells the speech synthesizer how to generate the output audio content.
253+ message OutputAudioConfig {
254+ // Required. Audio encoding of the synthesized audio content.
255+ OutputAudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
256+
257+ // Optional. The sample rate, in hertz, of the synthesized audio. If not
258+ // provided, the synthesizer uses the default sample rate for the chosen
259+ // audio encoding. If this differs from the voice's natural sample rate,
260+ // the synthesizer honors the request by converting to the desired sample
261+ // rate (which might result in worse audio quality).
262+ int32 sample_rate_hertz = 2 ;
263+
264+ // Optional. Settings that control how the speech is synthesized.
265+ SynthesizeSpeechConfig synthesize_speech_config = 3 ;
266+ }
267+
252268// Gender of the voice as described in
253269// [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
254270enum SsmlVoiceGender {
@@ -266,22 +282,6 @@ enum SsmlVoiceGender {
266282 SSML_VOICE_GENDER_NEUTRAL = 3 ;
267283}
268284
269- // Tells the speech synthesizer how to generate the output audio content.
270- message OutputAudioConfig {
271- // Required. Audio encoding of the synthesized audio content.
272- OutputAudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
273-
274- // Optional. The sample rate, in hertz, of the synthesized audio. If not
275- // provided, the synthesizer uses the default sample rate for the chosen
276- // audio encoding. If this differs from the voice's natural sample rate,
277- // the synthesizer honors the request by converting to the desired sample
278- // rate (which might result in worse audio quality).
279- int32 sample_rate_hertz = 2 ;
280-
281- // Optional. Settings that control how the speech is synthesized.
282- SynthesizeSpeechConfig synthesize_speech_config = 3 ;
283- }
284-
285285// Audio encoding of the output audio format in Text-To-Speech.
286286enum OutputAudioEncoding {
287287 // Not specified.
0 commit comments