@@ -29,6 +29,31 @@ option java_outer_classname = "AudioConfigProto";
2929option java_package = "com.google.cloud.dialogflow.cx.v3" ;
3030option objc_class_prefix = "DF" ;
3131
32+ // Information for a word recognized by the speech recognizer.
33+ message SpeechWordInfo {
34+ // The word this info is for.
35+ string word = 3 ;
36+
37+ // Time offset relative to the beginning of the audio that corresponds to the
38+ // start of the spoken word. This is an experimental feature and the accuracy
39+ // of the time offset can vary.
40+ google.protobuf.Duration start_offset = 1 ;
41+
42+ // Time offset relative to the beginning of the audio that corresponds to the
43+ // end of the spoken word. This is an experimental feature and the accuracy of
44+ // the time offset can vary.
45+ google.protobuf.Duration end_offset = 2 ;
46+
47+ // The Speech confidence between 0.0 and 1.0 for this word. A higher number
48+ // indicates an estimated greater likelihood that the recognized word is
49+ // correct. The default of 0.0 is a sentinel value indicating that confidence
50+ // was not set.
51+ //
52+ // This field is not guaranteed to be fully stable over time for the same
53+ // audio input. Users should also not rely on it to always be provided.
54+ float confidence = 4 ;
55+ }
56+
3257// Audio encoding of the audio content sent in the conversational query request.
3358// Refer to the
3459// [Cloud Speech API
@@ -78,31 +103,6 @@ enum AudioEncoding {
78103 AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7 ;
79104}
80105
81- // Information for a word recognized by the speech recognizer.
82- message SpeechWordInfo {
83- // The word this info is for.
84- string word = 3 ;
85-
86- // Time offset relative to the beginning of the audio that corresponds to the
87- // start of the spoken word. This is an experimental feature and the accuracy
88- // of the time offset can vary.
89- google.protobuf.Duration start_offset = 1 ;
90-
91- // Time offset relative to the beginning of the audio that corresponds to the
92- // end of the spoken word. This is an experimental feature and the accuracy of
93- // the time offset can vary.
94- google.protobuf.Duration end_offset = 2 ;
95-
96- // The Speech confidence between 0.0 and 1.0 for this word. A higher number
97- // indicates an estimated greater likelihood that the recognized word is
98- // correct. The default of 0.0 is a sentinel value indicating that confidence
99- // was not set.
100- //
101- // This field is not guaranteed to be fully stable over time for the same
102- // audio input. Users should also not rely on it to always be provided.
103- float confidence = 4 ;
104- }
105-
106106// Instructs the speech recognizer on how to process the audio content.
107107message InputAudioConfig {
108108 // Required. Audio encoding of the audio content to process.
@@ -250,6 +250,22 @@ message SynthesizeSpeechConfig {
250250 VoiceSelectionParams voice = 4 ;
251251}
252252
253+ // Instructs the speech synthesizer how to generate the output audio content.
254+ message OutputAudioConfig {
255+ // Required. Audio encoding of the synthesized audio content.
256+ OutputAudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
257+
258+ // Optional. The synthesis sample rate (in hertz) for this audio. If not
259+ // provided, then the synthesizer will use the default sample rate based on
260+ // the audio encoding. If this is different from the voice's natural sample
261+ // rate, then the synthesizer will honor this request by converting to the
262+ // desired sample rate (which might result in worse audio quality).
263+ int32 sample_rate_hertz = 2 ;
264+
265+ // Optional. Configuration of how speech should be synthesized.
266+ SynthesizeSpeechConfig synthesize_speech_config = 3 ;
267+ }
268+
253269// Gender of the voice as described in
254270// [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
255271enum SsmlVoiceGender {
@@ -267,22 +283,6 @@ enum SsmlVoiceGender {
267283 SSML_VOICE_GENDER_NEUTRAL = 3 ;
268284}
269285
270- // Instructs the speech synthesizer how to generate the output audio content.
271- message OutputAudioConfig {
272- // Required. Audio encoding of the synthesized audio content.
273- OutputAudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
274-
275- // Optional. The synthesis sample rate (in hertz) for this audio. If not
276- // provided, then the synthesizer will use the default sample rate based on
277- // the audio encoding. If this is different from the voice's natural sample
278- // rate, then the synthesizer will honor this request by converting to the
279- // desired sample rate (which might result in worse audio quality).
280- int32 sample_rate_hertz = 2 ;
281-
282- // Optional. Configuration of how speech should be synthesized.
283- SynthesizeSpeechConfig synthesize_speech_config = 3 ;
284- }
285-
286286// Audio encoding of the output audio format in Text-To-Speech.
287287enum OutputAudioEncoding {
288288 // Not specified.
0 commit comments