1- // Copyright 2019 Google LLC.
1+ // Copyright 2021 Google LLC
22//
33// Licensed under the Apache License, Version 2.0 (the "License");
44// you may not use this file except in compliance with the License.
1111// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212// See the License for the specific language governing permissions and
1313// limitations under the License.
14- //
1514
1615syntax = "proto3" ;
1716
@@ -24,6 +23,7 @@ import "google/longrunning/operations.proto";
2423import "google/protobuf/any.proto" ;
2524import "google/protobuf/duration.proto" ;
2625import "google/protobuf/timestamp.proto" ;
26+ import "google/protobuf/wrappers.proto" ;
2727import "google/rpc/status.proto" ;
2828
2929option cc_enable_arenas = true ;
@@ -136,6 +136,16 @@ message StreamingRecognitionConfig {
136136 // `END_OF_SINGLE_UTTERANCE` event and cease recognition. It will return no
137137 // more than one `StreamingRecognitionResult` with the `is_final` flag set to
138138 // `true`.
139+ //
140+ // The `single_utterance` field can only be used with specified models;
141+ // otherwise an error is thrown. The `model` field in [`RecognitionConfig`][]
142+ // must be set to:
143+ //
144+ // * `command_and_search`
145+ // * `phone_call` AND additional field `useEnhanced`=`true`
146+ // * The `model` field is left undefined. In this case the API auto-selects
147+ // a model based on any other parameters that you set in
148+ // `RecognitionConfig`.
139149 bool single_utterance = 2 ;
140150
141151 // If `true`, interim results (tentative hypotheses) may be
@@ -158,7 +168,7 @@ message RecognitionConfig {
158168 // a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
159169 // recognition can be reduced if lossy codecs are used to capture or transmit
160170 // audio, particularly if background noise is present. Lossy codecs include
161- // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, and `MP3`.
171+ // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, and `MP3`.
162172 //
163173 // The `FLAC` and `WAV` audio file formats include a header that describes the
164174 // included audio content. You can request recognition for `WAV` files that
@@ -274,7 +284,7 @@ message RecognitionConfig {
274284 // A means to provide context to assist the speech recognition. For more
275285 // information, see
276286 // [speech
277- // adaptation](https://cloud.google.com/speech-to-text/docs/context-strength ).
287+ // adaptation](https://cloud.google.com/speech-to-text/docs/adaptation ).
278288 repeated SpeechContext speech_contexts = 6 ;
279289
280290 // If `true`, the top result includes a list of words and
@@ -287,9 +297,6 @@ message RecognitionConfig {
287297 // This feature is only available in select languages. Setting this for
288298 // requests in other languages has no effect at all.
289299 // The default 'false' value does not add punctuation to result hypotheses.
290- // Note: This is currently offered as an experimental service, complimentary
291- // to all users. In the future this may be exclusively available as a
292- // premium feature.
293300 bool enable_automatic_punctuation = 11 ;
294301
295302 // Config to enable speaker diarization and set additional
@@ -325,7 +332,7 @@ message RecognitionConfig {
325332 // </tr>
326333 // <tr>
327334 // <td><code>video</code></td>
328- // <td>Best for audio that originated from from video or includes multiple
335+ // <td>Best for audio that originated from video or includes multiple
329336 // speakers. Ideally the audio is recorded at a 16khz or greater
330337 // sampling rate. This is a premium model that costs more than the
331338 // standard rate.</td>
@@ -367,9 +374,11 @@ message SpeakerDiarizationConfig {
367374 // number of speakers. If not set, the default value is 6.
368375 int32 max_speaker_count = 3 ;
369376
370- // Unused.
371- int32 speaker_tag = 5
372- [(google.api.field_behavior ) = OUTPUT_ONLY , deprecated = true ];
377+ // Output only. Unused.
378+ int32 speaker_tag = 5 [
379+ deprecated = true ,
380+ (google.api.field_behavior ) = OUTPUT_ONLY
381+ ];
373382}
374383
375384// Description of audio data to be recognized.
@@ -548,6 +557,9 @@ message RecognizeResponse {
548557 // Sequential list of transcription results corresponding to
549558 // sequential portions of audio.
550559 repeated SpeechRecognitionResult results = 2 ;
560+
561+ // When available, billed audio seconds for the corresponding request.
562+ google.protobuf.Duration total_billed_time = 3 ;
551563}
552564
553565// The only message returned to the client by the `LongRunningRecognize` method.
@@ -559,6 +571,9 @@ message LongRunningRecognizeResponse {
559571 // Sequential list of transcription results corresponding to
560572 // sequential portions of audio.
561573 repeated SpeechRecognitionResult results = 2 ;
574+
575+ // When available, billed audio seconds for the corresponding request.
576+ google.protobuf.Duration total_billed_time = 3 ;
562577}
563578
564579// Describes the progress of a long-running `LongRunningRecognize` call. It is
@@ -574,6 +589,10 @@ message LongRunningRecognizeMetadata {
574589
575590 // Time of the most recent processing update.
576591 google.protobuf.Timestamp last_update_time = 3 ;
592+
593+ // Output only. The URI of the audio file being transcribed. Empty if the audio was sent
594+ // as byte content.
595+ string uri = 4 [(google.api.field_behavior ) = OUTPUT_ONLY ];
577596}
578597
579598// `StreamingRecognizeResponse` is the only message returned to the client by
@@ -582,8 +601,8 @@ message LongRunningRecognizeMetadata {
582601// audio, and `single_utterance` is set to false, then no messages are streamed
583602// back to the client.
584603//
585- // Here's an example of a series of ten `StreamingRecognizeResponse`s that might
586- // be returned while processing audio:
604+ // Here's an example of a series of `StreamingRecognizeResponse`s that might be
605+ // returned while processing audio:
587606//
588607// 1. results { alternatives { transcript: "tube" } stability: 0.01 }
589608//
@@ -653,6 +672,10 @@ message StreamingRecognizeResponse {
653672
654673 // Indicates the type of speech event.
655674 SpeechEventType speech_event_type = 4 ;
675+
676+ // When available, billed audio seconds for the stream.
677+ // Set only if this is the last response in the stream.
678+ google.protobuf.Duration total_billed_time = 5 ;
656679}
657680
658681// A streaming speech recognition result corresponding to a portion of the audio
@@ -749,11 +772,10 @@ message WordInfo {
749772 // The word corresponding to this set of information.
750773 string word = 3 ;
751774
752- // A distinct integer value is assigned for every speaker within
775+ // Output only. A distinct integer value is assigned for every speaker within
753776 // the audio. This field specifies which one of those speakers was detected to
754777 // have spoken this word. Value ranges from '1' to diarization_speaker_count.
755778 // speaker_tag is set if enable_speaker_diarization = 'true' and only in the
756779 // top alternative.
757- int32 speaker_tag = 5
758- [(google.api.field_behavior ) = OUTPUT_ONLY ];
780+ int32 speaker_tag = 5 [(google.api.field_behavior ) = OUTPUT_ONLY ];
759781}