Skip to content

Commit 7ee7867

Browse files
Google APIs copybara-github
authored and committed
feat: Add support for BatchRecognize
Deprecate the `update_config_request` field in the `OperationMetadata` message, which was never used and is output only.

PiperOrigin-RevId: 517976288
1 parent 247a5da commit 7ee7867

3 files changed

Lines changed: 166 additions & 40 deletions

File tree

google/cloud/speech/v2/BUILD.bazel

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ proto_library_with_info(
4141
deps = [
4242
":speech_proto",
4343
"//google/cloud:common_resources_proto",
44+
"//google/cloud/location:location_proto",
4445
],
4546
)
4647

@@ -76,11 +77,13 @@ java_gapic_library(
7677
service_yaml = "speech_v2.yaml",
7778
test_deps = [
7879
":speech_java_grpc",
80+
"//google/cloud/location:location_java_grpc",
7981
],
8082
transport = "grpc+rest",
8183
deps = [
8284
":speech_java_proto",
8385
"//google/api:api_java_proto",
86+
"//google/cloud/location:location_java_proto",
8487
],
8588
)
8689

@@ -141,6 +144,7 @@ go_gapic_library(
141144
transport = "grpc+rest",
142145
deps = [
143146
":speech_go_proto",
147+
"//google/cloud/location:location_go_proto",
144148
"//google/longrunning:longrunning_go_proto",
145149
"@com_google_cloud_go_longrunning//:go_default_library",
146150
"@com_google_cloud_go_longrunning//autogen:go_default_library",

google/cloud/speech/v2/cloud_speech.proto

Lines changed: 134 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2022 Google LLC
1+
// Copyright 2023 Google LLC
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -70,7 +70,7 @@ service Speech {
7070

7171
// Returns the requested
7272
// [Recognizer][google.cloud.speech.v2.Recognizer]. Fails with
73-
// [NOT_FOUND][google.rpc.Code.NOT_FOUND] if the requested recognizer doesn't
73+
// [NOT_FOUND][google.rpc.Code.NOT_FOUND] if the requested Recognizer doesn't
7474
// exist.
7575
rpc GetRecognizer(GetRecognizerRequest) returns (Recognizer) {
7676
option (google.api.http) = {
@@ -417,14 +417,14 @@ message OperationMetadata {
417417
UndeletePhraseSetRequest undelete_phrase_set_request = 20;
418418

419419
// The UpdateConfigRequest that spawned the Operation.
420-
UpdateConfigRequest update_config_request = 21;
420+
UpdateConfigRequest update_config_request = 21 [deprecated = true];
421421
}
422422

423423
// The percent progress of the Operation. Values can range from 0-100. If the
424424
// value is 100, then the operation is finished.
425425
int32 progress_percent = 22;
426426

427-
// Specific metadata per RPC
427+
// Specific metadata per RPC.
428428
oneof metadata {
429429
// Metadata specific to the BatchRecognize method.
430430
BatchRecognizeMetadata batch_recognize_metadata = 23;
@@ -600,22 +600,44 @@ message Recognizer {
600600
// When using this model, the service will stop transcribing audio after the
601601
// first utterance is detected and completed.
602602
//
603-
// When using this model,
604-
// [SEPARATE_RECOGNITION_PER_CHANNEL][google.cloud.speech.v2.RecognitionFeatures.MultiChannelMode.SEPARATE_RECOGNITION_PER_CHANNEL]
605-
// is not supported; multi-channel audio is accepted, but only the first
606-
// channel will be processed and transcribed.
603+
// When using this model,
604+
// [SEPARATE_RECOGNITION_PER_CHANNEL][google.cloud.speech.v2.RecognitionFeatures.MultiChannelMode.SEPARATE_RECOGNITION_PER_CHANNEL]
605+
// is not supported; multi-channel audio is accepted, but only the first
606+
// channel will be processed and transcribed.
607+
//
608+
// - `telephony`
609+
//
610+
// Best for audio that originated from a phone call (typically recorded at
611+
// an 8khz sampling rate).
612+
//
613+
// - `medical_conversation`
614+
//
615+
// For conversations between a medical provider—for example, a doctor or
616+
// nurse—and a patient. Use this model when both a provider and a patient
617+
// are speaking. Words uttered by each speaker are automatically detected
618+
// and labeled in the returned transcript.
619+
//
620+
// For supported features please see [medical models
621+
// documentation](https://cloud.google.com/speech-to-text/docs/medical-models).
622+
//
623+
// - `medical_dictation`
624+
//
625+
// For dictated notes spoken by a single medical provider—for example, a
626+
// doctor dictating notes about a patient's blood test results.
627+
//
628+
// For supported features please see [medical models
629+
// documentation](https://cloud.google.com/speech-to-text/docs/medical-models).
630+
//
631+
// - `usm`
632+
//
633+
// The next generation of Speech-to-Text models from Google.
607634
string model = 4 [(google.api.field_behavior) = REQUIRED];
608635

609636
// Required. The language of the supplied audio as a
610637
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
611638
//
612-
// Supported languages:
613-
//
614-
// - `en-US`
615-
//
616-
// - `en-GB`
617-
//
618-
// - `fr-FR`
639+
// Supported languages for each model are listed at:
640+
// https://cloud.google.com/speech-to-text/docs/languages
619641
//
620642
// If additional languages are provided, recognition result will contain
621643
// recognition in the most likely language detected. The recognition result
@@ -689,14 +711,23 @@ message Recognizer {
689711

690712
// Automatically detected decoding parameters.
691713
// Supported for the following encodings:
714+
//
692715
// * WAV_LINEAR16: 16-bit signed little-endian PCM samples in a WAV container.
716+
//
693717
// * WAV_MULAW: 8-bit companded mulaw samples in a WAV container.
718+
//
694719
// * WAV_ALAW: 8-bit companded alaw samples in a WAV container.
720+
//
695721
// * RFC4867_5_AMR: AMR frames with an rfc4867.5 header.
722+
//
696723
// * RFC4867_5_AMRWB: AMR-WB frames with an rfc4867.5 header.
724+
//
697725
// * FLAC: FLAC frames in the "native FLAC" container format.
726+
//
698727
// * MP3: MPEG audio frames with optional (ignored) ID3 metadata.
728+
//
699729
// * OGG_OPUS: Opus audio frames in an Ogg container.
730+
//
700731
// * WEBM_OPUS: Opus audio frames in a WebM container.
701732
message AutoDetectDecodingConfig {}
702733

@@ -725,24 +756,32 @@ message ExplicitDecodingConfig {
725756
// sampling rate of the audio source to 16000 Hz. If that's not possible, use
726757
// the native sample rate of the audio source (instead of re-sampling).
727758
// Supported for the following encodings:
759+
//
728760
// * LINEAR16: Headerless 16-bit signed little-endian PCM samples.
761+
//
729762
// * MULAW: Headerless 8-bit companded mulaw samples.
763+
//
730764
// * ALAW: Headerless 8-bit companded alaw samples.
731765
int32 sample_rate_hertz = 2;
732766

733767
// Number of channels present in the audio data sent for recognition.
734768
// Supported for the following encodings:
769+
//
735770
// * LINEAR16: Headerless 16-bit signed little-endian PCM samples.
771+
//
736772
// * MULAW: Headerless 8-bit companded mulaw samples.
773+
//
737774
// * ALAW: Headerless 8-bit companded alaw samples.
775+
//
776+
// The maximum allowed value is 8.
738777
int32 audio_channel_count = 3;
739778
}
740779

741780
// Configuration to enable speaker diarization.
742781
message SpeakerDiarizationConfig {
743782
// Required. Minimum number of speakers in the conversation. This range gives
744783
// you more flexibility by allowing the system to automatically determine the
745-
// correct number of speakers. If not set, the default value is 2.
784+
// correct number of speakers.
746785
//
747786
// To fix the number of speakers detected in the audio, set
748787
// `min_speaker_count` = `max_speaker_count`.
@@ -825,29 +864,29 @@ message RecognitionFeatures {
825864
}
826865

827866
// Provides "hints" to the speech recognizer to favor specific words and phrases
828-
// in the results. Phrase sets can be specified as an inline resource, or a
829-
// reference to an existing phrase set resource.
867+
// in the results. PhraseSets can be specified as an inline resource, or a
868+
// reference to an existing PhraseSet resource.
830869
message SpeechAdaptation {
831-
// A biasing phrase set, which can be either a string referencing the name of
832-
// an existing phrase set resource, or an inline definition of a phrase set.
870+
// A biasing PhraseSet, which can be either a string referencing the name of
871+
// an existing PhraseSets resource, or an inline definition of a PhraseSet.
833872
message AdaptationPhraseSet {
834873
oneof value {
835-
// The name of an existing phrase set resource. The user must have read
874+
// The name of an existing PhraseSet resource. The user must have read
836875
// access to the resource and it must not be deleted.
837876
string phrase_set = 1 [(google.api.resource_reference) = {
838877
type: "speech.googleapis.com/PhraseSet"
839878
}];
840879

841-
// An inline defined phrase set.
880+
// An inline defined PhraseSet.
842881
PhraseSet inline_phrase_set = 2;
843882
}
844883
}
845884

846-
// A list of inline or referenced phrase sets.
885+
// A list of inline or referenced PhraseSets.
847886
repeated AdaptationPhraseSet phrase_sets = 1;
848887

849-
// A list of inline custom classes. Existing custom class resources can be
850-
// referenced directly in a phrase set.
888+
// A list of inline CustomClasses. Existing CustomClass resources can be
889+
// referenced directly in a PhraseSet.
851890
repeated CustomClass custom_classes = 2;
852891
}
853892

@@ -955,9 +994,9 @@ message SpeechRecognitionAlternative {
955994
float confidence = 2;
956995

957996
// A list of word-specific information for each recognized word.
958-
// When
959-
// [enable_speaker_diarization][google.cloud.speech.v2.SpeakerDiarizationConfig.enable_speaker_diarization]
960-
// is true, you will see all the words from the beginning of the audio.
997+
// When the
998+
// [SpeakerDiarizationConfig][google.cloud.speech.v2.SpeakerDiarizationConfig]
999+
// is set, you will see all the words from the beginning of the audio.
9611000
repeated WordInfo words = 3;
9621001
}
9631002

@@ -995,8 +1034,8 @@ message WordInfo {
9951034
// A distinct label is assigned for every speaker within the audio. This field
9961035
// specifies which one of those speakers was detected to have spoken this
9971036
// word. `speaker_label` is set if
998-
// [enable_speaker_diarization][google.cloud.speech.v2.SpeakerDiarizationConfig.enable_speaker_diarization]
999-
// is `true` and only in the top alternative.
1037+
// [SpeakerDiarizationConfig][google.cloud.speech.v2.SpeakerDiarizationConfig]
1038+
// is given and only in the top alternative.
10001039
string speaker_label = 6;
10011040
}
10021041

@@ -1081,9 +1120,9 @@ message StreamingRecognitionConfig {
10811120
// of the recognizer during this recognition request. If no mask is provided,
10821121
// all non-default valued fields in
10831122
// [config][google.cloud.speech.v2.StreamingRecognitionConfig.config] override
1084-
// the values in the recognizer for this recognition request. If a mask is
1123+
// the values in the Recognizer for this recognition request. If a mask is
10851124
// provided, only the fields listed in the mask override the config in the
1086-
// recognizer for this recognition request. If a wildcard (`*`) is provided,
1125+
// Recognizer for this recognition request. If a wildcard (`*`) is provided,
10871126
// [config][google.cloud.speech.v2.StreamingRecognitionConfig.config]
10881127
// completely overrides and replaces the config in the recognizer for this
10891128
// recognition request.
@@ -1130,6 +1169,7 @@ message StreamingRecognizeRequest {
11301169
StreamingRecognitionConfig streaming_config = 6;
11311170

11321171
// Inline audio bytes to be Recognized.
1172+
// Maximum size for this field is 15 KB per request.
11331173
bytes audio = 5;
11341174
}
11351175
}
@@ -1170,7 +1210,37 @@ message BatchRecognizeRequest {
11701210
google.protobuf.FieldMask config_mask = 5;
11711211

11721212
// Audio files with file metadata for ASR.
1213+
// The maximum number of files allowed to be specified is 5.
11731214
repeated BatchRecognizeFileMetadata files = 3;
1215+
1216+
// Configuration options for where to output the transcripts of each file.
1217+
RecognitionOutputConfig recognition_output_config = 6;
1218+
}
1219+
1220+
// Output configurations for Cloud Storage.
1221+
message GcsOutputConfig {
1222+
// The Cloud Storage URI prefix with which recognition results will be
1223+
// written.
1224+
string uri = 1;
1225+
}
1226+
1227+
// Output configurations for inline response.
1228+
message InlineOutputConfig {}
1229+
1230+
// Configuration options for the output(s) of recognition.
1231+
message RecognitionOutputConfig {
1232+
oneof output {
1233+
// If this message is populated, recognition results are written to the
1234+
// provided Google Cloud Storage URI.
1235+
GcsOutputConfig gcs_output_config = 1;
1236+
1237+
// If this message is populated, recognition results are provided in the
1238+
// [BatchRecognizeResponse][google.cloud.speech.v2.BatchRecognizeResponse]
1239+
// message of the Operation when completed. This is only supported when
1240+
// calling [BatchRecognize][google.cloud.speech.v2.Speech.BatchRecognize]
1241+
// with just one audio file.
1242+
InlineOutputConfig inline_response_config = 2;
1243+
}
11741244
}
11751245

11761246
// Response message for
@@ -1179,15 +1249,38 @@ message BatchRecognizeRequest {
11791249
message BatchRecognizeResponse {
11801250
// Map from filename to the final result for that file.
11811251
map<string, BatchRecognizeFileResult> results = 1;
1252+
1253+
// When available, billed audio seconds for the corresponding request.
1254+
google.protobuf.Duration total_billed_duration = 2;
1255+
}
1256+
1257+
// Output type for Cloud Storage of BatchRecognize transcripts. Though this
1258+
// proto isn't returned in this API anywhere, the Cloud Storage transcripts will
1259+
// be this proto serialized and should be parsed as such.
1260+
message BatchRecognizeResults {
1261+
// Sequential list of transcription results corresponding to sequential
1262+
// portions of audio.
1263+
repeated SpeechRecognitionResult results = 1;
1264+
1265+
// Metadata about the recognition.
1266+
RecognitionResponseMetadata metadata = 2;
11821267
}
11831268

11841269
// Final results for a single file.
11851270
message BatchRecognizeFileResult {
1186-
// The GCS URI to which recognition results were written.
1271+
// The Cloud Storage URI to which recognition results were written.
11871272
string uri = 1;
11881273

11891274
// Error if one was encountered.
11901275
google.rpc.Status error = 2;
1276+
1277+
RecognitionResponseMetadata metadata = 3;
1278+
1279+
// The transcript for the audio file. This is populated only when
1280+
// [InlineOutputConfig][google.cloud.speech.v2.InlineOutputConfig] is set in
1281+
// the
1282+
// [RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].
1283+
BatchRecognizeResults transcript = 4;
11911284
}
11921285

11931286
// Metadata about transcription for a single file (for example, progress
@@ -1199,7 +1292,7 @@ message BatchRecognizeTranscriptionMetadata {
11991292
// Error if one was encountered.
12001293
google.rpc.Status error = 2;
12011294

1202-
// The GCS URI to which recognition results will be written.
1295+
// The Cloud Storage URI to which recognition results will be written.
12031296
string uri = 3;
12041297
}
12051298

@@ -1562,11 +1655,11 @@ message PhraseSet {
15621655
// be recognized over other similar sounding phrases. The higher the boost,
15631656
// the higher the chance of false positive recognition as well. Negative
15641657
// boost values would correspond to anti-biasing. Anti-biasing is not
1565-
// enabled, so negative boost will simply be ignored. Though `boost` can
1566-
// accept a wide range of positive values, most use cases are best served
1567-
// with values between 0 and 20. We recommend using a binary search approach
1568-
// to finding the optimal value for your use case. Speech recognition
1569-
// will skip PhraseSets with a boost value of 0.
1658+
// enabled, so negative boost values will return an error. Boost values must
1659+
// be between 0 and 20. Any values outside that range will return an error.
1660+
// We recommend using a binary search approach to finding the optimal value
1661+
// for your use case as well as adding phrases both with and without boost
1662+
// to your requests.
15701663
float boost = 2;
15711664
}
15721665

@@ -1597,7 +1690,8 @@ message PhraseSet {
15971690
// phrase will be recognized over other similar sounding phrases. The higher
15981691
// the boost, the higher the chance of false positive recognition as well.
15991692
// Valid `boost` values are between 0 (exclusive) and 20. We recommend using a
1600-
// binary search approach to finding the optimal value for your use case.
1693+
// binary search approach to finding the optimal value for your use case as
1694+
// well as adding phrases both with and without boost to your requests.
16011695
float boost = 4;
16021696

16031697
// User-settable, human-readable name for the PhraseSet. Must be 63

0 commit comments

Comments
 (0)