Skip to content

Commit 7ee7867

Browse files
Google APIs copybara-github
authored and committed
feat: Add support for BatchRecognize
Deprecate the `update_config_request` field in the `OperationMetadata` message, which was never used and is output only.

PiperOrigin-RevId: 517976288
1 parent 247a5da commit 7ee7867

3 files changed

Lines changed: 166 additions & 40 deletions

File tree

google/cloud/speech/v2/BUILD.bazel

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ proto_library_with_info(
4141
deps = [
4242
":speech_proto",
4343
"//google/cloud:common_resources_proto",
44+
"//google/cloud/location:location_proto",
4445
],
4546
)
4647

@@ -76,11 +77,13 @@ java_gapic_library(
7677
service_yaml = "speech_v2.yaml",
7778
test_deps = [
7879
":speech_java_grpc",
80+
"//google/cloud/location:location_java_grpc",
7981
],
8082
transport = "grpc+rest",
8183
deps = [
8284
":speech_java_proto",
8385
"//google/api:api_java_proto",
86+
"//google/cloud/location:location_java_proto",
8487
],
8588
)
8689

@@ -141,6 +144,7 @@ go_gapic_library(
141144
transport = "grpc+rest",
142145
deps = [
143146
":speech_go_proto",
147+
"//google/cloud/location:location_go_proto",
144148
"//google/longrunning:longrunning_go_proto",
145149
"@com_google_cloud_go_longrunning//:go_default_library",
146150
"@com_google_cloud_go_longrunning//autogen:go_default_library",

google/cloud/speech/v2/cloud_speech.proto

Lines changed: 134 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2022 Google LLC
1+
// Copyright 2023 Google LLC
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -70,7 +70,7 @@ service Speech {
7070

7171
// Returns the requested
7272
// [Recognizer][google.cloud.speech.v2.Recognizer]. Fails with
73-
// [NOT_FOUND][google.rpc.Code.NOT_FOUND] if the requested recognizer doesn't
73+
// [NOT_FOUND][google.rpc.Code.NOT_FOUND] if the requested Recognizer doesn't
7474
// exist.
7575
rpc GetRecognizer(GetRecognizerRequest) returns (Recognizer) {
7676
option (google.api.http) = {
@@ -417,14 +417,14 @@ message OperationMetadata {
417417
UndeletePhraseSetRequest undelete_phrase_set_request = 20;
418418

419419
// The UpdateConfigRequest that spawned the Operation.
420-
UpdateConfigRequest update_config_request = 21;
420+
UpdateConfigRequest update_config_request = 21 [deprecated = true];
421421
}
422422

423423
// The percent progress of the Operation. Values can range from 0-100. If the
424424
// value is 100, then the operation is finished.
425425
int32 progress_percent = 22;
426426

427-
// Specific metadata per RPC
427+
// Specific metadata per RPC.
428428
oneof metadata {
429429
// Metadata specific to the BatchRecognize method.
430430
BatchRecognizeMetadata batch_recognize_metadata = 23;
@@ -600,22 +600,44 @@ message Recognizer {
600600
// When using this model, the service will stop transcribing audio after the
601601
// first utterance is detected and completed.
602602
//
603-
// When using this model,
604-
// [SEPARATE_RECOGNITION_PER_CHANNEL][google.cloud.speech.v2.RecognitionFeatures.MultiChannelMode.SEPARATE_RECOGNITION_PER_CHANNEL]
605-
// is not supported; multi-channel audio is accepted, but only the first
606-
// channel will be processed and transcribed.
603+
// When using this model,
604+
// [SEPARATE_RECOGNITION_PER_CHANNEL][google.cloud.speech.v2.RecognitionFeatures.MultiChannelMode.SEPARATE_RECOGNITION_PER_CHANNEL]
605+
// is not supported; multi-channel audio is accepted, but only the first
606+
// channel will be processed and transcribed.
607+
//
608+
// - `telephony`
609+
//
610+
// Best for audio that originated from a phone call (typically recorded at
611+
// an 8khz sampling rate).
612+
//
613+
// - `medical_conversation`
614+
//
615+
// For conversations between a medical provider—for example, a doctor or
616+
// nurse—and a patient. Use this model when both a provider and a patient
617+
// are speaking. Words uttered by each speaker are automatically detected
618+
// and labeled in the returned transcript.
619+
//
620+
// For supported features please see [medical models
621+
// documentation](https://cloud.google.com/speech-to-text/docs/medical-models).
622+
//
623+
// - `medical_dictation`
624+
//
625+
// For dictated notes spoken by a single medical provider—for example, a
626+
// doctor dictating notes about a patient's blood test results.
627+
//
628+
// For supported features please see [medical models
629+
// documentation](https://cloud.google.com/speech-to-text/docs/medical-models).
630+
//
631+
// - `usm`
632+
//
633+
// The next generation of Speech-to-Text models from Google.
607634
string model = 4 [(google.api.field_behavior) = REQUIRED];
608635

609636
// Required. The language of the supplied audio as a
610637
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
611638
//
612-
// Supported languages:
613-
//
614-
// - `en-US`
615-
//
616-
// - `en-GB`
617-
//
618-
// - `fr-FR`
639+
// Supported languages for each model are listed at:
640+
// https://cloud.google.com/speech-to-text/docs/languages
619641
//
620642
// If additional languages are provided, recognition result will contain
621643
// recognition in the most likely language detected. The recognition result
@@ -689,14 +711,23 @@ message Recognizer {
689711

690712
// Automatically detected decoding parameters.
691713
// Supported for the following encodings:
714+
//
692715
// * WAV_LINEAR16: 16-bit signed little-endian PCM samples in a WAV container.
716+
//
693717
// * WAV_MULAW: 8-bit companded mulaw samples in a WAV container.
718+
//
694719
// * WAV_ALAW: 8-bit companded alaw samples in a WAV container.
720+
//
695721
// * RFC4867_5_AMR: AMR frames with an rfc4867.5 header.
722+
//
696723
// * RFC4867_5_AMRWB: AMR-WB frames with an rfc4867.5 header.
724+
//
697725
// * FLAC: FLAC frames in the "native FLAC" container format.
726+
//
698727
// * MP3: MPEG audio frames with optional (ignored) ID3 metadata.
728+
//
699729
// * OGG_OPUS: Opus audio frames in an Ogg container.
730+
//
700731
// * WEBM_OPUS: Opus audio frames in a WebM container.
701732
message AutoDetectDecodingConfig {}
702733

@@ -725,24 +756,32 @@ message ExplicitDecodingConfig {
725756
// sampling rate of the audio source to 16000 Hz. If that's not possible, use
726757
// the native sample rate of the audio source (instead of re-sampling).
727758
// Supported for the following encodings:
759+
//
728760
// * LINEAR16: Headerless 16-bit signed little-endian PCM samples.
761+
//
729762
// * MULAW: Headerless 8-bit companded mulaw samples.
763+
//
730764
// * ALAW: Headerless 8-bit companded alaw samples.
731765
int32 sample_rate_hertz = 2;
732766

733767
// Number of channels present in the audio data sent for recognition.
734768
// Supported for the following encodings:
769+
//
735770
// * LINEAR16: Headerless 16-bit signed little-endian PCM samples.
771+
//
736772
// * MULAW: Headerless 8-bit companded mulaw samples.
773+
//
737774
// * ALAW: Headerless 8-bit companded alaw samples.
775+
//
776+
// The maximum allowed value is 8.
738777
int32 audio_channel_count = 3;
739778
}
740779

741780
// Configuration to enable speaker diarization.
742781
message SpeakerDiarizationConfig {
743782
// Required. Minimum number of speakers in the conversation. This range gives
744783
// you more flexibility by allowing the system to automatically determine the
745-
// correct number of speakers. If not set, the default value is 2.
784+
// correct number of speakers.
746785
//
747786
// To fix the number of speakers detected in the audio, set
748787
// `min_speaker_count` = `max_speaker_count`.
@@ -825,29 +864,29 @@ message RecognitionFeatures {
825864
}
826865

827866
// Provides "hints" to the speech recognizer to favor specific words and phrases
828-
// in the results. Phrase sets can be specified as an inline resource, or a
829-
// reference to an existing phrase set resource.
867+
// in the results. PhraseSets can be specified as an inline resource, or a
868+
// reference to an existing PhraseSet resource.
830869
message SpeechAdaptation {
831-
// A biasing phrase set, which can be either a string referencing the name of
832-
// an existing phrase set resource, or an inline definition of a phrase set.
870+
// A biasing PhraseSet, which can be either a string referencing the name of
871+
// an existing PhraseSets resource, or an inline definition of a PhraseSet.
833872
message AdaptationPhraseSet {
834873
oneof value {
835-
// The name of an existing phrase set resource. The user must have read
874+
// The name of an existing PhraseSet resource. The user must have read
836875
// access to the resource and it must not be deleted.
837876
string phrase_set = 1 [(google.api.resource_reference) = {
838877
type: "speech.googleapis.com/PhraseSet"
839878
}];
840879

841-
// An inline defined phrase set.
880+
// An inline defined PhraseSet.
842881
PhraseSet inline_phrase_set = 2;
843882
}
844883
}
845884

846-
// A list of inline or referenced phrase sets.
885+
// A list of inline or referenced PhraseSets.
847886
repeated AdaptationPhraseSet phrase_sets = 1;
848887

849-
// A list of inline custom classes. Existing custom class resources can be
850-
// referenced directly in a phrase set.
888+
// A list of inline CustomClasses. Existing CustomClass resources can be
889+
// referenced directly in a PhraseSet.
851890
repeated CustomClass custom_classes = 2;
852891
}
853892

@@ -955,9 +994,9 @@ message SpeechRecognitionAlternative {
955994
float confidence = 2;
956995

957996
// A list of word-specific information for each recognized word.
958-
// When
959-
// [enable_speaker_diarization][google.cloud.speech.v2.SpeakerDiarizationConfig.enable_speaker_diarization]
960-
// is true, you will see all the words from the beginning of the audio.
997+
// When the
998+
// [SpeakerDiarizationConfig][google.cloud.speech.v2.SpeakerDiarizationConfig]
999+
// is set, you will see all the words from the beginning of the audio.
9611000
repeated WordInfo words = 3;
9621001
}
9631002

@@ -995,8 +1034,8 @@ message WordInfo {
9951034
// A distinct label is assigned for every speaker within the audio. This field
9961035
// specifies which one of those speakers was detected to have spoken this
9971036
// word. `speaker_label` is set if
998-
// [enable_speaker_diarization][google.cloud.speech.v2.SpeakerDiarizationConfig.enable_speaker_diarization]
999-
// is `true` and only in the top alternative.
1037+
// [SpeakerDiarizationConfig][google.cloud.speech.v2.SpeakerDiarizationConfig]
1038+
// is given and only in the top alternative.
10001039
string speaker_label = 6;
10011040
}
10021041

@@ -1081,9 +1120,9 @@ message StreamingRecognitionConfig {
10811120
// of the recognizer during this recognition request. If no mask is provided,
10821121
// all non-default valued fields in
10831122
// [config][google.cloud.speech.v2.StreamingRecognitionConfig.config] override
1084-
// the values in the recognizer for this recognition request. If a mask is
1123+
// the values in the Recognizer for this recognition request. If a mask is
10851124
// provided, only the fields listed in the mask override the config in the
1086-
// recognizer for this recognition request. If a wildcard (`*`) is provided,
1125+
// Recognizer for this recognition request. If a wildcard (`*`) is provided,
10871126
// [config][google.cloud.speech.v2.StreamingRecognitionConfig.config]
10881127
// completely overrides and replaces the config in the recognizer for this
10891128
// recognition request.
@@ -1130,6 +1169,7 @@ message StreamingRecognizeRequest {
11301169
StreamingRecognitionConfig streaming_config = 6;
11311170

11321171
// Inline audio bytes to be Recognized.
1172+
// Maximum size for this field is 15 KB per request.
11331173
bytes audio = 5;
11341174
}
11351175
}
@@ -1170,7 +1210,37 @@ message BatchRecognizeRequest {
11701210
google.protobuf.FieldMask config_mask = 5;
11711211

11721212
// Audio files with file metadata for ASR.
1213+
// The maximum number of files allowed to be specified is 5.
11731214
repeated BatchRecognizeFileMetadata files = 3;
1215+
1216+
// Configuration options for where to output the transcripts of each file.
1217+
RecognitionOutputConfig recognition_output_config = 6;
1218+
}
1219+
1220+
// Output configurations for Cloud Storage.
1221+
message GcsOutputConfig {
1222+
// The Cloud Storage URI prefix with which recognition results will be
1223+
// written.
1224+
string uri = 1;
1225+
}
1226+
1227+
// Output configurations for inline response.
1228+
message InlineOutputConfig {}
1229+
1230+
// Configuration options for the output(s) of recognition.
1231+
message RecognitionOutputConfig {
1232+
oneof output {
1233+
// If this message is populated, recognition results are written to the
1234+
// provided Google Cloud Storage URI.
1235+
GcsOutputConfig gcs_output_config = 1;
1236+
1237+
// If this message is populated, recognition results are provided in the
1238+
// [BatchRecognizeResponse][google.cloud.speech.v2.BatchRecognizeResponse]
1239+
// message of the Operation when completed. This is only supported when
1240+
// calling [BatchRecognize][google.cloud.speech.v2.Speech.BatchRecognize]
1241+
// with just one audio file.
1242+
InlineOutputConfig inline_response_config = 2;
1243+
}
11741244
}
11751245

11761246
// Response message for
@@ -1179,15 +1249,38 @@ message BatchRecognizeRequest {
11791249
message BatchRecognizeResponse {
11801250
// Map from filename to the final result for that file.
11811251
map<string, BatchRecognizeFileResult> results = 1;
1252+
1253+
// When available, billed audio seconds for the corresponding request.
1254+
google.protobuf.Duration total_billed_duration = 2;
1255+
}
1256+
1257+
// Output type for Cloud Storage of BatchRecognize transcripts. Though this
1258+
// proto isn't returned in this API anywhere, the Cloud Storage transcripts will
1259+
// be this proto serialized and should be parsed as such.
1260+
message BatchRecognizeResults {
1261+
// Sequential list of transcription results corresponding to sequential
1262+
// portions of audio.
1263+
repeated SpeechRecognitionResult results = 1;
1264+
1265+
// Metadata about the recognition.
1266+
RecognitionResponseMetadata metadata = 2;
11821267
}
11831268

11841269
// Final results for a single file.
11851270
message BatchRecognizeFileResult {
1186-
// The GCS URI to which recognition results were written.
1271+
// The Cloud Storage URI to which recognition results were written.
11871272
string uri = 1;
11881273

11891274
// Error if one was encountered.
11901275
google.rpc.Status error = 2;
1276+
1277+
RecognitionResponseMetadata metadata = 3;
1278+
1279+
// The transcript for the audio file. This is populated only when
1280+
// [InlineOutputConfig][google.cloud.speech.v2.InlineOutputConfig] is set in
1281+
// the
1282+
// [RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].
1283+
BatchRecognizeResults transcript = 4;
11911284
}
11921285

11931286
// Metadata about transcription for a single file (for example, progress
@@ -1199,7 +1292,7 @@ message BatchRecognizeTranscriptionMetadata {
11991292
// Error if one was encountered.
12001293
google.rpc.Status error = 2;
12011294

1202-
// The GCS URI to which recognition results will be written.
1295+
// The Cloud Storage URI to which recognition results will be written.
12031296
string uri = 3;
12041297
}
12051298

@@ -1562,11 +1655,11 @@ message PhraseSet {
15621655
// be recognized over other similar sounding phrases. The higher the boost,
15631656
// the higher the chance of false positive recognition as well. Negative
15641657
// boost values would correspond to anti-biasing. Anti-biasing is not
1565-
// enabled, so negative boost will simply be ignored. Though `boost` can
1566-
// accept a wide range of positive values, most use cases are best served
1567-
// with values between 0 and 20. We recommend using a binary search approach
1568-
// to finding the optimal value for your use case. Speech recognition
1569-
// will skip PhraseSets with a boost value of 0.
1658+
// enabled, so negative boost values will return an error. Boost values must
1659+
// be between 0 and 20. Any values outside that range will return an error.
1660+
// We recommend using a binary search approach to finding the optimal value
1661+
// for your use case as well as adding phrases both with and without boost
1662+
// to your requests.
15701663
float boost = 2;
15711664
}
15721665

@@ -1597,7 +1690,8 @@ message PhraseSet {
15971690
// phrase will be recognized over other similar sounding phrases. The higher
15981691
// the boost, the higher the chance of false positive recognition as well.
15991692
// Valid `boost` values are between 0 (exclusive) and 20. We recommend using a
1600-
// binary search approach to finding the optimal value for your use case.
1693+
// binary search approach to finding the optimal value for your use case as
1694+
// well as adding phrases both with and without boost to your requests.
16011695
float boost = 4;
16021696

16031697
// User-settable, human-readable name for the PhraseSet. Must be 63

0 commit comments

Comments
 (0)