1- // Copyright 2022 Google LLC
1+ // Copyright 2023 Google LLC
22//
33// Licensed under the Apache License, Version 2.0 (the "License");
44// you may not use this file except in compliance with the License.
@@ -70,7 +70,7 @@ service Speech {
7070
7171 // Returns the requested
7272 // [Recognizer][google.cloud.speech.v2.Recognizer]. Fails with
73- // [NOT_FOUND][google.rpc.Code.NOT_FOUND] if the requested recognizer doesn't
73+ // [NOT_FOUND][google.rpc.Code.NOT_FOUND] if the requested Recognizer doesn't
7474 // exist.
7575 rpc GetRecognizer (GetRecognizerRequest ) returns (Recognizer ) {
7676 option (google.api.http ) = {
@@ -417,14 +417,14 @@ message OperationMetadata {
417417 UndeletePhraseSetRequest undelete_phrase_set_request = 20 ;
418418
419419 // The UpdateConfigRequest that spawned the Operation.
420- UpdateConfigRequest update_config_request = 21 ;
420+ UpdateConfigRequest update_config_request = 21 [ deprecated = true ] ;
421421 }
422422
423423 // The percent progress of the Operation. Values can range from 0-100. If the
424424 // value is 100, then the operation is finished.
425425 int32 progress_percent = 22 ;
426426
427- // Specific metadata per RPC
427+ // Specific metadata per RPC.
428428 oneof metadata {
429429 // Metadata specific to the BatchRecognize method.
430430 BatchRecognizeMetadata batch_recognize_metadata = 23 ;
@@ -600,22 +600,44 @@ message Recognizer {
600600 // When using this model, the service will stop transcribing audio after the
601601 // first utterance is detected and completed.
602602 //
603- // When using this model,
604- // [SEPARATE_RECOGNITION_PER_CHANNEL][google.cloud.speech.v2.RecognitionFeatures.MultiChannelMode.SEPARATE_RECOGNITION_PER_CHANNEL]
605- // is not supported; multi-channel audio is accepted, but only the first
606- // channel will be processed and transcribed.
603+ // When using this model,
604+ // [SEPARATE_RECOGNITION_PER_CHANNEL][google.cloud.speech.v2.RecognitionFeatures.MultiChannelMode.SEPARATE_RECOGNITION_PER_CHANNEL]
605+ // is not supported; multi-channel audio is accepted, but only the first
606+ // channel will be processed and transcribed.
607+ //
608+ // - `telephony`
609+ //
610+ // Best for audio that originated from a phone call (typically recorded at
611+ // an 8khz sampling rate).
612+ //
613+ // - `medical_conversation`
614+ //
615+ // For conversations between a medical provider—for example, a doctor or
616+ // nurse—and a patient. Use this model when both a provider and a patient
617+ // are speaking. Words uttered by each speaker are automatically detected
618+ // and labeled in the returned transcript.
619+ //
620+ // For supported features please see [medical models
621+ // documentation](https://cloud.google.com/speech-to-text/docs/medical-models).
622+ //
623+ // - `medical_dictation`
624+ //
625+ // For dictated notes spoken by a single medical provider—for example, a
626+ // doctor dictating notes about a patient's blood test results.
627+ //
628+ // For supported features please see [medical models
629+ // documentation](https://cloud.google.com/speech-to-text/docs/medical-models).
630+ //
631+ // - `usm`
632+ //
633+ // The next generation of Speech-to-Text models from Google.
607634 string model = 4 [(google.api.field_behavior ) = REQUIRED ];
608635
609636 // Required. The language of the supplied audio as a
610637 // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
611638 //
612- // Supported languages:
613- //
614- // - `en-US`
615- //
616- // - `en-GB`
617- //
618- // - `fr-FR`
639+ // Supported languages for each model are listed at:
640+ // https://cloud.google.com/speech-to-text/docs/languages
619641 //
620642 // If additional languages are provided, recognition result will contain
621643 // recognition in the most likely language detected. The recognition result
@@ -689,14 +711,23 @@ message Recognizer {
689711
690712// Automatically detected decoding parameters.
691713// Supported for the following encodings:
714+ //
692715// * WAV_LINEAR16: 16-bit signed little-endian PCM samples in a WAV container.
716+ //
693717// * WAV_MULAW: 8-bit companded mulaw samples in a WAV container.
718+ //
694719// * WAV_ALAW: 8-bit companded alaw samples in a WAV container.
720+ //
695721// * RFC4867_5_AMR: AMR frames with an rfc4867.5 header.
722+ //
696723// * RFC4867_5_AMRWB: AMR-WB frames with an rfc4867.5 header.
724+ //
697725// * FLAC: FLAC frames in the "native FLAC" container format.
726+ //
698727// * MP3: MPEG audio frames with optional (ignored) ID3 metadata.
728+ //
699729// * OGG_OPUS: Opus audio frames in an Ogg container.
730+ //
700731// * WEBM_OPUS: Opus audio frames in a WebM container.
701732message AutoDetectDecodingConfig {}
702733
@@ -725,24 +756,32 @@ message ExplicitDecodingConfig {
725756 // sampling rate of the audio source to 16000 Hz. If that's not possible, use
726757 // the native sample rate of the audio source (instead of re-sampling).
727758 // Supported for the following encodings:
759+ //
728760 // * LINEAR16: Headerless 16-bit signed little-endian PCM samples.
761+ //
729762 // * MULAW: Headerless 8-bit companded mulaw samples.
763+ //
730764 // * ALAW: Headerless 8-bit companded alaw samples.
731765 int32 sample_rate_hertz = 2 ;
732766
733767 // Number of channels present in the audio data sent for recognition.
734768 // Supported for the following encodings:
769+ //
735770 // * LINEAR16: Headerless 16-bit signed little-endian PCM samples.
771+ //
736772 // * MULAW: Headerless 8-bit companded mulaw samples.
773+ //
737774 // * ALAW: Headerless 8-bit companded alaw samples.
775+ //
776+ // The maximum allowed value is 8.
738777 int32 audio_channel_count = 3 ;
739778}
740779
741780// Configuration to enable speaker diarization.
742781message SpeakerDiarizationConfig {
743782 // Required. Minimum number of speakers in the conversation. This range gives
744783 // you more flexibility by allowing the system to automatically determine the
745- // correct number of speakers. If not set, the default value is 2.
784+ // correct number of speakers.
746785 //
747786 // To fix the number of speakers detected in the audio, set
748787 // `min_speaker_count` = `max_speaker_count`.
@@ -825,29 +864,29 @@ message RecognitionFeatures {
825864}
826865
827866// Provides "hints" to the speech recognizer to favor specific words and phrases
828- // in the results. Phrase sets can be specified as an inline resource, or a
829- // reference to an existing phrase set resource.
867+ // in the results. PhraseSets can be specified as an inline resource, or a
868+ // reference to an existing PhraseSet resource.
830869message SpeechAdaptation {
831- // A biasing phrase set , which can be either a string referencing the name of
832- // an existing phrase set resource, or an inline definition of a phrase set .
870+ // A biasing PhraseSet , which can be either a string referencing the name of
871+ // an existing PhraseSets resource, or an inline definition of a PhraseSet .
833872 message AdaptationPhraseSet {
834873 oneof value {
835- // The name of an existing phrase set resource. The user must have read
874+ // The name of an existing PhraseSet resource. The user must have read
836875 // access to the resource and it must not be deleted.
837876 string phrase_set = 1 [(google.api.resource_reference ) = {
838877 type : "speech.googleapis.com/PhraseSet"
839878 }];
840879
841- // An inline defined phrase set .
880+ // An inline defined PhraseSet .
842881 PhraseSet inline_phrase_set = 2 ;
843882 }
844883 }
845884
846- // A list of inline or referenced phrase sets .
885+ // A list of inline or referenced PhraseSets .
847886 repeated AdaptationPhraseSet phrase_sets = 1 ;
848887
849- // A list of inline custom classes . Existing custom class resources can be
850- // referenced directly in a phrase set .
888+ // A list of inline CustomClasses . Existing CustomClass resources can be
889+ // referenced directly in a PhraseSet .
851890 repeated CustomClass custom_classes = 2 ;
852891}
853892
@@ -955,9 +994,9 @@ message SpeechRecognitionAlternative {
955994 float confidence = 2 ;
956995
957996 // A list of word-specific information for each recognized word.
958- // When
959- // [enable_speaker_diarization ][google.cloud.speech.v2.SpeakerDiarizationConfig.enable_speaker_diarization ]
960- // is true , you will see all the words from the beginning of the audio.
997+ // When the
998+ // [SpeakerDiarizationConfig ][google.cloud.speech.v2.SpeakerDiarizationConfig]
999+ // is set , you will see all the words from the beginning of the audio.
9611000 repeated WordInfo words = 3 ;
9621001}
9631002
@@ -995,8 +1034,8 @@ message WordInfo {
9951034 // A distinct label is assigned for every speaker within the audio. This field
9961035 // specifies which one of those speakers was detected to have spoken this
9971036 // word. `speaker_label` is set if
998- // [enable_speaker_diarization ][google.cloud.speech.v2.SpeakerDiarizationConfig.enable_speaker_diarization ]
999- // is `true` and only in the top alternative.
1037+ // [SpeakerDiarizationConfig ][google.cloud.speech.v2.SpeakerDiarizationConfig]
1038+ // is given and only in the top alternative.
10001039 string speaker_label = 6 ;
10011040}
10021041
@@ -1081,9 +1120,9 @@ message StreamingRecognitionConfig {
10811120 // of the recognizer during this recognition request. If no mask is provided,
10821121 // all non-default valued fields in
10831122 // [config][google.cloud.speech.v2.StreamingRecognitionConfig.config] override
1084- // the values in the recognizer for this recognition request. If a mask is
1123+ // the values in the Recognizer for this recognition request. If a mask is
10851124 // provided, only the fields listed in the mask override the config in the
1086- // recognizer for this recognition request. If a wildcard (`*`) is provided,
1125+ // Recognizer for this recognition request. If a wildcard (`*`) is provided,
10871126 // [config][google.cloud.speech.v2.StreamingRecognitionConfig.config]
10881127 // completely overrides and replaces the config in the recognizer for this
10891128 // recognition request.
@@ -1130,6 +1169,7 @@ message StreamingRecognizeRequest {
11301169 StreamingRecognitionConfig streaming_config = 6 ;
11311170
11321171 // Inline audio bytes to be Recognized.
1172+ // Maximum size for this field is 15 KB per request.
11331173 bytes audio = 5 ;
11341174 }
11351175}
@@ -1170,7 +1210,37 @@ message BatchRecognizeRequest {
11701210 google.protobuf.FieldMask config_mask = 5 ;
11711211
11721212 // Audio files with file metadata for ASR.
1213+ // The maximum number of files allowed to be specified is 5.
11731214 repeated BatchRecognizeFileMetadata files = 3 ;
1215+
1216+ // Configuration options for where to output the transcripts of each file.
1217+ RecognitionOutputConfig recognition_output_config = 6 ;
1218+ }
1219+
1220+ // Output configurations for Cloud Storage.
1221+ message GcsOutputConfig {
1222+ // The Cloud Storage URI prefix with which recognition results will be
1223+ // written.
1224+ string uri = 1 ;
1225+ }
1226+
1227+ // Output configurations for inline response.
1228+ message InlineOutputConfig {}
1229+
1230+ // Configuration options for the output(s) of recognition.
1231+ message RecognitionOutputConfig {
1232+ oneof output {
1233+ // If this message is populated, recognition results are written to the
1234+ // provided Google Cloud Storage URI.
1235+ GcsOutputConfig gcs_output_config = 1 ;
1236+
1237+ // If this message is populated, recognition results are provided in the
1238+ // [BatchRecognizeResponse][google.cloud.speech.v2.BatchRecognizeResponse]
1239+ // message of the Operation when completed. This is only supported when
1240+ // calling [BatchRecognize][google.cloud.speech.v2.Speech.BatchRecognize]
1241+ // with just one audio file.
1242+ InlineOutputConfig inline_response_config = 2 ;
1243+ }
11741244}
11751245
11761246// Response message for
@@ -1179,15 +1249,38 @@ message BatchRecognizeRequest {
11791249message BatchRecognizeResponse {
11801250 // Map from filename to the final result for that file.
11811251 map <string , BatchRecognizeFileResult > results = 1 ;
1252+
1253+ // When available, billed audio seconds for the corresponding request.
1254+ google.protobuf.Duration total_billed_duration = 2 ;
1255+ }
1256+
1257+ // Output type for Cloud Storage of BatchRecognize transcripts. Though this
1258+ // proto isn't returned in this API anywhere, the Cloud Storage transcripts will
1259+ // be this proto serialized and should be parsed as such.
1260+ message BatchRecognizeResults {
1261+ // Sequential list of transcription results corresponding to sequential
1262+ // portions of audio.
1263+ repeated SpeechRecognitionResult results = 1 ;
1264+
1265+ // Metadata about the recognition.
1266+ RecognitionResponseMetadata metadata = 2 ;
11821267}
11831268
11841269// Final results for a single file.
11851270message BatchRecognizeFileResult {
1186- // The GCS URI to which recognition results were written.
1271+ // The Cloud Storage URI to which recognition results were written.
11871272 string uri = 1 ;
11881273
11891274 // Error if one was encountered.
11901275 google.rpc.Status error = 2 ;
1276+
1277+ RecognitionResponseMetadata metadata = 3 ;
1278+
1279+ // The transcript for the audio file. This is populated only when
1280+ // [InlineOutputConfig][google.cloud.speech.v2.InlineOutputConfig] is set in
1281+ // the
1282+ // [RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].
1283+ BatchRecognizeResults transcript = 4 ;
11911284}
11921285
11931286// Metadata about transcription for a single file (for example, progress
@@ -1199,7 +1292,7 @@ message BatchRecognizeTranscriptionMetadata {
11991292 // Error if one was encountered.
12001293 google.rpc.Status error = 2 ;
12011294
1202- // The GCS URI to which recognition results will be written.
1295+ // The Cloud Storage URI to which recognition results will be written.
12031296 string uri = 3 ;
12041297}
12051298
@@ -1562,11 +1655,11 @@ message PhraseSet {
15621655 // be recognized over other similar sounding phrases. The higher the boost,
15631656 // the higher the chance of false positive recognition as well. Negative
15641657 // boost values would correspond to anti-biasing. Anti-biasing is not
1565- // enabled, so negative boost will simply be ignored. Though `boost` can
1566- // accept a wide range of positive values, most use cases are best served
1567- // with values between 0 and 20. We recommend using a binary search approach
1568- // to finding the optimal value for your use case. Speech recognition
1569- // will skip PhraseSets with a boost value of 0 .
1658+ // enabled, so negative boost values will return an error. Boost values must
1659+ // be between 0 and 20. Any values outside that range will return an error.
1660+ // We recommend using a binary search approach to finding the optimal value
1661+ // for your use case as well as adding phrases both with and without boost
1662+ // to your requests .
15701663 float boost = 2 ;
15711664 }
15721665
@@ -1597,7 +1690,8 @@ message PhraseSet {
15971690 // phrase will be recognized over other similar sounding phrases. The higher
15981691 // the boost, the higher the chance of false positive recognition as well.
15991692 // Valid `boost` values are between 0 (exclusive) and 20. We recommend using a
1600- // binary search approach to finding the optimal value for your use case.
1693+ // binary search approach to finding the optimal value for your use case as
1694+ // well as adding phrases both with and without boost to your requests.
16011695 float boost = 4 ;
16021696
16031697 // User-settable, human-readable name for the PhraseSet. Must be 63
0 commit comments