Commit 57bf535

Update to speech.

1 parent: f03e356

1 file changed: google/cloud/speech/v1beta1/cloud_speech.proto
105 additions & 26 deletions

@@ -18,6 +18,7 @@ package google.cloud.speech.v1beta1;
 
 import "google/api/annotations.proto";
 import "google/longrunning/operations.proto";
+import "google/protobuf/timestamp.proto";
 import "google/rpc/status.proto";
 
 option java_multiple_files = true;
@@ -34,8 +35,9 @@ service Speech {
   }
 
   // Perform asynchronous speech-recognition: receive results via the
-  // google.longrunning.Operations interface. `Operation.response` returns
-  // `AsyncRecognizeResponse`.
+  // google.longrunning.Operations interface. Returns either an
+  // `Operation.error` or an `Operation.response` which contains
+  // an `AsyncRecognizeResponse` message.
   rpc AsyncRecognize(AsyncRecognizeRequest) returns (google.longrunning.Operation) {
     option (google.api.http) = { post: "/v1beta1/speech:asyncrecognize" body: "*" };
   }
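
The reworded comment makes explicit that the long-running `AsyncRecognize` surfaces its outcome as either `Operation.error` or an `Operation.response` that packs an `AsyncRecognizeResponse`. As a hedged sketch only (the Python modules `cloud_speech_pb2`, `cloud_speech_pb2_grpc`, and `operations_pb2_grpc` are assumed protoc-generated stubs; the channel setup and GCS object are placeholders, not part of this commit), a client might poll for that outcome like this:

```python
import time

import grpc
from google.longrunning import operations_pb2, operations_pb2_grpc
# Assumption: protoc-generated modules for this proto package.
from google.cloud.speech.v1beta1 import cloud_speech_pb2, cloud_speech_pb2_grpc

channel = grpc.insecure_channel("localhost:8080")  # placeholder; real calls need an authenticated channel
speech = cloud_speech_pb2_grpc.SpeechStub(channel)
operations = operations_pb2_grpc.OperationsStub(channel)

request = cloud_speech_pb2.AsyncRecognizeRequest(
    config=cloud_speech_pb2.RecognitionConfig(
        encoding=cloud_speech_pb2.RecognitionConfig.LINEAR16,  # the only encoding AsyncRecognize accepts
        sample_rate=16000,          # field names follow this revision of the proto
        language_code="en-US",
    ),
    audio=cloud_speech_pb2.RecognitionAudio(uri="gs://my-bucket/audio.raw"),  # hypothetical object
)

operation = speech.AsyncRecognize(request)
while not operation.done:
    time.sleep(5)
    operation = operations.GetOperation(
        operations_pb2.GetOperationRequest(name=operation.name))

if operation.HasField("error"):
    print("AsyncRecognize failed:", operation.error.message)
else:
    response = cloud_speech_pb2.AsyncRecognizeResponse()
    operation.response.Unpack(response)  # Operation.response is a packed google.protobuf.Any
    for result in response.results:
        print(result.alternatives[0].transcript)
```
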
@@ -87,7 +89,8 @@ message StreamingRecognizeRequest {
     // and all subsequent `StreamingRecognizeRequest` messages must contain
     // `audio_content` data. The audio bytes must be encoded as specified in
     // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
-    // pure binary representation (not base64).
+    // pure binary representation (not base64). See
+    // [audio limits](https://cloud.google.com/speech/limits#content).
     bytes audio_content = 2;
   }
 }
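
The added link points to the published audio limits; the surrounding comment keeps the reminder that `audio_content` carries raw bytes, not base64, and that only the first request carries the config. A minimal request generator a streaming client might use (same assumed generated modules as above, with the audio path and chunk size purely illustrative):

```python
def streaming_requests(audio_path, chunk_size=32000):
    """Yield one config request first, then raw (non-base64) audio chunks."""
    yield cloud_speech_pb2.StreamingRecognizeRequest(
        streaming_config=cloud_speech_pb2.StreamingRecognitionConfig(
            config=cloud_speech_pb2.RecognitionConfig(
                encoding=cloud_speech_pb2.RecognitionConfig.LINEAR16,
                sample_rate=16000,
                language_code="en-US",
            ),
            interim_results=True,
        )
    )
    with open(audio_path, "rb") as audio_file:
        while True:
            chunk = audio_file.read(chunk_size)
            if not chunk:
                break
            # Raw bytes straight into the bytes field; no base64 encoding.
            yield cloud_speech_pb2.StreamingRecognizeRequest(audio_content=chunk)
```
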
@@ -102,13 +105,13 @@ message StreamingRecognitionConfig {
   // [Optional] If `false` or omitted, the recognizer will perform continuous
   // recognition (continuing to process audio even if the user pauses speaking)
   // until the client closes the output stream (gRPC API) or when the maximum
-  // time limit has been reached. Multiple `SpeechRecognitionResult`s with the
-  // `is_final` flag set to `true` may be returned.
+  // time limit has been reached. Multiple `StreamingRecognitionResult`s with
+  // the `is_final` flag set to `true` may be returned.
   //
   // If `true`, the recognizer will detect a single spoken utterance. When it
   // detects that the user has paused or stopped speaking, it will return an
   // `END_OF_UTTERANCE` event and cease recognition. It will return no more than
-  // one `SpeechRecognitionResult` with the `is_final` flag set to `true`.
+  // one `StreamingRecognitionResult` with the `is_final` flag set to `true`.
   bool single_utterance = 2;
 
   // [Optional] If `true`, interim results (tentative hypotheses) may be
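
The corrected comment names `StreamingRecognitionResult` as the message whose `is_final` flag separates the two modes. For illustration only (assuming the generated `cloud_speech_pb2` module as before, with `recognition_config` built as in the earlier sketches):

```python
# Voice-command style: stop after the first utterance; expect at most one
# StreamingRecognitionResult with is_final == True.
command_config = cloud_speech_pb2.StreamingRecognitionConfig(
    config=recognition_config,
    single_utterance=True,
)

# Dictation style: keep recognizing until the client half-closes the stream;
# several results with is_final == True may arrive.
dictation_config = cloud_speech_pb2.StreamingRecognitionConfig(
    config=recognition_config,
    single_utterance=False,
    interim_results=True,
)
```
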
@@ -134,7 +137,7 @@ message RecognitionConfig {
     // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
     ENCODING_UNSPECIFIED = 0;
 
-    // Uncompressed 16-bit signed little-endian samples.
+    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
     // This is the only encoding that may be used by `AsyncRecognize`.
     LINEAR16 = 1;
 
@@ -144,7 +147,7 @@ message RecognitionConfig {
     //
     // The stream FLAC (Free Lossless Audio Codec) encoding is specified at:
     // http://flac.sourceforge.net/documentation.html.
-    // Only 16-bit samples are supported.
+    // 16-bit and 24-bit samples are supported.
     // Not all fields in STREAMINFO are supported.
     FLAC = 2;
 
@@ -171,8 +174,8 @@ message RecognitionConfig {
   // [Optional] The language of the supplied audio as a BCP-47 language tag.
   // Example: "en-GB" https://www.rfc-editor.org/rfc/bcp/bcp47.txt
   // If omitted, defaults to "en-US". See
-  // [Language Support](/speech/docs/best-practices#language_support) for
-  // a list of the currently supported language codes.
+  // [Language Support](https://cloud.google.com/speech/docs/best-practices#language_support)
+  // for a list of the currently supported language codes.
   string language_code = 3;
 
   // [Optional] Maximum number of recognition hypotheses to be returned.
@@ -196,15 +199,19 @@ message RecognitionConfig {
 // Provides "hints" to the speech recognizer to favor specific words and phrases
 // in the results.
 message SpeechContext {
-  // [Optional] A list of up to 50 phrases of up to 100 characters each to
-  // provide words and phrases "hints" to the speech recognition so that it is
-  // more likely to recognize them.
+  // [Optional] A list of strings containing words and phrases "hints" so that
+  // the speech recognition is more likely to recognize them. This can be used
+  // to improve the accuracy for specific words and phrases, for example, if
+  // specific commands are typically spoken by the user. This can also be used
+  // to add additional words to the vocabulary of the recognizer. See
+  // [usage limits](https://cloud.google.com/speech/limits#content).
   repeated string phrases = 1;
 }
 
 // Contains audio data in the encoding specified in the `RecognitionConfig`.
 // Either `content` or `uri` must be supplied. Supplying both or neither
-// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
+// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
+// [audio limits](https://cloud.google.com/speech/limits#content).
 message RecognitionAudio {
   oneof audio_source {
     // The audio data bytes encoded as specified in
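
The expanded `SpeechContext` comment frames `phrases` as accuracy and vocabulary hints, and `RecognitionAudio` now links to the audio limits alongside the content/uri rule. A short illustrative sketch, with the phrase list and file names invented for the example:

```python
# Phrase hints: bias recognition toward command words the application expects.
context = cloud_speech_pb2.SpeechContext(phrases=["turn left", "turn right", "stop"])

# RecognitionAudio is a oneof: supply exactly one of `content` or `uri`.
with open("command.raw", "rb") as f:          # hypothetical local capture
    inline_audio = cloud_speech_pb2.RecognitionAudio(content=f.read())
gcs_audio = cloud_speech_pb2.RecognitionAudio(uri="gs://my-bucket/command.raw")
# Setting both fields (or neither) makes the service return INVALID_ARGUMENT.
```

The context itself travels with the `RecognitionConfig` when the request is built; that field is outside the hunks shown here.
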
@@ -217,32 +224,104 @@ message RecognitionAudio {
     // supported, which must be specified in the following format:
     // `gs://bucket_name/object_name` (other URI formats return
     // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
-    // [Request URIs](/storage/docs/reference-uris).
+    // [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
     string uri = 2;
   }
 }
 
 // `SyncRecognizeResponse` is the only message returned to the client by
-// `SyncRecognize`. It contains the result as zero or more
-// sequential `RecognizeResponse` messages.
+// `SyncRecognize`. It contains the result as zero or more sequential
+// `SpeechRecognitionResult` messages.
 message SyncRecognizeResponse {
   // [Output-only] Sequential list of transcription results corresponding to
   // sequential portions of audio.
   repeated SpeechRecognitionResult results = 2;
 }
 
 // `AsyncRecognizeResponse` is the only message returned to the client by
-// `AsyncRecognize`. It contains the result as zero or more
-// sequential `RecognizeResponse` messages.
+// `AsyncRecognize`. It contains the result as zero or more sequential
+// `SpeechRecognitionResult` messages. It is included in the `result.response`
+// field of the `Operation` returned by the `GetOperation` call of the
+// `google::longrunning::Operations` service.
 message AsyncRecognizeResponse {
   // [Output-only] Sequential list of transcription results corresponding to
   // sequential portions of audio.
   repeated SpeechRecognitionResult results = 2;
 }
 
+// `AsyncRecognizeMetadata` describes the progress of a long-running
+// `AsyncRecognize` call. It is included in the `metadata` field of the
+// `Operation` returned by the `GetOperation` call of the
+// `google::longrunning::Operations` service.
+message AsyncRecognizeMetadata {
+  // Approximate percentage of audio processed thus far. Guaranteed to be 100
+  // when the audio is fully processed and the results are available.
+  int32 progress_percent = 1;
+
+  // Time when the request was received.
+  google.protobuf.Timestamp start_time = 2;
+
+  // Time of the most recent processing update.
+  google.protobuf.Timestamp last_update_time = 3;
+}
+
 // `StreamingRecognizeResponse` is the only message returned to the client by
-// `StreamingRecognize`. It contains the result as zero or more
-// sequential `RecognizeResponse` messages.
+// `StreamingRecognize`. A series of one or more `StreamingRecognizeResponse`
+// messages are streamed back to the client.
+//
+// Here's an example of a series of ten `StreamingRecognizeResponse`s that might
+// be returned while processing audio:
+//
+// 1. endpointer_type: START_OF_SPEECH
+//
+// 2. results { alternatives { transcript: "tube" } stability: 0.01 }
+//    result_index: 0
+//
+// 3. results { alternatives { transcript: "to be a" } stability: 0.01 }
+//    result_index: 0
+//
+// 4. results { alternatives { transcript: "to be" } stability: 0.9 }
+//    results { alternatives { transcript: " or not to be" } stability: 0.01 }
+//    result_index: 0
+//
+// 5. results { alternatives { transcript: "to be or not to be"
+//                             confidence: 0.92 }
+//              alternatives { transcript: "to bee or not to bee" }
+//    is_final: true }
+//    result_index: 0
+//
+// 6. results { alternatives { transcript: " that's" } stability: 0.01 }
+//    result_index: 1
+//
+// 7. results { alternatives { transcript: " that is" } stability: 0.9 }
+//    results { alternatives { transcript: " the question" } stability: 0.01 }
+//    result_index: 1
+//
+// 8. endpointer_type: END_OF_SPEECH
+//
+// 9. results { alternatives { transcript: " that is the question"
+//                             confidence: 0.98 }
+//              alternatives { transcript: " that was the question" }
+//    is_final: true }
+//    result_index: 1
+//
+// 10. endpointer_type: END_OF_AUDIO
+//
+// Notes:
+//
+// - Only two of the above responses #5 and #9 contain final results, they are
+//   indicated by `is_final: true`. Concatenating these together generates the
+//   full transcript: "to be or not to be that is the question".
+//
+// - The others contain interim `results`. #4 and #7 contain two interim
+//   `results`, the first portion has a high stability and is less likely to
+//   change, the second portion has a low stability and is very likely to
+//   change. A UI designer might choose to show only high stability `results`.
+//
+// - The `result_index` indicates the portion of audio that has had final
+//   results returned, and is no longer being processed. For example, the
+//   `results` in #6 and later correspond to the portion of audio after
+//   "to be or not to be".
 message StreamingRecognizeResponse {
   // Indicates the type of endpointer event.
   enum EndpointerType {
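
The new doc comment walks through a ten-response example; the recoverable rule is that final transcripts are the results with `is_final: true`, while interim results may be replaced at `result_index` and above, with `stability` signaling how likely an interim portion is to change. A minimal consumer sketch (stub and module names assumed as before, stability threshold chosen arbitrarily for illustration):

```python
def consume_stream(speech_stub, requests):
    """Collect final transcript pieces; print high-stability interim hypotheses."""
    finals = []
    for response in speech_stub.StreamingRecognize(requests):
        for result in response.results:
            if not result.alternatives:
                continue
            top = result.alternatives[0]
            if result.is_final:
                finals.append(top.transcript)        # e.g. "to be or not to be"
            elif result.stability > 0.8:
                print("likely:", top.transcript)     # stable interim portion
        if response.endpointer_type == cloud_speech_pb2.StreamingRecognizeResponse.END_OF_AUDIO:
            break
    return "".join(finals)  # "to be or not to be that is the question"
```

Passing the request generator from the earlier sketch as `requests` drives the bidirectional stream.
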
@@ -276,24 +355,24 @@ message StreamingRecognizeResponse {
   repeated StreamingRecognitionResult results = 2;
 
   // [Output-only] Indicates the lowest index in the `results` array that has
-  // changed. The repeated `SpeechRecognitionResult` results overwrite past
+  // changed. The repeated `StreamingRecognitionResult` results overwrite past
   // results at this index and higher.
   int32 result_index = 3;
 
   // [Output-only] Indicates the type of endpointer event.
   EndpointerType endpointer_type = 4;
 }
 
-// A speech recognition result corresponding to a portion of the audio that is
-// currently being processed.
+// A streaming speech recognition result corresponding to a portion of the audio
+// that is currently being processed.
 message StreamingRecognitionResult {
   // [Output-only] May contain one or more recognition hypotheses (up to the
   // maximum specified in `max_alternatives`).
   repeated SpeechRecognitionAlternative alternatives = 1;
 
-  // [Output-only] If `false`, this `SpeechRecognitionResult` represents an
+  // [Output-only] If `false`, this `StreamingRecognitionResult` represents an
   // interim result that may change. If `true`, this is the final time the
-  // speech service will return this particular `SpeechRecognitionResult`,
+  // speech service will return this particular `StreamingRecognitionResult`,
   // the recognizer will not return any further hypotheses for this portion of
   // the transcript and corresponding audio.
   bool is_final = 2;
