1- // Copyright 2018 Google LLC.
1+ // Copyright 2019 Google LLC.
22//
33// Licensed under the Apache License, Version 2.0 (the "License");
44// you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@ import "google/longrunning/operations.proto";
2222import "google/protobuf/duration.proto" ;
2323import "google/protobuf/timestamp.proto" ;
2424import "google/rpc/status.proto" ;
25+ import "google/api/client.proto" ;
2526
2627option csharp_namespace = "Google.Cloud.VideoIntelligence.V1" ;
2728option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1;videointelligence" ;
@@ -33,12 +34,14 @@ option ruby_package = "Google::Cloud::VideoIntelligence::V1";
3334
3435// Service that implements Google Cloud Video Intelligence API.
3536service VideoIntelligenceService {
37+ option (google.api.default_host ) = "videointelligence.googleapis.com" ;
38+ option (google.api.oauth_scopes ) = "https://www.googleapis.com/auth/cloud-platform" ;
39+
3640 // Performs asynchronous video annotation. Progress and results can be
3741 // retrieved through the `google.longrunning.Operations` interface.
3842 // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
3943 // `Operation.response` contains `AnnotateVideoResponse` (results).
40- rpc AnnotateVideo (AnnotateVideoRequest )
41- returns (google.longrunning.Operation ) {
44+ rpc AnnotateVideo (AnnotateVideoRequest ) returns (google.longrunning.Operation ) {
4245 option (google.api.http ) = {
4346 post : "/v1/videos:annotate"
4447 body : "*"
@@ -52,10 +55,10 @@ message AnnotateVideoRequest {
5255 // [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
5356 // supported, which must be specified in the following format:
5457 // `gs://bucket-id/object-id` (other URI formats return
55- // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
56- // more information, see [Request URIs](/storage/docs/reference-uris). A video
57- // URI may include wildcards in `object-id`, and thus identify multiple
58- // videos. Supported wildcards: '*' to match 0 or more characters;
58+ // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
59+ // [Request URIs](/storage/docs/reference-uris).
60+ // A video URI may include wildcards in `object-id`, and thus identify
61+ // multiple videos. Supported wildcards: '*' to match 0 or more characters;
5962 // '?' to match 1 character. If unset, the input video should be embedded
6063 // in the request as `input_content`. If set, `input_content` should be unset.
6164 string input_uri = 1 ;
@@ -75,8 +78,8 @@ message AnnotateVideoRequest {
7578 // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
7679 // URIs are supported, which must be specified in the following format:
7780 // `gs://bucket-id/object-id` (other URI formats return
78- // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
79- // more information, see [Request URIs](/storage/docs/reference-uris).
81+ // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
82+ // [Request URIs](/storage/docs/reference-uris).
8083 string output_uri = 4 ;
8184
8285 // Optional cloud region where annotation should take place. Supported cloud
@@ -101,9 +104,6 @@ message VideoContext {
101104 // Config for EXPLICIT_CONTENT_DETECTION.
102105 ExplicitContentDetectionConfig explicit_content_detection_config = 4 ;
103106
104- // Config for FACE_DETECTION.
105- FaceDetectionConfig face_detection_config = 5 ;
106-
107107 // Config for SPEECH_TRANSCRIPTION.
108108 SpeechTranscriptionConfig speech_transcription_config = 6 ;
109109
@@ -114,6 +114,66 @@ message VideoContext {
114114 ObjectTrackingConfig object_tracking_config = 13 ;
115115}
116116
117+ // Video annotation feature. Each value identifies one type of analysis.
118+ enum Feature {
119+ // Unspecified. This is the default (zero) value.
120+ FEATURE_UNSPECIFIED = 0 ;
121+
122+ // Label detection. Detects objects, such as a dog or a flower.
123+ LABEL_DETECTION = 1 ;
124+
125+ // Shot change detection.
126+ SHOT_CHANGE_DETECTION = 2 ;
127+
128+ // Explicit content detection.
129+ EXPLICIT_CONTENT_DETECTION = 3 ;
130+
131+ // Speech transcription. NOTE(review): value 4 (formerly FACE_DETECTION) is unused; consider `reserved 4;`.
132+ SPEECH_TRANSCRIPTION = 6 ;
133+
134+ // OCR text detection and tracking.
135+ TEXT_DETECTION = 7 ;
136+
137+ // Object detection and tracking.
138+ OBJECT_TRACKING = 9 ;
139+ }
140+
141+ // Label detection mode: the granularity at which labels are detected.
142+ enum LabelDetectionMode {
143+ // Unspecified. This is the default (zero) value.
144+ LABEL_DETECTION_MODE_UNSPECIFIED = 0 ;
145+
146+ // Detect shot-level labels.
147+ SHOT_MODE = 1 ;
148+
149+ // Detect frame-level labels.
150+ FRAME_MODE = 2 ;
151+
152+ // Detect both shot-level and frame-level labels.
153+ SHOT_AND_FRAME_MODE = 3 ;
154+ }
155+
156+ // Bucketized representation of likelihood, ordered from least to most likely.
157+ enum Likelihood {
158+ // Unspecified likelihood. This is the default (zero) value.
159+ LIKELIHOOD_UNSPECIFIED = 0 ;
160+
161+ // Very unlikely.
162+ VERY_UNLIKELY = 1 ;
163+
164+ // Unlikely.
165+ UNLIKELY = 2 ;
166+
167+ // Possible.
168+ POSSIBLE = 3 ;
169+
170+ // Likely.
171+ LIKELY = 4 ;
172+
173+ // Very likely.
174+ VERY_LIKELY = 5 ;
175+ }
176+
117177// Config for LABEL_DETECTION.
118178message LabelDetectionConfig {
119179 // What labels should be detected with LABEL_DETECTION, in addition to
@@ -156,28 +216,17 @@ message ShotChangeDetectionConfig {
156216 string model = 1 ;
157217}
158218
159- // Config for EXPLICIT_CONTENT_DETECTION.
160- message ExplicitContentDetectionConfig {
161- // Model to use for explicit content detection.
162- // Supported values: "builtin/stable" (the default if unset) and
163- // "builtin/latest".
164- string model = 1 ;
165- }
166-
167- // Config for FACE_DETECTION.
168- message FaceDetectionConfig {
169- // Model to use for face detection.
219+ // Config for OBJECT_TRACKING.
220+ message ObjectTrackingConfig {
221+ // Model to use for object tracking.
170222 // Supported values: "builtin/stable" (the default if unset) and
171223 // "builtin/latest".
172224 string model = 1 ;
173-
174- // Whether bounding boxes be included in the face annotation output.
175- bool include_bounding_boxes = 2 ;
176225}
177226
178- // Config for OBJECT_TRACKING .
179- message ObjectTrackingConfig {
180- // Model to use for object tracking .
227+ // Config for EXPLICIT_CONTENT_DETECTION.
228+ message ExplicitContentDetectionConfig {
229+ // Model to use for explicit content detection.
181230 // Supported values: "builtin/stable" (the default if unset) and
182231 // "builtin/latest".
183232 string model = 1 ;
@@ -295,57 +344,24 @@ message NormalizedBoundingBox {
295344 float bottom = 4 ;
296345}
297346
298- // Video segment level annotation results for face detection.
299- message FaceSegment {
300- // Video segment where a face was detected.
301- VideoSegment segment = 1 ;
302- }
303-
304- // Video frame level annotation results for face detection.
305- message FaceFrame {
306- // Normalized Bounding boxes in a frame.
307- // There can be more than one boxes if the same face is detected in multiple
308- // locations within the current frame.
309- repeated NormalizedBoundingBox normalized_bounding_boxes = 1 ;
310-
311- // Time-offset, relative to the beginning of the video,
312- // corresponding to the video frame for this location.
313- google.protobuf.Duration time_offset = 2 ;
314- }
315-
316- // Face annotation.
317- message FaceAnnotation {
318- // Thumbnail of a representative face view (in JPEG format).
319- bytes thumbnail = 1 ;
320-
321- // All video segments where a face was detected.
322- repeated FaceSegment segments = 2 ;
323-
324- // All video frames where a face was detected.
325- repeated FaceFrame frames = 3 ;
326- }
327-
328347// Annotation results for a single video.
329348message VideoAnnotationResults {
330349 // Video file location in
331350 // [Google Cloud Storage](https://cloud.google.com/storage/).
332351 string input_uri = 1 ;
333352
334- // Label annotations on video level or user specified segment level.
353+ // Topical label annotations on video level or user specified segment level.
335354 // There is exactly one element for each unique label.
336355 repeated LabelAnnotation segment_label_annotations = 2 ;
337356
338- // Label annotations on shot level.
357+ // Topical label annotations on shot level.
339358 // There is exactly one element for each unique label.
340359 repeated LabelAnnotation shot_label_annotations = 3 ;
341360
342361 // Label annotations on frame level.
343362 // There is exactly one element for each unique label.
344363 repeated LabelAnnotation frame_label_annotations = 4 ;
345364
346- // Face annotations. There is exactly one element for each unique face.
347- repeated FaceAnnotation face_annotations = 5 ;
348-
349365 // Shot annotations. Each shot is represented as a video segment.
350366 repeated VideoSegment shot_annotations = 6 ;
351367
@@ -391,6 +407,14 @@ message VideoAnnotationProgress {
391407
392408 // Time of the most recent update.
393409 google.protobuf.Timestamp update_time = 4 ;
410+
411+ // Specifies which feature is being tracked if the request contains more than
412+ // one feature.
413+ Feature feature = 5 ;
414+
415+ // Specifies which segment is being tracked if the request contains more than
416+ // one segment.
417+ VideoSegment segment = 6 ;
394418}
395419
396420// Video annotation progress. Included in the `metadata`
@@ -491,15 +515,17 @@ message SpeechRecognitionAlternative {
491515 // Transcript text representing the words that the user spoke.
492516 string transcript = 1 ;
493517
494- // The confidence estimate between 0.0 and 1.0. A higher number
518+ // Output only. The confidence estimate between 0.0 and 1.0. A higher number
495519 // indicates an estimated greater likelihood that the recognized words are
496- // correct. This field is typically provided only for the top hypothesis, and
497- // only for `is_final=true` results. Clients should not rely on the
498- // `confidence` field as it is not guaranteed to be accurate or consistent .
520+ // correct. This field is set only for the top alternative.
521+ // This field is not guaranteed to be accurate, and users should not rely on
522+ // it always being provided.
499523 // The default of 0.0 is a sentinel value indicating `confidence` was not set.
500524 float confidence = 2 ;
501525
502- // A list of word-specific information for each recognized word.
526+ // Output only. A list of word-specific information for each recognized word.
527+ // Note: When `enable_speaker_diarization` is true, you will see all the words
528+ // from the beginning of the audio.
503529 repeated WordInfo words = 3 ;
504530}
505531
@@ -645,66 +671,3 @@ message ObjectTrackingAnnotation {
645671 // Streaming mode: it can only be one ObjectTrackingFrame message in frames.
646672 repeated ObjectTrackingFrame frames = 2 ;
647673}
648-
649- // Video annotation feature.
650- enum Feature {
651- // Unspecified.
652- FEATURE_UNSPECIFIED = 0 ;
653-
654- // Label detection. Detect objects, such as dog or flower.
655- LABEL_DETECTION = 1 ;
656-
657- // Shot change detection.
658- SHOT_CHANGE_DETECTION = 2 ;
659-
660- // Explicit content detection.
661- EXPLICIT_CONTENT_DETECTION = 3 ;
662-
663- // Human face detection and tracking.
664- FACE_DETECTION = 4 ;
665-
666- // Speech transcription.
667- SPEECH_TRANSCRIPTION = 6 ;
668-
669- // OCR text detection and tracking.
670- TEXT_DETECTION = 7 ;
671-
672- // Object detection and tracking.
673- OBJECT_TRACKING = 9 ;
674- }
675-
676- // Label detection mode.
677- enum LabelDetectionMode {
678- // Unspecified.
679- LABEL_DETECTION_MODE_UNSPECIFIED = 0 ;
680-
681- // Detect shot-level labels.
682- SHOT_MODE = 1 ;
683-
684- // Detect frame-level labels.
685- FRAME_MODE = 2 ;
686-
687- // Detect both shot-level and frame-level labels.
688- SHOT_AND_FRAME_MODE = 3 ;
689- }
690-
691- // Bucketized representation of likelihood.
692- enum Likelihood {
693- // Unspecified likelihood.
694- LIKELIHOOD_UNSPECIFIED = 0 ;
695-
696- // Very unlikely.
697- VERY_UNLIKELY = 1 ;
698-
699- // Unlikely.
700- UNLIKELY = 2 ;
701-
702- // Possible.
703- POSSIBLE = 3 ;
704-
705- // Likely.
706- LIKELY = 4 ;
707-
708- // Very likely.
709- VERY_LIKELY = 5 ;
710- }
0 commit comments