1- // Copyright 2018 Google Inc.
1+ // Copyright 2017 Google Inc.
22//
33// Licensed under the Apache License, Version 2.0 (the "License");
44// you may not use this file except in compliance with the License.
1414
1515syntax = "proto3" ;
1616
17- package google.cloud.videointelligence.v1p1beta1 ;
17+ package google.cloud.videointelligence.v1 ;
1818
1919import "google/api/annotations.proto" ;
2020import "google/longrunning/operations.proto" ;
2121import "google/protobuf/duration.proto" ;
2222import "google/protobuf/timestamp.proto" ;
2323import "google/rpc/status.proto" ;
2424
25- option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P1Beta1 " ;
26- option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1p1beta1 ;videointelligence" ;
25+ option csharp_namespace = "Google.Cloud.VideoIntelligence.V1 " ;
26+ option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1 ;videointelligence" ;
2727option java_multiple_files = true ;
2828option java_outer_classname = "VideoIntelligenceServiceProto" ;
29- option java_package = "com.google.cloud.videointelligence.v1p1beta1 " ;
30- option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p1beta1 " ;
29+ option java_package = "com.google.cloud.videointelligence.v1 " ;
30+ option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1 " ;
3131
3232
3333// Service that implements Google Cloud Video Intelligence API.
@@ -37,10 +37,7 @@ service VideoIntelligenceService {
3737 // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
3838 // `Operation.response` contains `AnnotateVideoResponse` (results).
3939 rpc AnnotateVideo (AnnotateVideoRequest ) returns (google.longrunning.Operation ) {
40- option (google.api.http ) = {
41- post : "/v1p1beta1/videos:annotate"
42- body : "*"
43- };
40+ option (google.api.http ) = { post : "/v1/videos:annotate" body : "*" };
4441 }
4542}
4643
@@ -99,6 +96,9 @@ message VideoContext {
9996 // Config for EXPLICIT_CONTENT_DETECTION.
10097 ExplicitContentDetectionConfig explicit_content_detection_config = 4 ;
10198
99+ // Config for FACE_DETECTION.
100+ FaceDetectionConfig face_detection_config = 5 ;
101+
102102 // Config for SPEECH_TRANSCRIPTION.
103103 SpeechTranscriptionConfig speech_transcription_config = 6 ;
104104}
@@ -137,6 +137,17 @@ message ExplicitContentDetectionConfig {
137137 string model = 1 ;
138138}
139139
140+ // Config for FACE_DETECTION.
141+ message FaceDetectionConfig {
142+ // Model to use for face detection.
143+ // Supported values: "builtin/stable" (the default if unset) and
144+ // "builtin/latest".
145+ string model = 1 ;
146+
147+ // Whether bounding boxes should be included in the face annotation output.
148+ bool include_bounding_boxes = 2 ;
149+ }
150+
140151// Video segment.
141152message VideoSegment {
142153 // Time-offset, relative to the beginning of the video,
@@ -217,9 +228,56 @@ message ExplicitContentAnnotation {
217228 repeated ExplicitContentFrame frames = 1 ;
218229}
219230
231+ // Normalized bounding box.
232+ // The normalized vertex coordinates are relative to the original image.
233+ // Range: [0, 1].
234+ message NormalizedBoundingBox {
235+ // Left X coordinate.
236+ float left = 1 ;
237+
238+ // Top Y coordinate.
239+ float top = 2 ;
240+
241+ // Right X coordinate.
242+ float right = 3 ;
243+
244+ // Bottom Y coordinate.
245+ float bottom = 4 ;
246+ }
247+
248+ // Video segment level annotation results for face detection.
249+ message FaceSegment {
250+ // Video segment where a face was detected.
251+ VideoSegment segment = 1 ;
252+ }
253+
254+ // Video frame level annotation results for face detection.
255+ message FaceFrame {
256+ // Normalized bounding boxes in a frame.
257+ // There can be more than one box if the same face is detected in multiple
258+ // locations within the current frame.
259+ repeated NormalizedBoundingBox normalized_bounding_boxes = 1 ;
260+
261+ // Time-offset, relative to the beginning of the video,
262+ // corresponding to the video frame for this location.
263+ google.protobuf.Duration time_offset = 2 ;
264+ }
265+
266+ // Face annotation.
267+ message FaceAnnotation {
268+ // Thumbnail of a representative face view (in JPEG format).
269+ bytes thumbnail = 1 ;
270+
271+ // All video segments where a face was detected.
272+ repeated FaceSegment segments = 2 ;
273+
274+ // All video frames where a face was detected.
275+ repeated FaceFrame frames = 3 ;
276+ }
277+
220278// Annotation results for a single video.
221279message VideoAnnotationResults {
222- // Output only. Video file location in
280+ // Video file location in
223281 // [Google Cloud Storage](https://cloud.google.com/storage/).
224282 string input_uri = 1 ;
225283
@@ -235,6 +293,9 @@ message VideoAnnotationResults {
235293 // There is exactly one element for each unique label.
236294 repeated LabelAnnotation frame_label_annotations = 4 ;
237295
296+ // Face annotations. There is exactly one element for each unique face.
297+ repeated FaceAnnotation face_annotations = 5 ;
298+
238299 // Shot annotations. Each shot is represented as a video segment.
239300 repeated VideoSegment shot_annotations = 6 ;
240301
@@ -244,8 +305,8 @@ message VideoAnnotationResults {
244305 // Speech transcription.
245306 repeated SpeechTranscription speech_transcriptions = 11 ;
246307
247- // Output only. If set, indicates an error. Note that for a single
248- // `AnnotateVideoRequest` some videos may succeed and some may fail.
308+ // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
309+ // some videos may succeed and some may fail.
249310 google.rpc.Status error = 9 ;
250311}
251312
@@ -259,18 +320,18 @@ message AnnotateVideoResponse {
259320
260321// Annotation progress for a single video.
261322message VideoAnnotationProgress {
262- // Output only. Video file location in
323+ // Video file location in
263324 // [Google Cloud Storage](https://cloud.google.com/storage/).
264325 string input_uri = 1 ;
265326
266- // Output only. Approximate percentage processed thus far. Guaranteed to be
327+ // Approximate percentage processed thus far. Guaranteed to be
267328 // 100 when fully processed.
268329 int32 progress_percent = 2 ;
269330
270- // Output only. Time when the request was received.
331+ // Time when the request was received.
271332 google.protobuf.Timestamp start_time = 3 ;
272333
273- // Output only. Time of the most recent update.
334+ // Time of the most recent update.
274335 google.protobuf.Timestamp update_time = 4 ;
275336}
276337
@@ -293,7 +354,7 @@ message SpeechTranscriptionConfig {
293354
294355 // *Optional* Maximum number of recognition hypotheses to be returned.
295356 // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
296- // within each `SpeechRecognitionResult `. The server may return fewer than
357+ // within each `SpeechTranscription `. The server may return fewer than
297358 // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
298359 // return a maximum of one. If omitted, will return a maximum of one.
299360 int32 max_alternatives = 2 ;
@@ -318,6 +379,26 @@ message SpeechTranscriptionConfig {
318379 // *Optional* For file formats, such as MXF or MKV, supporting multiple audio
319380 // tracks, specify up to two tracks. Default: track 0.
320381 repeated int32 audio_tracks = 6 ;
382+
383+ // *Optional* If 'true', enables speaker detection for each recognized word in
384+ // the top alternative of the recognition result using a speaker_tag provided
385+ // in the WordInfo.
386+ // Note: When this is true, we send all the words from the beginning of the
387+ // audio for the top alternative in every consecutive response.
388+ // This is done in order to improve our speaker tags as our models learn to
389+ // identify the speakers in the conversation over time.
390+ bool enable_speaker_diarization = 7 ;
391+
392+ // *Optional*
393+ // If set, specifies the estimated number of speakers in the conversation.
394+ // If not set, defaults to '2'.
395+ // Ignored unless enable_speaker_diarization is set to true.
396+ int32 diarization_speaker_count = 8 ;
397+
398+ // *Optional* If `true`, the top result includes a list of words and the
399+ // confidence for those words. If `false`, no word-level confidence
400+ // information is returned. The default is `false`.
401+ bool enable_word_confidence = 9 ;
321402}
322403
323404// Provides "hints" to the speech recognizer to favor specific words and phrases
@@ -334,48 +415,68 @@ message SpeechContext {
334415
335416// A speech recognition result corresponding to a portion of the audio.
336417message SpeechTranscription {
337- // Output only. May contain one or more recognition hypotheses (up to the
338- // maximum specified in `max_alternatives`).
339- // These alternatives are ordered in terms of accuracy, with the top (first)
340- // alternative being the most probable, as ranked by the recognizer.
418+ // May contain one or more recognition hypotheses (up to the maximum specified
419+ // in `max_alternatives`). These alternatives are ordered in terms of
420+ // accuracy, with the top (first) alternative being the most probable, as
421+ // ranked by the recognizer.
341422 repeated SpeechRecognitionAlternative alternatives = 1 ;
423+
424+ // Output only. The
425+ // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
426+ // language in this result. This language code was detected to have the most
427+ // likelihood of being spoken in the audio.
428+ string language_code = 2 ;
342429}
343430
344431// Alternative hypotheses (a.k.a. n-best list).
345432message SpeechRecognitionAlternative {
346- // Output only. Transcript text representing the words that the user spoke.
433+ // Transcript text representing the words that the user spoke.
347434 string transcript = 1 ;
348435
349- // Output only. The confidence estimate between 0.0 and 1.0. A higher number
436+ // The confidence estimate between 0.0 and 1.0. A higher number
350437 // indicates an estimated greater likelihood that the recognized words are
351438 // correct. This field is typically provided only for the top hypothesis, and
352439 // only for `is_final=true` results. Clients should not rely on the
353440 // `confidence` field as it is not guaranteed to be accurate or consistent.
354441 // The default of 0.0 is a sentinel value indicating `confidence` was not set.
355442 float confidence = 2 ;
356443
357- // Output only. A list of word-specific information for each recognized word.
444+ // A list of word-specific information for each recognized word.
358445 repeated WordInfo words = 3 ;
359446}
360447
361448// Word-specific information for recognized words. Word information is only
362449// included in the response when certain request parameters are set, such
363450// as `enable_word_time_offsets`.
364451message WordInfo {
365- // Output only. Time offset relative to the beginning of the audio, and
452+ // Time offset relative to the beginning of the audio, and
366453 // corresponding to the start of the spoken word. This field is only set if
367454 // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
368455 // experimental feature and the accuracy of the time offset can vary.
369456 google.protobuf.Duration start_time = 1 ;
370457
371- // Output only. Time offset relative to the beginning of the audio, and
458+ // Time offset relative to the beginning of the audio, and
372459 // corresponding to the end of the spoken word. This field is only set if
373460 // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
374461 // experimental feature and the accuracy of the time offset can vary.
375462 google.protobuf.Duration end_time = 2 ;
376463
377- // Output only. The word corresponding to this set of information.
464+ // The word corresponding to this set of information.
378465 string word = 3 ;
466+
467+ // Output only. The confidence estimate between 0.0 and 1.0. A higher number
468+ // indicates an estimated greater likelihood that the recognized words are
469+ // correct. This field is set only for the top alternative.
470+ // This field is not guaranteed to be accurate and users should not rely on it
471+ // to be always provided.
472+ // The default of 0.0 is a sentinel value indicating `confidence` was not set.
473+ float confidence = 4 ;
474+
475+ // Output only. A distinct integer value is assigned for every speaker within
476+ // the audio. This field specifies which one of those speakers was detected to
477+ // have spoken this word. Value ranges from 1 up to diarization_speaker_count,
478+ // and is only set if speaker diarization is enabled.
479+ int32 speaker_tag = 5 ;
379480}
380481
381482// Video annotation feature.
@@ -392,6 +493,9 @@ enum Feature {
392493 // Explicit content detection.
393494 EXPLICIT_CONTENT_DETECTION = 3 ;
394495
496+ // Human face detection and tracking.
497+ FACE_DETECTION = 4 ;
498+
395499 // Speech transcription.
396500 SPEECH_TRANSCRIPTION = 6 ;
397501}
0 commit comments