1- // Copyright 2018 Google Inc.
1+ // Copyright 2017 Google Inc.
22//
33// Licensed under the Apache License, Version 2.0 (the "License");
44// you may not use this file except in compliance with the License.
1414
1515syntax = "proto3" ;
1616
17- package google.cloud.videointelligence.v1p1beta1 ;
17+ package google.cloud.videointelligence.v1 ;
1818
1919import "google/api/annotations.proto" ;
2020import "google/longrunning/operations.proto" ;
2121import "google/protobuf/duration.proto" ;
2222import "google/protobuf/timestamp.proto" ;
2323import "google/rpc/status.proto" ;
2424
25- option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P1Beta1 " ;
26- option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1p1beta1 ;videointelligence" ;
25+ option csharp_namespace = "Google.Cloud.VideoIntelligence.V1 " ;
26+ option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1 ;videointelligence" ;
2727option java_multiple_files = true ;
2828option java_outer_classname = "VideoIntelligenceServiceProto" ;
29- option java_package = "com.google.cloud.videointelligence.v1p1beta1 " ;
30- option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p1beta1 " ;
29+ option java_package = "com.google.cloud.videointelligence.v1 " ;
30+ option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1 " ;
3131
3232
3333// Service that implements Google Cloud Video Intelligence API.
@@ -37,10 +37,7 @@ service VideoIntelligenceService {
3737 // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
3838 // `Operation.response` contains `AnnotateVideoResponse` (results).
3939 rpc AnnotateVideo (AnnotateVideoRequest ) returns (google.longrunning.Operation ) {
40- option (google.api.http ) = {
41- post : "/v1p1beta1/videos:annotate"
42- body : "*"
43- };
40+ option (google.api.http ) = { post : "/v1/videos:annotate" body : "*" };
4441 }
4542}
4643
@@ -99,6 +96,9 @@ message VideoContext {
9996 // Config for EXPLICIT_CONTENT_DETECTION.
10097 ExplicitContentDetectionConfig explicit_content_detection_config = 4 ;
10198
99+ // Config for FACE_DETECTION.
100+ FaceDetectionConfig face_detection_config = 5 ;
101+
102102 // Config for SPEECH_TRANSCRIPTION.
103103 SpeechTranscriptionConfig speech_transcription_config = 6 ;
104104}
@@ -137,6 +137,17 @@ message ExplicitContentDetectionConfig {
137137 string model = 1 ;
138138}
139139
140+ // Config for FACE_DETECTION.
141+ message FaceDetectionConfig {
142+ // Model to use for face detection.
143+ // Supported values: "builtin/stable" (the default if unset) and
144+ // "builtin/latest".
145+ string model = 1 ;
146+
147+ // Whether bounding boxes should be included in the face annotation output.
148+ bool include_bounding_boxes = 2 ;
149+ }
150+
140151// Video segment.
141152message VideoSegment {
142153 // Time-offset, relative to the beginning of the video,
@@ -217,9 +228,56 @@ message ExplicitContentAnnotation {
217228 repeated ExplicitContentFrame frames = 1 ;
218229}
219230
231+ // Normalized bounding box.
232+ // The normalized vertex coordinates are relative to the original image.
233+ // Range: [0, 1].
234+ message NormalizedBoundingBox {
235+ // Left X coordinate.
236+ float left = 1 ;
237+
238+ // Top Y coordinate.
239+ float top = 2 ;
240+
241+ // Right X coordinate.
242+ float right = 3 ;
243+
244+ // Bottom Y coordinate.
245+ float bottom = 4 ;
246+ }
247+
248+ // Video segment level annotation results for face detection.
249+ message FaceSegment {
250+ // Video segment where a face was detected.
251+ VideoSegment segment = 1 ;
252+ }
253+
254+ // Video frame level annotation results for face detection.
255+ message FaceFrame {
256+ // Normalized bounding boxes in a frame.
257+ // There can be more than one box if the same face is detected in multiple
258+ // locations within the current frame.
259+ repeated NormalizedBoundingBox normalized_bounding_boxes = 1 ;
260+
261+ // Time-offset, relative to the beginning of the video,
262+ // corresponding to the video frame for this location.
263+ google.protobuf.Duration time_offset = 2 ;
264+ }
265+
266+ // Face annotation.
267+ message FaceAnnotation {
268+ // Thumbnail of a representative face view (in JPEG format).
269+ bytes thumbnail = 1 ;
270+
271+ // All video segments where a face was detected.
272+ repeated FaceSegment segments = 2 ;
273+
274+ // All video frames where a face was detected.
275+ repeated FaceFrame frames = 3 ;
276+ }
277+
220278// Annotation results for a single video.
221279message VideoAnnotationResults {
222- // Output only. Video file location in
280+ // Video file location in
223281 // [Google Cloud Storage](https://cloud.google.com/storage/).
224282 string input_uri = 1 ;
225283
@@ -235,6 +293,9 @@ message VideoAnnotationResults {
235293 // There is exactly one element for each unique label.
236294 repeated LabelAnnotation frame_label_annotations = 4 ;
237295
296+ // Face annotations. There is exactly one element for each unique face.
297+ repeated FaceAnnotation face_annotations = 5 ;
298+
238299 // Shot annotations. Each shot is represented as a video segment.
239300 repeated VideoSegment shot_annotations = 6 ;
240301
@@ -244,8 +305,8 @@ message VideoAnnotationResults {
244305 // Speech transcription.
245306 repeated SpeechTranscription speech_transcriptions = 11 ;
246307
247- // Output only. If set, indicates an error. Note that for a single
248- // `AnnotateVideoRequest` some videos may succeed and some may fail.
308+ // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
309+ // some videos may succeed and some may fail.
249310 google.rpc.Status error = 9 ;
250311}
251312
@@ -259,18 +320,18 @@ message AnnotateVideoResponse {
259320
260321// Annotation progress for a single video.
261322message VideoAnnotationProgress {
262- // Output only. Video file location in
323+ // Video file location in
263324 // [Google Cloud Storage](https://cloud.google.com/storage/).
264325 string input_uri = 1 ;
265326
266- // Output only. Approximate percentage processed thus far. Guaranteed to be
327+ // Approximate percentage processed thus far. Guaranteed to be
267328 // 100 when fully processed.
268329 int32 progress_percent = 2 ;
269330
270- // Output only. Time when the request was received.
331+ // Time when the request was received.
271332 google.protobuf.Timestamp start_time = 3 ;
272333
273- // Output only. Time of the most recent update.
334+ // Time of the most recent update.
274335 google.protobuf.Timestamp update_time = 4 ;
275336}
276337
@@ -293,7 +354,7 @@ message SpeechTranscriptionConfig {
293354
294355 // *Optional* Maximum number of recognition hypotheses to be returned.
295356 // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
296- // within each `SpeechRecognitionResult `. The server may return fewer than
357+ // within each `SpeechTranscription `. The server may return fewer than
297358 // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
298359 // return a maximum of one. If omitted, will return a maximum of one.
299360 int32 max_alternatives = 2 ;
@@ -318,6 +379,26 @@ message SpeechTranscriptionConfig {
318379 // *Optional* For file formats, such as MXF or MKV, supporting multiple audio
319380 // tracks, specify up to two tracks. Default: track 0.
320381 repeated int32 audio_tracks = 6 ;
382+
383+ // *Optional* If 'true', enables speaker detection for each recognized word in
384+ // the top alternative of the recognition result using a speaker_tag provided
385+ // in the WordInfo.
386+ // Note: When this is true, we send all the words from the beginning of the
387+ // audio for the top alternative in every consecutive response.
388+ // This is done in order to improve our speaker tags as our models learn to
389+ // identify the speakers in the conversation over time.
390+ bool enable_speaker_diarization = 7 ;
391+
392+ // *Optional*
393+ // If set, specifies the estimated number of speakers in the conversation.
394+ // If not set, defaults to '2'.
395+ // Ignored unless enable_speaker_diarization is set to true.
396+ int32 diarization_speaker_count = 8 ;
397+
398+ // *Optional* If `true`, the top result includes a list of words and the
399+ // confidence for those words. If `false`, no word-level confidence
400+ // information is returned. The default is `false`.
401+ bool enable_word_confidence = 9 ;
321402}
322403
323404// Provides "hints" to the speech recognizer to favor specific words and phrases
@@ -334,48 +415,68 @@ message SpeechContext {
334415
335416// A speech recognition result corresponding to a portion of the audio.
336417message SpeechTranscription {
337- // Output only. May contain one or more recognition hypotheses (up to the
338- // maximum specified in `max_alternatives`).
339- // These alternatives are ordered in terms of accuracy, with the top (first)
340- // alternative being the most probable, as ranked by the recognizer.
418+ // May contain one or more recognition hypotheses (up to the maximum specified
419+ // in `max_alternatives`). These alternatives are ordered in terms of
420+ // accuracy, with the top (first) alternative being the most probable, as
421+ // ranked by the recognizer.
341422 repeated SpeechRecognitionAlternative alternatives = 1 ;
423+
424+ // Output only. The
425+ // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
426+ // language in this result. This language code was detected to have the most
427+ // likelihood of being spoken in the audio.
428+ string language_code = 2 ;
342429}
343430
344431// Alternative hypotheses (a.k.a. n-best list).
345432message SpeechRecognitionAlternative {
346- // Output only. Transcript text representing the words that the user spoke.
433+ // Transcript text representing the words that the user spoke.
347434 string transcript = 1 ;
348435
349- // Output only. The confidence estimate between 0.0 and 1.0. A higher number
436+ // The confidence estimate between 0.0 and 1.0. A higher number
350437 // indicates an estimated greater likelihood that the recognized words are
351438 // correct. This field is typically provided only for the top hypothesis, and
352439 // only for `is_final=true` results. Clients should not rely on the
353440 // `confidence` field as it is not guaranteed to be accurate or consistent.
354441 // The default of 0.0 is a sentinel value indicating `confidence` was not set.
355442 float confidence = 2 ;
356443
357- // Output only. A list of word-specific information for each recognized word.
444+ // A list of word-specific information for each recognized word.
358445 repeated WordInfo words = 3 ;
359446}
360447
361448// Word-specific information for recognized words. Word information is only
362449// included in the response when certain request parameters are set, such
363450// as `enable_word_time_offsets`.
364451message WordInfo {
365- // Output only. Time offset relative to the beginning of the audio, and
452+ // Time offset relative to the beginning of the audio, and
366453 // corresponding to the start of the spoken word. This field is only set if
367454 // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
368455 // experimental feature and the accuracy of the time offset can vary.
369456 google.protobuf.Duration start_time = 1 ;
370457
371- // Output only. Time offset relative to the beginning of the audio, and
458+ // Time offset relative to the beginning of the audio, and
372459 // corresponding to the end of the spoken word. This field is only set if
373460 // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
374461 // experimental feature and the accuracy of the time offset can vary.
375462 google.protobuf.Duration end_time = 2 ;
376463
377- // Output only. The word corresponding to this set of information.
464+ // The word corresponding to this set of information.
378465 string word = 3 ;
466+
467+ // Output only. The confidence estimate between 0.0 and 1.0. A higher number
468+ // indicates an estimated greater likelihood that the recognized words are
469+ // correct. This field is set only for the top alternative.
470+ // This field is not guaranteed to be accurate and users should not rely on it
471+ // to be always provided.
472+ // The default of 0.0 is a sentinel value indicating `confidence` was not set.
473+ float confidence = 4 ;
474+
475+ // Output only. A distinct integer value is assigned for every speaker within
476+ // the audio. This field specifies which one of those speakers was detected to
477+ // have spoken this word. Value ranges from 1 up to diarization_speaker_count,
478+ // and is only set if speaker diarization is enabled.
479+ int32 speaker_tag = 5 ;
379480}
380481
381482// Video annotation feature.
@@ -392,6 +493,9 @@ enum Feature {
392493 // Explicit content detection.
393494 EXPLICIT_CONTENT_DETECTION = 3 ;
394495
496+ // Human face detection and tracking.
497+ FACE_DETECTION = 4 ;
498+
395499 // Speech transcription.
396500 SPEECH_TRANSCRIPTION = 6 ;
397501}
0 commit comments