Skip to content

Commit 20e40e6

Browse files
yoshi-automation and crwilcox
authored and committed
Copy proto files alongside protoc versions.
1 parent 9b77e4e commit 20e40e6

File tree

5 files changed

+497
-494
lines changed

5 files changed

+497
-494
lines changed

videointelligence/google/cloud/videointelligence_v1/proto/video_intelligence.proto

Lines changed: 132 additions & 28 deletions
Original file line number · Diff line number · Diff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2018 Google Inc.
1+
// Copyright 2017 Google Inc.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -14,20 +14,20 @@
1414

1515
syntax = "proto3";
1616

17-
package google.cloud.videointelligence.v1p1beta1;
17+
package google.cloud.videointelligence.v1;
1818

1919
import "google/api/annotations.proto";
2020
import "google/longrunning/operations.proto";
2121
import "google/protobuf/duration.proto";
2222
import "google/protobuf/timestamp.proto";
2323
import "google/rpc/status.proto";
2424

25-
option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P1Beta1";
26-
option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1p1beta1;videointelligence";
25+
option csharp_namespace = "Google.Cloud.VideoIntelligence.V1";
26+
option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1;videointelligence";
2727
option java_multiple_files = true;
2828
option java_outer_classname = "VideoIntelligenceServiceProto";
29-
option java_package = "com.google.cloud.videointelligence.v1p1beta1";
30-
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p1beta1";
29+
option java_package = "com.google.cloud.videointelligence.v1";
30+
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1";
3131

3232

3333
// Service that implements Google Cloud Video Intelligence API.
@@ -37,10 +37,7 @@ service VideoIntelligenceService {
3737
// `Operation.metadata` contains `AnnotateVideoProgress` (progress).
3838
// `Operation.response` contains `AnnotateVideoResponse` (results).
3939
rpc AnnotateVideo(AnnotateVideoRequest) returns (google.longrunning.Operation) {
40-
option (google.api.http) = {
41-
post: "/v1p1beta1/videos:annotate"
42-
body: "*"
43-
};
40+
option (google.api.http) = { post: "/v1/videos:annotate" body: "*" };
4441
}
4542
}
4643

@@ -99,6 +96,9 @@ message VideoContext {
9996
// Config for EXPLICIT_CONTENT_DETECTION.
10097
ExplicitContentDetectionConfig explicit_content_detection_config = 4;
10198

99+
// Config for FACE_DETECTION.
100+
FaceDetectionConfig face_detection_config = 5;
101+
102102
// Config for SPEECH_TRANSCRIPTION.
103103
SpeechTranscriptionConfig speech_transcription_config = 6;
104104
}
@@ -137,6 +137,17 @@ message ExplicitContentDetectionConfig {
137137
string model = 1;
138138
}
139139

140+
// Config for FACE_DETECTION.
141+
message FaceDetectionConfig {
142+
// Model to use for face detection.
143+
// Supported values: "builtin/stable" (the default if unset) and
144+
// "builtin/latest".
145+
string model = 1;
146+
147+
// Whether bounding boxes are included in the face annotation output.
148+
bool include_bounding_boxes = 2;
149+
}
150+
140151
// Video segment.
141152
message VideoSegment {
142153
// Time-offset, relative to the beginning of the video,
@@ -217,9 +228,56 @@ message ExplicitContentAnnotation {
217228
repeated ExplicitContentFrame frames = 1;
218229
}
219230

231+
// Normalized bounding box.
232+
// The normalized vertex coordinates are relative to the original image.
233+
// Range: [0, 1].
234+
message NormalizedBoundingBox {
235+
// Left X coordinate.
236+
float left = 1;
237+
238+
// Top Y coordinate.
239+
float top = 2;
240+
241+
// Right X coordinate.
242+
float right = 3;
243+
244+
// Bottom Y coordinate.
245+
float bottom = 4;
246+
}
247+
248+
// Video segment level annotation results for face detection.
249+
message FaceSegment {
250+
// Video segment where a face was detected.
251+
VideoSegment segment = 1;
252+
}
253+
254+
// Video frame level annotation results for face detection.
255+
message FaceFrame {
256+
// Normalized bounding boxes in a frame.
257+
// There can be more than one box if the same face is detected in multiple
258+
// locations within the current frame.
259+
repeated NormalizedBoundingBox normalized_bounding_boxes = 1;
260+
261+
// Time-offset, relative to the beginning of the video,
262+
// corresponding to the video frame for this location.
263+
google.protobuf.Duration time_offset = 2;
264+
}
265+
266+
// Face annotation.
267+
message FaceAnnotation {
268+
// Thumbnail of a representative face view (in JPEG format).
269+
bytes thumbnail = 1;
270+
271+
// All video segments where a face was detected.
272+
repeated FaceSegment segments = 2;
273+
274+
// All video frames where a face was detected.
275+
repeated FaceFrame frames = 3;
276+
}
277+
220278
// Annotation results for a single video.
221279
message VideoAnnotationResults {
222-
// Output only. Video file location in
280+
// Video file location in
223281
// [Google Cloud Storage](https://cloud.google.com/storage/).
224282
string input_uri = 1;
225283

@@ -235,6 +293,9 @@ message VideoAnnotationResults {
235293
// There is exactly one element for each unique label.
236294
repeated LabelAnnotation frame_label_annotations = 4;
237295

296+
// Face annotations. There is exactly one element for each unique face.
297+
repeated FaceAnnotation face_annotations = 5;
298+
238299
// Shot annotations. Each shot is represented as a video segment.
239300
repeated VideoSegment shot_annotations = 6;
240301

@@ -244,8 +305,8 @@ message VideoAnnotationResults {
244305
// Speech transcription.
245306
repeated SpeechTranscription speech_transcriptions = 11;
246307

247-
// Output only. If set, indicates an error. Note that for a single
248-
// `AnnotateVideoRequest` some videos may succeed and some may fail.
308+
// If set, indicates an error. Note that for a single `AnnotateVideoRequest`
309+
// some videos may succeed and some may fail.
249310
google.rpc.Status error = 9;
250311
}
251312

@@ -259,18 +320,18 @@ message AnnotateVideoResponse {
259320

260321
// Annotation progress for a single video.
261322
message VideoAnnotationProgress {
262-
// Output only. Video file location in
323+
// Video file location in
263324
// [Google Cloud Storage](https://cloud.google.com/storage/).
264325
string input_uri = 1;
265326

266-
// Output only. Approximate percentage processed thus far. Guaranteed to be
327+
// Approximate percentage processed thus far. Guaranteed to be
267328
// 100 when fully processed.
268329
int32 progress_percent = 2;
269330

270-
// Output only. Time when the request was received.
331+
// Time when the request was received.
271332
google.protobuf.Timestamp start_time = 3;
272333

273-
// Output only. Time of the most recent update.
334+
// Time of the most recent update.
274335
google.protobuf.Timestamp update_time = 4;
275336
}
276337

@@ -293,7 +354,7 @@ message SpeechTranscriptionConfig {
293354

294355
// *Optional* Maximum number of recognition hypotheses to be returned.
295356
// Specifically, the maximum number of `SpeechRecognitionAlternative` messages
296-
// within each `SpeechRecognitionResult`. The server may return fewer than
357+
// within each `SpeechTranscription`. The server may return fewer than
297358
// `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
298359
// return a maximum of one. If omitted, will return a maximum of one.
299360
int32 max_alternatives = 2;
@@ -318,6 +379,26 @@ message SpeechTranscriptionConfig {
318379
// *Optional* For file formats, such as MXF or MKV, supporting multiple audio
319380
// tracks, specify up to two tracks. Default: track 0.
320381
repeated int32 audio_tracks = 6;
382+
383+
// *Optional* If 'true', enables speaker detection for each recognized word in
384+
// the top alternative of the recognition result using a speaker_tag provided
385+
// in the WordInfo.
386+
// Note: When this is true, we send all the words from the beginning of the
387+
// audio for the top alternative in every consecutive response.
388+
// This is done in order to improve our speaker tags as our models learn to
389+
// identify the speakers in the conversation over time.
390+
bool enable_speaker_diarization = 7;
391+
392+
// *Optional*
393+
// If set, specifies the estimated number of speakers in the conversation.
394+
// If not set, defaults to '2'.
395+
// Ignored unless enable_speaker_diarization is set to true.
396+
int32 diarization_speaker_count = 8;
397+
398+
// *Optional* If `true`, the top result includes a list of words and the
399+
// confidence for those words. If `false`, no word-level confidence
400+
// information is returned. The default is `false`.
401+
bool enable_word_confidence = 9;
321402
}
322403

323404
// Provides "hints" to the speech recognizer to favor specific words and phrases
@@ -334,48 +415,68 @@ message SpeechContext {
334415

335416
// A speech recognition result corresponding to a portion of the audio.
336417
message SpeechTranscription {
337-
// Output only. May contain one or more recognition hypotheses (up to the
338-
// maximum specified in `max_alternatives`).
339-
// These alternatives are ordered in terms of accuracy, with the top (first)
340-
// alternative being the most probable, as ranked by the recognizer.
418+
// May contain one or more recognition hypotheses (up to the maximum specified
419+
// in `max_alternatives`). These alternatives are ordered in terms of
420+
// accuracy, with the top (first) alternative being the most probable, as
421+
// ranked by the recognizer.
341422
repeated SpeechRecognitionAlternative alternatives = 1;
423+
424+
// Output only. The
425+
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
426+
// language in this result. This language code was detected to have the most
427+
// likelihood of being spoken in the audio.
428+
string language_code = 2;
342429
}
343430

344431
// Alternative hypotheses (a.k.a. n-best list).
345432
message SpeechRecognitionAlternative {
346-
// Output only. Transcript text representing the words that the user spoke.
433+
// Transcript text representing the words that the user spoke.
347434
string transcript = 1;
348435

349-
// Output only. The confidence estimate between 0.0 and 1.0. A higher number
436+
// The confidence estimate between 0.0 and 1.0. A higher number
350437
// indicates an estimated greater likelihood that the recognized words are
351438
// correct. This field is typically provided only for the top hypothesis, and
352439
// only for `is_final=true` results. Clients should not rely on the
353440
// `confidence` field as it is not guaranteed to be accurate or consistent.
354441
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
355442
float confidence = 2;
356443

357-
// Output only. A list of word-specific information for each recognized word.
444+
// A list of word-specific information for each recognized word.
358445
repeated WordInfo words = 3;
359446
}
360447

361448
// Word-specific information for recognized words. Word information is only
362449
// included in the response when certain request parameters are set, such
363450
// as `enable_word_time_offsets`.
364451
message WordInfo {
365-
// Output only. Time offset relative to the beginning of the audio, and
452+
// Time offset relative to the beginning of the audio, and
366453
// corresponding to the start of the spoken word. This field is only set if
367454
// `enable_word_time_offsets=true` and only in the top hypothesis. This is an
368455
// experimental feature and the accuracy of the time offset can vary.
369456
google.protobuf.Duration start_time = 1;
370457

371-
// Output only. Time offset relative to the beginning of the audio, and
458+
// Time offset relative to the beginning of the audio, and
372459
// corresponding to the end of the spoken word. This field is only set if
373460
// `enable_word_time_offsets=true` and only in the top hypothesis. This is an
374461
// experimental feature and the accuracy of the time offset can vary.
375462
google.protobuf.Duration end_time = 2;
376463

377-
// Output only. The word corresponding to this set of information.
464+
// The word corresponding to this set of information.
378465
string word = 3;
466+
467+
// Output only. The confidence estimate between 0.0 and 1.0. A higher number
468+
// indicates an estimated greater likelihood that the recognized words are
469+
// correct. This field is set only for the top alternative.
470+
// This field is not guaranteed to be accurate and users should not rely on it
471+
// to be always provided.
472+
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
473+
float confidence = 4;
474+
475+
// Output only. A distinct integer value is assigned for every speaker within
476+
// the audio. This field specifies which one of those speakers was detected to
477+
// have spoken this word. Value ranges from 1 up to diarization_speaker_count,
478+
// and is only set if speaker diarization is enabled.
479+
int32 speaker_tag = 5;
379480
}
380481

381482
// Video annotation feature.
@@ -392,6 +493,9 @@ enum Feature {
392493
// Explicit content detection.
393494
EXPLICIT_CONTENT_DETECTION = 3;
394495

496+
// Human face detection and tracking.
497+
FACE_DETECTION = 4;
498+
395499
// Speech transcription.
396500
SPEECH_TRANSCRIPTION = 6;
397501
}

0 commit comments

Comments
 (0)