Skip to content

Commit 6b8eb5f

Browse files
yoshi-automation and busunkim96
authored and committed
Add Recognition Metadata (via synth). (googleapis#7961)
1 parent d149f80 commit 6b8eb5f

File tree

4 files changed

+725
-49
lines changed

4 files changed

+725
-49
lines changed

speech/google/cloud/speech_v1/gapic/enums.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,96 @@ class AudioEncoding(enum.IntEnum):
8282
SPEEX_WITH_HEADER_BYTE = 7
8383

8484

class RecognitionMetadata(object):
    """Container for the enums describing audio data to be recognized."""

    class InteractionType(enum.IntEnum):
        """
        Use case categories that the audio recognition request can be described
        by.

        Attributes:
          INTERACTION_TYPE_UNSPECIFIED (int): Use case is either unknown or is
          something other than one of the other values below.
          DISCUSSION (int): Multiple people in a conversation or discussion. For
          example in a meeting with two or more people actively participating.
          Typically all the primary people speaking would be in the same room
          (if not, see PHONE\_CALL)
          PRESENTATION (int): One or more persons lecturing or presenting to
          others, mostly uninterrupted.
          PHONE_CALL (int): A phone-call or video-conference in which two or
          more people, who are not in the same room, are actively
          participating.
          VOICEMAIL (int): A recorded message intended for another person to
          listen to.
          PROFESSIONALLY_PRODUCED (int): Professionally produced audio (eg. TV
          Show, Podcast).
          VOICE_SEARCH (int): Transcribe spoken questions and queries into text.
          VOICE_COMMAND (int): Transcribe voice commands, such as for
          controlling a device.
          DICTATION (int): Transcribe speech to text to create a written
          document, such as a text-message, email or report.
        """

        INTERACTION_TYPE_UNSPECIFIED = 0
        DISCUSSION = 1
        PRESENTATION = 2
        PHONE_CALL = 3
        VOICEMAIL = 4
        PROFESSIONALLY_PRODUCED = 5
        VOICE_SEARCH = 6
        VOICE_COMMAND = 7
        DICTATION = 8

    class MicrophoneDistance(enum.IntEnum):
        """
        Enumerates the types of capture settings describing an audio file.

        Attributes:
          MICROPHONE_DISTANCE_UNSPECIFIED (int): Audio type is not known.
          NEARFIELD (int): The audio was captured from a closely placed
          microphone. Eg. phone, dictaphone, or handheld microphone. Generally
          if the speaker is within 1 meter of the microphone.
          MIDFIELD (int): The speaker is within 3 meters of the microphone.
          FARFIELD (int): The speaker is more than 3 meters away from the
          microphone.
        """

        MICROPHONE_DISTANCE_UNSPECIFIED = 0
        NEARFIELD = 1
        MIDFIELD = 2
        FARFIELD = 3

    class OriginalMediaType(enum.IntEnum):
        """
        The original media the speech was recorded on.

        Attributes:
          ORIGINAL_MEDIA_TYPE_UNSPECIFIED (int): Unknown original media type.
          AUDIO (int): The speech data is an audio recording.
          VIDEO (int): The speech data originally recorded on a video.
        """

        ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0
        AUDIO = 1
        VIDEO = 2

    class RecordingDeviceType(enum.IntEnum):
        """
        The type of device the speech was recorded with.

        Attributes:
          RECORDING_DEVICE_TYPE_UNSPECIFIED (int): The recording device is
          unknown.
          SMARTPHONE (int): Speech was recorded on a smartphone.
          PC (int): Speech was recorded using a personal computer or tablet.
          PHONE_LINE (int): Speech was recorded over a phone line.
          VEHICLE (int): Speech was recorded in a vehicle.
          OTHER_OUTDOOR_DEVICE (int): Speech was recorded outdoors.
          OTHER_INDOOR_DEVICE (int): Speech was recorded indoors.
        """

        RECORDING_DEVICE_TYPE_UNSPECIFIED = 0
        SMARTPHONE = 1
        PC = 2
        PHONE_LINE = 3
        VEHICLE = 4
        OTHER_OUTDOOR_DEVICE = 5
        OTHER_INDOOR_DEVICE = 6
85175
class StreamingRecognizeResponse(object):
86176
class SpeechEventType(enum.IntEnum):
87177
"""

speech/google/cloud/speech_v1/proto/cloud_speech.proto

Lines changed: 140 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,7 @@ package google.cloud.speech.v1;
1919

2020
import "google/api/annotations.proto";
2121
import "google/longrunning/operations.proto";
22-
import "google/protobuf/any.proto";
2322
import "google/protobuf/duration.proto";
24-
import "google/protobuf/empty.proto";
2523
import "google/protobuf/timestamp.proto";
2624
import "google/rpc/status.proto";
2725

@@ -278,6 +276,9 @@ message RecognitionConfig {
278276
// premium feature.
279277
bool enable_automatic_punctuation = 11;
280278

279+
// *Optional* Metadata regarding this request.
280+
RecognitionMetadata metadata = 9;
281+
281282
// *Optional* Which model to select for the given request. Select the model
282283
// best suited to your domain to get best results. If a model is not
283284
// explicitly specified, then we auto-select a model based on the parameters
@@ -330,6 +331,133 @@ message RecognitionConfig {
330331
bool use_enhanced = 14;
331332
}
332333

// Description of audio data to be recognized.
message RecognitionMetadata {
  // Use case categories that the audio recognition request can be described
  // by.
  enum InteractionType {
    // Use case is either unknown or is something other than one of the other
    // values below.
    INTERACTION_TYPE_UNSPECIFIED = 0;

    // Multiple people in a conversation or discussion. For example in a
    // meeting with two or more people actively participating. Typically
    // all the primary people speaking would be in the same room (if not,
    // see PHONE_CALL)
    DISCUSSION = 1;

    // One or more persons lecturing or presenting to others, mostly
    // uninterrupted.
    PRESENTATION = 2;

    // A phone-call or video-conference in which two or more people, who are
    // not in the same room, are actively participating.
    PHONE_CALL = 3;

    // A recorded message intended for another person to listen to.
    VOICEMAIL = 4;

    // Professionally produced audio (eg. TV Show, Podcast).
    PROFESSIONALLY_PRODUCED = 5;

    // Transcribe spoken questions and queries into text.
    VOICE_SEARCH = 6;

    // Transcribe voice commands, such as for controlling a device.
    VOICE_COMMAND = 7;

    // Transcribe speech to text to create a written document, such as a
    // text-message, email or report.
    DICTATION = 8;
  }

  // The use case most closely describing the audio content to be recognized.
  InteractionType interaction_type = 1;

  // The industry vertical to which this speech recognition request most
  // closely applies. This is most indicative of the topics contained
  // in the audio. Use the 6-digit NAICS code to identify the industry
  // vertical - see https://www.naics.com/search/.
  uint32 industry_naics_code_of_audio = 3;

  // Enumerates the types of capture settings describing an audio file.
  enum MicrophoneDistance {
    // Audio type is not known.
    MICROPHONE_DISTANCE_UNSPECIFIED = 0;

    // The audio was captured from a closely placed microphone. Eg. phone,
    // dictaphone, or handheld microphone. Generally if the speaker is within
    // 1 meter of the microphone.
    NEARFIELD = 1;

    // The speaker is within 3 meters of the microphone.
    MIDFIELD = 2;

    // The speaker is more than 3 meters away from the microphone.
    FARFIELD = 3;
  }

  // The audio type that most closely describes the audio being recognized.
  MicrophoneDistance microphone_distance = 4;

  // The original media the speech was recorded on.
  enum OriginalMediaType {
    // Unknown original media type.
    ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;

    // The speech data is an audio recording.
    AUDIO = 1;

    // The speech data originally recorded on a video.
    VIDEO = 2;
  }

  // The original media the speech was recorded on.
  OriginalMediaType original_media_type = 5;

  // The type of device the speech was recorded with.
  enum RecordingDeviceType {
    // The recording device is unknown.
    RECORDING_DEVICE_TYPE_UNSPECIFIED = 0;

    // Speech was recorded on a smartphone.
    SMARTPHONE = 1;

    // Speech was recorded using a personal computer or tablet.
    PC = 2;

    // Speech was recorded over a phone line.
    PHONE_LINE = 3;

    // Speech was recorded in a vehicle.
    VEHICLE = 4;

    // Speech was recorded outdoors.
    OTHER_OUTDOOR_DEVICE = 5;

    // Speech was recorded indoors.
    OTHER_INDOOR_DEVICE = 6;
  }

  // The type of device the speech was recorded with.
  RecordingDeviceType recording_device_type = 6;

  // The device used to make the recording. Examples 'Nexus 5X' or
  // 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
  // 'Cardioid Microphone'.
  string recording_device_name = 7;

  // Mime type of the original audio file. For example `audio/m4a`,
  // `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
  // A list of possible audio mime types is maintained at
  // http://www.iana.org/assignments/media-types/media-types.xhtml#audio
  string original_mime_type = 8;

  // Description of the content. Eg. "Recordings of federal supreme court
  // hearings from 2012".
  string audio_topic = 10;
}
333461
// Provides "hints" to the speech recognizer to favor specific words and phrases
334462
// in the results.
335463
message SpeechContext {
@@ -504,10 +632,20 @@ message StreamingRecognitionResult {
504632
// The default of 0.0 is a sentinel value indicating `stability` was not set.
505633
float stability = 3;
506634

635+
// Output only. Time offset of the end of this result relative to the
636+
// beginning of the audio.
637+
google.protobuf.Duration result_end_time = 4;
638+
507639
// For multi-channel audio, this is the channel number corresponding to the
508640
// recognized result for the audio from that channel.
509641
// For audio_channel_count = N, its output values can range from '1' to 'N'.
510642
int32 channel_tag = 5;
643+
644+
// Output only. The
645+
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
646+
// language in this result. This language code was detected to have the most
647+
// likelihood of being spoken in the audio.
648+
string language_code = 6;
511649
}
512650

513651
// A speech recognition result corresponding to a portion of the audio.

0 commit comments

Comments
 (0)