@@ -19,9 +19,7 @@ package google.cloud.speech.v1;
1919
2020import "google/api/annotations.proto" ;
2121import "google/longrunning/operations.proto" ;
22- import "google/protobuf/any.proto" ;
2322import "google/protobuf/duration.proto" ;
24- import "google/protobuf/empty.proto" ;
2523import "google/protobuf/timestamp.proto" ;
2624import "google/rpc/status.proto" ;
2725
@@ -278,6 +276,9 @@ message RecognitionConfig {
278276 // premium feature.
279277 bool enable_automatic_punctuation = 11 ;
280278
279+ // *Optional* Metadata regarding this request.
280+ RecognitionMetadata metadata = 9 ;
281+
281282 // *Optional* Which model to select for the given request. Select the model
282283 // best suited to your domain to get best results. If a model is not
283284 // explicitly specified, then we auto-select a model based on the parameters
@@ -330,6 +331,133 @@ message RecognitionConfig {
330331 bool use_enhanced = 14 ;
331332}
332333
// Description of audio data to be recognized.
//
// NOTE(review): field numbers 2 and 9 are skipped below — confirm they are
// reserved for removed/obsolete fields before reusing them.
message RecognitionMetadata {
  // Use case categories that the audio recognition request can be described
  // by.
  enum InteractionType {
    // Use case is either unknown or is something other than one of the other
    // values below.
    INTERACTION_TYPE_UNSPECIFIED = 0;

    // Multiple people in a conversation or discussion. For example in a
    // meeting with two or more people actively participating. Typically
    // all the primary people speaking would be in the same room (if not,
    // see PHONE_CALL)
    DISCUSSION = 1;

    // One or more persons lecturing or presenting to others, mostly
    // uninterrupted.
    PRESENTATION = 2;

    // A phone-call or video-conference in which two or more people, who are
    // not in the same room, are actively participating.
    PHONE_CALL = 3;

    // A recorded message intended for another person to listen to.
    VOICEMAIL = 4;

    // Professionally produced audio (e.g. TV show, podcast).
    PROFESSIONALLY_PRODUCED = 5;

    // Transcribe spoken questions and queries into text.
    VOICE_SEARCH = 6;

    // Transcribe voice commands, such as for controlling a device.
    VOICE_COMMAND = 7;

    // Transcribe speech to text to create a written document, such as a
    // text-message, email or report.
    DICTATION = 8;
  }

  // The use case most closely describing the audio content to be recognized.
  InteractionType interaction_type = 1;

  // The industry vertical to which this speech recognition request most
  // closely applies. This is most indicative of the topics contained
  // in the audio. Use the 6-digit NAICS code to identify the industry
  // vertical - see https://www.naics.com/search/.
  uint32 industry_naics_code_of_audio = 3;

  // Enumerates the types of capture settings describing an audio file.
  enum MicrophoneDistance {
    // Audio type is not known.
    MICROPHONE_DISTANCE_UNSPECIFIED = 0;

    // The audio was captured from a closely placed microphone, e.g. phone,
    // dictaphone, or handheld microphone. Generally used when the speaker
    // is within 1 meter of the microphone.
    NEARFIELD = 1;

    // The speaker is within 3 meters of the microphone.
    MIDFIELD = 2;

    // The speaker is more than 3 meters away from the microphone.
    FARFIELD = 3;
  }

  // The audio type that most closely describes the audio being recognized.
  MicrophoneDistance microphone_distance = 4;

  // The original media the speech was recorded on.
  enum OriginalMediaType {
    // Unknown original media type.
    ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;

    // The speech data is an audio recording.
    AUDIO = 1;

    // The speech data originally recorded on a video.
    VIDEO = 2;
  }

  // The original media the speech was recorded on.
  OriginalMediaType original_media_type = 5;

  // The type of device the speech was recorded with.
  enum RecordingDeviceType {
    // The recording device is unknown.
    RECORDING_DEVICE_TYPE_UNSPECIFIED = 0;

    // Speech was recorded on a smartphone.
    SMARTPHONE = 1;

    // Speech was recorded using a personal computer or tablet.
    PC = 2;

    // Speech was recorded over a phone line.
    PHONE_LINE = 3;

    // Speech was recorded in a vehicle.
    VEHICLE = 4;

    // Speech was recorded outdoors.
    OTHER_OUTDOOR_DEVICE = 5;

    // Speech was recorded indoors.
    OTHER_INDOOR_DEVICE = 6;
  }

  // The type of device the speech was recorded with.
  RecordingDeviceType recording_device_type = 6;

  // The device used to make the recording. Examples: 'Nexus 5X',
  // 'Polycom SoundStation IP 6000', 'POTS', 'VoIP', or
  // 'Cardioid Microphone'.
  string recording_device_name = 7;

  // Mime type of the original audio file. For example `audio/m4a`,
  // `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
  // A list of possible audio mime types is maintained at
  // http://www.iana.org/assignments/media-types/media-types.xhtml#audio
  string original_mime_type = 8;

  // Description of the content, e.g. "Recordings of federal supreme court
  // hearings from 2012".
  string audio_topic = 10;
}
460+
333461// Provides "hints" to the speech recognizer to favor specific words and phrases
334462// in the results.
335463message SpeechContext {
@@ -504,10 +632,20 @@ message StreamingRecognitionResult {
504632 // The default of 0.0 is a sentinel value indicating `stability` was not set.
505633 float stability = 3 ;
506634
635+ // Output only. Time offset of the end of this result relative to the
636+ // beginning of the audio.
637+ google.protobuf.Duration result_end_time = 4 ;
638+
507639 // For multi-channel audio, this is the channel number corresponding to the
508640 // recognized result for the audio from that channel.
509641 // For audio_channel_count = N, its output values can range from '1' to 'N'.
510642 int32 channel_tag = 5 ;
643+
644+ // Output only. The
645+ // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
646+ // language in this result. This language code was detected to have the most
647+ // likelihood of being spoken in the audio.
648+ string language_code = 6 ;
511649}
512650
513651// A speech recognition result corresponding to a portion of the audio.
0 commit comments