1515
1616syntax = "proto3" ;
1717
18- package google.cloud.speech.v1p1beta1 ;
18+ package google.cloud.speech.v1 ;
1919
2020import "google/api/annotations.proto" ;
2121import "google/longrunning/operations.proto" ;
@@ -26,10 +26,10 @@ import "google/protobuf/timestamp.proto";
2626import "google/rpc/status.proto" ;
2727
2828option cc_enable_arenas = true ;
29- option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1p1beta1 ;speech" ;
29+ option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1 ;speech" ;
3030option java_multiple_files = true ;
3131option java_outer_classname = "SpeechProto" ;
32- option java_package = "com.google.cloud.speech.v1p1beta1 " ;
32+ option java_package = "com.google.cloud.speech.v1 " ;
3333
3434
3535// Service that implements Google Cloud Speech API.
@@ -38,7 +38,7 @@ service Speech {
3838 // has been sent and processed.
3939 rpc Recognize (RecognizeRequest ) returns (RecognizeResponse ) {
4040 option (google.api.http ) = {
41- post : "/v1p1beta1 /speech:recognize"
41+ post : "/v1 /speech:recognize"
4242 body : "*"
4343 };
4444 }
@@ -49,7 +49,7 @@ service Speech {
4949 // a `LongRunningRecognizeResponse` message.
5050 rpc LongRunningRecognize (LongRunningRecognizeRequest ) returns (google.longrunning.Operation ) {
5151 option (google.api.http ) = {
52- post : "/v1p1beta1 /speech:longrunningrecognize"
52+ post : "/v1 /speech:longrunningrecognize"
5353 body : "*"
5454 };
5555 }
@@ -203,7 +203,7 @@ message RecognitionConfig {
203203
204204 // Encoding of audio data sent in all `RecognitionAudio` messages.
205205 // This field is optional for `FLAC` and `WAV` audio files and required
206- // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1 .RecognitionConfig.AudioEncoding].
206+ // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1 .RecognitionConfig.AudioEncoding].
207207 AudioEncoding encoding = 1 ;
208208
209209 // Sample rate in Hertz of the audio data sent in all
@@ -212,7 +212,7 @@ message RecognitionConfig {
212212 // source to 16000 Hz. If that's not possible, use the native sample rate of
213213 // the audio source (instead of re-sampling).
214214 // This field is optional for `FLAC` and `WAV` audio files and required
215- // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1 .RecognitionConfig.AudioEncoding].
215+ // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1 .RecognitionConfig.AudioEncoding].
216216 int32 sample_rate_hertz = 2 ;
217217
218218 // *Optional* The number of channels in the input audio data.
@@ -226,7 +226,7 @@ message RecognitionConfig {
226226 // `enable_separate_recognition_per_channel` to 'true'.
227227 int32 audio_channel_count = 7 ;
228228
229- // This needs to be set to ‘ true’ explicitly and `audio_channel_count` > 1
229+ // This needs to be set to ` true` explicitly and `audio_channel_count` > 1
230230 // to get each channel recognized separately. The recognition result will
231231 // contain a `channel_tag` field to state which channel that result belongs
232232 // to. If this is not true, we will only recognize the first channel. The
@@ -241,20 +241,6 @@ message RecognitionConfig {
241241 // for a list of the currently supported language codes.
242242 string language_code = 3 ;
243243
244- // *Optional* A list of up to 3 additional
245- // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
246- // listing possible alternative languages of the supplied audio.
247- // See [Language Support](/speech-to-text/docs/languages)
248- // for a list of the currently supported language codes.
249- // If alternative languages are listed, recognition result will contain
250- // recognition in the most likely language detected including the main
251- // language_code. The recognition result will include the language tag
252- // of the language detected in the audio.
253- // Note: This feature is only supported for Voice Command and Voice Search
254- // use cases and performance may vary for other use cases (e.g., phone call
255- // transcription).
256- repeated string alternative_language_codes = 18 ;
257-
258244 // *Optional* Maximum number of recognition hypotheses to be returned.
259245 // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
260246 // within each `SpeechRecognitionResult`.
@@ -269,7 +255,7 @@ message RecognitionConfig {
269255 // won't be filtered out.
270256 bool profanity_filter = 5 ;
271257
272- // *Optional* array of [SpeechContext][google.cloud.speech.v1p1beta1 .SpeechContext].
258+ // *Optional* array of [SpeechContext][google.cloud.speech.v1 .SpeechContext].
273259 // A means to provide context to assist the speech recognition. For more
274260 // information, see [Phrase Hints](/speech-to-text/docs/basics#phrase-hints).
275261 repeated SpeechContext speech_contexts = 6 ;
@@ -280,11 +266,6 @@ message RecognitionConfig {
280266 // `false`.
281267 bool enable_word_time_offsets = 8 ;
282268
283- // *Optional* If `true`, the top result includes a list of words and the
284- // confidence for those words. If `false`, no word-level confidence
285- // information is returned. The default is `false`.
286- bool enable_word_confidence = 15 ;
287-
288269 // *Optional* If 'true', adds punctuation to recognition result hypotheses.
289270 // This feature is only available in select languages. Setting this for
290271 // requests in other languages has no effect at all.
@@ -294,26 +275,6 @@ message RecognitionConfig {
294275 // premium feature.
295276 bool enable_automatic_punctuation = 11 ;
296277
297- // *Optional* If 'true', enables speaker detection for each recognized word in
298- // the top alternative of the recognition result using a speaker_tag provided
299- // in the WordInfo.
300- // Note: When this is true, we send all the words from the beginning of the
301- // audio for the top alternative in every consecutive STREAMING responses.
302- // This is done in order to improve our speaker tags as our models learn to
303- // identify the speakers in the conversation over time.
304- // For non-streaming requests, the diarization results will be provided only
305- // in the top alternative of the FINAL SpeechRecognitionResult.
306- bool enable_speaker_diarization = 16 ;
307-
308- // *Optional*
309- // If set, specifies the estimated number of speakers in the conversation.
310- // If not set, defaults to '2'.
311- // Ignored unless enable_speaker_diarization is set to true.
312- int32 diarization_speaker_count = 17 ;
313-
314- // *Optional* Metadata regarding this request.
315- RecognitionMetadata metadata = 9 ;
316-
317278 // *Optional* Which model to select for the given request. Select the model
318279 // best suited to your domain to get best results. If a model is not
319280 // explicitly specified, then we auto-select a model based on the parameters
@@ -366,137 +327,6 @@ message RecognitionConfig {
366327 bool use_enhanced = 14 ;
367328}
368329
// Description of audio data to be recognized.
// NOTE(review): enum values below are unprefixed (e.g. `DISCUSSION` instead of
// `INTERACTION_TYPE_DISCUSSION`), contrary to current style guidance; renaming
// them would break generated code in a published API, so they are kept as-is.
message RecognitionMetadata {
  // Use case categories that the audio recognition request can be described
  // by.
  enum InteractionType {
    // Use case is either unknown or is something other than one of the other
    // values below.
    INTERACTION_TYPE_UNSPECIFIED = 0;

    // Multiple people in a conversation or discussion. For example in a
    // meeting with two or more people actively participating. Typically
    // all the primary people speaking would be in the same room (if not,
    // see PHONE_CALL)
    DISCUSSION = 1;

    // One or more persons lecturing or presenting to others, mostly
    // uninterrupted.
    PRESENTATION = 2;

    // A phone-call or video-conference in which two or more people, who are
    // not in the same room, are actively participating.
    PHONE_CALL = 3;

    // A recorded message intended for another person to listen to.
    VOICEMAIL = 4;

    // Professionally produced audio (eg. TV Show, Podcast).
    PROFESSIONALLY_PRODUCED = 5;

    // Transcribe spoken questions and queries into text.
    VOICE_SEARCH = 6;

    // Transcribe voice commands, such as for controlling a device.
    VOICE_COMMAND = 7;

    // Transcribe speech to text to create a written document, such as a
    // text-message, email or report.
    DICTATION = 8;
  }

  // Enumerates the types of capture settings describing an audio file.
  enum MicrophoneDistance {
    // Audio type is not known.
    MICROPHONE_DISTANCE_UNSPECIFIED = 0;

    // The audio was captured from a closely placed microphone. Eg. phone,
    // dictaphone, or handheld microphone. Generally, if the speaker is within
    // 1 meter of the microphone.
    NEARFIELD = 1;

    // The speaker is within 3 meters of the microphone.
    MIDFIELD = 2;

    // The speaker is more than 3 meters away from the microphone.
    FARFIELD = 3;
  }

  // The original media the speech was recorded on.
  enum OriginalMediaType {
    // Unknown original media type.
    ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;

    // The speech data is an audio recording.
    AUDIO = 1;

    // The speech data was originally recorded on a video.
    VIDEO = 2;
  }

  // The type of device the speech was recorded with.
  enum RecordingDeviceType {
    // The recording device is unknown.
    RECORDING_DEVICE_TYPE_UNSPECIFIED = 0;

    // Speech was recorded on a smartphone.
    SMARTPHONE = 1;

    // Speech was recorded using a personal computer or tablet.
    PC = 2;

    // Speech was recorded over a phone line.
    PHONE_LINE = 3;

    // Speech was recorded in a vehicle.
    VEHICLE = 4;

    // Speech was recorded outdoors.
    OTHER_OUTDOOR_DEVICE = 5;

    // Speech was recorded indoors.
    OTHER_INDOOR_DEVICE = 6;
  }

  // The use case most closely describing the audio content to be recognized.
  InteractionType interaction_type = 1;

  // NOTE(review): field number 2 is unused in this message. If a field was
  // removed, it should appear in a `reserved 2;` statement to prevent
  // accidental reuse — confirm against the schema's history.

  // The industry vertical to which this speech recognition request most
  // closely applies. This is most indicative of the topics contained
  // in the audio. Use the 6-digit NAICS code to identify the industry
  // vertical - see https://www.naics.com/search/.
  uint32 industry_naics_code_of_audio = 3;

  // The audio type that most closely describes the audio being recognized.
  MicrophoneDistance microphone_distance = 4;

  // The original media the speech was recorded on.
  OriginalMediaType original_media_type = 5;

  // The type of device the speech was recorded with.
  RecordingDeviceType recording_device_type = 6;

  // The device used to make the recording. Examples 'Nexus 5X' or
  // 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
  // 'Cardioid Microphone'.
  string recording_device_name = 7;

  // Mime type of the original audio file. For example `audio/m4a`,
  // `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
  // A list of possible audio mime types is maintained at
  // http://www.iana.org/assignments/media-types/media-types.xhtml#audio
  string original_mime_type = 8;

  // Obfuscated (privacy-protected) ID of the user, to identify number of
  // unique users using the service.
  int64 obfuscated_id = 9;

  // Description of the content. Eg. "Recordings of federal supreme court
  // hearings from 2012".
  string audio_topic = 10;
}
499-
500330// Provides "hints" to the speech recognizer to favor specific words and phrases
501331// in the results.
502332message SpeechContext {
@@ -670,20 +500,10 @@ message StreamingRecognitionResult {
670500 // The default of 0.0 is a sentinel value indicating `stability` was not set.
671501 float stability = 3 ;
672502
673- // Output only. Time offset of the end of this result relative to the
674- // beginning of the audio.
675- google.protobuf.Duration result_end_time = 4 ;
676-
677503 // For multi-channel audio, this is the channel number corresponding to the
678504 // recognized result for the audio from that channel.
679505 // For audio_channel_count = N, its output values can range from '1' to 'N'.
680506 int32 channel_tag = 5 ;
681-
682- // Output only. The
683- // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
684- // language in this result. This language code was detected to have the most
685- // likelihood of being spoken in the audio.
686- string language_code = 6 ;
687507}
688508
689509// A speech recognition result corresponding to a portion of the audio.
@@ -698,12 +518,6 @@ message SpeechRecognitionResult {
698518 // recognized result for the audio from that channel.
699519 // For audio_channel_count = N, its output values can range from '1' to 'N'.
700520 int32 channel_tag = 2 ;
701-
702- // Output only. The
703- // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
704- // language in this result. This language code was detected to have the most
705- // likelihood of being spoken in the audio.
706- string language_code = 5 ;
707521}
708522
709523// Alternative hypotheses (a.k.a. n-best list).
@@ -746,20 +560,4 @@ message WordInfo {
746560
747561 // Output only. The word corresponding to this set of information.
748562 string word = 3 ;
749-
750- // Output only. The confidence estimate between 0.0 and 1.0. A higher number
751- // indicates an estimated greater likelihood that the recognized words are
752- // correct. This field is set only for the top alternative of a non-streaming
753- // result or, of a streaming result where `is_final=true`.
754- // This field is not guaranteed to be accurate and users should not rely on it
755- // to be always provided.
756- // The default of 0.0 is a sentinel value indicating `confidence` was not set.
757- float confidence = 4 ;
758-
759- // Output only. A distinct integer value is assigned for every speaker within
760- // the audio. This field specifies which one of those speakers was detected to
761- // have spoken this word. Value ranges from '1' to diarization_speaker_count.
762- // speaker_tag is set if enable_speaker_diarization = 'true' and only in the
763- // top alternative.
764- int32 speaker_tag = 5 ;
765563}
0 commit comments