@@ -21,26 +21,27 @@ class AudioEncoding(object):
2121
2222 All encodings support only 1 channel (mono) audio.
2323
24- If you send a ``FLAC`` or ``WAV`` audio file format in the request,
25- then if you specify an encoding in ``AudioEncoding``, it must match the
26- encoding described in the audio header. If it does not match, then the
27- request returns an
28- ``google.rpc.Code.INVALID_ARGUMENT`` error code. You can request
29- recognition for ``WAV`` files that contain either ``LINEAR16`` or ``MULAW``
30- encoded audio.
31- For audio file formats other than ``FLAC`` or ``WAV``, you must
32- specify the audio encoding in your ``RecognitionConfig``.
33-
3424 For best results, the audio source should be captured and transmitted using
3525 a lossless encoding (``FLAC`` or ``LINEAR16``). The accuracy of the speech
36- recognition can be reduced if lossy codecs, which include the other codecs
37- listed in this section, are used to capture or transmit the audio,
38- particularly if background noise is present.
26+ recognition can be reduced if lossy codecs are used to capture or transmit
27+ audio, particularly if background noise is present. Lossy codecs include
28+ ``MULAW``, ``AMR``, ``AMR_WB``, ``OGG_OPUS``, and ``SPEEX_WITH_HEADER_BYTE``.
29+
30+ The ``FLAC`` and ``WAV`` audio file formats include a header that describes the
31+ included audio content. You can request recognition for ``WAV`` files that
32+ contain either ``LINEAR16`` or ``MULAW`` encoded audio.
33+ If you send ``FLAC`` or ``WAV`` audio file format in
34+ your request, you do not need to specify an ``AudioEncoding``; the audio
35+ encoding format is determined from the file header. If you specify
36+ an ``AudioEncoding`` when you send ``FLAC`` or ``WAV`` audio, the
37+ encoding configuration must match the encoding described in the audio
38+ header; otherwise the request returns an
39+ ``google.rpc.Code.INVALID_ARGUMENT`` error code.
3940
4041 Attributes:
4142 ENCODING_UNSPECIFIED (int): Not specified.
4243 LINEAR16 (int): Uncompressed 16-bit signed little-endian samples (Linear PCM).
43- FLAC (int): ``` FLAC`` <https://xiph.org/flac/documentation.html>`_ (Free Lossless Audio
44+ FLAC (int): ``FLAC`` (Free Lossless Audio
4445 Codec) is the recommended encoding because it is
4546 lossless--therefore recognition is not compromised--and
4647 requires only about half the bandwidth of ``LINEAR16``. ``FLAC`` stream
@@ -76,6 +77,107 @@ class AudioEncoding(object):
7677 SPEEX_WITH_HEADER_BYTE = 7
7778
7879
class RecognitionMetadata(object):
    """Container for the enum-like constant classes that describe the audio
    of a recognition request."""

    class InteractionType(object):
        """
        Use case categories that the audio recognition request can be described
        by.

        Attributes:
          INTERACTION_TYPE_UNSPECIFIED (int): Use case is either unknown or is something other than one of the other
          values below.
          DISCUSSION (int): Multiple people in a conversation or discussion. For example, in a
          meeting with two or more people actively participating. Typically
          all the primary people speaking would be in the same room (if not,
          see ``PHONE_CALL``).
          PRESENTATION (int): One or more persons lecturing or presenting to others, mostly
          uninterrupted.
          PHONE_CALL (int): A phone call or video conference in which two or more people, who are
          not in the same room, are actively participating.
          VOICEMAIL (int): A recorded message intended for another person to listen to.
          PROFESSIONALLY_PRODUCED (int): Professionally produced audio (e.g. TV show, podcast).
          VOICE_SEARCH (int): Transcribe spoken questions and queries into text.
          VOICE_COMMAND (int): Transcribe voice commands, such as for controlling a device.
          DICTATION (int): Transcribe speech to text to create a written document, such as a
          text message, email or report.
        """
        INTERACTION_TYPE_UNSPECIFIED = 0
        DISCUSSION = 1
        PRESENTATION = 2
        PHONE_CALL = 3
        VOICEMAIL = 4
        PROFESSIONALLY_PRODUCED = 5
        VOICE_SEARCH = 6
        VOICE_COMMAND = 7
        DICTATION = 8

    class MicrophoneDistance(object):
        """
        Enumerates the types of capture settings describing an audio file.

        Attributes:
          MICROPHONE_DISTANCE_UNSPECIFIED (int): Audio type is not known.
          NEARFIELD (int): The audio was captured from a closely placed microphone, e.g. a phone,
          dictaphone, or handheld microphone. Generally, the speaker is within
          1 meter of the microphone.
          MIDFIELD (int): The speaker is within 3 meters of the microphone.
          FARFIELD (int): The speaker is more than 3 meters away from the microphone.
        """
        MICROPHONE_DISTANCE_UNSPECIFIED = 0
        NEARFIELD = 1
        MIDFIELD = 2
        FARFIELD = 3

    class OriginalMediaType(object):
        """
        The original media the speech was recorded on.

        Attributes:
          ORIGINAL_MEDIA_TYPE_UNSPECIFIED (int): Unknown original media type.
          AUDIO (int): The speech data is an audio recording.
          VIDEO (int): The speech data was originally recorded on a video.
        """
        ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0
        AUDIO = 1
        VIDEO = 2

    class RecordingDeviceType(object):
        """
        The type of device the speech was recorded with.

        Attributes:
          RECORDING_DEVICE_TYPE_UNSPECIFIED (int): The recording device is unknown.
          SMARTPHONE (int): Speech was recorded on a smartphone.
          PC (int): Speech was recorded using a personal computer or tablet.
          PHONE_LINE (int): Speech was recorded over a phone line.
          VEHICLE (int): Speech was recorded in a vehicle.
          OTHER_OUTDOOR_DEVICE (int): Speech was recorded outdoors.
          OTHER_INDOOR_DEVICE (int): Speech was recorded indoors.
        """
        RECORDING_DEVICE_TYPE_UNSPECIFIED = 0
        SMARTPHONE = 1
        PC = 2
        PHONE_LINE = 3
        VEHICLE = 4
        OTHER_OUTDOOR_DEVICE = 5
        OTHER_INDOOR_DEVICE = 6
164+
165+
class GoogleDataCollectionConfig(object):
    """Container for the enum-like constant classes used by the data
    collection configuration."""

    class LoggingConsentState(object):
        """
        Speech content will not be logged until authorized consent is opted in.
        Once it is opted in, this flag enables/disables logging to override that
        consent. The default is ``ENABLED`` (logging due to consent).

        Attributes:
          ENABLED (int): Logging is enabled; this is the default once consent is given.
          DISABLED (int): Logging is disabled, overriding the consent.
        """
        ENABLED = 0
        DISABLED = 1
179+
180+
79181class StreamingRecognizeResponse (object ):
80182 class SpeechEventType (object ):
81183 """
0 commit comments