Skip to content

Commit 3213c2c

Browse files
lukesneeringer and chemelnucfin
authored and committed
Audio Logging and Recognition Metadata. (googleapis#5123)
1 parent b26923f commit 3213c2c

File tree

6 files changed

+713
-318
lines changed

6 files changed

+713
-318
lines changed
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
Speech Client API
2-
=================
1+
Client for Cloud Speech API
2+
===========================
33

44
.. automodule:: google.cloud.speech_v1p1beta1
5-
:members:
6-
:inherited-members:
5+
:members:
6+
:inherited-members:
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
Speech Client Types
2-
===================
1+
Types for Cloud Speech API Client
2+
=================================
33

44
.. automodule:: google.cloud.speech_v1p1beta1.types
5-
:members:
5+
:members:

speech/google/cloud/speech_v1p1beta1/gapic/enums.py

Lines changed: 116 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,26 +21,27 @@ class AudioEncoding(object):
2121
2222
All encodings support only 1 channel (mono) audio.
2323
24-
If you send a ``FLAC`` or ``WAV`` audio file format in the request,
25-
then if you specify an encoding in ``AudioEncoding``, it must match the
26-
encoding described in the audio header. If it does not match, then the
27-
request returns an
28-
``google.rpc.Code.INVALID_ARGUMENT`` error code. You can request
29-
recognition for ``WAV`` files that contain either ``LINEAR16`` or ``MULAW``
30-
encoded audio.
31-
For audio file formats other than ``FLAC`` or ``WAV``, you must
32-
specify the audio encoding in your ``RecognitionConfig``.
33-
3424
For best results, the audio source should be captured and transmitted using
3525
a lossless encoding (``FLAC`` or ``LINEAR16``). The accuracy of the speech
36-
recognition can be reduced if lossy codecs, which include the other codecs
37-
listed in this section, are used to capture or transmit the audio,
38-
particularly if background noise is present.
26+
recognition can be reduced if lossy codecs are used to capture or transmit
27+
audio, particularly if background noise is present. Lossy codecs include
28+
``MULAW``, ``AMR``, ``AMR_WB``, ``OGG_OPUS``, and ``SPEEX_WITH_HEADER_BYTE``.
29+
30+
The ``FLAC`` and ``WAV`` audio file formats include a header that describes the
31+
included audio content. You can request recognition for ``WAV`` files that
32+
contain either ``LINEAR16`` or ``MULAW`` encoded audio.
33+
If you send ``FLAC`` or ``WAV`` audio file format in
34+
your request, you do not need to specify an ``AudioEncoding``; the audio
35+
encoding format is determined from the file header. If you specify
36+
an ``AudioEncoding`` when you send ``FLAC`` or ``WAV`` audio, the
37+
encoding configuration must match the encoding described in the audio
38+
header; otherwise the request returns an
39+
``google.rpc.Code.INVALID_ARGUMENT`` error code.
3940
4041
Attributes:
4142
ENCODING_UNSPECIFIED (int): Not specified.
4243
LINEAR16 (int): Uncompressed 16-bit signed little-endian samples (Linear PCM).
43-
FLAC (int): ```FLAC`` <https://xiph.org/flac/documentation.html>`_ (Free Lossless Audio
44+
FLAC (int): ``FLAC`` (Free Lossless Audio
4445
Codec) is the recommended encoding because it is
4546
lossless--therefore recognition is not compromised--and
4647
requires only about half the bandwidth of ``LINEAR16``. ``FLAC`` stream
@@ -76,6 +77,107 @@ class AudioEncoding(object):
7677
SPEEX_WITH_HEADER_BYTE = 7
7778

7879

80+
class RecognitionMetadata(object):
81+
class InteractionType(object):
82+
"""
83+
Use case categories that the audio recognition request can be described
84+
by.
85+
86+
Attributes:
87+
INTERACTION_TYPE_UNSPECIFIED (int): Use case is either unknown or is something other than one of the other
88+
values below.
89+
DISCUSSION (int): Multiple people in a conversation or discussion. For example in a
90+
meeting with two or more people actively participating. Typically
91+
all the primary people speaking would be in the same room (if not,
92+
see PHONE_CALL)
93+
PRESENTATION (int): One or more persons lecturing or presenting to others, mostly
94+
uninterrupted.
95+
PHONE_CALL (int): A phone-call or video-conference in which two or more people, who are
96+
not in the same room, are actively participating.
97+
VOICEMAIL (int): A recorded message intended for another person to listen to.
98+
PROFESSIONALLY_PRODUCED (int): Professionally produced audio (e.g. TV show, podcast).
99+
VOICE_SEARCH (int): Transcribe spoken questions and queries into text.
100+
VOICE_COMMAND (int): Transcribe voice commands, such as for controlling a device.
101+
DICTATION (int): Transcribe speech to text to create a written document, such as a
102+
text-message, email or report.
103+
"""
104+
INTERACTION_TYPE_UNSPECIFIED = 0
105+
DISCUSSION = 1
106+
PRESENTATION = 2
107+
PHONE_CALL = 3
108+
VOICEMAIL = 4
109+
PROFESSIONALLY_PRODUCED = 5
110+
VOICE_SEARCH = 6
111+
VOICE_COMMAND = 7
112+
DICTATION = 8
113+
114+
class MicrophoneDistance(object):
115+
"""
116+
Enumerates the types of capture settings describing an audio file.
117+
118+
Attributes:
119+
MICROPHONE_DISTANCE_UNSPECIFIED (int): Audio type is not known.
120+
NEARFIELD (int): The audio was captured from a closely placed microphone. Eg. phone,
121+
dictaphone, or handheld microphone. Generally if the speaker is within
122+
1 meter of the microphone.
123+
MIDFIELD (int): The speaker is within 3 meters of the microphone.
124+
FARFIELD (int): The speaker is more than 3 meters away from the microphone.
125+
"""
126+
MICROPHONE_DISTANCE_UNSPECIFIED = 0
127+
NEARFIELD = 1
128+
MIDFIELD = 2
129+
FARFIELD = 3
130+
131+
class OriginalMediaType(object):
132+
"""
133+
The original media the speech was recorded on.
134+
135+
Attributes:
136+
ORIGINAL_MEDIA_TYPE_UNSPECIFIED (int): Unknown original media type.
137+
AUDIO (int): The speech data is an audio recording.
138+
VIDEO (int): The speech data originally recorded on a video.
139+
"""
140+
ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0
141+
AUDIO = 1
142+
VIDEO = 2
143+
144+
class RecordingDeviceType(object):
145+
"""
146+
The type of device the speech was recorded with.
147+
148+
Attributes:
149+
RECORDING_DEVICE_TYPE_UNSPECIFIED (int): The recording device is unknown.
150+
SMARTPHONE (int): Speech was recorded on a smartphone.
151+
PC (int): Speech was recorded using a personal computer or tablet.
152+
PHONE_LINE (int): Speech was recorded over a phone line.
153+
VEHICLE (int): Speech was recorded in a vehicle.
154+
OTHER_OUTDOOR_DEVICE (int): Speech was recorded outdoors.
155+
OTHER_INDOOR_DEVICE (int): Speech was recorded indoors.
156+
"""
157+
RECORDING_DEVICE_TYPE_UNSPECIFIED = 0
158+
SMARTPHONE = 1
159+
PC = 2
160+
PHONE_LINE = 3
161+
VEHICLE = 4
162+
OTHER_OUTDOOR_DEVICE = 5
163+
OTHER_INDOOR_DEVICE = 6
164+
165+
166+
class GoogleDataCollectionConfig(object):
167+
class LoggingConsentState(object):
168+
"""
169+
Speech content will not be logged until authorized consent is opted in.
170+
Once it is opted in, this flag enables/disables logging to override that
171+
consent. default = ENABLED (logging due to consent).
172+
173+
Attributes:
174+
ENABLED (int)
175+
DISABLED (int)
176+
"""
177+
ENABLED = 0
178+
DISABLED = 1
179+
180+
79181
class StreamingRecognizeResponse(object):
80182
class SpeechEventType(object):
81183
"""

speech/google/cloud/speech_v1p1beta1/gapic/speech_client.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
from google.cloud.speech_v1p1beta1.gapic import enums
2727
from google.cloud.speech_v1p1beta1.gapic import speech_client_config
2828
from google.cloud.speech_v1p1beta1.proto import cloud_speech_pb2
29+
from google.cloud.speech_v1p1beta1.proto import cloud_speech_pb2_grpc
30+
from google.longrunning import operations_pb2
2931

3032
_GAPIC_LIBRARY_VERSION = pkg_resources.get_distribution(
3133
'google-cloud-speech', ).version
@@ -85,7 +87,7 @@ def __init__(self,
8587
)
8688

8789
# Create the gRPC stubs.
88-
self.speech_stub = (cloud_speech_pb2.SpeechStub(channel))
90+
self.speech_stub = (cloud_speech_pb2_grpc.SpeechStub(channel))
8991

9092
# Operations client for methods that return long-running operations
9193
# futures.

0 commit comments

Comments
 (0)