Skip to content

Commit 88cdbae

Browse files
Speech v1p1beta1 (googleapis#4837)
This adds the new "1.1 beta 1" endpoint to the client library. This allows for the custom specification of which model is used behind the scenes for certain speech recognition tasks.
1 parent 424c201 commit 88cdbae

File tree

11 files changed: +2,335 additions, −0 deletions
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Copyright 2018 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import absolute_import
16+
17+
from google.cloud.speech_v1p1beta1 import types
18+
from google.cloud.speech_v1p1beta1.gapic import enums
19+
from google.cloud.speech_v1p1beta1.gapic import speech_client
20+
21+
from google.cloud.speech_v1.helpers import SpeechHelpers
22+
23+
24+
class SpeechClient(SpeechHelpers, speech_client.SpeechClient):
    # Blend the hand-written helper mixin with the generated GAPIC client,
    # surfacing the generated client's docstring unchanged.
    __doc__ = speech_client.SpeechClient.__doc__

    # Convenience handles so callers can reach the proto type and enum
    # modules directly from the client class (e.g. ``SpeechClient.types``).
    types = types
    enums = enums
28+
29+
30+
# Public surface of the ``google.cloud.speech_v1p1beta1`` package.
__all__ = (
    "enums",
    "types",
    "SpeechClient",
)

speech/google/cloud/speech_v1p1beta1/gapic/__init__.py

Whitespace-only changes.
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# Copyright 2018 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Wrappers for protocol buffer enum types."""
15+
16+
17+
class RecognitionConfig(object):
    class AudioEncoding(object):
        """
        The encoding of the audio data sent in the request.

        All encodings support only 1 channel (mono) audio.

        For ``FLAC`` or ``WAV`` input, any encoding given in
        ``AudioEncoding`` must agree with the encoding declared in the
        audio file header; a mismatch yields a
        ``google.rpc.Code.INVALID_ARGUMENT`` error. ``WAV`` files may
        contain either ``LINEAR16`` or ``MULAW`` encoded audio. For every
        other audio format the encoding must be specified explicitly in
        the ``RecognitionConfig``.

        Best recognition results come from a lossless capture and
        transport chain (``FLAC`` or ``LINEAR16``); the lossy codecs
        below can degrade accuracy, especially with background noise.

        Attributes:
            ENCODING_UNSPECIFIED (int): Not specified.
            LINEAR16 (int): Uncompressed 16-bit signed little-endian
                samples (Linear PCM).
            FLAC (int): `FLAC <https://xiph.org/flac/documentation.html>`_
                (Free Lossless Audio Codec) — recommended: lossless, so
                recognition is not compromised, at roughly half the
                bandwidth of ``LINEAR16``. Supports 16-bit and 24-bit
                samples; not all ``STREAMINFO`` fields are supported.
            MULAW (int): 8-bit samples companding 14-bit audio via
                G.711 PCMU/mu-law.
            AMR (int): Adaptive Multi-Rate Narrowband codec;
                ``sample_rate_hertz`` must be 8000.
            AMR_WB (int): Adaptive Multi-Rate Wideband codec;
                ``sample_rate_hertz`` must be 16000.
            OGG_OPUS (int): Opus frames in an Ogg container
                (`OggOpus <https://wiki.xiph.org/OggOpus>`_);
                ``sample_rate_hertz`` must be one of 8000, 12000, 16000,
                24000, or 48000.
            SPEEX_WITH_HEADER_BYTE (int): `Speex <https://speex.org/>`_
                with a leading length byte per block, as in MIME type
                ``audio/x-speex-with-header-byte`` — a variant of the RTP
                Speex encoding of
                `RFC 5574 <https://tools.ietf.org/html/rfc5574>`_ in
                which each RTP header is replaced by a single byte giving
                the block length; frames are padded to whole octets. Only
                Speex wideband is supported; ``sample_rate_hertz`` must
                be 16000. If a very low bitrate is required, prefer
                ``OGG_OPUS`` over Speex.
        """
        # Wire values mirror the AudioEncoding proto enum; do not renumber.
        ENCODING_UNSPECIFIED = 0
        LINEAR16 = 1
        FLAC = 2
        MULAW = 3
        AMR = 4
        AMR_WB = 5
        OGG_OPUS = 6
        SPEEX_WITH_HEADER_BYTE = 7
77+
78+
79+
class StreamingRecognizeResponse(object):
    class SpeechEventType(object):
        """
        Indicates the type of speech event.

        Attributes:
            SPEECH_EVENT_UNSPECIFIED (int): No speech event specified.
            END_OF_SINGLE_UTTERANCE (int): The server detected the end of
                the user's speech utterance and expects no further speech;
                it will not process additional audio (though it may still
                return additional results). The client should stop sending
                audio, half-close the gRPC connection, and wait for any
                remaining results until the server closes the connection.
                Sent only when ``single_utterance`` was set to ``true``;
                not used otherwise.
        """
        # Wire values mirror the SpeechEventType proto enum; do not renumber.
        SPEECH_EVENT_UNSPECIFIED = 0
        END_OF_SINGLE_UTTERANCE = 1

0 commit comments

Comments (0)