Skip to content

Commit 5a0e492

Browse files
authored
Merge pull request googleapis#2426 from daspecster/add-speech-async
Add speech asynchronous recognize support.
2 parents a7cb215 + 08e9e03 commit 5a0e492

19 files changed

+815
-71
lines changed

docs/index.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,10 @@
173173

174174
speech-usage
175175
Client <speech-client>
176+
speech-encoding
177+
speech-metadata
178+
speech-operation
179+
speech-transcript
176180

177181
.. toctree::
178182
:maxdepth: 0

docs/speech-client.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Speech Client
2-
================
2+
=============
33

44
.. automodule:: google.cloud.speech.client
55
:members:

docs/speech-encoding.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Speech Encoding
2+
===============
3+
4+
.. automodule:: google.cloud.speech.encoding
5+
:members:
6+
:undoc-members:
7+
:show-inheritance:

docs/speech-metadata.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Speech Metadata
2+
===============
3+
4+
.. automodule:: google.cloud.speech.metadata
5+
:members:
6+
:undoc-members:
7+
:show-inheritance:

docs/speech-operation.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Speech Operation
2+
================
3+
4+
.. automodule:: google.cloud.speech.operation
5+
:members:
6+
:undoc-members:
7+
:show-inheritance:

docs/speech-transcript.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Speech Transcript
2+
=================
3+
4+
.. automodule:: google.cloud.speech.transcript
5+
:members:
6+
:undoc-members:
7+
:show-inheritance:

docs/speech-usage.rst

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ Using the API
22
=============
33

44
The `Google Speech`_ API enables developers to convert audio to text.
5-
The API recognizes over 80 languages and variants, to support your global user base.
5+
The API recognizes over 80 languages and variants, to support your global user
6+
base.
67

78
.. warning::
89

@@ -30,11 +31,41 @@ create an instance of :class:`~google.cloud.speech.client.Client`.
3031
>>> client = speech.Client()
3132
3233
34+
Asynchronous Recognition
35+
------------------------
36+
37+
The :meth:`~google.cloud.speech.Client.async_recognize` method sends audio
38+
data to the Speech API and initiates a Long Running Operation. Using this
39+
operation, you can periodically poll for recognition results. Use asynchronous
40+
requests for audio data of any duration up to 80 minutes.
41+
42+
See: `Speech Asynchronous Recognize`_
43+
44+
45+
.. code-block:: python
46+
47+
>>> import time
48+
>>> operation = client.async_recognize(
49+
... None, 'gs://my-bucket/recording.flac',
50+
... 'FLAC', 16000, max_alternatives=2)
51+
>>> retry_count = 100
52+
>>> while retry_count > 0 and not operation.complete:
53+
... retry_count -= 1
54+
... time.sleep(10)
55+
... operation.poll() # API call
56+
>>> operation.complete
57+
True
58+
>>> operation.results[0].transcript
59+
'how old is the Brooklyn Bridge'
60+
>>> operation.results[0].confidence
61+
0.98267895
62+
63+
3364
Synchronous Recognition
3465
-----------------------
3566

36-
The :meth:`~google.cloud.speech.Client.sync_recognize` method converts speech data to text
37-
and returns alternative text transcriptons.
67+
The :meth:`~google.cloud.speech.Client.sync_recognize` method converts speech
68+
data to text and returns alternative text transcriptions.
3869

3970
.. code-block:: python
4071
@@ -53,3 +84,4 @@ and returns alternative text transcriptons.
5384
confidence: 0
5485
5586
.. _sync_recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/syncrecognize
87+
.. _Speech Asynchronous Recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/asyncrecognize

speech/google/cloud/speech/client.py

Lines changed: 181 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -19,30 +19,8 @@
1919
from google.cloud._helpers import _to_bytes
2020
from google.cloud import client as client_module
2121
from google.cloud.speech.connection import Connection
22-
23-
24-
class Encoding(object):
25-
"""Audio encoding types.
26-
27-
See:
28-
https://cloud.google.com/speech/reference/rest/v1beta1/\
29-
RecognitionConfig#AudioEncoding
30-
"""
31-
32-
LINEAR16 = 'LINEAR16'
33-
"""LINEAR16 encoding type."""
34-
35-
FLAC = 'FLAC'
36-
"""FLAC encoding type."""
37-
38-
MULAW = 'MULAW'
39-
"""MULAW encoding type."""
40-
41-
AMR = 'AMR'
42-
"""AMR encoding type."""
43-
44-
AMR_WB = 'AMR_WB'
45-
"""AMR_WB encoding type."""
22+
from google.cloud.speech.encoding import Encoding
23+
from google.cloud.speech.operation import Operation
4624

4725

4826
class Client(client_module.Client):
@@ -68,6 +46,81 @@ class Client(client_module.Client):
6846

6947
_connection_class = Connection
7048

49+
def async_recognize(self, content, source_uri, encoding, sample_rate,
50+
language_code=None, max_alternatives=None,
51+
profanity_filter=None, speech_context=None):
52+
"""Asynchronous Recognize request to Google Speech API.
53+
54+
.. _async_recognize: https://cloud.google.com/speech/reference/\
55+
rest/v1beta1/speech/asyncrecognize
56+
57+
See `async_recognize`_.
58+
59+
:type content: bytes
60+
:param content: Byte stream of audio.
61+
62+
:type source_uri: str
63+
:param source_uri: URI that points to a file that contains audio
64+
data bytes as specified in RecognitionConfig.
65+
Currently, only Google Cloud Storage URIs are
66+
supported, which must be specified in the following
67+
format: ``gs://bucket_name/object_name``.
68+
69+
:type encoding: str
70+
:param encoding: encoding of audio data sent in all RecognitionAudio
71+
messages, can be one of: :attr:`~.Encoding.LINEAR16`,
72+
:attr:`~.Encoding.FLAC`, :attr:`~.Encoding.MULAW`,
73+
:attr:`~.Encoding.AMR`, :attr:`~.Encoding.AMR_WB`
74+
75+
:type sample_rate: int
76+
:param sample_rate: Sample rate in Hertz of the audio data sent in all
77+
requests. Valid values are: 8000-48000. For best
78+
results, set the sampling rate of the audio source
79+
to 16000 Hz. If that's not possible, use the
80+
native sample rate of the audio source (instead of
81+
re-sampling).
82+
83+
:type language_code: str
84+
:param language_code: (Optional) The language of the supplied audio as
85+
BCP-47 language tag. Example: ``'en-GB'``.
86+
If omitted, defaults to ``'en-US'``.
87+
88+
:type max_alternatives: int
89+
:param max_alternatives: (Optional) Maximum number of recognition
90+
hypotheses to be returned. The server may
91+
return fewer than maxAlternatives.
92+
Valid values are 0-30. A value of 0 or 1
93+
will return a maximum of 1. Defaults to 1.
94+
95+
:type profanity_filter: bool
96+
:param profanity_filter: If True, the server will attempt to filter
97+
out profanities, replacing all but the
98+
initial character in each filtered word with
99+
asterisks, e.g. ``'f***'``. If False or
100+
omitted, profanities won't be filtered out.
101+
102+
:type speech_context: list
103+
:param speech_context: A list of strings (max 50) containing words and
104+
phrases "hints" so that the speech recognition
105+
is more likely to recognize them. This can be
106+
used to improve the accuracy for specific words
107+
and phrases. This can also be used to add new
108+
words to the vocabulary of the recognizer.
109+
110+
:rtype: :class:`~google.cloud.speech.operation.Operation`
111+
:returns: ``Operation`` for asynchronous request to Google Speech API.
112+
"""
113+
114+
data = _build_request_data(content, source_uri, encoding,
115+
sample_rate, language_code,
116+
max_alternatives, profanity_filter,
117+
speech_context)
118+
119+
api_response = self.connection.api_request(
120+
method='POST', path='speech:asyncrecognize', data=data)
121+
122+
return Operation.from_api_repr(self, api_response)
123+
71124
def sync_recognize(self, content, source_uri, encoding, sample_rate,
72125
language_code=None, max_alternatives=None,
73126
profanity_filter=None, speech_context=None):
@@ -139,44 +192,115 @@ def sync_recognize(self, content, source_uri, encoding, sample_rate,
139192
between 0 and 1.
140193
"""
141194

142-
if content is None and source_uri is None:
143-
raise ValueError('content and source_uri cannot be both '
144-
'equal to None')
145-
146-
if content is not None and source_uri is not None:
147-
raise ValueError('content and source_uri cannot be both '
148-
'different from None')
195+
data = _build_request_data(content, source_uri, encoding,
196+
sample_rate, language_code,
197+
max_alternatives, profanity_filter,
198+
speech_context)
149199

150-
if encoding is None:
151-
raise ValueError('encoding cannot be None')
152-
if sample_rate is None:
153-
raise ValueError('sample_rate cannot be None')
200+
api_response = self.connection.api_request(
201+
method='POST', path='speech:syncrecognize', data=data)
154202

155-
if content is not None:
156-
audio = {'content': b64encode(_to_bytes(content))}
203+
if len(api_response['results']) == 1:
204+
return api_response['results'][0]['alternatives']
157205
else:
158-
audio = {'uri': source_uri}
206+
raise ValueError('result in api should have length 1')
159207

160-
config = {'encoding': encoding, 'sampleRate': sample_rate}
161208

162-
if language_code is not None:
163-
config['languageCode'] = language_code
164-
if max_alternatives is not None:
165-
config['maxAlternatives'] = max_alternatives
166-
if profanity_filter is not None:
167-
config['profanityFilter'] = profanity_filter
168-
if speech_context is not None:
169-
config['speechContext'] = {'phrases': speech_context}
209+
def _build_request_data(content, source_uri, encoding, sample_rate,
210+
language_code=None, max_alternatives=None,
211+
profanity_filter=None, speech_context=None):
212+
"""Builds the request data before making API request.
213+
214+
:type content: bytes
215+
:param content: Byte stream of audio.
216+
217+
:type source_uri: str
218+
:param source_uri: URI that points to a file that contains audio
219+
data bytes as specified in RecognitionConfig.
220+
Currently, only Google Cloud Storage URIs are
221+
supported, which must be specified in the following
222+
format: ``gs://bucket_name/object_name``.
223+
224+
:type encoding: str
225+
:param encoding: encoding of audio data sent in all RecognitionAudio
226+
messages, can be one of: :attr:`~.Encoding.LINEAR16`,
227+
:attr:`~.Encoding.FLAC`, :attr:`~.Encoding.MULAW`,
228+
:attr:`~.Encoding.AMR`, :attr:`~.Encoding.AMR_WB`
229+
230+
:type sample_rate: int
231+
:param sample_rate: Sample rate in Hertz of the audio data sent in all
232+
requests. Valid values are: 8000-48000. For best
233+
results, set the sampling rate of the audio source
234+
to 16000 Hz. If that's not possible, use the
235+
native sample rate of the audio source (instead of
236+
re-sampling).
237+
238+
:type language_code: str
239+
:param language_code: (Optional) The language of the supplied audio as
240+
BCP-47 language tag. Example: ``'en-GB'``.
241+
If omitted, defaults to ``'en-US'``.
242+
243+
:type max_alternatives: int
244+
:param max_alternatives: (Optional) Maximum number of recognition
245+
hypotheses to be returned. The server may
246+
return fewer than maxAlternatives.
247+
Valid values are 0-30. A value of 0 or 1
248+
will return a maximum of 1. Defaults to 1.
249+
250+
:type profanity_filter: bool
251+
:param profanity_filter: If True, the server will attempt to filter
252+
out profanities, replacing all but the
253+
initial character in each filtered word with
254+
asterisks, e.g. ``'f***'``. If False or
255+
omitted, profanities won't be filtered out.
256+
257+
:type speech_context: list
258+
:param speech_context: A list of strings (max 50) containing words and
259+
phrases "hints" so that the speech recognition
260+
is more likely to recognize them. This can be
261+
used to improve the accuracy for specific words
262+
and phrases. This can also be used to add new
263+
words to the vocabulary of the recognizer.
264+
265+
:rtype: dict
266+
:returns: Dictionary with required data for Google Speech API.
267+
"""
268+
if content is None and source_uri is None:
269+
raise ValueError('content and source_uri cannot be both '
270+
'equal to None')
170271

171-
data = {
172-
'audio': audio,
173-
'config': config,
174-
}
272+
if content is not None and source_uri is not None:
273+
raise ValueError('content and source_uri cannot be both '
274+
'different from None')
175275

176-
api_response = self.connection.api_request(
177-
method='POST', path='syncrecognize', data=data)
276+
if encoding is None:
277+
raise ValueError('encoding cannot be None')
178278

179-
if len(api_response['results']) == 1:
180-
return api_response['results'][0]['alternatives']
181-
else:
182-
raise ValueError('result in api should have length 1')
279+
encoding_value = getattr(Encoding, encoding)
280+
281+
if sample_rate is None:
282+
raise ValueError('sample_rate cannot be None')
283+
284+
if content is not None:
285+
audio = {'content': b64encode(_to_bytes(content))}
286+
else:
287+
audio = {'uri': source_uri}
288+
289+
config = {'encoding': encoding_value,
290+
'sampleRate': sample_rate}
291+
292+
if language_code is not None:
293+
config['languageCode'] = language_code
294+
if max_alternatives is not None:
295+
config['maxAlternatives'] = max_alternatives
296+
if profanity_filter is not None:
297+
config['profanityFilter'] = profanity_filter
298+
if speech_context is not None:
299+
config['speechContext'] = {'phrases': speech_context}
300+
301+
data = {
302+
'audio': audio,
303+
'config': config,
304+
}
305+
306+
return data

speech/google/cloud/speech/connection.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ class Connection(base_connection.JSONConnection):
2626
API_VERSION = 'v1beta1'
2727
"""The version of the API, used in building the API call's URL."""
2828

29-
API_URL_TEMPLATE = '{api_base_url}/{api_version}/speech:{path}'
29+
API_URL_TEMPLATE = '{api_base_url}/{api_version}/{path}'
3030
"""A template for the URL of a particular API call."""
3131

3232
SCOPE = ('https://www.googleapis.com/auth/cloud-platform',)

0 commit comments

Comments
 (0)