1919from google .cloud ._helpers import _to_bytes
2020from google .cloud import client as client_module
2121from google .cloud .speech .connection import Connection
22-
23-
class Encoding(object):
    """Names of the audio encodings understood by the Speech API.

    Every attribute is a plain string whose value equals its own name, so
    a member can be used anywhere the API expects an encoding name.

    See:
    https://cloud.google.com/speech/reference/rest/v1beta1/RecognitionConfig#AudioEncoding
    """

    LINEAR16 = 'LINEAR16'
    """Encoding name ``'LINEAR16'``."""

    FLAC = 'FLAC'
    """Encoding name ``'FLAC'``."""

    MULAW = 'MULAW'
    """Encoding name ``'MULAW'``."""

    AMR = 'AMR'
    """Encoding name ``'AMR'``."""

    AMR_WB = 'AMR_WB'
    """Encoding name ``'AMR_WB'``."""
22+ from google .cloud .speech .encoding import Encoding
23+ from google .cloud .speech .operation import Operation
4624
4725
4826class Client (client_module .Client ):
@@ -68,6 +46,81 @@ class Client(client_module.Client):
6846
6947 _connection_class = Connection
7048
def async_recognize(self, content, source_uri, encoding, sample_rate,
                    language_code=None, max_alternatives=None,
                    profanity_filter=None, speech_context=None):
    """Asynchronous Recognize request to Google Speech API.

    .. _async_recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/asyncrecognize

    See `async_recognize`_.

    The arguments are handed unchanged to ``_build_request_data``, the
    resulting payload is POSTed to ``speech:asyncrecognize`` and the API
    response is wrapped in an ``Operation``.

    :type content: bytes
    :param content: Byte stream of audio.

    :type source_uri: str
    :param source_uri: URI that points to a file that contains audio
                       data bytes as specified in RecognitionConfig.
                       Currently, only Google Cloud Storage URIs are
                       supported, which must be specified in the following
                       format: ``gs://bucket_name/object_name``.

    :type encoding: str
    :param encoding: encoding of audio data sent in all RecognitionAudio
                     messages, can be one of: :attr:`~.Encoding.LINEAR16`,
                     :attr:`~.Encoding.FLAC`, :attr:`~.Encoding.MULAW`,
                     :attr:`~.Encoding.AMR`, :attr:`~.Encoding.AMR_WB`

    :type sample_rate: int
    :param sample_rate: Sample rate in Hertz of the audio data sent in all
                        requests. Valid values are: 8000-48000. For best
                        results, set the sampling rate of the audio source
                        to 16000 Hz. If that's not possible, use the
                        native sample rate of the audio source (instead of
                        re-sampling).

    :type language_code: str
    :param language_code: (Optional) The language of the supplied audio as
                          BCP-47 language tag. Example: ``'en-GB'``.
                          If omitted, defaults to ``'en-US'``.

    :type max_alternatives: int
    :param max_alternatives: (Optional) Maximum number of recognition
                             hypotheses to be returned. The server may
                             return fewer than maxAlternatives.
                             Valid values are 0-30. A value of 0 or 1
                             will return a maximum of 1. Defaults to 1

    :type profanity_filter: bool
    :param profanity_filter: If True, the server will attempt to filter
                             out profanities, replacing all but the
                             initial character in each filtered word with
                             asterisks, e.g. ``'f***'``. If False or
                             omitted, profanities won't be filtered out.

    :type speech_context: list
    :param speech_context: A list of strings (max 50) containing words and
                           phrases "hints" so that the speech recognition
                           is more likely to recognize them. This can be
                           used to improve the accuracy for specific words
                           and phrases. This can also be used to add new
                           words to the vocabulary of the recognizer.

    :rtype: :class:`~google.cloud.speech.operation.Operation`
    :returns: ``Operation`` for asynchronous request to Google Speech API.
    """
    # Argument validation and payload assembly are shared with
    # ``sync_recognize`` through the module-level helper.
    request_body = _build_request_data(
        content, source_uri, encoding, sample_rate, language_code,
        max_alternatives, profanity_filter, speech_context)

    response = self.connection.api_request(
        method='POST', path='speech:asyncrecognize', data=request_body)

    return Operation.from_api_repr(self, response)
123+
71124 def sync_recognize (self , content , source_uri , encoding , sample_rate ,
72125 language_code = None , max_alternatives = None ,
73126 profanity_filter = None , speech_context = None ):
@@ -139,44 +192,115 @@ def sync_recognize(self, content, source_uri, encoding, sample_rate,
139192 between 0 and 1.
140193 """
141194
142- if content is None and source_uri is None :
143- raise ValueError ('content and source_uri cannot be both '
144- 'equal to None' )
145-
146- if content is not None and source_uri is not None :
147- raise ValueError ('content and source_uri cannot be both '
148- 'different from None' )
195+ data = _build_request_data (content , source_uri , encoding ,
196+ sample_rate , language_code ,
197+ max_alternatives , profanity_filter ,
198+ speech_context )
149199
150- if encoding is None :
151- raise ValueError ('encoding cannot be None' )
152- if sample_rate is None :
153- raise ValueError ('sample_rate cannot be None' )
200+ api_response = self .connection .api_request (
201+ method = 'POST' , path = 'speech:syncrecognize' , data = data )
154202
155- if content is not None :
156- audio = { 'content' : b64encode ( _to_bytes ( content ))}
203+ if len ( api_response [ 'results' ]) == 1 :
204+ return api_response [ 'results' ][ 0 ][ 'alternatives' ]
157205 else :
158- audio = { 'uri' : source_uri }
206+ raise ValueError ( 'result in api should have length 1' )
159207
160- config = {'encoding' : encoding , 'sampleRate' : sample_rate }
161208
162- if language_code is not None :
163- config ['languageCode' ] = language_code
164- if max_alternatives is not None :
165- config ['maxAlternatives' ] = max_alternatives
166- if profanity_filter is not None :
167- config ['profanityFilter' ] = profanity_filter
168- if speech_context is not None :
169- config ['speechContext' ] = {'phrases' : speech_context }
def _build_request_data(content, source_uri, encoding, sample_rate,
                        language_code=None, max_alternatives=None,
                        profanity_filter=None, speech_context=None):
    """Builds the request data before making API request.

    Exactly one of ``content`` and ``source_uri`` must be given. Optional
    settings left as ``None`` are omitted from the resulting config.

    :type content: bytes
    :param content: Byte stream of audio.

    :type source_uri: str
    :param source_uri: URI that points to a file that contains audio
                       data bytes as specified in RecognitionConfig.
                       Currently, only Google Cloud Storage URIs are
                       supported, which must be specified in the following
                       format: ``gs://bucket_name/object_name``.

    :type encoding: str
    :param encoding: encoding of audio data sent in all RecognitionAudio
                     messages, can be one of: :attr:`~.Encoding.LINEAR16`,
                     :attr:`~.Encoding.FLAC`, :attr:`~.Encoding.MULAW`,
                     :attr:`~.Encoding.AMR`, :attr:`~.Encoding.AMR_WB`

    :type sample_rate: int
    :param sample_rate: Sample rate in Hertz of the audio data sent in all
                        requests. Valid values are: 8000-48000. For best
                        results, set the sampling rate of the audio source
                        to 16000 Hz. If that's not possible, use the
                        native sample rate of the audio source (instead of
                        re-sampling).

    :type language_code: str
    :param language_code: (Optional) The language of the supplied audio as
                          BCP-47 language tag. Example: ``'en-GB'``.
                          If omitted, defaults to ``'en-US'``.

    :type max_alternatives: int
    :param max_alternatives: (Optional) Maximum number of recognition
                             hypotheses to be returned. The server may
                             return fewer than maxAlternatives.
                             Valid values are 0-30. A value of 0 or 1
                             will return a maximum of 1. Defaults to 1

    :type profanity_filter: bool
    :param profanity_filter: If True, the server will attempt to filter
                             out profanities, replacing all but the
                             initial character in each filtered word with
                             asterisks, e.g. ``'f***'``. If False or
                             omitted, profanities won't be filtered out.

    :type speech_context: list
    :param speech_context: A list of strings (max 50) containing words and
                           phrases "hints" so that the speech recognition
                           is more likely to recognize them. This can be
                           used to improve the accuracy for specific words
                           and phrases. This can also be used to add new
                           words to the vocabulary of the recognizer.

    :rtype: dict
    :returns: Dictionary with required data for Google Speech API, holding
              an ``'audio'`` entry and a ``'config'`` entry.

    :raises: :class:`ValueError` if ``content`` and ``source_uri`` are both
             ``None`` or both set, or if ``encoding`` or ``sample_rate``
             is ``None``. :class:`AttributeError` if ``encoding`` does not
             name an attribute of ``Encoding``.
    """
    if content is None and source_uri is None:
        raise ValueError('content and source_uri cannot be both '
                         'equal to None')

    if content is not None and source_uri is not None:
        raise ValueError('content and source_uri cannot be both '
                         'different from None')

    if encoding is None:
        raise ValueError('encoding cannot be None')

    # Resolve the name through ``Encoding`` so an unknown encoding fails
    # here rather than being sent to the API.
    encoding_value = getattr(Encoding, encoding)

    if sample_rate is None:
        raise ValueError('sample_rate cannot be None')

    if content is not None:
        # ``b64encode`` returns ``bytes`` on Python 3, which ``json`` cannot
        # serialize; base64 output is pure ASCII, so decode it to text.
        encoded_audio = b64encode(_to_bytes(content)).decode('ascii')
        audio = {'content': encoded_audio}
    else:
        audio = {'uri': source_uri}

    config = {'encoding': encoding_value,
              'sampleRate': sample_rate}

    # Only forward the optional settings the caller actually supplied.
    if language_code is not None:
        config['languageCode'] = language_code
    if max_alternatives is not None:
        config['maxAlternatives'] = max_alternatives
    if profanity_filter is not None:
        config['profanityFilter'] = profanity_filter
    if speech_context is not None:
        config['speechContext'] = {'phrases': speech_context}

    return {
        'audio': audio,
        'config': config,
    }
0 commit comments