Speech API: enhanced model and recognition metadata (GoogleCloudPlatform#1436)

dizcology · chenyumic · commit 50c3bcd53e10 · 2018-04-06T13:05:27.000-07:00
* enhanced model and recognition metadata

* flake, update tests

* readme

* client library version update
diff --git a/speech/cloud-client/README.rst b/speech/cloud-client/README.rst
@@ -16,7 +16,7 @@ This directory contains samples for Google Cloud Speech API. The `Google Cloud S
 
 
 
-.. _Google Cloud Speech API: https://cloud.google.com/speech/docs/ 
+.. _Google Cloud Speech API: https://cloud.google.com/speech/docs/
 
 Setup
 -------------------------------------------------------------------------------
@@ -91,22 +91,21 @@ To run this sample:
     $ python transcribe.py
 
     usage: transcribe.py [-h] path
-
+    
     Google Cloud Speech API sample application using the REST API for batch
     processing.
-
+    
     Example usage:
         python transcribe.py resources/audio.raw
         python transcribe.py gs://cloud-samples-tests/speech/brooklyn.flac
-
+    
     positional arguments:
       path        File or GCS path for audio file to be recognized
-
+    
     optional arguments:
       -h, --help  show this help message and exit
 
 
-
 Transcribe async
 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 
@@ -123,22 +122,21 @@ To run this sample:
     $ python transcribe_async.py
 
     usage: transcribe_async.py [-h] path
-
+    
     Google Cloud Speech API sample application using the REST API for async
     batch processing.
-
+    
     Example usage:
         python transcribe_async.py resources/audio.raw
         python transcribe_async.py gs://cloud-samples-tests/speech/vr.flac
-
+    
     positional arguments:
       path        File or GCS path for audio file to be recognized
-
+    
     optional arguments:
       -h, --help  show this help message and exit
 
 
-
 Transcribe with word time offsets
 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 
@@ -155,21 +153,20 @@ To run this sample:
     $ python transcribe_word_time_offsets.py
 
     usage: transcribe_word_time_offsets.py [-h] path
-
+    
     Google Cloud Speech API sample that demonstrates word time offsets.
-
+    
     Example usage:
         python transcribe_word_time_offsets.py resources/audio.raw
         python transcribe_word_time_offsets.py         gs://cloud-samples-tests/speech/vr.flac
-
+    
     positional arguments:
       path        File or GCS path for audio file to be recognized
-
+    
     optional arguments:
       -h, --help  show this help message and exit
 
 
-
 Transcribe Streaming
 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 
@@ -186,19 +183,50 @@ To run this sample:
     $ python transcribe_streaming.py
 
     usage: transcribe_streaming.py [-h] stream
-
+    
     Google Cloud Speech API sample application using the streaming API.
-
+    
     Example usage:
         python transcribe_streaming.py resources/audio.raw
-
+    
     positional arguments:
       stream      File to stream to the API
-
+    
     optional arguments:
       -h, --help  show this help message and exit
 
 
+Beta Samples
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+   :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=speech/cloud-client/beta_snippets.py;speech/cloud-client/README.rst
+
+
+
+
+To run this sample:
+
+.. code-block:: bash
+
+    $ python beta_snippets.py
+
+    usage: beta_snippets.py [-h] command path
+    
+    Google Cloud Speech API sample that demonstrates enhanced models
+    and recognition metadata.
+    
+    Example usage:
+        python beta_snippets.py enhanced-model resources/commercial_mono.wav
+        python beta_snippets.py metadata resources/commercial_mono.wav
+    
+    positional arguments:
+      command
+      path        File for audio file to be recognized
+    
+    optional arguments:
+      -h, --help  show this help message and exit
+
 
 
 
diff --git a/speech/cloud-client/README.rst.in b/speech/cloud-client/README.rst.in
@@ -34,6 +34,9 @@ samples:
 - name: Transcribe Streaming
   file: transcribe_streaming.py
   show_help: true
+- name: Beta Samples
+  file: beta_snippets.py
+  show_help: true
 
 cloud_client_library: true
 
diff --git a/speech/cloud-client/beta_snippets.py b/speech/cloud-client/beta_snippets.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+
+# Copyright 2018 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Google Cloud Speech API sample that demonstrates enhanced models
+and recognition metadata.
+
+Example usage:
+    python beta_snippets.py enhanced-model resources/commercial_mono.wav
+    python beta_snippets.py metadata resources/commercial_mono.wav
+"""
+
+import argparse
+import io
+
+from google.cloud import speech_v1p1beta1 as speech
+
+
+# [START speech_transcribe_file_with_enhanced_model]
+def transcribe_file_with_enhanced_model(path):
+    """Transcribe the given audio file using an enhanced model."""
+    client = speech.SpeechClient()
+
+    with io.open(path, 'rb') as audio_file:
+        content = audio_file.read()
+
+    audio = speech.types.RecognitionAudio(content=content)
+    config = speech.types.RecognitionConfig(
+        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
+        sample_rate_hertz=8000,
+        language_code='en-US',
+        # Enhanced models are only available to projects that
+        # opt in for audio data collection.
+        use_enhanced=True,
+        # A model must be specified to use an enhanced model.
+        model='phone_call')
+
+    response = client.recognize(config, audio)
+
+    for i, result in enumerate(response.results):
+        alternative = result.alternatives[0]
+        print('-' * 20)
+        print('First alternative of result {}'.format(i))
+        print('Transcript: {}'.format(alternative.transcript))
+# [END speech_transcribe_file_with_enhanced_model]
+
+
+# [START speech_transcribe_file_with_metadata]
+def transcribe_file_with_metadata(path):
+    """Send a request that includes recognition metadata."""
+    client = speech.SpeechClient()
+
+    with io.open(path, 'rb') as audio_file:
+        content = audio_file.read()
+
+    # Here we construct a recognition metadata object.
+    # Most metadata fields are specified as enums that can be found
+    # in speech.enums.RecognitionMetadata.
+    metadata = speech.types.RecognitionMetadata()
+    metadata.interaction_type = (
+        speech.enums.RecognitionMetadata.InteractionType.DISCUSSION)
+    metadata.microphone_distance = (
+        speech.enums.RecognitionMetadata.MicrophoneDistance.NEARFIELD)
+    metadata.recording_device_type = (
+        speech.enums.RecognitionMetadata.RecordingDeviceType.SMARTPHONE)
+    # Some metadata fields are free-form strings.
+    metadata.recording_device_name = "Pixel 2 XL"
+    # And some are integers, for instance the 6-digit NAICS code:
+    # https://www.naics.com/search/
+    metadata.industry_naics_code_of_audio = 519190
+
+    audio = speech.types.RecognitionAudio(content=content)
+    config = speech.types.RecognitionConfig(
+        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
+        sample_rate_hertz=8000,
+        language_code='en-US',
+        # Add this in the request to send metadata.
+        metadata=metadata)
+
+    response = client.recognize(config, audio)
+
+    for i, result in enumerate(response.results):
+        alternative = result.alternatives[0]
+        print('-' * 20)
+        print('First alternative of result {}'.format(i))
+        print('Transcript: {}'.format(alternative.transcript))
+# [END speech_transcribe_file_with_metadata]
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('command')
+    parser.add_argument(
+        'path', help='File for audio file to be recognized')
+
+    args = parser.parse_args()
+
+    if args.command == 'enhanced-model':
+        transcribe_file_with_enhanced_model(args.path)
+    elif args.command == 'metadata':
+        transcribe_file_with_metadata(args.path)
diff --git a/speech/cloud-client/beta_snippets_test.py b/speech/cloud-client/beta_snippets_test.py
@@ -0,0 +1,35 @@
+# Copyright 2018, Google, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from beta_snippets import (
+    transcribe_file_with_enhanced_model, transcribe_file_with_metadata)
+
+RESOURCES = os.path.join(os.path.dirname(__file__), 'resources')
+
+
+def test_transcribe_file_with_enhanced_model(capsys):
+    transcribe_file_with_enhanced_model(
+        os.path.join(RESOURCES, 'commercial_mono.wav'))
+    out, _ = capsys.readouterr()
+    # The captured output (the printed transcript) should contain 'Chrome'.
+    assert 'Chrome' in out
+
+
+def test_transcribe_file_with_metadata(capsys):
+    transcribe_file_with_metadata(
+        os.path.join(RESOURCES, 'commercial_mono.wav'))
+    out, _ = capsys.readouterr()
+    # The captured output (the printed transcript) should contain 'Chrome'.
+    assert 'Chrome' in out
diff --git a/speech/cloud-client/requirements.txt b/speech/cloud-client/requirements.txt
@@ -1 +1 @@
-google-cloud-speech==0.32.1
+google-cloud-speech==0.33.0
diff --git a/speech/cloud-client/resources/commercial_mono.wav b/speech/cloud-client/resources/commercial_mono.wav

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-google-cloud-speech==0.32.1`
	`1`	`+google-cloud-speech==0.33.0`