Skip to content

Commit a0a6a14

Browse files
committed
Added csv transcript
1 parent 5e958b5 commit a0a6a14

File tree

2 files changed

+73
-48
lines changed

2 files changed

+73
-48
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
google-cloud-speech==0.33.0
2+
google-cloud-storage==1.7.0
Lines changed: 72 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
#!/usr/bin/env python
23

34
# Copyright 2017 Google Inc. All Rights Reserved.
@@ -23,80 +24,103 @@
2324
"""
2425

2526
import argparse
27+
import csv
28+
import datetime
2629
import io
30+
import os
31+
32+
from google.cloud import speech_v1p1beta1 as speech
33+
from google.cloud import storage
2734

35+
def _safe_filename(filename):
    """
    Generates a safe filename that is unlikely to collide with existing objects
    in Google Cloud Storage.

    ``filename.ext`` is transformed into ``filename-YYYY-MM-DD-HHMMSS.ext``;
    a name without an extension becomes ``filename-YYYY-MM-DD-HHMMSS``.

    :param filename: file name (may include directory components).
    :returns: the same name with a UTC timestamp inserted before the
        extension.
    """
    date = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%Y-%m-%d-%H%M%S")
    # splitext only treats a dot in the last path component as an extension
    # separator, and returns an empty extension instead of failing when there
    # is none (the previous rsplit('.', 1) unpacking raised ValueError).
    root, extension = os.path.splitext(filename)
    return "{0}-{1}{2}".format(root, date, extension)
2844

2945
# [START def_transcribe_file]
30-
def transcribe_file(filename, output, bucket_name='bjoeris-temp-audio'):
    """Transcribe the given local audio file asynchronously.

    The file is uploaded to a temporary object in Google Cloud Storage,
    transcribed through ``transcribe_gcs``, and the temporary object is
    deleted afterwards, even when transcription fails.

    :param filename: path of the local audio file to upload.
    :param output: path of the CSV file that receives the transcript.
    :param bucket_name: name of the bucket used for the temporary upload.
    """
    client = storage.Client()

    bucket = client.bucket(bucket_name)
    # Use only the base name so local directory components do not end up in
    # the object name.
    blob_name = _safe_filename(os.path.basename(filename))
    blob = bucket.blob(blob_name)
    print("Uploading file...")
    with io.open(filename, 'rb') as audio_file:
        blob.upload_from_file(audio_file)
    uri = "gs://{}/{}".format(bucket_name, blob_name)

    try:
        transcribe_gcs(uri, output)
    finally:
        # Always remove the temporary upload, including on failure.
        print("Deleting file...")
        blob.delete()
6162
# [END def_transcribe_file]
6263

6364

6465
# [START def_transcribe_gcs]
65-
def _format_timestamp(seconds):
    """Render a number of seconds as ``M:SS.mmm`` (minutes are not capped)."""
    # Work in whole milliseconds so float noise never reaches the output and
    # rounding cannot produce a seconds field of 60.
    total_ms = int(round(seconds * 1000))
    minutes, rem_ms = divmod(total_ms, 60000)
    return '{}:{:02d}.{:03d}'.format(minutes, rem_ms // 1000, rem_ms % 1000)


def transcribe_gcs(gcs_uri, output, timeout=90):
    """Asynchronously transcribes the audio file specified by the gcs_uri.

    Writes one CSV row per recognition result to ``output`` with the columns
    ``timestamp``, ``confidence`` and ``transcript``, and echoes each row to
    stdout.

    :param gcs_uri: ``gs://`` URI of the audio to transcribe.
    :param output: path of the CSV file to write.
    :param timeout: seconds to wait for the long-running operation.
    """
    client = speech.SpeechClient()

    audio = speech.types.RecognitionAudio(uri=gcs_uri)

    metadata = speech.types.RecognitionMetadata()
    metadata.interaction_type = (
        speech.enums.RecognitionMetadata.InteractionType.DISCUSSION)
    metadata.microphone_distance = (
        speech.enums.RecognitionMetadata.MicrophoneDistance.NEARFIELD)
    metadata.recording_device_type = (
        speech.enums.RecognitionMetadata.RecordingDeviceType.PC)
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US',
        metadata=metadata,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True)

    operation = client.long_running_recognize(config, audio)

    print('Transcribing...')
    response = operation.result(timeout=timeout)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    # A result without word timings reuses the most recent start time.
    start_seconds = 0.0
    with open(output, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['timestamp', 'confidence', 'transcript']
        csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
        csvwriter.writeheader()
        for result in response.results:
            if not result.alternatives:
                # Nothing was recognized for this portion; skip it rather
                # than fail on alternatives[0].
                continue
            # The first alternative is the most likely one for this portion.
            alternative = result.alternatives[0]
            if alternative.words:
                first_word_start = alternative.words[0].start_time
                start_seconds = (first_word_start.seconds
                                 + 1e-9 * first_word_start.nanos)
            stamp = _format_timestamp(start_seconds)
            csvwriter.writerow({
                'timestamp': stamp,
                'confidence': alternative.confidence,
                'transcript': alternative.transcript,
            })
            print(u'{} | {} | {}'.format(
                stamp, alternative.confidence, alternative.transcript))
109+
# [END def_transcribe_gcs]
90110

91111

92112
# Command-line entry point: transcribe a local file or a gs:// URI to CSV.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'audio_file', help='File or GCS path for audio file to be transcribed')
    parser.add_argument(
        '--out', help='File to save the results (CSV)')
    args = parser.parse_args()

    is_gcs_uri = args.audio_file.startswith('gs://')
    if args.out is None:
        source_name = args.audio_file
        if is_gcs_uri:
            # A gs:// URI is not a local path: derive the default output
            # name from the object's base name so the CSV is written to the
            # current directory.
            source_name = args.audio_file.rsplit('/', 1)[-1]
        stem = os.path.splitext(source_name)[0] or 'transcript'
        args.out = stem + ".csv"

    if is_gcs_uri:
        transcribe_gcs(args.audio_file, args.out)
    else:
        transcribe_file(args.audio_file, args.out)

0 commit comments

Comments
 (0)