|
| 1 | + |
1 | 2 | #!/usr/bin/env python |
2 | 3 |
|
3 | 4 | # Copyright 2017 Google Inc. All Rights Reserved. |
|
23 | 24 | """ |
24 | 25 |
|
25 | 26 | import argparse |
| 27 | +import csv |
| 28 | +import datetime |
26 | 29 | import io |
| 30 | +import os |
| 31 | + |
| 32 | +from google.cloud import speech_v1p1beta1 as speech |
| 33 | +from google.cloud import storage |
27 | 34 |
|
| 35 | +def _safe_filename(filename): |
| 36 | + """ |
| 37 | + Generates a safe filename that is unlikely to collide with existing objects |
| 38 | + in Google Cloud Storage. |
| 39 | + ``filename.ext`` is transformed into ``filename-YYYY-MM-DD-HHMMSS.ext`` |
| 40 | + """ |
| 41 | + date = datetime.datetime.utcnow().strftime("%Y-%m-%d-%H%M%S") |
| 42 | + basename, extension = filename.rsplit('.', 1) |
| 43 | + return "{0}-{1}.{2}".format(basename, date, extension) |
28 | 44 |
|
29 | 45 | # [START def_transcribe_file] |
30 | | -def transcribe_file(speech_file): |
| 46 | +def transcribe_file(filename, output): |
31 | 47 | """Transcribe the given audio file asynchronously.""" |
32 | | - from google.cloud import speech |
33 | | - from google.cloud.speech import enums |
34 | | - from google.cloud.speech import types |
35 | | - client = speech.SpeechClient() |
36 | | - |
37 | | - # [START migration_async_request] |
38 | | - with io.open(speech_file, 'rb') as audio_file: |
39 | | - content = audio_file.read() |
40 | | - |
41 | | - audio = types.RecognitionAudio(content=content) |
42 | | - config = types.RecognitionConfig( |
43 | | - encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, |
44 | | - sample_rate_hertz=16000, |
45 | | - language_code='en-US') |
46 | | - |
47 | | - # [START migration_async_response] |
48 | | - operation = client.long_running_recognize(config, audio) |
49 | | - # [END migration_async_request] |
50 | | - |
51 | | - print('Waiting for operation to complete...') |
52 | | - response = operation.result(timeout=90) |
53 | | - |
54 | | - # Each result is for a consecutive portion of the audio. Iterate through |
55 | | - # them to get the transcripts for the entire audio file. |
56 | | - for result in response.results: |
57 | | - # The first alternative is the most likely one for this portion. |
58 | | - print(u'Transcript: {}'.format(result.alternatives[0].transcript)) |
59 | | - print('Confidence: {}'.format(result.alternatives[0].confidence)) |
60 | | - # [END migration_async_response] |
| 48 | + client = storage.Client() |
| 49 | + |
| 50 | + bucket_name = 'bjoeris-temp-audio' |
| 51 | + bucket = client.bucket(bucket_name) |
| 52 | + blob_name = _safe_filename(filename) |
| 53 | + blob = bucket.blob(blob_name) |
| 54 | + print("Uploading file...") |
| 55 | + with io.open(filename, 'rb') as audio_file: |
| 56 | + blob.upload_from_file(audio_file) |
| 57 | + uri = "gs://{}/{}".format(bucket_name, blob_name) |
| 58 | + |
| 59 | + transcribe_gcs(uri, output) |
| 60 | + print("Deleting file...") |
| 61 | + blob.delete() |
61 | 62 | # [END def_transcribe_file] |
62 | 63 |
|
63 | 64 |
|
64 | 65 | # [START def_transcribe_gcs] |
65 | | -def transcribe_gcs(gcs_uri): |
| 66 | +def transcribe_gcs(gcs_uri, output): |
66 | 67 | """Asynchronously transcribes the audio file specified by the gcs_uri.""" |
67 | | - from google.cloud import speech |
68 | | - from google.cloud.speech import enums |
69 | | - from google.cloud.speech import types |
70 | 68 | client = speech.SpeechClient() |
71 | 69 |
|
72 | | - audio = types.RecognitionAudio(uri=gcs_uri) |
73 | | - config = types.RecognitionConfig( |
74 | | - encoding=enums.RecognitionConfig.AudioEncoding.FLAC, |
| 70 | + audio = speech.types.RecognitionAudio(uri=gcs_uri) |
| 71 | + |
| 72 | + metadata = speech.types.RecognitionMetadata() |
| 73 | + metadata.interaction_type = speech.enums.RecognitionMetadata.InteractionType.DISCUSSION |
| 74 | + metadata.microphone_distance = speech.enums.RecognitionMetadata.MicrophoneDistance.NEARFIELD |
| 75 | + metadata.recording_device_type = speech.enums.RecognitionMetadata.RecordingDeviceType.PC |
| 76 | + config = speech.types.RecognitionConfig( |
| 77 | + encoding=speech.enums.RecognitionConfig.AudioEncoding.FLAC, |
75 | 78 | sample_rate_hertz=16000, |
76 | | - language_code='en-US') |
| 79 | + language_code='en-US', |
| 80 | + metadata=metadata, |
| 81 | + enable_automatic_punctuation=True, |
| 82 | + enable_word_time_offsets=True) |
77 | 83 |
|
78 | 84 | operation = client.long_running_recognize(config, audio) |
79 | 85 |
|
80 | | - print('Waiting for operation to complete...') |
| 86 | + print('Transcribing...') |
81 | 87 | response = operation.result(timeout=90) |
82 | 88 |
|
83 | 89 | # Each result is for a consecutive portion of the audio. Iterate through |
84 | 90 | # them to get the transcripts for the entire audio file. |
85 | | - for result in response.results: |
86 | | - # The first alternative is the most likely one for this portion. |
87 | | - print(u'Transcript: {}'.format(result.alternatives[0].transcript)) |
88 | | - print('Confidence: {}'.format(result.alternatives[0].confidence)) |
89 | | -# [END def_transcribe_gcs] |
| 91 | + timestamp = 0.0 |
| 92 | + with open(output, 'w', newline='') as csvfile: |
| 93 | + fieldnames = ['timestamp', 'confidence', 'transcript'] |
| 94 | + csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames) |
| 95 | + csvwriter.writeheader() |
| 96 | + for result in response.results: |
| 97 | + alternative = result.alternatives[0] |
| 98 | + if len(alternative.words) > 0: |
| 99 | + timestamp = alternative.words[0].start_time |
| 100 | + timestamp = timestamp.seconds + 1e-9*timestamp.nanos |
| 101 | + timestamp_mins = int(timestamp // 60) |
| 102 | + timestamp_secs = timestamp - timestamp_mins * 60 |
| 103 | + csvwriter.writerow({ |
| 104 | + 'timestamp': '{}:{}'.format(timestamp_mins, timestamp_secs), |
| 105 | + 'confidence': alternative.confidence, |
| 106 | + 'transcript': alternative.transcript, |
| 107 | + }) |
| 108 | + print(u'{}:{} | {} | {}'.format(timestamp_mins, timestamp_secs , alternative.confidence, alternative.transcript)) |
| 109 | +# [END def_transcribe_gcs]
90 | 110 |
|
91 | 111 |
|
92 | 112 | if __name__ == '__main__': |
93 | 113 | parser = argparse.ArgumentParser( |
94 | 114 | description=__doc__, |
95 | 115 | formatter_class=argparse.RawDescriptionHelpFormatter) |
96 | 116 | parser.add_argument( |
97 | | - 'path', help='File or GCS path for audio file to be recognized') |
| 117 | + 'audio_file', help='File or GCS path for audio file to be transcribed') |
| 118 | + parser.add_argument( |
| 119 | + '--out', help='File to save the results (CSV)') |
98 | 120 | args = parser.parse_args() |
99 | | - if args.path.startswith('gs://'): |
100 | | - transcribe_gcs(args.path) |
| 121 | + if args.out is None: |
| 122 | + args.out = os.path.splitext(args.audio_file)[0] + ".csv" |
| 123 | + if args.audio_file.startswith('gs://'): |
| 124 | + transcribe_gcs(args.audio_file, args.out) |
101 | 125 | else: |
102 | | - transcribe_file(args.path) |
| 126 | + transcribe_file(args.audio_file, args.out) |
0 commit comments