From 8474c12e84b223470fd6ff3b234e74a40788379f Mon Sep 17 00:00:00 2001 From: Chris Forster Date: Thu, 22 Aug 2019 15:25:06 -0700 Subject: [PATCH 1/3] Removing unnecessary subprocess.communicate calls --- PyTorch/LanguageModeling/BERT/data/BooksDownloader.py | 1 - PyTorch/LanguageModeling/BERT/data/bertPrep.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/PyTorch/LanguageModeling/BERT/data/BooksDownloader.py b/PyTorch/LanguageModeling/BERT/data/BooksDownloader.py index 4b0f5340d..49b7b57f4 100644 --- a/PyTorch/LanguageModeling/BERT/data/BooksDownloader.py +++ b/PyTorch/LanguageModeling/BERT/data/BooksDownloader.py @@ -13,4 +13,3 @@ def download(self): bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus' bookscorpus_download_command += ' --trash-bad-count' bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True) - bookscorpus_download_process.communicate() diff --git a/PyTorch/LanguageModeling/BERT/data/bertPrep.py b/PyTorch/LanguageModeling/BERT/data/bertPrep.py index b83b26a78..f104914e5 100644 --- a/PyTorch/LanguageModeling/BERT/data/bertPrep.py +++ b/PyTorch/LanguageModeling/BERT/data/bertPrep.py @@ -70,7 +70,6 @@ def main(args): wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset print('WikiExtractor Command:', wikiextractor_command) wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True) - #wikiextractor_process.communicate() wiki_path = working_dir + '/' + directory_structure['extracted'] + '/wikicorpus_en' output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt' @@ -84,7 +83,6 @@ def main(args): wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + 
'/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset print('WikiExtractor Command:', wikiextractor_command) wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True) - #wikiextractor_process.communicate() wiki_path = working_dir + '/' + directory_structure['extracted'] + '/wikicorpus_zh' output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt' From 7df14d232c901c7a5717147b99c5c42fb45845d2 Mon Sep 17 00:00:00 2001 From: Chris Forster Date: Thu, 22 Aug 2019 15:39:16 -0700 Subject: [PATCH 2/3] Updating Wikipedia downloader to require less memory --- .../LanguageModeling/BERT/data/WikiDownloader.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py index 0fc192691..0f8d26ccf 100644 --- a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py +++ b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py @@ -3,6 +3,7 @@ import bz2 import os import urllib.request +import subprocess import sys class WikiDownloader: @@ -39,20 +40,9 @@ def download(self): # Always unzipping since this is relatively fast and will overwrite print('Unzipping:', self.output_files[self.language]) - #with open(self.save_path + '/' + file, mode='rb', buffering=131072) as f: - # it = iter(lambda: f.read(131072), b'') - # self.decompression(it, sys.stdout.buffer) - - zip = bz2.BZ2File(self.save_path + '/' + file) - open(self.save_path + '/wikicorpus_' + self.language + '.xml', mode='wb', buffering=131072).write(zip.read()) + subprocess.run('bzip2 -dk ' + self.save_path + '/' + file, shell=True, check=True) else: assert False, 'WikiDownloader not implemented for this language yet.' 
- def decompression(self, input, output): - decomp = bz2.BZ2Decompressor() - - for chunk in input: - dc = decomp.decompress(chunk) - output.write(dc) From 5f15123a1cbd1a9434c154b02039da60beee5091 Mon Sep 17 00:00:00 2001 From: Chris Forster Date: Wed, 28 Aug 2019 08:03:08 -0700 Subject: [PATCH 3/3] Renaming variable --- PyTorch/LanguageModeling/BERT/data/WikiDownloader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py index 0f8d26ccf..b80723f76 100644 --- a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py +++ b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py @@ -28,19 +28,19 @@ def __init__(self, language, save_path): def download(self): if self.language in self.download_urls: url = self.download_urls[self.language] - file = self.output_files[self.language] + filename = self.output_files[self.language] print('Downloading:', url) - if os.path.isfile(self.save_path + '/' + file): + if os.path.isfile(self.save_path + '/' + filename): print('** Download file already exists, skipping download') else: response = urllib.request.urlopen(url) - with open(self.save_path + '/' + file, "wb") as handle: + with open(self.save_path + '/' + filename, "wb") as handle: handle.write(response.read()) # Always unzipping since this is relatively fast and will overwrite print('Unzipping:', self.output_files[self.language]) - subprocess.run('bzip2 -dk ' + self.save_path + '/' + file, shell=True, check=True) + subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True) else: assert False, 'WikiDownloader not implemented for this language yet.'