Decompress the Wikipedia dump with the bzip2 CLI; drop stale communicate() calls.

WikiDownloader.download() now shells out to `bzip2 -dkf` instead of reading the
whole archive through bz2.BZ2File. -k keeps the downloaded .bz2 so a re-run can
skip the download, and -f lets a re-run overwrite the previously extracted file
(without -f bzip2 refuses to overwrite and exits non-zero, which check=True
turns into CalledProcessError). The now-unused decompression() helper is removed.

The communicate() calls (live in BooksDownloader, commented out in bertPrep) are
removed: subprocess.run() returns a CompletedProcess, which has no communicate().

diff --git a/PyTorch/LanguageModeling/BERT/data/BooksDownloader.py b/PyTorch/LanguageModeling/BERT/data/BooksDownloader.py
index 4b0f5340d..49b7b57f4 100644
--- a/PyTorch/LanguageModeling/BERT/data/BooksDownloader.py
+++ b/PyTorch/LanguageModeling/BERT/data/BooksDownloader.py
@@ -13,4 +13,3 @@ def download(self):
         bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus'
         bookscorpus_download_command += ' --trash-bad-count'
         bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True)
-        bookscorpus_download_process.communicate()
diff --git a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
index 0fc192691..b80723f76 100644
--- a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
+++ b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
@@ -3,6 +3,7 @@
 import bz2
 import os
 import urllib.request
+import subprocess
 import sys
 
 class WikiDownloader:
@@ -27,32 +28,21 @@ def __init__(self, language, save_path):
     def download(self):
         if self.language in self.download_urls:
             url = self.download_urls[self.language]
-            file = self.output_files[self.language]
+            filename = self.output_files[self.language]
 
             print('Downloading:', url)
-            if os.path.isfile(self.save_path + '/' + file):
+            if os.path.isfile(self.save_path + '/' + filename):
                 print('** Download file already exists, skipping download')
             else:
                 response = urllib.request.urlopen(url)
-                with open(self.save_path + '/' + file, "wb") as handle:
+                with open(self.save_path + '/' + filename, "wb") as handle:
                     handle.write(response.read())
 
             # Always unzipping since this is relatively fast and will overwrite
             print('Unzipping:', self.output_files[self.language])
-            #with open(self.save_path + '/' + file, mode='rb', buffering=131072) as f:
-            # it = iter(lambda: f.read(131072), b'')
-            # self.decompression(it, sys.stdout.buffer)
-
-            zip = bz2.BZ2File(self.save_path + '/' + file)
-            open(self.save_path + '/wikicorpus_' + self.language + '.xml', mode='wb', buffering=131072).write(zip.read())
+            subprocess.run('bzip2 -dkf ' + self.save_path + '/' + filename, shell=True, check=True)
 
         else:
             assert False, 'WikiDownloader not implemented for this language yet.'
 
-    def decompression(self, input, output):
-        decomp = bz2.BZ2Decompressor()
-
-        for chunk in input:
-            dc = decomp.decompress(chunk)
-            output.write(dc)
 
diff --git a/PyTorch/LanguageModeling/BERT/data/bertPrep.py b/PyTorch/LanguageModeling/BERT/data/bertPrep.py
index b83b26a78..f104914e5 100644
--- a/PyTorch/LanguageModeling/BERT/data/bertPrep.py
+++ b/PyTorch/LanguageModeling/BERT/data/bertPrep.py
@@ -70,7 +70,6 @@ def main(args):
                 wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
                 print('WikiExtractor Command:', wikiextractor_command)
                 wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
-                #wikiextractor_process.communicate()
 
             wiki_path = working_dir + '/' + directory_structure['extracted'] + '/wikicorpus_en'
             output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
@@ -84,7 +83,6 @@ def main(args):
                 wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
                 print('WikiExtractor Command:', wikiextractor_command)
                 wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
-                #wikiextractor_process.communicate()
 
             wiki_path = working_dir + '/' + directory_structure['extracted'] + '/wikicorpus_zh'
             output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'