Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion PyTorch/LanguageModeling/BERT/data/BooksDownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,3 @@ def download(self):
bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus'
bookscorpus_download_command += ' --trash-bad-count'
bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True)
bookscorpus_download_process.communicate()
20 changes: 5 additions & 15 deletions PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import bz2
import os
import urllib.request
import subprocess
import sys

class WikiDownloader:
Expand All @@ -27,32 +28,21 @@ def __init__(self, language, save_path):
def download(self):
if self.language in self.download_urls:
url = self.download_urls[self.language]
file = self.output_files[self.language]
filename = self.output_files[self.language]

print('Downloading:', url)
if os.path.isfile(self.save_path + '/' + file):
if os.path.isfile(self.save_path + '/' + filename):
print('** Download file already exists, skipping download')
else:
response = urllib.request.urlopen(url)
with open(self.save_path + '/' + file, "wb") as handle:
with open(self.save_path + '/' + filename, "wb") as handle:
handle.write(response.read())

# Always unzipping since this is relatively fast and will overwrite
print('Unzipping:', self.output_files[self.language])
#with open(self.save_path + '/' + file, mode='rb', buffering=131072) as f:
# it = iter(lambda: f.read(131072), b'')
# self.decompression(it, sys.stdout.buffer)

zip = bz2.BZ2File(self.save_path + '/' + file)
open(self.save_path + '/wikicorpus_' + self.language + '.xml', mode='wb', buffering=131072).write(zip.read())
subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)

else:
assert False, 'WikiDownloader not implemented for this language yet.'

def decompression(self, input, output):
decomp = bz2.BZ2Decompressor()

for chunk in input:
dc = decomp.decompress(chunk)
output.write(dc)

2 changes: 0 additions & 2 deletions PyTorch/LanguageModeling/BERT/data/bertPrep.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ def main(args):
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
#wikiextractor_process.communicate()

wiki_path = working_dir + '/' + directory_structure['extracted'] + '/wikicorpus_en'
output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
Expand All @@ -84,7 +83,6 @@ def main(args):
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
#wikiextractor_process.communicate()

wiki_path = working_dir + '/' + directory_structure['extracted'] + '/wikicorpus_zh'
output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'
Expand Down