Skip to content

Commit e72ea69

Browse files
nvcforster authored and szmigacz committed
BERT-PyT subprocess for bzip in wikidownloader (NVIDIA#180)
* Removing unnecessary subprocess.communicate calls
* Updating Bookscorpus downloader to require less memory
* Renaming variable
1 parent 3d3ff3e commit e72ea69

2 files changed

Lines changed: 5 additions & 17 deletions

File tree

PyTorch/LanguageModeling/BERT/data/WikiDownloader.py

Lines changed: 5 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import bz2
44
import os
55
import urllib.request
6+
import subprocess
67
import sys
78

89
class WikiDownloader:
@@ -27,32 +28,21 @@ def __init__(self, language, save_path):
2728
def download(self):
2829
if self.language in self.download_urls:
2930
url = self.download_urls[self.language]
30-
file = self.output_files[self.language]
31+
filename = self.output_files[self.language]
3132

3233
print('Downloading:', url)
33-
if os.path.isfile(self.save_path + '/' + file):
34+
if os.path.isfile(self.save_path + '/' + filename):
3435
print('** Download file already exists, skipping download')
3536
else:
3637
response = urllib.request.urlopen(url)
37-
with open(self.save_path + '/' + file, "wb") as handle:
38+
with open(self.save_path + '/' + filename, "wb") as handle:
3839
handle.write(response.read())
3940

4041
# Always unzipping since this is relatively fast and will overwrite
4142
print('Unzipping:', self.output_files[self.language])
42-
#with open(self.save_path + '/' + file, mode='rb', buffering=131072) as f:
43-
# it = iter(lambda: f.read(131072), b'')
44-
# self.decompression(it, sys.stdout.buffer)
45-
46-
zip = bz2.BZ2File(self.save_path + '/' + file)
47-
open(self.save_path + '/wikicorpus_' + self.language + '.xml', mode='wb', buffering=131072).write(zip.read())
43+
subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
4844

4945
else:
5046
assert False, 'WikiDownloader not implemented for this language yet.'
5147

52-
def decompression(self, input, output):
53-
decomp = bz2.BZ2Decompressor()
54-
55-
for chunk in input:
56-
dc = decomp.decompress(chunk)
57-
output.write(dc)
5848

PyTorch/LanguageModeling/BERT/data/bertPrep.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,6 @@ def main(args):
7070
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
7171
print('WikiExtractor Command:', wikiextractor_command)
7272
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
73-
#wikiextractor_process.communicate()
7473

7574
wiki_path = working_dir + '/' + directory_structure['extracted'] + '/wikicorpus_en'
7675
output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
@@ -84,7 +83,6 @@ def main(args):
8483
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
8584
print('WikiExtractor Command:', wikiextractor_command)
8685
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
87-
#wikiextractor_process.communicate()
8886

8987
wiki_path = working_dir + '/' + directory_structure['extracted'] + '/wikicorpus_zh'
9088
output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'

0 commit comments

Comments
 (0)