BERT-PyT: use a bzip2 subprocess for decompression in WikiDownloader (NVIDIA#180)

nvcforster · szmigacz · commit e72ea6947be6 · 2019-08-29T07:21:53.000+02:00
* Removing unnecessary, commented-out wikiextractor_process.communicate() calls

* Updating Bookscorpus downloader to require less memory

* Renaming variable `file` to `filename` in WikiDownloader.download
diff --git a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
@@ -3,6 +3,7 @@
 import bz2
 import os
 import urllib.request
+import subprocess
 import sys
 
 class WikiDownloader:
@@ -27,32 +28,21 @@ def __init__(self, language, save_path):
     def download(self):
         if self.language in self.download_urls:
             url = self.download_urls[self.language]
-            file = self.output_files[self.language]
+            filename = self.output_files[self.language]
 
             print('Downloading:', url)
-            if os.path.isfile(self.save_path + '/' + file):
+            if os.path.isfile(self.save_path + '/' + filename):
                 print('** Download file already exists, skipping download')
             else:
                 response = urllib.request.urlopen(url)
-                with open(self.save_path + '/' + file, "wb") as handle:
+                with open(self.save_path + '/' + filename, "wb") as handle:
                     handle.write(response.read())
 
             # Always unzipping since this is relatively fast and will overwrite
             print('Unzipping:', self.output_files[self.language])
-            #with open(self.save_path + '/' + file, mode='rb', buffering=131072) as f:
-            #    it = iter(lambda: f.read(131072), b'')
-            #    self.decompression(it, sys.stdout.buffer)
-
-            zip = bz2.BZ2File(self.save_path + '/' + file)
-            open(self.save_path + '/wikicorpus_' + self.language + '.xml', mode='wb', buffering=131072).write(zip.read())
+            subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
 
         else:
             assert False, 'WikiDownloader not implemented for this language yet.'
 
-    def decompression(self, input, output):
-        decomp = bz2.BZ2Decompressor()
-
-        for chunk in input:
-            dc = decomp.decompress(chunk)
-            output.write(dc)
 
diff --git a/PyTorch/LanguageModeling/BERT/data/bertPrep.py b/PyTorch/LanguageModeling/BERT/data/bertPrep.py
@@ -70,7 +70,6 @@ def main(args):
                 wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
                 print('WikiExtractor Command:', wikiextractor_command)
                 wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
-                #wikiextractor_process.communicate()
 
             wiki_path = working_dir + '/' + directory_structure['extracted'] + '/wikicorpus_en'
             output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
@@ -84,7 +83,6 @@ def main(args):
                 wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
                 print('WikiExtractor Command:', wikiextractor_command)
                 wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
-                #wikiextractor_process.communicate()
 
             wiki_path = working_dir + '/' + directory_structure['extracted'] + '/wikicorpus_zh'
             output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'