From 8474c12e84b223470fd6ff3b234e74a40788379f Mon Sep 17 00:00:00 2001
From: Chris Forster <cforster@nvidia.com>
Date: Thu, 22 Aug 2019 15:25:06 -0700
Subject: [PATCH 1/3] Removing unnecessary subprocess.communicate calls

---
 PyTorch/LanguageModeling/BERT/data/BooksDownloader.py | 1 -
 PyTorch/LanguageModeling/BERT/data/bertPrep.py        | 2 --
 2 files changed, 3 deletions(-)

diff --git a/PyTorch/LanguageModeling/BERT/data/BooksDownloader.py b/PyTorch/LanguageModeling/BERT/data/BooksDownloader.py
index 4b0f5340d..49b7b57f4 100644
--- a/PyTorch/LanguageModeling/BERT/data/BooksDownloader.py
+++ b/PyTorch/LanguageModeling/BERT/data/BooksDownloader.py
@@ -13,4 +13,3 @@ def download(self):
         bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus'
         bookscorpus_download_command += ' --trash-bad-count'
         bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True)
-        bookscorpus_download_process.communicate()
diff --git a/PyTorch/LanguageModeling/BERT/data/bertPrep.py b/PyTorch/LanguageModeling/BERT/data/bertPrep.py
index b83b26a78..f104914e5 100644
--- a/PyTorch/LanguageModeling/BERT/data/bertPrep.py
+++ b/PyTorch/LanguageModeling/BERT/data/bertPrep.py
@@ -70,7 +70,6 @@ def main(args):
                 wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
                 print('WikiExtractor Command:', wikiextractor_command)
                 wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
-                #wikiextractor_process.communicate()
 
             wiki_path = working_dir + '/' + directory_structure['extracted'] + '/wikicorpus_en'
             output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
@@ -84,7 +83,6 @@ def main(args):
                 wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
                 print('WikiExtractor Command:', wikiextractor_command)
                 wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
-                #wikiextractor_process.communicate()
 
             wiki_path = working_dir + '/' + directory_structure['extracted'] + '/wikicorpus_zh'
             output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'

From 7df14d232c901c7a5717147b99c5c42fb45845d2 Mon Sep 17 00:00:00 2001
From: Chris Forster <cforster@nvidia.com>
Date: Thu, 22 Aug 2019 15:39:16 -0700
Subject: [PATCH 2/3] Updating Wikipedia downloader to require less memory

---
 .../LanguageModeling/BERT/data/WikiDownloader.py   | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
index 0fc192691..0f8d26ccf 100644
--- a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
+++ b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
@@ -3,6 +3,7 @@
 import bz2
 import os
 import urllib.request
+import subprocess
 import sys
 
 class WikiDownloader:
@@ -39,20 +40,9 @@ def download(self):
 
             # Always unzipping since this is relatively fast and will overwrite
             print('Unzipping:', self.output_files[self.language])
-            #with open(self.save_path + '/' + file, mode='rb', buffering=131072) as f:
-            #    it = iter(lambda: f.read(131072), b'')
-            #    self.decompression(it, sys.stdout.buffer)
-
-            zip = bz2.BZ2File(self.save_path + '/' + file)
-            open(self.save_path + '/wikicorpus_' + self.language + '.xml', mode='wb', buffering=131072).write(zip.read())
+            subprocess.run('bzip2 -dk ' + self.save_path + '/' + file, shell=True, check=True)
 
         else:
             assert False, 'WikiDownloader not implemented for this language yet.'
 
-    def decompression(self, input, output):
-        decomp = bz2.BZ2Decompressor()
-
-        for chunk in input:
-            dc = decomp.decompress(chunk)
-            output.write(dc)
 

From 5f15123a1cbd1a9434c154b02039da60beee5091 Mon Sep 17 00:00:00 2001
From: Chris Forster <cforster@nvidia.com>
Date: Wed, 28 Aug 2019 08:03:08 -0700
Subject: [PATCH 3/3] Renaming variable

---
 PyTorch/LanguageModeling/BERT/data/WikiDownloader.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
index 0f8d26ccf..b80723f76 100644
--- a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
+++ b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
@@ -28,19 +28,19 @@ def __init__(self, language, save_path):
     def download(self):
         if self.language in self.download_urls:
             url = self.download_urls[self.language]
-            file = self.output_files[self.language]
+            filename = self.output_files[self.language]
 
             print('Downloading:', url)
-            if os.path.isfile(self.save_path + '/' + file):
+            if os.path.isfile(self.save_path + '/' + filename):
                 print('** Download file already exists, skipping download')
             else:
                 response = urllib.request.urlopen(url)
-                with open(self.save_path + '/' + file, "wb") as handle:
+                with open(self.save_path + '/' + filename, "wb") as handle:
                     handle.write(response.read())
 
             # Always unzipping since this is relatively fast and will overwrite
             print('Unzipping:', self.output_files[self.language])
-            subprocess.run('bzip2 -dk ' + self.save_path + '/' + file, shell=True, check=True)
+            subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
 
         else:
             assert False, 'WikiDownloader not implemented for this language yet.'