[BERT/PyT][BERT/TF] Use mirror and wget (#833)

sharathts · swethmandava · web-flow · commit 3459f97d5884 · 2021-02-17T14:49:39.000-08:00
* Use mirror and wget

The mirror speeds up the download by 10x.
wget fixes the stall seen with urllib.

* Add comment

* Update WikiDownloader of BERT TF1 to use mirror and wget

Co-authored-by: Swetha Mandava <sweth.mandava@gmail.com>
diff --git a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
@@ -16,6 +16,7 @@
 import urllib.request
 import subprocess
 import sys
+import subprocess
 
 class WikiDownloader:
     def __init__(self, language, save_path):
@@ -25,9 +26,10 @@ def __init__(self, language, save_path):
             os.makedirs(self.save_path)
 
         self.language = language
+        # Use a mirror from https://dumps.wikimedia.org/mirrors.html if the below links do not work
         self.download_urls = {
-            'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
-            'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
+            'en' : 'https://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
+            'zh' : 'https://dumps.wikimedia.your.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
         }
 
         self.output_files = {
@@ -45,13 +47,15 @@ def download(self):
             if os.path.isfile(self.save_path + '/' + filename):
                 print('** Download file already exists, skipping download')
             else:
-                response = urllib.request.urlopen(url)
-                with open(self.save_path + '/' + filename, "wb") as handle:
-                    handle.write(response.read())
+                cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename)]
+                print('Running:', cmd)
+                status = subprocess.run(cmd)
+                if status.returncode != 0:
+                    raise RuntimeError('Wiki download not successful')
 
             # Always unzipping since this is relatively fast and will overwrite
             print('Unzipping:', self.output_files[self.language])
             subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
 
         else:
-            assert False, 'WikiDownloader not implemented for this language yet.'
+            assert False, 'WikiDownloader not implemented for this language yet.'
diff --git a/TensorFlow/LanguageModeling/BERT/data/WikiDownloader.py b/TensorFlow/LanguageModeling/BERT/data/WikiDownloader.py
@@ -26,8 +26,8 @@ def __init__(self, language, save_path):
 
         self.language = language
         self.download_urls = {
-            'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
-            'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
+            'en' : 'https://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
+            'zh' : 'https://dumps.wikimedia.your.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
         }
 
         self.output_files = {
@@ -45,9 +45,11 @@ def download(self):
             if os.path.isfile(self.save_path + '/' + filename):
                 print('** Download file already exists, skipping download')
             else:
-                response = urllib.request.urlopen(url)
-                with open(self.save_path + '/' + filename, "wb") as handle:
-                    handle.write(response.read())
+                cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename)]
+                print('Running:', cmd)
+                status = subprocess.run(cmd)
+                if status.returncode != 0:
+                    raise RuntimeError('Wiki download not successful')
 
             # Always unzipping since this is relatively fast and will overwrite
             print('Unzipping:', self.output_files[self.language])