Skip to content

Commit 3459f97

Browse files
[BERT/PyT][BERT/TF] Use mirror and wget (#833)
* Use mirror and wget
  - The mirror speeds up the download by 10x.
  - wget fixes the stall seen with urllib.
* Add comment
* Update the WikiDownloader of BERT TF1 to use the mirror and wget

Co-authored-by: Swetha Mandava <sweth.mandava@gmail.com>
1 parent 0f5ff94 commit 3459f97

File tree

2 files changed

+17
-11
lines changed

2 files changed

+17
-11
lines changed

PyTorch/LanguageModeling/BERT/data/WikiDownloader.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import urllib.request
1717
import subprocess
1818
import sys
19+
import subprocess
1920

2021
class WikiDownloader:
2122
def __init__(self, language, save_path):
@@ -25,9 +26,10 @@ def __init__(self, language, save_path):
2526
os.makedirs(self.save_path)
2627

2728
self.language = language
29+
# Use a mirror from https://dumps.wikimedia.org/mirrors.html if the links below do not work
2830
self.download_urls = {
29-
'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
30-
'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
31+
'en' : 'https://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
32+
'zh' : 'https://dumps.wikimedia.your.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
3133
}
3234

3335
self.output_files = {
@@ -45,13 +47,15 @@ def download(self):
4547
if os.path.isfile(self.save_path + '/' + filename):
4648
print('** Download file already exists, skipping download')
4749
else:
48-
response = urllib.request.urlopen(url)
49-
with open(self.save_path + '/' + filename, "wb") as handle:
50-
handle.write(response.read())
50+
cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename)]
51+
print('Running:', cmd)
52+
status = subprocess.run(cmd)
53+
if status.returncode != 0:
54+
raise RuntimeError('Wiki download not successful')
5155

5256
# Always unzipping since this is relatively fast and will overwrite
5357
print('Unzipping:', self.output_files[self.language])
5458
subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
5559

5660
else:
57-
assert False, 'WikiDownloader not implemented for this language yet.'
61+
assert False, 'WikiDownloader not implemented for this language yet.'

TensorFlow/LanguageModeling/BERT/data/WikiDownloader.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ def __init__(self, language, save_path):
2626

2727
self.language = language
2828
self.download_urls = {
29-
'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
30-
'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
29+
'en' : 'https://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
30+
'zh' : 'https://dumps.wikimedia.your.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
3131
}
3232

3333
self.output_files = {
@@ -45,9 +45,11 @@ def download(self):
4545
if os.path.isfile(self.save_path + '/' + filename):
4646
print('** Download file already exists, skipping download')
4747
else:
48-
response = urllib.request.urlopen(url)
49-
with open(self.save_path + '/' + filename, "wb") as handle:
50-
handle.write(response.read())
48+
cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename)]
49+
print('Running:', cmd)
50+
status = subprocess.run(cmd)
51+
if status.returncode != 0:
52+
raise RuntimeError('Wiki download not successful')
5153

5254
# Always unzipping since this is relatively fast and will overwrite
5355
print('Unzipping:', self.output_files[self.language])

0 commit comments

Comments
 (0)