33import bz2
44import os
55import urllib .request
6+ import subprocess
67import sys
78
89class WikiDownloader :
@@ -27,32 +28,21 @@ def __init__(self, language, save_path):
def download(self):
    """Fetch the dump archive for ``self.language`` and decompress it.

    The archive URL is looked up in ``self.download_urls`` and the local
    archive name in ``self.output_files``; the archive is stored inside
    ``self.save_path``. An archive that is already present is not
    downloaded again, but it is always decompressed again.

    Raises:
        AssertionError: ``self.language`` has no entry in
            ``self.download_urls``.
        urllib.error.URLError: the download could not be performed.
        FileNotFoundError: the ``bzip2`` executable is not available.
        subprocess.CalledProcessError: ``bzip2`` exited with an error.
    """
    if self.language not in self.download_urls:
        # Raised explicitly (instead of ``assert False``) so the check is
        # not stripped when Python runs with -O; the exception type and
        # message are unchanged.
        raise AssertionError('WikiDownloader not implemented for this language yet.')

    url = self.download_urls[self.language]
    filename = self.output_files[self.language]
    archive_path = os.path.join(self.save_path, filename)

    print('Downloading:', url)
    if os.path.isfile(archive_path):
        print('** Download file already exists, skipping download')
    else:
        # Stream to a temporary name and rename only on success, so an
        # interrupted transfer is never mistaken for a complete archive
        # by the "already exists" check on the next run.
        partial_path = archive_path + '.part'
        with urllib.request.urlopen(url) as response, \
                open(partial_path, 'wb') as handle:
            # Copy in 1 MiB chunks instead of holding the whole dump in
            # memory.
            for chunk in iter(lambda: response.read(1 << 20), b''):
                handle.write(chunk)
        os.replace(partial_path, archive_path)

    # Always unzipping since this is relatively fast and will overwrite
    print('Unzipping:', filename)
    # -d decompress, -k keep the archive, -f overwrite an existing output
    # (without -f bzip2 refuses and exits non-zero on every re-run).
    # An argument list with no shell keeps paths containing spaces or
    # shell metacharacters intact; "--" guards names starting with "-".
    subprocess.run(['bzip2', '-dkf', '--', archive_path], check=True)
5147
def decompression(self, input, output):
    """Decompress bzip2 data from an iterable of chunks into ``output``.

    Args:
        input: iterable yielding ``bytes`` chunks of bzip2-compressed
            data; chunk boundaries may fall anywhere in the data.
        output: object with a ``write(bytes)`` method that receives the
            decompressed data.

    Several bzip2 streams concatenated back to back are decoded one after
    another. A single ``BZ2Decompressor`` handles exactly one stream and
    raises ``EOFError`` if fed more data after that stream has ended.
    """
    decomp = bz2.BZ2Decompressor()

    for chunk in input:
        while chunk:
            if decomp.eof:
                # Previous stream is finished; the remaining bytes begin
                # a new stream and need a fresh decompressor.
                decomp = bz2.BZ2Decompressor()
            output.write(decomp.decompress(chunk))
            # Bytes left over after an end-of-stream marker are exposed
            # as ``unused_data``; feed them to the next stream.
            chunk = decomp.unused_data if decomp.eof else b''
5848
0 commit comments