Skip to content

Commit 52aa582

Browse files
committed
Deterministically zip directories
1 parent e500664 commit 52aa582

File tree

1 file changed

+29
-6
lines changed

1 file changed

+29
-6
lines changed

utils/zipfile_deterministic.py

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ class LargeZipFile(Exception):
3939
ZIP_DEFLATED = 8
4040
# Other ZIP compression methods not supported
4141

42+
# Fixed (year, month, day, hour, minute, second) tuple used as the default
# date_time by ZipInfo() and write(), instead of repeating the literal there.
DEFAULT_DATE = (1980, 1, 1, 0, 0, 0)
43+
4244
# Below are some formats and associated data for reading/writing headers using
4345
# the struct module. The names and structures of headers/records are those used
4446
# in the PKWARE description of the ZIP file format:
@@ -136,6 +138,15 @@ def normalize_unicode(filename):
136138
"""For dealing with different unicode normalizations in filenames."""
137139
return unicodedata.normalize('NFC', unicode(filename, 'utf-8')).encode('utf-8')
138140

141+
def standardize_filename(filename):
    """Return the OS-independent spelling of `filename`.

    Any occurrence of the platform directory separator is turned into a
    forward slash, and the result is passed through normalize_unicode().
    """
    separator = os.sep
    # The ZIP format specification requires "/" as the directory separator
    # in stored paths, so translate only when the platform uses another one.
    if separator != "/" and separator in filename:
        filename = "/".join(filename.split(separator))
    return normalize_unicode(filename)
149+
139150
def is_zipfile(filename):
140151
"""Quickly see if file is a ZIP file by checking the magic number."""
141152
try:
@@ -264,7 +275,7 @@ class ZipInfo (object):
264275
'_raw_time',
265276
)
266277

267-
def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
278+
def __init__(self, filename="NoName", date_time=DEFAULT_DATE):
268279
self.orig_filename = filename # Original file name in archive
269280

270281
# Terminate the file name at the first null byte. Null bytes in file
@@ -275,10 +286,7 @@ def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
275286
# This is used to ensure paths in generated ZIP files always use
276287
# forward slashes as the directory separator, as required by the
277288
# ZIP format specification.
278-
if os.sep != "/" and os.sep in filename:
279-
filename = filename.replace(os.sep, "/")
280-
281-
self.filename = normalize_unicode(filename) # Normalized file name
289+
self.filename = standardize_filename(filename) # Normalized file name
282290
self.date_time = date_time # year, month, day, hour, min, sec
283291
# Standard values:
284292
self.compress_type = ZIP_STORED # Type of compression for the file
@@ -1005,7 +1013,7 @@ def _writecheck(self, zinfo):
10051013
if not self._allowZip64:
10061014
raise LargeZipFile("Zipfile size would require ZIP64 extensions")
10071015

1008-
def write(self, filename, arcname=None, compress_type=None, date_time=(1980,1,1,0,0,0)):
1016+
def write(self, filename, arcname=None, compress_type=None, date_time=DEFAULT_DATE):
10091017
"""Put the bytes from filename into the archive under the name
10101018
arcname."""
10111019
if not self.fp:
@@ -1124,6 +1132,21 @@ def writestr(self, zinfo_or_arcname, bytes):
11241132
self.filelist.append(zinfo)
11251133
self.NameToInfo[zinfo.filename] = zinfo
11261134

1135+
def write_from_directory(self, directory, exclusions=None,
                         compress_type=None, date_time=DEFAULT_DATE):
    """Create a ZIP package deterministically from a directory.

    Every file found by walking `directory` is collected first and then
    written in the sorted order of its standardized name, so the order of
    archive members does not depend on the order in which the OS lists
    directory entries.

    directory     -- root of the tree to walk with os.walk().
    exclusions    -- optional container of paths to skip. Entries are
                     compared against os.path.join(root, name) exactly as
                     produced by the walk. None (the default) skips nothing.
    compress_type -- forwarded unchanged to write() for every file.
    date_time     -- forwarded unchanged to write() for every file.
    """
    # The default is None, which cannot be used with the `in` operator;
    # substitute an empty container so a default call excludes nothing.
    if exclusions is None:
        exclusions = ()

    # Map standardized name -> on-disk path. Keying on the standardized
    # name makes the sort below independent of the platform separator.
    file_dict = {}
    for root, _subfolders, files in os.walk(directory):
        for fi in files:
            filename = os.path.join(root, fi)
            if filename not in exclusions:
                file_dict[standardize_filename(filename)] = filename

    # Keys are unique, so sorting the items orders purely by standardized name.
    for _new_filename, old_filename in sorted(file_dict.items()):
        self.write(old_filename, compress_type=compress_type,
                   date_time=date_time)
1149+
11271150
def __del__(self):
    """Invoke close() at finalization, for callers that never closed explicitly."""
    self.close()

0 commit comments

Comments
 (0)