Skip to content

Commit 82d3513

Browse files
committed
Add zlib flush options when using zlib.compressobj. Fixes #2434
Supports zlib flush options: Z_NO_FLUSH, Z_SYNC_FLUSH, Z_FULL_FLUSH, which became possible as of Java 7. In addition, supports gzip header (fixed) and trailer (based on CRC32, size) when flushed. Also supports incremental read of gzip header when decompressing. Now fully passes all tests in urllib3 (see #2434 for those bugs) with the exception of those tests using coverage (requires fix for #1638) and fcntl (requires fix for #1943). In addition, the gzip module, with one minor patch to support Jython's io implementation, is updated to latest in CPython 2.x. test_gzip and test_zlib also use stock tests.
1 parent 3e734b5 commit 82d3513

7 files changed

Lines changed: 378 additions & 877 deletions

File tree

Lib/gzip.py

Lines changed: 67 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ def write32u(output, value):
2121
# or unsigned.
2222
output.write(struct.pack("<L", value))
2323

24+
def read32(input):
25+
return struct.unpack("<I", input.read(4))[0]
26+
2427
def open(filename, mode="rb", compresslevel=9):
2528
"""Shorthand for GzipFile(filename, mode, compresslevel).
2629
@@ -161,9 +164,16 @@ def _init_write(self, filename):
161164
def _write_gzip_header(self):
162165
self.fileobj.write('\037\213') # magic header
163166
self.fileobj.write('\010') # compression method
164-
fname = os.path.basename(self.name)
165-
if fname.endswith(".gz"):
166-
fname = fname[:-3]
167+
try:
168+
# RFC 1952 requires the FNAME field to be Latin-1. Do not
169+
# include filenames that cannot be represented that way.
170+
fname = os.path.basename(self.name)
171+
if not isinstance(fname, str):
172+
fname = fname.encode('latin-1')
173+
if fname.endswith('.gz'):
174+
fname = fname[:-3]
175+
except UnicodeEncodeError:
176+
fname = ''
167177
flags = 0
168178
if fname:
169179
flags = FNAME
@@ -181,28 +191,24 @@ def _init_read(self):
181191
self.crc = zlib.crc32("") & 0xffffffffL
182192
self.size = 0
183193

184-
def _read_exact(self, n):
185-
data = self.fileobj.read(n)
186-
while len(data) < n:
187-
b = self.fileobj.read(n - len(data))
188-
if not b:
189-
raise EOFError("Compressed file ended before the "
190-
"end-of-stream marker was reached")
191-
data += b
192-
return data
193-
194194
def _read_gzip_header(self):
195195
magic = self.fileobj.read(2)
196196
if magic != '\037\213':
197197
raise IOError, 'Not a gzipped file'
198-
199-
method, flag, self.mtime = struct.unpack("<BBIxx", self._read_exact(8))
198+
method = ord( self.fileobj.read(1) )
200199
if method != 8:
201200
raise IOError, 'Unknown compression method'
201+
flag = ord( self.fileobj.read(1) )
202+
self.mtime = read32(self.fileobj)
203+
# extraflag = self.fileobj.read(1)
204+
# os = self.fileobj.read(1)
205+
self.fileobj.read(2)
202206

203207
if flag & FEXTRA:
204208
# Read & discard the extra field, if present
205-
self._read_exact(struct.unpack("<H", self._read_exact(2)))
209+
xlen = ord(self.fileobj.read(1))
210+
xlen = xlen + 256*ord(self.fileobj.read(1))
211+
self.fileobj.read(xlen)
206212
if flag & FNAME:
207213
# Read and discard a null-terminated string containing the filename
208214
while True:
@@ -216,7 +222,7 @@ def _read_gzip_header(self):
216222
if not s or s=='\000':
217223
break
218224
if flag & FHCRC:
219-
self._read_exact(2) # Read & discard the 16-bit header CRC
225+
self.fileobj.read(2) # Read & discard the 16-bit header CRC
220226

221227
def write(self,data):
222228
self._check_closed()
@@ -232,9 +238,9 @@ def write(self,data):
232238
data = data.tobytes()
233239

234240
if len(data) > 0:
235-
self.size = self.size + len(data)
241+
self.fileobj.write(self.compress.compress(data))
242+
self.size += len(data)
236243
self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
237-
self.fileobj.write( self.compress.compress(data) )
238244
self.offset += len(data)
239245

240246
return len(data)
@@ -250,16 +256,20 @@ def read(self, size=-1):
250256

251257
readsize = 1024
252258
if size < 0: # get the whole thing
253-
while self._read(readsize):
254-
readsize = min(self.max_read_chunk, readsize * 2)
255-
size = self.extrasize
259+
try:
260+
while True:
261+
self._read(readsize)
262+
readsize = min(self.max_read_chunk, readsize * 2)
263+
except EOFError:
264+
size = self.extrasize
256265
else: # just get some more of it
257-
while size > self.extrasize:
258-
if not self._read(readsize):
259-
if size > self.extrasize:
260-
size = self.extrasize
261-
break
262-
readsize = min(self.max_read_chunk, readsize * 2)
266+
try:
267+
while size > self.extrasize:
268+
self._read(readsize)
269+
readsize = min(self.max_read_chunk, readsize * 2)
270+
except EOFError:
271+
if size > self.extrasize:
272+
size = self.extrasize
263273

264274
offset = self.offset - self.extrastart
265275
chunk = self.extrabuf[offset: offset + size]
@@ -274,7 +284,7 @@ def _unread(self, buf):
274284

275285
def _read(self, size=1024):
276286
if self.fileobj is None:
277-
return False
287+
raise EOFError, "Reached EOF"
278288

279289
if self._new_member:
280290
# If the _new_member flag is set, we have to
@@ -285,7 +295,7 @@ def _read(self, size=1024):
285295
pos = self.fileobj.tell() # Save current position
286296
self.fileobj.seek(0, 2) # Seek to end of file
287297
if pos == self.fileobj.tell():
288-
return False
298+
raise EOFError, "Reached EOF"
289299
else:
290300
self.fileobj.seek( pos ) # Return to original position
291301

@@ -302,10 +312,9 @@ def _read(self, size=1024):
302312

303313
if buf == "":
304314
uncompress = self.decompress.flush()
305-
self.fileobj.seek(-len(self.decompress.unused_data), 1)
306315
self._read_eof()
307316
self._add_read_data( uncompress )
308-
return False
317+
raise EOFError, 'Reached EOF'
309318

310319
uncompress = self.decompress.decompress(buf)
311320
self._add_read_data( uncompress )
@@ -315,14 +324,13 @@ def _read(self, size=1024):
315324
# so seek back to the start of the unused data, finish up
316325
# this member, and read a new gzip header.
317326
# (The number of bytes to seek back is the length of the unused
318-
# data)
319-
self.fileobj.seek(-len(self.decompress.unused_data), 1)
327+
# data, minus 8 because _read_eof() will rewind a further 8 bytes)
328+
self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
320329

321330
# Check the CRC and file size, and set the flag so we read
322331
# a new member on the next call
323332
self._read_eof()
324333
self._new_member = True
325-
return True
326334

327335
def _add_read_data(self, data):
328336
self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
@@ -333,11 +341,14 @@ def _add_read_data(self, data):
333341
self.size = self.size + len(data)
334342

335343
def _read_eof(self):
336-
# We've read to the end of the file.
344+
# We've read to the end of the file, so we have to rewind in order
345+
# to reread the 8 bytes containing the CRC and the file size.
337346
# We check that the computed CRC and size of the
338347
# uncompressed data matches the stored values. Note that the size
339348
# stored is the true file size mod 2**32.
340-
crc32, isize = struct.unpack("<II", self._read_exact(8))
349+
self.fileobj.seek(-8, 1)
350+
crc32 = read32(self.fileobj)
351+
isize = read32(self.fileobj) # may exceed 2GB
341352
if crc32 != self.crc:
342353
raise IOError("CRC check failed %s != %s" % (hex(crc32),
343354
hex(self.crc)))
@@ -358,19 +369,21 @@ def closed(self):
358369
return self.fileobj is None
359370

360371
def close(self):
361-
if self.fileobj is None:
372+
fileobj = self.fileobj
373+
if fileobj is None:
362374
return
363-
if self.mode == WRITE:
364-
self.fileobj.write(self.compress.flush())
365-
write32u(self.fileobj, self.crc)
366-
# self.size may exceed 2GB, or even 4GB
367-
write32u(self.fileobj, self.size & 0xffffffffL)
368-
self.fileobj = None
369-
elif self.mode == READ:
370-
self.fileobj = None
371-
if self.myfileobj:
372-
self.myfileobj.close()
373-
self.myfileobj = None
375+
self.fileobj = None
376+
try:
377+
if self.mode == WRITE:
378+
fileobj.write(self.compress.flush())
379+
write32u(fileobj, self.crc)
380+
# self.size may exceed 2GB, or even 4GB
381+
write32u(fileobj, self.size & 0xffffffffL)
382+
finally:
383+
myfileobj = self.myfileobj
384+
if myfileobj:
385+
self.myfileobj = None
386+
myfileobj.close()
374387

375388
def __enter__(self):
376389
# __enter__ is defined in _jyio._IOBase (aka
@@ -381,20 +394,11 @@ def __enter__(self):
381394
self._check_closed()
382395
return self
383396

384-
__iter__ = __enter__
385-
386-
if not sys.platform.startswith('java'):
387-
def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
388-
self._check_closed()
389-
if self.mode == WRITE:
390-
# Ensure the compressor's buffer is flushed
391-
self.fileobj.write(self.compress.flush(zlib_mode))
392-
self.fileobj.flush()
393-
else:
394-
# Java lacks Z_SYNC_FLUSH; thus Jython can't flush the
395-
# compressobj until EOF
396-
def flush(self,zlib_mode=None):
397-
self._check_closed()
397+
def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
398+
self._check_closed()
399+
if self.mode == WRITE:
400+
# Ensure the compressor's buffer is flushed
401+
self.fileobj.write(self.compress.flush(zlib_mode))
398402
self.fileobj.flush()
399403

400404
def fileno(self):

Lib/test/test_support.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,8 @@ def make_jar_classloader(jar):
485485
if is_jython:
486486
# Jython disallows @ in module names
487487
TESTFN = '$test'
488+
TESTFN_UNICODE = "$test-\xe0\xf2"
489+
TESTFN_ENCODING = sys.getfilesystemencoding()
488490
elif os.name == 'riscos':
489491
TESTFN = 'testfile'
490492
else:

0 commit comments

Comments
 (0)