From 617e06403d65b743ff7f2ffe83bd8cf9bb0accb8 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 18 Oct 2021 13:47:27 +0200 Subject: [PATCH 1/7] Check gzip headers for corrupted fields --- Lib/gzip.py | 53 +++++++++++++++---- Lib/test/test_gzip.py | 30 +++++++++++ .../2021-10-18-13-46-55.bpo-45509.Upwb60.rst | 1 + 3 files changed, 73 insertions(+), 11 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2021-10-18-13-46-55.bpo-45509.Upwb60.rst diff --git a/Lib/gzip.py b/Lib/gzip.py index 6773ea3eef0971..693ac425d8d5f3 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -426,29 +426,60 @@ def _read_gzip_header(fp): if magic != b'\037\213': raise BadGzipFile('Not a gzipped file (%r)' % magic) - - (method, flag, last_mtime) = struct.unpack(" Date: Wed, 24 Nov 2021 10:48:00 +0100 Subject: [PATCH 2/7] Minor performance tweaks to _read_gzip_header Call the bool method and cache the result for faster truth checking. Do not test for empty bytes but use "not magic" instead for faster truth checking. --- Lib/gzip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index 693ac425d8d5f3..132719a7c6959a 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -421,7 +421,7 @@ def _read_gzip_header(fp): Returns last mtime if header was present or None otherwise. ''' magic = fp.read(2) - if magic == b'': + if not magic: return None if magic != b'\037\213': @@ -432,7 +432,7 @@ def _read_gzip_header(fp): raise BadGzipFile('Unknown compression method') # FHCRC will be checked often. So save the result of the check. - fhcrc = flag & FHCRC + fhcrc = bool(flag & FHCRC) # Only create and append to a list of header parts when FHCRC is set. # In the most common use cases FHCRC is not set. So we optimize for those # cases. From e68e76ede3cad99c094531ccf0596ad70bf1c883 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 24 Nov 2021 11:42:40 +0100 Subject: [PATCH 3/7] Optimize _read_gzip_header for the most common code paths Those are: + Only FNAME set. (Created by gzip and python's GzipFile) + No flags set. (Created by gzip.compress and zlib.compress with wbits=31) --- Lib/gzip.py | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index 132719a7c6959a..171b8ba77c0d1a 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -431,48 +431,58 @@ def _read_gzip_header(fp): if method != 8: raise BadGzipFile('Unknown compression method') - # FHCRC will be checked often. So save the result of the check. - fhcrc = bool(flag & FHCRC) - # Only create and append to a list of header parts when FHCRC is set. - # In the most common use cases FHCRC is not set. So we optimize for those - # cases. - if fhcrc: - header_parts = [magic, base_header] + # No flags. No need for further parsing. These headers are returned by + # gzip.compress or zlib.compress(..., wbits=31) + if not flag: + return last_mtime + # Most gzip files will have only FNAME set. For example: produced by gzip + # command line application or python's GzipFile. + if flag == FNAME: + while True: + s = fp.read(1) + if not s: + raise EOFError("Compressed file ended before the " + "end-of-stream marker was reached") + if s == b'\000': + break + return last_mtime + + # Processing for more complex flags. + + # Save header parts for FHCRC checking + header_parts = [magic, base_header] if flag & FEXTRA: - # Read the extra field, if present, save the fields if FHCRC is set. + # Read the extra field, if present, save the fields for FHCRC checking. extra_len_bytes = _read_exact(fp, 2) extra_len, = struct.unpack(" Date: Fri, 10 Apr 2026 14:00:34 +0200 Subject: [PATCH 4/7] Update test_gzip.py Remove double import of zlib --- Lib/test/test_gzip.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/test/test_gzip.py b/Lib/test/test_gzip.py index 39249a0cd82d8c..b3b7c8f87e4f9f 100644 --- a/Lib/test/test_gzip.py +++ b/Lib/test/test_gzip.py @@ -9,7 +9,6 @@ import struct import sys import unittest -import zlib from subprocess import PIPE, Popen from test.support import catch_unraisable_exception from test.support import force_not_colorized_test_class, import_helper From 4d13ece56159cab4a96921e025e98e8a43b876e2 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 12 May 2026 16:14:00 +0200 Subject: [PATCH 5/7] Factor out reading until null and use a bytearray --- Lib/gzip.py | 51 +++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index e96b95f567e590..0f34aedddfb1e5 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -484,6 +484,19 @@ def _read_exact(fp, n): return data +def _read_until_null(fp): + '''Read until the first encountered null byte in fp''' + result = io.BytesIO() + while True: + s = fp.read(1) + if not s: + raise EOFError("Compressed file ended before the " + "end-of-stream marker was reached") + if s == b'\000': + break + return result.getvalue() + + def _read_gzip_header(fp): '''Read a gzip header from `fp` and progress to the end of the header. @@ -507,54 +520,28 @@ def _read_gzip_header(fp): # Most gzip files will have only FNAME set. For example: produced by gzip # command line application or python's GzipFile. if flag == FNAME: - while True: - s = fp.read(1) - if not s: - raise EOFError("Compressed file ended before the " - "end-of-stream marker was reached") - if s == b'\000': - break + _read_until_null(fp) return last_mtime # Processing for more complex flags. - # Save header parts for FHCRC checking - header_parts = [magic, base_header] + header = bytearray(magic + base_header) if flag & FEXTRA: # Read the extra field, if present, save the fields for FHCRC checking. extra_len_bytes = _read_exact(fp, 2) extra_len, = struct.unpack(" Date: Tue, 12 May 2026 16:20:20 +0200 Subject: [PATCH 6/7] Simplify comments. Better grouping of statements --- Lib/gzip.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index 0f34aedddfb1e5..e80cbd9300f083 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -513,34 +513,28 @@ def _read_gzip_header(fp): if method != 8: raise BadGzipFile('Unknown compression method') - # No flags. No need for further parsing. These headers are returned by - # gzip.compress or zlib.compress(..., wbits=31) + # Most common cases are no flags (gzip.compress, zlib.compress) or only + # FNAME set (GzipFile, gzip command line application). Exit early + # in those cases. if not flag: return last_mtime - # Most gzip files will have only FNAME set. For example: produced by gzip - # command line application or python's GzipFile. if flag == FNAME: _read_until_null(fp) return last_mtime - # Processing for more complex flags. - # Save header parts for FHCRC checking + # Processing for more complex flags. Save header parts for FHCRC checking. header = bytearray(magic + base_header) - if flag & FEXTRA: - # Read the extra field, if present, save the fields for FHCRC checking. extra_len_bytes = _read_exact(fp, 2) extra_len, = struct.unpack(" Date: Tue, 12 May 2026 16:33:17 +0200 Subject: [PATCH 7/7] Fix error in method --- Lib/gzip.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/gzip.py b/Lib/gzip.py index e80cbd9300f083..d02f62d42346eb 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -492,6 +492,7 @@ def _read_until_null(fp): if not s: raise EOFError("Compressed file ended before the " "end-of-stream marker was reached") + result.write(s) if s == b'\000': break return result.getvalue()