From 4dab156f14b8000c750248684e1a736192ae8cc0 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 7 May 2026 12:03:36 -0700 Subject: [PATCH] Reject compression='jpeg' in to_geotiff (no JPEGTables emitted) The JPEG-in-TIFF write path tags compression=7 (new-style JPEG) but writes a self-contained JFIF stream per tile/strip and never emits the JPEGTables tag (347) that the TIFF spec requires for that codec. libtiff, GDAL, and rasterio all reject the file with "TIFFReadEncodedStrip() failed". The internal reader round-trips because Pillow decodes the standalone JFIF bytes directly, which hides the break from xrspatial's own tests. Fix: reject compression='jpeg' at the to_geotiff entry point with a ValueError that points at deflate/zstd/lzw. The lower-level _writer.write is untouched so the existing self-decoding tests still demonstrate the codec works internally; re-enabling the public path needs a JPEGTables-aware encoder. --- .claude/sweep-accuracy-state.csv | 2 +- xrspatial/geotiff/__init__.py | 17 +++++++++++++++++ xrspatial/geotiff/tests/test_jpeg.py | 19 ++++++++++++------- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv index 2f0580cf..04e95240 100644 --- a/.claude/sweep-accuracy-state.csv +++ b/.claude/sweep-accuracy-state.csv @@ -37,4 +37,4 @@ worley,2026-05-01,,MEDIUM,2;5,"MEDIUM: numpy backend uses np.empty_like(data) so zonal,2026-03-30T12:00:00Z,1090,,, geotiff,2026-05-06,pending-pr-cache-stale,MEDIUM,5,"Pass 5 (2026-05-06): MEDIUM fixed in fix-mmap-cache-stale-inode -- _MmapCache returned stale bytes after the file at the same path was replaced (e.g. by to_geotiff which writes via tempfile + os.replace, swapping the inode). Reproduces on numpy and GPU read paths: write A, read A, write B at same path, read returns A. Fixed by storing (st_ino, st_size, st_mtime_ns) per cache entry and dropping the entry on acquire when the file ident has changed; in-use mmaps stay valid for current holders. Tests added for both atomic-rename replacement and in-place truncate-and-rewrite. | Pass 4 (2026-05-06) PR #1501: sparse COG tiles (TileByteCounts==0). | Pass 3 (2026-05-06) PR #1500: predictor=3 byte-order. | Pass 2 (2026-05-05) PR #1498: predictor=2 sample-wise. | Pass 1 (2026-04-23) PR #1247. Re-confirmed clean: south-up flipping is a documented limitation in test_georef_edges.test_y_coords_known_limitation; CRS round-trip 4326/3857/32633/5070 exact; LERC int default lossless; NaN bit patterns survive predictor=3 round-trip; RATIONAL den=0 returns 0.0; ASCII tag NUL termination spec-correct on write; adler32 in GPU deflate path is per-tile not per-chunk so chunk-combine math does not apply; GeoDoubleParams uses correct double-array offset semantics. Deferred: per-band different NODATA via GDAL_METADATA exposed in attrs but not applied (single dataset-level value used for masking)." geotiff,2026-05-07,1507,MEDIUM,1;5,"Pass 6 (2026-05-07): MEDIUM fixed in PR #1507 - predictor=2 decode crashed with numba TypingError ('Unsupported array dtype: >u2/u4/u8') on big-endian TIFFs with multi-byte integer samples. Regression introduced by the pass-2 rework (#1498) that switched predictor=2 to a sample-wise numpy view in the file's byte order; numba nopython mode rejects non-native dtype arrays. Fix byte-swaps the buffer in place around the kernel call so the kernel sees native order and the on-disk view stays in file order. Tests cover uint16/int16/uint32/int32 BE round-trip plus LE sanity. uint8 BE+pred2 unaffected (single-byte path). Also noted but NOT fixed in this PR (one fix per PR rule): GPU read of BE multi-byte TIFFs crashes with AttributeError on out.byteswap() at _gpu_decode.py:1817 and 1556 because cupy.ndarray has no byteswap method; user-visible effect is a silent fallback to CPU which works for non-pred2 BE files. Re-confirmed clean: empty-array round-trip raises clearly (0x0, 0xN, 1x1 boundary), inline tag values <4 bytes positioned correctly, BigTIFF count overflow, IFD chain loop, RATIONAL den=0, NextIFDOffset > file size, sparse tiles, mmap cache invalidation post-#1506, BE without predictor reads correctly, 12-bit unpack, redirect handling, fp_predictor BE swizzle (#1500), nodata-out-of-dtype handled by string tag, HTTP short-read caught by size check at _reader.py:440, concurrent fetch propagates first error via ThreadPoolExecutor exit. | Pass 5 (2026-05-06) PR #1506: mmap cache stale after file replace. | Pass 4 (2026-05-06) PR #1501: sparse COG tiles. | Pass 3 (2026-05-06) PR #1500: predictor=3 byte order. | Pass 2 (2026-05-05) PR #1498: predictor=2 sample-wise. | Pass 1 (2026-04-23) PR #1247." -geotiff,2026-05-07,pending-pr-cache-refcount,HIGH,5,"Pass 7 (2026-05-07): HIGH fixed in fix-mmap-cache-refcount-after-replace -- _MmapCache.release() looked up the cache entry by realpath, so a holder that acquired the OLD mmap before an os.replace and released it AFTER another caller had acquired the post-replace entry would decrement the new holder's refcount. Subsequent eviction (cache full, or another acquire) closed the still-in-use mmap, breaking reads with 'mmap closed or invalid'. Real exposure: any concurrent reader/writer pattern where to_geotiff replaces a file that another reader had just opened via open_geotiff with chunks= or via _FileSource. PR #1506 added stale-replacement detection but did not fix the refcount confusion across the pop. Fix: acquire returns an opaque entry token; release takes the token and decrements that exact entry, regardless of cache state. Orphaned (popped) entries close their fh+mmap when their own refcount hits zero. _FileSource updated to pass the token. Regression test test_release_after_path_replacement_does_not_clobber_new_holder added. All 665 geotiff tests pass; GPU path verified. | Pass 6 (2026-05-07) PR #1507: BE pred2 numba TypingError. | Pass 5 (2026-05-06) PR #1506: mmap cache stale after file replace. | Pass 4 (2026-05-06) PR #1501: sparse COG tiles. | Pass 3 (2026-05-06) PR #1500: predictor=3 byte order. | Pass 2 (2026-05-05) PR #1498: predictor=2 sample-wise. | Pass 1 (2026-04-23) PR #1247. Re-confirmed clean over passes 2-7: items 2 (writer always emits LE TIFFs - hardcoded b'II'), 3 (RowsPerStrip default = height when missing), 4 (StripByteCounts missing raises clear ValueError), 5 (TileWidth without TileLength caught by 'tw <= 0 or th <= 0' check at _reader.py:688), 9 (read determinism on compressed+tiled+multiband), 11 (predictor=2 with awkward sample stride round-trips), 18 (compression_level=99 raises ValueError 'out of range for deflate (valid: 1-9)'), 21 (concurrent writes serialize correctly via mkstemp+os.replace), 24 (uint16 dtype preserved on numpy backend, dask honors chunks param), 26 (chunks rounds correctly with remainder chunk for non-tile-aligned). Deferred: item 8 (BytesIO/file-like sources are not supported, source.lower() error) - documented as 'str' parameter, not a bug; item 19 (LERC max_z_error not user-exposed by to_geotiff) - missing feature, not a bug." +geotiff,2026-05-07,pending-pr-jpeg-disable,HIGH,5;3,"Pass 8 (2026-05-07): HIGH fixed in fix-jpeg-tiff-disable -- to_geotiff(compression='jpeg') wrote files that no external reader can decode. The writer tags compression=7 (new-style JPEG) but emits a self-contained JFIF stream per tile/strip and never writes the JPEGTables tag (347) that the TIFF spec requires for that codec. libtiff/GDAL/rasterio all reject the file with TIFFReadEncodedStrip() failed; our reader round-trips because Pillow decodes the standalone JFIF, hiding the break. Pass-4 notes flagged the read side of the same JPEGTables gap and deferred it; pass-8 covers the write side. Fix: reject compression='jpeg' at the to_geotiff entry with a clear ValueError pointing at deflate/zstd/lzw. The internal _writer.write is untouched so the existing self-decoding tests still cover the codec; re-enabling the public path needs a JPEGTables-aware encoder. PR diffs reviewed but not merged: #1512 (BytesIO source) and #1513 (LERC max_z_error) -- both look correct; #1512 file-like read path goes through read_all() once so the per-call BytesIOSource lock is theoretical, and #1513 forwards max_z_error through every overview/tile/strip/streaming path including _write_vrt_tiled and _compress_block. No regressions found in either open PR. Other surfaces audited clean: predictor=3 with float16 (writer auto-promotes to float32 on both eager and streaming paths, value-exact round-trip); planar=2 multi-tile read uses band_idx*tiles_per_band offset so no cross-contamination between planes; _header.py multi-byte tag parsing uses bo (byte_order) consistently; Pillow YCbCr-vs-tagged-RGB photometric mismatch becomes moot once JPEG is disabled. Deferred (LOW/MEDIUM, not filed): JPEG2000 writer accepts arbitrary dtype with no validation (rare codec, narrow risk); float16 dtype not in tiff_dtype_to_numpy decode map (writer never emits it - asymmetric but unreachable); Orientation tag (274) still ignored on read (pass-4 deferral). | Pass 7 (2026-05-07): HIGH fixed in fix-mmap-cache-refcount-after-replace -- _MmapCache.release() looked up the cache entry by realpath, so a holder that acquired the OLD mmap before an os.replace and released it AFTER another caller had acquired the post-replace entry would decrement the new holder's refcount. Subsequent eviction (cache full, or another acquire) closed the still-in-use mmap, breaking reads with 'mmap closed or invalid'. Real exposure: any concurrent reader/writer pattern where to_geotiff replaces a file that another reader had just opened via open_geotiff with chunks= or via _FileSource. PR #1506 added stale-replacement detection but did not fix the refcount confusion across the pop. Fix: acquire returns an opaque entry token; release takes the token and decrements that exact entry, regardless of cache state. Orphaned (popped) entries close their fh+mmap when their own refcount hits zero. _FileSource updated to pass the token. Regression test test_release_after_path_replacement_does_not_clobber_new_holder added. All 665 geotiff tests pass; GPU path verified. | Pass 6 (2026-05-07) PR #1507: BE pred2 numba TypingError. | Pass 5 (2026-05-06) PR #1506: mmap cache stale after file replace. | Pass 4 (2026-05-06) PR #1501: sparse COG tiles. | Pass 3 (2026-05-06) PR #1500: predictor=3 byte order. | Pass 2 (2026-05-05) PR #1498: predictor=2 sample-wise. | Pass 1 (2026-04-23) PR #1247. Re-confirmed clean over passes 2-7: items 2 (writer always emits LE TIFFs - hardcoded b'II'), 3 (RowsPerStrip default = height when missing), 4 (StripByteCounts missing raises clear ValueError), 5 (TileWidth without TileLength caught by 'tw <= 0 or th <= 0' check at _reader.py:688), 9 (read determinism on compressed+tiled+multiband), 11 (predictor=2 with awkward sample stride round-trips), 18 (compression_level=99 raises ValueError 'out of range for deflate (valid: 1-9)'), 21 (concurrent writes serialize correctly via mkstemp+os.replace), 24 (uint16 dtype preserved on numpy backend, dask honors chunks param), 26 (chunks rounds correctly with remainder chunk for non-tile-aligned). Deferred: item 8 (BytesIO/file-like sources are not supported, source.lower() error) - documented as 'str' parameter, not a bug; item 19 (LERC max_z_error not user-exposed by to_geotiff) - missing feature, not a bug." diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 3fad38e6..14c6e2ec 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -667,6 +667,23 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *, raise ValueError( f"Unknown compression {compression!r}. " f"Valid options: {list(_VALID_COMPRESSIONS)}.") + # JPEG-in-TIFF (compression=7) requires the JPEGTables tag (347) + # carrying the abbreviated quantization/Huffman tables. The + # current encoder writes a self-contained JFIF stream per + # tile/strip and omits JPEGTables, which makes the resulting + # files unreadable by libtiff / GDAL / rasterio: they reject the + # tile data with "TIFFReadEncodedStrip() failed". The internal + # reader round-trips because Pillow re-decodes the JFIF stream + # directly, masking the interop break. Refuse the write rather + # than emit files no other tool can decode. See issue tracking + # the proper JPEGTables fix for re-enabling this codec. + if compression.lower() == 'jpeg': + raise ValueError( + "compression='jpeg' is not supported: the encoder writes " + "self-contained JFIF streams without the required " + "JPEGTables tag (347), so other readers (libtiff, GDAL, " + "rasterio) reject the file. Use 'deflate', 'zstd', or " + "'lzw' instead.") # tile_size only applies to tiled output; warn if the caller passed a # non-default size alongside strip mode (it would otherwise be silently diff --git a/xrspatial/geotiff/tests/test_jpeg.py b/xrspatial/geotiff/tests/test_jpeg.py index 8edc89aa..b6a3f08d 100644 --- a/xrspatial/geotiff/tests/test_jpeg.py +++ b/xrspatial/geotiff/tests/test_jpeg.py @@ -133,8 +133,16 @@ def test_4band_rejected(self, tmp_path): class TestWriteGeotiffJpeg: """Test the public to_geotiff API with compression='jpeg'.""" - def test_to_geotiff_jpeg(self, tmp_path): - from xrspatial.geotiff import open_geotiff, to_geotiff + def test_to_geotiff_jpeg_rejected(self, tmp_path): + """to_geotiff refuses compression='jpeg'. + + The encoder writes self-contained JFIF streams without the + TIFF-required JPEGTables tag (347), so libtiff / GDAL / rasterio + cannot decode the file. Round-tripping internally via Pillow + would mask the interop break, so the public writer rejects the + codec until a JPEGTables-aware encoder is in place. + """ + from xrspatial.geotiff import to_geotiff rng = np.random.RandomState(1050) data = rng.randint(50, 200, (32, 32), dtype=np.uint8) @@ -144,8 +152,5 @@ def test_to_geotiff_jpeg(self, tmp_path): 'x': np.arange(32, dtype=float)}, ) path = str(tmp_path / 'api_1050.tif') - to_geotiff(da, path, compression='jpeg', tile_size=16) - - result = open_geotiff(path) - assert result.shape == (32, 32) - assert np.abs(result.values.astype(int) - data.astype(int)).mean() < 10 + with pytest.raises(ValueError, match="JPEGTables"): + to_geotiff(da, path, compression='jpeg', tile_size=16)