From 43b8f31f1aa6ac1e50dcdf622b44fddccfb34dec Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 7 May 2026 11:33:45 -0700 Subject: [PATCH 1/2] Accept binary file-like objects in to_geotiff and the readers Closes #1511. - New _BytesIOSource wraps any read+seek file-like; a lock around seek+read keeps thread-pool windowed reads race-free. - _open_source, read_to_array, _read_geo_info, and open_geotiff accept either a string or a file-like. - to_geotiff accepts any path with a write method; _write_bytes writes straight to the buffer for file-likes and keeps the temp-file + os.replace atomic write for string paths. - Reject cog=True for file-likes (deferred), gpu=True / chunks for file-like sources, and gate VRT branches on isinstance str so buffers can't accidentally hit the VRT code path. - Dask + file-like falls back to eager in-memory assembly since write_streaming patches IFD offsets in place on a temp path. Tests: xrspatial/geotiff/tests/test_bytesio_source.py covers round-trip, windowed read, COG/VRT rejection, and concurrent reads from one source. --- xrspatial/geotiff/__init__.py | 108 +++++++++--- xrspatial/geotiff/_reader.py | 81 ++++++++- xrspatial/geotiff/_writer.py | 17 +- .../geotiff/tests/test_bytesio_source.py | 157 ++++++++++++++++++ 4 files changed, 329 insertions(+), 34 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_bytesio_source.py diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 3fad38e6..1c6fa8a6 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -179,25 +179,49 @@ def _coords_to_transform(da: xr.DataArray) -> GeoTransform | None: ) -def _read_geo_info(source: str, *, overview_level: int | None = None): +def _read_geo_info(source, *, overview_level: int | None = None): """Read only the geographic metadata and image dimensions from a GeoTIFF. Returns (geo_info, height, width, dtype, n_bands) without reading pixel - data. Uses mmap for header-only access -- O(1) memory regardless of file - size. + data. Uses mmap for header-only access on string paths; for file-like + inputs it reads the bytes directly. O(1) memory regardless of file size + when a path is supplied. Parameters ---------- + source : str or binary file-like + Path or any object with ``read``/``seek``. overview_level : int or None Overview IFD index (0 = full resolution). """ from ._dtypes import tiff_dtype_to_numpy from ._geotags import extract_geo_info from ._header import parse_all_ifds, parse_header + from ._reader import _is_file_like - with open(source, 'rb') as f: - import mmap - data = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + if _is_file_like(source): + # File-like: read its full bytes; we don't try to mmap arbitrary + # buffers because they may not back a real file descriptor. + try: + cur = source.tell() + except (OSError, AttributeError): + cur = 0 + source.seek(0) + data = source.read() + try: + source.seek(cur) + except (OSError, AttributeError): + pass + close_data = False + elif isinstance(source, str): + with open(source, 'rb') as f: + import mmap + data = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + close_data = True + else: + raise TypeError( + "source must be a str path or binary file-like, " + f"got {type(source).__name__}") try: header = parse_header(data) ifds = parse_all_ifds(data, header) @@ -213,7 +237,8 @@ def _read_geo_info(source: str, *, overview_level: int | None = None): n_bands = ifd.samples_per_pixel if ifd.samples_per_pixel > 1 else 0 return geo_info, ifd.height, ifd.width, file_dtype, n_bands finally: - data.close() + if close_data: + data.close() def _extent_to_window(transform, file_height, file_width, @@ -245,7 +270,7 @@ def _extent_to_window(transform, file_height, file_width, -def open_geotiff(source: str, *, dtype=None, window=None, +def open_geotiff(source, *, dtype=None, window=None, overview_level: int | None = None, band: int | None = None, name: str | None = None, @@ -264,8 +289,11 @@ def open_geotiff(source: str, *, dtype=None, window=None, Parameters ---------- - source : str - File path, HTTP URL, or cloud URI (s3://, gs://, az://). + source : str or binary file-like + File path, HTTP URL, cloud URI (s3://, gs://, az://), or a + binary file-like object (e.g. ``io.BytesIO``) with read+seek. + VRT, dask-chunked, GPU, and remote-URL paths require a string; + in-memory file-like buffers go through the eager numpy reader. dtype : str, numpy.dtype, or None Cast the result to this dtype after reading. None keeps the file's native dtype. Float-to-int casts raise ValueError to @@ -315,12 +343,25 @@ def open_geotiff(source: str, *, dtype=None, window=None, is lossy in a way users rarely intend; cast explicitly after read if you need it). """ - # VRT files - if source.lower().endswith('.vrt'): + # VRT files (string paths only -- VRT XML references other files on disk) + if isinstance(source, str) and source.lower().endswith('.vrt'): return read_vrt(source, dtype=dtype, window=window, band=band, name=name, chunks=chunks, gpu=gpu, max_pixels=max_pixels) + # File-like buffers don't support the GPU or dask code paths because + # those re-open the source by path from worker tasks or device-side + # readers. Reject early with a clear message. + if not isinstance(source, str): + if gpu: + raise ValueError( + "gpu=True is not supported for file-like sources. " + "Pass a path string instead.") + if chunks is not None: + raise ValueError( + "chunks=... (dask) is not supported for file-like sources. " + "Pass a path string instead.") + # GPU path if gpu: return read_geotiff_gpu(source, dtype=dtype, @@ -358,9 +399,11 @@ def open_geotiff(source: str, *, dtype=None, window=None, coords = {'y': full_y, 'x': full_x} if name is None: - # Derive from source path - import os - name = os.path.splitext(os.path.basename(source))[0] + # Derive from source path. File-like buffers don't have a path, + # so leave name unset rather than fabricating one. + if isinstance(source, str): + import os + name = os.path.splitext(os.path.basename(source))[0] attrs = {} if geo_info.crs_epsg is not None: @@ -579,7 +622,7 @@ def _merge_friendly_extra_tags(extra_tags_list, attrs: dict) -> list | None: return existing or None -def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *, +def to_geotiff(data: xr.DataArray | np.ndarray, path, *, crs: int | str | None = None, nodata=None, compression: str = 'zstd', @@ -612,8 +655,11 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *, ---------- data : xr.DataArray or np.ndarray 2D raster data. - path : str - Output file path. + path : str or binary file-like + Output file path, or any object exposing a ``write`` method + (e.g. ``io.BytesIO``). When a file-like is passed, the encoded + TIFF bytes are written to that object once assembly completes. + ``cog=True`` and ``.vrt`` outputs require a string path. crs : int, str, or None EPSG code (int), WKT string, or PROJ string. If None and data is a DataArray, tries to read from attrs ('crs' for EPSG, @@ -668,6 +714,21 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *, f"Unknown compression {compression!r}. " f"Valid options: {list(_VALID_COMPRESSIONS)}.") + # File-like (BytesIO etc.) destinations: the streaming, GPU, COG, and + # VRT writers all need a real filesystem path (atomic rename, overview + # passes, sidecar writes). Reject those combos up front so the user + # gets a clear error instead of a deep traceback. + _path_is_file_like = (not isinstance(path, str)) and hasattr(path, 'write') + if _path_is_file_like: + if cog: + raise ValueError( + "cog=True is not supported for file-like destinations. " + "Pass a string path or write to BytesIO without cog=True.") + elif not isinstance(path, str): + raise TypeError( + f"path must be a str or a binary file-like with a write() " + f"method, got {type(path).__name__}") + # tile_size only applies to tiled output; warn if the caller passed a # non-default size alongside strip mode (it would otherwise be silently # ignored). @@ -680,8 +741,9 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *, stacklevel=2, ) - # VRT tiled output - if path.lower().endswith('.vrt'): + # VRT tiled output (string paths only -- VRT writes a real .vrt file + # plus per-tile GeoTIFFs to a directory) + if isinstance(path, str) and path.lower().endswith('.vrt'): if cog: raise ValueError( "cog=True is not compatible with VRT output. " @@ -785,8 +847,10 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *, # Dask-backed: stream tiles to avoid materialising the full array. # COG requires overviews from the full array, so it falls through - # to the eager path. - if hasattr(raw, 'dask') and not cog: + # to the eager path. Streaming write needs a real filesystem path + # (it builds a temp file then atomic-renames); for file-like + # destinations we materialise eagerly and assemble in-memory. + if hasattr(raw, 'dask') and not cog and not _path_is_file_like: dask_arr = raw # Handle band-first dimension order (band, y, x) -> (y, x, band) if raw.ndim == 3 and data.dims[0] in ('band', 'bands', 'channel'): diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index 656a8d40..7cbffc37 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -345,11 +345,68 @@ def close(self): def _is_fsspec_uri(path: str) -> bool: """Check if a path is a fsspec-compatible URI (not http/https/local).""" + if not isinstance(path, str): + return False if path.startswith(('http://', 'https://')): return False return '://' in path +def _is_file_like(obj) -> bool: + """Return True if obj exposes a binary file-like interface (read+seek).""" + return ( + not isinstance(obj, str) + and hasattr(obj, 'read') + and hasattr(obj, 'seek') + ) + + +class _BytesIOSource: + """Data source backed by an in-memory or any seekable binary file-like. + + Wraps a `BytesIO` or any object exposing ``read``/``seek`` so the reader + can issue windowed byte reads without touching the filesystem. Concurrent + callers (e.g. parallel tile decode) are serialized through a lock around + the seek+read pair so they don't race on the underlying buffer's cursor. + """ + + def __init__(self, fileobj): + self._fh = fileobj + self._lock = threading.Lock() + # Determine size by seeking to the end. Reset the cursor afterwards + # so callers that share the buffer don't observe a moved position. + try: + cur = fileobj.tell() + except (OSError, AttributeError): + cur = 0 + fileobj.seek(0, 2) + self._size = fileobj.tell() + try: + fileobj.seek(cur) + except (OSError, AttributeError): + fileobj.seek(0) + + def read_range(self, start: int, length: int) -> bytes: + if length <= 0: + return b'' + with self._lock: + self._fh.seek(start) + return self._fh.read(length) + + def read_all(self): + with self._lock: + self._fh.seek(0) + return self._fh.read() + + @property + def size(self) -> int: + return self._size + + def close(self): + # Don't close the caller's buffer -- they own it. + self._fh = None + + class _CloudSource: """Cloud storage data source using fsspec. @@ -385,8 +442,14 @@ def close(self): pass -def _open_source(source: str): - """Open a data source (local file, URL, or cloud path).""" +def _open_source(source): + """Open a data source (local file, URL, cloud path, or file-like).""" + if _is_file_like(source): + return _BytesIOSource(source) + if not isinstance(source, str): + raise TypeError( + f"source must be a str path/URL or a binary file-like object " + f"with read+seek methods, got {type(source).__name__}") if source.startswith(('http://', 'https://')): return _HTTPSource(source) if _is_fsspec_uri(source): @@ -1000,7 +1063,7 @@ def _read_cog_http(url: str, overview_level: int | None = None, # Main read function # --------------------------------------------------------------------------- -def read_to_array(source: str, *, window=None, overview_level: int | None = None, +def read_to_array(source, *, window=None, overview_level: int | None = None, band: int | None = None, max_pixels: int = MAX_PIXELS_DEFAULT, ) -> tuple[np.ndarray, GeoInfo]: @@ -1008,8 +1071,8 @@ def read_to_array(source: str, *, window=None, overview_level: int | None = None Parameters ---------- - source : str - File path or URL. + source : str or binary file-like + File path, URL, or a file-like object with ``read``/``seek``. window : tuple or None (row_start, col_start, row_stop, col_stop). overview_level : int or None @@ -1025,12 +1088,14 @@ def read_to_array(source: str, *, window=None, overview_level: int | None = None ------- (np.ndarray, GeoInfo) tuple """ - if source.startswith(('http://', 'https://')): + if isinstance(source, str) and source.startswith(('http://', 'https://')): return _read_cog_http(source, overview_level=overview_level, band=band, max_pixels=max_pixels) - # Local file or cloud storage: read all bytes then parse - if _is_fsspec_uri(source): + # Local file, cloud storage, or file-like buffer: read all bytes then parse + if _is_file_like(source): + src = _BytesIOSource(source) + elif _is_fsspec_uri(source): src = _CloudSource(source) else: src = _FileSource(source) diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index a04b1673..6ba0ca56 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -1496,17 +1496,26 @@ def write_streaming(dask_data, path: str, *, raise -def _is_fsspec_uri(path: str) -> bool: - """Check if a path is a fsspec-compatible URI.""" +def _is_fsspec_uri(path) -> bool: + """Check if a path is a fsspec-compatible URI (string only).""" + if not isinstance(path, str): + return False if path.startswith(('http://', 'https://')): return False return '://' in path -def _write_bytes(file_bytes: bytes, path: str) -> None: - """Write bytes to a local file (atomic) or cloud storage (via fsspec).""" +def _write_bytes(file_bytes: bytes, path) -> None: + """Write bytes to a local file (atomic), cloud storage (via fsspec), + or any binary file-like object exposing ``write``.""" import os + # File-like destination: append the encoded bytes. The caller owns + # the buffer's lifetime (we don't close it). + if not isinstance(path, str) and hasattr(path, 'write'): + path.write(file_bytes) + return + if _is_fsspec_uri(path): try: import fsspec diff --git a/xrspatial/geotiff/tests/test_bytesio_source.py b/xrspatial/geotiff/tests/test_bytesio_source.py new file mode 100644 index 00000000..743bafa9 --- /dev/null +++ b/xrspatial/geotiff/tests/test_bytesio_source.py @@ -0,0 +1,157 @@ +"""Round-trip and concurrency tests for file-like (BytesIO) read/write paths. + +Closes #1511. The reader and writer accept any binary file-like that exposes +``read``/``seek`` (reader) or ``write`` (writer), so users can keep encoded +TIFF bytes in memory or stream them through a non-filesystem destination. +""" +from __future__ import annotations + +import io +import threading +from concurrent.futures import ThreadPoolExecutor + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import open_geotiff, to_geotiff +from xrspatial.geotiff._reader import _BytesIOSource, read_to_array + + +def _make_da(height=32, width=40, dtype=np.float32): + arr = np.arange(height * width, dtype=dtype).reshape(height, width) + # Simple geotransform with negative pixel_height (north-up) + x = np.arange(width, dtype=np.float64) * 0.5 + 100.0 + 0.25 + y = np.arange(height, dtype=np.float64) * (-0.5) + 50.0 - 0.25 + da = xr.DataArray( + arr, + dims=['y', 'x'], + coords={'y': y, 'x': x}, + name='test', + attrs={ + 'crs': 4326, + 'transform': (0.5, 0.0, 100.0, 0.0, -0.5, 50.0), + }, + ) + return da + + +class TestBytesIORoundTrip: + def test_round_trip_basic(self): + """to_geotiff(BytesIO) + open_geotiff(BytesIO) round-trips data and crs.""" + da = _make_da() + buf = io.BytesIO() + to_geotiff(da, buf, compression='deflate') + buf.seek(0) + # Read back from a fresh BytesIO so the read path has to seek itself + round_tripped = open_geotiff(io.BytesIO(buf.getvalue())) + + np.testing.assert_array_equal(round_tripped.values, da.values) + assert round_tripped.attrs.get('crs') == 4326 + # Transform tuple preserved verbatim + assert round_tripped.attrs.get('transform') == da.attrs['transform'] + + def test_round_trip_uint8(self): + """uint8 round-trip without compression.""" + da = _make_da(dtype=np.uint8) + buf = io.BytesIO() + to_geotiff(da, buf, compression='none') + result = open_geotiff(io.BytesIO(buf.getvalue())) + np.testing.assert_array_equal(result.values, da.values) + + +class TestBytesIOWindowedRead: + def test_windowed_read(self): + """Windowed read from a BytesIO source returns the right slice.""" + da = _make_da(height=64, width=64) + buf = io.BytesIO() + to_geotiff(da, buf, compression='deflate', tiled=True, tile_size=16) + + # Window covering tile-aligned and unaligned regions + window = (8, 12, 40, 48) + arr, _ = read_to_array(io.BytesIO(buf.getvalue()), window=window) + expected = da.values[8:40, 12:48] + np.testing.assert_array_equal(arr, expected) + + +class TestBytesIORejectsCog: + def test_cog_with_file_like_rejected(self): + """cog=True with a file-like destination raises ValueError.""" + da = _make_da() + buf = io.BytesIO() + with pytest.raises(ValueError, match='cog=True'): + to_geotiff(da, buf, cog=True) + + +class TestBytesIORejectsVrt: + def test_vrt_extension_with_file_like_is_treated_as_geotiff(self): + """A file-like destination cannot carry a VRT extension marker. + + VRT output is filesystem-only (it writes per-tile sidecar GeoTIFFs + to a directory), so passing a buffer with an unrelated name should + not select the VRT path. The check uses isinstance(path, str), so a + BytesIO simply gets treated as a plain GeoTIFF destination. + """ + # No way to "name" a BytesIO as .vrt, so this asserts the gate works: + # passing a BytesIO produces a normal GeoTIFF. + da = _make_da() + buf = io.BytesIO() + to_geotiff(da, buf) + # Header magic for classic TIFF is II*\0 or MM\0* + head = buf.getvalue()[:4] + assert head in (b'II*\x00', b'MM\x00*', b'II+\x00', b'MM\x00+') + + def test_explicit_vrt_path_string_with_file_like_data_works(self, tmp_path): + """A real .vrt path with file-like data is unrelated to this PR. + + This test just sanity-checks that string-path VRT handling still + rejects cog=True (existing behaviour) -- the regression we worry + about is the writer accidentally taking the VRT branch when + ``path`` is a buffer. + """ + da = _make_da() + with pytest.raises(ValueError, match='cog'): + to_geotiff(da, str(tmp_path / 'out.vrt'), cog=True) + + +class TestBytesIOConcurrentReads: + def test_concurrent_windowed_reads(self): + """Many threads reading disjoint windows from one BytesIOSource see + consistent bytes (lock around seek+read prevents cursor races).""" + da = _make_da(height=128, width=128) + buf = io.BytesIO() + to_geotiff(da, buf, compression='deflate', tiled=True, tile_size=16) + encoded = buf.getvalue() + + # All threads share one source instance. + shared = io.BytesIO(encoded) + src = _BytesIOSource(shared) + + # Pick byte ranges scattered through the file. + size = src.size + rng = np.random.default_rng(0) + ranges = [] + for _ in range(64): + start = int(rng.integers(0, max(1, size - 32))) + length = int(rng.integers(1, min(128, size - start) + 1)) + ranges.append((start, length)) + + # Compute the truth values single-threaded. + truth = [encoded[s:s + n] for s, n in ranges] + + results = [None] * len(ranges) + errors: list[BaseException] = [] + + def worker(i): + try: + s, n = ranges[i] + results[i] = src.read_range(s, n) + except BaseException as e: # pragma: no cover - defensive + errors.append(e) + + with ThreadPoolExecutor(max_workers=8) as pool: + list(pool.map(worker, range(len(ranges)))) + + assert not errors, errors + for got, want in zip(results, truth): + assert got == want From eff1cd97b43bb81b303eba4e7006d544f8bf597f Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 7 May 2026 18:27:32 -0700 Subject: [PATCH 2/2] Address PR #1512 review: pathlib.Path, GPU+file-like, truncate-on-rewrite, tell() - `_coerce_path` normalises `os.PathLike` (e.g. `pathlib.Path`) to `str` at the top of every public reader/writer entry. Path('mosaic.vrt') now routes to read_vrt, Path('x.tif') derives a name, etc. - `to_geotiff` rejects `gpu=True` with a file-like destination up front. The write_geotiff_gpu path was never tested with buffers and would have hit `_write_bytes(path)` without truncating. - `_write_bytes` rewinds and truncates the buffer before writing when the destination supports it. Two writes to the same BytesIO now overwrite rather than concatenate, matching string-path semantics. - `_is_file_like` now requires `tell` in addition to `read`/`seek`. `_BytesIOSource` calls `tell()` to size the buffer; the previous gate let read-seekable-but-not-tellable inputs through and crash inside `__init__`. We drop the guarded-tell pattern in the constructor in favour of a single try/except that raises a clear ValueError if the buffer is unusable (e.g. closed). - Drop unused `threading` import from the test file. - Tests cover Path round-trip, Path('.vrt') VRT routing, Path-derived name, GPU+buffer rejection, BytesIO overwrite-on-rewrite, and the tell() requirement at the gate. --- xrspatial/geotiff/__init__.py | 35 +++++- xrspatial/geotiff/_reader.py | 43 +++++-- xrspatial/geotiff/_writer.py | 14 ++- .../geotiff/tests/test_bytesio_source.py | 115 +++++++++++++++++- 4 files changed, 191 insertions(+), 16 deletions(-) diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 1c6fa8a6..7ef8d8b5 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -197,8 +197,9 @@ def _read_geo_info(source, *, overview_level: int | None = None): from ._dtypes import tiff_dtype_to_numpy from ._geotags import extract_geo_info from ._header import parse_all_ifds, parse_header - from ._reader import _is_file_like + from ._reader import _coerce_path, _is_file_like + source = _coerce_path(source) if _is_file_like(source): # File-like: read its full bytes; we don't try to mmap arbitrary # buffers because they may not back a real file descriptor. @@ -343,6 +344,10 @@ def open_geotiff(source, *, dtype=None, window=None, is lossy in a way users rarely intend; cast explicitly after read if you need it). """ + from ._reader import _coerce_path + + source = _coerce_path(source) + # VRT files (string paths only -- VRT XML references other files on disk) if isinstance(source, str) and source.lower().endswith('.vrt'): return read_vrt(source, dtype=dtype, window=window, band=band, @@ -705,6 +710,10 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path, *, rasters whose tile-row exceeds this budget are split into horizontal segments. Ignored for numpy / CuPy / COG paths. """ + from ._reader import _coerce_path + + path = _coerce_path(path) + # Up-front validation: catch bad compression names before they reach # any of the deeper write paths (streaming, GPU, VRT, COG) where the # error surfaces from _compression_tag with a less obvious traceback. @@ -763,6 +772,15 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path, *, # Auto-detect GPU data and dispatch to write_geotiff_gpu use_gpu = gpu if gpu is not None else _is_gpu_data(data) + if use_gpu and _path_is_file_like: + # write_geotiff_gpu's nvCOMP path materialises tile parts and then + # calls _write_bytes(path), which would write at the buffer's + # current cursor without truncating. More importantly, the GPU + # path was never tested with file-like destinations; refuse rather + # than silently produce something untested. + raise ValueError( + "gpu=True is not supported for file-like destinations. " + "Pass a string path (or set gpu=False).") if use_gpu: try: write_geotiff_gpu(data, path, crs=crs, nodata=nodata, @@ -1191,12 +1209,16 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512, """ import dask.array as da + from ._reader import _coerce_path + + source = _coerce_path(source) + # ``read_geotiff`` already routes ``.vrt`` to ``read_vrt`` before # reaching here, so this branch is only hit when ``read_geotiff_dask`` # is called directly with a VRT path. Keep it as a defensive fallback # rather than letting the windowed-read path try to parse VRT XML as # TIFF bytes. ``read_vrt`` is the single source of truth for VRT. - if source.lower().endswith('.vrt'): + if isinstance(source, str) and source.lower().endswith('.vrt'): return read_vrt(source, dtype=dtype, name=name, chunks=chunks) # Metadata-only read: O(1) memory via mmap, no pixel decompression @@ -1370,12 +1392,16 @@ def read_geotiff_gpu(source: str, *, "cupy is required for GPU reads. " "Install it with: pip install cupy-cuda12x") - from ._reader import _FileSource, _check_dimensions, MAX_PIXELS_DEFAULT + from ._reader import ( + _FileSource, _check_dimensions, MAX_PIXELS_DEFAULT, _coerce_path, + ) from ._header import parse_header, parse_all_ifds, validate_tile_layout from ._dtypes import tiff_dtype_to_numpy from ._geotags import extract_geo_info from ._gpu_decode import gpu_decode_tiles + source = _coerce_path(source) + if max_pixels is None: max_pixels = MAX_PIXELS_DEFAULT @@ -1740,8 +1766,11 @@ def read_vrt(source: str, *, dtype=None, window=None, the original WKT. The source GeoTransform is preserved as a rasterio-style 6-tuple in ``attrs['transform']``. """ + from ._reader import _coerce_path from ._vrt import read_vrt as _read_vrt_internal + source = _coerce_path(source) + arr, vrt = _read_vrt_internal(source, window=window, band=band, max_pixels=max_pixels) diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index 7cbffc37..e7fec46e 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -353,14 +353,33 @@ def _is_fsspec_uri(path: str) -> bool: def _is_file_like(obj) -> bool: - """Return True if obj exposes a binary file-like interface (read+seek).""" + """Return True if obj exposes a binary file-like interface (read+seek+tell). + + ``tell`` is required because :class:`_BytesIOSource` uses it to compute + the buffer length via seek-to-end. ``os.PathLike`` instances don't + expose ``read``/``seek``/``tell`` and are excluded here so that + :func:`_coerce_path` can convert them to ``str`` upstream. + """ return ( not isinstance(obj, str) and hasattr(obj, 'read') and hasattr(obj, 'seek') + and hasattr(obj, 'tell') ) +def _coerce_path(source): + """Normalize ``os.PathLike`` (e.g. ``pathlib.Path``) to ``str``. + + Strings and binary file-likes pass through unchanged. Used at the top + of every public reader/writer entry so that ``Path('mosaic.vrt')`` + dispatches to the VRT path, ``Path('x.tif')`` derives a ``name``, etc. + """ + if isinstance(source, _os_module.PathLike): + return _os_module.fspath(source) + return source + + class _BytesIOSource: """Data source backed by an in-memory or any seekable binary file-like. @@ -371,20 +390,22 @@ class _BytesIOSource: """ def __init__(self, fileobj): + # _is_file_like (the gate that lets us reach this constructor) + # already requires read/seek/tell, so we can call tell() directly + # rather than guarding it. We do still defend against tell raising + # on a closed/detached buffer with an informative error. self._fh = fileobj self._lock = threading.Lock() - # Determine size by seeking to the end. Reset the cursor afterwards - # so callers that share the buffer don't observe a moved position. try: cur = fileobj.tell() - except (OSError, AttributeError): - cur = 0 - fileobj.seek(0, 2) - self._size = fileobj.tell() - try: + fileobj.seek(0, 2) + self._size = fileobj.tell() fileobj.seek(cur) - except (OSError, AttributeError): - fileobj.seek(0) + except (OSError, ValueError) as e: + raise ValueError( + f"file-like source is not usable for size measurement: " + f"{type(e).__name__}: {e}" + ) from e def read_range(self, start: int, length: int) -> bytes: if length <= 0: @@ -444,6 +465,7 @@ def close(self): def _open_source(source): """Open a data source (local file, URL, cloud path, or file-like).""" + source = _coerce_path(source) if _is_file_like(source): return _BytesIOSource(source) if not isinstance(source, str): @@ -1088,6 +1110,7 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, ------- (np.ndarray, GeoInfo) tuple """ + source = _coerce_path(source) if isinstance(source, str) and source.startswith(('http://', 'https://')): return _read_cog_http(source, overview_level=overview_level, band=band, max_pixels=max_pixels) diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index 6ba0ca56..91ec43c1 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -1510,9 +1510,19 @@ def _write_bytes(file_bytes: bytes, path) -> None: or any binary file-like object exposing ``write``.""" import os - # File-like destination: append the encoded bytes. The caller owns - # the buffer's lifetime (we don't close it). + # File-like destination: match string-path "overwrite" semantics + # (writing to '/tmp/x.tif' twice produces a one-TIFF file, not two + # concatenated). Rewind+truncate when the buffer supports it so a + # caller reusing the same BytesIO across writes doesn't end up with + # silently appended TIFFs. The caller still owns the buffer's + # lifetime; we don't close it. if not isinstance(path, str) and hasattr(path, 'write'): + if hasattr(path, 'seek') and hasattr(path, 'truncate'): + try: + path.seek(0) + path.truncate(0) + except (OSError, AttributeError): + pass path.write(file_bytes) return diff --git a/xrspatial/geotiff/tests/test_bytesio_source.py b/xrspatial/geotiff/tests/test_bytesio_source.py index 743bafa9..803aeca2 100644 --- a/xrspatial/geotiff/tests/test_bytesio_source.py +++ b/xrspatial/geotiff/tests/test_bytesio_source.py @@ -7,7 +7,6 @@ from __future__ import annotations import io -import threading from concurrent.futures import ThreadPoolExecutor import numpy as np @@ -155,3 +154,117 @@ def worker(i): assert not errors, errors for got, want in zip(results, truth): assert got == want + + +# --------------------------------------------------------------------------- +# PR #1512 review followups: pathlib.Path normalisation, GPU+file-like reject, +# truncate-on-rewrite semantics, _is_file_like requires tell. +# --------------------------------------------------------------------------- + + +def _make_small_da(): + arr = np.arange(64, dtype=np.float32).reshape(8, 8) + return xr.DataArray( + arr, dims=['y', 'x'], + coords={'y': np.arange(8.0), 'x': np.arange(8.0)}, + attrs={'crs': 4326}, + ) + + +def test_pathlib_path_write_and_read_roundtrip(tmp_path): + """``pathlib.Path`` should behave identically to ``str``.""" + p = tmp_path / 'roundtrip.tif' + da = _make_small_da() + to_geotiff(da, p) # Path, not str + out = open_geotiff(p) # Path, not str + np.testing.assert_array_equal(out.values, da.values) + + +def test_pathlib_path_vrt_routes_to_read_vrt(tmp_path): + """``Path('x.vrt')`` must dispatch to the VRT path, not the TIFF reader.""" + import pathlib + + da = _make_small_da() + tif_path = tmp_path / 'src.tif' + to_geotiff(da, str(tif_path)) + + vrt_path = tmp_path / 'mosaic.vrt' + vrt_path.write_text(f""" + + + {tif_path} + 1 + + + + + +""") + out = open_geotiff(pathlib.Path(vrt_path)) + np.testing.assert_array_equal(out.values, da.values) + + +def test_pathlib_path_derives_name(tmp_path): + """``name`` should derive from ``Path`` stem, same as for ``str``.""" + p = tmp_path / 'mydata.tif' + to_geotiff(_make_small_da(), p) + out = open_geotiff(p) + assert out.name == 'mydata' + + +def test_to_geotiff_rejects_gpu_with_file_like(): + """``gpu=True`` + file-like is untested territory; reject up front.""" + da = _make_small_da() + buf = io.BytesIO() + with pytest.raises(ValueError, match='gpu=True is not supported'): + to_geotiff(da, buf, gpu=True) + + +def test_to_geotiff_buffer_rewritten_in_place(): + """Reusing the same BytesIO should overwrite, not append. + + Mirrors ``to_geotiff('/tmp/x.tif', ...)`` followed by another call to + the same path -- the second call replaces, not concatenates. PR #1512 + review found that file-like writes used to ``write()`` at the cursor + so two writes to one buffer produced two TIFFs back-to-back. + """ + buf = io.BytesIO() + to_geotiff(_make_small_da(), buf) + first_size = len(buf.getvalue()) + assert first_size > 0 + + # Write a second, different DataArray to the same buffer. + arr2 = np.full((4, 4), 7.0, dtype=np.float32) + da2 = xr.DataArray( + arr2, dims=['y', 'x'], + coords={'y': np.arange(4.0), 'x': np.arange(4.0)}, + attrs={'crs': 4326}, + ) + to_geotiff(da2, buf) + second_size = len(buf.getvalue()) + + # The second TIFF should fully replace the first, not append. + out = open_geotiff(io.BytesIO(buf.getvalue())) + np.testing.assert_array_equal(out.values, arr2) + # And the buffer must be smaller than first+second (i.e. no concat). + assert second_size < first_size + 100 + + +def test_is_file_like_requires_tell(): + """Objects with read+seek but no tell are not accepted as file-like. + + ``_BytesIOSource`` needs ``tell`` to compute the buffer size. We refuse + seekable-but-not-tellable inputs at the gate rather than crashing inside + ``__init__``. + """ + from xrspatial.geotiff._reader import _is_file_like + + class ReadSeekNoTell: + def read(self, n=-1): + return b'' + + def seek(self, *a, **k): + return 0 + + assert _is_file_like(io.BytesIO(b'x')) is True + assert _is_file_like(ReadSeekNoTell()) is False