From 9dd54a8495c14b7c0ebcd8e1ff57083d81435694 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Mon, 23 Mar 2026 19:08:31 -0700
Subject: [PATCH 1/3] Reduce dask graph size for large raster reprojection

Replace the O(N) delayed/from_delayed/block pattern in _reproject_dask
and _merge_dask with da.map_blocks over a template array. This produces
a single blockwise layer in the HighLevelGraph with O(1) metadata, so
graph construction no longer scales with chunk count.

Also adds empty-chunk skipping: precompute the source footprint in target
CRS and skip chunks that fall outside it entirely. This avoids pyproj
initialization and source data fetching for chunks that would just be
filled with nodata.

The streaming fallback threshold (previously 1GB graph metadata) is
removed since graph metadata is now constant-size. Large in-memory
arrays always go through dask when available.
---
 xrspatial/reproject/__init__.py   | 236 ++++++++++++++++++++----------
 xrspatial/tests/test_reproject.py | 123 ++++++++++++++++
 2 files changed, 280 insertions(+), 79 deletions(-)

diff --git a/xrspatial/reproject/__init__.py b/xrspatial/reproject/__init__.py
index c35fd89a..51769641 100644
--- a/xrspatial/reproject/__init__.py
+++ b/xrspatial/reproject/__init__.py
@@ -574,10 +574,10 @@ def reproject(
     else:
         is_cupy = is_cupy_array(data)
 
-    # For very large datasets, estimate whether a dask graph would fit
-    # in memory.  Each dask task uses ~1KB of graph metadata.  If the
-    # graph itself would exceed available memory, use a streaming
-    # approach instead of dask (process tiles sequentially, no graph).
+    # For large in-memory datasets, wrap in dask for chunked processing.
+    # map_blocks generates an O(1) HighLevelGraph (single blockwise layer)
+    # so graph metadata is no longer a concern -- the streaming fallback
+    # is only needed when dask itself is unavailable.
     _use_streaming = False
     if not is_dask and not is_cupy:
         nbytes = src_shape[0] * src_shape[1] * data.dtype.itemsize
@@ -585,19 +585,10 @@ def reproject(
             nbytes *= data.shape[2]
         _OOM_THRESHOLD = 512 * 1024 * 1024  # 512 MB
         if nbytes > _OOM_THRESHOLD:
-            # Estimate graph size for the output
             cs = chunk_size or 2048
             if isinstance(cs, int):
                 cs = (cs, cs)
-            n_out_chunks = (math.ceil(out_shape[0] / cs[0])
-                           * math.ceil(out_shape[1] / cs[1]))
-            graph_bytes = n_out_chunks * 1024  # ~1KB per task
-
-            if graph_bytes > 1024 * 1024 * 1024:  # > 1GB graph
-                # Graph too large for dask -- use streaming
-                _use_streaming = True
-            else:
-                # Graph fits -- use dask with large chunks
+            try:
                 import dask.array as _da
                 data = _da.from_array(data, chunks=cs)
                 raster = xr.DataArray(
@@ -605,6 +596,9 @@ def reproject(
                     name=raster.name, attrs=raster.attrs,
                 )
                 is_dask = True
+            except ImportError:
+                # dask not available -- fall back to streaming
+                _use_streaming = True
 
     # Serialize CRS for pickle safety
     src_wkt = src_crs.to_wkt()
@@ -1125,6 +1119,77 @@ def _reproject_dask_cupy(
     return result
 
 
+def _source_footprint_in_target(src_bounds, src_wkt, tgt_wkt):
+    """Compute an approximate bounding box of the source raster in target CRS.
+
+    Transforms corners and edge midpoints (12 points) to handle non-linear
+    projections.  Returns ``(left, bottom, right, top)`` in target CRS, or
+    *None* if the transform fails (e.g. out-of-domain).
+    """
+    try:
+        from ._crs_utils import _require_pyproj
+        pyproj = _require_pyproj()
+        src_crs = pyproj.CRS(src_wkt)
+        tgt_crs = pyproj.CRS(tgt_wkt)
+        transformer = pyproj.Transformer.from_crs(
+            src_crs, tgt_crs, always_xy=True
+        )
+    except Exception:
+        return None
+
+    sl, sb, sr, st = src_bounds
+    mx = (sl + sr) / 2
+    my = (sb + st) / 2
+    xs = [sl, mx, sr, sl, mx, sr, sl, mx, sr, sl, sr, mx]
+    ys = [sb, sb, sb, my, my, my, st, st, st, mx, mx, sb]
+    try:
+        tx, ty = transformer.transform(xs, ys)
+        tx = [v for v in tx if np.isfinite(v)]
+        ty = [v for v in ty if np.isfinite(v)]
+        if not tx or not ty:
+            return None
+        return (min(tx), min(ty), max(tx), max(ty))
+    except Exception:
+        return None
+
+
+def _bounds_overlap(a, b):
+    """Return True if bounding boxes *a* and *b* overlap."""
+    return a[0] < b[2] and a[2] > b[0] and a[1] < b[3] and a[3] > b[1]
+
+
+def _reproject_block_adapter(
+    block, block_info, source_data,
+    src_bounds, src_shape, y_desc,
+    src_wkt, tgt_wkt,
+    out_bounds, out_shape,
+    resampling, nodata, precision,
+    is_cupy, src_footprint_tgt,
+):
+    """``map_blocks`` adapter for reprojection.
+
+    Derives chunk bounds from *block_info* and delegates to the
+    per-chunk worker.
+    """
+    info = block_info[0]
+    (row_start, row_end), (col_start, col_end) = info['array-location']
+    chunk_shape = (row_end - row_start, col_end - col_start)
+    cb = _chunk_bounds(out_bounds, out_shape,
+                       row_start, row_end, col_start, col_end)
+
+    # Skip chunks that don't overlap the source footprint
+    if src_footprint_tgt is not None and not _bounds_overlap(cb, src_footprint_tgt):
+        return np.full(chunk_shape, nodata, dtype=np.float64)
+
+    chunk_fn = _reproject_chunk_cupy if is_cupy else _reproject_chunk_numpy
+    return chunk_fn(
+        source_data, src_bounds, src_shape, y_desc,
+        src_wkt, tgt_wkt,
+        cb, chunk_shape,
+        resampling, nodata, precision,
+    )
+
+
 def _reproject_dask(
     raster, src_bounds, src_shape, y_desc,
     src_wkt, tgt_wkt,
@@ -1132,44 +1197,43 @@ def _reproject_dask(
     resampling, nodata, precision,
     chunk_size, is_cupy,
 ):
-    """Dask+NumPy backend: build output as ``da.block`` of delayed chunks."""
-    import dask
+    """Dask+NumPy backend: ``map_blocks`` over a template array.
+
+    Uses a single ``blockwise`` layer in the HighLevelGraph instead of
+    O(N) ``dask.delayed`` nodes, keeping graph metadata O(1).
+    """
     import dask.array as da
 
     row_chunks, col_chunks = _compute_chunk_layout(out_shape, chunk_size)
-    n_row = len(row_chunks)
-    n_col = len(col_chunks)
-
-    chunk_fn = _reproject_chunk_cupy if is_cupy else _reproject_chunk_numpy
-    dtype = np.float64
 
-    blocks = [[None] * n_col for _ in range(n_row)]
+    # Precompute source footprint in target CRS for empty-chunk skipping
+    src_footprint_tgt = _source_footprint_in_target(
+        src_bounds, src_wkt, tgt_wkt
+    )
 
-    row_offset = 0
-    for i in range(n_row):
-        col_offset = 0
-        for j in range(n_col):
-            rchunk = row_chunks[i]
-            cchunk = col_chunks[j]
-            cb = _chunk_bounds(
-                out_bounds, out_shape,
-                row_offset, row_offset + rchunk,
-                col_offset, col_offset + cchunk,
-            )
-            delayed_chunk = dask.delayed(chunk_fn)(
-                raster.data,
-                src_bounds, src_shape, y_desc,
-                src_wkt, tgt_wkt,
-                cb, (rchunk, cchunk),
-                resampling, nodata, precision,
-            )
-            blocks[i][j] = da.from_delayed(
-                delayed_chunk, shape=(rchunk, cchunk), dtype=dtype
-            )
-            col_offset += cchunk
-        row_offset += rchunk
+    template = da.empty(
+        out_shape, dtype=np.float64, chunks=(row_chunks, col_chunks)
+    )
 
-    return da.block(blocks)
+    return da.map_blocks(
+        _reproject_block_adapter,
+        template,
+        source_data=raster.data,
+        src_bounds=src_bounds,
+        src_shape=src_shape,
+        y_desc=y_desc,
+        src_wkt=src_wkt,
+        tgt_wkt=tgt_wkt,
+        out_bounds=out_bounds,
+        out_shape=out_shape,
+        resampling=resampling,
+        nodata=nodata,
+        precision=precision,
+        is_cupy=is_cupy,
+        src_footprint_tgt=src_footprint_tgt,
+        dtype=np.float64,
+        meta=np.array((), dtype=np.float64),
+    )
 
 
 # ---------------------------------------------------------------------------
@@ -1434,23 +1498,38 @@ def _merge_inmemory(
     return _merge_arrays_numpy(arrays, nodata, strategy)
 
 
-def _merge_chunk_worker(
+def _merge_block_adapter(
+    block, block_info,
     raster_data_list, src_bounds_list, src_shape_list, y_desc_list,
     src_wkt_list, tgt_wkt,
-    chunk_bounds_tuple, chunk_shape,
+    out_bounds, out_shape,
     resampling, nodata, strategy, precision,
+    src_footprints_tgt,
 ):
-    """Worker for a single merge chunk."""
+    """``map_blocks`` adapter for merge."""
+    info = block_info[0]
+    (row_start, row_end), (col_start, col_end) = info['array-location']
+    chunk_shape = (row_end - row_start, col_end - col_start)
+    cb = _chunk_bounds(out_bounds, out_shape,
+                       row_start, row_end, col_start, col_end)
+
+    # Only reproject rasters whose footprint overlaps this chunk
     arrays = []
     for i in range(len(raster_data_list)):
+        if (src_footprints_tgt[i] is not None
+                and not _bounds_overlap(cb, src_footprints_tgt[i])):
+            continue
         reprojected = _reproject_chunk_numpy(
             raster_data_list[i],
             src_bounds_list[i], src_shape_list[i], y_desc_list[i],
             src_wkt_list[i], tgt_wkt,
-            chunk_bounds_tuple, chunk_shape,
+            cb, chunk_shape,
             resampling, nodata, precision,
         )
         arrays.append(reprojected)
+
+    if not arrays:
+        return np.full(chunk_shape, nodata, dtype=np.float64)
     return _merge_arrays_numpy(arrays, nodata, strategy)
 
 
@@ -1458,13 +1537,10 @@ def _merge_dask(
     raster_infos, tgt_wkt, out_bounds, out_shape,
     resampling, nodata, strategy, chunk_size,
 ):
-    """Dask merge backend."""
-    import dask
+    """Dask merge backend using ``map_blocks``."""
     import dask.array as da
 
     row_chunks, col_chunks = _compute_chunk_layout(out_shape, chunk_size)
-    n_row = len(row_chunks)
-    n_col = len(col_chunks)
 
     # Prepare lists for the worker
     data_list = [info['raster'].data for info in raster_infos]
@@ -1473,30 +1549,32 @@ def _merge_dask(
     ydesc_list = [info['y_desc'] for info in raster_infos]
     wkt_list = [info['src_wkt'] for info in raster_infos]
 
-    dtype = np.float64
-    blocks = [[None] * n_col for _ in range(n_row)]
+    # Precompute source footprints in target CRS
+    footprints = [
+        _source_footprint_in_target(bounds_list[i], wkt_list[i], tgt_wkt)
+        for i in range(len(raster_infos))
+    ]
 
-    row_offset = 0
-    for i in range(n_row):
-        col_offset = 0
-        for j in range(n_col):
-            rchunk = row_chunks[i]
-            cchunk = col_chunks[j]
-            cb = _chunk_bounds(
-                out_bounds, out_shape,
-                row_offset, row_offset + rchunk,
-                col_offset, col_offset + cchunk,
-            )
-            delayed_chunk = dask.delayed(_merge_chunk_worker)(
-                data_list, bounds_list, shape_list, ydesc_list,
-                wkt_list, tgt_wkt,
-                cb, (rchunk, cchunk),
-                resampling, nodata, strategy, 16,
-            )
-            blocks[i][j] = da.from_delayed(
-                delayed_chunk, shape=(rchunk, cchunk), dtype=dtype
-            )
-            col_offset += cchunk
-        row_offset += rchunk
+    template = da.empty(
+        out_shape, dtype=np.float64, chunks=(row_chunks, col_chunks)
+    )
 
-    return da.block(blocks)
+    return da.map_blocks(
+        _merge_block_adapter,
+        template,
+        raster_data_list=data_list,
+        src_bounds_list=bounds_list,
+        src_shape_list=shape_list,
+        y_desc_list=ydesc_list,
+        src_wkt_list=wkt_list,
+        tgt_wkt=tgt_wkt,
+        out_bounds=out_bounds,
+        out_shape=out_shape,
+        resampling=resampling,
+        nodata=nodata,
+        strategy=strategy,
+        precision=16,
+        src_footprints_tgt=footprints,
+        dtype=np.float64,
+        meta=np.array((), dtype=np.float64),
+    )
diff --git a/xrspatial/tests/test_reproject.py b/xrspatial/tests/test_reproject.py
index 75e2b258..cb2f384a 100644
--- a/xrspatial/tests/test_reproject.py
+++ b/xrspatial/tests/test_reproject.py
@@ -1159,3 +1159,126 @@ def test_cubic_nan_fallback_matches_cpu(self):
         finite = np.isfinite(cpu_result)
         np.testing.assert_allclose(cpu_result[finite], gpu_np[finite],
                                    rtol=1e-10)
+
+
+# ---------------------------------------------------------------------------
+# Dask graph optimization tests
+# ---------------------------------------------------------------------------
+
+@pytest.mark.skipif(not HAS_DASK, reason="dask not installed")
+class TestDaskGraphOptimization:
+    """Verify map_blocks conversion and empty-chunk skipping."""
+
+    def test_dask_reproject_uses_map_blocks(self):
+        """The dask path should produce a blockwise layer, not N delayed nodes."""
+        from xrspatial.reproject import reproject
+        data = np.ones((64, 64), dtype=np.float64)
+        da_data = da.from_array(data, chunks=(32, 32))
+        raster = xr.DataArray(
+            da_data, dims=['y', 'x'],
+            coords={'y': np.linspace(55, 45, 64), 'x': np.linspace(-5, 5, 64)},
+            attrs={'crs': 'EPSG:4326', 'nodata': np.nan},
+        )
+        result = reproject(raster, 'EPSG:32633', chunk_size=32)
+        # Result should be a dask array
+        assert hasattr(result.data, 'dask')
+        # Should have few graph layers (map_blocks creates 1-2, not N)
+        graph = result.data.__dask_graph__()
+        assert len(graph.layers) <= 3
+
+    def test_dask_reproject_matches_numpy(self):
+        """Dask map_blocks path should produce same values as numpy."""
+        from xrspatial.reproject import reproject
+        data = np.random.RandomState(42).rand(64, 64).astype(np.float64)
+        coords = {
+            'y': np.linspace(55, 45, 64),
+            'x': np.linspace(-5, 5, 64),
+        }
+        attrs = {'crs': 'EPSG:4326', 'nodata': np.nan}
+
+        np_raster = xr.DataArray(data, dims=['y', 'x'],
+                                 coords=coords, attrs=attrs)
+        da_raster = xr.DataArray(
+            da.from_array(data, chunks=(32, 32)),
+            dims=['y', 'x'], coords=coords, attrs=attrs,
+        )
+        np_result = reproject(np_raster, 'EPSG:32633')
+        da_result = reproject(da_raster, 'EPSG:32633')
+
+        np_vals = np_result.values
+        da_vals = da_result.values
+        # Same shape
+        assert np_vals.shape == da_vals.shape
+        # Same NaN pattern
+        np.testing.assert_array_equal(np.isnan(np_vals), np.isnan(da_vals))
+        # Same finite values
+        finite = np.isfinite(np_vals)
+        if finite.any():
+            np.testing.assert_allclose(np_vals[finite], da_vals[finite],
+                                       rtol=1e-10)
+
+    def test_empty_chunk_skipping(self):
+        """Chunks outside the source footprint should be nodata-filled
+        without touching pyproj."""
+        from xrspatial.reproject import reproject
+        # Small raster in a corner of the output grid
+        data = np.ones((16, 16), dtype=np.float64) * 42.0
+        raster = xr.DataArray(
+            da.from_array(data, chunks=(16, 16)),
+            dims=['y', 'x'],
+            coords={'y': np.linspace(50.1, 50.0, 16),
+                    'x': np.linspace(10.0, 10.1, 16)},
+            attrs={'crs': 'EPSG:4326', 'nodata': np.nan},
+        )
+        # Force a large output grid with small chunks so many are empty
+        result = reproject(raster, 'EPSG:32633', chunk_size=64,
+                           width=256, height=256)
+        vals = result.values
+        # Should have some valid pixels and some NaN (empty chunks)
+        assert np.any(np.isfinite(vals))
+        assert np.any(np.isnan(vals))
+
+    def test_merge_dask_uses_map_blocks(self):
+        """The merge dask path should also use map_blocks."""
+        from xrspatial.reproject import merge
+        t1 = xr.DataArray(
+            da.from_array(np.full((32, 32), 1.0), chunks=(16, 16)),
+            dims=['y', 'x'],
+            coords={'y': np.linspace(55, 50, 32),
+                    'x': np.linspace(-5, 0, 32)},
+            attrs={'crs': 'EPSG:4326', 'nodata': np.nan},
+        )
+        t2 = xr.DataArray(
+            da.from_array(np.full((32, 32), 2.0), chunks=(16, 16)),
+            dims=['y', 'x'],
+            coords={'y': np.linspace(50, 45, 32),
+                    'x': np.linspace(0, 5, 32)},
+            attrs={'crs': 'EPSG:4326', 'nodata': np.nan},
+        )
+        result = merge([t1, t2])
+        vals = result.values
+        assert np.any(np.isfinite(vals))
+
+    def test_source_footprint_helper(self):
+        """_source_footprint_in_target should return a valid bbox."""
+        from xrspatial.reproject import _source_footprint_in_target
+        src_bounds = (-5.0, 45.0, 5.0, 55.0)
+        fp = _source_footprint_in_target(
+            src_bounds, 'EPSG:4326', 'EPSG:32633'
+        )
+        # Should return a tuple of 4 finite values
+        assert fp is not None
+        assert len(fp) == 4
+        assert all(np.isfinite(v) for v in fp)
+        # left < right, bottom < top
+        assert fp[0] < fp[2]
+        assert fp[1] < fp[3]
+
+    def test_bounds_overlap(self):
+        """_bounds_overlap should correctly detect overlap."""
+        from xrspatial.reproject import _bounds_overlap
+        a = (0, 0, 10, 10)
+        assert _bounds_overlap(a, (5, 5, 15, 15))   # partial overlap
+        assert _bounds_overlap(a, (0, 0, 10, 10))   # identical
+        assert not _bounds_overlap(a, (11, 0, 20, 10))  # no overlap x
+        assert not _bounds_overlap(a, (0, 11, 10, 20))  # no overlap y

From baba2690a6f83bf60a42959581e97cbe90b3d4fa Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Mon, 23 Mar 2026 19:20:53 -0700
Subject: [PATCH 2/3] Fix MemoryError: bind source array via partial, not
 map_blocks kwarg

da.map_blocks scans kwargs for dask Array instances and adds them as
whole-array positional dependencies. This meant every output block
depended on the full source array, causing MemoryError on distributed
schedulers when the source exceeded the worker memory limit.

Fix: bind the source dask array (and all other params) to the adapter
function via functools.partial before passing to map_blocks. Dask
doesn't scan partial internals for array dependencies, so each block
task only depends on the template block, not the entire source.

The chunk function still slices and computes just the needed source
window at execution time.
---
 xrspatial/reproject/__init__.py   | 47 +++++++++++++++++++++++--------
 xrspatial/tests/test_reproject.py | 26 +++++++++++++++++
 2 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/xrspatial/reproject/__init__.py b/xrspatial/reproject/__init__.py
index 51769641..fe14d74c 100644
--- a/xrspatial/reproject/__init__.py
+++ b/xrspatial/reproject/__init__.py
@@ -1201,7 +1201,15 @@ def _reproject_dask(
 
     Uses a single ``blockwise`` layer in the HighLevelGraph instead of
     O(N) ``dask.delayed`` nodes, keeping graph metadata O(1).
+
+    The source dask array is bound to the adapter via ``functools.partial``
+    rather than passed as a ``map_blocks`` kwarg.  This prevents dask from
+    adding the full source as a dependency of every output block (which
+    would cause a MemoryError on distributed schedulers when the source
+    exceeds the worker memory limit).
     """
+    import functools
+
     import dask.array as da
 
     row_chunks, col_chunks = _compute_chunk_layout(out_shape, chunk_size)
@@ -1211,13 +1219,11 @@ def _reproject_dask(
         src_bounds, src_wkt, tgt_wkt
     )
 
-    template = da.empty(
-        out_shape, dtype=np.float64, chunks=(row_chunks, col_chunks)
-    )
-
-    return da.map_blocks(
+    # Bind the source dask array and all scalar params via partial so
+    # map_blocks doesn't detect them as dask Array kwargs (which would
+    # add the full source as a dependency of every output block).
+    bound_adapter = functools.partial(
         _reproject_block_adapter,
-        template,
         source_data=raster.data,
         src_bounds=src_bounds,
         src_shape=src_shape,
@@ -1231,6 +1237,15 @@ def _reproject_dask(
         precision=precision,
         is_cupy=is_cupy,
         src_footprint_tgt=src_footprint_tgt,
+    )
+
+    template = da.empty(
+        out_shape, dtype=np.float64, chunks=(row_chunks, col_chunks)
+    )
+
+    return da.map_blocks(
+        bound_adapter,
+        template,
         dtype=np.float64,
         meta=np.array((), dtype=np.float64),
     )
@@ -1538,6 +1553,8 @@ def _merge_dask(
     resampling, nodata, strategy, chunk_size,
 ):
     """Dask merge backend using ``map_blocks``."""
+    import functools
+
     import dask.array as da
 
     row_chunks, col_chunks = _compute_chunk_layout(out_shape, chunk_size)
@@ -1555,13 +1572,10 @@ def _merge_dask(
         for i in range(len(raster_infos))
     ]
 
-    template = da.empty(
-        out_shape, dtype=np.float64, chunks=(row_chunks, col_chunks)
-    )
-
-    return da.map_blocks(
+    # Bind via partial to prevent map_blocks from adding dask arrays
+    # in data_list as whole-array dependencies.
+    bound_adapter = functools.partial(
         _merge_block_adapter,
-        template,
         raster_data_list=data_list,
         src_bounds_list=bounds_list,
         src_shape_list=shape_list,
@@ -1575,6 +1589,15 @@ def _merge_dask(
         strategy=strategy,
         precision=16,
         src_footprints_tgt=footprints,
+    )
+
+    template = da.empty(
+        out_shape, dtype=np.float64, chunks=(row_chunks, col_chunks)
+    )
+
+    return da.map_blocks(
+        bound_adapter,
+        template,
         dtype=np.float64,
         meta=np.array((), dtype=np.float64),
     )
diff --git a/xrspatial/tests/test_reproject.py b/xrspatial/tests/test_reproject.py
index cb2f384a..d72544d4 100644
--- a/xrspatial/tests/test_reproject.py
+++ b/xrspatial/tests/test_reproject.py
@@ -1186,6 +1186,32 @@ def test_dask_reproject_uses_map_blocks(self):
         graph = result.data.__dask_graph__()
         assert len(graph.layers) <= 3
 
+    def test_source_not_whole_array_dependency(self):
+        """Source dask array should not be a dependency of every output block.
+
+        When source_data is passed as a map_blocks kwarg, dask adds the
+        full source as a dependency of every output block -- this causes
+        MemoryError on distributed schedulers when the source exceeds
+        worker memory.  Using functools.partial avoids this.
+        """
+        from xrspatial.reproject import reproject
+        data = np.ones((64, 64), dtype=np.float64)
+        da_data = da.from_array(data, chunks=(32, 32))
+        src_name = da_data.name  # e.g. 'array-abc123'
+        raster = xr.DataArray(
+            da_data, dims=['y', 'x'],
+            coords={'y': np.linspace(55, 45, 64), 'x': np.linspace(-5, 5, 64)},
+            attrs={'crs': 'EPSG:4326', 'nodata': np.nan},
+        )
+        result = reproject(raster, 'EPSG:32633', chunk_size=32)
+        graph = result.data.__dask_graph__()
+        # The source array's layer should NOT be in the output graph's
+        # dependencies (it's captured in the function closure instead).
+        assert src_name not in graph.layers, (
+            f"source array '{src_name}' should not be a graph layer "
+            f"dependency -- use functools.partial to bind it"
+        )
+
     def test_dask_reproject_matches_numpy(self):
         """Dask map_blocks path should produce same values as numpy."""
         from xrspatial.reproject import reproject

From a5052a4c79d75e8a6c6a43bc36a7797243b36ef7 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Mon, 23 Mar 2026 20:10:36 -0700
Subject: [PATCH 3/3] Fix PROJ thread-safety crash in test_empty_chunk_skipping

The test creates many concurrent dask chunks that all call
pyproj.CRS.__init__ simultaneously via the threaded scheduler.
PROJ's C library is not fully thread-safe on macOS, causing SIGABRT.

Use dask synchronous scheduler for this test since we're verifying
empty-chunk correctness, not concurrent execution.
---
 xrspatial/tests/test_reproject.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/xrspatial/tests/test_reproject.py b/xrspatial/tests/test_reproject.py
index d72544d4..303e9365 100644
--- a/xrspatial/tests/test_reproject.py
+++ b/xrspatial/tests/test_reproject.py
@@ -1246,6 +1246,8 @@ def test_dask_reproject_matches_numpy(self):
     def test_empty_chunk_skipping(self):
         """Chunks outside the source footprint should be nodata-filled
         without touching pyproj."""
+        import dask
+
         from xrspatial.reproject import reproject
         # Small raster in a corner of the output grid
         data = np.ones((16, 16), dtype=np.float64) * 42.0
@@ -1256,10 +1258,13 @@ def test_empty_chunk_skipping(self):
                     'x': np.linspace(10.0, 10.1, 16)},
             attrs={'crs': 'EPSG:4326', 'nodata': np.nan},
         )
-        # Force a large output grid with small chunks so many are empty
-        result = reproject(raster, 'EPSG:32633', chunk_size=64,
-                           width=256, height=256)
-        vals = result.values
+        # Force a large output grid with small chunks so many are empty.
+        # Use synchronous scheduler to avoid PROJ C library thread-safety
+        # crashes on macOS when many chunks call pyproj.CRS concurrently.
+        with dask.config.set(scheduler='synchronous'):
+            result = reproject(raster, 'EPSG:32633', chunk_size=64,
+                               width=256, height=256)
+            vals = result.values
         # Should have some valid pixels and some NaN (empty chunks)
         assert np.any(np.isfinite(vals))
         assert np.any(np.isnan(vals))