diff --git a/benchmarks/RASTERIZER_BENCHMARKS.md b/benchmarks/RASTERIZER_BENCHMARKS.md new file mode 100644 index 00000000..eb4b3dbb --- /dev/null +++ b/benchmarks/RASTERIZER_BENCHMARKS.md @@ -0,0 +1,600 @@ +# Rasterizer Benchmarks + +Generated: 2026-03-14 04:21 UTC + +Compares xarray-spatial (numpy and cupy backends) against datashader, geocube, and rasterio across 10 geometry types, 3 feature counts (50/200/1000), and 5 output resolutions (100-4000 px wide). + +- **Polygon types** (circles, irregular, rectangles, stars, donuts, multipolygons): all 5 rasterizers compared +- **Line types** (lines, multilines): xrspatial, geocube, rasterio (datashader uses a different API for lines) +- **Point types** (points, multipoints): xrspatial, geocube, rasterio (datashader uses a different API for points) + +## Circles (64 vertices) + +### Timings + +| n | size | xrs-numpy | xrs-cupy | datashader | geocube | rasterio | +|---:|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 0.59 ms | 1.64 ms | 0.42 ms | 3.62 ms | 1.16 ms | +| 50 | 500x250 | 0.83 ms | 1.57 ms | 1.30 ms | 4.67 ms | 1.50 ms | +| 50 | 1000x500 | 1.97 ms | 1.73 ms | 3.83 ms | 7.10 ms | 3.10 ms | +| 50 | 2000x1000 | 7.35 ms | 2.07 ms | 13.7 ms | 18.0 ms | 10.1 ms | +| 50 | 4000x2000 | 49.8 ms | 2.86 ms | 65.5 ms | 94.2 ms | 59.9 ms | +| 200 | 100x50 | 0.98 ms | 1.95 ms | 0.60 ms | 7.18 ms | 4.15 ms | +| 200 | 500x250 | 1.47 ms | 2.61 ms | 3.79 ms | 7.69 ms | 4.49 ms | +| 200 | 1000x500 | 2.49 ms | 2.91 ms | 12.9 ms | 10.2 ms | 6.12 ms | +| 200 | 2000x1000 | 8.65 ms | 3.80 ms | 48.9 ms | 19.5 ms | 13.9 ms | +| 200 | 4000x2000 | 53.0 ms | 33.3 ms | 200 ms | 82.0 ms | 65.4 ms | +| 1000 | 100x50 | 4.61 ms | 9.58 ms | 1.93 ms | 32.9 ms | 21.5 ms | +| 1000 | 500x250 | 5.62 ms | 16.5 ms | 18.0 ms | 28.2 ms | 23.9 ms | +| 1000 | 1000x500 | 8.84 ms | 29.7 ms | 61.0 ms | 31.4 ms | 26.6 ms | +| 1000 | 2000x1000 | 17.9 ms | 54.3 ms | 230 ms | 45.5 ms | 37.8 ms | +| 1000 | 4000x2000 | 79.9 ms | 39.6 ms | 912 ms | 142 ms | 106 
ms | + +### Relative to xrs-numpy + +Values below 1.0 mean the competitor is faster than xrs-numpy. + +| n | size | xrs-cupy | datashader | geocube | rasterio | +|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 2.76x | 0.70x | 6.10x | 1.94x | +| 50 | 500x250 | 1.89x | 1.56x | 5.63x | 1.80x | +| 50 | 1000x500 | 0.88x | 1.94x | 3.60x | 1.57x | +| 50 | 2000x1000 | 0.28x | 1.87x | 2.45x | 1.37x | +| 50 | 4000x2000 | 0.06x | 1.32x | 1.89x | 1.20x | +| 200 | 100x50 | 2.00x | 0.62x | 7.36x | 4.25x | +| 200 | 500x250 | 1.78x | 2.58x | 5.23x | 3.05x | +| 200 | 1000x500 | 1.17x | 5.19x | 4.08x | 2.45x | +| 200 | 2000x1000 | 0.44x | 5.65x | 2.25x | 1.60x | +| 200 | 4000x2000 | 0.63x | 3.78x | 1.55x | 1.24x | +| 1000 | 100x50 | 2.08x | 0.42x | 7.14x | 4.67x | +| 1000 | 500x250 | 2.94x | 3.21x | 5.02x | 4.25x | +| 1000 | 1000x500 | 3.36x | 6.90x | 3.55x | 3.00x | +| 1000 | 2000x1000 | 3.04x | 12.85x | 2.54x | 2.11x | +| 1000 | 4000x2000 | 0.49x | 11.41x | 1.78x | 1.32x | + +### Consistency + +| pair | IoU min | IoU max | RMSE range | +|:-----|--------:|--------:|-----------:| +| xrs-numpy vs datashader | 0.1463 | 0.9166 | 34.9 - 44.4 | +| xrs-numpy vs geocube | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | +| datashader vs geocube | 0.1463 | 0.9166 | 34.9 - 44.4 | +| datashader vs rasterio | 0.1463 | 0.9166 | 34.9 - 44.4 | +| datashader vs xrs-cupy | 0.1463 | 0.9166 | 34.9 - 44.4 | +| geocube vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| geocube vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | +| rasterio vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | + +## Irregular (128 vertices) + +### Timings + +| n | size | xrs-numpy | xrs-cupy | datashader | geocube | rasterio | +|---:|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 0.74 ms | 1.72 ms | 0.51 ms | 4.08 ms | 1.64 ms | +| 50 | 500x250 | 1.19 ms | 2.19 ms | 3.13 ms | 4.48 ms | 2.03 ms | +| 50 | 1000x500 | 1.97 ms | 2.27 ms | 10.1 ms | 
5.82 ms | 2.72 ms | +| 50 | 2000x1000 | 4.60 ms | 2.60 ms | 35.6 ms | 10.9 ms | 6.85 ms | +| 50 | 4000x2000 | 49.2 ms | 23.1 ms | 150 ms | 89.9 ms | 61.7 ms | +| 200 | 100x50 | 2.37 ms | 9.49 ms | 1.39 ms | 9.06 ms | 6.17 ms | +| 200 | 500x250 | 3.61 ms | 18.9 ms | 12.7 ms | 10.2 ms | 7.35 ms | +| 200 | 1000x500 | 5.83 ms | 18.5 ms | 43.8 ms | 13.8 ms | 9.82 ms | +| 200 | 2000x1000 | 13.4 ms | 20.8 ms | 158 ms | 27.7 ms | 18.0 ms | +| 200 | 4000x2000 | 68.5 ms | 47.1 ms | 640 ms | 111 ms | 77.6 ms | +| 1000 | 100x50 | 9.53 ms | 15.5 ms | 5.77 ms | 37.7 ms | 34.2 ms | +| 1000 | 500x250 | 18.2 ms | 51.1 ms | 65.3 ms | 42.5 ms | 38.0 ms | +| 1000 | 1000x500 | 30.4 ms | 54.3 ms | 223 ms | 52.0 ms | 47.4 ms | +| 1000 | 2000x1000 | 60.7 ms | 45.3 ms | 813 ms | 79.6 ms | 63.5 ms | +| 1000 | 4000x2000 | 141 ms | 55.6 ms | 3095 ms | 169 ms | 140 ms | + +### Relative to xrs-numpy + +Values below 1.0 mean the competitor is faster than xrs-numpy. + +| n | size | xrs-cupy | datashader | geocube | rasterio | +|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 2.32x | 0.69x | 5.50x | 2.22x | +| 50 | 500x250 | 1.84x | 2.63x | 3.77x | 1.71x | +| 50 | 1000x500 | 1.15x | 5.12x | 2.95x | 1.38x | +| 50 | 2000x1000 | 0.57x | 7.73x | 2.36x | 1.49x | +| 50 | 4000x2000 | 0.47x | 3.05x | 1.83x | 1.25x | +| 200 | 100x50 | 4.01x | 0.59x | 3.83x | 2.61x | +| 200 | 500x250 | 5.23x | 3.52x | 2.83x | 2.03x | +| 200 | 1000x500 | 3.17x | 7.51x | 2.37x | 1.69x | +| 200 | 2000x1000 | 1.55x | 11.84x | 2.07x | 1.35x | +| 200 | 4000x2000 | 0.69x | 9.34x | 1.62x | 1.13x | +| 1000 | 100x50 | 1.63x | 0.61x | 3.96x | 3.59x | +| 1000 | 500x250 | 2.80x | 3.58x | 2.33x | 2.08x | +| 1000 | 1000x500 | 1.78x | 7.33x | 1.71x | 1.56x | +| 1000 | 2000x1000 | 0.75x | 13.40x | 1.31x | 1.05x | +| 1000 | 4000x2000 | 0.39x | 21.95x | 1.20x | 1.00x | + +### Consistency + +| pair | IoU min | IoU max | RMSE range | +|:-----|--------:|--------:|-----------:| +| xrs-numpy vs datashader | 0.2268 | 0.9321 | 34.7 - 42.3 | +| 
xrs-numpy vs geocube | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | +| datashader vs geocube | 0.2268 | 0.9321 | 34.7 - 42.3 | +| datashader vs rasterio | 0.2268 | 0.9321 | 34.7 - 42.3 | +| datashader vs xrs-cupy | 0.2268 | 0.9321 | 34.7 - 42.3 | +| geocube vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| geocube vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | +| rasterio vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | + +## Rectangles + +### Timings + +| n | size | xrs-numpy | xrs-cupy | datashader | geocube | rasterio | +|---:|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 0.41 ms | 1.34 ms | 0.27 ms | 3.22 ms | 0.73 ms | +| 50 | 500x250 | 0.57 ms | 1.56 ms | 0.39 ms | 3.45 ms | 0.88 ms | +| 50 | 1000x500 | 0.99 ms | 1.40 ms | 0.68 ms | 4.19 ms | 1.34 ms | +| 50 | 2000x1000 | 2.97 ms | 1.78 ms | 2.24 ms | 9.05 ms | 5.04 ms | +| 50 | 4000x2000 | 46.5 ms | 12.5 ms | 25.7 ms | 91.5 ms | 59.2 ms | +| 200 | 100x50 | 0.70 ms | 3.50 ms | 0.41 ms | 5.16 ms | 2.56 ms | +| 200 | 500x250 | 1.02 ms | 5.62 ms | 0.67 ms | 5.38 ms | 2.71 ms | +| 200 | 1000x500 | 1.76 ms | 8.05 ms | 1.66 ms | 6.38 ms | 3.33 ms | +| 200 | 2000x1000 | 6.02 ms | 21.7 ms | 6.15 ms | 17.5 ms | 8.44 ms | +| 200 | 4000x2000 | 54.0 ms | 47.0 ms | 39.4 ms | 98.0 ms | 64.3 ms | +| 1000 | 100x50 | 1.38 ms | 3.75 ms | 0.58 ms | 14.6 ms | 11.2 ms | +| 1000 | 500x250 | 3.34 ms | 7.17 ms | 2.19 ms | 15.4 ms | 18.6 ms | +| 1000 | 1000x500 | 12.8 ms | 8.03 ms | 6.47 ms | 17.3 ms | 13.4 ms | +| 1000 | 2000x1000 | 15.7 ms | 28.9 ms | 24.2 ms | 25.4 ms | 21.3 ms | +| 1000 | 4000x2000 | 79.5 ms | 36.1 ms | 107 ms | 116 ms | 90.2 ms | + +### Relative to xrs-numpy + +Values below 1.0 mean the competitor is faster than xrs-numpy. 
+ +| n | size | xrs-cupy | datashader | geocube | rasterio | +|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 3.24x | 0.66x | 7.78x | 1.77x | +| 50 | 500x250 | 2.76x | 0.69x | 6.08x | 1.56x | +| 50 | 1000x500 | 1.42x | 0.69x | 4.25x | 1.36x | +| 50 | 2000x1000 | 0.60x | 0.75x | 3.05x | 1.70x | +| 50 | 4000x2000 | 0.27x | 0.55x | 1.97x | 1.27x | +| 200 | 100x50 | 5.00x | 0.59x | 7.37x | 3.66x | +| 200 | 500x250 | 5.49x | 0.65x | 5.25x | 2.65x | +| 200 | 1000x500 | 4.58x | 0.94x | 3.62x | 1.89x | +| 200 | 2000x1000 | 3.60x | 1.02x | 2.91x | 1.40x | +| 200 | 4000x2000 | 0.87x | 0.73x | 1.82x | 1.19x | +| 1000 | 100x50 | 2.72x | 0.42x | 10.55x | 8.13x | +| 1000 | 500x250 | 2.14x | 0.65x | 4.62x | 5.57x | +| 1000 | 1000x500 | 0.63x | 0.51x | 1.35x | 1.05x | +| 1000 | 2000x1000 | 1.84x | 1.53x | 1.61x | 1.35x | +| 1000 | 4000x2000 | 0.45x | 1.34x | 1.46x | 1.13x | + +### Consistency + +| pair | IoU min | IoU max | RMSE range | +|:-----|--------:|--------:|-----------:| +| xrs-numpy vs datashader | 0.1280 | 0.9433 | 32.5 - 42.9 | +| xrs-numpy vs geocube | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | +| datashader vs geocube | 0.1280 | 0.9433 | 32.5 - 42.9 | +| datashader vs rasterio | 0.1280 | 0.9433 | 32.5 - 42.9 | +| datashader vs xrs-cupy | 0.1280 | 0.9433 | 32.5 - 42.9 | +| geocube vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| geocube vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | +| rasterio vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | + +## Stars (5-point, concave) + +### Timings + +| n | size | xrs-numpy | xrs-cupy | datashader | geocube | rasterio | +|---:|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 0.45 ms | 1.39 ms | 0.29 ms | 3.07 ms | 0.74 ms | +| 50 | 500x250 | 0.63 ms | 1.42 ms | 0.66 ms | 3.33 ms | 0.92 ms | +| 50 | 1000x500 | 1.18 ms | 1.59 ms | 1.52 ms | 4.52 ms | 1.43 ms | +| 50 | 2000x1000 | 3.01 ms | 1.71 ms | 4.95 ms | 11.9 ms | 6.22 ms | +| 50 | 
4000x2000 | 50.7 ms | 9.68 ms | 34.9 ms | 94.3 ms | 59.8 ms | +| 200 | 100x50 | 0.78 ms | 6.69 ms | 0.47 ms | 5.33 ms | 2.58 ms | +| 200 | 500x250 | 1.49 ms | 7.82 ms | 1.99 ms | 5.60 ms | 3.15 ms | +| 200 | 1000x500 | 2.28 ms | 8.98 ms | 5.49 ms | 6.95 ms | 3.70 ms | +| 200 | 2000x1000 | 11.0 ms | 16.5 ms | 18.6 ms | 20.4 ms | 9.60 ms | +| 200 | 4000x2000 | 54.5 ms | 35.5 ms | 85.6 ms | 119 ms | 66.0 ms | +| 1000 | 100x50 | 1.89 ms | 13.3 ms | 1.16 ms | 15.9 ms | 24.4 ms | +| 1000 | 500x250 | 5.40 ms | 28.0 ms | 8.12 ms | 17.0 ms | 12.6 ms | +| 1000 | 1000x500 | 10.0 ms | 21.6 ms | 25.2 ms | 20.6 ms | 15.3 ms | +| 1000 | 2000x1000 | 23.2 ms | 20.7 ms | 84.7 ms | 31.1 ms | 24.1 ms | +| 1000 | 4000x2000 | 93.4 ms | 27.9 ms | 329 ms | 124 ms | 84.0 ms | + +### Relative to xrs-numpy + +Values below 1.0 mean the competitor is faster than xrs-numpy. + +| n | size | xrs-cupy | datashader | geocube | rasterio | +|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 3.10x | 0.64x | 6.85x | 1.65x | +| 50 | 500x250 | 2.26x | 1.06x | 5.30x | 1.46x | +| 50 | 1000x500 | 1.35x | 1.29x | 3.84x | 1.22x | +| 50 | 2000x1000 | 0.57x | 1.65x | 3.95x | 2.07x | +| 50 | 4000x2000 | 0.19x | 0.69x | 1.86x | 1.18x | +| 200 | 100x50 | 8.52x | 0.60x | 6.79x | 3.29x | +| 200 | 500x250 | 5.26x | 1.34x | 3.77x | 2.12x | +| 200 | 1000x500 | 3.95x | 2.41x | 3.05x | 1.62x | +| 200 | 2000x1000 | 1.50x | 1.69x | 1.85x | 0.87x | +| 200 | 4000x2000 | 0.65x | 1.57x | 2.18x | 1.21x | +| 1000 | 100x50 | 7.04x | 0.61x | 8.38x | 12.88x | +| 1000 | 500x250 | 5.19x | 1.50x | 3.16x | 2.33x | +| 1000 | 1000x500 | 2.15x | 2.51x | 2.06x | 1.53x | +| 1000 | 2000x1000 | 0.89x | 3.64x | 1.34x | 1.03x | +| 1000 | 4000x2000 | 0.30x | 3.52x | 1.33x | 0.90x | + +### Consistency + +| pair | IoU min | IoU max | RMSE range | +|:-----|--------:|--------:|-----------:| +| xrs-numpy vs datashader | 0.1084 | 0.9001 | 25.3 - 41.2 | +| xrs-numpy vs geocube | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs rasterio | 1.0000 | 1.0000 | 
0.0 - 0.0 | +| xrs-numpy vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | +| datashader vs geocube | 0.1084 | 0.9001 | 25.3 - 41.2 | +| datashader vs rasterio | 0.1084 | 0.9001 | 25.3 - 41.2 | +| datashader vs xrs-cupy | 0.1084 | 0.9001 | 25.3 - 41.2 | +| geocube vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| geocube vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | +| rasterio vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | + +## Donuts (polygon + hole) + +### Timings + +| n | size | xrs-numpy | xrs-cupy | datashader | geocube | rasterio | +|---:|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 0.70 ms | 1.65 ms | 0.52 ms | 4.51 ms | 1.94 ms | +| 50 | 500x250 | 1.04 ms | 1.98 ms | 3.77 ms | 4.99 ms | 2.32 ms | +| 50 | 1000x500 | 1.74 ms | 2.10 ms | 13.7 ms | 6.27 ms | 3.08 ms | +| 50 | 2000x1000 | 3.97 ms | 2.72 ms | 51.3 ms | 11.7 ms | 8.71 ms | +| 50 | 4000x2000 | 50.4 ms | 36.3 ms | 209 ms | 91.9 ms | 62.8 ms | +| 200 | 100x50 | 1.62 ms | 2.96 ms | 1.31 ms | 9.92 ms | 7.79 ms | +| 200 | 500x250 | 2.64 ms | 3.96 ms | 14.2 ms | 11.7 ms | 8.29 ms | +| 200 | 1000x500 | 3.82 ms | 4.70 ms | 50.6 ms | 13.9 ms | 11.3 ms | +| 200 | 2000x1000 | 10.0 ms | 21.7 ms | 192 ms | 27.8 ms | 18.1 ms | +| 200 | 4000x2000 | 60.9 ms | 36.4 ms | 762 ms | 117 ms | 78.3 ms | +| 1000 | 100x50 | 7.91 ms | 11.7 ms | 5.38 ms | 44.9 ms | 40.6 ms | +| 1000 | 500x250 | 13.2 ms | 34.4 ms | 72.9 ms | 48.9 ms | 43.3 ms | +| 1000 | 1000x500 | 19.6 ms | 47.2 ms | 261 ms | 56.1 ms | 53.3 ms | +| 1000 | 2000x1000 | 39.7 ms | 39.1 ms | 991 ms | 78.9 ms | 68.6 ms | +| 1000 | 4000x2000 | 97.8 ms | 49.1 ms | 3861 ms | 178 ms | 149 ms | + +### Relative to xrs-numpy + +Values below 1.0 mean the competitor is faster than xrs-numpy. 
+ +| n | size | xrs-cupy | datashader | geocube | rasterio | +|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 2.34x | 0.74x | 6.40x | 2.76x | +| 50 | 500x250 | 1.91x | 3.64x | 4.82x | 2.24x | +| 50 | 1000x500 | 1.21x | 7.89x | 3.60x | 1.77x | +| 50 | 2000x1000 | 0.69x | 12.93x | 2.96x | 2.20x | +| 50 | 4000x2000 | 0.72x | 4.14x | 1.82x | 1.24x | +| 200 | 100x50 | 1.82x | 0.80x | 6.10x | 4.79x | +| 200 | 500x250 | 1.50x | 5.36x | 4.43x | 3.14x | +| 200 | 1000x500 | 1.23x | 13.24x | 3.63x | 2.96x | +| 200 | 2000x1000 | 2.17x | 19.18x | 2.78x | 1.81x | +| 200 | 4000x2000 | 0.60x | 12.52x | 1.92x | 1.29x | +| 1000 | 100x50 | 1.48x | 0.68x | 5.68x | 5.13x | +| 1000 | 500x250 | 2.61x | 5.54x | 3.71x | 3.29x | +| 1000 | 1000x500 | 2.41x | 13.33x | 2.86x | 2.72x | +| 1000 | 2000x1000 | 0.99x | 25.00x | 1.99x | 1.73x | +| 1000 | 4000x2000 | 0.50x | 39.47x | 1.82x | 1.52x | + +### Consistency + +| pair | IoU min | IoU max | RMSE range | +|:-----|--------:|--------:|-----------:| +| xrs-numpy vs datashader | 0.2476 | 0.9283 | 39.4 - 43.9 | +| xrs-numpy vs geocube | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | +| datashader vs geocube | 0.2476 | 0.9283 | 39.4 - 43.9 | +| datashader vs rasterio | 0.2476 | 0.9283 | 39.4 - 43.9 | +| datashader vs xrs-cupy | 0.2476 | 0.9283 | 39.4 - 43.9 | +| geocube vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| geocube vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | +| rasterio vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | + +## MultiPolygons (2-4 parts) + +### Timings + +| n | size | xrs-numpy | xrs-cupy | datashader | geocube | rasterio | +|---:|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 0.64 ms | 1.61 ms | 0.44 ms | 4.66 ms | 2.19 ms | +| 50 | 500x250 | 0.95 ms | 1.84 ms | 1.00 ms | 5.82 ms | 2.49 ms | +| 50 | 1000x500 | 1.44 ms | 1.87 ms | 2.36 ms | 6.22 ms | 3.07 ms | +| 50 | 2000x1000 | 4.10 ms | 4.39 ms | 7.39 ms | 12.2 ms | 7.27 ms | +| 50 
| 4000x2000 | 46.9 ms | 11.3 ms | 43.1 ms | 95.3 ms | 61.7 ms | +| 200 | 100x50 | 1.38 ms | 5.43 ms | 0.69 ms | 11.1 ms | 8.37 ms | +| 200 | 500x250 | 2.14 ms | 9.03 ms | 2.50 ms | 12.6 ms | 17.8 ms | +| 200 | 1000x500 | 3.37 ms | 12.9 ms | 7.18 ms | 13.2 ms | 10.5 ms | +| 200 | 2000x1000 | 7.24 ms | 20.7 ms | 22.9 ms | 22.1 ms | 22.2 ms | +| 200 | 4000x2000 | 29.0 ms | 31.9 ms | 101 ms | 85.9 ms | 61.9 ms | +| 1000 | 100x50 | 6.15 ms | 16.6 ms | 2.29 ms | 49.7 ms | 44.7 ms | +| 1000 | 500x250 | 11.9 ms | 34.9 ms | 11.6 ms | 59.3 ms | 47.0 ms | +| 1000 | 1000x500 | 14.2 ms | 33.5 ms | 33.7 ms | 57.2 ms | 57.4 ms | +| 1000 | 2000x1000 | 25.5 ms | 40.2 ms | 112 ms | 64.7 ms | 60.3 ms | +| 1000 | 4000x2000 | 64.4 ms | 53.5 ms | 428 ms | 143 ms | 120 ms | + +### Relative to xrs-numpy + +Values below 1.0 mean the competitor is faster than xrs-numpy. + +| n | size | xrs-cupy | datashader | geocube | rasterio | +|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 2.52x | 0.70x | 7.28x | 3.43x | +| 50 | 500x250 | 1.93x | 1.05x | 6.12x | 2.62x | +| 50 | 1000x500 | 1.30x | 1.63x | 4.31x | 2.13x | +| 50 | 2000x1000 | 1.07x | 1.80x | 2.99x | 1.77x | +| 50 | 4000x2000 | 0.24x | 0.92x | 2.03x | 1.31x | +| 200 | 100x50 | 3.94x | 0.50x | 8.07x | 6.08x | +| 200 | 500x250 | 4.22x | 1.17x | 5.88x | 8.31x | +| 200 | 1000x500 | 3.84x | 2.13x | 3.93x | 3.12x | +| 200 | 2000x1000 | 2.86x | 3.16x | 3.05x | 3.06x | +| 200 | 4000x2000 | 1.10x | 3.47x | 2.96x | 2.13x | +| 1000 | 100x50 | 2.69x | 0.37x | 8.08x | 7.26x | +| 1000 | 500x250 | 2.92x | 0.98x | 4.97x | 3.93x | +| 1000 | 1000x500 | 2.35x | 2.37x | 4.02x | 4.03x | +| 1000 | 2000x1000 | 1.57x | 4.38x | 2.54x | 2.36x | +| 1000 | 4000x2000 | 0.83x | 6.64x | 2.22x | 1.87x | + +### Consistency + +| pair | IoU min | IoU max | RMSE range | +|:-----|--------:|--------:|-----------:| +| xrs-numpy vs datashader | 0.1543 | 0.9513 | 34.9 - 40.9 | +| xrs-numpy vs geocube | 0.9961 | 1.0000 | 0.0 - 1.2 | +| xrs-numpy vs rasterio | 0.9961 | 1.0000 | 
0.0 - 1.2 | +| xrs-numpy vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 0.0 | +| datashader vs geocube | 0.1547 | 0.9518 | 35.0 - 40.9 | +| datashader vs rasterio | 0.1547 | 0.9518 | 35.0 - 40.9 | +| datashader vs xrs-cupy | 0.1543 | 0.9513 | 34.9 - 40.9 | +| geocube vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| geocube vs xrs-cupy | 0.9961 | 1.0000 | 0.0 - 1.2 | +| rasterio vs xrs-cupy | 0.9961 | 1.0000 | 0.0 - 1.2 | + +## LineStrings + +### Timings + +| n | size | xrs-numpy | xrs-cupy | geocube | rasterio | +|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 0.53 ms | 1.51 ms | 2.94 ms | 0.33 ms | +| 50 | 500x250 | 0.56 ms | 1.43 ms | 3.05 ms | 0.45 ms | +| 50 | 1000x500 | 0.81 ms | 1.49 ms | 3.95 ms | 0.92 ms | +| 50 | 2000x1000 | 3.97 ms | 1.59 ms | 14.2 ms | 5.24 ms | +| 50 | 4000x2000 | 45.7 ms | 7.66 ms | 91.1 ms | 59.2 ms | +| 200 | 100x50 | 0.57 ms | 1.67 ms | 3.46 ms | 1.00 ms | +| 200 | 500x250 | 0.71 ms | 1.95 ms | 3.65 ms | 1.11 ms | +| 200 | 1000x500 | 1.02 ms | 2.62 ms | 4.60 ms | 1.67 ms | +| 200 | 2000x1000 | 2.65 ms | 3.68 ms | 14.9 ms | 6.45 ms | +| 200 | 4000x2000 | 46.8 ms | 12.2 ms | 93.8 ms | 61.1 ms | +| 1000 | 100x50 | 1.27 ms | 2.61 ms | 8.00 ms | 4.26 ms | +| 1000 | 500x250 | 1.46 ms | 2.67 ms | 17.4 ms | 4.42 ms | +| 1000 | 1000x500 | 1.90 ms | 3.67 ms | 9.71 ms | 12.9 ms | +| 1000 | 2000x1000 | 4.13 ms | 6.53 ms | 16.3 ms | 11.1 ms | +| 1000 | 4000x2000 | 48.5 ms | 12.8 ms | 89.5 ms | 64.5 ms | + +### Relative to xrs-numpy + +Values below 1.0 mean the competitor is faster than xrs-numpy. 
+ +| n | size | xrs-cupy | geocube | rasterio | +|---:|---:|---:|---:|---:| +| 50 | 100x50 | 2.83x | 5.52x | 0.62x | +| 50 | 500x250 | 2.54x | 5.41x | 0.79x | +| 50 | 1000x500 | 1.84x | 4.88x | 1.13x | +| 50 | 2000x1000 | 0.40x | 3.58x | 1.32x | +| 50 | 4000x2000 | 0.17x | 1.99x | 1.29x | +| 200 | 100x50 | 2.93x | 6.08x | 1.75x | +| 200 | 500x250 | 2.73x | 5.10x | 1.55x | +| 200 | 1000x500 | 2.56x | 4.51x | 1.64x | +| 200 | 2000x1000 | 1.39x | 5.62x | 2.43x | +| 200 | 4000x2000 | 0.26x | 2.00x | 1.31x | +| 1000 | 100x50 | 2.06x | 6.32x | 3.37x | +| 1000 | 500x250 | 1.83x | 11.92x | 3.03x | +| 1000 | 1000x500 | 1.93x | 5.11x | 6.76x | +| 1000 | 2000x1000 | 1.58x | 3.96x | 2.69x | +| 1000 | 4000x2000 | 0.26x | 1.85x | 1.33x | + +### Consistency + +| pair | IoU min | IoU max | RMSE range | +|:-----|--------:|--------:|-----------:| +| xrs-numpy vs geocube | 0.3514 | 0.9255 | 2.6 - 23.7 | +| xrs-numpy vs rasterio | 0.3514 | 0.9255 | 2.6 - 23.7 | +| xrs-numpy vs xrs-cupy | 1.0000 | 1.0000 | 1.4 - 32.4 | +| geocube vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| geocube vs xrs-cupy | 0.3514 | 0.9255 | 2.7 - 34.7 | +| rasterio vs xrs-cupy | 0.3514 | 0.9255 | 2.7 - 34.7 | + +## MultiLineStrings + +### Timings + +| n | size | xrs-numpy | xrs-cupy | geocube | rasterio | +|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 0.49 ms | 2.75 ms | 3.34 ms | 0.90 ms | +| 50 | 500x250 | 0.64 ms | 1.80 ms | 3.50 ms | 1.06 ms | +| 50 | 1000x500 | 0.90 ms | 1.79 ms | 4.30 ms | 1.52 ms | +| 50 | 2000x1000 | 3.00 ms | 4.34 ms | 15.4 ms | 5.90 ms | +| 50 | 4000x2000 | 45.1 ms | 11.3 ms | 86.7 ms | 58.0 ms | +| 200 | 100x50 | 0.70 ms | 1.82 ms | 6.10 ms | 3.26 ms | +| 200 | 500x250 | 0.84 ms | 2.51 ms | 6.08 ms | 3.58 ms | +| 200 | 1000x500 | 1.20 ms | 2.14 ms | 7.59 ms | 4.21 ms | +| 200 | 2000x1000 | 3.04 ms | 4.89 ms | 18.1 ms | 8.61 ms | +| 200 | 4000x2000 | 45.8 ms | 12.9 ms | 97.8 ms | 80.2 ms | +| 1000 | 100x50 | 2.26 ms | 3.04 ms | 26.1 ms | 15.2 ms | +| 1000 | 500x250 | 2.09 ms | 3.27 
ms | 19.9 ms | 15.6 ms | +| 1000 | 1000x500 | 10.1 ms | 4.33 ms | 32.0 ms | 17.5 ms | +| 1000 | 2000x1000 | 6.61 ms | 5.32 ms | 27.9 ms | 23.3 ms | +| 1000 | 4000x2000 | 61.9 ms | 13.9 ms | 142 ms | 82.3 ms | + +### Relative to xrs-numpy + +Values below 1.0 mean the competitor is faster than xrs-numpy. + +| n | size | xrs-cupy | geocube | rasterio | +|---:|---:|---:|---:|---:| +| 50 | 100x50 | 5.64x | 6.84x | 1.85x | +| 50 | 500x250 | 2.81x | 5.47x | 1.66x | +| 50 | 1000x500 | 1.99x | 4.78x | 1.69x | +| 50 | 2000x1000 | 1.45x | 5.14x | 1.97x | +| 50 | 4000x2000 | 0.25x | 1.92x | 1.29x | +| 200 | 100x50 | 2.60x | 8.68x | 4.64x | +| 200 | 500x250 | 2.98x | 7.23x | 4.25x | +| 200 | 1000x500 | 1.79x | 6.34x | 3.52x | +| 200 | 2000x1000 | 1.61x | 5.97x | 2.84x | +| 200 | 4000x2000 | 0.28x | 2.13x | 1.75x | +| 1000 | 100x50 | 1.34x | 11.53x | 6.70x | +| 1000 | 500x250 | 1.57x | 9.54x | 7.49x | +| 1000 | 1000x500 | 0.43x | 3.17x | 1.73x | +| 1000 | 2000x1000 | 0.81x | 4.22x | 3.53x | +| 1000 | 4000x2000 | 0.22x | 2.29x | 1.33x | + +### Consistency + +| pair | IoU min | IoU max | RMSE range | +|:-----|--------:|--------:|-----------:| +| xrs-numpy vs geocube | 0.3541 | 0.9586 | 3.4 - 23.9 | +| xrs-numpy vs rasterio | 0.3541 | 0.9586 | 3.4 - 23.9 | +| xrs-numpy vs xrs-cupy | 1.0000 | 1.0000 | 2.0 - 33.5 | +| geocube vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| geocube vs xrs-cupy | 0.3541 | 0.9586 | 3.8 - 35.1 | +| rasterio vs xrs-cupy | 0.3541 | 0.9586 | 3.8 - 35.1 | + +## Points + +### Timings + +| n | size | xrs-numpy | xrs-cupy | geocube | rasterio | +|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 0.51 ms | 1.62 ms | 2.69 ms | 0.28 ms | +| 50 | 500x250 | 0.43 ms | 1.70 ms | 2.90 ms | 0.40 ms | +| 50 | 1000x500 | 0.75 ms | 2.41 ms | 3.91 ms | 0.85 ms | +| 50 | 2000x1000 | 2.87 ms | 4.29 ms | 7.83 ms | 3.72 ms | +| 50 | 4000x2000 | 45.4 ms | 12.9 ms | 85.2 ms | 55.5 ms | +| 200 | 100x50 | 0.39 ms | 1.43 ms | 3.15 ms | 0.66 ms | +| 200 | 500x250 | 0.49 ms | 1.99 ms | 
3.74 ms | 0.78 ms | +| 200 | 1000x500 | 0.84 ms | 1.63 ms | 4.47 ms | 1.34 ms | +| 200 | 2000x1000 | 3.03 ms | 3.40 ms | 9.55 ms | 4.47 ms | +| 200 | 4000x2000 | 45.2 ms | 11.3 ms | 91.2 ms | 61.6 ms | +| 1000 | 100x50 | 0.66 ms | 1.55 ms | 5.96 ms | 2.90 ms | +| 1000 | 500x250 | 0.75 ms | 1.58 ms | 5.95 ms | 3.25 ms | +| 1000 | 1000x500 | 1.06 ms | 2.00 ms | 6.95 ms | 3.76 ms | +| 1000 | 2000x1000 | 3.30 ms | 4.52 ms | 12.5 ms | 8.73 ms | +| 1000 | 4000x2000 | 45.3 ms | 12.9 ms | 93.8 ms | 63.3 ms | + +### Relative to xrs-numpy + +Values below 1.0 mean the competitor is faster than xrs-numpy. + +| n | size | xrs-cupy | geocube | rasterio | +|---:|---:|---:|---:|---:| +| 50 | 100x50 | 3.19x | 5.30x | 0.56x | +| 50 | 500x250 | 3.94x | 6.72x | 0.92x | +| 50 | 1000x500 | 3.20x | 5.18x | 1.12x | +| 50 | 2000x1000 | 1.49x | 2.73x | 1.29x | +| 50 | 4000x2000 | 0.28x | 1.87x | 1.22x | +| 200 | 100x50 | 3.70x | 8.17x | 1.71x | +| 200 | 500x250 | 4.09x | 7.70x | 1.62x | +| 200 | 1000x500 | 1.94x | 5.33x | 1.60x | +| 200 | 2000x1000 | 1.12x | 3.15x | 1.48x | +| 200 | 4000x2000 | 0.25x | 2.02x | 1.36x | +| 1000 | 100x50 | 2.34x | 8.98x | 4.38x | +| 1000 | 500x250 | 2.11x | 7.96x | 4.35x | +| 1000 | 1000x500 | 1.89x | 6.57x | 3.55x | +| 1000 | 2000x1000 | 1.37x | 3.78x | 2.64x | +| 1000 | 4000x2000 | 0.28x | 2.07x | 1.40x | + +### Consistency + +| pair | IoU min | IoU max | RMSE range | +|:-----|--------:|--------:|-----------:| +| xrs-numpy vs geocube | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 11.3 | +| geocube vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| geocube vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 11.3 | +| rasterio vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 11.3 | + +## MultiPoints (3-8 pts) + +### Timings + +| n | size | xrs-numpy | xrs-cupy | geocube | rasterio | +|---:|---:|---:|---:|---:|---:| +| 50 | 100x50 | 0.35 ms | 1.24 ms | 3.57 ms | 1.14 ms | +| 50 | 500x250 | 0.48 ms 
| 1.65 ms | 3.84 ms | 1.26 ms | +| 50 | 1000x500 | 0.74 ms | 1.67 ms | 4.60 ms | 1.69 ms | +| 50 | 2000x1000 | 2.52 ms | 4.16 ms | 13.5 ms | 5.92 ms | +| 50 | 4000x2000 | 44.6 ms | 9.95 ms | 93.4 ms | 60.2 ms | +| 200 | 100x50 | 0.45 ms | 3.51 ms | 7.00 ms | 4.67 ms | +| 200 | 500x250 | 0.59 ms | 1.37 ms | 7.36 ms | 4.74 ms | +| 200 | 1000x500 | 0.86 ms | 1.54 ms | 8.31 ms | 5.10 ms | +| 200 | 2000x1000 | 2.73 ms | 3.52 ms | 18.4 ms | 9.61 ms | +| 200 | 4000x2000 | 44.4 ms | 10.1 ms | 98.8 ms | 63.7 ms | +| 1000 | 100x50 | 1.00 ms | 2.50 ms | 24.7 ms | 21.0 ms | +| 1000 | 500x250 | 1.12 ms | 2.27 ms | 24.8 ms | 21.6 ms | +| 1000 | 1000x500 | 7.26 ms | 9.20 ms | 26.4 ms | 22.7 ms | +| 1000 | 2000x1000 | 3.44 ms | 6.28 ms | 36.1 ms | 28.6 ms | +| 1000 | 4000x2000 | 46.9 ms | 13.2 ms | 106 ms | 78.6 ms | + +### Relative to xrs-numpy + +Values below 1.0 mean the competitor is faster than xrs-numpy. + +| n | size | xrs-cupy | geocube | rasterio | +|---:|---:|---:|---:|---:| +| 50 | 100x50 | 3.57x | 10.29x | 3.28x | +| 50 | 500x250 | 3.44x | 8.01x | 2.64x | +| 50 | 1000x500 | 2.25x | 6.21x | 2.28x | +| 50 | 2000x1000 | 1.65x | 5.34x | 2.34x | +| 50 | 4000x2000 | 0.22x | 2.09x | 1.35x | +| 200 | 100x50 | 7.80x | 15.53x | 10.36x | +| 200 | 500x250 | 2.31x | 12.39x | 7.98x | +| 200 | 1000x500 | 1.77x | 9.60x | 5.89x | +| 200 | 2000x1000 | 1.29x | 6.74x | 3.51x | +| 200 | 4000x2000 | 0.23x | 2.22x | 1.43x | +| 1000 | 100x50 | 2.50x | 24.63x | 20.99x | +| 1000 | 500x250 | 2.03x | 22.14x | 19.34x | +| 1000 | 1000x500 | 1.27x | 3.63x | 3.12x | +| 1000 | 2000x1000 | 1.83x | 10.49x | 8.33x | +| 1000 | 4000x2000 | 0.28x | 2.27x | 1.68x | + +### Consistency + +| pair | IoU min | IoU max | RMSE range | +|:-----|--------:|--------:|-----------:| +| xrs-numpy vs geocube | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| xrs-numpy vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 23.4 | +| geocube vs rasterio | 1.0000 | 1.0000 | 0.0 - 0.0 | +| geocube vs 
xrs-cupy | 1.0000 | 1.0000 | 0.0 - 23.4 | +| rasterio vs xrs-cupy | 1.0000 | 1.0000 | 0.0 - 23.4 | + +## Where xrs-numpy is slower + +Top 10 configurations where another rasterizer beats xrs-numpy. + +| # | geometry | n | size | faster lib | xrs-numpy | other | xrs slower by | +|--:|:---------|--:|-----:|:-----------|----------:|------:|--------------:| +| 1 | multipolygons | 1000 | 100x50 | datashader | 6.2 ms | 2.3 ms | 2.7x | +| 2 | circles_64v | 1000 | 100x50 | datashader | 4.6 ms | 1.9 ms | 2.4x | +| 3 | rectangles | 1000 | 100x50 | datashader | 1.4 ms | 0.6 ms | 2.4x | +| 4 | multipolygons | 200 | 100x50 | datashader | 1.4 ms | 0.7 ms | 2.0x | +| 5 | rectangles | 1000 | 1000x500 | datashader | 12.8 ms | 6.5 ms | 2.0x | +| 6 | rectangles | 50 | 4000x2000 | datashader | 46.5 ms | 25.7 ms | 1.8x | +| 7 | points | 50 | 100x50 | rasterio | 0.5 ms | 0.3 ms | 1.8x | +| 8 | irregular_128v | 200 | 100x50 | datashader | 2.4 ms | 1.4 ms | 1.7x | +| 9 | rectangles | 200 | 100x50 | datashader | 0.7 ms | 0.4 ms | 1.7x | +| 10 | stars_5pt | 200 | 100x50 | datashader | 0.8 ms | 0.5 ms | 1.7x | diff --git a/benchmarks/rasterizer_benchmarks.py b/benchmarks/rasterizer_benchmarks.py new file mode 100644 index 00000000..7f122169 --- /dev/null +++ b/benchmarks/rasterizer_benchmarks.py @@ -0,0 +1,720 @@ +""" +Benchmark: xarray-spatial vs datashader vs geocube vs rasterio rasterization. + +Compares wall-clock time and output consistency across a range of geometry +types, output resolutions, and feature counts. All libraries rasterize +the same geometries into the same pixel grid. 
+ +Usage: + python benchmarks/rasterizer_benchmarks.py + +Outputs: + - Console: timing tables, consistency checks, top-10 regressions + - benchmarks/RASTERIZER_BENCHMARKS.md: formatted markdown report +""" + +import json +import os +import time +from collections import defaultdict +from datetime import datetime, timezone + +import numpy as np +import geopandas as gpd +import spatialpandas +from rasterio.features import rasterize as rio_rasterize +from rasterio.transform import from_bounds +from shapely.geometry import ( + Point, Polygon, MultiPolygon, LineString, MultiLineString, MultiPoint, + box, +) + +import datashader as ds +from geocube.api.core import make_geocube +from xrspatial.rasterize import rasterize as xrs_rasterize + + +# --------------------------------------------------------------------------- +# Geometry generators +# --------------------------------------------------------------------------- + +def make_circles(n, rng): + """Random buffered points (64-vertex circles).""" + cx = rng.uniform(-150, 150, n) + cy = rng.uniform(-70, 70, n) + radii = rng.uniform(2, 15, n) + geoms = [Point(x, y).buffer(r, resolution=16) + for x, y, r in zip(cx, cy, radii)] + vals = rng.uniform(1, 100, n) + return geoms, vals + + +def make_irregular(n, rng, n_verts=128): + """Irregular star-like polygons with many vertices.""" + geoms = [] + vals = rng.uniform(1, 100, n) + for _ in range(n): + cx = rng.uniform(-150, 150) + cy = rng.uniform(-70, 70) + r = rng.uniform(5, 20) + angles = np.sort(rng.uniform(0, 2 * np.pi, n_verts)) + radii = r * (0.7 + 0.3 * rng.uniform(size=n_verts)) + xs = cx + radii * np.cos(angles) + ys = cy + radii * np.sin(angles) + coords = list(zip(xs, ys)) + coords.append(coords[0]) + geoms.append(Polygon(coords)) + return geoms, vals + + +def make_rectangles(n, rng): + """Axis-aligned rectangles.""" + geoms = [] + vals = rng.uniform(1, 100, n) + for _ in range(n): + cx = rng.uniform(-150, 150) + cy = rng.uniform(-70, 70) + hw = rng.uniform(2, 20) + 
hh = rng.uniform(2, 15) + geoms.append(box(cx - hw, cy - hh, cx + hw, cy + hh)) + return geoms, vals + + +def make_stars(n, rng, n_points=5): + """Concave star polygons with alternating inner/outer vertices.""" + geoms = [] + vals = rng.uniform(1, 100, n) + for _ in range(n): + cx = rng.uniform(-150, 150) + cy = rng.uniform(-70, 70) + r_outer = rng.uniform(5, 20) + r_inner = r_outer * rng.uniform(0.3, 0.5) + angles = np.linspace(0, 2 * np.pi, 2 * n_points, endpoint=False) + angles -= np.pi / 2 + coords = [] + for i, a in enumerate(angles): + r = r_outer if i % 2 == 0 else r_inner + coords.append((cx + r * np.cos(a), cy + r * np.sin(a))) + coords.append(coords[0]) + geoms.append(Polygon(coords)) + return geoms, vals + + +def make_donuts(n, rng): + """Polygons with holes (annuli).""" + geoms = [] + vals = rng.uniform(1, 100, n) + for _ in range(n): + cx = rng.uniform(-150, 150) + cy = rng.uniform(-70, 70) + r_outer = rng.uniform(8, 20) + r_inner = r_outer * rng.uniform(0.3, 0.6) + shell = Point(cx, cy).buffer(r_outer, resolution=16) + hole = Point(cx, cy).buffer(r_inner, resolution=16) + geoms.append(shell.difference(hole)) + return geoms, vals + + +def make_multipolygons(n, rng): + """MultiPolygon collections (2-4 parts each).""" + geoms = [] + vals = rng.uniform(1, 100, n) + for _ in range(n): + n_parts = rng.integers(2, 5) + parts = [] + for _ in range(n_parts): + cx = rng.uniform(-150, 150) + cy = rng.uniform(-70, 70) + r = rng.uniform(2, 8) + parts.append(Point(cx, cy).buffer(r, resolution=8)) + geoms.append(MultiPolygon(parts)) + return geoms, vals + + +def make_lines(n, rng): + """Random multi-segment LineStrings.""" + geoms = [] + vals = rng.uniform(1, 100, n) + for _ in range(n): + n_pts = rng.integers(3, 10) + xs = np.cumsum(rng.uniform(-20, 20, n_pts)) + rng.uniform(-140, 140) + ys = np.cumsum(rng.uniform(-10, 10, n_pts)) + rng.uniform(-60, 60) + xs = np.clip(xs, -179, 179) + ys = np.clip(ys, -89, 89) + geoms.append(LineString(zip(xs, ys))) + return geoms, 
vals + + +def make_multilines(n, rng): + """MultiLineString collections (2-4 parts each).""" + geoms = [] + vals = rng.uniform(1, 100, n) + for _ in range(n): + n_parts = rng.integers(2, 5) + parts = [] + for _ in range(n_parts): + n_pts = rng.integers(2, 6) + xs = np.cumsum(rng.uniform(-15, 15, n_pts)) + rng.uniform(-140, 140) + ys = np.cumsum(rng.uniform(-8, 8, n_pts)) + rng.uniform(-60, 60) + xs = np.clip(xs, -179, 179) + ys = np.clip(ys, -89, 89) + parts.append(LineString(zip(xs, ys))) + geoms.append(MultiLineString(parts)) + return geoms, vals + + +def make_points(n, rng): + """Random points.""" + xs = rng.uniform(-170, 170, n) + ys = rng.uniform(-80, 80, n) + geoms = [Point(x, y) for x, y in zip(xs, ys)] + vals = rng.uniform(1, 100, n) + return geoms, vals + + +def make_multipoints(n, rng): + """MultiPoint collections (3-8 points each).""" + geoms = [] + vals = rng.uniform(1, 100, n) + for _ in range(n): + n_pts = rng.integers(3, 9) + xs = rng.uniform(-170, 170, n_pts) + ys = rng.uniform(-80, 80, n_pts) + geoms.append(MultiPoint(list(zip(xs, ys)))) + return geoms, vals + + +# --------------------------------------------------------------------------- +# Runners +# --------------------------------------------------------------------------- + +BOUNDS = (-180, -90, 180, 90) + +GEOM_JSON = json.dumps({ + "type": "Polygon", + "coordinates": [[ + [BOUNDS[0], BOUNDS[1]], + [BOUNDS[2], BOUNDS[1]], + [BOUNDS[2], BOUNDS[3]], + [BOUNDS[0], BOUNDS[3]], + [BOUNDS[0], BOUNDS[1]], + ]], +}) + + +def run_xrspatial(pairs, width, height): + return xrs_rasterize( + pairs, width=width, height=height, + bounds=BOUNDS, fill=0.0, + ) + + +def run_datashader(spd_gdf, width, height): + cvs = ds.Canvas( + plot_width=width, plot_height=height, + x_range=(BOUNDS[0], BOUNDS[2]), + y_range=(BOUNDS[1], BOUNDS[3]), + ) + return cvs.polygons(spd_gdf, geometry="geometry", agg=ds.last("value")) + + +def run_geocube(gdf, width, height): + x_res = (BOUNDS[2] - BOUNDS[0]) / width + y_res = 
(BOUNDS[3] - BOUNDS[1]) / height + return make_geocube( + gdf, + measurements=["value"], + resolution=(-y_res, x_res), + geom=GEOM_JSON, + fill=0.0, + ) + + +def run_rasterio(shapes, width, height): + transform = from_bounds(*BOUNDS, width, height) + return rio_rasterize( + shapes, + out_shape=(height, width), + transform=transform, + fill=0.0, + dtype=np.float64, + ) + + +def run_xrspatial_cupy(pairs, width, height): + import cupy + result = xrs_rasterize( + pairs, width=width, height=height, + bounds=BOUNDS, fill=0.0, use_cuda=True, + ) + cupy.cuda.Device().synchronize() + return result + + +# --------------------------------------------------------------------------- +# Timing helper +# --------------------------------------------------------------------------- + +def bench(fn, *args, warmup=1, repeats=5): + """Return (result, mean_seconds, std_seconds) over *repeats* after *warmup*.""" + for _ in range(warmup): + fn(*args) + result = None + times = [] + for _ in range(repeats): + t0 = time.perf_counter() + result = fn(*args) + t1 = time.perf_counter() + times.append(t1 - t0) + arr = np.array(times) + return result, arr.mean(), arr.std() + + +# --------------------------------------------------------------------------- +# Output consistency helpers +# --------------------------------------------------------------------------- + +def _to_numpy(result, label): + """Extract a 2-D float64 numpy array from any rasterizer output.""" + if isinstance(result, np.ndarray): + return result.astype(np.float64) + try: + import xarray as xr + if isinstance(result, xr.Dataset): + return result["value"].values.astype(np.float64) + except Exception: + pass + data = getattr(result, "data", result) + try: + data = data.get() # cupy + except AttributeError: + pass + return np.asarray(data, dtype=np.float64) + + +def _coverage_mask(arr): + """Boolean mask of covered (non-zero, non-NaN) pixels.""" + return np.isfinite(arr) & (arr != 0.0) + + +def check_consistency(results, width, 
height): + """Compare outputs pairwise. Returns list of dicts with results.""" + arrays = {} + for label, res in results.items(): + arr = _to_numpy(res, label) + if arr.shape != (height, width): + arr = arr[:height, :width] + arrays[label] = arr + + labels = list(arrays.keys()) + checks = [] + + for i in range(len(labels)): + for j in range(i + 1, len(labels)): + a_lbl, b_lbl = labels[i], labels[j] + a, b = arrays[a_lbl], arrays[b_lbl] + + mask_a = _coverage_mask(a) + mask_b = _coverage_mask(b) + + intersection = np.sum(mask_a & mask_b) + union = np.sum(mask_a | mask_b) + iou = intersection / union if union > 0 else 1.0 + + overlap = mask_a & mask_b + if np.any(overlap): + rmse = np.sqrt(np.mean((a[overlap] - b[overlap]) ** 2)) + else: + rmse = float("nan") + + checks.append({ + "a": a_lbl, "b": b_lbl, + "iou": iou, "rmse": rmse, + "ok": iou > 0.85, + }) + + return checks + + +# --------------------------------------------------------------------------- +# Geometry type registry +# --------------------------------------------------------------------------- + +GEN_LABELS = { + "circles_64v": ("Circles (64 vertices)", "polygon"), + "irregular_128v": ("Irregular (128 vertices)", "polygon"), + "rectangles": ("Rectangles", "polygon"), + "stars_5pt": ("Stars (5-point, concave)", "polygon"), + "donuts": ("Donuts (polygon + hole)", "polygon"), + "multipolygons": ("MultiPolygons (2-4 parts)","polygon"), + "lines": ("LineStrings", "line"), + "multilines": ("MultiLineStrings", "line"), + "points": ("Points", "point"), + "multipoints": ("MultiPoints (3-8 pts)", "point"), +} + +GENERATORS = { + "circles_64v": { + "fn": lambda n, rng: make_circles(n, rng), + "kind": "polygon", + }, + "irregular_128v": { + "fn": lambda n, rng: make_irregular(n, rng, n_verts=128), + "kind": "polygon", + }, + "rectangles": { + "fn": lambda n, rng: make_rectangles(n, rng), + "kind": "polygon", + }, + "stars_5pt": { + "fn": lambda n, rng: make_stars(n, rng, n_points=5), + "kind": "polygon", + }, + 
"donuts": { + "fn": lambda n, rng: make_donuts(n, rng), + "kind": "polygon", + }, + "multipolygons": { + "fn": lambda n, rng: make_multipolygons(n, rng), + "kind": "polygon", + }, + "lines": { + "fn": lambda n, rng: make_lines(n, rng), + "kind": "line", + }, + "multilines": { + "fn": lambda n, rng: make_multilines(n, rng), + "kind": "line", + }, + "points": { + "fn": lambda n, rng: make_points(n, rng), + "kind": "point", + }, + "multipoints": { + "fn": lambda n, rng: make_multipoints(n, rng), + "kind": "point", + }, +} + +GEN_ORDER = [ + "circles_64v", "irregular_128v", "rectangles", "stars_5pt", + "donuts", "multipolygons", "lines", "multilines", "points", "multipoints", +] + + +# --------------------------------------------------------------------------- +# Markdown report generation +# --------------------------------------------------------------------------- + +def _fmt_ms(seconds): + ms = seconds * 1000 + if ms >= 100: + return f"{ms:.0f} ms" + if ms >= 10: + return f"{ms:.1f} ms" + return f"{ms:.2f} ms" + + +def generate_markdown(all_results, slower_cases, has_cupy): + """Build the full markdown report string from collected results.""" + lines = [] + w = lines.append + + w("# Rasterizer Benchmarks") + w("") + now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC") + w(f"Generated: {now}") + w("") + w("Compares xarray-spatial (numpy and cupy backends) against datashader, " + "geocube, and rasterio across 10 geometry types, 3 feature counts " + "(50/200/1000), and 5 output resolutions (100-4000 px wide).") + w("") + w("- **Polygon types** (circles, irregular, rectangles, stars, donuts, " + "multipolygons): all 5 rasterizers compared") + w("- **Line types** (lines, multilines): xrspatial, geocube, rasterio " + "(datashader uses a different API for lines)") + w("- **Point types** (points, multipoints): xrspatial, geocube, rasterio " + "(datashader uses a different API for points)") + w("") + + # Per-geometry sections + for gen_key in GEN_ORDER: + if 
gen_key not in all_results: + continue + label, kind = GEN_LABELS[gen_key] + entries = all_results[gen_key] + is_poly = kind == "polygon" + + w(f"## {label}") + w("") + + # Timing table + w("### Timings") + w("") + if is_poly: + h = "| n | size | xrs-numpy | xrs-cupy | datashader | geocube | rasterio |" + s = "|---:|---:|---:|---:|---:|---:|---:|" + else: + h = "| n | size | xrs-numpy | xrs-cupy | geocube | rasterio |" + s = "|---:|---:|---:|---:|---:|---:|" + w(h) + w(s) + + for e in entries: + size = f"{e['w']}x{e['h']}" + xnp = _fmt_ms(e["xrs_mean"]) + xcu = _fmt_ms(e["cupy_mean"]) if e["cupy_mean"] is not None else "--" + gc = _fmt_ms(e["gc_mean"]) + rio = _fmt_ms(e["rio_mean"]) + if is_poly: + dsh = _fmt_ms(e["ds_mean"]) if e["ds_mean"] is not None else "--" + w(f"| {e['n']} | {size} | {xnp} | {xcu} | {dsh} | {gc} | {rio} |") + else: + w(f"| {e['n']} | {size} | {xnp} | {xcu} | {gc} | {rio} |") + + w("") + + # Ratio table + w("### Relative to xrs-numpy") + w("") + w("Values below 1.0 mean the competitor is faster than xrs-numpy.") + w("") + if is_poly: + h = "| n | size | xrs-cupy | datashader | geocube | rasterio |" + s = "|---:|---:|---:|---:|---:|---:|" + else: + h = "| n | size | xrs-cupy | geocube | rasterio |" + s = "|---:|---:|---:|---:|---:|" + w(h) + w(s) + + for e in entries: + size = f"{e['w']}x{e['h']}" + xrs = e["xrs_mean"] + cu_r = f"{e['cupy_mean']/xrs:.2f}x" if e["cupy_mean"] is not None and xrs > 0 else "--" + gc_r = f"{e['gc_mean']/xrs:.2f}x" if xrs > 0 else "--" + rio_r = f"{e['rio_mean']/xrs:.2f}x" if xrs > 0 else "--" + if is_poly: + ds_r = f"{e['ds_mean']/xrs:.2f}x" if e["ds_mean"] is not None and xrs > 0 else "--" + w(f"| {e['n']} | {size} | {cu_r} | {ds_r} | {gc_r} | {rio_r} |") + else: + w(f"| {e['n']} | {size} | {cu_r} | {gc_r} | {rio_r} |") + + w("") + + # Consistency summary + w("### Consistency") + w("") + pair_data = defaultdict(lambda: {"ious": [], "rmses": []}) + for e in entries: + for c in e["consistency"]: + key = f"{c['a']} 
vs {c['b']}" + pair_data[key]["ious"].append(c["iou"]) + pair_data[key]["rmses"].append(c["rmse"]) + + w("| pair | IoU min | IoU max | RMSE range |") + w("|:-----|--------:|--------:|-----------:|") + for pair, d in pair_data.items(): + imin = min(d["ious"]) + imax = max(d["ious"]) + rmin = min(r for r in d["rmses"] if not np.isnan(r)) if any(not np.isnan(r) for r in d["rmses"]) else float("nan") + rmax = max(r for r in d["rmses"] if not np.isnan(r)) if any(not np.isnan(r) for r in d["rmses"]) else float("nan") + w(f"| {pair} | {imin:.4f} | {imax:.4f} | {rmin:.1f} - {rmax:.1f} |") + + w("") + + # Top 10 slower cases + if slower_cases: + slower_cases.sort(key=lambda x: -x[1]) + top = slower_cases[:10] + + w("## Where xrs-numpy is slower") + w("") + w("Top 10 configurations where another rasterizer beats xrs-numpy.") + w("") + w("| # | geometry | n | size | faster lib | xrs-numpy | other | xrs slower by |") + w("|--:|:---------|--:|-----:|:-----------|----------:|------:|--------------:|") + for i, (comp, ratio, gen, n, ww, hh, xrs_ms, comp_ms) in enumerate(top, 1): + size = f"{ww}x{hh}" + w(f"| {i} | {gen} | {n} | {size} | {comp} | {xrs_ms:.1f} ms | {comp_ms:.1f} ms | {ratio:.1f}x |") + w("") + + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main(): + rng = np.random.default_rng(42) + + resolutions = [100, 500, 1000, 2000, 4000] + feature_counts = [50, 200, 1000] + + try: + import cupy # noqa: F401 + has_cupy = True + except ImportError: + has_cupy = False + + # Structured results: gen_key -> list of row dicts + all_results = defaultdict(list) + slower_cases = [] + + header = ( + f"{'generator':<18} {'n':>5} {'width':>6} {'height':>6}" + f" {'xrs-numpy (s)':>14}" + f" {'xrs-cupy (s)':>14}" + f" {'datashader (s)':>14}" + f" {'geocube (s)':>14}" + f" {'rasterio (s)':>14}" + f" {'ds/xrs':>8}" + f" {'gc/xrs':>8}" + f" 
{'rio/xrs':>8}" + f" {'cupy/xrs':>8}" + ) + print(header) + print("-" * len(header)) + + for gen_name, gen_cfg in GENERATORS.items(): + gen_fn = gen_cfg["fn"] + kind = gen_cfg["kind"] + use_datashader = kind == "polygon" + + for n in feature_counts: + geoms, vals = gen_fn(n, rng) + pairs = list(zip(geoms, vals)) + + gdf = gpd.GeoDataFrame( + {"value": vals}, geometry=geoms, crs="EPSG:4326") + rio_shapes = [(g, v) for g, v in zip(geoms, vals)] + + if use_datashader: + spd_gdf = spatialpandas.GeoDataFrame(gdf) + + for width in resolutions: + height = width // 2 + + xrs_res, xrs_mean, xrs_std = bench( + run_xrspatial, pairs, width, height) + gc_res, gc_mean, gc_std = bench( + run_geocube, gdf, width, height) + rio_res, rio_mean, rio_std = bench( + run_rasterio, rio_shapes, width, height) + + ds_mean = ds_std = ds_res = None + if use_datashader: + ds_res, ds_mean, ds_std = bench( + run_datashader, spd_gdf, width, height) + ds_ratio = ds_mean / xrs_mean if xrs_mean > 0 else float("inf") + ds_col = f" {ds_mean:>8.4f}+-{ds_std:.4f}" + ds_rat = f" {ds_ratio:>7.2f}x" + else: + ds_col = f" {'--':>14}" + ds_rat = f" {'--':>8}" + + cupy_mean = cupy_std = cupy_res = None + if has_cupy: + cupy_res, cupy_mean, cupy_std = bench( + run_xrspatial_cupy, pairs, width, height) + cupy_ratio = (cupy_mean / xrs_mean + if xrs_mean > 0 else float("inf")) + cupy_col = f" {cupy_mean:>8.4f}+-{cupy_std:.4f}" + cupy_rat = f" {cupy_ratio:>7.2f}x" + else: + cupy_col = f" {'n/a':>14}" + cupy_rat = f" {'n/a':>8}" + + gc_ratio = gc_mean / xrs_mean if xrs_mean > 0 else float("inf") + rio_ratio = rio_mean / xrs_mean if xrs_mean > 0 else float("inf") + + # Track slower-than cases + if use_datashader and ds_mean is not None: + ds_r = ds_mean / xrs_mean if xrs_mean > 0 else float("inf") + if ds_r < 1.0: + slower_cases.append(( + "datashader", 1.0 / ds_r, gen_name, n, + width, height, xrs_mean * 1000, ds_mean * 1000)) + if gc_ratio < 1.0: + slower_cases.append(( + "geocube", 1.0 / gc_ratio, gen_name, n, + 
width, height, xrs_mean * 1000, gc_mean * 1000)) + if rio_ratio < 1.0: + slower_cases.append(( + "rasterio", 1.0 / rio_ratio, gen_name, n, + width, height, xrs_mean * 1000, rio_mean * 1000)) + + # Console output + print( + f"{gen_name:<18} {n:>5} {width:>6} {height:>6}" + f" {xrs_mean:>8.4f}+-{xrs_std:.4f}" + f"{cupy_col}" + f"{ds_col}" + f" {gc_mean:>8.4f}+-{gc_std:.4f}" + f" {rio_mean:>8.4f}+-{rio_std:.4f}" + f"{ds_rat}" + f" {gc_ratio:>7.2f}x" + f" {rio_ratio:>7.2f}x" + f"{cupy_rat}" + ) + + # Consistency checks + consistency_inputs = {"xrs-numpy": xrs_res} + if ds_res is not None: + consistency_inputs["datashader"] = ds_res + consistency_inputs["geocube"] = gc_res + consistency_inputs["rasterio"] = rio_res + if cupy_res is not None: + consistency_inputs["xrs-cupy"] = cupy_res + checks = check_consistency(consistency_inputs, width, height) + + for c in checks: + status = "OK" if c["ok"] else "WARN" + print(f" {c['a']} vs {c['b']}: " + f"IoU={c['iou']:.4f} RMSE={c['rmse']:.4f} " + f"[{status}]") + + # Store structured result + all_results[gen_name].append({ + "n": n, "w": width, "h": height, + "xrs_mean": xrs_mean, "xrs_std": xrs_std, + "cupy_mean": cupy_mean, "cupy_std": cupy_std, + "ds_mean": ds_mean, "ds_std": ds_std, + "gc_mean": gc_mean, "gc_std": gc_std, + "rio_mean": rio_mean, "rio_std": rio_std, + "consistency": checks, + }) + + # Console: top 10 slower cases + if slower_cases: + slower_cases.sort(key=lambda x: -x[1]) + top = slower_cases[:10] + print() + print("=" * 95) + print(" Top 10 cases where xrs-numpy is SLOWER than a competitor") + print("=" * 95) + print() + print(f" {'#':>2} {'geometry':<18} {'n':>5} {'size':>10}" + f" {'faster lib':<12} {'xrs-np':>9} {'other':>9}" + f" {'xrs slower by':>13}") + print(f" {'':->2} {'':->18} {'':->5} {'':->10}" + f" {'':->12} {'':->9} {'':->9} {'':->13}") + for i, (comp, ratio, gen, n, w, h, xrs_ms, comp_ms) in enumerate( + top, 1): + size = f"{w}x{h}" + print(f" {i:>2} {gen:<18} {n:>5} {size:>10}" + f" 
{comp:<12} {xrs_ms:>7.1f}ms {comp_ms:>7.1f}ms" + f" {ratio:>11.1f}x") + print() + + # Write markdown report + md = generate_markdown(all_results, slower_cases, has_cupy) + md_path = os.path.join(os.path.dirname(__file__), "RASTERIZER_BENCHMARKS.md") + with open(md_path, "w") as f: + f.write(md) + print(f"Wrote {md_path}") + + +if __name__ == "__main__": + main() diff --git a/docs/source/reference/hydrology.rst b/docs/source/reference/hydrology.rst index f7a7689f..4e5216ef 100644 --- a/docs/source/reference/hydrology.rst +++ b/docs/source/reference/hydrology.rst @@ -32,6 +32,13 @@ Flow Accumulation xrspatial.flow_accumulation.flow_accumulation +Flow Accumulation (D-infinity) +=============================== +.. autosummary:: + :toctree: _autosummary + + xrspatial.flow_accumulation_dinf.flow_accumulation_dinf + Flow Accumulation (MFD) ======================= .. autosummary:: diff --git a/xrspatial/__init__.py b/xrspatial/__init__.py index 1c964840..a0fb4a28 100644 --- a/xrspatial/__init__.py +++ b/xrspatial/__init__.py @@ -39,6 +39,7 @@ from xrspatial.flood import inundation # noqa from xrspatial.flood import travel_time # noqa from xrspatial.flow_accumulation import flow_accumulation # noqa +from xrspatial.flow_accumulation_dinf import flow_accumulation_dinf # noqa from xrspatial.flow_accumulation_mfd import flow_accumulation_mfd # noqa from xrspatial.flow_direction import flow_direction # noqa from xrspatial.flow_direction_dinf import flow_direction_dinf # noqa diff --git a/xrspatial/basin.py b/xrspatial/basin.py index f80f5c63..d7a404c4 100644 --- a/xrspatial/basin.py +++ b/xrspatial/basin.py @@ -364,7 +364,10 @@ def basin(flow_dir: xr.DataArray, fd = data.astype(np.float64) h, w = fd.shape labels = _basins_init_labels(fd, h, w, h, w, 0, 0) - out = _watershed_cpu(fd, labels, h, w) + # Build state array: 0=nodata(NaN), 1=unresolved(-1), 3=resolved + state = np.where(np.isnan(labels), 0, + np.where(labels == -1.0, 1, 3)).astype(np.int8) + out = _watershed_cpu(fd, 
labels, state, h, w) elif has_cuda_and_cupy() and is_cupy_array(data): out = _basins_cupy(data) diff --git a/xrspatial/flow_accumulation.py b/xrspatial/flow_accumulation.py index 3e75ebf9..0593fe49 100644 --- a/xrspatial/flow_accumulation.py +++ b/xrspatial/flow_accumulation.py @@ -1,15 +1,13 @@ """Flow accumulation: count of upstream cells draining through each cell. -Supports both D8 (integer codes) and D-infinity (continuous angles) -flow direction grids, auto-detected from input values. +Supports D8 (integer codes) flow direction grids. For D-infinity +(continuous angles), see ``flow_accumulation_dinf``. For D8 input, each cell drains to exactly one downstream neighbor. -For D-infinity, flow is split proportionally between two neighbors -following Tarboton (1997). Algorithm --------- -CPU : Kahn's BFS topological sort — O(N). +CPU : Kahn's BFS topological sort -- O(N). GPU : iterative frontier peeling with pull-based kernels. Dask: iterative tile sweep with boundary propagation (one tile in RAM at a time), following the ``cost_distance.py`` pattern. @@ -90,7 +88,7 @@ def _code_to_offset_py(code): # ===================================================================== -# Dinf helpers +# Flow type detection (kept for backward compat) # ===================================================================== @ngjit @@ -142,52 +140,6 @@ def _detect_flow_type(data): return "dinf" if result == 1 else "d8" -@ngjit -def _angle_to_neighbors(angle): - """Decompose Dinf angle into two neighbors and weights. - - Returns (dy1, dx1, w1, dy2, dx2, w2). - Pit (angle < 0) or NaN returns all zeros. 
- """ - if angle < 0.0 or angle != angle: # pit or NaN - return 0, 0, 0.0, 0, 0, 0.0 - - pi_over_4 = 0.7853981633974483 # np.pi / 4 - k = int(angle / pi_over_4) - if k > 7: - k = 7 - alpha = angle - k * pi_over_4 - w1 = 1.0 - alpha / pi_over_4 - w2 = alpha / pi_over_4 - - if k == 0: - dy1, dx1 = 0, 1 # E - dy2, dx2 = -1, 1 # NE - elif k == 1: - dy1, dx1 = -1, 1 # NE - dy2, dx2 = -1, 0 # N - elif k == 2: - dy1, dx1 = -1, 0 # N - dy2, dx2 = -1, -1 # NW - elif k == 3: - dy1, dx1 = -1, -1 # NW - dy2, dx2 = 0, -1 # W - elif k == 4: - dy1, dx1 = 0, -1 # W - dy2, dx2 = 1, -1 # SW - elif k == 5: - dy1, dx1 = 1, -1 # SW - dy2, dx2 = 1, 0 # S - elif k == 6: - dy1, dx1 = 1, 0 # S - dy2, dx2 = 1, 1 # SE - else: # k == 7 - dy1, dx1 = 1, 1 # SE - dy2, dx2 = 0, 1 # E - - return dy1, dx1, w1, dy2, dx2, w2 - - # ===================================================================== # CPU kernel # ===================================================================== @@ -256,78 +208,6 @@ def _flow_accum_cpu(flow_dir, height, width): return accum -@ngjit -def _flow_accum_dinf_cpu(flow_dir, height, width): - """Kahn's BFS topological sort for Dinf flow accumulation.""" - accum = np.empty((height, width), dtype=np.float64) - in_degree = np.zeros((height, width), dtype=np.int32) - valid = np.zeros((height, width), dtype=np.int8) - - for r in range(height): - for c in range(width): - v = flow_dir[r, c] - if v == v: # not NaN - valid[r, c] = 1 - accum[r, c] = 1.0 - else: - accum[r, c] = np.nan - - for r in range(height): - for c in range(width): - if valid[r, c] == 0: - continue - dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c]) - if w1 > 0.0: - nr, nc = r + dy1, c + dx1 - if 0 <= nr < height and 0 <= nc < width and valid[nr, nc] == 1: - in_degree[nr, nc] += 1 - if w2 > 0.0: - nr, nc = r + dy2, c + dx2 - if 0 <= nr < height and 0 <= nc < width and valid[nr, nc] == 1: - in_degree[nr, nc] += 1 - - queue_r = np.empty(height * width, dtype=np.int64) - queue_c = np.empty(height * 
width, dtype=np.int64) - head = np.int64(0) - tail = np.int64(0) - - for r in range(height): - for c in range(width): - if valid[r, c] == 1 and in_degree[r, c] == 0: - queue_r[tail] = r - queue_c[tail] = c - tail += 1 - - while head < tail: - r = queue_r[head] - c = queue_c[head] - head += 1 - - dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c]) - - if w1 > 0.0: - nr, nc = r + dy1, c + dx1 - if 0 <= nr < height and 0 <= nc < width and valid[nr, nc] == 1: - accum[nr, nc] += accum[r, c] * w1 - in_degree[nr, nc] -= 1 - if in_degree[nr, nc] == 0: - queue_r[tail] = nr - queue_c[tail] = nc - tail += 1 - - if w2 > 0.0: - nr, nc = r + dy2, c + dx2 - if 0 <= nr < height and 0 <= nc < width and valid[nr, nc] == 1: - accum[nr, nc] += accum[r, c] * w2 - in_degree[nr, nc] -= 1 - if in_degree[nr, nc] == 0: - queue_r[tail] = nr - queue_c[tail] = nc - tail += 1 - - return accum - - # ===================================================================== # GPU kernels # ===================================================================== @@ -348,7 +228,7 @@ def _init_accum_indegree(flow_dir, accum, in_degree, state, H, W): state[i, j] = 1 accum[i, j] = 1.0 - # Decode direction (inline — can't call @ngjit from @cuda.jit) + # Decode direction (inline -- can't call @ngjit from @cuda.jit) code = int(v) dy = 0 dx = 0 @@ -380,7 +260,7 @@ def _init_accum_indegree(flow_dir, accum, in_degree, state, H, W): @cuda.jit def _find_ready_and_finalize(in_degree, state, changed, H, W): - """Finalize previous frontier (2→3), mark new frontier (1→2).""" + """Finalize previous frontier (2->3), mark new frontier (1->2).""" i, j = cuda.grid(2) if i >= H or j >= W: return @@ -543,232 +423,6 @@ def _flow_accum_tile_cupy(flow_dir_data, return accum -# ===================================================================== -# Dinf GPU kernels -# ===================================================================== - -@cuda.jit -def _init_accum_indegree_dinf(flow_dir, accum, in_degree, state, H, 
W): - """Initialise accum/in_degree/state for Dinf on GPU.""" - i, j = cuda.grid(2) - if i >= H or j >= W: - return - - v = flow_dir[i, j] - if v != v: # NaN - state[i, j] = 0 - accum[i, j] = 0.0 - return - - state[i, j] = 1 - accum[i, j] = 1.0 - - if v < 0.0: # pit - return - - pi_over_4 = 0.7853981633974483 - k = int(v / pi_over_4) - if k > 7: - k = 7 - alpha = v - k * pi_over_4 - w1 = 1.0 - alpha / pi_over_4 - w2 = alpha / pi_over_4 - - # Facet offsets (inline) - if k == 0: - dy1, dx1 = 0, 1 - dy2, dx2 = -1, 1 - elif k == 1: - dy1, dx1 = -1, 1 - dy2, dx2 = -1, 0 - elif k == 2: - dy1, dx1 = -1, 0 - dy2, dx2 = -1, -1 - elif k == 3: - dy1, dx1 = -1, -1 - dy2, dx2 = 0, -1 - elif k == 4: - dy1, dx1 = 0, -1 - dy2, dx2 = 1, -1 - elif k == 5: - dy1, dx1 = 1, -1 - dy2, dx2 = 1, 0 - elif k == 6: - dy1, dx1 = 1, 0 - dy2, dx2 = 1, 1 - else: - dy1, dx1 = 1, 1 - dy2, dx2 = 0, 1 - - if w1 > 0.0: - ni, nj = i + dy1, j + dx1 - if 0 <= ni < H and 0 <= nj < W: - cuda.atomic.add(in_degree, (ni, nj), 1) - if w2 > 0.0: - ni, nj = i + dy2, j + dx2 - if 0 <= ni < H and 0 <= nj < W: - cuda.atomic.add(in_degree, (ni, nj), 1) - - -@cuda.jit -def _pull_from_frontier_dinf(flow_dir, accum, in_degree, state, H, W): - """Active Dinf cells pull accumulation from frontier neighbours.""" - i, j = cuda.grid(2) - if i >= H or j >= W: - return - - if state[i, j] != 1: - return - - pi_over_4 = 0.7853981633974483 - - for nbr in range(8): - if nbr == 0: - dy, dx = 0, 1 - elif nbr == 1: - dy, dx = 1, 1 - elif nbr == 2: - dy, dx = 1, 0 - elif nbr == 3: - dy, dx = 1, -1 - elif nbr == 4: - dy, dx = 0, -1 - elif nbr == 5: - dy, dx = -1, -1 - elif nbr == 6: - dy, dx = -1, 0 - else: - dy, dx = -1, 1 - - ni = i + dy - nj = j + dx - if ni < 0 or ni >= H or nj < 0 or nj >= W: - continue - if state[ni, nj] != 2: - continue - - nv = flow_dir[ni, nj] - if nv < 0.0 or nv != nv: - continue - - nk = int(nv / pi_over_4) - if nk > 7: - nk = 7 - nalpha = nv - nk * pi_over_4 - nw1 = 1.0 - nalpha / pi_over_4 - nw2 = nalpha 
/ pi_over_4 - - # Inline facet offsets for neighbor's direction - if nk == 0: - ndy1, ndx1 = 0, 1 - ndy2, ndx2 = -1, 1 - elif nk == 1: - ndy1, ndx1 = -1, 1 - ndy2, ndx2 = -1, 0 - elif nk == 2: - ndy1, ndx1 = -1, 0 - ndy2, ndx2 = -1, -1 - elif nk == 3: - ndy1, ndx1 = -1, -1 - ndy2, ndx2 = 0, -1 - elif nk == 4: - ndy1, ndx1 = 0, -1 - ndy2, ndx2 = 1, -1 - elif nk == 5: - ndy1, ndx1 = 1, -1 - ndy2, ndx2 = 1, 0 - elif nk == 6: - ndy1, ndx1 = 1, 0 - ndy2, ndx2 = 1, 1 - else: - ndy1, ndx1 = 1, 1 - ndy2, ndx2 = 0, 1 - - if nw1 > 0.0 and ni + ndy1 == i and nj + ndx1 == j: - accum[i, j] += accum[ni, nj] * nw1 - in_degree[i, j] -= 1 - if nw2 > 0.0 and ni + ndy2 == i and nj + ndx2 == j: - accum[i, j] += accum[ni, nj] * nw2 - in_degree[i, j] -= 1 - - -def _flow_accum_dinf_cupy(flow_dir_data): - """GPU driver: iterative frontier peeling for Dinf.""" - import cupy as cp - - H, W = flow_dir_data.shape - flow_dir_f64 = flow_dir_data.astype(cp.float64) - - accum = cp.zeros((H, W), dtype=cp.float64) - in_degree = cp.zeros((H, W), dtype=cp.int32) - state = cp.zeros((H, W), dtype=cp.int32) - changed = cp.zeros(1, dtype=cp.int32) - - griddim, blockdim = cuda_args((H, W)) - - _init_accum_indegree_dinf[griddim, blockdim]( - flow_dir_f64, accum, in_degree, state, H, W) - - max_iter = H * W - for _ in range(max_iter): - changed[0] = 0 - _find_ready_and_finalize[griddim, blockdim]( - in_degree, state, changed, H, W) - - if int(changed[0]) == 0: - break - - _pull_from_frontier_dinf[griddim, blockdim]( - flow_dir_f64, accum, in_degree, state, H, W) - - accum = cp.where(state == 0, cp.nan, accum) - return accum - - -def _flow_accum_dinf_tile_cupy(flow_dir_data, - seed_top, seed_bottom, seed_left, seed_right, - seed_tl, seed_tr, seed_bl, seed_br): - """GPU seeded Dinf flow accumulation for a single tile.""" - import cupy as cp - - H, W = flow_dir_data.shape - flow_dir_f64 = flow_dir_data.astype(cp.float64) - - accum = cp.zeros((H, W), dtype=cp.float64) - in_degree = cp.zeros((H, W), 
dtype=cp.int32) - state = cp.zeros((H, W), dtype=cp.int32) - changed = cp.zeros(1, dtype=cp.int32) - - griddim, blockdim = cuda_args((H, W)) - - _init_accum_indegree_dinf[griddim, blockdim]( - flow_dir_f64, accum, in_degree, state, H, W) - - accum[0, :] += cp.asarray(seed_top) - accum[H - 1, :] += cp.asarray(seed_bottom) - accum[:, 0] += cp.asarray(seed_left) - accum[:, W - 1] += cp.asarray(seed_right) - accum[0, 0] += seed_tl - accum[0, W - 1] += seed_tr - accum[H - 1, 0] += seed_bl - accum[H - 1, W - 1] += seed_br - - max_iter = H * W - for _ in range(max_iter): - changed[0] = 0 - _find_ready_and_finalize[griddim, blockdim]( - in_degree, state, changed, H, W) - - if int(changed[0]) == 0: - break - - _pull_from_frontier_dinf[griddim, blockdim]( - flow_dir_f64, accum, in_degree, state, H, W) - - accum = cp.where(state == 0, cp.nan, accum) - return accum - - # ===================================================================== # Tile kernel for dask iterative path # ===================================================================== @@ -915,14 +569,14 @@ def _compute_seeds(iy, ix, boundaries, flow_bdry, nb_fdir = flow_bdry.get('bottom', iy - 1, ix) nb_accum = boundaries.get('bottom', iy - 1, ix) w = len(nb_fdir) - # Cardinal S (code 4): nb cell j → our (0, j) + # Cardinal S (code 4): nb cell j -> our (0, j) mask = (nb_fdir == 4) seed_top += np.where(mask, nb_accum, 0.0) if w > 1: - # Diagonal SE (code 2): nb cell j → our (0, j+1) + # Diagonal SE (code 2): nb cell j -> our (0, j+1) mask = (nb_fdir[:-1] == 2) seed_top[1:] += np.where(mask, nb_accum[:-1], 0.0) - # Diagonal SW (code 8): nb cell j → our (0, j-1) + # Diagonal SW (code 8): nb cell j -> our (0, j-1) mask = (nb_fdir[1:] == 8) seed_top[:-1] += np.where(mask, nb_accum[1:], 0.0) @@ -935,10 +589,10 @@ def _compute_seeds(iy, ix, boundaries, flow_bdry, mask = (nb_fdir == 64) seed_bottom += np.where(mask, nb_accum, 0.0) if w > 1: - # Diagonal NE (code 128): nb cell j → our (h-1, j+1) + # Diagonal NE (code 128): 
nb cell j -> our (h-1, j+1) mask = (nb_fdir[:-1] == 128) seed_bottom[1:] += np.where(mask, nb_accum[:-1], 0.0) - # Diagonal NW (code 32): nb cell j → our (h-1, j-1) + # Diagonal NW (code 32): nb cell j -> our (h-1, j-1) mask = (nb_fdir[1:] == 32) seed_bottom[:-1] += np.where(mask, nb_accum[1:], 0.0) @@ -951,10 +605,10 @@ def _compute_seeds(iy, ix, boundaries, flow_bdry, mask = (nb_fdir == 1) seed_left += np.where(mask, nb_accum, 0.0) if h > 1: - # Diagonal SE (code 2): nb cell i → our (i+1, 0) + # Diagonal SE (code 2): nb cell i -> our (i+1, 0) mask = (nb_fdir[:-1] == 2) seed_left[1:] += np.where(mask, nb_accum[:-1], 0.0) - # Diagonal NE (code 128): nb cell i → our (i-1, 0) + # Diagonal NE (code 128): nb cell i -> our (i-1, 0) mask = (nb_fdir[1:] == 128) seed_left[:-1] += np.where(mask, nb_accum[1:], 0.0) @@ -967,10 +621,10 @@ def _compute_seeds(iy, ix, boundaries, flow_bdry, mask = (nb_fdir == 16) seed_right += np.where(mask, nb_accum, 0.0) if h > 1: - # Diagonal SW (code 8): nb cell i → our (i+1, w-1) + # Diagonal SW (code 8): nb cell i -> our (i+1, w-1) mask = (nb_fdir[:-1] == 8) seed_right[1:] += np.where(mask, nb_accum[:-1], 0.0) - # Diagonal NW (code 32): nb cell i → our (i-1, w-1) + # Diagonal NW (code 32): nb cell i -> our (i-1, w-1) mask = (nb_fdir[1:] == 32) seed_right[:-1] += np.where(mask, nb_accum[1:], 0.0) @@ -1222,437 +876,6 @@ def _flow_accum_dask_cupy(flow_dir_da): chunks_y, chunks_x, n_tile_y, n_tile_x) -# ===================================================================== -# Dinf tile kernel for dask -# ===================================================================== - -@ngjit -def _flow_accum_dinf_tile_kernel(flow_dir, h, w, - seed_top, seed_bottom, - seed_left, seed_right, - seed_tl, seed_tr, seed_bl, seed_br): - """Seeded BFS Dinf flow accumulation for a single tile.""" - accum = np.empty((h, w), dtype=np.float64) - in_degree = np.zeros((h, w), dtype=np.int32) - valid = np.zeros((h, w), dtype=np.int8) - - for r in range(h): - for c in 
range(w): - v = flow_dir[r, c] - if v == v: - valid[r, c] = 1 - accum[r, c] = 1.0 - else: - accum[r, c] = np.nan - - # Add external seeds - for c in range(w): - if valid[0, c] == 1: - accum[0, c] += seed_top[c] - if valid[h - 1, c] == 1: - accum[h - 1, c] += seed_bottom[c] - for r in range(h): - if valid[r, 0] == 1: - accum[r, 0] += seed_left[r] - if valid[r, w - 1] == 1: - accum[r, w - 1] += seed_right[r] - if valid[0, 0] == 1: - accum[0, 0] += seed_tl - if valid[0, w - 1] == 1: - accum[0, w - 1] += seed_tr - if valid[h - 1, 0] == 1: - accum[h - 1, 0] += seed_bl - if valid[h - 1, w - 1] == 1: - accum[h - 1, w - 1] += seed_br - - # In-degrees via angle decomposition - for r in range(h): - for c in range(w): - if valid[r, c] == 0: - continue - dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c]) - if w1 > 0.0: - nr, nc = r + dy1, c + dx1 - if 0 <= nr < h and 0 <= nc < w and valid[nr, nc] == 1: - in_degree[nr, nc] += 1 - if w2 > 0.0: - nr, nc = r + dy2, c + dx2 - if 0 <= nr < h and 0 <= nc < w and valid[nr, nc] == 1: - in_degree[nr, nc] += 1 - - queue_r = np.empty(h * w, dtype=np.int64) - queue_c = np.empty(h * w, dtype=np.int64) - head = np.int64(0) - tail = np.int64(0) - - for r in range(h): - for c in range(w): - if valid[r, c] == 1 and in_degree[r, c] == 0: - queue_r[tail] = r - queue_c[tail] = c - tail += 1 - - while head < tail: - r = queue_r[head] - c = queue_c[head] - head += 1 - - dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c]) - - if w1 > 0.0: - nr, nc = r + dy1, c + dx1 - if 0 <= nr < h and 0 <= nc < w and valid[nr, nc] == 1: - accum[nr, nc] += accum[r, c] * w1 - in_degree[nr, nc] -= 1 - if in_degree[nr, nc] == 0: - queue_r[tail] = nr - queue_c[tail] = nc - tail += 1 - - if w2 > 0.0: - nr, nc = r + dy2, c + dx2 - if 0 <= nr < h and 0 <= nc < w and valid[nr, nc] == 1: - accum[nr, nc] += accum[r, c] * w2 - in_degree[nr, nc] -= 1 - if in_degree[nr, nc] == 0: - queue_r[tail] = nr - queue_c[tail] = nc - tail += 1 - - return accum - 
- -# ===================================================================== -# Dinf dask iterative tile sweep -# ===================================================================== - -def _compute_seeds_dinf(iy, ix, boundaries, flow_bdry, - chunks_y, chunks_x, n_tile_y, n_tile_x): - """Compute seed arrays for tile (iy, ix) using Dinf angle decomposition.""" - tile_h = chunks_y[iy] - tile_w = chunks_x[ix] - - seed_top = np.zeros(tile_w, dtype=np.float64) - seed_bottom = np.zeros(tile_w, dtype=np.float64) - seed_left = np.zeros(tile_h, dtype=np.float64) - seed_right = np.zeros(tile_h, dtype=np.float64) - seed_tl = 0.0 - seed_tr = 0.0 - seed_bl = 0.0 - seed_br = 0.0 - - # --- Top edge: bottom row of tile above flows south (dy=1) --- - if iy > 0: - nb_fdir = flow_bdry.get('bottom', iy - 1, ix) - nb_accum = boundaries.get('bottom', iy - 1, ix) - for c in range(len(nb_fdir)): - dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(nb_fdir[c]) - if w1 > 0.0 and dy1 == 1: - tc = c + dx1 - if 0 <= tc < tile_w: - seed_top[tc] += nb_accum[c] * w1 - if w2 > 0.0 and dy2 == 1: - tc = c + dx2 - if 0 <= tc < tile_w: - seed_top[tc] += nb_accum[c] * w2 - - # --- Bottom edge: top row of tile below flows north (dy=-1) --- - if iy < n_tile_y - 1: - nb_fdir = flow_bdry.get('top', iy + 1, ix) - nb_accum = boundaries.get('top', iy + 1, ix) - for c in range(len(nb_fdir)): - dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(nb_fdir[c]) - if w1 > 0.0 and dy1 == -1: - tc = c + dx1 - if 0 <= tc < tile_w: - seed_bottom[tc] += nb_accum[c] * w1 - if w2 > 0.0 and dy2 == -1: - tc = c + dx2 - if 0 <= tc < tile_w: - seed_bottom[tc] += nb_accum[c] * w2 - - # --- Left edge: right column of tile to the left flows east (dx=1) --- - if ix > 0: - nb_fdir = flow_bdry.get('right', iy, ix - 1) - nb_accum = boundaries.get('right', iy, ix - 1) - for r in range(len(nb_fdir)): - dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(nb_fdir[r]) - if w1 > 0.0 and dx1 == 1: - tr = r + dy1 - if 0 <= tr < tile_h: - 
seed_left[tr] += nb_accum[r] * w1 - if w2 > 0.0 and dx2 == 1: - tr = r + dy2 - if 0 <= tr < tile_h: - seed_left[tr] += nb_accum[r] * w2 - - # --- Right edge: left column of tile to the right flows west (dx=-1) --- - if ix < n_tile_x - 1: - nb_fdir = flow_bdry.get('left', iy, ix + 1) - nb_accum = boundaries.get('left', iy, ix + 1) - for r in range(len(nb_fdir)): - dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(nb_fdir[r]) - if w1 > 0.0 and dx1 == -1: - tr = r + dy1 - if 0 <= tr < tile_h: - seed_right[tr] += nb_accum[r] * w1 - if w2 > 0.0 and dx2 == -1: - tr = r + dy2 - if 0 <= tr < tile_h: - seed_right[tr] += nb_accum[r] * w2 - - # --- Diagonal corner seeds --- - # TL: bottom-right of (iy-1, ix-1) flows SE (dy=1, dx=1) - if iy > 0 and ix > 0: - fv = flow_bdry.get('bottom', iy - 1, ix - 1)[-1] - av = float(boundaries.get('bottom', iy - 1, ix - 1)[-1]) - dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(fv) - if w1 > 0.0 and dy1 == 1 and dx1 == 1: - seed_tl += av * w1 - if w2 > 0.0 and dy2 == 1 and dx2 == 1: - seed_tl += av * w2 - - # TR: bottom-left of (iy-1, ix+1) flows SW (dy=1, dx=-1) - if iy > 0 and ix < n_tile_x - 1: - fv = flow_bdry.get('bottom', iy - 1, ix + 1)[0] - av = float(boundaries.get('bottom', iy - 1, ix + 1)[0]) - dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(fv) - if w1 > 0.0 and dy1 == 1 and dx1 == -1: - seed_tr += av * w1 - if w2 > 0.0 and dy2 == 1 and dx2 == -1: - seed_tr += av * w2 - - # BL: top-right of (iy+1, ix-1) flows NE (dy=-1, dx=1) - if iy < n_tile_y - 1 and ix > 0: - fv = flow_bdry.get('top', iy + 1, ix - 1)[-1] - av = float(boundaries.get('top', iy + 1, ix - 1)[-1]) - dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(fv) - if w1 > 0.0 and dy1 == -1 and dx1 == 1: - seed_bl += av * w1 - if w2 > 0.0 and dy2 == -1 and dx2 == 1: - seed_bl += av * w2 - - # BR: top-left of (iy+1, ix+1) flows NW (dy=-1, dx=-1) - if iy < n_tile_y - 1 and ix < n_tile_x - 1: - fv = flow_bdry.get('top', iy + 1, ix + 1)[0] - av = float(boundaries.get('top', iy + 
1, ix + 1)[0]) - dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(fv) - if w1 > 0.0 and dy1 == -1 and dx1 == -1: - seed_br += av * w1 - if w2 > 0.0 and dy2 == -1 and dx2 == -1: - seed_br += av * w2 - - return (seed_top, seed_bottom, seed_left, seed_right, - seed_tl, seed_tr, seed_bl, seed_br) - - -def _process_tile_dinf(iy, ix, flow_dir_da, boundaries, flow_bdry, - chunks_y, chunks_x, n_tile_y, n_tile_x): - """Run seeded Dinf BFS on one tile; update boundaries in-place.""" - chunk = np.asarray( - flow_dir_da.blocks[iy, ix].compute(), dtype=np.float64) - h, w = chunk.shape - - seeds = _compute_seeds_dinf( - iy, ix, boundaries, flow_bdry, - chunks_y, chunks_x, n_tile_y, n_tile_x) - - accum = _flow_accum_dinf_tile_kernel(chunk, h, w, *seeds) - - new_top = accum[0, :].copy() - new_bottom = accum[-1, :].copy() - new_left = accum[:, 0].copy() - new_right = accum[:, -1].copy() - - change = 0.0 - for side, new in (('top', new_top), ('bottom', new_bottom), - ('left', new_left), ('right', new_right)): - old = boundaries.get(side, iy, ix) - with np.errstate(invalid='ignore'): - diff = np.abs(new - old) - diff = np.where(np.isnan(diff), 0.0, diff) - m = float(np.max(diff)) - if m > change: - change = m - - boundaries.set('top', iy, ix, new_top) - boundaries.set('bottom', iy, ix, new_bottom) - boundaries.set('left', iy, ix, new_left) - boundaries.set('right', iy, ix, new_right) - - return change - - -def _flow_accum_dinf_dask_iterative(flow_dir_da): - """Iterative boundary-propagation for Dinf dask arrays.""" - chunks_y = flow_dir_da.chunks[0] - chunks_x = flow_dir_da.chunks[1] - n_tile_y = len(chunks_y) - n_tile_x = len(chunks_x) - - flow_bdry = _preprocess_tiles(flow_dir_da, chunks_y, chunks_x) - flow_bdry = flow_bdry.snapshot() - - boundaries = BoundaryStore(chunks_y, chunks_x, fill_value=0.0) - - max_iterations = max(n_tile_y, n_tile_x) + 10 - - for _iteration in range(max_iterations): - max_change = 0.0 - - for iy in range(n_tile_y): - for ix in range(n_tile_x): - c = 
_process_tile_dinf(iy, ix, flow_dir_da, boundaries, - flow_bdry, chunks_y, chunks_x, - n_tile_y, n_tile_x) - if c > max_change: - max_change = c - - for iy in reversed(range(n_tile_y)): - for ix in reversed(range(n_tile_x)): - c = _process_tile_dinf(iy, ix, flow_dir_da, boundaries, - flow_bdry, chunks_y, chunks_x, - n_tile_y, n_tile_x) - if c > max_change: - max_change = c - - if max_change == 0.0: - break - - boundaries = boundaries.snapshot() - - return _assemble_result_dinf(flow_dir_da, boundaries, flow_bdry, - chunks_y, chunks_x, n_tile_y, n_tile_x) - - -def _assemble_result_dinf(flow_dir_da, boundaries, flow_bdry, - chunks_y, chunks_x, n_tile_y, n_tile_x): - """Build lazy dask array by re-running each Dinf tile with converged seeds.""" - - def _tile_fn(flow_dir_block, block_info=None): - if block_info is None or 0 not in block_info: - return np.full(flow_dir_block.shape, np.nan, dtype=np.float64) - iy, ix = block_info[0]['chunk-location'] - h, w = flow_dir_block.shape - seeds = _compute_seeds_dinf( - iy, ix, boundaries, flow_bdry, - chunks_y, chunks_x, n_tile_y, n_tile_x) - return _flow_accum_dinf_tile_kernel( - np.asarray(flow_dir_block, dtype=np.float64), h, w, *seeds) - - return da.map_blocks( - _tile_fn, - flow_dir_da, - dtype=np.float64, - meta=np.array((), dtype=np.float64), - ) - - -def _process_tile_dinf_cupy(iy, ix, flow_dir_da, boundaries, flow_bdry, - chunks_y, chunks_x, n_tile_y, n_tile_x): - """Run seeded GPU Dinf flow accumulation on one tile.""" - import cupy as cp - - chunk = cp.asarray( - flow_dir_da.blocks[iy, ix].compute(), dtype=cp.float64) - - seeds = _compute_seeds_dinf( - iy, ix, boundaries, flow_bdry, - chunks_y, chunks_x, n_tile_y, n_tile_x) - - accum = _flow_accum_dinf_tile_cupy(chunk, *seeds) - - new_top = accum[0, :].get().copy() - new_bottom = accum[-1, :].get().copy() - new_left = accum[:, 0].get().copy() - new_right = accum[:, -1].get().copy() - - change = 0.0 - for side, new in (('top', new_top), ('bottom', new_bottom), - 
('left', new_left), ('right', new_right)): - old = boundaries.get(side, iy, ix) - with np.errstate(invalid='ignore'): - diff = np.abs(new - old) - diff = np.where(np.isnan(diff), 0.0, diff) - m = float(np.max(diff)) - if m > change: - change = m - - boundaries.set('top', iy, ix, new_top) - boundaries.set('bottom', iy, ix, new_bottom) - boundaries.set('left', iy, ix, new_left) - boundaries.set('right', iy, ix, new_right) - - return change - - -def _assemble_result_dinf_cupy(flow_dir_da, boundaries, flow_bdry, - chunks_y, chunks_x, n_tile_y, n_tile_x): - """Build lazy dask+cupy array using GPU Dinf tile kernel.""" - import cupy as cp - - def _tile_fn(flow_dir_block, block_info=None): - if block_info is None or 0 not in block_info: - return cp.full(flow_dir_block.shape, cp.nan, dtype=cp.float64) - iy, ix = block_info[0]['chunk-location'] - seeds = _compute_seeds_dinf( - iy, ix, boundaries, flow_bdry, - chunks_y, chunks_x, n_tile_y, n_tile_x) - return _flow_accum_dinf_tile_cupy( - cp.asarray(flow_dir_block, dtype=cp.float64), *seeds) - - return da.map_blocks( - _tile_fn, - flow_dir_da, - dtype=np.float64, - meta=cp.array((), dtype=cp.float64), - ) - - -def _flow_accum_dinf_dask_cupy(flow_dir_da): - """Dask+CuPy Dinf: native GPU processing per tile.""" - chunks_y = flow_dir_da.chunks[0] - chunks_x = flow_dir_da.chunks[1] - n_tile_y = len(chunks_y) - n_tile_x = len(chunks_x) - - flow_bdry = _preprocess_tiles(flow_dir_da, chunks_y, chunks_x) - flow_bdry = flow_bdry.snapshot() - - boundaries = BoundaryStore(chunks_y, chunks_x, fill_value=0.0) - - max_iterations = max(n_tile_y, n_tile_x) + 10 - - for _iteration in range(max_iterations): - max_change = 0.0 - - for iy in range(n_tile_y): - for ix in range(n_tile_x): - c = _process_tile_dinf_cupy( - iy, ix, flow_dir_da, boundaries, - flow_bdry, chunks_y, chunks_x, - n_tile_y, n_tile_x) - if c > max_change: - max_change = c - - for iy in reversed(range(n_tile_y)): - for ix in reversed(range(n_tile_x)): - c = 
_process_tile_dinf_cupy( - iy, ix, flow_dir_da, boundaries, - flow_bdry, chunks_y, chunks_x, - n_tile_y, n_tile_x) - if c > max_change: - max_change = c - - if max_change == 0.0: - break - - boundaries = boundaries.snapshot() - - return _assemble_result_dinf_cupy(flow_dir_da, boundaries, flow_bdry, - chunks_y, chunks_x, n_tile_y, n_tile_x) - - # ===================================================================== # Public API # ===================================================================== @@ -1660,26 +883,17 @@ def _flow_accum_dinf_dask_cupy(flow_dir_da): @supports_dataset def flow_accumulation(flow_dir: xr.DataArray, name: str = 'flow_accumulation') -> xr.DataArray: - """Compute flow accumulation from a flow direction grid. + """Compute flow accumulation from a D8 flow direction grid. - Supports both D8 (integer codes) and D-infinity (continuous angles) - flow direction grids, auto-detected from input values. - - For D8 input, each cell drains to exactly one downstream neighbor. - For D-infinity, flow is split proportionally between two neighbors - following Tarboton (1997). + Each cell drains to exactly one downstream neighbor based on + integer D8 direction codes. For D-infinity (continuous angle) + grids, use ``flow_accumulation_dinf`` instead. Parameters ---------- flow_dir : xarray.DataArray or xr.Dataset - 2D flow direction grid. Accepts: - - - D8 codes: 0/1/2/4/8/16/32/64/128 (NaN for nodata) - - D-infinity angles: continuous radians in ``[0, 2*pi)``, - ``-1.0`` for pits, NaN for nodata (as produced by - ``flow_direction_dinf``) - - The type is auto-detected from values. + 2D flow direction grid with D8 codes: + 0/1/2/4/8/16/32/64/128 (NaN for nodata). Supported backends: NumPy, CuPy, NumPy-backed Dask, CuPy-backed Dask. If a Dataset is passed, the operation is applied to each @@ -1700,38 +914,21 @@ def flow_accumulation(flow_dir: xr.DataArray, Structure from Digital Elevation Data for Geographic Information System Analysis. 
Photogrammetric Engineering and Remote Sensing, 54(11), 1593-1600. - - Tarboton, D.G. (1997). A new method for the determination of flow - directions and upslope areas in grid digital elevation models. - Water Resources Research, 33(2), 309-319. """ _validate_raster(flow_dir, func_name='flow_accumulation', name='flow_dir') data = flow_dir.data - flow_type = _detect_flow_type(data) - - if flow_type == "dinf": - if isinstance(data, np.ndarray): - out = _flow_accum_dinf_cpu(data.astype(np.float64), *data.shape) - elif has_cuda_and_cupy() and is_cupy_array(data): - out = _flow_accum_dinf_cupy(data) - elif has_cuda_and_cupy() and is_dask_cupy(flow_dir): - out = _flow_accum_dinf_dask_cupy(data) - elif da is not None and isinstance(data, da.Array): - out = _flow_accum_dinf_dask_iterative(data) - else: - raise TypeError(f"Unsupported array type: {type(data)}") + + if isinstance(data, np.ndarray): + out = _flow_accum_cpu(data.astype(np.float64), *data.shape) + elif has_cuda_and_cupy() and is_cupy_array(data): + out = _flow_accum_cupy(data) + elif has_cuda_and_cupy() and is_dask_cupy(flow_dir): + out = _flow_accum_dask_cupy(data) + elif da is not None and isinstance(data, da.Array): + out = _flow_accum_dask_iterative(data) else: - if isinstance(data, np.ndarray): - out = _flow_accum_cpu(data.astype(np.float64), *data.shape) - elif has_cuda_and_cupy() and is_cupy_array(data): - out = _flow_accum_cupy(data) - elif has_cuda_and_cupy() and is_dask_cupy(flow_dir): - out = _flow_accum_dask_cupy(data) - elif da is not None and isinstance(data, da.Array): - out = _flow_accum_dask_iterative(data) - else: - raise TypeError(f"Unsupported array type: {type(data)}") + raise TypeError(f"Unsupported array type: {type(data)}") return xr.DataArray(out, name=name, diff --git a/xrspatial/flow_accumulation_dinf.py b/xrspatial/flow_accumulation_dinf.py new file mode 100644 index 00000000..9b45dcc1 --- /dev/null +++ b/xrspatial/flow_accumulation_dinf.py @@ -0,0 +1,902 @@ +"""Flow accumulation for 
D-infinity (continuous angle) flow direction grids. + +Takes the continuous-angle output from ``flow_direction_dinf`` and +accumulates upstream area, splitting flow proportionally between two +neighbors following Tarboton (1997). + +Algorithm +--------- +CPU : Kahn's BFS topological sort -- O(N). +GPU : iterative frontier peeling with pull-based kernels. +Dask: iterative tile sweep with boundary propagation (one tile in + RAM at a time), following the ``flow_accumulation.py`` pattern. +""" + +from __future__ import annotations + +import numpy as np +import xarray as xr +from numba import cuda + +try: + import cupy +except ImportError: + class cupy: # type: ignore[no-redef] + ndarray = False + +try: + import dask.array as da +except ImportError: + da = None + +from xrspatial.utils import ( + _validate_raster, + cuda_args, + has_cuda_and_cupy, + is_cupy_array, + is_dask_cupy, + ngjit, +) +from xrspatial._boundary_store import BoundaryStore +from xrspatial.dataset_support import supports_dataset +from xrspatial.flow_accumulation import ( + _find_ready_and_finalize, + _preprocess_tiles, +) + + +def _to_numpy_f64(arr): + """Convert *arr* to a contiguous numpy float64 array. + + Handles CuPy arrays transparently via ``.get()``. + """ + if hasattr(arr, 'get'): + arr = arr.get() + return np.asarray(arr, dtype=np.float64) + + +# ===================================================================== +# Dinf helpers +# ===================================================================== + +@ngjit +def _angle_to_neighbors(angle): + """Decompose Dinf angle into two neighbors and weights. + + Returns (dy1, dx1, w1, dy2, dx2, w2). + Pit (angle < 0) or NaN returns all zeros. 
+ """ + if angle < 0.0 or angle != angle: # pit or NaN + return 0, 0, 0.0, 0, 0, 0.0 + + pi_over_4 = 0.7853981633974483 # np.pi / 4 + k = int(angle / pi_over_4) + if k > 7: + k = 7 + alpha = angle - k * pi_over_4 + w1 = 1.0 - alpha / pi_over_4 + w2 = alpha / pi_over_4 + + if k == 0: + dy1, dx1 = 0, 1 # E + dy2, dx2 = -1, 1 # NE + elif k == 1: + dy1, dx1 = -1, 1 # NE + dy2, dx2 = -1, 0 # N + elif k == 2: + dy1, dx1 = -1, 0 # N + dy2, dx2 = -1, -1 # NW + elif k == 3: + dy1, dx1 = -1, -1 # NW + dy2, dx2 = 0, -1 # W + elif k == 4: + dy1, dx1 = 0, -1 # W + dy2, dx2 = 1, -1 # SW + elif k == 5: + dy1, dx1 = 1, -1 # SW + dy2, dx2 = 1, 0 # S + elif k == 6: + dy1, dx1 = 1, 0 # S + dy2, dx2 = 1, 1 # SE + else: # k == 7 + dy1, dx1 = 1, 1 # SE + dy2, dx2 = 0, 1 # E + + return dy1, dx1, w1, dy2, dx2, w2 + + +# ===================================================================== +# CPU kernel +# ===================================================================== + +@ngjit +def _flow_accum_dinf_cpu(flow_dir, height, width): + """Kahn's BFS topological sort for Dinf flow accumulation.""" + accum = np.empty((height, width), dtype=np.float64) + in_degree = np.zeros((height, width), dtype=np.int32) + valid = np.zeros((height, width), dtype=np.int8) + + for r in range(height): + for c in range(width): + v = flow_dir[r, c] + if v == v: # not NaN + valid[r, c] = 1 + accum[r, c] = 1.0 + else: + accum[r, c] = np.nan + + for r in range(height): + for c in range(width): + if valid[r, c] == 0: + continue + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c]) + if w1 > 0.0: + nr, nc = r + dy1, c + dx1 + if 0 <= nr < height and 0 <= nc < width and valid[nr, nc] == 1: + in_degree[nr, nc] += 1 + if w2 > 0.0: + nr, nc = r + dy2, c + dx2 + if 0 <= nr < height and 0 <= nc < width and valid[nr, nc] == 1: + in_degree[nr, nc] += 1 + + queue_r = np.empty(height * width, dtype=np.int64) + queue_c = np.empty(height * width, dtype=np.int64) + head = np.int64(0) + tail = np.int64(0) + + for r 
in range(height): + for c in range(width): + if valid[r, c] == 1 and in_degree[r, c] == 0: + queue_r[tail] = r + queue_c[tail] = c + tail += 1 + + while head < tail: + r = queue_r[head] + c = queue_c[head] + head += 1 + + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c]) + + if w1 > 0.0: + nr, nc = r + dy1, c + dx1 + if 0 <= nr < height and 0 <= nc < width and valid[nr, nc] == 1: + accum[nr, nc] += accum[r, c] * w1 + in_degree[nr, nc] -= 1 + if in_degree[nr, nc] == 0: + queue_r[tail] = nr + queue_c[tail] = nc + tail += 1 + + if w2 > 0.0: + nr, nc = r + dy2, c + dx2 + if 0 <= nr < height and 0 <= nc < width and valid[nr, nc] == 1: + accum[nr, nc] += accum[r, c] * w2 + in_degree[nr, nc] -= 1 + if in_degree[nr, nc] == 0: + queue_r[tail] = nr + queue_c[tail] = nc + tail += 1 + + return accum + + +# ===================================================================== +# GPU kernels +# ===================================================================== + +@cuda.jit +def _init_accum_indegree_dinf(flow_dir, accum, in_degree, state, H, W): + """Initialise accum/in_degree/state for Dinf on GPU.""" + i, j = cuda.grid(2) + if i >= H or j >= W: + return + + v = flow_dir[i, j] + if v != v: # NaN + state[i, j] = 0 + accum[i, j] = 0.0 + return + + state[i, j] = 1 + accum[i, j] = 1.0 + + if v < 0.0: # pit + return + + pi_over_4 = 0.7853981633974483 + k = int(v / pi_over_4) + if k > 7: + k = 7 + alpha = v - k * pi_over_4 + w1 = 1.0 - alpha / pi_over_4 + w2 = alpha / pi_over_4 + + # Facet offsets (inline) + if k == 0: + dy1, dx1 = 0, 1 + dy2, dx2 = -1, 1 + elif k == 1: + dy1, dx1 = -1, 1 + dy2, dx2 = -1, 0 + elif k == 2: + dy1, dx1 = -1, 0 + dy2, dx2 = -1, -1 + elif k == 3: + dy1, dx1 = -1, -1 + dy2, dx2 = 0, -1 + elif k == 4: + dy1, dx1 = 0, -1 + dy2, dx2 = 1, -1 + elif k == 5: + dy1, dx1 = 1, -1 + dy2, dx2 = 1, 0 + elif k == 6: + dy1, dx1 = 1, 0 + dy2, dx2 = 1, 1 + else: + dy1, dx1 = 1, 1 + dy2, dx2 = 0, 1 + + if w1 > 0.0: + ni, nj = i + dy1, j + dx1 + if 0 <= ni 
< H and 0 <= nj < W: + cuda.atomic.add(in_degree, (ni, nj), 1) + if w2 > 0.0: + ni, nj = i + dy2, j + dx2 + if 0 <= ni < H and 0 <= nj < W: + cuda.atomic.add(in_degree, (ni, nj), 1) + + +@cuda.jit +def _pull_from_frontier_dinf(flow_dir, accum, in_degree, state, H, W): + """Active Dinf cells pull accumulation from frontier neighbours.""" + i, j = cuda.grid(2) + if i >= H or j >= W: + return + + if state[i, j] != 1: + return + + pi_over_4 = 0.7853981633974483 + + for nbr in range(8): + if nbr == 0: + dy, dx = 0, 1 + elif nbr == 1: + dy, dx = 1, 1 + elif nbr == 2: + dy, dx = 1, 0 + elif nbr == 3: + dy, dx = 1, -1 + elif nbr == 4: + dy, dx = 0, -1 + elif nbr == 5: + dy, dx = -1, -1 + elif nbr == 6: + dy, dx = -1, 0 + else: + dy, dx = -1, 1 + + ni = i + dy + nj = j + dx + if ni < 0 or ni >= H or nj < 0 or nj >= W: + continue + if state[ni, nj] != 2: + continue + + nv = flow_dir[ni, nj] + if nv < 0.0 or nv != nv: + continue + + nk = int(nv / pi_over_4) + if nk > 7: + nk = 7 + nalpha = nv - nk * pi_over_4 + nw1 = 1.0 - nalpha / pi_over_4 + nw2 = nalpha / pi_over_4 + + # Inline facet offsets for neighbor's direction + if nk == 0: + ndy1, ndx1 = 0, 1 + ndy2, ndx2 = -1, 1 + elif nk == 1: + ndy1, ndx1 = -1, 1 + ndy2, ndx2 = -1, 0 + elif nk == 2: + ndy1, ndx1 = -1, 0 + ndy2, ndx2 = -1, -1 + elif nk == 3: + ndy1, ndx1 = -1, -1 + ndy2, ndx2 = 0, -1 + elif nk == 4: + ndy1, ndx1 = 0, -1 + ndy2, ndx2 = 1, -1 + elif nk == 5: + ndy1, ndx1 = 1, -1 + ndy2, ndx2 = 1, 0 + elif nk == 6: + ndy1, ndx1 = 1, 0 + ndy2, ndx2 = 1, 1 + else: + ndy1, ndx1 = 1, 1 + ndy2, ndx2 = 0, 1 + + if nw1 > 0.0 and ni + ndy1 == i and nj + ndx1 == j: + accum[i, j] += accum[ni, nj] * nw1 + in_degree[i, j] -= 1 + if nw2 > 0.0 and ni + ndy2 == i and nj + ndx2 == j: + accum[i, j] += accum[ni, nj] * nw2 + in_degree[i, j] -= 1 + + +def _flow_accum_dinf_cupy(flow_dir_data): + """GPU driver: iterative frontier peeling for Dinf.""" + import cupy as cp + + H, W = flow_dir_data.shape + flow_dir_f64 = 
flow_dir_data.astype(cp.float64) + + accum = cp.zeros((H, W), dtype=cp.float64) + in_degree = cp.zeros((H, W), dtype=cp.int32) + state = cp.zeros((H, W), dtype=cp.int32) + changed = cp.zeros(1, dtype=cp.int32) + + griddim, blockdim = cuda_args((H, W)) + + _init_accum_indegree_dinf[griddim, blockdim]( + flow_dir_f64, accum, in_degree, state, H, W) + + max_iter = H * W + for _ in range(max_iter): + changed[0] = 0 + _find_ready_and_finalize[griddim, blockdim]( + in_degree, state, changed, H, W) + + if int(changed[0]) == 0: + break + + _pull_from_frontier_dinf[griddim, blockdim]( + flow_dir_f64, accum, in_degree, state, H, W) + + accum = cp.where(state == 0, cp.nan, accum) + return accum + + +def _flow_accum_dinf_tile_cupy(flow_dir_data, + seed_top, seed_bottom, seed_left, seed_right, + seed_tl, seed_tr, seed_bl, seed_br): + """GPU seeded Dinf flow accumulation for a single tile.""" + import cupy as cp + + H, W = flow_dir_data.shape + flow_dir_f64 = flow_dir_data.astype(cp.float64) + + accum = cp.zeros((H, W), dtype=cp.float64) + in_degree = cp.zeros((H, W), dtype=cp.int32) + state = cp.zeros((H, W), dtype=cp.int32) + changed = cp.zeros(1, dtype=cp.int32) + + griddim, blockdim = cuda_args((H, W)) + + _init_accum_indegree_dinf[griddim, blockdim]( + flow_dir_f64, accum, in_degree, state, H, W) + + accum[0, :] += cp.asarray(seed_top) + accum[H - 1, :] += cp.asarray(seed_bottom) + accum[:, 0] += cp.asarray(seed_left) + accum[:, W - 1] += cp.asarray(seed_right) + accum[0, 0] += seed_tl + accum[0, W - 1] += seed_tr + accum[H - 1, 0] += seed_bl + accum[H - 1, W - 1] += seed_br + + max_iter = H * W + for _ in range(max_iter): + changed[0] = 0 + _find_ready_and_finalize[griddim, blockdim]( + in_degree, state, changed, H, W) + + if int(changed[0]) == 0: + break + + _pull_from_frontier_dinf[griddim, blockdim]( + flow_dir_f64, accum, in_degree, state, H, W) + + accum = cp.where(state == 0, cp.nan, accum) + return accum + + +# 
===================================================================== +# Dinf tile kernel for dask +# ===================================================================== + +@ngjit +def _flow_accum_dinf_tile_kernel(flow_dir, h, w, + seed_top, seed_bottom, + seed_left, seed_right, + seed_tl, seed_tr, seed_bl, seed_br): + """Seeded BFS Dinf flow accumulation for a single tile.""" + accum = np.empty((h, w), dtype=np.float64) + in_degree = np.zeros((h, w), dtype=np.int32) + valid = np.zeros((h, w), dtype=np.int8) + + for r in range(h): + for c in range(w): + v = flow_dir[r, c] + if v == v: + valid[r, c] = 1 + accum[r, c] = 1.0 + else: + accum[r, c] = np.nan + + # Add external seeds + for c in range(w): + if valid[0, c] == 1: + accum[0, c] += seed_top[c] + if valid[h - 1, c] == 1: + accum[h - 1, c] += seed_bottom[c] + for r in range(h): + if valid[r, 0] == 1: + accum[r, 0] += seed_left[r] + if valid[r, w - 1] == 1: + accum[r, w - 1] += seed_right[r] + if valid[0, 0] == 1: + accum[0, 0] += seed_tl + if valid[0, w - 1] == 1: + accum[0, w - 1] += seed_tr + if valid[h - 1, 0] == 1: + accum[h - 1, 0] += seed_bl + if valid[h - 1, w - 1] == 1: + accum[h - 1, w - 1] += seed_br + + # In-degrees via angle decomposition + for r in range(h): + for c in range(w): + if valid[r, c] == 0: + continue + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c]) + if w1 > 0.0: + nr, nc = r + dy1, c + dx1 + if 0 <= nr < h and 0 <= nc < w and valid[nr, nc] == 1: + in_degree[nr, nc] += 1 + if w2 > 0.0: + nr, nc = r + dy2, c + dx2 + if 0 <= nr < h and 0 <= nc < w and valid[nr, nc] == 1: + in_degree[nr, nc] += 1 + + queue_r = np.empty(h * w, dtype=np.int64) + queue_c = np.empty(h * w, dtype=np.int64) + head = np.int64(0) + tail = np.int64(0) + + for r in range(h): + for c in range(w): + if valid[r, c] == 1 and in_degree[r, c] == 0: + queue_r[tail] = r + queue_c[tail] = c + tail += 1 + + while head < tail: + r = queue_r[head] + c = queue_c[head] + head += 1 + + dy1, dx1, w1, dy2, dx2, w2 
= _angle_to_neighbors(flow_dir[r, c]) + + if w1 > 0.0: + nr, nc = r + dy1, c + dx1 + if 0 <= nr < h and 0 <= nc < w and valid[nr, nc] == 1: + accum[nr, nc] += accum[r, c] * w1 + in_degree[nr, nc] -= 1 + if in_degree[nr, nc] == 0: + queue_r[tail] = nr + queue_c[tail] = nc + tail += 1 + + if w2 > 0.0: + nr, nc = r + dy2, c + dx2 + if 0 <= nr < h and 0 <= nc < w and valid[nr, nc] == 1: + accum[nr, nc] += accum[r, c] * w2 + in_degree[nr, nc] -= 1 + if in_degree[nr, nc] == 0: + queue_r[tail] = nr + queue_c[tail] = nc + tail += 1 + + return accum + + +# ===================================================================== +# Dinf dask iterative tile sweep +# ===================================================================== + +def _compute_seeds_dinf(iy, ix, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x): + """Compute seed arrays for tile (iy, ix) using Dinf angle decomposition.""" + tile_h = chunks_y[iy] + tile_w = chunks_x[ix] + + seed_top = np.zeros(tile_w, dtype=np.float64) + seed_bottom = np.zeros(tile_w, dtype=np.float64) + seed_left = np.zeros(tile_h, dtype=np.float64) + seed_right = np.zeros(tile_h, dtype=np.float64) + seed_tl = 0.0 + seed_tr = 0.0 + seed_bl = 0.0 + seed_br = 0.0 + + # --- Top edge: bottom row of tile above flows south (dy=1) --- + if iy > 0: + nb_fdir = flow_bdry.get('bottom', iy - 1, ix) + nb_accum = boundaries.get('bottom', iy - 1, ix) + for c in range(len(nb_fdir)): + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(nb_fdir[c]) + if w1 > 0.0 and dy1 == 1: + tc = c + dx1 + if 0 <= tc < tile_w: + seed_top[tc] += nb_accum[c] * w1 + if w2 > 0.0 and dy2 == 1: + tc = c + dx2 + if 0 <= tc < tile_w: + seed_top[tc] += nb_accum[c] * w2 + + # --- Bottom edge: top row of tile below flows north (dy=-1) --- + if iy < n_tile_y - 1: + nb_fdir = flow_bdry.get('top', iy + 1, ix) + nb_accum = boundaries.get('top', iy + 1, ix) + for c in range(len(nb_fdir)): + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(nb_fdir[c]) + if w1 > 0.0 and dy1 
== -1: + tc = c + dx1 + if 0 <= tc < tile_w: + seed_bottom[tc] += nb_accum[c] * w1 + if w2 > 0.0 and dy2 == -1: + tc = c + dx2 + if 0 <= tc < tile_w: + seed_bottom[tc] += nb_accum[c] * w2 + + # --- Left edge: right column of tile to the left flows east (dx=1) --- + if ix > 0: + nb_fdir = flow_bdry.get('right', iy, ix - 1) + nb_accum = boundaries.get('right', iy, ix - 1) + for r in range(len(nb_fdir)): + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(nb_fdir[r]) + if w1 > 0.0 and dx1 == 1: + tr = r + dy1 + if 0 <= tr < tile_h: + seed_left[tr] += nb_accum[r] * w1 + if w2 > 0.0 and dx2 == 1: + tr = r + dy2 + if 0 <= tr < tile_h: + seed_left[tr] += nb_accum[r] * w2 + + # --- Right edge: left column of tile to the right flows west (dx=-1) --- + if ix < n_tile_x - 1: + nb_fdir = flow_bdry.get('left', iy, ix + 1) + nb_accum = boundaries.get('left', iy, ix + 1) + for r in range(len(nb_fdir)): + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(nb_fdir[r]) + if w1 > 0.0 and dx1 == -1: + tr = r + dy1 + if 0 <= tr < tile_h: + seed_right[tr] += nb_accum[r] * w1 + if w2 > 0.0 and dx2 == -1: + tr = r + dy2 + if 0 <= tr < tile_h: + seed_right[tr] += nb_accum[r] * w2 + + # --- Diagonal corner seeds --- + # TL: bottom-right of (iy-1, ix-1) flows SE (dy=1, dx=1) + if iy > 0 and ix > 0: + fv = flow_bdry.get('bottom', iy - 1, ix - 1)[-1] + av = float(boundaries.get('bottom', iy - 1, ix - 1)[-1]) + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(fv) + if w1 > 0.0 and dy1 == 1 and dx1 == 1: + seed_tl += av * w1 + if w2 > 0.0 and dy2 == 1 and dx2 == 1: + seed_tl += av * w2 + + # TR: bottom-left of (iy-1, ix+1) flows SW (dy=1, dx=-1) + if iy > 0 and ix < n_tile_x - 1: + fv = flow_bdry.get('bottom', iy - 1, ix + 1)[0] + av = float(boundaries.get('bottom', iy - 1, ix + 1)[0]) + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(fv) + if w1 > 0.0 and dy1 == 1 and dx1 == -1: + seed_tr += av * w1 + if w2 > 0.0 and dy2 == 1 and dx2 == -1: + seed_tr += av * w2 + + # BL: top-right of (iy+1, ix-1) 
flows NE (dy=-1, dx=1) + if iy < n_tile_y - 1 and ix > 0: + fv = flow_bdry.get('top', iy + 1, ix - 1)[-1] + av = float(boundaries.get('top', iy + 1, ix - 1)[-1]) + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(fv) + if w1 > 0.0 and dy1 == -1 and dx1 == 1: + seed_bl += av * w1 + if w2 > 0.0 and dy2 == -1 and dx2 == 1: + seed_bl += av * w2 + + # BR: top-left of (iy+1, ix+1) flows NW (dy=-1, dx=-1) + if iy < n_tile_y - 1 and ix < n_tile_x - 1: + fv = flow_bdry.get('top', iy + 1, ix + 1)[0] + av = float(boundaries.get('top', iy + 1, ix + 1)[0]) + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(fv) + if w1 > 0.0 and dy1 == -1 and dx1 == -1: + seed_br += av * w1 + if w2 > 0.0 and dy2 == -1 and dx2 == -1: + seed_br += av * w2 + + return (seed_top, seed_bottom, seed_left, seed_right, + seed_tl, seed_tr, seed_bl, seed_br) + + +def _process_tile_dinf(iy, ix, flow_dir_da, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x): + """Run seeded Dinf BFS on one tile; update boundaries in-place.""" + chunk = np.asarray( + flow_dir_da.blocks[iy, ix].compute(), dtype=np.float64) + h, w = chunk.shape + + seeds = _compute_seeds_dinf( + iy, ix, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x) + + accum = _flow_accum_dinf_tile_kernel(chunk, h, w, *seeds) + + new_top = accum[0, :].copy() + new_bottom = accum[-1, :].copy() + new_left = accum[:, 0].copy() + new_right = accum[:, -1].copy() + + change = 0.0 + for side, new in (('top', new_top), ('bottom', new_bottom), + ('left', new_left), ('right', new_right)): + old = boundaries.get(side, iy, ix) + with np.errstate(invalid='ignore'): + diff = np.abs(new - old) + diff = np.where(np.isnan(diff), 0.0, diff) + m = float(np.max(diff)) + if m > change: + change = m + + boundaries.set('top', iy, ix, new_top) + boundaries.set('bottom', iy, ix, new_bottom) + boundaries.set('left', iy, ix, new_left) + boundaries.set('right', iy, ix, new_right) + + return change + + +def _flow_accum_dinf_dask_iterative(flow_dir_da): + 
"""Iterative boundary-propagation for Dinf dask arrays.""" + chunks_y = flow_dir_da.chunks[0] + chunks_x = flow_dir_da.chunks[1] + n_tile_y = len(chunks_y) + n_tile_x = len(chunks_x) + + flow_bdry = _preprocess_tiles(flow_dir_da, chunks_y, chunks_x) + flow_bdry = flow_bdry.snapshot() + + boundaries = BoundaryStore(chunks_y, chunks_x, fill_value=0.0) + + max_iterations = max(n_tile_y, n_tile_x) + 10 + + for _iteration in range(max_iterations): + max_change = 0.0 + + for iy in range(n_tile_y): + for ix in range(n_tile_x): + c = _process_tile_dinf(iy, ix, flow_dir_da, boundaries, + flow_bdry, chunks_y, chunks_x, + n_tile_y, n_tile_x) + if c > max_change: + max_change = c + + for iy in reversed(range(n_tile_y)): + for ix in reversed(range(n_tile_x)): + c = _process_tile_dinf(iy, ix, flow_dir_da, boundaries, + flow_bdry, chunks_y, chunks_x, + n_tile_y, n_tile_x) + if c > max_change: + max_change = c + + if max_change == 0.0: + break + + boundaries = boundaries.snapshot() + + return _assemble_result_dinf(flow_dir_da, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x) + + +def _assemble_result_dinf(flow_dir_da, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x): + """Build lazy dask array by re-running each Dinf tile with converged seeds.""" + + def _tile_fn(flow_dir_block, block_info=None): + if block_info is None or 0 not in block_info: + return np.full(flow_dir_block.shape, np.nan, dtype=np.float64) + iy, ix = block_info[0]['chunk-location'] + h, w = flow_dir_block.shape + seeds = _compute_seeds_dinf( + iy, ix, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x) + return _flow_accum_dinf_tile_kernel( + np.asarray(flow_dir_block, dtype=np.float64), h, w, *seeds) + + return da.map_blocks( + _tile_fn, + flow_dir_da, + dtype=np.float64, + meta=np.array((), dtype=np.float64), + ) + + +def _process_tile_dinf_cupy(iy, ix, flow_dir_da, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x): + """Run seeded GPU Dinf flow 
accumulation on one tile.""" + import cupy as cp + + chunk = cp.asarray( + flow_dir_da.blocks[iy, ix].compute(), dtype=cp.float64) + + seeds = _compute_seeds_dinf( + iy, ix, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x) + + accum = _flow_accum_dinf_tile_cupy(chunk, *seeds) + + new_top = accum[0, :].get().copy() + new_bottom = accum[-1, :].get().copy() + new_left = accum[:, 0].get().copy() + new_right = accum[:, -1].get().copy() + + change = 0.0 + for side, new in (('top', new_top), ('bottom', new_bottom), + ('left', new_left), ('right', new_right)): + old = boundaries.get(side, iy, ix) + with np.errstate(invalid='ignore'): + diff = np.abs(new - old) + diff = np.where(np.isnan(diff), 0.0, diff) + m = float(np.max(diff)) + if m > change: + change = m + + boundaries.set('top', iy, ix, new_top) + boundaries.set('bottom', iy, ix, new_bottom) + boundaries.set('left', iy, ix, new_left) + boundaries.set('right', iy, ix, new_right) + + return change + + +def _assemble_result_dinf_cupy(flow_dir_da, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x): + """Build lazy dask+cupy array using GPU Dinf tile kernel.""" + import cupy as cp + + def _tile_fn(flow_dir_block, block_info=None): + if block_info is None or 0 not in block_info: + return cp.full(flow_dir_block.shape, cp.nan, dtype=cp.float64) + iy, ix = block_info[0]['chunk-location'] + seeds = _compute_seeds_dinf( + iy, ix, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x) + return _flow_accum_dinf_tile_cupy( + cp.asarray(flow_dir_block, dtype=cp.float64), *seeds) + + return da.map_blocks( + _tile_fn, + flow_dir_da, + dtype=np.float64, + meta=cp.array((), dtype=cp.float64), + ) + + +def _flow_accum_dinf_dask_cupy(flow_dir_da): + """Dask+CuPy Dinf: native GPU processing per tile.""" + chunks_y = flow_dir_da.chunks[0] + chunks_x = flow_dir_da.chunks[1] + n_tile_y = len(chunks_y) + n_tile_x = len(chunks_x) + + flow_bdry = _preprocess_tiles(flow_dir_da, chunks_y, chunks_x) + 
flow_bdry = flow_bdry.snapshot() + + boundaries = BoundaryStore(chunks_y, chunks_x, fill_value=0.0) + + max_iterations = max(n_tile_y, n_tile_x) + 10 + + for _iteration in range(max_iterations): + max_change = 0.0 + + for iy in range(n_tile_y): + for ix in range(n_tile_x): + c = _process_tile_dinf_cupy( + iy, ix, flow_dir_da, boundaries, + flow_bdry, chunks_y, chunks_x, + n_tile_y, n_tile_x) + if c > max_change: + max_change = c + + for iy in reversed(range(n_tile_y)): + for ix in reversed(range(n_tile_x)): + c = _process_tile_dinf_cupy( + iy, ix, flow_dir_da, boundaries, + flow_bdry, chunks_y, chunks_x, + n_tile_y, n_tile_x) + if c > max_change: + max_change = c + + if max_change == 0.0: + break + + boundaries = boundaries.snapshot() + + return _assemble_result_dinf_cupy(flow_dir_da, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x) + + +# ===================================================================== +# Public API +# ===================================================================== + +@supports_dataset +def flow_accumulation_dinf(flow_dir: xr.DataArray, + name: str = 'flow_accumulation') -> xr.DataArray: + """Compute flow accumulation from a D-infinity flow direction grid. + + Takes a continuous-angle flow direction grid (as produced by + ``flow_direction_dinf``) and accumulates upstream contributing + area. Flow is split proportionally between two neighbors + following Tarboton (1997). + + Parameters + ---------- + flow_dir : xarray.DataArray or xr.Dataset + 2-D D-infinity flow direction grid. Values are continuous + angles in radians ``[0, 2*pi)``, with ``-1.0`` for pits and + NaN for nodata (as produced by ``flow_direction_dinf``). + Supported backends: NumPy, CuPy, NumPy-backed Dask, + CuPy-backed Dask. + If a Dataset is passed, the operation is applied to each + data variable independently. + name : str, default='flow_accumulation' + Name of output DataArray. 
+ + Returns + ------- + xarray.DataArray or xr.Dataset + 2-D float64 array of flow accumulation values. Each cell + holds the total upstream contributing area (including itself) + that drains through it, weighted by D-inf proportional + splitting. NaN where the input has NaN. + + References + ---------- + Tarboton, D.G. (1997). A new method for the determination of flow + directions and upslope areas in grid digital elevation models. + Water Resources Research, 33(2), 309-319. + """ + _validate_raster(flow_dir, func_name='flow_accumulation_dinf', + name='flow_dir') + + data = flow_dir.data + + if isinstance(data, np.ndarray): + out = _flow_accum_dinf_cpu(data.astype(np.float64), *data.shape) + elif has_cuda_and_cupy() and is_cupy_array(data): + out = _flow_accum_dinf_cupy(data) + elif has_cuda_and_cupy() and is_dask_cupy(flow_dir): + out = _flow_accum_dinf_dask_cupy(data) + elif da is not None and isinstance(data, da.Array): + out = _flow_accum_dinf_dask_iterative(data) + else: + raise TypeError(f"Unsupported array type: {type(data)}") + + return xr.DataArray(out, + name=name, + coords=flow_dir.coords, + dims=flow_dir.dims, + attrs=flow_dir.attrs) diff --git a/xrspatial/flow_direction_dinf.py b/xrspatial/flow_direction_dinf.py index 34987aa4..a1e69757 100644 --- a/xrspatial/flow_direction_dinf.py +++ b/xrspatial/flow_direction_dinf.py @@ -51,10 +51,14 @@ def _cpu(data, cellsize_x, cellsize_y): nb_dy = np.array([0, -1, -1, -1, 0, 1, 1, 1]) nb_dx = np.array([1, 1, 0, -1, -1, -1, 0, 1]) - # d1 for each facet (distance from center to e1) - d1 = np.array([cx, diag, cy, diag, cx, diag, cy, diag]) - # d2 for each facet (distance from e1 to e2) + # Tarboton facet decomposition: e1=cardinal, e2=diagonal + e1_idx = np.array([0, 2, 2, 4, 4, 6, 6, 0]) # cardinal neighbor + e2_idx = np.array([1, 1, 3, 3, 5, 5, 7, 7]) # diagonal neighbor + # d1: center->cardinal, d2: cardinal->diagonal (perpendicular) + d1 = np.array([cx, cy, cy, cx, cx, cy, cy, cx]) d2 = np.array([cy, cx, cx, 
cy, cy, cx, cx, cy]) + ac = np.array([0, 2, 2, 4, 4, 6, 6, 8]) # angle base (pi/4 units) + af = np.array([1, -1, 1, -1, 1, -1, 1, -1]) # angle sign for y in range(1, rows - 1): for x in range(1, cols - 1): @@ -76,9 +80,8 @@ def _cpu(data, cellsize_x, cellsize_y): best_angle = -1.0 for k in range(8): - k2 = (k + 1) % 8 - e1 = data[y + nb_dy[k], x + nb_dx[k]] - e2 = data[y + nb_dy[k2], x + nb_dx[k2]] + e1 = data[y + nb_dy[e1_idx[k]], x + nb_dx[e1_idx[k]]] + e2 = data[y + nb_dy[e2_idx[k]], x + nb_dx[e2_idx[k]]] s1 = (center - e1) / d1[k] s2 = (e1 - e2) / d2[k] @@ -89,13 +92,13 @@ def _cpu(data, cellsize_x, cellsize_y): s = s1 elif r > pi_over_4: r = pi_over_4 - s = (center - e2) / math.sqrt(d1[k] * d1[k] + d2[k] * d2[k]) + s = (center - e2) / diag else: s = math.sqrt(s1 * s1 + s2 * s2) if s > max_slope: max_slope = s - best_angle = k * pi_over_4 + r + best_angle = ac[k] * pi_over_4 + af[k] * r if max_slope <= 0.0: out[y, x] = -1.0 @@ -158,21 +161,21 @@ def _gpu(arr, cellsize_x, cellsize_y): max_slope = slope_val best_angle = r # 0 + r - # Facet 1: e1=NE, e2=N, d1=diag, d2=cx, start=pi/4 - s1 = (center - ne) / diag - s2 = (ne - n) / cx + # Facet 1: e1=N, e2=NE, d1=cy, d2=cx, angle=pi/2-r + s1 = (center - n) / cy + s2 = (n - ne) / cx r = math.atan2(s2, s1) if r < 0.0: r = 0.0 slope_val = s1 elif r > pi_over_4: r = pi_over_4 - slope_val = (center - n) / (diag * diag + cx * cx) ** 0.5 + slope_val = (center - ne) / diag else: slope_val = (s1 * s1 + s2 * s2) ** 0.5 if slope_val > max_slope: max_slope = slope_val - best_angle = pi_over_4 + r + best_angle = 2.0 * pi_over_4 - r # Facet 2: e1=N, e2=NW, d1=cy, d2=cx, start=pi/2 s1 = (center - n) / cy @@ -190,21 +193,21 @@ def _gpu(arr, cellsize_x, cellsize_y): max_slope = slope_val best_angle = 2.0 * pi_over_4 + r - # Facet 3: e1=NW, e2=W, d1=diag, d2=cy, start=3*pi/4 - s1 = (center - nw) / diag - s2 = (nw - w) / cy + # Facet 3: e1=W, e2=NW, d1=cx, d2=cy, angle=pi-r + s1 = (center - w) / cx + s2 = (w - nw) / cy r = 
math.atan2(s2, s1) if r < 0.0: r = 0.0 slope_val = s1 elif r > pi_over_4: r = pi_over_4 - slope_val = (center - w) / (diag * diag + cy * cy) ** 0.5 + slope_val = (center - nw) / diag else: slope_val = (s1 * s1 + s2 * s2) ** 0.5 if slope_val > max_slope: max_slope = slope_val - best_angle = 3.0 * pi_over_4 + r + best_angle = 4.0 * pi_over_4 - r # Facet 4: e1=W, e2=SW, d1=cx, d2=cy, start=pi s1 = (center - w) / cx @@ -222,21 +225,21 @@ def _gpu(arr, cellsize_x, cellsize_y): max_slope = slope_val best_angle = 4.0 * pi_over_4 + r - # Facet 5: e1=SW, e2=S, d1=diag, d2=cx, start=5*pi/4 - s1 = (center - sw) / diag - s2 = (sw - s) / cx + # Facet 5: e1=S, e2=SW, d1=cy, d2=cx, angle=3*pi/2-r + s1 = (center - s) / cy + s2 = (s - sw) / cx r = math.atan2(s2, s1) if r < 0.0: r = 0.0 slope_val = s1 elif r > pi_over_4: r = pi_over_4 - slope_val = (center - s) / (diag * diag + cx * cx) ** 0.5 + slope_val = (center - sw) / diag else: slope_val = (s1 * s1 + s2 * s2) ** 0.5 if slope_val > max_slope: max_slope = slope_val - best_angle = 5.0 * pi_over_4 + r + best_angle = 6.0 * pi_over_4 - r # Facet 6: e1=S, e2=SE, d1=cy, d2=cx, start=3*pi/2 s1 = (center - s) / cy @@ -254,21 +257,21 @@ def _gpu(arr, cellsize_x, cellsize_y): max_slope = slope_val best_angle = 6.0 * pi_over_4 + r - # Facet 7: e1=SE, e2=E, d1=diag, d2=cy, start=7*pi/4 - s1 = (center - se) / diag - s2 = (se - e) / cy + # Facet 7: e1=E, e2=SE, d1=cx, d2=cy, angle=2*pi-r + s1 = (center - e) / cx + s2 = (e - se) / cy r = math.atan2(s2, s1) if r < 0.0: r = 0.0 slope_val = s1 elif r > pi_over_4: r = pi_over_4 - slope_val = (center - e) / (diag * diag + cy * cy) ** 0.5 + slope_val = (center - se) / diag else: slope_val = (s1 * s1 + s2 * s2) ** 0.5 if slope_val > max_slope: max_slope = slope_val - best_angle = 7.0 * pi_over_4 + r + best_angle = 8.0 * pi_over_4 - r if max_slope <= 0.0: return -1.0 diff --git a/xrspatial/rasterize.py b/xrspatial/rasterize.py index 164b8216..d8f4519d 100644 --- a/xrspatial/rasterize.py +++ 
b/xrspatial/rasterize.py @@ -11,6 +11,7 @@ """ from __future__ import annotations +import warnings from typing import Optional, Tuple, Union import numpy as np @@ -136,13 +137,74 @@ def _classify_geometries(geometries, props_array): Where poly_props is (N_poly, P), line_props is (N_line, P), point_props is (N_point, P) float64 arrays. """ + if _HAS_SHAPELY2: + return _classify_geometries_vectorized(geometries, props_array) + return _classify_geometries_loop(geometries, props_array) + + +def _classify_geometries_vectorized(geometries, props_array): + """Vectorized classification using shapely 2.0 array ops.""" + import shapely + + n_props = props_array.shape[1] if props_array.ndim == 2 else 1 + geom_arr = np.array(geometries, dtype=object) + n = len(geom_arr) + + if n == 0: + empty_props = np.empty((0, n_props), dtype=np.float64) + return (([], empty_props, []), + ([], empty_props.copy()), + ([], empty_props.copy())) + + type_ids = shapely.get_type_id(geom_arr) + empty = shapely.is_empty(geom_arr) + valid = ~empty + + # Type ID mapping: + # 0=Point, 1=LineString, 2=LinearRing, 3=Polygon, + # 4=MultiPoint, 5=MultiLineString, 6=MultiPolygon, + # 7=GeometryCollection + poly_mask = valid & ((type_ids == 3) | (type_ids == 6)) + line_mask = valid & ((type_ids == 1) | (type_ids == 5)) + point_mask = valid & ((type_ids == 0) | (type_ids == 4)) + gc_mask = valid & (type_ids == 7) + + has_gc = np.any(gc_mask) + + # Fast path: no GeometryCollections (common case) + if not has_gc: + poly_idx = np.where(poly_mask)[0] + line_idx = np.where(line_mask)[0] + point_idx = np.where(point_mask)[0] + + poly_geoms = [geometries[i] for i in poly_idx] + poly_ids = list(range(len(poly_idx))) + poly_props = (props_array[poly_idx] if len(poly_idx) > 0 + else np.empty((0, n_props), dtype=np.float64)) + + line_geoms = [geometries[i] for i in line_idx] + line_props = (props_array[line_idx] if len(line_idx) > 0 + else np.empty((0, n_props), dtype=np.float64)) + + point_geoms = [geometries[i] for i 
in point_idx] + point_props = (props_array[point_idx] if len(point_idx) > 0 + else np.empty((0, n_props), dtype=np.float64)) + + return ((poly_geoms, poly_props, poly_ids), + (line_geoms, line_props), + (point_geoms, point_props)) + + # Slow path: unpack GeometryCollections, then classify + return _classify_geometries_loop(geometries, props_array) + + +def _classify_geometries_loop(geometries, props_array): + """Per-object classification fallback (handles GeometryCollections).""" n_props = props_array.shape[1] if props_array.ndim == 2 else 1 poly_geoms, poly_prop_rows, poly_ids = [], [], [] line_geoms, line_prop_rows = [], [] point_geoms, point_prop_rows = [], [] - # poly_counter tracks per-type indices that both preserve input order - # and serve as valid indices into the poly_props table. poly_counter = [0] def _classify_one(geom, prop_row, global_idx): @@ -1083,12 +1145,12 @@ def _scanline_fill_gpu(out, written, edge_y_min, edge_x_at_ymin, if count == 0: return - MAX_ISECT = 512 + MAX_ISECT = 2048 if count > MAX_ISECT: count = MAX_ISECT - xs = cuda.local.array(512, dtype=np.float64) - gs = cuda.local.array(512, dtype=np.int32) + xs = cuda.local.array(2048, dtype=np.float64) + gs = cuda.local.array(2048, dtype=np.int32) actual = 0 for k in range(start, end): @@ -1287,6 +1349,18 @@ def _run_cupy(geometries, props_array, bounds, height, width, fill, dtype, row_ptr, col_idx = _build_row_csr_numba( edge_y_min, edge_y_max, height) + # Check for rows exceeding the GPU kernel's local array limit + _GPU_MAX_ISECT = 2048 + max_edges_per_row = int(np.diff(row_ptr).max()) + if max_edges_per_row > _GPU_MAX_ISECT: + warnings.warn( + f"rasterize CUDA kernel: {max_edges_per_row} active edges " + f"in a single row exceeds the limit of {_GPU_MAX_ISECT}. " + f"Excess edges will be silently dropped, causing incorrect " + f"results. 
Reduce polygon density or use the numpy backend.", + stacklevel=3, + ) + d_y_min = cupy.asarray(edge_y_min) d_x_at_ymin = cupy.asarray(edge_x_at_ymin) d_inv_slope = cupy.asarray(edge_inv_slope) @@ -1814,13 +1888,9 @@ def _parse_input(geometries, column=None, columns=None): props_array = np.array(value_list, dtype=np.float64).reshape(-1, 1) - # Compute per-geometry bboxes (reused by dask tile filtering later) - geom_bboxes = _geometry_bboxes(geom_list) - if len(geom_bboxes) == 0: - return geom_list, props_array, None - total_bounds = (geom_bboxes[:, 0].min(), geom_bboxes[:, 1].min(), - geom_bboxes[:, 2].max(), geom_bboxes[:, 3].max()) - return geom_list, props_array, total_bounds + # Bounds computation is deferred: return None here and let the + # caller compute bboxes only when bounds are actually needed. + return geom_list, props_array, None def _extract_grid_from_like(like): @@ -2023,6 +2093,14 @@ def rasterize( final_bounds = like_bounds if final_bounds is None: final_bounds = inferred_bounds + if final_bounds is None and geom_list: + # Compute bounds lazily only when not supplied by the caller + geom_bboxes = _geometry_bboxes(geom_list) + if len(geom_bboxes) > 0: + final_bounds = (geom_bboxes[:, 0].min(), + geom_bboxes[:, 1].min(), + geom_bboxes[:, 2].max(), + geom_bboxes[:, 3].max()) if final_bounds is None: raise ValueError( "bounds must be provided when geometries are empty or have " diff --git a/xrspatial/stream_order.py b/xrspatial/stream_order.py index aa731202..c7122dad 100644 --- a/xrspatial/stream_order.py +++ b/xrspatial/stream_order.py @@ -694,6 +694,7 @@ def _preprocess_stream_tiles(flow_dir_da, accum_da, threshold, accum_da.blocks[iy, ix].compute()) sm = np.where(ac_chunk >= threshold, 1.0, 0.0) sm = np.where(np.isnan(ac_chunk), 0.0, sm) + sm = np.where(np.isnan(fd_chunk), 0.0, sm) for side, row_data_fd, row_data_sm in [ ('top', fd_chunk[0, :], sm[0, :]), @@ -986,6 +987,7 @@ def _process_strahler_tile(iy, ix, flow_dir_da, accum_da, threshold, 
accum_da.blocks[iy, ix].compute(), dtype=np.float64) sm = np.where(ac_chunk >= threshold, 1, 0).astype(np.int8) sm = np.where(np.isnan(ac_chunk), 0, sm).astype(np.int8) + sm = np.where(np.isnan(fd_chunk), 0, sm).astype(np.int8) h, w = fd_chunk.shape seeds = _compute_strahler_seeds( @@ -1031,6 +1033,7 @@ def _process_shreve_tile(iy, ix, flow_dir_da, accum_da, threshold, accum_da.blocks[iy, ix].compute(), dtype=np.float64) sm = np.where(ac_chunk >= threshold, 1, 0).astype(np.int8) sm = np.where(np.isnan(ac_chunk), 0, sm).astype(np.int8) + sm = np.where(np.isnan(fd_chunk), 0, sm).astype(np.int8) h, w = fd_chunk.shape seeds = _compute_shreve_seeds( @@ -1111,6 +1114,7 @@ def _tile_fn(block, accum_block, block_info=None): ac = np.asarray(accum_block, dtype=np.float64) sm = np.where(ac >= _threshold, 1, 0).astype(np.int8) sm = np.where(np.isnan(ac), 0, sm).astype(np.int8) + sm = np.where(np.isnan(fd), 0, sm).astype(np.int8) h, w = fd.shape seeds = _compute_strahler_seeds( iy, ix, _bdry_max, _bdry_cnt, _flow_bdry, _mask_bdry, @@ -1173,6 +1177,7 @@ def _tile_fn(block, accum_block, block_info=None): ac = np.asarray(accum_block, dtype=np.float64) sm = np.where(ac >= _threshold, 1, 0).astype(np.int8) sm = np.where(np.isnan(ac), 0, sm).astype(np.int8) + sm = np.where(np.isnan(fd), 0, sm).astype(np.int8) h, w = fd.shape seeds = _compute_shreve_seeds( iy, ix, _boundaries, _flow_bdry, _mask_bdry, diff --git a/xrspatial/stream_order_mfd.py b/xrspatial/stream_order_mfd.py index 5c09d8cf..cd216b24 100644 --- a/xrspatial/stream_order_mfd.py +++ b/xrspatial/stream_order_mfd.py @@ -590,7 +590,15 @@ def _strahler_mfd_tile_kernel(fractions, stream_mask, h, w, queue_c[tail] = nc tail += 1 - return order + # Fix headwater cells: represent (order=1, no inputs) as (max=1, cnt=1) + # so that the boundary (max, cnt) reconstruction works correctly. 
+ for r in range(h): + for c in range(w): + if stream_mask[r, c] == 1 and max_in[r, c] == 0.0: + max_in[r, c] = order[r, c] + cnt_max[r, c] = 1 + + return order, max_in, cnt_max @ngjit @@ -1092,17 +1100,22 @@ def _process_strahler_tile_mfd(iy, ix, fractions_da, accum_da, threshold, iy, ix, bdry_max, bdry_cnt, frac_bdry, mask_bdry, chunks_y, chunks_x, n_tile_y, n_tile_x) - order = _strahler_mfd_tile_kernel(frac_chunk, sm, h, w, *seeds) + order, ki_max_in, ki_cnt_max = _strahler_mfd_tile_kernel( + frac_chunk, sm, h, w, *seeds) - # Extract boundary order values and recompute max/cnt for boundaries + # Extract boundary max_in/cnt_max values (not final order) so that + # the seed reconstruction (cnt>=2 -> order+1) works at tile borders. change = 0.0 - for side, strip in [('top', order[0, :]), - ('bottom', order[-1, :]), - ('left', order[:, 0]), - ('right', order[:, -1])]: - new_vals = strip.copy() - new_max = np.where(np.isnan(new_vals), 0.0, new_vals) - new_cnt = np.where(new_max > 0, 1.0, 0.0) + bdry_slices = [ + ('top', order[0, :], ki_max_in[0, :], ki_cnt_max[0, :]), + ('bottom', order[-1, :], ki_max_in[-1, :], ki_cnt_max[-1, :]), + ('left', order[:, 0], ki_max_in[:, 0], ki_cnt_max[:, 0]), + ('right', order[:, -1], ki_max_in[:, -1], ki_cnt_max[:, -1]), + ] + for side, order_strip, mi_strip, cm_strip in bdry_slices: + is_nan = np.isnan(order_strip) + new_max = np.where(is_nan, 0.0, mi_strip.astype(np.float64)) + new_cnt = np.where(is_nan, 0.0, cm_strip.astype(np.float64)) old_max = bdry_max.get(side, iy, ix).copy() with np.errstate(invalid='ignore'): @@ -1224,7 +1237,7 @@ def _stream_order_mfd_dask_strahler(fractions_da, accum_da, threshold): iy, ix, _bdry_max, _bdry_cnt, _frac_bdry, _mask_bdry, chunks_y, chunks_x, n_tile_y, n_tile_x) - tile_order = _strahler_mfd_tile_kernel( + tile_order, _, _ = _strahler_mfd_tile_kernel( frac_chunk, sm, h, w, *seeds) row.append(da.from_array(tile_order, chunks=tile_order.shape)) rows.append(row) diff --git 
a/xrspatial/tests/test_flow_accumulation.py b/xrspatial/tests/test_flow_accumulation.py index 9c2565b6..925e2db0 100644 --- a/xrspatial/tests/test_flow_accumulation.py +++ b/xrspatial/tests/test_flow_accumulation.py @@ -15,7 +15,7 @@ # --------------------------------------------------------------------------- def test_known_bowl(): - """5×5 bowl: hand-computed flow_dir and expected accum. + """5x5 bowl: hand-computed flow_dir and expected accum. Bowl surface: 9 9 9 9 9 @@ -72,7 +72,7 @@ def test_linear_chain_east(): # --------------------------------------------------------------------------- def test_pit_center(): - """3×3 grid, all 8 neighbours flow to center → center accum = 9.""" + """3x3 grid, all 8 neighbours flow to center -> center accum = 9.""" flow_dir = np.array([ [2.0, 4.0, 8.0], [1.0, 0.0, 16.0], @@ -92,7 +92,7 @@ def test_pit_center(): # --------------------------------------------------------------------------- def test_all_pits(): - """Every cell code = 0 → every cell accum = 1.""" + """Every cell code = 0 -> every cell accum = 1.""" flow_dir = np.zeros((4, 5), dtype=np.float64) agg = create_test_raster(flow_dir) result = flow_accumulation(agg) @@ -104,7 +104,7 @@ def test_all_pits(): # --------------------------------------------------------------------------- def test_nan_handling(): - """NaN flow_dir → NaN accum; valid neighbours still correct.""" + """NaN flow_dir -> NaN accum; valid neighbours still correct.""" flow_dir = np.array([ [1.0, 1.0, 0.0], [np.nan, 1.0, 0.0], @@ -126,14 +126,14 @@ def test_nan_handling(): # --------------------------------------------------------------------------- def test_code_0_pit(): - """Pit cells (code 0) count themselves → accum = 1.""" + """Pit cells (code 0) count themselves -> accum = 1.""" flow_dir = np.array([ [1.0, 0.0], [64.0, 0.0], ], dtype=np.float64) agg = create_test_raster(flow_dir) result = flow_accumulation(agg) - # (1,0) → N → (0,0) → E → (0,1) pit + # (1,0) -> N -> (0,0) -> E -> (0,1) pit 
assert result.data[1, 0] == 1.0 # leaf, just itself assert result.data[0, 0] == 2.0 # itself + (1,0) assert result.data[0, 1] == 3.0 # itself + (0,0) chain @@ -179,7 +179,7 @@ def test_output_dtype(): "dtype", [np.int32, np.int64, np.float32, np.float64]) def test_dtype_acceptance(dtype): """Function accepts int32/int64/float32/float64 flow_dir inputs.""" - # All zeros (pits) — valid for any dtype + # All zeros (pits) -- valid for any dtype flow_dir = np.zeros((3, 4), dtype=dtype) agg = create_test_raster(flow_dir) result = flow_accumulation(agg) @@ -210,7 +210,7 @@ def test_dataset_support(): # --------------------------------------------------------------------------- def _make_cross_backend_flow_dir(): - """6×6 flow-east grid for cross-backend comparison.""" + """6x6 flow-east grid for cross-backend comparison.""" flow_dir = np.full((6, 6), 1.0, dtype=np.float64) flow_dir[:, -1] = 0.0 # last column = pits return flow_dir @@ -369,204 +369,3 @@ def test_dask_cupy_chunk_configs(chunks): np.testing.assert_allclose( np_result.data, dcp_result.data.compute().get(), equal_nan=True, ), f"Mismatch with chunks={chunks}" - - -# =========================================================================== -# Dinf flow accumulation tests -# =========================================================================== - -from xrspatial.flow_accumulation import _detect_flow_type - - -def test_dinf_detection_d8(): - """D8 codes detected as 'd8'.""" - flow_dir = np.array([[1.0, 2.0], [4.0, 0.0]], dtype=np.float64) - assert _detect_flow_type(flow_dir) == "d8" - - -def test_dinf_detection_dinf(): - """Dinf angles detected as 'dinf'.""" - flow_dir = np.array([[0.5, 1.2], [-1.0, np.nan]], dtype=np.float64) - assert _detect_flow_type(flow_dir) == "dinf" - - -def test_dinf_detection_all_nan(): - """All-NaN grid detected as 'd8' (default).""" - flow_dir = np.full((3, 3), np.nan, dtype=np.float64) - assert _detect_flow_type(flow_dir) == "d8" - - -def test_dinf_detection_pit_minus_one(): - 
"""-1.0 (Dinf pit) triggers 'dinf' detection.""" - flow_dir = np.array([[0.0, -1.0], [0.0, 0.0]], dtype=np.float64) - assert _detect_flow_type(flow_dir) == "dinf" - - -def test_dinf_cardinal_east_chain(): - """Pure east angles (0.0) chain: matches D8 code=1 result.""" - N = 6 - # Dinf: angle=0 means east, pit=-1 - flow_dir = np.full((1, N), 0.0, dtype=np.float64) - flow_dir[0, -1] = -1.0 # last cell is pit - agg = create_test_raster(flow_dir) - result = flow_accumulation(agg) - expected = np.arange(1, N + 1, dtype=np.float64).reshape(1, N) - np.testing.assert_allclose(result.data, expected) - - -def test_dinf_pit_center(): - """8 neighbours with exact cardinal/diagonal angles point to center.""" - pi = np.pi - flow_dir = np.array([ - [7 * pi / 4, 3 * pi / 2, 5 * pi / 4], - [0.0, -1.0, pi], - [pi / 4, pi / 2, 3 * pi / 4], - ], dtype=np.float64) - agg = create_test_raster(flow_dir) - result = flow_accumulation(agg) - assert result.data[1, 1] == 9.0 - for r, c in [(0, 0), (0, 1), (0, 2), (1, 0), (1, 2), - (2, 0), (2, 1), (2, 2)]: - assert result.data[r, c] == 1.0, f"Cell ({r},{c}) = {result.data[r,c]}" - - -def test_dinf_proportional_split(): - """angle=pi/8 splits flow 50/50 to E and NE.""" - pi = np.pi - flow_dir = np.array([ - [-1.0, -1.0, -1.0], - [-1.0, pi / 8, -1.0], - [-1.0, -1.0, -1.0], - ], dtype=np.float64) - agg = create_test_raster(flow_dir) - result = flow_accumulation(agg) - # Center splits 50/50 to E (1,2) and NE (0,2) - np.testing.assert_allclose(result.data[1, 2], 1.5) - np.testing.assert_allclose(result.data[0, 2], 1.5) - assert result.data[1, 1] == 1.0 - - -def test_dinf_chain_with_split(): - """Multi-cell chain with fractional flow, verify cascading sums.""" - pi = np.pi - flow_dir = np.array([ - [-1.0, -1.0, -1.0], - [pi / 8, pi / 8, -1.0], - [-1.0, -1.0, -1.0], - ], dtype=np.float64) - agg = create_test_raster(flow_dir) - result = flow_accumulation(agg) - # (1,0) sends 0.5 to E(1,1) and 0.5 to NE(0,1) - # (1,1) has accum=1.5, sends 0.75 to E(1,2) 
and 0.75 to NE(0,2) - np.testing.assert_allclose(result.data[1, 0], 1.0) - np.testing.assert_allclose(result.data[0, 1], 1.5) - np.testing.assert_allclose(result.data[1, 1], 1.5) - np.testing.assert_allclose(result.data[1, 2], 1.75) - np.testing.assert_allclose(result.data[0, 2], 1.75) - - -def test_dinf_end_to_end(): - """flow_direction_dinf -> flow_accumulation produces valid results.""" - from xrspatial import flow_direction_dinf - - rng = np.random.default_rng(42) - elev = create_test_raster(rng.random((20, 20)) * 100) - fdir = flow_direction_dinf(elev) - acc = flow_accumulation(fdir) - data = acc.data - valid = data[~np.isnan(data)] - assert len(valid) > 0 - assert np.all(valid >= 1.0) - assert np.max(valid) <= data.size - - -def test_dinf_nan_handling(): - """NaN flow dir produces NaN accumulation.""" - pi = np.pi - flow_dir = np.array([ - [np.nan, pi / 2], - [0.0, -1.0], - ], dtype=np.float64) - agg = create_test_raster(flow_dir) - result = flow_accumulation(agg) - assert np.isnan(result.data[0, 0]) - np.testing.assert_allclose(result.data[0, 1], 1.0) - np.testing.assert_allclose(result.data[1, 0], 1.0) - np.testing.assert_allclose(result.data[1, 1], 2.0) - - -@dask_array_available -@pytest.mark.parametrize("chunks", [ - (3, 3), (5, 5), (2, 6), (6, 2), (1, 1), (6, 6), -]) -def test_dinf_dask_matches_numpy(chunks): - """Dinf dask matches numpy across chunk sizes.""" - from xrspatial import flow_direction_dinf - - rng = np.random.default_rng(123) - elev = create_test_raster(rng.random((6, 6)) * 100) - fdir_np = flow_direction_dinf(elev).data - - numpy_agg = create_test_raster(fdir_np, backend='numpy') - dask_agg = create_test_raster(fdir_np, backend='dask', chunks=chunks) - np_result = flow_accumulation(numpy_agg) - da_result = flow_accumulation(dask_agg) - np.testing.assert_allclose( - np_result.data, da_result.data.compute(), equal_nan=True, - ), f"Mismatch with chunks={chunks}" - - -@dask_array_available -def test_dinf_dask_larger_grid(): - """Dinf dask on 
a larger grid with cross-tile flow.""" - from xrspatial import flow_direction_dinf - - rng = np.random.default_rng(99) - elev = create_test_raster(rng.random((12, 12)) * 100) - fdir_np = flow_direction_dinf(elev).data - - numpy_agg = create_test_raster(fdir_np, backend='numpy') - np_result = flow_accumulation(numpy_agg) - - for chunks in [(3, 3), (4, 6), (6, 4), (2, 2)]: - dask_agg = create_test_raster(fdir_np, backend='dask', chunks=chunks) - da_result = flow_accumulation(dask_agg) - np.testing.assert_allclose( - np_result.data, da_result.data.compute(), equal_nan=True, - ), f"Mismatch with chunks={chunks}" - - -@cuda_and_cupy_available -def test_dinf_cupy_matches_numpy(): - """Dinf CuPy matches NumPy.""" - from xrspatial import flow_direction_dinf - - rng = np.random.default_rng(42) - elev = create_test_raster(rng.random((10, 10)) * 100) - fdir_np = flow_direction_dinf(elev).data - - numpy_agg = create_test_raster(fdir_np, backend='numpy') - cupy_agg = create_test_raster(fdir_np, backend='cupy') - np_result = flow_accumulation(numpy_agg) - cp_result = flow_accumulation(cupy_agg) - np.testing.assert_allclose( - np_result.data, cp_result.data.get(), equal_nan=True) - - -@dask_array_available -@cuda_and_cupy_available -def test_dinf_dask_cupy_matches_numpy(): - """Dinf Dask+CuPy matches NumPy.""" - from xrspatial import flow_direction_dinf - - rng = np.random.default_rng(42) - elev = create_test_raster(rng.random((8, 8)) * 100) - fdir_np = flow_direction_dinf(elev).data - - numpy_agg = create_test_raster(fdir_np, backend='numpy') - dask_cupy_agg = create_test_raster(fdir_np, backend='dask+cupy', - chunks=(4, 4)) - np_result = flow_accumulation(numpy_agg) - dcp_result = flow_accumulation(dask_cupy_agg) - np.testing.assert_allclose( - np_result.data, dcp_result.data.compute().get(), equal_nan=True) diff --git a/xrspatial/tests/test_flow_accumulation_dinf.py b/xrspatial/tests/test_flow_accumulation_dinf.py new file mode 100644 index 00000000..8665d05a --- /dev/null +++ 
b/xrspatial/tests/test_flow_accumulation_dinf.py @@ -0,0 +1,216 @@ +import numpy as np +import pytest + +from xrspatial.flow_accumulation_dinf import flow_accumulation_dinf +from xrspatial.flow_accumulation import _detect_flow_type +from xrspatial.tests.general_checks import ( + create_test_raster, + cuda_and_cupy_available, + dask_array_available, +) + + +# =========================================================================== +# Detection tests +# =========================================================================== + +def test_dinf_detection_d8(): + """D8 codes detected as 'd8'.""" + flow_dir = np.array([[1.0, 2.0], [4.0, 0.0]], dtype=np.float64) + assert _detect_flow_type(flow_dir) == "d8" + + +def test_dinf_detection_dinf(): + """Dinf angles detected as 'dinf'.""" + flow_dir = np.array([[0.5, 1.2], [-1.0, np.nan]], dtype=np.float64) + assert _detect_flow_type(flow_dir) == "dinf" + + +def test_dinf_detection_all_nan(): + """All-NaN grid detected as 'd8' (default).""" + flow_dir = np.full((3, 3), np.nan, dtype=np.float64) + assert _detect_flow_type(flow_dir) == "d8" + + +def test_dinf_detection_pit_minus_one(): + """-1.0 (Dinf pit) triggers 'dinf' detection.""" + flow_dir = np.array([[0.0, -1.0], [0.0, 0.0]], dtype=np.float64) + assert _detect_flow_type(flow_dir) == "dinf" + + +# =========================================================================== +# Core functionality tests +# =========================================================================== + +def test_dinf_cardinal_east_chain(): + """Pure east angles (0.0) chain: matches D8 code=1 result.""" + N = 6 + # Dinf: angle=0 means east, pit=-1 + flow_dir = np.full((1, N), 0.0, dtype=np.float64) + flow_dir[0, -1] = -1.0 # last cell is pit + agg = create_test_raster(flow_dir) + result = flow_accumulation_dinf(agg) + expected = np.arange(1, N + 1, dtype=np.float64).reshape(1, N) + np.testing.assert_allclose(result.data, expected) + + +def test_dinf_pit_center(): + """8 neighbours with exact 
cardinal/diagonal angles point to center.""" + pi = np.pi + flow_dir = np.array([ + [7 * pi / 4, 3 * pi / 2, 5 * pi / 4], + [0.0, -1.0, pi], + [pi / 4, pi / 2, 3 * pi / 4], + ], dtype=np.float64) + agg = create_test_raster(flow_dir) + result = flow_accumulation_dinf(agg) + assert result.data[1, 1] == 9.0 + for r, c in [(0, 0), (0, 1), (0, 2), (1, 0), (1, 2), + (2, 0), (2, 1), (2, 2)]: + assert result.data[r, c] == 1.0, f"Cell ({r},{c}) = {result.data[r,c]}" + + +def test_dinf_proportional_split(): + """angle=pi/8 splits flow 50/50 to E and NE.""" + pi = np.pi + flow_dir = np.array([ + [-1.0, -1.0, -1.0], + [-1.0, pi / 8, -1.0], + [-1.0, -1.0, -1.0], + ], dtype=np.float64) + agg = create_test_raster(flow_dir) + result = flow_accumulation_dinf(agg) + # Center splits 50/50 to E (1,2) and NE (0,2) + np.testing.assert_allclose(result.data[1, 2], 1.5) + np.testing.assert_allclose(result.data[0, 2], 1.5) + assert result.data[1, 1] == 1.0 + + +def test_dinf_chain_with_split(): + """Multi-cell chain with fractional flow, verify cascading sums.""" + pi = np.pi + flow_dir = np.array([ + [-1.0, -1.0, -1.0], + [pi / 8, pi / 8, -1.0], + [-1.0, -1.0, -1.0], + ], dtype=np.float64) + agg = create_test_raster(flow_dir) + result = flow_accumulation_dinf(agg) + # (1,0) sends 0.5 to E(1,1) and 0.5 to NE(0,1) + # (1,1) has accum=1.5, sends 0.75 to E(1,2) and 0.75 to NE(0,2) + np.testing.assert_allclose(result.data[1, 0], 1.0) + np.testing.assert_allclose(result.data[0, 1], 1.5) + np.testing.assert_allclose(result.data[1, 1], 1.5) + np.testing.assert_allclose(result.data[1, 2], 1.75) + np.testing.assert_allclose(result.data[0, 2], 1.75) + + +def test_dinf_end_to_end(): + """flow_direction_dinf -> flow_accumulation_dinf produces valid results.""" + from xrspatial import flow_direction_dinf + + rng = np.random.default_rng(42) + elev = create_test_raster(rng.random((20, 20)) * 100) + fdir = flow_direction_dinf(elev) + acc = flow_accumulation_dinf(fdir) + data = acc.data + valid = 
data[~np.isnan(data)] + assert len(valid) > 0 + assert np.all(valid >= 1.0) + assert np.max(valid) <= data.size + + +def test_dinf_nan_handling(): + """NaN flow dir produces NaN accumulation.""" + pi = np.pi + flow_dir = np.array([ + [np.nan, pi / 2], + [0.0, -1.0], + ], dtype=np.float64) + agg = create_test_raster(flow_dir) + result = flow_accumulation_dinf(agg) + assert np.isnan(result.data[0, 0]) + np.testing.assert_allclose(result.data[0, 1], 1.0) + np.testing.assert_allclose(result.data[1, 0], 1.0) + np.testing.assert_allclose(result.data[1, 1], 2.0) + + +# =========================================================================== +# Cross-backend tests +# =========================================================================== + +@dask_array_available +@pytest.mark.parametrize("chunks", [ + (3, 3), (5, 5), (2, 6), (6, 2), (1, 1), (6, 6), +]) +def test_dinf_dask_matches_numpy(chunks): + """Dinf dask matches numpy across chunk sizes.""" + from xrspatial import flow_direction_dinf + + rng = np.random.default_rng(123) + elev = create_test_raster(rng.random((6, 6)) * 100) + fdir_np = flow_direction_dinf(elev).data + + numpy_agg = create_test_raster(fdir_np, backend='numpy') + dask_agg = create_test_raster(fdir_np, backend='dask', chunks=chunks) + np_result = flow_accumulation_dinf(numpy_agg) + da_result = flow_accumulation_dinf(dask_agg) + np.testing.assert_allclose( + np_result.data, da_result.data.compute(), equal_nan=True, + err_msg=f"Mismatch with chunks={chunks}") + + +@dask_array_available +def test_dinf_dask_larger_grid(): + """Dinf dask on a larger grid with cross-tile flow.""" + from xrspatial import flow_direction_dinf + + rng = np.random.default_rng(99) + elev = create_test_raster(rng.random((12, 12)) * 100) + fdir_np = flow_direction_dinf(elev).data + + numpy_agg = create_test_raster(fdir_np, backend='numpy') + np_result = flow_accumulation_dinf(numpy_agg) + + for chunks in [(3, 3), (4, 6), (6, 4), (2, 2)]: + dask_agg = create_test_raster(fdir_np, 
backend='dask', chunks=chunks) + da_result = flow_accumulation_dinf(dask_agg) + np.testing.assert_allclose( + np_result.data, da_result.data.compute(), equal_nan=True, + err_msg=f"Mismatch with chunks={chunks}") + + +@cuda_and_cupy_available +def test_dinf_cupy_matches_numpy(): + """Dinf CuPy matches NumPy.""" + from xrspatial import flow_direction_dinf + + rng = np.random.default_rng(42) + elev = create_test_raster(rng.random((10, 10)) * 100) + fdir_np = flow_direction_dinf(elev).data + + numpy_agg = create_test_raster(fdir_np, backend='numpy') + cupy_agg = create_test_raster(fdir_np, backend='cupy') + np_result = flow_accumulation_dinf(numpy_agg) + cp_result = flow_accumulation_dinf(cupy_agg) + np.testing.assert_allclose( + np_result.data, cp_result.data.get(), equal_nan=True) + + +@dask_array_available +@cuda_and_cupy_available +def test_dinf_dask_cupy_matches_numpy(): + """Dinf Dask+CuPy matches NumPy.""" + from xrspatial import flow_direction_dinf + + rng = np.random.default_rng(42) + elev = create_test_raster(rng.random((8, 8)) * 100) + fdir_np = flow_direction_dinf(elev).data + + numpy_agg = create_test_raster(fdir_np, backend='numpy') + dask_cupy_agg = create_test_raster(fdir_np, backend='dask+cupy', + chunks=(4, 4)) + np_result = flow_accumulation_dinf(numpy_agg) + dcp_result = flow_accumulation_dinf(dask_cupy_agg) + np.testing.assert_allclose( + np_result.data, dcp_result.data.compute().get(), equal_nan=True) diff --git a/xrspatial/tests/test_flow_direction_dinf.py b/xrspatial/tests/test_flow_direction_dinf.py index 48c4baec..e53bf2e8 100644 --- a/xrspatial/tests/test_flow_direction_dinf.py +++ b/xrspatial/tests/test_flow_direction_dinf.py @@ -120,6 +120,128 @@ def test_interior_facet_angle(): f"Center = {result.data[1,1]}, expected {expected}") + +# --------------------------------------------------------------------------- +# Tarboton odd-facet tests +# --------------------------------------------------------------------------- + +def 
test_odd_facet_1_tarboton(): + """Facet 1 (N, NE) wins with interior r; angle = pi/2 - r.""" + # Center=10, N=7, NE=6; all others equal to center. + # Facet 1: s1=(10-7)/1=3, s2=(7-6)/1=1, r=atan2(1,3), s=sqrt(10) + # Facet 0: s1=(10-10)/1=0, s2=(10-6)/1=4, r clamped pi/4, s=(10-6)/sqrt(2)~2.83 + # Facet 1 wins (sqrt(10)~3.16 > 2.83). + data = np.array([ + [10, 7, 6], + [10, 10, 10], + [10, 10, 10], + ], dtype=np.float64) + agg = _make_raster(data) + result = flow_direction_dinf(agg) + r = math.atan2(1, 3) + expected = math.pi / 2.0 - r + assert abs(result.data[1, 1] - expected) < 1e-10, ( + f"Center = {result.data[1,1]}, expected {expected}") + + +def test_odd_facet_7_tarboton(): + """Facet 7 (E, SE) wins with interior r; angle = 2*pi - r.""" + # Center=10, E=7, SE=6; all others equal to center. + # Facet 7: s1=(10-7)/1=3, s2=(7-6)/1=1, r=atan2(1,3), s=sqrt(10) + data = np.array([ + [10, 10, 10], + [10, 10, 7], + [10, 10, 6], + ], dtype=np.float64) + agg = _make_raster(data) + result = flow_direction_dinf(agg) + r = math.atan2(1, 3) + expected = 2.0 * math.pi - r + assert abs(result.data[1, 1] - expected) < 1e-10, ( + f"Center = {result.data[1,1]}, expected {expected}") + + +def _reference_tarboton(data, cx, cy): + """Pure-Python reference Tarboton (1997) D-inf implementation.""" + rows, cols = data.shape + out = np.full(data.shape, np.nan) + + nb_dy = [0, -1, -1, -1, 0, 1, 1, 1] + nb_dx = [1, 1, 0, -1, -1, -1, 0, 1] + e1_idx = [0, 2, 2, 4, 4, 6, 6, 0] + e2_idx = [1, 1, 3, 3, 5, 5, 7, 7] + d1_arr = [cx, cy, cy, cx, cx, cy, cy, cx] + d2_arr = [cy, cx, cx, cy, cy, cx, cx, cy] + ac = [0, 2, 2, 4, 4, 6, 6, 8] + af = [1, -1, 1, -1, 1, -1, 1, -1] + + diag = math.sqrt(cx * cx + cy * cy) + pi4 = math.pi / 4.0 + + for y in range(1, rows - 1): + for x in range(1, cols - 1): + center = data[y, x] + if np.isnan(center): + continue + has_nan = False + nbs = [] + for k in range(8): + v = data[y + nb_dy[k], x + nb_dx[k]] + if np.isnan(v): + has_nan = True + break + nbs.append(v) 
+ if has_nan: + continue + + max_slope = -1e308 + best_angle = -1.0 + for k in range(8): + e1 = nbs[e1_idx[k]] + e2 = nbs[e2_idx[k]] + s1 = (center - e1) / d1_arr[k] + s2 = (e1 - e2) / d2_arr[k] + r = math.atan2(s2, s1) + if r < 0.0: + r = 0.0 + s = s1 + elif r > pi4: + r = pi4 + s = (center - e2) / diag + else: + s = math.sqrt(s1 * s1 + s2 * s2) + if s > max_slope: + max_slope = s + best_angle = ac[k] * pi4 + af[k] * r + + if max_slope <= 0.0: + out[y, x] = -1.0 + else: + if best_angle >= 2.0 * math.pi: + best_angle = 0.0 + out[y, x] = best_angle + return out + + +def test_reference_tarboton_agreement(): + """Cone DEM: xrspatial matches reference Tarboton on all interior cells.""" + n = 51 + cx, cy = 1.0, 1.0 + yy, xx = np.mgrid[0:n, 0:n] + center = (n - 1) / 2.0 + data = np.sqrt((yy - center) ** 2 + (xx - center) ** 2).astype(np.float64) + + agg = _make_raster(data, res=(cy, cx)) + result = flow_direction_dinf(agg) + ref = _reference_tarboton(data, cx, cy) + + # Compare interior cells (skip edges which are NaN) + interior = ~np.isnan(ref) + assert interior.sum() > 0 + np.testing.assert_allclose( + result.data[interior], ref[interior], atol=1e-12, + err_msg="xrspatial dinf does not match reference Tarboton") + + # --------------------------------------------------------------------------- # Pit / flat / NaN tests # --------------------------------------------------------------------------- diff --git a/xrspatial/watershed.py b/xrspatial/watershed.py index c11f50f8..439177d4 100644 --- a/xrspatial/watershed.py +++ b/xrspatial/watershed.py @@ -93,40 +93,45 @@ def _code_to_offset_py(code): # ===================================================================== @ngjit -def _watershed_cpu(flow_dir, labels, h, w): +def _watershed_cpu(flow_dir, labels, state, h, w): """Downstream tracing with path compression for watershed. - ``labels`` is pre-initialised: pour points >= 0, NaN for nodata, - -1 for unresolved. 
On return every reachable cell has the label - of its pour point, unreachable cells are NaN. + Uses a separate ``state`` array to track cell status, so that + pour-point labels can be any float value (including negative). + + State values: 0=nodata, 1=unresolved, 2=in-trace, 3=resolved. + On return every reachable cell has state 3 and the label of its + pour point; unreachable cells have state 0 and NaN. """ path_r = np.empty(h * w, dtype=np.int64) path_c = np.empty(h * w, dtype=np.int64) for r in range(h): for c in range(w): - if labels[r, c] != -1.0: - continue # already resolved or NaN + if state[r, c] != 1: + continue # already resolved, nodata, or in-trace # Trace downstream, collecting path path_len = 0 cr, cc = r, c found_label = np.nan + found = False while True: - lbl = labels[cr, cc] - if lbl >= 0.0: - # Hit a labeled cell (pour point or previously resolved) - found_label = lbl + s = state[cr, cc] + if s == 3: + # Hit a resolved cell (pour point or previously resolved) + found_label = labels[cr, cc] + found = True break - if lbl != -1.0: - # NaN or in-trace marker (-2) → cycle or dead end + if s != 1: + # nodata (0) or in-trace (2) → cycle or dead end break path_r[path_len] = cr path_c[path_len] = cc path_len += 1 - labels[cr, cc] = -2.0 # in-trace marker + state[cr, cc] = 2 # in-trace marker v = flow_dir[cr, cc] if v != v: # NaN @@ -141,7 +146,12 @@ def _watershed_cpu(flow_dir, labels, h, w): # Assign label to entire traced path for i in range(path_len): - labels[path_r[i], path_c[i]] = found_label + if found: + labels[path_r[i], path_c[i]] = found_label + state[path_r[i], path_c[i]] = 3 + else: + labels[path_r[i], path_c[i]] = np.nan + state[path_r[i], path_c[i]] = 0 return labels @@ -278,61 +288,72 @@ def _watershed_tile_kernel(flow_dir, h, w, pour_points, exit_tl, exit_tr, exit_bl, exit_br): """Seeded downstream tracing for a single tile. 
- Labels are initialised from pour_points and exit labels (resolved - labels of the destination cell in adjacent tiles). Downstream - tracing with path compression resolves as many cells as possible - within the tile. + Uses a separate state array so pour-point labels can be any float + value (including negative). State: 0=nodata, 1=unresolved, + 2=in-trace, 3=resolved. """ labels = np.empty((h, w), dtype=np.float64) + state = np.empty((h, w), dtype=np.int8) - # Initialise labels + # Initialise labels and state for r in range(h): for c in range(w): v = flow_dir[r, c] if v != v: # NaN labels[r, c] = np.nan + state[r, c] = 0 continue pp = pour_points[r, c] if pp == pp: # not NaN → pour point labels[r, c] = pp + state[r, c] = 3 continue - labels[r, c] = -1.0 # unresolved + labels[r, c] = np.nan + state[r, c] = 1 # unresolved # Apply exit labels to boundary cells that flow OUT of tile # Top row: cells flowing north for c in range(w): - if labels[0, c] == -1.0: + if state[0, c] == 1: el = exit_top[c] - if el == el and el >= 0.0: # not NaN and resolved + if el == el: # not NaN → resolved labels[0, c] = el + state[0, c] = 3 # Bottom row for c in range(w): - if labels[h - 1, c] == -1.0: + if state[h - 1, c] == 1: el = exit_bottom[c] - if el == el and el >= 0.0: + if el == el: labels[h - 1, c] = el + state[h - 1, c] = 3 # Left column for r in range(h): - if labels[r, 0] == -1.0: + if state[r, 0] == 1: el = exit_left[r] - if el == el and el >= 0.0: + if el == el: labels[r, 0] = el + state[r, 0] = 3 # Right column for r in range(h): - if labels[r, w - 1] == -1.0: + if state[r, w - 1] == 1: el = exit_right[r] - if el == el and el >= 0.0: + if el == el: labels[r, w - 1] = el + state[r, w - 1] = 3 # Corners - if labels[0, 0] == -1.0 and exit_tl == exit_tl and exit_tl >= 0.0: + if state[0, 0] == 1 and exit_tl == exit_tl: labels[0, 0] = exit_tl - if labels[0, w - 1] == -1.0 and exit_tr == exit_tr and exit_tr >= 0.0: + state[0, 0] = 3 + if state[0, w - 1] == 1 and exit_tr == 
exit_tr: labels[0, w - 1] = exit_tr - if labels[h - 1, 0] == -1.0 and exit_bl == exit_bl and exit_bl >= 0.0: + state[0, w - 1] = 3 + if state[h - 1, 0] == 1 and exit_bl == exit_bl: labels[h - 1, 0] = exit_bl - if labels[h - 1, w - 1] == -1.0 and exit_br == exit_br and exit_br >= 0.0: + state[h - 1, 0] = 3 + if state[h - 1, w - 1] == 1 and exit_br == exit_br: labels[h - 1, w - 1] = exit_br + state[h - 1, w - 1] = 3 # Downstream tracing with path compression path_r = np.empty(h * w, dtype=np.int64) @@ -340,25 +361,28 @@ def _watershed_tile_kernel(flow_dir, h, w, pour_points, for r in range(h): for c in range(w): - if labels[r, c] != -1.0: + if state[r, c] != 1: continue path_len = 0 cr, cc = r, c found_label = np.nan + found = False + exit_tile = False while True: - lbl = labels[cr, cc] - if lbl >= 0.0: - found_label = lbl + s = state[cr, cc] + if s == 3: + found_label = labels[cr, cc] + found = True break - if lbl != -1.0: + if s != 1: break path_r[path_len] = cr path_c[path_len] = cc path_len += 1 - labels[cr, cc] = -2.0 + state[cr, cc] = 2 v = flow_dir[cr, cc] if v != v: @@ -368,13 +392,20 @@ def _watershed_tile_kernel(flow_dir, h, w, pour_points, break nr, nc = cr + dy, cc + dx if nr < 0 or nr >= h or nc < 0 or nc >= w: - # Exits tile — leave as unresolved (-1) - found_label = -1.0 + # Exits tile — leave as unresolved + exit_tile = True break cr, cc = nr, nc for i in range(path_len): - labels[path_r[i], path_c[i]] = found_label + if found: + labels[path_r[i], path_c[i]] = found_label + state[path_r[i], path_c[i]] = 3 + elif exit_tile: + state[path_r[i], path_c[i]] = 1 # still unresolved + else: + labels[path_r[i], path_c[i]] = np.nan + state[path_r[i], path_c[i]] = 0 # dead end return labels @@ -701,9 +732,6 @@ def _tile_fn(flow_dir_block, pp_block, block_info=None): h, w, np.asarray(pp_block, dtype=np.float64), *exits) - # After convergence, any remaining unresolved cells → NaN - result = np.where((result == -1.0) | (result == -2.0), - np.nan, result) return 
result return da.map_blocks( @@ -742,29 +770,29 @@ def _watershed_tile_cupy(flow_dir_data, pour_points_data, # Inject exit labels at boundary cells where active (state==1) # and exit label is resolved (not NaN, >= 0). exit_top_cp = cp.asarray(exit_top) - m = (state[0, :] == 1) & ~cp.isnan(exit_top_cp) & (exit_top_cp >= 0) + m = (state[0, :] == 1) & ~cp.isnan(exit_top_cp) labels[0, :] = cp.where(m, exit_top_cp, labels[0, :]) state[0, :] = cp.where(m, 2, state[0, :]) exit_bot_cp = cp.asarray(exit_bottom) - m = (state[H - 1, :] == 1) & ~cp.isnan(exit_bot_cp) & (exit_bot_cp >= 0) + m = (state[H - 1, :] == 1) & ~cp.isnan(exit_bot_cp) labels[H - 1, :] = cp.where(m, exit_bot_cp, labels[H - 1, :]) state[H - 1, :] = cp.where(m, 2, state[H - 1, :]) exit_left_cp = cp.asarray(exit_left) - m = (state[:, 0] == 1) & ~cp.isnan(exit_left_cp) & (exit_left_cp >= 0) + m = (state[:, 0] == 1) & ~cp.isnan(exit_left_cp) labels[:, 0] = cp.where(m, exit_left_cp, labels[:, 0]) state[:, 0] = cp.where(m, 2, state[:, 0]) exit_right_cp = cp.asarray(exit_right) - m = (state[:, W - 1] == 1) & ~cp.isnan(exit_right_cp) & (exit_right_cp >= 0) + m = (state[:, W - 1] == 1) & ~cp.isnan(exit_right_cp) labels[:, W - 1] = cp.where(m, exit_right_cp, labels[:, W - 1]) state[:, W - 1] = cp.where(m, 2, state[:, W - 1]) # Corner exit labels for r, c, val in [(0, 0, exit_tl), (0, W - 1, exit_tr), (H - 1, 0, exit_bl), (H - 1, W - 1, exit_br)]: - if val == val and val >= 0 and int(state[r, c]) == 1: + if val == val and int(state[r, c]) == 1: labels[r, c] = val state[r, c] = 2 @@ -938,16 +966,20 @@ def watershed(flow_dir: xr.DataArray, fd = data.astype(np.float64) pp = np.asarray(pp_data, dtype=np.float64) h, w = fd.shape - # Init labels: pour points get their value, NaN flow_dir → NaN, - # others → -1 - labels = np.full((h, w), -1.0, dtype=np.float64) + # Init labels and state: pour points → resolved (state 3), + # NaN flow_dir → nodata (state 0), others → unresolved (state 1) + labels = np.full((h, w), np.nan, 
dtype=np.float64) + state = np.zeros((h, w), dtype=np.int8) for r in range(h): for c in range(w): if fd[r, c] != fd[r, c]: # NaN - labels[r, c] = np.nan + pass # state 0, label NaN elif pp[r, c] == pp[r, c]: # not NaN → pour point labels[r, c] = pp[r, c] - out = _watershed_cpu(fd, labels, h, w) + state[r, c] = 3 + else: + state[r, c] = 1 # unresolved + out = _watershed_cpu(fd, labels, state, h, w) elif has_cuda_and_cupy() and is_cupy_array(data): out = _watershed_cupy(data, pp_data)