From 18802d69875af15a624f5fac43d909ba0be56574 Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Mon, 9 Mar 2026 23:19:30 -0700
Subject: [PATCH 1/2] feat(detection): route Apple Silicon YOLO to Neural
 Engine (NPU)

On Apple Silicon, force CoreML compute_units to CPU_AND_NE so object
detection runs on the 16-core Neural Engine instead of GPU, leaving
GPU free for LLM/VLM inference.

Changes:
- env_config.py: add compute_units field to BackendSpec, MPS defaults
  to cpu_and_ne, monkey-patch coremltools.MLModel during YOLO load
- detect.py: report compute_units in ready event
- SKILL.md: add compute_units parameter, update acceleration table
- test_env_config_ane.py: 15 unit tests for NPU logic
---
 skills/detection/yolo-detection-2026/SKILL.md |  25 +-
 .../yolo-detection-2026/scripts/detect.py     |   7 +-
 .../yolo-detection-2026/scripts/env_config.py |  78 ++++-
 skills/lib/env_config.py                      |  78 ++++-
 skills/lib/test_env_config_ane.py             | 276 ++++++++++++++++++
 5 files changed, 447 insertions(+), 17 deletions(-)
 create mode 100644 skills/lib/test_env_config_ane.py

diff --git a/skills/detection/yolo-detection-2026/SKILL.md b/skills/detection/yolo-detection-2026/SKILL.md
index 278d924..939099a 100644
--- a/skills/detection/yolo-detection-2026/SKILL.md
+++ b/skills/detection/yolo-detection-2026/SKILL.md
@@ -66,6 +66,15 @@ parameters:
     description: "Auto-convert model to optimized format for faster inference"
     group: Performance
 
+  - name: compute_units
+    label: "Apple Compute Units"
+    type: select
+    options: ["auto", "cpu_and_ne", "all", "cpu_only", "cpu_and_gpu"]
+    default: "auto"
+    description: "CoreML compute target — 'auto' routes to Neural Engine (NPU), leaving GPU free for LLM/VLM"
+    group: Performance
+    platform: macos
+
 capabilities:
   live_detection:
     script: scripts/detect.py
@@ -89,13 +98,15 @@ Real-time object detection using the latest YOLO 2026 models. Detects 80+ COCO o
 
 The skill uses [`env_config.py`](../../lib/env_config.py) to **automatically detect hardware** and convert the model to the fastest format for your platform. Conversion happens once during deployment and is cached.
 
-| Platform | Backend | Optimized Format | Expected Speedup |
-|----------|---------|------------------|:----------------:|
-| NVIDIA GPU | CUDA | TensorRT `.engine` | ~3-5x |
-| Apple Silicon (M1+) | MPS | CoreML `.mlpackage` | ~2x |
-| Intel CPU/GPU/NPU | OpenVINO | OpenVINO IR `.xml` | ~2-3x |
-| AMD GPU | ROCm | ONNX Runtime | ~1.5-2x |
-| CPU (any) | CPU | ONNX Runtime | ~1.5x |
+| Platform | Backend | Optimized Format | Compute Units | Expected Speedup |
+|----------|---------|------------------|:-------------:|:----------------:|
+| NVIDIA GPU | CUDA | TensorRT `.engine` | GPU | ~3-5x |
+| Apple Silicon (M1+) | MPS | CoreML `.mlpackage` | **Neural Engine** (NPU) | ~2x |
+| Intel CPU/GPU/NPU | OpenVINO | OpenVINO IR `.xml` | CPU/GPU/NPU | ~2-3x |
+| AMD GPU | ROCm | ONNX Runtime | GPU | ~1.5-2x |
+| CPU (any) | CPU | ONNX Runtime | CPU | ~1.5x |
+
+> **Apple Silicon Note**: Detection defaults to `cpu_and_ne` (CPU + Neural Engine), keeping the GPU free for LLM/VLM inference. Set `compute_units: all` to also use the GPU if you are not running a local LLM.
 
 ### How It Works
 
diff --git a/skills/detection/yolo-detection-2026/scripts/detect.py b/skills/detection/yolo-detection-2026/scripts/detect.py
index d149374..40bea8b 100644
--- a/skills/detection/yolo-detection-2026/scripts/detect.py
+++ b/skills/detection/yolo-detection-2026/scripts/detect.py
@@ -248,7 +248,7 @@ def main():
         perf.model_load_ms = env.load_ms
         perf.export_ms = env.export_ms
 
-        emit({
+        ready_event = {
             "event": "ready",
             "model": f"yolo2026{model_size[0]}",
             "model_size": model_size,
@@ -260,7 +260,10 @@ def main():
             "fps": fps,
             "model_load_ms": round(env.load_ms, 1),
             "available_sizes": list(MODEL_SIZE_MAP.keys()),
-        })
+        }
+        if hasattr(env, 'compute_units') and env.backend == "mps":
+            ready_event["compute_units"] = env.compute_units
+        emit(ready_event)
     except Exception as e:
         emit({"event": "error", "message": f"Failed to load model: {e}", "retriable": False})
         sys.exit(1)
diff --git a/skills/detection/yolo-detection-2026/scripts/env_config.py b/skills/detection/yolo-detection-2026/scripts/env_config.py
index ff42e6f..7c46c05 100644
--- a/skills/detection/yolo-detection-2026/scripts/env_config.py
+++ b/skills/detection/yolo-detection-2026/scripts/env_config.py
@@ -40,6 +40,7 @@ class BackendSpec:
     model_suffix: str       # file extension/dir to look for cached model
     half: bool = True       # use FP16
     extra_export_args: dict = field(default_factory=dict)
+    compute_units: Optional[str] = None  # CoreML compute units: "cpu_and_ne", "all", etc.
 
 
 BACKEND_SPECS = {
@@ -61,6 +62,7 @@ class BackendSpec:
         model_suffix=".mlpackage",
         half=True,
         extra_export_args={"nms": False},
+        compute_units="cpu_and_ne",  # Route to Neural Engine, leave GPU free for LLM/VLM
     ),
     "intel": BackendSpec(
         name="intel",
@@ -86,6 +88,7 @@ class HardwareEnv:
     backend: str = "cpu"              # "cuda" | "rocm" | "mps" | "intel" | "cpu"
     device: str = "cpu"               # torch device string
     export_format: str = "onnx"       # optimal export format
+    compute_units: str = "all"        # CoreML compute units (Apple only)
     gpu_name: str = ""                # human-readable GPU name
     gpu_memory_mb: int = 0            # GPU memory in MB
     driver_version: str = ""          # GPU driver version
@@ -113,9 +116,11 @@ def detect() -> "HardwareEnv":
         else:
             env._fallback_cpu()
 
-        # Set export format from backend spec
+        # Set export format and compute units from backend spec
         spec = BACKEND_SPECS.get(env.backend, BACKEND_SPECS["cpu"])
         env.export_format = spec.export_format
+        if spec.compute_units:
+            env.compute_units = spec.compute_units
 
         # Check if optimized runtime is available
         env.framework_ok = env._check_framework()
@@ -439,6 +444,58 @@ def export_model(self, model, model_name: str) -> Optional[Path]:
 
         return None
 
+    def _load_coreml_with_compute_units(self, model_path: str):
+        """
+        Load a CoreML model via YOLO with specific compute_units.
+
+        Monkey-patches coremltools.MLModel while YOLO() runs so models built
+        there default to self.compute_units (ultralytics doesn't expose it).
+        Unknown values or a missing coremltools fall back to a plain load.
+        """
+        from ultralytics import YOLO
+
+        # Map string config → coremltools.ComputeUnit member name
+        _COMPUTE_UNIT_MAP = {
+            "all": "ALL",
+            "cpu_only": "CPU_ONLY",
+            "cpu_and_gpu": "CPU_AND_GPU",
+            "cpu_and_ne": "CPU_AND_NE",
+        }
+
+        ct_enum_name = _COMPUTE_UNIT_MAP.get(self.compute_units)
+        if not ct_enum_name:
+            _log(f"Unknown compute_units '{self.compute_units}', using default")
+            return YOLO(model_path)
+
+        try:
+            import coremltools as ct
+            target_units = getattr(ct.ComputeUnit, ct_enum_name, None)
+            if target_units is None:
+                _log(f"coremltools.ComputeUnit.{ct_enum_name} not available")
+                return YOLO(model_path)
+
+            # Temporarily patch MLModel; setdefault keeps an explicit compute_units
+            _OrigMLModel = ct.models.MLModel
+
+            class _PatchedMLModel(_OrigMLModel):
+                def __init__(self, *args, **kwargs):
+                    kwargs.setdefault('compute_units', target_units)
+                    super().__init__(*args, **kwargs)
+
+            ct.models.MLModel = _PatchedMLModel
+            try:
+                model = YOLO(model_path)
+            finally:
+                ct.models.MLModel = _OrigMLModel  # Always restore
+
+            ne_note = " (Neural Engine preferred)" if ct_enum_name == "CPU_AND_NE" else ""
+            _log(f"CoreML model loaded with compute_units={ct_enum_name}{ne_note}")
+            return model
+
+        except ImportError:
+            _log("coremltools not available, loading without compute_units")
+            return YOLO(model_path)
+
     def load_optimized(self, model_name: str, use_optimized: bool = True):
         """
         Load the best available model for this hardware.
@@ -455,7 +512,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
             optimized_path = self.get_optimized_path(model_name)
             if optimized_path.exists():
                 try:
-                    model = YOLO(str(optimized_path))
+                    # On Apple Silicon: route CoreML to Neural Engine
+                    if self.backend == "mps" and self.compute_units != "all":
+                        model = self._load_coreml_with_compute_units(
+                            str(optimized_path))
+                    else:
+                        model = YOLO(str(optimized_path))
                     self.load_ms = (time.perf_counter() - t0) * 1000
                     _log(f"Loaded {self.export_format} model ({self.load_ms:.0f}ms)")
                     return model, self.export_format
@@ -467,7 +529,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
             exported = self.export_model(pt_model, model_name)
             if exported:
                 try:
-                    model = YOLO(str(exported))
+                    # On Apple Silicon: route CoreML to Neural Engine
+                    if self.backend == "mps" and self.compute_units != "all":
+                        model = self._load_coreml_with_compute_units(
+                            str(exported))
+                    else:
+                        model = YOLO(str(exported))
                     self.load_ms = (time.perf_counter() - t0) * 1000
                     _log(f"Loaded freshly exported {self.export_format} model ({self.load_ms:.0f}ms)")
                     return model, self.export_format
@@ -508,7 +575,7 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
 
     def to_dict(self) -> dict:
         """Serialize environment info for JSON output."""
-        return {
+        d = {
             "backend": self.backend,
             "device": self.device,
             "export_format": self.export_format,
@@ -519,6 +586,9 @@ def to_dict(self) -> dict:
             "export_ms": round(self.export_ms, 1),
             "load_ms": round(self.load_ms, 1),
         }
+        if self.backend == "mps":
+            d["compute_units"] = self.compute_units
+        return d
 
 
 # ─── CLI: run standalone for diagnostics ─────────────────────────────────────
diff --git a/skills/lib/env_config.py b/skills/lib/env_config.py
index ff42e6f..7c46c05 100644
--- a/skills/lib/env_config.py
+++ b/skills/lib/env_config.py
@@ -40,6 +40,7 @@ class BackendSpec:
     model_suffix: str       # file extension/dir to look for cached model
     half: bool = True       # use FP16
     extra_export_args: dict = field(default_factory=dict)
+    compute_units: Optional[str] = None  # CoreML compute units: "cpu_and_ne", "all", etc.
 
 
 BACKEND_SPECS = {
@@ -61,6 +62,7 @@ class BackendSpec:
         model_suffix=".mlpackage",
         half=True,
         extra_export_args={"nms": False},
+        compute_units="cpu_and_ne",  # Route to Neural Engine, leave GPU free for LLM/VLM
     ),
     "intel": BackendSpec(
         name="intel",
@@ -86,6 +88,7 @@ class HardwareEnv:
     backend: str = "cpu"              # "cuda" | "rocm" | "mps" | "intel" | "cpu"
     device: str = "cpu"               # torch device string
     export_format: str = "onnx"       # optimal export format
+    compute_units: str = "all"        # CoreML compute units (Apple only)
     gpu_name: str = ""                # human-readable GPU name
     gpu_memory_mb: int = 0            # GPU memory in MB
     driver_version: str = ""          # GPU driver version
@@ -113,9 +116,11 @@ def detect() -> "HardwareEnv":
         else:
             env._fallback_cpu()
 
-        # Set export format from backend spec
+        # Set export format and compute units from backend spec
         spec = BACKEND_SPECS.get(env.backend, BACKEND_SPECS["cpu"])
         env.export_format = spec.export_format
+        if spec.compute_units:
+            env.compute_units = spec.compute_units
 
         # Check if optimized runtime is available
         env.framework_ok = env._check_framework()
@@ -439,6 +444,58 @@ def export_model(self, model, model_name: str) -> Optional[Path]:
 
         return None
 
+    def _load_coreml_with_compute_units(self, model_path: str):
+        """
+        Load a CoreML model via YOLO with specific compute_units.
+
+        Monkey-patches coremltools.MLModel while YOLO() runs so models built
+        there default to self.compute_units (ultralytics doesn't expose it).
+        Unknown values or a missing coremltools fall back to a plain load.
+        """
+        from ultralytics import YOLO
+
+        # Map string config → coremltools.ComputeUnit member name
+        _COMPUTE_UNIT_MAP = {
+            "all": "ALL",
+            "cpu_only": "CPU_ONLY",
+            "cpu_and_gpu": "CPU_AND_GPU",
+            "cpu_and_ne": "CPU_AND_NE",
+        }
+
+        ct_enum_name = _COMPUTE_UNIT_MAP.get(self.compute_units)
+        if not ct_enum_name:
+            _log(f"Unknown compute_units '{self.compute_units}', using default")
+            return YOLO(model_path)
+
+        try:
+            import coremltools as ct
+            target_units = getattr(ct.ComputeUnit, ct_enum_name, None)
+            if target_units is None:
+                _log(f"coremltools.ComputeUnit.{ct_enum_name} not available")
+                return YOLO(model_path)
+
+            # Temporarily patch MLModel; setdefault keeps an explicit compute_units
+            _OrigMLModel = ct.models.MLModel
+
+            class _PatchedMLModel(_OrigMLModel):
+                def __init__(self, *args, **kwargs):
+                    kwargs.setdefault('compute_units', target_units)
+                    super().__init__(*args, **kwargs)
+
+            ct.models.MLModel = _PatchedMLModel
+            try:
+                model = YOLO(model_path)
+            finally:
+                ct.models.MLModel = _OrigMLModel  # Always restore
+
+            ne_note = " (Neural Engine preferred)" if ct_enum_name == "CPU_AND_NE" else ""
+            _log(f"CoreML model loaded with compute_units={ct_enum_name}{ne_note}")
+            return model
+
+        except ImportError:
+            _log("coremltools not available, loading without compute_units")
+            return YOLO(model_path)
+
     def load_optimized(self, model_name: str, use_optimized: bool = True):
         """
         Load the best available model for this hardware.
@@ -455,7 +512,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
             optimized_path = self.get_optimized_path(model_name)
             if optimized_path.exists():
                 try:
-                    model = YOLO(str(optimized_path))
+                    # On Apple Silicon: route CoreML to Neural Engine
+                    if self.backend == "mps" and self.compute_units != "all":
+                        model = self._load_coreml_with_compute_units(
+                            str(optimized_path))
+                    else:
+                        model = YOLO(str(optimized_path))
                     self.load_ms = (time.perf_counter() - t0) * 1000
                     _log(f"Loaded {self.export_format} model ({self.load_ms:.0f}ms)")
                     return model, self.export_format
@@ -467,7 +529,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
             exported = self.export_model(pt_model, model_name)
             if exported:
                 try:
-                    model = YOLO(str(exported))
+                    # On Apple Silicon: route CoreML to Neural Engine
+                    if self.backend == "mps" and self.compute_units != "all":
+                        model = self._load_coreml_with_compute_units(
+                            str(exported))
+                    else:
+                        model = YOLO(str(exported))
                     self.load_ms = (time.perf_counter() - t0) * 1000
                     _log(f"Loaded freshly exported {self.export_format} model ({self.load_ms:.0f}ms)")
                     return model, self.export_format
@@ -508,7 +575,7 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
 
     def to_dict(self) -> dict:
         """Serialize environment info for JSON output."""
-        return {
+        d = {
             "backend": self.backend,
             "device": self.device,
             "export_format": self.export_format,
@@ -519,6 +586,9 @@ def to_dict(self) -> dict:
             "export_ms": round(self.export_ms, 1),
             "load_ms": round(self.load_ms, 1),
         }
+        if self.backend == "mps":
+            d["compute_units"] = self.compute_units
+        return d
 
 
 # ─── CLI: run standalone for diagnostics ─────────────────────────────────────
diff --git a/skills/lib/test_env_config_ane.py b/skills/lib/test_env_config_ane.py
new file mode 100644
index 0000000..dc032eb
--- /dev/null
+++ b/skills/lib/test_env_config_ane.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python3
+"""
+Unit tests for Apple Neural Engine (ANE) compute_units in env_config.py.
+
+Tests compute_units configuration, monkey-patch scoping, and CoreML
+load-time injection — all mocked, no Apple hardware required.
+
+Run:  python -m pytest skills/lib/test_env_config_ane.py -v
+"""
+
+import platform
+import subprocess
+import sys
+from pathlib import Path
+from unittest import mock
+
+import pytest
+
+# Ensure env_config is importable from skills/lib/
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from env_config import BackendSpec, BACKEND_SPECS, HardwareEnv, _log  # noqa: E402
+
+
+# ── Tests: BackendSpec compute_units ────────────────────────────────────────
+
+class TestBackendSpecComputeUnits:
+    """Check the compute_units value declared by each BACKEND_SPECS entry."""
+
+    def test_mps_spec_has_cpu_and_ne(self):
+        """The mps spec targets CPU + Neural Engine ("cpu_and_ne")."""
+        assert BACKEND_SPECS["mps"].compute_units == "cpu_and_ne"
+
+    def test_cuda_spec_has_no_compute_units(self):
+        """Backends other than mps leave compute_units as None."""
+        cuda_spec = BACKEND_SPECS["cuda"]
+        assert cuda_spec.compute_units is None
+
+    def test_cpu_spec_has_no_compute_units(self):
+        assert BACKEND_SPECS["cpu"].compute_units is None
+
+    def test_rocm_spec_has_no_compute_units(self):
+        assert BACKEND_SPECS["rocm"].compute_units is None
+
+    def test_intel_spec_has_no_compute_units(self):
+        assert BACKEND_SPECS["intel"].compute_units is None
+
+
+# ── Tests: HardwareEnv compute_units field ──────────────────────────────────
+
+class TestHardwareEnvComputeUnits:
+    """Verify HardwareEnv.compute_units: default, MPS spec value, to_dict()."""
+
+    def test_default_compute_units_is_all(self):
+        """A freshly constructed HardwareEnv has compute_units='all'."""
+        env = HardwareEnv()
+        assert env.compute_units == "all"
+
+    @mock.patch("env_config.platform.system", return_value="Darwin")
+    @mock.patch("env_config.platform.machine", return_value="arm64")
+    @mock.patch("env_config.subprocess.run")
+    @mock.patch("env_config.shutil.which", return_value=None)
+    @mock.patch("env_config.Path.is_dir", return_value=False)
+    def test_mps_sets_compute_units_cpu_and_ne(
+        self, _dir, _which, mock_run, _machine, _system
+    ):
+        """_try_mps() picks the mps backend; applying its spec gives 'cpu_and_ne'."""
+        mock_run.return_value = subprocess.CompletedProcess(
+            args=[], returncode=0, stdout="Apple M3 Max"
+        )
+
+        env = HardwareEnv()
+        result = env._try_mps()
+        assert result is True
+
+        # Mirror detect()'s spec → env copy by hand (detect() itself is not called)
+        spec = BACKEND_SPECS.get(env.backend, BACKEND_SPECS["cpu"])
+        if spec.compute_units:
+            env.compute_units = spec.compute_units
+
+        assert env.backend == "mps"
+        assert env.compute_units == "cpu_and_ne"
+
+    def test_to_dict_includes_compute_units_for_mps(self):
+        """to_dict() includes compute_units when backend is mps."""
+        env = HardwareEnv()
+        env.backend = "mps"
+        env.compute_units = "cpu_and_ne"
+        d = env.to_dict()
+        assert "compute_units" in d
+        assert d["compute_units"] == "cpu_and_ne"
+
+    def test_to_dict_excludes_compute_units_for_non_mps(self):
+        """to_dict() does NOT include compute_units for non-mps backends."""
+        env = HardwareEnv()
+        env.backend = "cuda"
+        d = env.to_dict()
+        assert "compute_units" not in d
+
+
+# ── Tests: _load_coreml_with_compute_units ──────────────────────────────────
+
+class TestLoadCoremlWithComputeUnits:
+    """Test the monkey-patch mechanism for CoreML compute_units."""
+
+    @staticmethod
+    def _fake_coremltools():
+        """Build a stand-in coremltools module with a real MLModel class.
+
+        MLModel must be a real class (not a MagicMock) because the code under
+        test subclasses it; instances record the kwargs they were built with.
+        """
+        class FakeMLModel:
+            def __init__(self, *args, **kwargs):
+                self.kwargs = kwargs
+
+        fake_ct = mock.MagicMock()
+        fake_ct.ComputeUnit.CPU_AND_NE = "CPU_AND_NE_SENTINEL"
+        fake_ct.ComputeUnit.ALL = "ALL_SENTINEL"
+        fake_ct.models.MLModel = FakeMLModel
+        return fake_ct, FakeMLModel
+
+    def test_monkey_patch_injects_compute_units(self):
+        """MLModel is temporarily patched to inject CPU_AND_NE."""
+        env = HardwareEnv()
+        env.backend = "mps"
+        env.compute_units = "cpu_and_ne"
+
+        fake_ct, fake_mlmodel = self._fake_coremltools()
+        built = []
+
+        def fake_yolo(path):
+            """Stand in for YOLO: build MLModel instances during the load."""
+            # Look the class up at call time so the patched subclass is used
+            built.append(fake_ct.models.MLModel(path))
+            built.append(fake_ct.models.MLModel(path, compute_units="CUSTOM"))
+            return mock.sentinel.model
+
+        # Both function-local imports in the method resolve to the fakes
+        with mock.patch.dict("sys.modules", {
+            "coremltools": fake_ct,
+            "ultralytics": mock.MagicMock(YOLO=fake_yolo),
+        }):
+            result = env._load_coreml_with_compute_units("test.mlpackage")
+
+        assert result is mock.sentinel.model
+        implicit, explicit = built
+        # Both instances came from the patched subclass of the fake MLModel
+        assert isinstance(implicit, fake_mlmodel)
+        assert type(implicit) is not fake_mlmodel
+        # Default is injected when the caller passes no compute_units
+        assert implicit.kwargs.get('compute_units') == "CPU_AND_NE_SENTINEL"
+        # Verify explicit override is preserved
+        assert explicit.kwargs.get('compute_units') == "CUSTOM"
+        # Patch is removed again after a successful load
+        assert fake_ct.models.MLModel is fake_mlmodel
+
+    def test_monkey_patch_restored_after_load(self):
+        """MLModel is restored to original after YOLO load, even on error."""
+        env = HardwareEnv()
+        env.backend = "mps"
+        env.compute_units = "cpu_and_ne"
+
+        fake_ct, fake_mlmodel = self._fake_coremltools()
+        seen_during_load = []
+
+        def failing_yolo(path):
+            """Record which MLModel is installed, then fail the load."""
+            seen_during_load.append(fake_ct.models.MLModel)
+            raise RuntimeError("test error")
+
+        # Both function-local imports in the method resolve to the fakes
+        with mock.patch.dict("sys.modules", {
+            "coremltools": fake_ct,
+            "ultralytics": mock.MagicMock(YOLO=failing_yolo),
+        }):
+            # The load error must propagate rather than be swallowed
+            with pytest.raises(RuntimeError, match="test error"):
+                env._load_coreml_with_compute_units("test.mlpackage")
+
+        # The patch really was installed while YOLO ran ...
+        assert seen_during_load and seen_during_load[0] is not fake_mlmodel
+        # ... and MLModel is restored to original even after the error
+        assert fake_ct.models.MLModel is fake_mlmodel
+
+    def test_unknown_compute_units_falls_back(self):
+        """Unknown compute_units string falls back to plain YOLO load."""
+        env = HardwareEnv()
+        env.backend = "mps"
+        env.compute_units = "unknown_units"
+
+        mock_model = mock.MagicMock()
+        mock_yolo = mock.MagicMock(return_value=mock_model)
+
+        with mock.patch.dict("sys.modules", {
+            "ultralytics": mock.MagicMock(YOLO=mock_yolo),
+        }):
+            result = env._load_coreml_with_compute_units("test.mlpackage")
+
+        # Exactly one plain load, and its model is what the caller gets back
+        mock_yolo.assert_called_once_with("test.mlpackage")
+        assert result is mock_model
+
+    def test_coremltools_missing_falls_back(self):
+        """If coremltools import fails, falls back to plain YOLO load."""
+        env = HardwareEnv()
+        env.backend = "mps"
+        env.compute_units = "cpu_and_ne"
+
+        mock_model = mock.MagicMock()
+        mock_yolo = mock.MagicMock(return_value=mock_model)
+
+        # Make coremltools import fail
+        # (a None entry in sys.modules makes `import` raise ImportError)
+        with mock.patch.dict("sys.modules", {
+            "coremltools": None,
+            "ultralytics": mock.MagicMock(YOLO=mock_yolo),
+        }):
+            result = env._load_coreml_with_compute_units("test.mlpackage")
+
+        # Exactly one plain load, and its model is what the caller gets back
+        mock_yolo.assert_called_once_with("test.mlpackage")
+        assert result is mock_model
+
+
+# ── Tests: load_optimized integration ───────────────────────────────────────
+
+class TestLoadOptimizedMPS:
+    """Test that load_optimized routes through compute_units on MPS."""
+
+    def test_mps_cached_model_uses_compute_units(self):
+        """When cached .mlpackage exists, loads via _load_coreml_with_compute_units."""
+        env = HardwareEnv()
+        env.backend = "mps"
+        env.device = "mps"
+        env.export_format = "coreml"
+        env.framework_ok = True
+        env.compute_units = "cpu_and_ne"
+
+        mock_model = mock.MagicMock()
+        cached_path = mock.MagicMock(exists=lambda: True)
+
+        with mock.patch.object(env, "_load_coreml_with_compute_units",
+                               return_value=mock_model) as mock_load, \
+             mock.patch.object(env, "get_optimized_path",
+                               return_value=cached_path), \
+             mock.patch.dict("sys.modules", {"ultralytics": mock.MagicMock()}):
+            model, fmt = env.load_optimized("yolo26n")
+
+        assert fmt == "coreml"
+        # The compute_units loader got the cached path and its model is returned
+        mock_load.assert_called_once_with(str(cached_path))
+        assert model is mock_model
+
+    def test_mps_compute_units_all_skips_monkey_patch(self):
+        """When compute_units='all', loads via standard YOLO path."""
+        env = HardwareEnv()
+        env.backend = "mps"
+        env.device = "mps"
+        env.export_format = "coreml"
+        env.framework_ok = True
+        env.compute_units = "all"  # explicit: use all units including GPU
+
+        mock_model = mock.MagicMock()
+        mock_yolo = mock.MagicMock(return_value=mock_model)
+        cached_path = mock.MagicMock(exists=lambda: True)
+
+        with mock.patch.object(env, "_load_coreml_with_compute_units") as mock_load, \
+             mock.patch.object(env, "get_optimized_path", return_value=cached_path), \
+             mock.patch.dict("sys.modules", {"ultralytics": mock.MagicMock(YOLO=mock_yolo)}):
+            model, fmt = env.load_optimized("yolo26n")
+
+        assert fmt == "coreml"
+        # YOLO being called is not enough: the patched loader calls YOLO too
+        mock_load.assert_not_called()
+        mock_yolo.assert_called_once_with(str(cached_path))
+        assert model is mock_model

From 44dce05f72441ec68291f77ddbe9f8774770415e Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Mon, 9 Mar 2026 23:49:33 -0700
Subject: [PATCH 2/2] feat(skills): add deploy.bat Windows equivalents for all
 skills with deploy.sh

- yolo-detection-2026: full GPU detection, venv, requirements, model optimization
- homesafe-bench: npm install
- smarthome-bench: yt-dlp/ffmpeg checks + npm install

ROCm/MPS blocks omitted (Linux/macOS only). Uses py.exe launcher
for Python discovery with fallback to python/python3 on PATH.
---
 skills/analysis/homesafe-bench/deploy.bat     |  20 ++
 skills/analysis/smarthome-bench/deploy.bat    |  75 ++++++
 .../detection/yolo-detection-2026/deploy.bat  | 221 ++++++++++++++++++
 3 files changed, 316 insertions(+)
 create mode 100644 skills/analysis/homesafe-bench/deploy.bat
 create mode 100644 skills/analysis/smarthome-bench/deploy.bat
 create mode 100644 skills/detection/yolo-detection-2026/deploy.bat

diff --git a/skills/analysis/homesafe-bench/deploy.bat b/skills/analysis/homesafe-bench/deploy.bat
new file mode 100644
index 0000000..c616195
--- /dev/null
+++ b/skills/analysis/homesafe-bench/deploy.bat
@@ -0,0 +1,20 @@
+@echo off
+REM HomeSafe-Bench deployment script (Windows)
+REM Runs npm install to fetch openai SDK dependency
+
+cd /d "%~dp0"
+
+where npm >nul 2>&1
+if %errorlevel% neq 0 (
+    echo ERROR: npm not found. Install Node.js from https://nodejs.org and retry.
+    exit /b 1
+)
+REM npm is a .cmd shim on Windows: CALL it so control returns to this script
+call npm install
+if %errorlevel% neq 0 (
+    echo ERROR: npm install failed
+    exit /b 1
+)
+
+echo HomeSafe-Bench dependencies installed
+exit /b 0
diff --git a/skills/analysis/smarthome-bench/deploy.bat b/skills/analysis/smarthome-bench/deploy.bat
new file mode 100644
index 0000000..62a7764
--- /dev/null
+++ b/skills/analysis/smarthome-bench/deploy.bat
@@ -0,0 +1,75 @@
+@echo off
+REM SmartHome-Bench deployment script (Windows)
+REM Called by Aegis deployment agent during skill installation
+
+setlocal enabledelayedexpansion
+
+set "SKILL_DIR=%~dp0"
+if "%SKILL_DIR:~-1%"=="\" set "SKILL_DIR=%SKILL_DIR:~0,-1%"
+echo Deploying SmartHome-Bench from: %SKILL_DIR%
+
+REM ── Check system dependencies ────────────────────────────────────────────────
+
+echo Checking system dependencies...
+
+where yt-dlp >nul 2>&1
+if %errorlevel% neq 0 (
+    echo WARNING: yt-dlp not found. Attempting install via pip...
+    where pip >nul 2>&1
+    if !errorlevel! equ 0 (
+        pip install yt-dlp
+    ) else (
+        where pip3 >nul 2>&1
+        if !errorlevel! equ 0 (
+            pip3 install yt-dlp
+        ) else (
+            echo ERROR: Cannot install yt-dlp automatically. Please install manually:
+            echo   pip install yt-dlp
+            echo   OR download from https://github.com/yt-dlp/yt-dlp/releases
+            exit /b 1
+        )
+    )
+)
+
+REM Verify yt-dlp is now reachable on PATH (covers a failed pip install too)
+where yt-dlp >nul 2>&1
+if %errorlevel% neq 0 (
+    echo ERROR: yt-dlp installation failed
+    exit /b 1
+)
+for /f "tokens=*" %%V in ('yt-dlp --version 2^>nul') do echo   yt-dlp: %%V
+
+where ffmpeg >nul 2>&1
+if %errorlevel% neq 0 (
+    echo ERROR: ffmpeg not found. Please install manually:
+    echo   winget install ffmpeg
+    echo   OR download from https://ffmpeg.org/download.html
+    exit /b 1
+)
+for /f "tokens=1-3" %%A in ('ffmpeg -version 2^>^&1') do (
+    if "%%A"=="ffmpeg" echo   ffmpeg: %%B %%C
+    goto :ffmpeg_done
+)
+:ffmpeg_done
+
+REM ── Install npm dependencies ─────────────────────────────────────────────────
+
+echo Installing npm dependencies...
+
+where npm >nul 2>&1
+if %errorlevel% neq 0 (
+    echo ERROR: npm not found. Install Node.js from https://nodejs.org and retry.
+    exit /b 1
+)
+REM npm is a .cmd shim on Windows: CALL it so control returns to this script
+cd /d "%SKILL_DIR%"
+call npm install --production
+if %errorlevel% neq 0 (
+    echo ERROR: npm install failed
+    exit /b 1
+)
+
+echo SmartHome-Bench deployed successfully
+
+endlocal
+exit /b 0
diff --git a/skills/detection/yolo-detection-2026/deploy.bat b/skills/detection/yolo-detection-2026/deploy.bat
new file mode 100644
index 0000000..6661322
--- /dev/null
+++ b/skills/detection/yolo-detection-2026/deploy.bat
@@ -0,0 +1,221 @@
+@echo off
+REM deploy.bat — Zero-assumption bootstrapper for YOLO 2026 Detection Skill (Windows)
+REM
+REM Probes the system for Python, GPU backends, and installs the minimum
+REM viable stack. Called by Aegis skill-runtime-manager during installation.
+REM
+REM Uses skills\lib\env_config.py for hardware detection and model optimization.
+REM Log lines carry the LOG_PREFIX tag and go to stderr; JSON event lines go to stdout.
+REM Exit codes:
+REM   0  = success
+REM   1  = fatal error (e.g. no Python 3.9+ found, or venv creation failed)
+REM   2  = partial success (CPU-only fallback) - NOTE(review): no code path in this script returns 2 yet
+REM Delayed expansion is required: variables are assigned and read inside parenthesized blocks.
+setlocal enabledelayedexpansion
+
+set "SKILL_DIR=%~dp0"
+REM Strip the trailing backslash so sub-paths can be joined with a single backslash
+if "%SKILL_DIR:~-1%"=="\" set "SKILL_DIR=%SKILL_DIR:~0,-1%"
+set "VENV_DIR=%SKILL_DIR%\.venv"
+set "LOG_PREFIX=[YOLO-2026-deploy]"
+
+REM Resolve lib dir (two levels up + lib) to an absolute path; LIB_DIR stays undefined when env_config.py is not there
+set "LIB_DIR="
+if exist "%SKILL_DIR%\..\..\lib\env_config.py" (
+    pushd "%SKILL_DIR%\..\..\lib"
+    set "LIB_DIR=!CD!"
+    popd
+)
+
+REM ─── Step 1: Find Python ───────────────────────────────────────────────────
+
+echo %LOG_PREFIX% Searching for Python...>&2
+
+set "PYTHON_CMD="
+
+REM Try the Windows Python launcher (py.exe) first — ships with python.org installer; versions are probed newest-first and the first that responds wins
+for %%V in (3.12 3.11 3.10 3.9) do (
+    if not defined PYTHON_CMD (
+        py -%%V --version >nul 2>&1
+        if !errorlevel! equ 0 (
+            set "PYTHON_CMD=py -%%V"
+        )
+    )
+)
+
+REM Fallback: bare python3 / python on PATH, each accepted only when it reports major 3+ and minor 9+
+if not defined PYTHON_CMD (
+    python3 --version >nul 2>&1
+    if !errorlevel! equ 0 (
+        REM Verify version: take token 2 of the --version output, split it on dots, require major 3+ and minor 9+
+        for /f "tokens=2 delims= " %%A in ('python3 --version 2^>^&1') do set "_pyver=%%A"
+        for /f "tokens=1,2 delims=." %%A in ("!_pyver!") do (
+            if %%A geq 3 if %%B geq 9 set "PYTHON_CMD=python3"
+        )
+    )
+)
+REM Last resort: bare python, checked the same way as python3
+if not defined PYTHON_CMD (
+    python --version >nul 2>&1
+    if !errorlevel! equ 0 (
+        for /f "tokens=2 delims= " %%A in ('python --version 2^>^&1') do set "_pyver=%%A"
+        for /f "tokens=1,2 delims=." %%A in ("!_pyver!") do (
+            if %%A geq 3 if %%B geq 9 set "PYTHON_CMD=python"
+        )
+    )
+)
+
+if not defined PYTHON_CMD (
+    echo %LOG_PREFIX% ERROR: No Python ^>=3.9 found. Install Python 3.9+ and retry.>&2
+    echo {"event": "error", "stage": "python", "message": "No Python >=3.9 found"}
+    exit /b 1
+)
+REM Capture the full --version output for the log line and the JSON progress event
+for /f "tokens=*" %%A in ('!PYTHON_CMD! --version 2^>^&1') do set "PY_VERSION=%%A"
+echo %LOG_PREFIX% Using Python: %PYTHON_CMD% (%PY_VERSION%)>&2
+echo {"event": "progress", "stage": "python", "message": "Found %PY_VERSION%"}
+
+REM ─── Step 2: Create virtual environment ────────────────────────────────────
+
+if not exist "%VENV_DIR%\Scripts\python.exe" (
+    echo %LOG_PREFIX% Creating virtual environment...>&2
+    %PYTHON_CMD% -m venv "%VENV_DIR%"
+    if !errorlevel! neq 0 (
+        echo %LOG_PREFIX% ERROR: Failed to create virtual environment>&2
+        echo {"event": "error", "stage": "venv", "message": "Failed to create venv"}
+        exit /b 1
+    )
+)
+REM Tools inside the venv, used by every later step
+set "PIP=%VENV_DIR%\Scripts\pip.exe"
+set "VPYTHON=%VENV_DIR%\Scripts\python.exe"
+REM Upgrade pip through the interpreter: pip.exe cannot replace itself while it is running on Windows
+"%VPYTHON%" -m pip install --upgrade pip -q >nul 2>&1
+
+echo {"event": "progress", "stage": "venv", "message": "Virtual environment ready"}
+
+REM ─── Step 2.5: Bundle env_config.py alongside detect.py ────────────────────
+REM Copy the shared env_config.py into scripts\ and log whether the copy actually succeeded
+if defined LIB_DIR (
+    if exist "%LIB_DIR%\env_config.py" (
+        copy /Y "%LIB_DIR%\env_config.py" "%SKILL_DIR%\scripts\env_config.py" >nul 2>&1
+        if errorlevel 1 (echo %LOG_PREFIX% WARNING: Could not bundle env_config.py into scripts\>&2) else (echo %LOG_PREFIX% Bundled env_config.py into scripts\>&2)
+    )
+)
+
+REM ─── Step 3: Detect hardware via env_config ────────────────────────────────
+
+set "BACKEND=cpu"
+
+REM Find env_config.py — bundled copy or repo lib\
+set "ENV_CONFIG_DIR="
+if exist "%SKILL_DIR%\scripts\env_config.py" (
+    set "ENV_CONFIG_DIR=%SKILL_DIR%\scripts"
+) else if defined LIB_DIR (
+    if exist "%LIB_DIR%\env_config.py" (
+        set "ENV_CONFIG_DIR=%LIB_DIR%"
+    )
+)
+
+if defined ENV_CONFIG_DIR (
+    echo %LOG_PREFIX% Detecting hardware via env_config.py...>&2
+    set "DETECTED_BACKEND="
+    REM FOR /F runs the command via cmd /c, which strips the first and last quote - the caret-escaped outer quotes absorb that
+    for /f "tokens=*" %%B in ('^""%VPYTHON%" -c "import sys; sys.path.insert(0, r'!ENV_CONFIG_DIR!'); from env_config import HardwareEnv; env = HardwareEnv.detect(); print(env.backend)" 2^>nul^"') do (
+        set "DETECTED_BACKEND=%%B"
+    )
+
+    REM Validate backend value (Windows: only cuda, intel, cpu are realistic)
+    if "!DETECTED_BACKEND!"=="cuda" (
+        set "BACKEND=cuda"
+    ) else if "!DETECTED_BACKEND!"=="intel" (
+        set "BACKEND=intel"
+    ) else if "!DETECTED_BACKEND!"=="cpu" (
+        set "BACKEND=cpu"
+    ) else (
+        echo %LOG_PREFIX% env_config returned '!DETECTED_BACKEND!', falling back to cpu>&2
+        set "BACKEND=cpu"
+    )
+
+    echo %LOG_PREFIX% env_config detected backend: !BACKEND!>&2
+) else (
+    echo %LOG_PREFIX% env_config.py not found, using heuristic detection...>&2
+
+    REM Fallback: inline GPU detection via nvidia-smi
+    where nvidia-smi >nul 2>&1
+    if !errorlevel! equ 0 (
+        for /f "tokens=*" %%G in ('nvidia-smi --query-gpu^=driver_version --format^=csv^,noheader 2^>nul') do (
+            if not "%%G"=="" (
+                set "BACKEND=cuda"
+                echo %LOG_PREFIX% Detected NVIDIA GPU ^(driver: %%G^)>&2
+            )
+        )
+    )
+)
+
+echo {"event": "progress", "stage": "gpu", "backend": "!BACKEND!", "message": "Compute backend: !BACKEND!"}
+
+REM ─── Step 4: Install requirements ──────────────────────────────────────────
+
+set "REQ_FILE=%SKILL_DIR%\requirements_!BACKEND!.txt"
+
+if not exist "!REQ_FILE!" (
+    echo %LOG_PREFIX% WARNING: !REQ_FILE! not found, falling back to CPU>&2
+    set "REQ_FILE=%SKILL_DIR%\requirements_cpu.txt"
+    set "BACKEND=cpu"
+)
+
+echo %LOG_PREFIX% Installing dependencies from !REQ_FILE! ...>&2
+echo {"event": "progress", "stage": "install", "message": "Installing !BACKEND! dependencies..."}
+
+REM pip output is discarded with a plain redirect, not a pipe: after a pipe ERRORLEVEL belongs to the last command in it, not to pip
+if "!BACKEND!"=="cuda" (
+    REM CUDA on Windows: install torch from the CUDA wheel index first, retrying with the older cu121 index on failure
+    "%PIP%" install torch torchvision --index-url https://download.pytorch.org/whl/cu124 -q >nul 2>&1
+    if !errorlevel! neq 0 (
+        echo %LOG_PREFIX% WARNING: CUDA torch install failed, trying cu121...>&2
+        "%PIP%" install torch torchvision --index-url https://download.pytorch.org/whl/cu121 -q >nul 2>&1
+    )
+)
+REM Install the backend requirements file (ultralytics, etc.) for every backend and surface a pip failure in the log
+"%PIP%" install -r "!REQ_FILE!" -q >nul 2>&1
+if !errorlevel! neq 0 echo %LOG_PREFIX% WARNING: Dependency install from !REQ_FILE! failed>&2
+
+REM ─── Step 5: Pre-convert model to optimized format ─────────────────────────
+
+if "!BACKEND!" neq "cpu" (
+    echo %LOG_PREFIX% Pre-converting model to optimized format for !BACKEND!...>&2
+    echo {"event": "progress", "stage": "optimize", "message": "Converting model for !BACKEND! (~30-120s)..."}
+    REM Loads yolo26n.pt with ultralytics and calls env.export_model; a non-zero Python exit is reported below as a PyTorch fallback
+    "%VPYTHON%" -c "import sys; sys.path.insert(0, r'!ENV_CONFIG_DIR!'); from env_config import HardwareEnv; env = HardwareEnv.detect(); from ultralytics import YOLO; model = YOLO('yolo26n.pt'); result = env.export_model(model, 'yolo26n'); print(f'Optimized model exported: {result}' if result else 'Export skipped or failed')" 2>&1
+
+    if !errorlevel! equ 0 (
+        echo {"event": "progress", "stage": "optimize", "message": "Model optimization complete"}
+    ) else (
+        echo %LOG_PREFIX% WARNING: Model optimization failed, will use PyTorch at runtime>&2
+        echo {"event": "progress", "stage": "optimize", "message": "Optimization failed — PyTorch fallback"}
+    )
+) else if exist "%SKILL_DIR%\requirements_cpu.txt" (
+    echo %LOG_PREFIX% Pre-converting model to ONNX for CPU...>&2
+    echo {"event": "progress", "stage": "optimize", "message": "Converting model for cpu (~30-120s)..."}
+    REM Same export command as the non-cpu branch. NOTE for review: Python stderr is merged into stdout, which also carries the JSON events - confirm the consumer tolerates that
+    "%VPYTHON%" -c "import sys; sys.path.insert(0, r'!ENV_CONFIG_DIR!'); from env_config import HardwareEnv; env = HardwareEnv.detect(); from ultralytics import YOLO; model = YOLO('yolo26n.pt'); result = env.export_model(model, 'yolo26n'); print(f'Optimized model exported: {result}' if result else 'Export skipped or failed')" 2>&1
+
+    if !errorlevel! equ 0 (
+        echo {"event": "progress", "stage": "optimize", "message": "Model optimization complete"}
+    ) else (
+        echo %LOG_PREFIX% WARNING: Model optimization failed, will use PyTorch at runtime>&2
+        echo {"event": "progress", "stage": "optimize", "message": "Optimization failed — PyTorch fallback"}
+    )
+)
+
+REM ─── Step 6: Verify installation ───────────────────────────────────────────
+
+echo %LOG_PREFIX% Verifying installation...>&2
+"%VPYTHON%" -c "import sys, json; sys.path.insert(0, r'!ENV_CONFIG_DIR!'); from env_config import HardwareEnv; env = HardwareEnv.detect(); print(json.dumps(env.to_dict(), indent=2))" 2>&1
+REM The literal exclamation mark in the Done line is caret-escaped: with delayed expansion on, a bare one is parsed as a variable delimiter
+echo {"event": "complete", "backend": "!BACKEND!", "message": "YOLO 2026 skill installed (!BACKEND! backend)"}
+echo %LOG_PREFIX% Done^^! Backend: !BACKEND!>&2
+
+endlocal
+exit /b 0