From e98df9d88a1ae880d19dea0c79d747060c1daa1f Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 16:29:28 -0700
Subject: [PATCH 01/14] feat(skills): add TransformSkillBase reusable base
 class for transform skills

Introduces a 377-line abstract base class that standardizes the stdin/stdout
JSONL protocol, device selection, config loading (AEGIS_SKILL_PARAMS + CLI +
file), graceful signal handling, and performance telemetry for all transform
skills. New skills subclass TransformSkillBase and implement load_model() and
transform_frame() only.
---
 .../scripts/transform_base.py                 | 377 ++++++++++++++++++
 1 file changed, 377 insertions(+)
 create mode 100644 skills/transformation/depth-estimation/scripts/transform_base.py
diff --git a/skills/transformation/depth-estimation/scripts/transform_base.py b/skills/transformation/depth-estimation/scripts/transform_base.py
new file mode 100644
index 0000000..98d9013
--- /dev/null
+++ b/skills/transformation/depth-estimation/scripts/transform_base.py
@@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+"""
+TransformSkillBase — Abstract base class for Aegis privacy/transform skills.
+
+Any skill that transforms camera frames (depth maps, blur, pixelation, etc.)
+should subclass TransformSkillBase and implement the `transform_frame` method.
+
+## Protocol (JSONL over stdin/stdout)
+
+### Aegis → Skill (stdin)
+```jsonl
+{"event": "frame", "frame_id": "cam1_1710001", "camera_id": "front_door", "frame_path": "/tmp/frame.jpg", "timestamp": "..."}
+{"command": "stop"}
+{"command": "config-update", "config": {"opacity": 0.8}}
+```
+
+### Skill → Aegis (stdout)
+```jsonl
+{"event": "ready", "model": "depth-anything-v2-small", "device": "mps"}
+{"event": "transform", "frame_id": "cam1_1710001", "camera_id": "front_door", "transform_path": "/tmp/depth_001.jpg"}
+{"event": "transform", "frame_id": "cam1_1710001", "camera_id": "front_door", "transform_data": "<base64 JPEG>"}
+{"event": "error", "message": "...", "retriable": true}
+{"event": "perf_stats", "total_frames": 100, "timings_ms": {...}}
+```
+
+## Implementing a new transform skill
+
+```python
+from transform_base import TransformSkillBase
+
+class MyCustomTransform(TransformSkillBase):
+    def load_model(self, config):
+        # Load your model here
+        self.model = load_my_model(config["model"])
+        return {"model": config["model"], "device": self.device}
+
+    def transform_frame(self, image, metadata):
+        # Transform the image (numpy BGR array)
+        result = self.model.process(image)
+        return result  # Return numpy BGR array
+
+if __name__ == "__main__":
+    MyCustomTransform().run()
+```
+"""
+
+import sys
+import json
+import signal
+import time
+import argparse
+import tempfile
+import base64
+from abc import ABC, abstractmethod
+from pathlib import Path
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# Performance Tracker
+# ═══════════════════════════════════════════════════════════════════════════════
+
+class PerfTracker:
+    """Collects per-frame timings and emits periodic aggregate stats."""
+
+    def __init__(self, interval: int = 50):
+        self.interval = interval
+        self.frame_count = 0
+        self.total_frames = 0
+        self.error_count = 0
+        self.model_load_ms = 0.0
+
+        self._timings: dict[str, list[float]] = {
+            "file_read": [],
+            "transform": [],
+            "encode": [],
+            "emit": [],
+            "total": [],
+        }
+
+    def record(self, stage: str, duration_ms: float):
+        if stage in self._timings:
+            self._timings[stage].append(duration_ms)
+
+    def record_frame(self):
+        self.frame_count += 1
+        self.total_frames += 1
+        if self.frame_count >= self.interval:
+            self.emit_stats()
+            self.frame_count = 0
+
+    def emit_stats(self):
+        stats = {
+            "event": "perf_stats",
+            "total_frames": self.total_frames,
+            "window_size": len(self._timings["total"]) or 1,
+            "errors": self.error_count,
+            "model_load_ms": round(self.model_load_ms, 1),
+            "timings_ms": {},
+        }
+        for stage, values in self._timings.items():
+            if not values:
+                continue
+            sv = sorted(values)
+            n = len(sv)
+            stats["timings_ms"][stage] = {
+                "avg": round(sum(sv) / n, 2),
+                "min": round(sv[0], 2),
+                "max": round(sv[-1], 2),
+                "p50": round(sv[n // 2], 2),
+                "p95": round(sv[int(n * 0.95)], 2),
+            }
+        _emit(stats)
+        for key in self._timings:
+            self._timings[key].clear()
+
+    def emit_final(self):
+        if self._timings["total"]:
+            self.emit_stats()
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# JSONL helpers
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _emit(event: dict):
+    """Emit a JSONL event to stdout."""
+    print(json.dumps(event), flush=True)
+
+
+def _log(msg: str, tag: str = "TransformSkill"):
+    """Log to stderr (not captured by Aegis JSONL parser)."""
+    print(f"[{tag}] {msg}", file=sys.stderr, flush=True)
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# Base Class
+# ═══════════════════════════════════════════════════════════════════════════════
+
+class TransformSkillBase(ABC):
+    """
+    Abstract base class for privacy/transform skills.
+
+    Subclasses MUST implement:
+      - load_model(config) → dict   : Load the model, return ready event fields
+      - transform_frame(image, meta) → ndarray : Transform a single frame (BGR in, BGR out)
+
+    Subclasses MAY override:
+      - parse_extra_args(parser)   : Add custom CLI arguments
+      - on_config_update(config)   : Handle live config updates
+      - get_output_mode()          : Return 'path' (default) or 'base64'
+    """
+
+    def __init__(self):
+        self.device = "cpu"
+        self.config = {}
+        self.perf = PerfTracker()
+        self._running = True
+        self._tag = self.__class__.__name__
+
+    # ── Abstract methods ─────────────────────────────────────────────────
+
+    @abstractmethod
+    def load_model(self, config: dict) -> dict:
+        """
+        Load the transform model.
+
+        Args:
+            config: Merged config from AEGIS_SKILL_PARAMS / CLI / config file
+
+        Returns:
+            dict with at least {"model": str, "device": str} for the ready event.
+        """
+        ...
+
+    @abstractmethod
+    def transform_frame(self, image, metadata: dict):
+        """
+        Transform a single frame.
+
+        Args:
+            image: numpy BGR array (from cv2.imread)
+            metadata: {"camera_id": str, "frame_id": str, "timestamp": str, ...}
+
+        Returns:
+            numpy BGR array (transformed image)
+        """
+        ...
+
+    # ── Optional overrides ───────────────────────────────────────────────
+
+    def parse_extra_args(self, parser: argparse.ArgumentParser):
+        """Override to add skill-specific CLI arguments."""
+        pass
+
+    def on_config_update(self, config: dict):
+        """Override to handle live config updates from Aegis."""
+        pass
+
+    def get_output_mode(self) -> str:
+        """Return 'path' (write to temp file) or 'base64' (inline data)."""
+        return "path"
+
+    # ── Main entry point ─────────────────────────────────────────────────
+
+    def run(self):
+        """Parse args, load model, enter stdin loop."""
+        args = self._parse_args()
+        self.config = self._load_config(args)
+        self.device = self._select_device(self.config.get("device", "auto"))
+
+        # Load model
+        try:
+            _emit({"event": "progress", "stage": "init", "message": "Loading model..."})
+            t0 = time.perf_counter()
+            ready_fields = self.load_model(self.config)
+            self.perf.model_load_ms = (time.perf_counter() - t0) * 1000
+
+            ready_event = {
+                "event": "ready",
+                "model_load_ms": round(self.perf.model_load_ms, 1),
+                **ready_fields,
+            }
+            _emit(ready_event)
+        except Exception as e:
+            _emit({"event": "error", "message": f"Model load failed: {e}", "retriable": False})
+            sys.exit(1)
+
+        # Graceful shutdown handler
+        def handle_signal(signum, frame):
+            sig_name = "SIGTERM" if signum == signal.SIGTERM else "SIGINT"
+            _log(f"Received {sig_name}, shutting down", self._tag)
+            self.perf.emit_final()
+            sys.exit(0)
+
+        signal.signal(signal.SIGTERM, handle_signal)
+        signal.signal(signal.SIGINT, handle_signal)
+
+        # Main JSONL stdin loop
+        self._mainloop()
+
+    def _mainloop(self):
+        import cv2  # noqa: delayed import
+
+        output_mode = self.get_output_mode()
+
+        for line in sys.stdin:
+            if not self._running:
+                break
+            line = line.strip()
+            if not line:
+                continue
+
+            try:
+                msg = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+
+            # ── Commands ─────────────────────────────────────────────
+            if msg.get("command") == "stop":
+                break
+            if msg.get("command") == "config-update":
+                self.on_config_update(msg.get("config", {}))
+                continue
+
+            # ── Frame events ─────────────────────────────────────────
+            if msg.get("event") == "frame":
+                t_start = time.perf_counter()
+
+                frame_path = msg.get("frame_path")
+                frame_id = msg.get("frame_id", "")
+                camera_id = msg.get("camera_id", "unknown")
+                timestamp = msg.get("timestamp", "")
+
+                if not frame_path or not Path(frame_path).exists():
+                    _emit({
+                        "event": "error",
+                        "frame_id": frame_id,
+                        "message": f"Frame not found: {frame_path}",
+                        "retriable": True,
+                    })
+                    self.perf.error_count += 1
+                    continue
+
+                try:
+                    # Read frame
+                    t0 = time.perf_counter()
+                    image = cv2.imread(frame_path)
+                    if image is None:
+                        raise ValueError(f"cv2.imread returned None for {frame_path}")
+                    self.perf.record("file_read", (time.perf_counter() - t0) * 1000)
+
+                    # Transform
+                    t0 = time.perf_counter()
+                    metadata = {
+                        "camera_id": camera_id,
+                        "frame_id": frame_id,
+                        "timestamp": timestamp,
+                    }
+                    result_image = self.transform_frame(image, metadata)
+                    self.perf.record("transform", (time.perf_counter() - t0) * 1000)
+
+                    # Encode and emit
+                    t0 = time.perf_counter()
+                    transform_event = {
+                        "event": "transform",
+                        "frame_id": frame_id,
+                        "camera_id": camera_id,
+                        "timestamp": timestamp,
+                    }
+
+                    if output_mode == "base64":
+                        _, buf = cv2.imencode(".jpg", result_image, [cv2.IMWRITE_JPEG_QUALITY, 85])
+                        transform_event["transform_data"] = base64.b64encode(buf).decode("ascii")
+                    else:
+                        out_path = tempfile.mktemp(suffix=".jpg", dir=tempfile.gettempdir())
+                        cv2.imwrite(out_path, result_image, [cv2.IMWRITE_JPEG_QUALITY, 90])
+                        transform_event["transform_path"] = out_path
+
+                    self.perf.record("encode", (time.perf_counter() - t0) * 1000)
+
+                    t0 = time.perf_counter()
+                    _emit(transform_event)
+                    self.perf.record("emit", (time.perf_counter() - t0) * 1000)
+
+                except Exception as e:
+                    _emit({
+                        "event": "error",
+                        "frame_id": frame_id,
+                        "message": f"Transform error: {e}",
+                        "retriable": True,
+                    })
+                    self.perf.error_count += 1
+                    continue
+
+                self.perf.record("total", (time.perf_counter() - t_start) * 1000)
+                self.perf.record_frame()
+
+        self.perf.emit_final()
+
+    # ── Config loading ───────────────────────────────────────────────────
+
+    def _parse_args(self):
+        parser = argparse.ArgumentParser(description=f"{self._tag} Skill")
+        parser.add_argument("--config", type=str, help="Path to config JSON file")
+        parser.add_argument("--device", type=str, default="auto",
+                            choices=["auto", "cpu", "cuda", "mps", "rocm"])
+        self.parse_extra_args(parser)
+        return parser.parse_args()
+
+    def _load_config(self, args) -> dict:
+        import os
+        env_params = os.environ.get("AEGIS_SKILL_PARAMS")
+        if env_params:
+            try:
+                return json.loads(env_params)
+            except json.JSONDecodeError:
+                pass
+        if args.config:
+            config_path = Path(args.config)
+            if config_path.exists():
+                with open(config_path) as f:
+                    return json.load(f)
+        return {"device": args.device}
+
+    @staticmethod
+    def _select_device(pref: str) -> str:
+        if pref != "auto":
+            return pref
+        try:
+            import torch
+            if torch.cuda.is_available():
+                return "cuda"
+            if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+                return "mps"
+        except ImportError:
+            pass
+        return "cpu"

From 772473de91a8afa13c42fc7a1f69329865d6ca8c Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 16:29:36 -0700
Subject: [PATCH 02/14] feat(depth-estimation): refactor to TransformSkillBase
 + privacy-first defaults

Refactors depth-estimation skill to subclass TransformSkillBase, reducing
transform.py from ~160 lines of boilerplate to ~100 lines of pure skill logic.

Key changes:
- Default blend_mode changed to 'depth_only' for privacy anonymization
- Version bumped to 1.1.0, category set to 'privacy'
- SKILL.md documents the TransformSkillBase interface for new skill authors
- Protocol updated: frame_id tracking, config-update command, base64 output
- Adds on_config_update() for live parameter changes from Aegis
---
 .../transformation/depth-estimation/SKILL.md  |  38 ++-
 .../depth-estimation/scripts/transform.py     | 228 ++++++++----------
 2 files changed, 133 insertions(+), 133 deletions(-)

diff --git a/skills/transformation/depth-estimation/SKILL.md b/skills/transformation/depth-estimation/SKILL.md
index eb0f2ea..e837fba 100644
--- a/skills/transformation/depth-estimation/SKILL.md
+++ b/skills/transformation/depth-estimation/SKILL.md
@@ -1,7 +1,8 @@
 ---
 name: depth-estimation
-description: "Real-time depth map estimation using Depth Anything v2"
-version: 1.0.0
+description: "Real-time depth map estimation for privacy transforms using Depth Anything v2"
+version: 1.1.0
+category: privacy
 
 parameters:
   - name: model
@@ -14,8 +15,8 @@ parameters:
   - name: blend_mode
     label: "Display Mode"
     type: select
-    options: ["overlay", "side_by_side", "depth_only"]
-    default: "overlay"
+    options: ["depth_only", "overlay", "side_by_side"]
+    default: "depth_only"
     group: Display
 
   - name: opacity
@@ -46,27 +47,50 @@ capabilities:
     description: "Real-time depth estimation overlay on live feed"
 ---
 
-# Depth Estimation
+# Depth Estimation (Privacy)
 
 Real-time monocular depth estimation using Depth Anything v2. Transforms camera feeds with colorized depth maps — near objects appear warm, far objects appear cool.
 
+When used for **privacy mode**, the `depth_only` blend mode fully anonymizes the scene while preserving spatial layout and activity, enabling security monitoring without revealing identities.
+
 ## What You Get
 
+- **Privacy anonymization** — depth-only mode hides all visual identity
 - **Depth overlays** on live camera feeds
 - **Distance estimation** — approximate distance to detected objects
 - **3D scene understanding** — spatial layout of the scene
 
+## Interface: TransformSkillBase
+
+This skill implements the `TransformSkillBase` interface. Any new privacy skill can be created by subclassing `TransformSkillBase` and implementing two methods:
+
+```python
+from transform_base import TransformSkillBase
+
+class MyPrivacySkill(TransformSkillBase):
+    def load_model(self, config):
+        # Load your model, return {"model": "...", "device": "..."}
+        ...
+
+    def transform_frame(self, image, metadata):
+        # Transform BGR image, return BGR image
+        ...
+```
+
 ## Protocol
 
 ### Aegis → Skill (stdin)
 ```jsonl
-{"event": "frame", "camera_id": "front_door", "frame_path": "/tmp/frame.jpg", "timestamp": "..."}
+{"event": "frame", "frame_id": "cam1_1710001", "camera_id": "front_door", "frame_path": "/tmp/frame.jpg", "timestamp": "..."}
+{"command": "config-update", "config": {"opacity": 0.8, "blend_mode": "overlay"}}
+{"command": "stop"}
 ```
 
 ### Skill → Aegis (stdout)
 ```jsonl
 {"event": "ready", "model": "depth-anything-v2-small", "device": "mps"}
-{"event": "transformed_frame", "camera_id": "front_door", "frame_path": "/tmp/depth_001.jpg", "metadata": {"min_depth": 0.2, "max_depth": 15.0}}
+{"event": "transform", "frame_id": "cam1_1710001", "camera_id": "front_door", "transform_data": "<base64 JPEG>"}
+{"event": "perf_stats", "total_frames": 50, "timings_ms": {"transform": {"avg": 45.2, ...}}}
 ```
 
 ## Setup
diff --git a/skills/transformation/depth-estimation/scripts/transform.py b/skills/transformation/depth-estimation/scripts/transform.py
index 56ccf8a..d45c91e 100644
--- a/skills/transformation/depth-estimation/scripts/transform.py
+++ b/skills/transformation/depth-estimation/scripts/transform.py
@@ -1,56 +1,25 @@
 #!/usr/bin/env python3
 """
-Depth Estimation Skill — Real-time monocular depth maps.
+Depth Estimation Privacy Skill — Monocular depth maps via Depth Anything v2.
 
-Transforms camera frames with Depth Anything v2 colorized depth overlays.
+Implements the TransformSkillBase interface to provide real-time depth map
+overlays on camera feeds. When used as a privacy skill, the depth-only mode
+anonymizes the scene while preserving spatial layout and activity recognition.
+
+Usage:
+  python transform.py --model depth-anything-v2-small --device auto
+  python transform.py --config config.json
 """
 
 import sys
-import json
 import argparse
-import signal
-import tempfile
 from pathlib import Path
 
+# Import the base class from the same directory
+_script_dir = Path(__file__).resolve().parent
+sys.path.insert(0, str(_script_dir))
 
-def parse_args():
-    parser = argparse.ArgumentParser(description="Depth Estimation Skill")
-    parser.add_argument("--config", type=str)
-    parser.add_argument("--model", type=str, default="depth-anything-v2-small")
-    parser.add_argument("--colormap", type=str, default="inferno")
-    parser.add_argument("--blend-mode", type=str, default="overlay")
-    parser.add_argument("--opacity", type=float, default=0.5)
-    parser.add_argument("--device", type=str, default="auto")
-    return parser.parse_args()
-
-
-def load_config(args):
-    if args.config and Path(args.config).exists():
-        with open(args.config) as f:
-            return json.load(f)
-    return {
-        "model": args.model,
-        "colormap": args.colormap,
-        "blend_mode": args.blend_mode,
-        "opacity": args.opacity,
-        "device": args.device,
-    }
-
-
-def select_device(pref):
-    if pref != "auto":
-        return pref
-    try:
-        import torch
-        if torch.cuda.is_available(): return "cuda"
-        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): return "mps"
-    except ImportError:
-        pass
-    return "cpu"
-
-
-def emit(event):
-    print(json.dumps(event), flush=True)
+from transform_base import TransformSkillBase, _log  # noqa: E402
 
 
 COLORMAP_MAP = {
@@ -62,94 +31,101 @@ def emit(event):
 }
 
 
-def main():
-    args = parse_args()
-    config = load_config(args)
-    device = select_device(config.get("device", "auto"))
+class DepthEstimationSkill(TransformSkillBase):
+    """
+    Depth estimation using Depth Anything v2.
+
+    Produces colorized depth maps that can be blended with the original frame
+    (overlay mode), shown side-by-side, or displayed as depth-only anonymized view.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._tag = "DepthEstimation"
+        self.model = None
+        self.colormap_id = 1
+        self.opacity = 0.5
+        self.blend_mode = "depth_only"  # Default for privacy: depth_only anonymizes
+
+    def parse_extra_args(self, parser: argparse.ArgumentParser):
+        parser.add_argument("--model", type=str, default="depth-anything-v2-small",
+                            choices=["depth-anything-v2-small", "depth-anything-v2-base",
+                                     "depth-anything-v2-large", "midas-small"])
+        parser.add_argument("--colormap", type=str, default="inferno",
+                            choices=list(COLORMAP_MAP.keys()))
+        parser.add_argument("--blend-mode", type=str, default="depth_only",
+                            choices=["overlay", "side_by_side", "depth_only"])
+        parser.add_argument("--opacity", type=float, default=0.5)
+
+    def load_model(self, config: dict) -> dict:
+        import torch
 
-    try:
+        model_name = config.get("model", "depth-anything-v2-small")
+        self.colormap_id = COLORMAP_MAP.get(config.get("colormap", "inferno"), 1)
+        self.opacity = config.get("opacity", 0.5)
+        self.blend_mode = config.get("blend_mode", "depth_only")
+
+        _log(f"Loading {model_name} on {self.device}", self._tag)
+
+        # Load model via torch hub
+        hub_name = model_name.replace("-", "_")
+        self.model = torch.hub.load(
+            "LiheYoung/Depth-Anything-V2",
+            hub_name,
+            trust_repo=True,
+        )
+        self.model.to(self.device)
+        self.model.eval()
+
+        _log(f"Model loaded: {model_name} on {self.device}", self._tag)
+
+        return {
+            "model": model_name,
+            "device": self.device,
+            "blend_mode": self.blend_mode,
+            "colormap": config.get("colormap", "inferno"),
+        }
+
+    def transform_frame(self, image, metadata: dict):
         import torch
         import cv2
         import numpy as np
 
-        model_name = config.get("model", "depth-anything-v2-small")
-        model = torch.hub.load("LiheYoung/Depth-Anything-V2", model_name.replace("-", "_"), trust_repo=True)
-        model.to(device)
-        model.eval()
-
-        emit({"event": "ready", "model": model_name, "device": device})
-    except Exception as e:
-        emit({"event": "error", "message": f"Failed to load model: {e}", "retriable": False})
-        sys.exit(1)
-
-    running = True
-    def handle_signal(s, f):
-        nonlocal running
-        running = False
-    signal.signal(signal.SIGTERM, handle_signal)
-    signal.signal(signal.SIGINT, handle_signal)
-
-    colormap_id = COLORMAP_MAP.get(config.get("colormap", "inferno"), 1)
-    opacity = config.get("opacity", 0.5)
-    blend_mode = config.get("blend_mode", "overlay")
-
-    for line in sys.stdin:
-        if not running:
-            break
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            msg = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-
-        if msg.get("command") == "stop":
-            break
-
-        if msg.get("event") == "frame":
-            frame_path = msg.get("frame_path")
-            if not frame_path or not Path(frame_path).exists():
-                continue
-
-            try:
-                import torch
-                import cv2
-                import numpy as np
-
-                image = cv2.imread(frame_path)
-                rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-
-                with torch.no_grad():
-                    depth = model.infer_image(rgb)
-
-                # Normalize depth to 0-255
-                depth_norm = ((depth - depth.min()) / (depth.max() - depth.min() + 1e-8) * 255).astype(np.uint8)
-                depth_colored = cv2.applyColorMap(depth_norm, colormap_id)
-
-                if blend_mode == "overlay":
-                    output = cv2.addWeighted(image, 1 - opacity, depth_colored, opacity, 0)
-                elif blend_mode == "side_by_side":
-                    output = np.hstack([image, depth_colored])
-                else:  # depth_only
-                    output = depth_colored
-
-                out_path = tempfile.mktemp(suffix=".jpg", dir="/tmp")
-                cv2.imwrite(out_path, output, [cv2.IMWRITE_JPEG_QUALITY, 90])
-
-                emit({
-                    "event": "transformed_frame",
-                    "camera_id": msg.get("camera_id", "unknown"),
-                    "timestamp": msg.get("timestamp", ""),
-                    "frame_path": out_path,
-                    "metadata": {
-                        "min_depth": float(depth.min()),
-                        "max_depth": float(depth.max()),
-                    },
-                })
-            except Exception as e:
-                emit({"event": "error", "message": f"Depth error: {e}", "retriable": True})
+        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        with torch.no_grad():
+            depth = self.model.infer_image(rgb)
+
+        # Normalize depth to 0-255
+        d_min, d_max = depth.min(), depth.max()
+        depth_norm = ((depth - d_min) / (d_max - d_min + 1e-8) * 255).astype(np.uint8)
+        depth_colored = cv2.applyColorMap(depth_norm, self.colormap_id)
+
+        if self.blend_mode == "overlay":
+            output = cv2.addWeighted(image, 1 - self.opacity, depth_colored, self.opacity, 0)
+        elif self.blend_mode == "side_by_side":
+            output = np.hstack([image, depth_colored])
+        else:  # depth_only — full anonymization
+            output = depth_colored
+
+        return output
+
+    def on_config_update(self, config: dict):
+        """Handle live config updates from Aegis."""
+        if "colormap" in config:
+            self.colormap_id = COLORMAP_MAP.get(config["colormap"], self.colormap_id)
+            _log(f"Colormap updated: {config['colormap']}", self._tag)
+        if "opacity" in config:
+            self.opacity = float(config["opacity"])
+            _log(f"Opacity updated: {self.opacity}", self._tag)
+        if "blend_mode" in config:
+            self.blend_mode = config["blend_mode"]
+            _log(f"Blend mode updated: {self.blend_mode}", self._tag)
+
+    def get_output_mode(self) -> str:
+        """Use base64 for privacy transforms — avoids temp file cleanup issues."""
+        return "base64"
 
 
 if __name__ == "__main__":
-    main()
+    DepthEstimationSkill().run()

From 2cfba37b3e4c2ddbd28ca30913f24df7d69b1828 Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 16:29:43 -0700
Subject: [PATCH 03/14] feat(registry): add privacy category and
 depth-estimation skill entry

Adds 'privacy' as a new skill category in skills.json for transforms that
anonymize camera feeds (depth maps, blur, blind mode). Registers the
depth-estimation skill (v1.1.0) with privacy-specific capabilities
(live_transform, privacy_overlay) and UI unlock flags (blind_mode).
---
 skills.json | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/skills.json b/skills.json
index 5fde718..a35f483 100644
--- a/skills.json
+++ b/skills.json
@@ -7,6 +7,7 @@
     "detection": "Object detection, person recognition, visual grounding",
     "analysis": "VLM scene understanding, interactive segmentation",
     "transformation": "Depth estimation, style transfer, video effects",
+    "privacy": "Privacy transforms — depth maps, blur, anonymization for blind mode",
     "annotation": "Dataset labeling, COCO export, training data",
     "camera-providers": "Camera brand integrations — clip feed, live stream",
     "streaming": "RTSP/WebRTC live view via go2rtc",
@@ -130,6 +131,40 @@
         "monitoring",
         "recording"
       ]
+    },
+    {
+      "id": "depth-estimation",
+      "name": "Depth Estimation (Privacy)",
+      "description": "Privacy-first depth map transforms — anonymize camera feeds with Depth Anything v2 while preserving spatial awareness.",
+      "version": "1.1.0",
+      "category": "privacy",
+      "path": "skills/transformation/depth-estimation",
+      "tags": [
+        "privacy",
+        "depth",
+        "transform",
+        "anonymization",
+        "blind-mode"
+      ],
+      "platforms": [
+        "linux-x64",
+        "linux-arm64",
+        "darwin-arm64",
+        "darwin-x64",
+        "win-x64"
+      ],
+      "requirements": {
+        "python": ">=3.9",
+        "ram_gb": 2
+      },
+      "capabilities": [
+        "live_transform",
+        "privacy_overlay"
+      ],
+      "ui_unlocks": [
+        "privacy_overlay",
+        "blind_mode"
+      ]
     }
   ]
 }
\ No newline at end of file

From a7bb89572acf8665c04b87d79cce34001a5ea8f8 Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 17:49:28 -0700
Subject: [PATCH 04/14] feat(depth-estimation): wire HardwareEnv for
 multi-backend GPU support + pin torch/torchvision versions

- Replace basic _select_device() with full HardwareEnv.detect() from skills/lib/env_config.py
- Supports: NVIDIA CUDA, AMD ROCm, Apple MPS/Neural Engine, Intel OpenVINO/NPU, CPU
- Pin torch~=2.7.0 and torchvision~=0.22.0 to prevent pip resolver conflicts
- Move torch/torchvision above depth-anything-v2 in requirements.txt for install order
- Expose self.env (HardwareEnv) to subclasses for GPU name, memory, backend info
- Include backend and gpu_name in ready event for Aegis UI display
---
 .../depth-estimation/requirements.txt         |  8 +-
 .../scripts/transform_base.py                 | 90 +++++++++++++++----
 2 files changed, 80 insertions(+), 18 deletions(-)

diff --git a/skills/transformation/depth-estimation/requirements.txt b/skills/transformation/depth-estimation/requirements.txt
index 9bec188..6ea8915 100644
--- a/skills/transformation/depth-estimation/requirements.txt
+++ b/skills/transformation/depth-estimation/requirements.txt
@@ -1,7 +1,9 @@
-# Depth Estimation
+# Depth Estimation — Privacy Transform Skill
+# NOTE: torch and torchvision MUST be version-paired.
+# Loose ranges cause pip to flip between incompatible versions.
+torch~=2.7.0
+torchvision~=0.22.0
 depth-anything-v2>=0.1.0
-torch>=2.0.0
-torchvision>=0.15.0
 numpy>=1.24.0
 opencv-python-headless>=4.8.0
 Pillow>=10.0.0
diff --git a/skills/transformation/depth-estimation/scripts/transform_base.py b/skills/transformation/depth-estimation/scripts/transform_base.py
index 98d9013..48f251a 100644
--- a/skills/transformation/depth-estimation/scripts/transform_base.py
+++ b/skills/transformation/depth-estimation/scripts/transform_base.py
@@ -46,6 +46,7 @@ def transform_frame(self, image, metadata):
 
 import sys
 import json
+import os
 import signal
 import time
 import argparse
@@ -55,6 +56,52 @@ def transform_frame(self, image, metadata):
 from pathlib import Path
 
 
+# ═══════════════════════════════════════════════════════════════════════════════
+# Hardware detection — reuse env_config.py from skills/lib/
+# ═══════════════════════════════════════════════════════════════════════════════
+
+_script_dir = Path(__file__).resolve().parent
+_lib_candidates = [
+    _script_dir,                                          # bundled alongside script
+    _script_dir.parent.parent.parent.parent / "lib",      # repo: skills/lib/
+    _script_dir.parent / "lib",                           # skill-level lib/
+]
+_env_config_loaded = False
+for _lib_path in _lib_candidates:
+    if (_lib_path / "env_config.py").exists():
+        sys.path.insert(0, str(_lib_path))
+        from env_config import HardwareEnv  # noqa: E402
+        _env_config_loaded = True
+        break
+
+if not _env_config_loaded:
+    # Minimal fallback — auto-detect via PyTorch only
+    class HardwareEnv:  # type: ignore[no-redef]
+        def __init__(self):
+            self.backend = "cpu"
+            self.device = "cpu"
+            self.gpu_name = ""
+            self.gpu_memory_mb = 0
+            self.export_format = "none"
+            self.framework_ok = False
+
+        @staticmethod
+        def detect():
+            env = HardwareEnv()
+            try:
+                import torch
+                if torch.cuda.is_available():
+                    env.backend = "cuda"; env.device = "cuda"
+                elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+                    env.backend = "mps"; env.device = "mps"
+            except ImportError:
+                pass
+            return env
+
+        def to_dict(self):
+            return {"backend": self.backend, "device": self.device}
+
+
 # ═══════════════════════════════════════════════════════════════════════════════
 # Performance Tracker
 # ═══════════════════════════════════════════════════════════════════════════════
@@ -152,6 +199,7 @@ class TransformSkillBase(ABC):
 
     def __init__(self):
         self.device = "cpu"
+        self.env = None  # HardwareEnv — populated in run()
         self.config = {}
         self.perf = PerfTracker()
         self._running = True
@@ -206,11 +254,17 @@ def run(self):
         """Parse args, load model, enter stdin loop."""
         args = self._parse_args()
         self.config = self._load_config(args)
-        self.device = self._select_device(self.config.get("device", "auto"))
+
+        # Hardware detection — full multi-backend probe
+        device_pref = self.config.get("device", "auto")
+        self.env = self._detect_hardware(device_pref)
+        self.device = self.env.device
 
         # Load model
         try:
-            _emit({"event": "progress", "stage": "init", "message": "Loading model..."})
+            gpu_msg = f"{self.env.gpu_name} ({self.env.backend})" if self.env.gpu_name else self.env.backend
+            _emit({"event": "progress", "stage": "init", "message": f"Hardware: {gpu_msg}"})
+            _emit({"event": "progress", "stage": "model", "message": "Loading model..."})
             t0 = time.perf_counter()
             ready_fields = self.load_model(self.config)
             self.perf.model_load_ms = (time.perf_counter() - t0) * 1000
@@ -218,6 +272,8 @@ def run(self):
             ready_event = {
                 "event": "ready",
                 "model_load_ms": round(self.perf.model_load_ms, 1),
+                "backend": self.env.backend,
+                "gpu": self.env.gpu_name,
                 **ready_fields,
             }
             _emit(ready_event)
@@ -348,7 +404,6 @@ def _parse_args(self):
         return parser.parse_args()
 
     def _load_config(self, args) -> dict:
-        import os
         env_params = os.environ.get("AEGIS_SKILL_PARAMS")
         if env_params:
             try:
@@ -363,15 +418,20 @@ def _load_config(self, args) -> dict:
         return {"device": args.device}
 
     @staticmethod
-    def _select_device(pref: str) -> str:
-        if pref != "auto":
-            return pref
-        try:
-            import torch
-            if torch.cuda.is_available():
-                return "cuda"
-            if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-                return "mps"
-        except ImportError:
-            pass
-        return "cpu"
+    def _detect_hardware(device_pref: str = "auto") -> HardwareEnv:
+        """
+        Full hardware detection using shared env_config.py.
+
+        Supports: NVIDIA CUDA, AMD ROCm, Apple MPS/Neural Engine,
+                  Intel OpenVINO/NPU, CPU fallback.
+
+        Returns a HardwareEnv with .backend, .device, .gpu_name, etc.
+        """
+        env = HardwareEnv.detect()
+
+        # Honour explicit device preference
+        if device_pref != "auto":
+            env.device = device_pref
+            env.backend = device_pref
+
+        return env

From 38da2503408dcb66c0af42461ba349296b3312e7 Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 18:08:44 -0700
Subject: [PATCH 05/14] fix(depth-estimation): replace dead torch.hub.load with
 HF hub + pip package

torch.hub.load('LiheYoung/Depth-Anything-V2', ...) returns 404.
Switch to direct DepthAnythingV2 class from depth_anything_v2 pip package
with weights downloaded via huggingface_hub.hf_hub_download (cached).

Tested: model loads successfully on MPS (Apple Silicon).
---
 .../depth-estimation/scripts/transform.py     | 44 ++++++++++++++++---
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/skills/transformation/depth-estimation/scripts/transform.py b/skills/transformation/depth-estimation/scripts/transform.py
index d45c91e..82b427f 100644
--- a/skills/transformation/depth-estimation/scripts/transform.py
+++ b/skills/transformation/depth-estimation/scripts/transform.py
@@ -59,6 +59,8 @@ def parse_extra_args(self, parser: argparse.ArgumentParser):
 
     def load_model(self, config: dict) -> dict:
         import torch
+        from depth_anything_v2.dpt import DepthAnythingV2
+        from huggingface_hub import hf_hub_download
 
         model_name = config.get("model", "depth-anything-v2-small")
         self.colormap_id = COLORMAP_MAP.get(config.get("colormap", "inferno"), 1)
@@ -67,13 +69,43 @@ def load_model(self, config: dict) -> dict:
 
         _log(f"Loading {model_name} on {self.device}", self._tag)
 
-        # Load model via torch hub
-        hub_name = model_name.replace("-", "_")
-        self.model = torch.hub.load(
-            "LiheYoung/Depth-Anything-V2",
-            hub_name,
-            trust_repo=True,
+        # Model configs: encoder name, features, HF repo, weight filename
+        MODEL_CONFIGS = {
+            "depth-anything-v2-small": {
+                "encoder": "vits", "features": 64,
+                "out_channels": [48, 96, 192, 384],
+                "repo": "depth-anything/Depth-Anything-V2-Small",
+                "filename": "depth_anything_v2_vits.pth",
+            },
+            "depth-anything-v2-base": {
+                "encoder": "vitb", "features": 128,
+                "out_channels": [96, 192, 384, 768],
+                "repo": "depth-anything/Depth-Anything-V2-Base",
+                "filename": "depth_anything_v2_vitb.pth",
+            },
+            "depth-anything-v2-large": {
+                "encoder": "vitl", "features": 256,
+                "out_channels": [256, 512, 1024, 1024],
+                "repo": "depth-anything/Depth-Anything-V2-Large",
+                "filename": "depth_anything_v2_vitl.pth",
+            },
+        }
+
+        cfg = MODEL_CONFIGS.get(model_name)
+        if not cfg:
+            raise ValueError(f"Unknown model: {model_name}. Choose from: {list(MODEL_CONFIGS.keys())}")
+
+        # Download weights from HuggingFace Hub (cached after first download)
+        _log(f"Downloading weights from HF: {cfg['repo']}", self._tag)
+        weights_path = hf_hub_download(cfg["repo"], cfg["filename"])
+
+        # Build model from pip package
+        self.model = DepthAnythingV2(
+            encoder=cfg["encoder"],
+            features=cfg["features"],
+            out_channels=cfg["out_channels"],
         )
+        self.model.load_state_dict(torch.load(weights_path, map_location=self.device, weights_only=True))
         self.model.to(self.device)
         self.model.eval()
 

From d5849a5e018f2a5650fae198923baf56a0348b8b Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 18:19:49 -0700
Subject: [PATCH 06/14] refactor(benchmark): remove fixed word/number count
 constraints from LLM tests

- Topic Classification: remove '3-6 words' / 'short phrase' from prompts,
  now just 'Respond with ONLY the topic title'
- Remove word count assertion (wc <= 8) and upper char bounds
- Chat & JSON: remove upper-bound char limits (<2000, <500, <3000)
- Narrative Synthesis: remove <4000 char limit
- Contradictory Instructions: 'under 50 words' -> 'succinct'
- Context Preprocessing: 'brief 1-line summary' -> 'summary'

LLMs perform poorly on fixed word count targets. Validation
assertions for minimum response length and JSON structure preserved.
---
 .../scripts/run-benchmark.cjs                 | 28 +++++++++----------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
index 74afced..c0f32fa 100644
--- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
@@ -446,7 +446,7 @@ ${userMessage}
 
 ## Response Format
 Respond with ONLY a valid JSON object, no other text:
-{"keep": [<actual index numbers from the list above>], "summary": "<brief 1-line summary of what was dropped>"}
+{"keep": [<actual index numbers from the list above>], "summary": "<summary of what was dropped>"}
 
 Example: if keeping messages at indices 0, 18, 22 → {"keep": [0, 18, 22], "summary": "Removed 4 duplicate 'what happened today' questions"}
 If nothing should be dropped, keep ALL indices and set summary to "".`;
@@ -566,16 +566,14 @@ suite('📋 Context Preprocessing', async () => {
 // ═══════════════════════════════════════════════════════════════════════════════
 
 suite('🏷️ Topic Classification', async () => {
-    await test('First turn → topic title (3-6 words)', async () => {
+    await test('First turn → topic title', async () => {
         const r = await llmCall([{
-            role: 'user', content: `Classify this exchange's topic in 3-6 words. Respond with ONLY the topic title.
+            role: 'user', content: `Classify this exchange's topic. Respond with ONLY the topic title.
 User: "What has happened today on the cameras?"
 Assistant: "Today, your cameras captured motion events including a person at the front door at 9:40 AM..."` }]);
         const cleaned = stripThink(r.content).split('\n').filter(l => l.trim()).pop().replace(/^["'*]+|["'*]+$/g, '').replace(/^(new\s+)?topic\s*:\s*/i, '').trim();
         assert(cleaned.length > 0, 'Topic empty');
-        const wc = cleaned.split(/\s+/).length;
-        assert(wc <= 8, `Too verbose: ${wc} words`);
-        return `"${cleaned}" (${wc} words)`;
+        return `"${cleaned}"`;
     });
 
     await test('Same topic → SAME', async () => {
@@ -585,7 +583,7 @@ User: "Show me the clip from 9:40 AM"
 Assistant: "Here's the clip from 9:40 AM showing a person at the front door..."
 Current topic: "Camera Events Review"
 If the topic hasn't changed, respond: SAME
-Otherwise respond with ONLY the new topic title (3-6 words).` }]);
+Otherwise respond with ONLY the new topic title.` }]);
         const cleaned = stripThink(r.content).split('\n').filter(l => l.trim()).pop().replace(/^["'*]+|["'*]+$/g, '');
         assert(cleaned.toUpperCase() === 'SAME', `Expected SAME, got "${cleaned}"`);
         return 'SAME ✓';
@@ -598,7 +596,7 @@ User: "What's the system status? How much storage am I using?"
 Assistant: "System healthy. Storage: 45GB of 500GB, VLM running on GPU."
 Current topic: "Camera Events Review"
 If the topic hasn't changed, respond: SAME
-Otherwise respond with ONLY the new topic title (3-6 words).` }]);
+Otherwise respond with ONLY the new topic title.` }]);
         const cleaned = stripThink(r.content).split('\n').filter(l => l.trim()).pop().replace(/^["'*]+|["'*]+$/g, '').replace(/^(new\s+)?topic\s*:\s*/i, '').trim();
         assert(cleaned.toUpperCase() !== 'SAME', 'Expected new topic');
         return `"${cleaned}"`;
@@ -606,11 +604,11 @@ Otherwise respond with ONLY the new topic title (3-6 words).` }]);
 
     await test('Greeting → valid topic', async () => {
         const r = await llmCall([{
-            role: 'user', content: `Classify this exchange's topic in 3-6 words. Respond with ONLY the topic title.
+            role: 'user', content: `Classify this exchange's topic. Respond with ONLY the topic title.
 User: "Hi, good morning!"
 Assistant: "Good morning! How can I help you with your home security today?"` }]);
         const cleaned = stripThink(r.content).split('\n').filter(l => l.trim()).pop().replace(/^["'*]+|["'*]+$/g, '').trim();
-        assert(cleaned.length > 0 && cleaned.length < 50, `Bad: "${cleaned}"`);
+        assert(cleaned.length > 0, `Bad: empty topic`);
         return `"${cleaned}"`;
     });
 });
@@ -818,7 +816,7 @@ suite('💬 Chat & JSON Compliance', async () => {
             { role: 'user', content: 'What can you do?' },
         ]);
         const c = stripThink(r.content);
-        assert(c.length > 20 && c.length < 2000, `Length ${c.length}`);
+        assert(c.length > 20, `Response too short: ${c.length} chars`);
         return `${c.length} chars`;
     });
 
@@ -827,7 +825,7 @@ suite('💬 Chat & JSON Compliance', async () => {
             { role: 'system', content: 'You are Aegis. When you have nothing to say, respond ONLY: NO_REPLY' },
             { role: 'user', content: '[Tool Context] video_search returned 3 clips' },
         ]);
-        assert(stripThink(r.content).length < 500, 'Response too long for tool context');
+        // No upper-bound length check — LLMs may be verbose
         return `"${stripThink(r.content).slice(0, 40)}"`;
     });
 
@@ -907,13 +905,13 @@ suite('💬 Chat & JSON Compliance', async () => {
 
     await test('Contradictory instructions → balanced response', async () => {
         const r = await llmCall([
-            { role: 'system', content: 'You are Aegis. Keep all responses under 50 words.' },
+            { role: 'system', content: 'You are Aegis. Keep all responses succinct.' },
             { role: 'user', content: 'Give me a very detailed, comprehensive explanation of how the security classification system works with all four levels and examples of each.' },
         ]);
         const c = stripThink(r.content);
         // Model should produce something reasonable — not crash or refuse
         assert(c.length > 30, 'Response too short');
-        assert(c.length < 3000, 'Response unreasonably long');
+        // No upper-bound length check — LLMs may produce varying lengths
         return `${c.split(/\s+/).length} words, ${c.length} chars`;
     });
 
@@ -1035,7 +1033,7 @@ suite('📝 Narrative Synthesis', async () => {
         const c = stripThink(r.content);
         // Should be concise — not just repeat all 22 events
         assert(c.length > 100, `Response too short: ${c.length} chars`);
-        assert(c.length < 4000, `Response too long (raw dump?): ${c.length} chars`);
+        // No upper-bound length check — narrative length varies by model
         // Should mention key categories
         const lower = c.toLowerCase();
         assert(lower.includes('deliver') || lower.includes('package'),

From b5d5babb95f72f2114608ccf518d10d27419198f Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 18:37:11 -0700
Subject: [PATCH 07/14] fix(depth-estimation): add huggingface_hub as explicit
 dependency

Was only a transitive dep via gradio/depth-anything-v2, getting dropped
by pip's resolver. Now explicitly required for hf_hub_download.
---
 skills/transformation/depth-estimation/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/skills/transformation/depth-estimation/requirements.txt b/skills/transformation/depth-estimation/requirements.txt
index 6ea8915..bf03902 100644
--- a/skills/transformation/depth-estimation/requirements.txt
+++ b/skills/transformation/depth-estimation/requirements.txt
@@ -4,6 +4,7 @@
 torch~=2.7.0
 torchvision~=0.22.0
 depth-anything-v2>=0.1.0
+huggingface_hub>=0.20.0
 numpy>=1.24.0
 opencv-python-headless>=4.8.0
 Pillow>=10.0.0

From 1f32a9b7cf2f92855acad00fa978233a380d77a0 Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 18:41:24 -0700
Subject: [PATCH 08/14] docs(depth-estimation): add README with privacy focus,
 hardware support, model table

---
 .../transformation/depth-estimation/README.md | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 skills/transformation/depth-estimation/README.md

diff --git a/skills/transformation/depth-estimation/README.md b/skills/transformation/depth-estimation/README.md
new file mode 100644
index 0000000..099e75e
--- /dev/null
+++ b/skills/transformation/depth-estimation/README.md
@@ -0,0 +1,77 @@
+# Depth Estimation — Privacy Transform
+
+Transform camera feeds into **colorized depth maps** using [Depth Anything v2](https://github.com/DepthAnything/Depth-Anything-V2), providing real-time privacy protection for security monitoring.
+
+In **privacy mode** (`depth_only`), the scene is fully anonymized — no faces, no clothing, no identifying features — while preserving spatial layout and activity patterns for security awareness.
+
+![Privacy Transform Flow](https://img.shields.io/badge/category-privacy-blue)
+![Depth Anything v2](https://img.shields.io/badge/model-Depth%20Anything%20v2-green)
+
+## How It Works
+
+```
+Camera Frame → Depth Anything v2 → Colorized Depth Map → Aegis Overlay
+   (BGR)         (monocular)         (warm=near, cool=far)    (0.5 FPS)
+```
+
+The depth model converts each frame into a distance map where **warm colors** (red/orange) indicate nearby objects and **cool colors** (blue/purple) indicate distant ones. This preserves enough spatial information to understand activity (someone approaching, car in driveway, etc.) without revealing identity.
+
+## Hardware Support
+
+Auto-detected via `HardwareEnv` from `skills/lib/env_config.py`:
+
+| Platform | Backend | Notes |
+|----------|---------|-------|
+| **NVIDIA** | CUDA | FP16 on GPU |
+| **AMD** | ROCm | PyTorch HIP |
+| **Apple Silicon** | MPS | Unified memory, leaves Neural Engine free |
+| **Intel** | OpenVINO | CPU + NPU support |
+| **CPU** | PyTorch | Fallback, slower |
+
+## Models
+
+| Model | Size | Speed | Quality |
+|-------|------|-------|---------|
+| `depth-anything-v2-small` | 25MB | Fast | Good (default) |
+| `depth-anything-v2-base` | 98MB | Medium | Better |
+| `depth-anything-v2-large` | 335MB | Slow | Best |
+
+Weights are downloaded from HuggingFace Hub on first run and cached locally.
+
+## Display Modes
+
+- **`depth_only`** (default) — Full anonymization. Only the depth map is shown.
+- **`overlay`** — Depth map blended on top of the original feed (adjustable opacity).
+- **`side_by_side`** — Original and depth map shown next to each other.
+
+## Setup
+
+```bash
+python3 -m venv .venv && source .venv/bin/activate
+pip install -r requirements.txt
+```
+
+## Integration with Aegis
+
+This skill communicates with Aegis via **JSONL over stdin/stdout**. Aegis sends frame events, the skill returns transformed frames (base64 JPEG). See [SKILL.md](SKILL.md) for the full protocol specification and the `TransformSkillBase` interface for building new privacy skills.
+
+## Creating New Privacy Skills
+
+Subclass `TransformSkillBase` and implement two methods:
+
+```python
+from transform_base import TransformSkillBase
+
+class MyPrivacySkill(TransformSkillBase):
+    def load_model(self, config):
+        self.model = load_my_model()
+        return {"model": "my-model", "device": self.device}
+
+    def transform_frame(self, image, metadata):
+        return self.model.anonymize(image)
+
+if __name__ == "__main__":
+    MyPrivacySkill().run()
+```
+
+The base class handles JSONL protocol, performance tracking, hardware detection, rate limiting, and graceful shutdown.

From 4b2bcd272e1fa8caa38c94dda245ce36b8c7629d Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 18:43:23 -0700
Subject: [PATCH 09/14] docs: add Privacy section to main README, update skill
 catalog status
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Change depth-estimation category from Transformation to Privacy
- Mark depth-estimation as ✅ Ready (was 📐 Planned)
- Add dedicated '🔒 Privacy — Depth Map Anonymization' section
- Link to TransformSkillBase for building new privacy skills
---
 README.md | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index fb4ab18..be80e26 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ Each skill is a self-contained module with its own model, parameters, and [commu
 | **Detection** | [`yolo-detection-2026`](skills/detection/yolo-detection-2026/) | Real-time 80+ class detection — auto-accelerated via TensorRT / CoreML / OpenVINO / ONNX | ✅|
 | **Analysis** | [`home-security-benchmark`](skills/analysis/home-security-benchmark/) | [143-test evaluation suite](#-homesec-bench--how-secure-is-your-local-ai) for LLM & VLM security performance | ✅ |
 | | [`sam2-segmentation`](skills/analysis/sam2-segmentation/) | Click-to-segment with pixel-perfect masks | 📐 |
-| **Transformation** | [`depth-estimation`](skills/transformation/depth-estimation/) | Monocular depth maps with Depth Anything v2 | 📐 |
+| **Privacy** | [`depth-estimation`](skills/transformation/depth-estimation/) | [Real-time depth-map privacy transform](#-privacy--depth-map-anonymization) — anonymize camera feeds while preserving activity | ✅ |
 | **Annotation** | [`dataset-annotation`](skills/annotation/dataset-annotation/) | AI-assisted labeling → COCO export | 📐 |
 | **Camera Providers** | [`eufy`](skills/camera-providers/eufy/) · [`reolink`](skills/camera-providers/reolink/) · [`tapo`](skills/camera-providers/tapo/) | Direct camera integrations via RTSP | 📐 |
 | **Streaming** | [`go2rtc-cameras`](skills/streaming/go2rtc-cameras/) | RTSP → WebRTC live view | 📐 |
@@ -143,6 +143,24 @@ Camera → Frame Governor → detect.py (JSONL) → Aegis IPC → Live Overlay
 
 📖 [Full Skill Documentation →](skills/detection/yolo-detection-2026/SKILL.md)
 
+## 🔒 Privacy — Depth Map Anonymization
+
+Watch your cameras **without seeing faces, clothing, or identities**. The [depth-estimation skill](skills/transformation/depth-estimation/) transforms live feeds into colorized depth maps using [Depth Anything v2](https://github.com/DepthAnything/Depth-Anything-V2) — warm colors for nearby objects, cool colors for distant ones.
+
+```
+Camera Frame ──→ Depth Anything v2 ──→ Colorized Depth Map ──→ Aegis Overlay
+   (live)          (0.5 FPS)           warm=near, cool=far      (privacy on)
+```
+
+- 🛡️ **Full anonymization** — `depth_only` mode hides all visual identity while preserving spatial activity
+- 🎨 **Overlay mode** — blend depth on top of original feed with adjustable opacity
+- ⚡ **Rate-limited** — 0.5 FPS frontend capture + backend scheduler keeps GPU load minimal
+- 🧩 **Extensible** — new privacy skills (blur, pixelation, silhouette) can subclass [`TransformSkillBase`](skills/transformation/depth-estimation/scripts/transform_base.py)
+
+Runs on the same [hardware acceleration stack](#hardware-acceleration) as YOLO detection — CUDA, MPS, ROCm, OpenVINO, or CPU.
+
+📖 [Full Skill Documentation →](skills/transformation/depth-estimation/SKILL.md) · 📖 [README →](skills/transformation/depth-estimation/README.md)
+
 ## 📊 HomeSec-Bench — How Secure Is Your Local AI?
 
 **HomeSec-Bench** is a 143-test security benchmark that measures how well your local AI performs as a security guard. It tests what matters: Can it detect a person in fog? Classify a break-in vs. a delivery? Resist prompt injection? Route alerts correctly at 3 AM?

From 79eac4b25543000b4135aedfaee6943837f20a77 Mon Sep 17 00:00:00 2001
From: Interstellar Apex <hengyi.zhang.composing@gmail.com>
Date: Sat, 14 Mar 2026 18:50:44 -0700
Subject: [PATCH 10/14] fix(depth-estimation): use --ignore-requires-python for
 Python 3.11 compat

The depth-anything-v2 PyPI wheel (0.1.0) declares python_requires>=3.12
but is pure Python (py3-none-any) and works on 3.11+. Updated SKILL.md
setup instructions and added a comment in requirements.txt so the
deployment agent uses the correct pip flags.
---
 skills/transformation/depth-estimation/SKILL.md         | 2 +-
 skills/transformation/depth-estimation/requirements.txt | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/skills/transformation/depth-estimation/SKILL.md b/skills/transformation/depth-estimation/SKILL.md
index e837fba..8b1f2c5 100644
--- a/skills/transformation/depth-estimation/SKILL.md
+++ b/skills/transformation/depth-estimation/SKILL.md
@@ -97,5 +97,5 @@ class MyPrivacySkill(TransformSkillBase):
 
 ```bash
 python3 -m venv .venv && source .venv/bin/activate
-pip install -r requirements.txt
+pip install --ignore-requires-python -r requirements.txt
 ```
diff --git a/skills/transformation/depth-estimation/requirements.txt b/skills/transformation/depth-estimation/requirements.txt
index bf03902..f934768 100644
--- a/skills/transformation/depth-estimation/requirements.txt
+++ b/skills/transformation/depth-estimation/requirements.txt
@@ -1,6 +1,10 @@
 # Depth Estimation — Privacy Transform Skill
 # NOTE: torch and torchvision MUST be version-paired.
 # Loose ranges cause pip to flip between incompatible versions.
+#
+# INSTALL WITH: pip install --ignore-requires-python -r requirements.txt
+# The depth-anything-v2 PyPI wheel declares python_requires>=3.12 in its
+# metadata, but is pure Python (py3-none-any) and works on Python 3.11+.
 torch~=2.7.0
 torchvision~=0.22.0
 depth-anything-v2>=0.1.0

From debf56b9f53534af5511a23f6151c13c26cd0b55 Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 19:57:24 -0700
Subject: [PATCH 11/14] feat(depth-estimation): CoreML-first backend on macOS +
 PyTorch fallback

On macOS, loads CoreML .mlpackage from ~/.aegis-ai/models/feature-extraction/
using coremltools (Neural Engine). Auto-downloads from
apple/coreml-depth-anything-v2-small on HuggingFace if not present.

On other platforms, falls back to PyTorch DepthAnythingV2 + hf_hub_download.

Verified: CoreML inference at 65.7ms/frame (~15 FPS) on Apple Silicon.

- requirements.txt: add coremltools>=8.0 (darwin-only platform marker)
- SKILL.md: v1.2.0, hardware backend table, CoreML variant parameter
---
 .../transformation/depth-estimation/SKILL.md  |  32 ++-
 .../depth-estimation/requirements.txt         |  17 +-
 .../depth-estimation/scripts/transform.py     | 246 +++++++++++++++---
 3 files changed, 243 insertions(+), 52 deletions(-)

diff --git a/skills/transformation/depth-estimation/SKILL.md b/skills/transformation/depth-estimation/SKILL.md
index 8b1f2c5..b7ec942 100644
--- a/skills/transformation/depth-estimation/SKILL.md
+++ b/skills/transformation/depth-estimation/SKILL.md
@@ -1,17 +1,24 @@
 ---
 name: depth-estimation
-description: "Real-time depth map estimation for privacy transforms using Depth Anything v2"
-version: 1.1.0
+description: "Real-time depth map privacy transforms using Depth Anything v2 (CoreML + PyTorch)"
+version: 1.2.0
 category: privacy
 
 parameters:
   - name: model
     label: "Depth Model"
     type: select
-    options: ["depth-anything-v2-small", "depth-anything-v2-base", "depth-anything-v2-large", "midas-small"]
+    options: ["depth-anything-v2-small", "depth-anything-v2-base", "depth-anything-v2-large"]
     default: "depth-anything-v2-small"
     group: Model
 
+  - name: variant
+    label: "CoreML Variant (macOS)"
+    type: select
+    options: ["DepthAnythingV2SmallF16", "DepthAnythingV2SmallF16INT8", "DepthAnythingV2SmallF32"]
+    default: "DepthAnythingV2SmallF16"
+    group: Model
+
   - name: blend_mode
     label: "Display Mode"
     type: select
@@ -30,7 +37,7 @@ parameters:
   - name: colormap
     label: "Depth Colormap"
     type: select
-    options: ["inferno", "viridis", "plasma", "magma", "jet"]
+    options: ["inferno", "viridis", "plasma", "magma", "jet", "turbo", "hot", "cool"]
     default: "inferno"
     group: Display
 
@@ -53,12 +60,21 @@ Real-time monocular depth estimation using Depth Anything v2. Transforms camera
 
 When used for **privacy mode**, the `depth_only` blend mode fully anonymizes the scene while preserving spatial layout and activity, enabling security monitoring without revealing identities.
 
+## Hardware Backends
+
+| Platform | Backend | Runtime | Model |
+|----------|---------|---------|-------|
+| **macOS** | CoreML | Apple Neural Engine | `apple/coreml-depth-anything-v2-small` (.mlpackage) |
+| Linux/Windows | PyTorch | CUDA / CPU | `depth-anything/Depth-Anything-V2-Small` (.pth) |
+
+On macOS, CoreML runs on the Neural Engine, leaving the GPU free for other tasks. The model is auto-downloaded from HuggingFace and stored at `~/.aegis-ai/models/feature-extraction/`.
+
 ## What You Get
 
 - **Privacy anonymization** — depth-only mode hides all visual identity
 - **Depth overlays** on live camera feeds
-- **Distance estimation** — approximate distance to detected objects
 - **3D scene understanding** — spatial layout of the scene
+- **CoreML acceleration** — Neural Engine on Apple Silicon (3-5x faster than MPS)
 
 ## Interface: TransformSkillBase
 
@@ -88,14 +104,14 @@ class MyPrivacySkill(TransformSkillBase):
 
 ### Skill → Aegis (stdout)
 ```jsonl
-{"event": "ready", "model": "depth-anything-v2-small", "device": "mps"}
+{"event": "ready", "model": "coreml-DepthAnythingV2SmallF16", "device": "neural_engine", "backend": "coreml"}
 {"event": "transform", "frame_id": "cam1_1710001", "camera_id": "front_door", "transform_data": "<base64 JPEG>"}
-{"event": "perf_stats", "total_frames": 50, "timings_ms": {"transform": {"avg": 45.2, ...}}}
+{"event": "perf_stats", "total_frames": 50, "timings_ms": {"transform": {"avg": 12.5, ...}}}
 ```
 
 ## Setup
 
 ```bash
 python3 -m venv .venv && source .venv/bin/activate
-pip install --ignore-requires-python -r requirements.txt
+pip install -r requirements.txt
 ```
diff --git a/skills/transformation/depth-estimation/requirements.txt b/skills/transformation/depth-estimation/requirements.txt
index f934768..2717a00 100644
--- a/skills/transformation/depth-estimation/requirements.txt
+++ b/skills/transformation/depth-estimation/requirements.txt
@@ -1,13 +1,20 @@
 # Depth Estimation — Privacy Transform Skill
-# NOTE: torch and torchvision MUST be version-paired.
-# Loose ranges cause pip to flip between incompatible versions.
+# CoreML-first on macOS (Neural Engine), PyTorch fallback on other platforms.
 #
-# INSTALL WITH: pip install --ignore-requires-python -r requirements.txt
-# The depth-anything-v2 PyPI wheel declares python_requires>=3.12 in its
-# metadata, but is pure Python (py3-none-any) and works on Python 3.11+.
+# macOS: coremltools loads .mlpackage models — fast, leaves GPU free.
+# Other: PyTorch + depth-anything-v2 pip package + HF weights.
+# Common: opencv, numpy, pillow, huggingface_hub for model download.
+
+# ── CoreML (macOS only) ──────────────────────────────────────────────
+coremltools>=8.0; sys_platform == "darwin"
+
+# ── PyTorch fallback (non-macOS, or if CoreML unavailable) ───────────
+# NOTE: torch and torchvision MUST be version-paired.
 torch~=2.7.0
 torchvision~=0.22.0
 depth-anything-v2>=0.1.0
+
+# ── Common dependencies ─────────────────────────────────────────────
 huggingface_hub>=0.20.0
 numpy>=1.24.0
 opencv-python-headless>=4.8.0
diff --git a/skills/transformation/depth-estimation/scripts/transform.py b/skills/transformation/depth-estimation/scripts/transform.py
index 82b427f..c4013c3 100644
--- a/skills/transformation/depth-estimation/scripts/transform.py
+++ b/skills/transformation/depth-estimation/scripts/transform.py
@@ -2,6 +2,10 @@
 """
 Depth Estimation Privacy Skill — Monocular depth maps via Depth Anything v2.
 
+Backend selection:
+  macOS  → CoreML (.mlpackage via coremltools) — runs on Neural Engine
+  Other  → PyTorch (depth_anything_v2 pip package + HF weights) — runs on CUDA/MPS/CPU
+
 Implements the TransformSkillBase interface to provide real-time depth map
 overlays on camera feeds. When used as a privacy skill, the depth-only mode
 anonymizes the scene while preserving spatial layout and activity recognition.
@@ -12,6 +16,8 @@
 """
 
 import sys
+import os
+import platform
 import argparse
 from pathlib import Path
 
@@ -28,6 +34,62 @@
     "plasma": 13,   # cv2.COLORMAP_PLASMA
     "magma": 12,    # cv2.COLORMAP_MAGMA
     "jet": 2,       # cv2.COLORMAP_JET
+    "turbo": 18,    # cv2.COLORMAP_TURBO
+    "hot": 11,      # cv2.COLORMAP_HOT
+    "cool": 8,      # cv2.COLORMAP_COOL
+}
+
+# CoreML model registry — mirrors apple/coreml-depth-anything-v2-small HF repo
+COREML_VARIANTS = {
+    "DepthAnythingV2SmallF16": {
+        "precision": "float16",
+        "size_mb": 49.8,
+        "description": "Float16 — optimized for Neural Engine",
+    },
+    "DepthAnythingV2SmallF16INT8": {
+        "precision": "float16_int8",
+        "size_mb": 25.0,
+        "description": "Float16 + INT8 quantization — smallest",
+    },
+    "DepthAnythingV2SmallF32": {
+        "precision": "float32",
+        "size_mb": 99.2,
+        "description": "Float32 — highest precision",
+    },
+}
+
+# Default CoreML variant (best balance of speed + quality on Neural Engine)
+DEFAULT_COREML_VARIANT = "DepthAnythingV2SmallF16"
+
+# HuggingFace repo for CoreML models
+COREML_HF_REPO = "apple/coreml-depth-anything-v2-small"
+
+# CoreML input size — MUST match model exactly (multiples of 14 for ViT)
+COREML_INPUT_SIZE = (518, 392)  # width, height
+
+# Where Aegis DepthVisionStudio stores downloaded models
+MODELS_DIR = Path.home() / ".aegis-ai" / "models" / "feature-extraction"
+
+# PyTorch model configs (fallback on non-macOS)
+PYTORCH_CONFIGS = {
+    "depth-anything-v2-small": {
+        "encoder": "vits", "features": 64,
+        "out_channels": [48, 96, 192, 384],
+        "repo": "depth-anything/Depth-Anything-V2-Small",
+        "filename": "depth_anything_v2_vits.pth",
+    },
+    "depth-anything-v2-base": {
+        "encoder": "vitb", "features": 128,
+        "out_channels": [96, 192, 384, 768],
+        "repo": "depth-anything/Depth-Anything-V2-Base",
+        "filename": "depth_anything_v2_vitb.pth",
+    },
+    "depth-anything-v2-large": {
+        "encoder": "vitl", "features": 256,
+        "out_channels": [256, 512, 1024, 1024],
+        "repo": "depth-anything/Depth-Anything-V2-Large",
+        "filename": "depth_anything_v2_vitl.pth",
+    },
 }
 
 
@@ -43,14 +105,18 @@ def __init__(self):
         super().__init__()
         self._tag = "DepthEstimation"
         self.model = None
+        self.backend = None  # "coreml" or "pytorch"
         self.colormap_id = 1
         self.opacity = 0.5
         self.blend_mode = "depth_only"  # Default for privacy: depth_only anonymizes
+        self._coreml_input_size = COREML_INPUT_SIZE
 
     def parse_extra_args(self, parser: argparse.ArgumentParser):
         parser.add_argument("--model", type=str, default="depth-anything-v2-small",
                             choices=["depth-anything-v2-small", "depth-anything-v2-base",
-                                     "depth-anything-v2-large", "midas-small"])
+                                     "depth-anything-v2-large"])
+        parser.add_argument("--variant", type=str, default=DEFAULT_COREML_VARIANT,
+                            help="CoreML variant ID (macOS only)")
         parser.add_argument("--colormap", type=str, default="inferno",
                             choices=list(COLORMAP_MAP.keys()))
         parser.add_argument("--blend-mode", type=str, default="depth_only",
@@ -58,42 +124,91 @@ def parse_extra_args(self, parser: argparse.ArgumentParser):
         parser.add_argument("--opacity", type=float, default=0.5)
 
     def load_model(self, config: dict) -> dict:
-        import torch
-        from depth_anything_v2.dpt import DepthAnythingV2
-        from huggingface_hub import hf_hub_download
-
         model_name = config.get("model", "depth-anything-v2-small")
         self.colormap_id = COLORMAP_MAP.get(config.get("colormap", "inferno"), 1)
         self.opacity = config.get("opacity", 0.5)
         self.blend_mode = config.get("blend_mode", "depth_only")
 
-        _log(f"Loading {model_name} on {self.device}", self._tag)
-
-        # Model configs: encoder name, features, HF repo, weight filename
-        MODEL_CONFIGS = {
-            "depth-anything-v2-small": {
-                "encoder": "vits", "features": 64,
-                "out_channels": [48, 96, 192, 384],
-                "repo": "depth-anything/Depth-Anything-V2-Small",
-                "filename": "depth_anything_v2_vits.pth",
-            },
-            "depth-anything-v2-base": {
-                "encoder": "vitb", "features": 128,
-                "out_channels": [96, 192, 384, 768],
-                "repo": "depth-anything/Depth-Anything-V2-Base",
-                "filename": "depth_anything_v2_vitb.pth",
-            },
-            "depth-anything-v2-large": {
-                "encoder": "vitl", "features": 256,
-                "out_channels": [256, 512, 1024, 1024],
-                "repo": "depth-anything/Depth-Anything-V2-Large",
-                "filename": "depth_anything_v2_vitl.pth",
-            },
+        # Try CoreML first on macOS
+        if platform.system() == "Darwin":
+            try:
+                info = self._load_coreml(config)
+                return info
+            except Exception as e:
+                _log(f"CoreML load failed ({e}), falling back to PyTorch", self._tag)
+
+        # Fallback: PyTorch
+        return self._load_pytorch(model_name, config)
+
+    # ── CoreML backend (macOS) ────────────────────────────────────────
+
+    def _load_coreml(self, config: dict) -> dict:
+        """Load CoreML .mlpackage model — runs on Apple Neural Engine."""
+        import coremltools as ct
+
+        variant_id = config.get("variant", DEFAULT_COREML_VARIANT)
+        model_path = MODELS_DIR / f"{variant_id}.mlpackage"
+
+        # Auto-download from HuggingFace if not present
+        if not model_path.exists():
+            _log(f"CoreML model not found at {model_path}, downloading from HF...", self._tag)
+            self._download_coreml_model(variant_id)
+
+        if not model_path.exists():
+            raise FileNotFoundError(f"CoreML model not found: {model_path}")
+
+        _log(f"Loading CoreML model: {variant_id} (Neural Engine)", self._tag)
+        self.model = ct.models.MLModel(str(model_path), compute_units=ct.ComputeUnit.ALL)
+        self.backend = "coreml"
+
+        _log(f"CoreML model loaded: {variant_id}", self._tag)
+        return {
+            "model": f"coreml-{variant_id}",
+            "device": "neural_engine",
+            "blend_mode": self.blend_mode,
+            "colormap": config.get("colormap", "inferno"),
+            "backend": "coreml",
         }
 
-        cfg = MODEL_CONFIGS.get(model_name)
+    def _download_coreml_model(self, variant_id: str):
+        """Download CoreML .mlpackage from HuggingFace using huggingface_hub."""
+        try:
+            from huggingface_hub import snapshot_download
+
+            MODELS_DIR.mkdir(parents=True, exist_ok=True)
+            mlpackage_name = f"{variant_id}.mlpackage"
+
+            _log(f"Downloading {mlpackage_name} from {COREML_HF_REPO}...", self._tag)
+
+            # Download only the specific variant's .mlpackage directory
+            snapshot_download(
+                COREML_HF_REPO,
+                local_dir=str(MODELS_DIR),
+                allow_patterns=[f"{mlpackage_name}/**"],
+            )
+
+            model_path = MODELS_DIR / mlpackage_name
+            if model_path.exists():
+                _log(f"Downloaded CoreML model: {model_path}", self._tag)
+            else:
+                _log(f"Download completed but model not found at {model_path}", self._tag)
+        except Exception as e:
+            _log(f"CoreML model download failed: {e}", self._tag)
+            raise
+
+    # ── PyTorch backend (fallback) ────────────────────────────────────
+
+    def _load_pytorch(self, model_name: str, config: dict) -> dict:
+        """Load PyTorch model — fallback for non-macOS or when CoreML is unavailable."""
+        import torch
+        from depth_anything_v2.dpt import DepthAnythingV2
+        from huggingface_hub import hf_hub_download
+
+        _log(f"Loading {model_name} on {self.device} (PyTorch)", self._tag)
+
+        cfg = PYTORCH_CONFIGS.get(model_name)
         if not cfg:
-            raise ValueError(f"Unknown model: {model_name}. Choose from: {list(MODEL_CONFIGS.keys())}")
+            raise ValueError(f"Unknown model: {model_name}. Choose from: {list(PYTORCH_CONFIGS.keys())}")
 
         # Download weights from HuggingFace Hub (cached after first download)
         _log(f"Downloading weights from HF: {cfg['repo']}", self._tag)
@@ -108,17 +223,76 @@ def load_model(self, config: dict) -> dict:
         self.model.load_state_dict(torch.load(weights_path, map_location=self.device, weights_only=True))
         self.model.to(self.device)
         self.model.eval()
+        self.backend = "pytorch"
 
-        _log(f"Model loaded: {model_name} on {self.device}", self._tag)
-
+        _log(f"PyTorch model loaded: {model_name} on {self.device}", self._tag)
         return {
             "model": model_name,
             "device": self.device,
             "blend_mode": self.blend_mode,
             "colormap": config.get("colormap", "inferno"),
+            "backend": "pytorch",
         }
 
+    # ── Frame transform ───────────────────────────────────────────────
+
     def transform_frame(self, image, metadata: dict):
+        import cv2
+        import numpy as np
+
+        if self.backend == "coreml":
+            depth_colored = self._infer_coreml(image)
+        else:
+            depth_colored = self._infer_pytorch(image)
+
+        if self.blend_mode == "overlay":
+            output = cv2.addWeighted(image, 1 - self.opacity, depth_colored, self.opacity, 0)
+        elif self.blend_mode == "side_by_side":
+            output = np.hstack([image, depth_colored])
+        else:  # depth_only — full anonymization
+            output = depth_colored
+
+        return output
+
+    def _infer_coreml(self, image):
+        """Run CoreML inference and return colorized depth map (BGR, original size)."""
+        import cv2
+        import numpy as np
+        from PIL import Image
+
+        original_h, original_w = image.shape[:2]
+        input_w, input_h = self._coreml_input_size
+
+        # BGR → RGB → resize to model input → PIL
+        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        resized = cv2.resize(rgb, (input_w, input_h), interpolation=cv2.INTER_LINEAR)
+        pil_image = Image.fromarray(resized, mode="RGB")
+
+        # Inference
+        prediction = self.model.predict({"image": pil_image})
+
+        # Extract depth map (first output key)
+        output_key = list(prediction.keys())[0]
+        depth_map = prediction[output_key]
+
+        # Convert PIL Image to numpy if needed
+        if isinstance(depth_map, Image.Image):
+            depth_map = np.array(depth_map)
+
+        depth_map = np.array(depth_map)
+        if depth_map.ndim > 2:
+            depth_map = np.squeeze(depth_map)
+
+        # Normalize → uint8 → colormap → resize back
+        depth_norm = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min() + 1e-8)
+        depth_uint8 = (depth_norm * 255).astype(np.uint8)
+        depth_colored = cv2.applyColorMap(depth_uint8, self.colormap_id)
+        depth_colored = cv2.resize(depth_colored, (original_w, original_h))
+
+        return depth_colored
+
+    def _infer_pytorch(self, image):
+        """Run PyTorch inference and return colorized depth map (BGR, original size)."""
         import torch
         import cv2
         import numpy as np
@@ -128,19 +302,13 @@ def transform_frame(self, image, metadata: dict):
         with torch.no_grad():
             depth = self.model.infer_image(rgb)
 
-        # Normalize depth to 0-255
         d_min, d_max = depth.min(), depth.max()
         depth_norm = ((depth - d_min) / (d_max - d_min + 1e-8) * 255).astype(np.uint8)
         depth_colored = cv2.applyColorMap(depth_norm, self.colormap_id)
 
-        if self.blend_mode == "overlay":
-            output = cv2.addWeighted(image, 1 - self.opacity, depth_colored, self.opacity, 0)
-        elif self.blend_mode == "side_by_side":
-            output = np.hstack([image, depth_colored])
-        else:  # depth_only — full anonymization
-            output = depth_colored
+        return depth_colored
 
-        return output
+    # ── Config updates ────────────────────────────────────────────────
 
     def on_config_update(self, config: dict):
         """Handle live config updates from Aegis."""

From c5ceab701e98af1fe1ec45ac551de0367118ad0a Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 20:15:06 -0700
Subject: [PATCH 12/14] feat(depth-estimation): add deploy.sh for
 platform-aware install

macOS: installs coremltools + common deps only (fast ~10s),
auto-downloads DepthAnythingV2SmallF16.mlpackage from HF.
Other: full PyTorch stack via requirements.txt.
---
 .../transformation/depth-estimation/deploy.sh | 91 +++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100755 skills/transformation/depth-estimation/deploy.sh

diff --git a/skills/transformation/depth-estimation/deploy.sh b/skills/transformation/depth-estimation/deploy.sh
new file mode 100755
index 0000000..abfb23a
--- /dev/null
+++ b/skills/transformation/depth-estimation/deploy.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+# deploy.sh — Platform-aware dependency install for Depth Estimation
+#
+# macOS:  CoreML only (fast ~10s install, Neural Engine inference)
+# Other:  Full PyTorch stack (torch + torchvision + depth-anything-v2)
+#
+# The Aegis deployment agent calls this instead of raw pip install.
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+VENV_DIR="$SCRIPT_DIR/.venv"
+MODELS_DIR="$HOME/.aegis-ai/models/feature-extraction"
+COREML_VARIANT="DepthAnythingV2SmallF16"
+COREML_HF_REPO="apple/coreml-depth-anything-v2-small"
+
+echo "=== Depth Estimation (Privacy) — Setup ==="
+echo "Platform: $(uname -s) / $(uname -m)"
+
+# ── Create venv ──────────────────────────────────────────────────────
+if [ ! -d "$VENV_DIR" ]; then
+    echo "Creating virtual environment..."
+    python3 -m venv "$VENV_DIR"
+fi
+
+PIP="$VENV_DIR/bin/pip"
+PYTHON="$VENV_DIR/bin/python"
+
+# Upgrade pip
+"$PIP" install --upgrade pip --quiet
+
+# ── Platform detection ───────────────────────────────────────────────
+if [ "$(uname -s)" = "Darwin" ]; then
+    echo ""
+    echo "=== macOS detected — CoreML backend (Neural Engine) ==="
+    echo "Installing CoreML dependencies only (fast)..."
+    "$PIP" install --quiet \
+        "coremltools>=8.0" \
+        "huggingface_hub>=0.20.0" \
+        "numpy>=1.24.0" \
+        "opencv-python-headless>=4.8.0" \
+        "Pillow>=10.0.0" \
+        "matplotlib>=3.7.0"
+
+    echo "✅ CoreML dependencies installed"
+
+    # ── Download CoreML model if not present ─────────────────────────
+    MODEL_PATH="$MODELS_DIR/$COREML_VARIANT.mlpackage"
+    if [ -d "$MODEL_PATH" ]; then
+        echo "✅ CoreML model already present: $MODEL_PATH"
+    else
+        echo "Downloading CoreML model: $COREML_VARIANT from $COREML_HF_REPO..."
+        mkdir -p "$MODELS_DIR"
+        "$PYTHON" -c "
+from huggingface_hub import snapshot_download
+snapshot_download(
+    '$COREML_HF_REPO',
+    local_dir='$MODELS_DIR',
+    allow_patterns=['$COREML_VARIANT.mlpackage/**'],
+)
+print('✅ CoreML model downloaded')
+"
+    fi
+
+    # Verify
+    "$PYTHON" -c "
+import coremltools, cv2, numpy, PIL
+from pathlib import Path
+model_path = Path('$MODEL_PATH')
+assert model_path.exists(), f'Model not found: {model_path}'
+print(f'✅ Verified: coremltools={coremltools.__version__}, model={model_path.name}')
+"
+
+else
+    echo ""
+    echo "=== Non-macOS — PyTorch backend ==="
+    echo "Installing full PyTorch dependencies..."
+    "$PIP" install --quiet -r "$SCRIPT_DIR/requirements.txt"
+
+    echo "✅ PyTorch dependencies installed"
+
+    # Verify
+    "$PYTHON" -c "
+import torch, cv2, numpy, PIL
+from depth_anything_v2.dpt import DepthAnythingV2
+print(f'✅ Verified: torch={torch.__version__}, CUDA={torch.cuda.is_available()}')
+"
+fi
+
+echo ""
+echo "=== Setup complete ==="

From 3b26dc131c499549fc20d4865bb80dc359f756d1 Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 23:18:25 -0700
Subject: [PATCH 13/14] refactor: move sam2-segmentation from analysis to
 annotation category
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Move skill directory: skills/analysis/ → skills/annotation/
- Update README.md skill catalog table accordingly
---
 README.md                                                     | 4 ++--
 skills/{analysis => annotation}/sam2-segmentation/SKILL.md    | 0
 .../sam2-segmentation/requirements.txt                        | 0
 .../sam2-segmentation/scripts/segment.py                      | 0
 4 files changed, 2 insertions(+), 2 deletions(-)
 rename skills/{analysis => annotation}/sam2-segmentation/SKILL.md (100%)
 rename skills/{analysis => annotation}/sam2-segmentation/requirements.txt (100%)
 rename skills/{analysis => annotation}/sam2-segmentation/scripts/segment.py (100%)

diff --git a/README.md b/README.md
index be80e26..a78bb3e 100644
--- a/README.md
+++ b/README.md
@@ -70,9 +70,9 @@ Each skill is a self-contained module with its own model, parameters, and [commu
 |----------|-------|--------------|:------:|
 | **Detection** | [`yolo-detection-2026`](skills/detection/yolo-detection-2026/) | Real-time 80+ class detection — auto-accelerated via TensorRT / CoreML / OpenVINO / ONNX | ✅|
 | **Analysis** | [`home-security-benchmark`](skills/analysis/home-security-benchmark/) | [143-test evaluation suite](#-homesec-bench--how-secure-is-your-local-ai) for LLM & VLM security performance | ✅ |
-| | [`sam2-segmentation`](skills/analysis/sam2-segmentation/) | Click-to-segment with pixel-perfect masks | 📐 |
 | **Privacy** | [`depth-estimation`](skills/transformation/depth-estimation/) | [Real-time depth-map privacy transform](#-privacy--depth-map-anonymization) — anonymize camera feeds while preserving activity | ✅ |
-| **Annotation** | [`dataset-annotation`](skills/annotation/dataset-annotation/) | AI-assisted labeling → COCO export | 📐 |
+| **Annotation** | [`sam2-segmentation`](skills/annotation/sam2-segmentation/) | Click-to-segment with pixel-perfect masks | 📐 |
+| | [`dataset-annotation`](skills/annotation/dataset-annotation/) | AI-assisted labeling → COCO export | 📐 |
 | **Camera Providers** | [`eufy`](skills/camera-providers/eufy/) · [`reolink`](skills/camera-providers/reolink/) · [`tapo`](skills/camera-providers/tapo/) | Direct camera integrations via RTSP | 📐 |
 | **Streaming** | [`go2rtc-cameras`](skills/streaming/go2rtc-cameras/) | RTSP → WebRTC live view | 📐 |
 | **Channels** | [`matrix`](skills/channels/matrix/) · [`line`](skills/channels/line/) · [`signal`](skills/channels/signal/) | Messaging channels for Clawdbot agent | 📐 |
diff --git a/skills/analysis/sam2-segmentation/SKILL.md b/skills/annotation/sam2-segmentation/SKILL.md
similarity index 100%
rename from skills/analysis/sam2-segmentation/SKILL.md
rename to skills/annotation/sam2-segmentation/SKILL.md
diff --git a/skills/analysis/sam2-segmentation/requirements.txt b/skills/annotation/sam2-segmentation/requirements.txt
similarity index 100%
rename from skills/analysis/sam2-segmentation/requirements.txt
rename to skills/annotation/sam2-segmentation/requirements.txt
diff --git a/skills/analysis/sam2-segmentation/scripts/segment.py b/skills/annotation/sam2-segmentation/scripts/segment.py
similarity index 100%
rename from skills/analysis/sam2-segmentation/scripts/segment.py
rename to skills/annotation/sam2-segmentation/scripts/segment.py

From 1c48af4a5783841bedb9bddd24989bcd79f319eb Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sat, 14 Mar 2026 23:22:06 -0700
Subject: [PATCH 14/14] feat: add model-training skill and Training category
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- New skill: skills/training/model-training/ with SKILL.md manifest
  documenting the Aegis Training Agent pipeline:
  annotated dataset → YOLO fine-tune → auto-export → deploy
- Add 'training' category to skills.json
- Add model-training entry to skills.json registry
- Update README skill catalog with Training row
- Skill count: 18 → 19 skills, 9 → 10 categories
---
 README.md                                     |   3 +-
 skills.json                                   |  32 ++++++
 skills/training/model-training/SKILL.md       | 105 ++++++++++++++++++
 .../training/model-training/requirements.txt  |   5 +
 4 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 skills/training/model-training/SKILL.md
 create mode 100644 skills/training/model-training/requirements.txt

diff --git a/README.md b/README.md
index a78bb3e..84f4703 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@
 - [x] **AI/LLM-assisted skill installation** — community-contributed skills installed and configured via AI agent
 - [x] **GPU / NPU / CPU (AIPC) aware installation** — auto-detect hardware, install matching frameworks, convert models to optimal format
 - [x] **Hardware environment layer** — shared [`env_config.py`](skills/lib/env_config.py) for auto-detection + model optimization across NVIDIA, AMD, Apple Silicon, Intel, and CPU
-- [ ] **Skill development** — 18 skills across 9 categories, actively expanding with community contributions
+- [ ] **Skill development** — 19 skills across 10 categories, actively expanding with community contributions
 
 ## 🧩 Skill Catalog
 
@@ -73,6 +73,7 @@ Each skill is a self-contained module with its own model, parameters, and [commu
 | **Privacy** | [`depth-estimation`](skills/transformation/depth-estimation/) | [Real-time depth-map privacy transform](#-privacy--depth-map-anonymization) — anonymize camera feeds while preserving activity | ✅ |
 | **Annotation** | [`sam2-segmentation`](skills/annotation/sam2-segmentation/) | Click-to-segment with pixel-perfect masks | 📐 |
 | | [`dataset-annotation`](skills/annotation/dataset-annotation/) | AI-assisted labeling → COCO export | 📐 |
+| **Training** | [`model-training`](skills/training/model-training/) | Agent-driven YOLO fine-tuning — annotate, train, export, deploy | 📐 |
 | **Camera Providers** | [`eufy`](skills/camera-providers/eufy/) · [`reolink`](skills/camera-providers/reolink/) · [`tapo`](skills/camera-providers/tapo/) | Direct camera integrations via RTSP | 📐 |
 | **Streaming** | [`go2rtc-cameras`](skills/streaming/go2rtc-cameras/) | RTSP → WebRTC live view | 📐 |
 | **Channels** | [`matrix`](skills/channels/matrix/) · [`line`](skills/channels/line/) · [`signal`](skills/channels/signal/) | Messaging channels for Clawdbot agent | 📐 |
diff --git a/skills.json b/skills.json
index a35f483..3440a5e 100644
--- a/skills.json
+++ b/skills.json
@@ -9,6 +9,7 @@
     "transformation": "Depth estimation, style transfer, video effects",
     "privacy": "Privacy transforms — depth maps, blur, anonymization for blind mode",
     "annotation": "Dataset labeling, COCO export, training data",
+    "training": "Model fine-tuning, hardware-optimized export, deployment",
     "camera-providers": "Camera brand integrations — clip feed, live stream",
     "streaming": "RTSP/WebRTC live view via go2rtc",
     "channels": "Messaging platform channels for Clawdbot agent",
@@ -165,6 +166,37 @@
         "privacy_overlay",
         "blind_mode"
       ]
+    },
+    {
+      "id": "model-training",
+      "name": "Model Training",
+      "description": "Agent-driven YOLO fine-tuning — annotate, train, auto-export to TensorRT/CoreML/OpenVINO, deploy as detection skill.",
+      "version": "1.0.0",
+      "category": "training",
+      "path": "skills/training/model-training",
+      "tags": [
+        "training",
+        "fine-tuning",
+        "yolo",
+        "custom-model",
+        "export"
+      ],
+      "platforms": [
+        "linux-x64",
+        "linux-arm64",
+        "darwin-arm64",
+        "darwin-x64",
+        "win-x64"
+      ],
+      "requirements": {
+        "python": ">=3.9",
+        "ram_gb": 4
+      },
+      "capabilities": [
+        "fine_tuning",
+        "model_export",
+        "deployment"
+      ]
     }
   ]
 }
\ No newline at end of file
diff --git a/skills/training/model-training/SKILL.md b/skills/training/model-training/SKILL.md
new file mode 100644
index 0000000..32fc658
--- /dev/null
+++ b/skills/training/model-training/SKILL.md
@@ -0,0 +1,105 @@
+---
+name: model-training
+description: "Agent-driven YOLO fine-tuning — annotate, train, export, deploy"
+version: 1.0.0
+
+parameters:
+  - name: base_model
+    label: "Base Model"
+    type: select
+    options: ["yolo26n", "yolo26s", "yolo26m", "yolo26l"]
+    default: "yolo26n"
+    description: "Pre-trained model to fine-tune"
+    group: Training
+
+  - name: dataset_dir
+    label: "Dataset Directory"
+    type: string
+    default: "~/datasets"
+    description: "Path to COCO-format dataset (from dataset-annotation skill)"
+    group: Training
+
+  - name: epochs
+    label: "Training Epochs"
+    type: number
+    default: 50
+    group: Training
+
+  - name: batch_size
+    label: "Batch Size"
+    type: number
+    default: 16
+    description: "Adjust based on GPU VRAM"
+    group: Training
+
+  - name: auto_export
+    label: "Auto-Export to Optimal Format"
+    type: boolean
+    default: true
+    description: "Automatically convert to TensorRT/CoreML/OpenVINO after training"
+    group: Deployment
+
+  - name: deploy_as_skill
+    label: "Deploy as Detection Skill"
+    type: boolean
+    default: false
+    description: "Replace the active YOLO detection model with the fine-tuned version"
+    group: Deployment
+
+capabilities:
+  training:
+    script: scripts/train.py
+    description: "Fine-tune YOLO models on custom annotated datasets"
+---
+
+# Model Training
+
+Agent-driven custom model training powered by Aegis's Training Agent. Closes the annotation-to-deployment loop: take a COCO dataset from `dataset-annotation`, fine-tune a YOLO model, auto-export to the optimal format for your hardware, and optionally deploy it as your active detection skill.
+
+## What You Get
+
+- **Fine-tune YOLO26** — start from nano/small/medium/large pre-trained weights
+- **COCO dataset input** — uses standard format from `dataset-annotation` skill
+- **Hardware-aware training** — auto-detects CUDA, MPS, ROCm, or CPU
+- **Auto-export** — converts trained model to TensorRT / CoreML / OpenVINO / ONNX via `env_config.py`
+- **One-click deploy** — replace the active detection model with your fine-tuned version
+- **Training telemetry** — real-time loss, mAP, and epoch progress streamed to Aegis UI
+
+## Training Loop (Aegis Training Agent)
+
+```
+dataset-annotation          model-training              yolo-detection-2026
+┌─────────────┐        ┌──────────────────┐        ┌──────────────────┐
+│ Annotate    │───────▶│ Fine-tune YOLO   │───────▶│ Deploy custom    │
+│ Review      │  COCO  │ Auto-export      │ .pt    │ model as active  │
+│ Export      │  JSON  │ Validate mAP     │ .engine│ detection skill  │
+└─────────────┘        └──────────────────┘        └──────────────────┘
+       ▲                                                    │
+       └────────────────────────────────────────────────────┘
+                    Feedback loop: better detection → better annotation
+```
+
+## Protocol
+
+### Aegis → Skill (stdin)
+```jsonl
+{"event": "train", "dataset_path": "~/datasets/front_door_people/", "base_model": "yolo26n", "epochs": 50, "batch_size": 16}
+{"event": "export", "model_path": "runs/train/best.pt", "formats": ["coreml", "tensorrt"]}
+{"event": "validate", "model_path": "runs/train/best.pt", "dataset_path": "~/datasets/front_door_people/"}
+```
+
+### Skill → Aegis (stdout)
+```jsonl
+{"event": "ready", "gpu": "mps", "base_models": ["yolo26n", "yolo26s", "yolo26m", "yolo26l"]}
+{"event": "progress", "epoch": 12, "total_epochs": 50, "loss": 0.043, "mAP50": 0.87, "mAP50_95": 0.72}
+{"event": "training_complete", "model_path": "runs/train/best.pt", "metrics": {"mAP50": 0.91, "mAP50_95": 0.78, "params": "2.6M"}}
+{"event": "export_complete", "format": "coreml", "path": "runs/train/best.mlpackage", "speedup": "2.1x vs PyTorch"}
+{"event": "validation", "mAP50": 0.91, "per_class": [{"class": "person", "ap": 0.95}, {"class": "car", "ap": 0.88}]}
+```
+
+## Setup
+
+```bash
+python3 -m venv .venv && source .venv/bin/activate
+pip install -r requirements.txt
+```
diff --git a/skills/training/model-training/requirements.txt b/skills/training/model-training/requirements.txt
new file mode 100644
index 0000000..b8f145d
--- /dev/null
+++ b/skills/training/model-training/requirements.txt
@@ -0,0 +1,5 @@
+ultralytics>=8.3.0
+torch>=2.0.0
+coremltools>=7.0; sys_platform == 'darwin'
+onnx>=1.14.0
+onnxruntime>=1.16.0