Lazy-load chronos.mlx exports, guard MLX imports in the trainer, and add expert-selection/prefetch runtime stats

FonaTech · web-flow · commit 43343d1da3e7 · 2026-04-26T14:10:27.000+08:00
diff --git a/chronos/mlx/__init__.py b/chronos/mlx/__init__.py
@@ -16,12 +16,6 @@
   - torch.compile unavailable; mx.compile() used instead
   - All modules are mlx.nn.Module, not torch.nn.Module
 """
-from chronos.mlx.model import ChronosMLXModel
-from chronos.mlx.moe import ChronosMLXMOE
-from chronos.mlx.attention import MLAAttentionMLX, SlidingWindowAttentionMLX
-from chronos.mlx.expert_store import MLXExpertStore
-from chronos.mlx.inference import ChronosMLXInferenceEngine
-
 __all__ = [
     "ChronosMLXModel",
     "ChronosMLXMOE",
@@ -30,3 +24,31 @@
     "MLXExpertStore",
     "ChronosMLXInferenceEngine",
 ]
+
+
+def __getattr__(name):
+    """Lazily resolve a public MLX symbol on first attribute access.
+
+    The classes listed below are imported from their submodules only when
+    they are requested as attributes of this package, rather than at
+    package import time.
+
+    Raises:
+        AttributeError: if ``name`` is not one of the lazily exported names.
+    """
+    import importlib
+
+    lazy_exports = {
+        "ChronosMLXModel": "chronos.mlx.model",
+        "ChronosMLXMOE": "chronos.mlx.moe",
+        "MLAAttentionMLX": "chronos.mlx.attention",
+        "SlidingWindowAttentionMLX": "chronos.mlx.attention",
+        "MLXExpertStore": "chronos.mlx.expert_store",
+        "ChronosMLXInferenceEngine": "chronos.mlx.inference",
+    }
+    module_path = lazy_exports.get(name)
+    if module_path is None:
+        # Same wording as the interpreter's own missing-attribute error.
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    # Not stored in globals(); repeated lookups go through the import cache.
+    return getattr(importlib.import_module(module_path), name)
diff --git a/chronos/mlx/inference.py b/chronos/mlx/inference.py
@@ -50,8 +50,14 @@ def __init__(self, model, config, ssd_dir: str = "./expert_cache_mlx"):
         self._runtime_stats = {
             "resident_hits": 0,
             "resident_misses": 0,
+            "resident_vram_hits": 0,
+            "resident_ram_hits": 0,
+            "selection_hits": 0,
+            "selection_misses": 0,
             "prediction_hits": 0,
             "prediction_total": 0,
+            "prefetch_queue_drops": 0,
+            "prefetch_wait_time_s": 0.0,
             "sync_ssd_loads": 0,
             "on_demand_load_time_s": 0.0,
         }
@@ -75,6 +81,11 @@ def loader(eid: int) -> bool:
                     eid in self.store._warm
                     and self.store._layer_states_complete(self.store._warm.get(eid))
                 )
+            with self._stats_lock:
+                if was_hot or was_warm:
+                    self._runtime_stats["selection_hits"] += 1
+                else:
+                    self._runtime_stats["selection_misses"] += 1
             t0 = time.monotonic()
             if not was_hot and not was_warm:
                 with self._stats_lock:
@@ -85,18 +96,29 @@ def loader(eid: int) -> bool:
                 self._runtime_stats["on_demand_load_time_s"] += elapsed
                 if ok:
                     self._runtime_stats["resident_hits" if (was_hot or was_warm) else "resident_misses"] += 1
+                    if was_hot:
+                        self._runtime_stats["resident_vram_hits"] += 1
+                    elif was_warm:
+                        self._runtime_stats["resident_ram_hits"] += 1
                 else:
                     self._runtime_stats["resident_misses"] += 1
                 self._runtime_stats["prediction_total"] += 1
-                if eid in self._last_predicted:
+                if (was_hot or was_warm) and eid in self._last_predicted:
                     self._runtime_stats["prediction_hits"] += 1
             return ok
 
         def touch(eid: int) -> None:
+            eid = int(eid)
             with self.store._lock:
                 if eid in self.store._hot_lru:
-                    if self.store._expert_live_all_layers(int(eid)):
+                    if self.store._expert_live_all_layers(eid):
                         self.store._hot_lru.move_to_end(eid)
+                        with self._stats_lock:
+                            self._runtime_stats["selection_hits"] += 1
+                            self._runtime_stats["resident_vram_hits"] += 1
+                            self._runtime_stats["prediction_total"] += 1
+                            if eid in self._last_predicted:
+                                self._runtime_stats["prediction_hits"] += 1
                     else:
                         self.store._hot_lru.pop(eid, None)
 
@@ -119,10 +141,44 @@ def _prefetch_loop(self):
             self._prefetch_q.task_done()
 
     def _schedule_prefetch(self, expert_ids: List[int]):
+        if not expert_ids:
+            return
         try:
             self._prefetch_q.put_nowait(expert_ids)
         except queue.Full:
-            pass
+            with self._stats_lock:
+                self._runtime_stats["prefetch_queue_drops"] += 1
+
+    def _prefetch_and_promote_window(self, expert_ids: List[int], timeout_s: float = 0.012) -> None:
+        """Queue a prefetch for non-hot experts, then poll until ready or ``timeout_s`` elapses."""
+        if not expert_ids or self.store.storage_format == "full_dram":
+            return
+        with self.store._lock:
+            # Order-preserving de-dup; experts already in the hot LRU need no work.
+            wanted = dict.fromkeys(int(eid) for eid in expert_ids)
+            pending = [eid for eid in wanted if eid not in self.store._hot_lru]
+        if not pending:
+            return
+        self._schedule_prefetch(pending)
+        started = time.monotonic()
+        deadline = started + max(0.0, float(timeout_s))
+        # Poll in 1 ms sleeps; an expert counts as ready when hot or fully warm.
+        while time.monotonic() < deadline:
+            self._promote_ready(pending)
+            with self.store._lock:
+                ready = all(
+                    eid in self.store._hot_lru
+                    or (eid in self.store._warm and self.store._layer_states_complete(self.store._warm.get(eid)))
+                    for eid in pending
+                )
+            if ready:
+                break
+            time.sleep(0.001)
+        # Final promotion attempt after the wait window (also runs when timeout_s <= 0).
+        self._promote_ready(pending)
+        with self._stats_lock:
+            # Wall time spent polling, measured from just after the prefetch was queued.
+            self._runtime_stats["prefetch_wait_time_s"] += max(0.0, time.monotonic() - started)
 
     def stop(self):
         self._stop.set()
@@ -157,8 +213,14 @@ def generate(
             self._runtime_stats = {
                 "resident_hits": 0,
                 "resident_misses": 0,
+                "resident_vram_hits": 0,
+                "resident_ram_hits": 0,
+                "selection_hits": 0,
+                "selection_misses": 0,
                 "prediction_hits": 0,
                 "prediction_total": 0,
+                "prefetch_queue_drops": 0,
+                "prefetch_wait_time_s": 0.0,
                 "sync_ssd_loads": 0,
                 "on_demand_load_time_s": 0.0,
             }
@@ -186,6 +248,10 @@ def generate(
         next_token = self._sample(logits[:, -1, :], temperature, top_p)
         activated_ids: List[int] = []
         tokens = 1
+        if scheduler is None and lookahead_probs is not None:
+            future_ids = self._predict_future_experts(lookahead_probs)
+            self._last_predicted = set(int(eid) for eid in future_ids)
+            self._prefetch_and_promote_window(future_ids, timeout_s=0.025)
         yield int(next_token.item())
 
         for _ in range(max_new_tokens - 1):
@@ -196,8 +262,7 @@ def generate(
                 if lookahead_probs is not None:
                     future_ids = self._predict_future_experts(lookahead_probs)
                     self._last_predicted = set(int(eid) for eid in future_ids)
-                    self._schedule_prefetch(future_ids)
-                    self._promote_ready(future_ids)
+                    self._prefetch_and_promote_window(future_ids)
                 avail_masks = self._build_avail_masks(next_token)
 
             token_in = next_token.reshape(1, 1)
@@ -272,16 +337,22 @@ def _promote_ready(self, expert_ids: List[int]) -> None:
     def _runtime_stat_fields(self) -> dict:
         with self._stats_lock:
             stats = dict(self._runtime_stats)
-        total = int(stats.get("resident_hits", 0)) + int(stats.get("resident_misses", 0))
+        total = int(stats.get("selection_hits", 0)) + int(stats.get("selection_misses", 0))
         pred_total = int(stats.get("prediction_total", 0))
         return {
-            "resident_hit_rate": round(float(stats.get("resident_hits", 0)) / max(total, 1), 4),
-            "cache_hit_rate": round(float(stats.get("resident_hits", 0)) / max(total, 1), 4),
-            "cache_hits": int(stats.get("resident_hits", 0)),
-            "cache_misses": int(stats.get("resident_misses", 0)),
+            "resident_hit_rate": round(float(stats.get("selection_hits", 0)) / max(total, 1), 4),
+            "cache_hit_rate": round(float(stats.get("selection_hits", 0)) / max(total, 1), 4),
+            "cache_hits": int(stats.get("selection_hits", 0)),
+            "cache_misses": int(stats.get("selection_misses", 0)),
+            "expert_selection_hits": int(stats.get("selection_hits", 0)),
+            "expert_selection_misses": int(stats.get("selection_misses", 0)),
             "prediction_hit_rate": round(float(stats.get("prediction_hits", 0)) / max(pred_total, 1), 4),
             "prediction_hits": int(stats.get("prediction_hits", 0)),
             "prediction_total": pred_total,
+            "resident_vram_hits": int(stats.get("resident_vram_hits", 0)),
+            "resident_ram_hits": int(stats.get("resident_ram_hits", 0)),
+            "prefetch_queue_drops": int(stats.get("prefetch_queue_drops", 0)),
+            "prefetch_wait_time_s": round(float(stats.get("prefetch_wait_time_s", 0.0)), 4),
             "sync_ssd_loads": int(stats.get("sync_ssd_loads", 0)),
             "on_demand_loads": int(stats.get("resident_misses", 0)),
             "on_demand_load_time_s": round(float(stats.get("on_demand_load_time_s", 0.0)), 4),
@@ -309,16 +380,18 @@ def _sample(logits: mx.array, temperature: float, top_p: float) -> mx.array:
     @staticmethod
     def _memory_snapshot() -> dict:
         try:
-            from chronos.backend.mac_diagnostics import mlx_memory_snapshot
+            from chronos.backend.mac_diagnostics import mlx_memory_snapshot, rss_snapshot
 
-            return mlx_memory_snapshot()
+            out = dict(mlx_memory_snapshot())
+            out.update(rss_snapshot())
+            return out
         except Exception:
             return {}
 
     @staticmethod
     def _memory_fields(snapshot: dict, suffix: str) -> dict:
         out = {}
-        for key in ("mlx_active_gb", "mlx_cache_gb", "mlx_peak_gb"):
+        for key in ("rss_gb", "mlx_active_gb", "mlx_cache_gb", "mlx_peak_gb"):
             if key in snapshot:
                 prefix = key[:-3] if key.endswith("_gb") else key
                 out[f"{prefix}_{suffix}_gb"] = snapshot[key]
diff --git a/chronos/mlx/moe.py b/chronos/mlx/moe.py
@@ -120,6 +120,8 @@ def __call__(
 
                 if python_avail is not None:
                     selected = bool(mx.any(tok_mask).item())
+                    if not selected:
+                        continue
                     is_avail = i in python_avail
                     if selected and not is_avail:
                         loader = getattr(self, "runtime_on_demand_loader", None)
diff --git a/chronos/mlx/training/trainer.py b/chronos/mlx/training/trainer.py
@@ -5,11 +5,7 @@
 import time
 from dataclasses import dataclass
 
-import mlx.core as mx
-import mlx.nn as nn
-import mlx.optimizers as optim
 import torch
-from mlx.utils import tree_map
 
 from chronos.model.model_chronos import ChronosForCausalLM
 from chronos.model.checkpoint import (
@@ -18,11 +14,26 @@
     load_state_dict_controlled,
     save_state_dict_with_config,
 )
-from chronos.mlx.model import ChronosMLXModel
-from chronos.mlx.moe import ChronosMLXMOE
-from chronos.mlx.training.io import mlx_state_to_torch
 from chronos.trainer.optim_utils import get_lr
 
+try:
+    import mlx.core as mx
+    import mlx.nn as nn
+    import mlx.optimizers as optim
+    from mlx.utils import tree_map
+
+    from chronos.mlx.model import ChronosMLXModel
+    from chronos.mlx.moe import ChronosMLXMOE
+    from chronos.mlx.training.io import mlx_state_to_torch
+except ModuleNotFoundError:
+    mx = None
+    nn = None
+    optim = None
+    tree_map = None
+    ChronosMLXModel = None
+    ChronosMLXMOE = None
+    mlx_state_to_torch = None
+
 
 def _ids_to_mx(x) -> mx.array:
     if isinstance(x, mx.array):
@@ -494,7 +505,7 @@ def _planned_total_steps(data_iter, epochs: int, max_steps) -> int:
 def _normalize_mlx_dtype_name(dtype_name: str | None) -> str:
     value = (dtype_name or "auto").strip().lower()
     if value in {"auto", ""}:
-        return "bfloat16" if hasattr(mx, "bfloat16") else "float32"
+        return "bfloat16" if mx is not None and hasattr(mx, "bfloat16") else "float32"
     if value in {"fp16", "float16", "half"}:
         return "float16"
     if value in {"bf16", "bfloat16"}:
diff --git a/chronos/trainer/device_utils.py b/chronos/trainer/device_utils.py
@@ -284,11 +284,7 @@ def dataloader_kwargs(
         .lower()
         in {"1", "true", "yes", "on"}
     )
-    force_single_process = (
-        sys.platform == "darwin"
-        and metal_backend
-        and not allow_metal_workers
-    )
+    force_single_process = metal_backend and not allow_metal_workers
     if num_workers in (None, "", "auto"):
         workers = 0 if force_single_process else max(1, min(4, _physical_cores() // 4))
         if device_type == "xpu":
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
@@ -642,12 +642,16 @@ def test_configure_cpu_threads_overrides_single_thread_env(monkeypatch):
 
 
 def test_configure_cpu_threads_ignores_stale_chronos_env_by_default(monkeypatch):
-    import psutil
     from chronos.trainer.device_utils import configure_cpu_threads, cpu_thread_snapshot
 
     monkeypatch.setenv("CHRONOS_CPU_THREADS", "1")
     monkeypatch.setenv("OMP_NUM_THREADS", "1")
-    physical = int(psutil.cpu_count(logical=False) or os.cpu_count() or 1)
+    try:
+        import psutil
+
+        physical = int(psutil.cpu_count(logical=False) or os.cpu_count() or 1)
+    except Exception:
+        physical = int(os.cpu_count() or 1)
     threads = configure_cpu_threads("auto", budget_percent=100)
     snap = cpu_thread_snapshot()
     assert threads == physical
@@ -2012,6 +2016,8 @@ def test_inference_stats_helpers_are_structured():
             "resident_hit_rate": 0.75,
             "prediction_hit_rate": 0.5,
             "on_demand_loads": 1,
+            "prefetch_queue_drops": 0,
+            "prefetch_wait_time_s": 0.012,
             "async_cold_miss_prefetches": 2,
             "sync_ssd_loads": 1,
             "miss_policy": "on_demand",
@@ -2051,7 +2057,8 @@ def test_inference_stats_helpers_are_structured():
     assert "Setup RSS delta" in md and "Prefill RSS delta" in md and "Decode RSS" in md
     assert "1.200 GB" in md and "2.030 GB" in md
     assert "Load budget" in md
-    assert "On-demand loads" in md and "Async misses" in md and "Predict hit" in md
+    assert "On-demand loads" in md and "Prefetch wait" in md and "Predict hit" in md
+    assert "Expert hits/misses" in md
     assert "lazy_offload" in md and "full_dram" in md
     assert set(df.columns) == {"metric", "mode", "x", "value", "normalized_value", "unit"}
     assert set(df["mode"]) == {"lazy_offload", "full_dram"}
@@ -2074,6 +2081,9 @@ def test_inference_offload_budget_caps_at_125_percent():
     assert budget["effective_vram_expert_budget"] == 6
     assert budget["effective_ram_expert_budget"] == 6
     assert budget["routing_top_k"] == 3
+    low = _bounded_offload_expert_budget(cfg, 0.10)
+    assert low["effective_expert_budget"] == 1
+    assert low["effective_ram_expert_budget"] == 1
     assert budget["num_moe_layers"] == 8
 
     cfg2 = ChronosConfig(num_experts=64, num_experts_per_tok=4, num_hidden_layers=8)
@@ -2090,6 +2100,7 @@ def test_inference_ram_load_ratio_accepts_custom_values():
         RAM_LOAD_RATIO_CHOICES,
         RAM_LOAD_SWEEP_RATIOS,
         _bounded_offload_expert_budget,
+        _clone_model_cfg,
         _normalize_ram_load_ratio,
     )
 
@@ -2101,15 +2112,27 @@ def test_inference_ram_load_ratio_accepts_custom_values():
     custom = _bounded_offload_expert_budget(cfg, "0.33")
     assert custom["requested_ram_load_ratio"] == 0.33
     assert custom["effective_expert_budget"] == 11
-    assert custom["effective_ram_expert_budget"] == 32
+    assert custom["effective_ram_expert_budget"] == 11
 
     custom_high = _bounded_offload_expert_budget(cfg, "1.10")
     assert custom_high["requested_ram_load_ratio"] == 1.1
     assert custom_high["effective_expert_budget"] == 36
-    assert custom_high["effective_ram_expert_budget"] == 40
+    assert custom_high["effective_ram_expert_budget"] == 36
 
     assert _normalize_ram_load_ratio("not-a-number") == 1.0
 
+    small = ChronosConfig(
+        num_experts=6,
+        num_experts_per_tok=3,
+        num_hidden_layers=8,
+        recommended_resident_experts=2,
+    )
+    budgets = [
+        _bounded_offload_expert_budget(_clone_model_cfg(small), ratio)["effective_expert_budget"]
+        for ratio in RAM_LOAD_SWEEP_RATIOS
+    ]
+    assert budgets == [1, 2, 2, 2, 3, 5, 5, 6, 6, 6, 6]
+
 
 def test_generate_api_returns_plain_json_with_chart_records():
     from ui.tabs import inference_tab as mod
diff --git a/ui/tabs/inference_tab.py b/ui/tabs/inference_tab.py