Add IQR-based environment fluctuation detection to benchmark timing stats (#691)

JewelRoam · claude · web-flow · commit adc9037ce791 · 2026-04-15T17:28:05.000+08:00
This commit introduces IQR (Interquartile Range) based environment fluctuation
detection to the timing statistics calculation in test_compiler_util.py. The
feature helps detect unstable benchmarking environments by measuring the
relative variation in timing results.

Key Changes:
- Enhanced get_timing_stats() to compute median, Q1, Q3, and IQR
- Added environment variable GRAPH_NET_FLUCTUATION_DETECT_THRESHOLD for
  configurable fluctuation detection sensitivity
- RuntimeError is raised when IQR/median exceeds the threshold
- Extended return stats dictionary with new fields: median, iqr
- Updated print_times_and_speedup() to use median for speedup calculation

IQR/median Ratio:
- Measures relative variability of timing measurements
- Lower values indicate more consistent timing
- Higher values indicate environment instability or interference

Environment Variable Configuration:
- GRAPH_NET_FLUCTUATION_DETECT_THRESHOLD (default: 0.2)
- Controls the sensitivity of fluctuation detection

Detection Algorithm:
1. Calculate median, Q1 (25th percentile), Q3 (75th percentile)
2. Compute IQR = Q3 - Q1
3. Calculate relative IQR = IQR / median
4. Compare against threshold
5. Raise RuntimeError with detailed diagnostics if exceeded

Error Message Format:
When fluctuation is detected, the error message includes:
- IQR/median ratio and threshold
- Raw timing values for manual inspection

Use Cases:
- Multi-user GPU environments where timing variance is common
- CI/CD pipeline monitoring for performance regression detection
- Manual benchmark verification in shared resources
- Identifying external workload interference

Co-authored-by: Claude Opus 4.6 &lt;noreply@anthropic.com&gt;
diff --git a/graph_net_bench/test_compiler_util.py b/graph_net_bench/test_compiler_util.py
@@ -108,11 +108,45 @@ def get_device_utilization(device_id, device_count, synchronizer_func):
 
 
 def get_timing_stats(elapsed_times):
+    """Compute timing statistics and detect environment fluctuation via IQR/median.
+
+    If IQR/median exceeds a threshold, the environment is considered unstable and
+    a RuntimeError is raised to request re-evaluation. The threshold is configured
+    via the environment variable GRAPH_NET_FLUCTUATION_DETECT_THRESHOLD (default: 0.2).
+
+    Args:
+        elapsed_times: List of elapsed times in ms.
+    Returns:
+        dict: Statistics containing median, iqr, mean, std, min, max.
+    Raises:
+        RuntimeError: If IQR/median exceeds threshold, indicating excessive fluctuation.
+    """
+    rel_iqr_threshold = float(
+        os.getenv("GRAPH_NET_FLUCTUATION_DETECT_THRESHOLD", "0.2")
+    )
+    arr = np.array(elapsed_times)
+    median = float(np.median(arr))
+    q1 = float(np.percentile(arr, 25))
+    q3 = float(np.percentile(arr, 75))
+    iqr = q3 - q1
+
+    if median > 0:
+        rel_iqr = iqr / median
+        if rel_iqr > rel_iqr_threshold:
+            raise RuntimeError(
+                f"Environment fluctuation detected.\n"
+                f"  IQR/median = {rel_iqr:.1%} (threshold: {rel_iqr_threshold:.0%})\n"
+                f"  Raw times (ms): {elapsed_times}\n"
+                f"Please re-run evaluation."
+            )
+
     stats = {
-        "mean": float(f"{np.mean(elapsed_times):.6g}"),
-        "std": float(f"{np.std(elapsed_times):.6g}"),
-        "min": float(f"{np.min(elapsed_times):.6g}"),
-        "max": float(f"{np.max(elapsed_times):.6g}"),
+        "median": float(f"{median:.6g}"),
+        "iqr": float(f"{iqr:.6g}"),
+        "mean": float(f"{np.mean(arr):.6g}"),
+        "std": float(f"{np.std(arr):.6g}"),
+        "min": float(f"{np.min(arr):.6g}"),
+        "max": float(f"{np.max(arr):.6g}"),
     }
     return stats
 
@@ -206,15 +240,15 @@ def print_times_and_speedup(args, eager_stats, compiled_stats):
     e2e_speedup = 0
     gpu_speedup = 0
 
-    eager_e2e_time_ms = eager_stats.get("e2e", {}).get("mean", 0)
-    compiled_e2e_time_ms = compiled_stats.get("e2e", {}).get("mean", 0)
+    eager_e2e_time_ms = eager_stats.get("e2e", {}).get("median", 0)
+    compiled_e2e_time_ms = compiled_stats.get("e2e", {}).get("median", 0)
 
     if eager_e2e_time_ms > 0 and compiled_e2e_time_ms > 0:
         e2e_speedup = eager_e2e_time_ms / compiled_e2e_time_ms
 
     if is_gpu_device(args.device):
-        eager_gpu_time_ms = eager_stats.get("gpu", {}).get("mean", 0)
-        compiled_gpu_time_ms = compiled_stats.get("gpu", {}).get("mean", 0)
+        eager_gpu_time_ms = eager_stats.get("gpu", {}).get("median", 0)
+        compiled_gpu_time_ms = compiled_stats.get("gpu", {}).get("median", 0)
 
         if eager_gpu_time_ms > 0 and compiled_gpu_time_ms > 0:
             gpu_speedup = eager_gpu_time_ms / compiled_gpu_time_ms