
Commit f153d66

Merge pull request #3 from cachevector/benchmarking
Add benchmarking module
2 parents ed570be + e61437a commit f153d66

7 files changed

Lines changed: 419 additions & 0 deletions


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -8,6 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

### Added
- **Benchmarking module** (`comprexx.benchmark`): `cx.benchmark()` measures real
  inference latency with configurable warmup/iters, reporting mean, median, std,
  p50/p90/p99, min/max, and throughput. `cx.compare_benchmarks()` returns a
  before/after comparison with speedup and latency/throughput deltas. Quantized
  models are automatically run on CPU. New `comprexx bench` CLI command.
- GitHub Actions CI workflow running `pytest` on Python 3.10, 3.11, 3.12 plus a
  `ruff check` lint job.
- `CHANGELOG.md` with history for v0.1.0 and v0.2.0.

README.md

Lines changed: 26 additions & 0 deletions
@@ -121,6 +121,29 @@ pipeline = cx.Pipeline([
The `perturbation` can be `"prune"` (zero the smallest weights) or `"noise"` (add Gaussian noise scaled by weight std). Each layer is snapshotted and restored in place, so no deep copies of the model are made.

### Benchmark inference latency

Param counts and FLOPs tell you how small a model got. They don't tell you how fast it runs. `cx.benchmark` measures real latency:

```python
bench = cx.benchmark(model, input_shape=(1, 3, 224, 224), warmup=10, iters=50)
print(bench.summary())
```

You get mean, median, std, p50/p90/p99, min/max, and throughput in inferences per second. To see what compression actually bought you, run `compare_benchmarks` on the baseline and compressed models:

```python
cmp = cx.compare_benchmarks(
    baseline_model, result.model,
    input_shape=(1, 3, 224, 224),
    iters=50,
)
print(cmp.summary())
print(f"{cmp.speedup:.2f}x faster")
```
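
For example, if the baseline averages 10 ms per inference and the compressed model 4 ms, `cmp.speedup` is 2.5, `cmp.latency_reduction_pct` is 60, and `cmp.throughput_gain_pct` is 150.
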
Warmup iterations are excluded from measurements so caches and JIT settle first. Quantized models are automatically run on CPU regardless of the `device` argument.
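
Below is a minimal sketch of that CPU fallback; the toy two-layer model and the `torch.ao.quantization.quantize_dynamic` call are illustrative only, not part of the library:

```python
import torch
import torch.nn as nn

import comprexx as cx

# Toy FP32 model, dynamically quantized so its Linear layers become int8 modules.
fp32 = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 10))
int8 = torch.ao.quantization.quantize_dynamic(fp32, {nn.Linear}, dtype=torch.qint8)

# Even with device="cuda", the quantized model is benchmarked on CPU.
bench_q = cx.benchmark(int8, input_shape=(1, 256), device="cuda", warmup=5, iters=20)
print(bench_q.device)  # "cpu"
```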

### Export to ONNX

```python
@@ -177,6 +200,9 @@ comprexx analyze model.pt --input-shape "1,3,224,224" --json
comprexx compress model.pt --recipe recipe.yaml --input-shape "1,3,224,224"
comprexx compress model.pt --recipe recipe.yaml --input-shape "1,3,224,224" --dry-run

# Benchmark
comprexx bench model.pt --input-shape "1,3,224,224" --iters 50

# Export
comprexx export model.pt --format onnx --input-shape "1,3,224,224"
```

comprexx/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -9,6 +9,12 @@
    SensitivityReport,
    analyze_sensitivity,
)
from comprexx.benchmark.runner import (
    BenchmarkComparison,
    BenchmarkResult,
    benchmark,
)
from comprexx.benchmark.runner import compare as compare_benchmarks
from comprexx.core.exceptions import (
    AccuracyGuardTriggered,
    CalibrationError,
@@ -27,6 +33,8 @@
__all__ = [
    "AccuracyGuard",
    "AccuracyGuardTriggered",
    "BenchmarkComparison",
    "BenchmarkResult",
    "CalibrationError",
    "CompressionReport",
    "ComprexxError",
@@ -43,6 +51,8 @@
    "UnsupportedLayerError",
    "analyze",
    "analyze_sensitivity",
    "benchmark",
    "compare_benchmarks",
    "load_recipe",
    "stages",
]

comprexx/benchmark/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
"""Inference benchmarking."""

from comprexx.benchmark.runner import (
    BenchmarkComparison,
    BenchmarkResult,
    benchmark,
    compare,
)

__all__ = ["BenchmarkComparison", "BenchmarkResult", "benchmark", "compare"]

comprexx/benchmark/runner.py

Lines changed: 250 additions & 0 deletions
@@ -0,0 +1,250 @@
"""Latency/throughput benchmarking for PyTorch models.

Measures real inference performance so compression reports can show actual
speedups, not just parameter-count reductions.
"""

from __future__ import annotations

import json
import statistics
import time
from dataclasses import asdict, dataclass, field
from typing import Callable, Optional

import torch
import torch.nn as nn


@dataclass
class BenchmarkResult:
    """Latency statistics from a benchmark run."""

    device: str
    dtype: str
    batch_size: int
    warmup: int
    iters: int
    mean_ms: float
    median_ms: float
    std_ms: float
    min_ms: float
    max_ms: float
    p50_ms: float
    p90_ms: float
    p99_ms: float
    throughput_ips: float
    samples_ms: list[float] = field(default_factory=list)

    def to_dict(self) -> dict:
        return asdict(self)

    def to_json(self) -> str:
        return json.dumps(self.to_dict(), indent=2)

    def summary(self) -> str:
        return (
            f"Benchmark ({self.device}, batch={self.batch_size}, iters={self.iters})\n"
            f" Mean: {self.mean_ms:.3f} ms\n"
            f" Median: {self.median_ms:.3f} ms\n"
            f" Std: {self.std_ms:.3f} ms\n"
            f" p50/p90/p99:{self.p50_ms:.3f} / {self.p90_ms:.3f} / {self.p99_ms:.3f} ms\n"
            f" Min/Max: {self.min_ms:.3f} / {self.max_ms:.3f} ms\n"
            f" Throughput: {self.throughput_ips:.1f} inferences/sec"
        )


@dataclass
class BenchmarkComparison:
    """Before/after benchmark comparison."""

    baseline: BenchmarkResult
    compressed: BenchmarkResult

    @property
    def speedup(self) -> float:
        if self.compressed.mean_ms == 0:
            return float("inf")
        return self.baseline.mean_ms / self.compressed.mean_ms

    @property
    def latency_reduction_pct(self) -> float:
        if self.baseline.mean_ms == 0:
            return 0.0
        return (1 - self.compressed.mean_ms / self.baseline.mean_ms) * 100

    @property
    def throughput_gain_pct(self) -> float:
        if self.baseline.throughput_ips == 0:
            return 0.0
        return (self.compressed.throughput_ips / self.baseline.throughput_ips - 1) * 100

    def to_dict(self) -> dict:
        return {
            "baseline": self.baseline.to_dict(),
            "compressed": self.compressed.to_dict(),
            "speedup": self.speedup,
            "latency_reduction_pct": self.latency_reduction_pct,
            "throughput_gain_pct": self.throughput_gain_pct,
        }

    def to_json(self) -> str:
        return json.dumps(self.to_dict(), indent=2)

    def summary(self) -> str:
        return (
            f"Benchmark Comparison ({self.baseline.device})\n"
            f" Baseline: {self.baseline.mean_ms:.3f} ms "
            f"({self.baseline.throughput_ips:.1f} ips)\n"
            f" Compressed: {self.compressed.mean_ms:.3f} ms "
            f"({self.compressed.throughput_ips:.1f} ips)\n"
            f" Speedup: {self.speedup:.2f}x "
            f"({self.latency_reduction_pct:+.1f}% latency, "
            f"{self.throughput_gain_pct:+.1f}% throughput)"
        )


def _make_input(
    input_shape: tuple[int, ...] | list[tuple[int, ...]],
    device: torch.device,
    dtype: torch.dtype,
) -> torch.Tensor | tuple[torch.Tensor, ...]:
    if isinstance(input_shape, list):
        return tuple(torch.randn(*s, device=device, dtype=dtype) for s in input_shape)
    return torch.randn(*input_shape, device=device, dtype=dtype)


def _sync(device: torch.device) -> None:
    if device.type == "cuda":
        torch.cuda.synchronize()


def _percentile(values: list[float], pct: float) -> float:
    if not values:
        return 0.0
    s = sorted(values)
    k = (len(s) - 1) * (pct / 100.0)
    lo = int(k)
    hi = min(lo + 1, len(s) - 1)
    frac = k - lo
    return s[lo] * (1 - frac) + s[hi] * frac


def benchmark(
    model: nn.Module,
    input_shape: tuple[int, ...] | list[tuple[int, ...]],
    device: str = "cpu",
    dtype: torch.dtype = torch.float32,
    warmup: int = 10,
    iters: int = 50,
    input_fn: Optional[Callable[[], torch.Tensor | tuple[torch.Tensor, ...]]] = None,
) -> BenchmarkResult:
    """Benchmark a model's inference latency.

    Args:
        model: Model to benchmark (set to eval mode internally).
        input_shape: Single tensor shape, or list of shapes for multi-input models.
        device: "cpu" or "cuda".
        dtype: Input tensor dtype. Quantized models ignore this.
        warmup: Warmup iterations (not measured) to stabilize caches/JIT.
        iters: Measured iterations.
        input_fn: Optional callable returning a fresh input per call. Overrides
            `input_shape` when provided.
    """
    if iters <= 0:
        raise ValueError("iters must be positive")

    dev = torch.device(device)
    model = model.eval()

    # Quantized models must run on CPU
    is_quantized = any(
        "quantized" in type(m).__module__ for m in model.modules()
    )
    if is_quantized and dev.type != "cpu":
        dev = torch.device("cpu")

    try:
        model = model.to(dev)
    except (RuntimeError, NotImplementedError):
        # Some quantized modules refuse .to() transfers; fall through
        pass

    def _gen_input():
        if input_fn is not None:
            x = input_fn()
        else:
            x = _make_input(input_shape, dev, dtype)
        return x

    sample = _gen_input()
    batch_size = (
        sample[0].shape[0] if isinstance(sample, tuple) else sample.shape[0]
    )

    with torch.inference_mode():
        # Warmup
        for _ in range(warmup):
            x = _gen_input()
            if isinstance(x, tuple):
                model(*x)
            else:
                model(x)
        _sync(dev)

        # Measure
        samples_ms: list[float] = []
        for _ in range(iters):
            x = _gen_input()
            _sync(dev)
            t0 = time.perf_counter()
            if isinstance(x, tuple):
                model(*x)
            else:
                model(x)
            _sync(dev)
            samples_ms.append((time.perf_counter() - t0) * 1000.0)

    mean_ms = statistics.fmean(samples_ms)
    median_ms = statistics.median(samples_ms)
    std_ms = statistics.pstdev(samples_ms) if len(samples_ms) > 1 else 0.0
    throughput = (batch_size * 1000.0 / mean_ms) if mean_ms > 0 else 0.0

    return BenchmarkResult(
        device=str(dev),
        dtype=str(dtype).replace("torch.", ""),
        batch_size=batch_size,
        warmup=warmup,
        iters=iters,
        mean_ms=mean_ms,
        median_ms=median_ms,
        std_ms=std_ms,
        min_ms=min(samples_ms),
        max_ms=max(samples_ms),
        p50_ms=_percentile(samples_ms, 50),
        p90_ms=_percentile(samples_ms, 90),
        p99_ms=_percentile(samples_ms, 99),
        throughput_ips=throughput,
        samples_ms=samples_ms,
    )


def compare(
    baseline_model: nn.Module,
    compressed_model: nn.Module,
    input_shape: tuple[int, ...] | list[tuple[int, ...]],
    device: str = "cpu",
    dtype: torch.dtype = torch.float32,
    warmup: int = 10,
    iters: int = 50,
) -> BenchmarkComparison:
    """Benchmark baseline and compressed models and return a comparison."""
    base = benchmark(
        baseline_model, input_shape, device=device, dtype=dtype,
        warmup=warmup, iters=iters,
    )
    comp = benchmark(
        compressed_model, input_shape, device=device, dtype=dtype,
        warmup=warmup, iters=iters,
    )
    return BenchmarkComparison(baseline=base, compressed=comp)
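
The `input_fn` hook above means benchmarks are not limited to random tensors. A minimal usage sketch (the toy model and in-memory dataset below are made up for illustration, not part of this commit):

```python
import itertools

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from comprexx.benchmark import benchmark

# Stand-ins: a small classifier and a dataset of fake 32x32 "images".
model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10))
data = TensorDataset(torch.randn(256, 3, 32, 32), torch.randint(0, 10, (256,)))
batches = itertools.cycle(DataLoader(data, batch_size=8))

result = benchmark(
    model,
    input_shape=(8, 3, 32, 32),         # unused once input_fn is provided
    input_fn=lambda: next(batches)[0],  # fresh image batch for every call
    warmup=5,
    iters=20,
)
print(result.summary())
```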

comprexx/cli/main.py

Lines changed: 25 additions & 0 deletions
@@ -177,5 +177,30 @@ def export_cmd(
        raise typer.Exit(1)


@app.command()
def bench(
    model_source: str = typer.Argument(..., help="Model path or Python module path"),
    input_shape: str = typer.Option(..., "--input-shape", help="Input shape, e.g. '1,3,224,224'"),
    device: str = typer.Option("cpu", help="Device (cpu or cuda)"),
    warmup: int = typer.Option(10, "--warmup", help="Warmup iterations"),
    iters: int = typer.Option(50, "--iters", help="Measured iterations"),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON"),
):
    """Benchmark a model's inference latency."""
    from comprexx.benchmark.runner import benchmark

    shape = _parse_input_shape(input_shape)
    model = _load_model(model_source)

    with console.status("Benchmarking..."):
        result = benchmark(model, input_shape=shape, device=device,
                           warmup=warmup, iters=iters)

    if json_output:
        console.print(result.to_json())
    else:
        console.print(Panel(result.summary(), title="Comprexx Benchmark"))


if __name__ == "__main__":
    app()
