diff --git a/.github/ISSUE_TEMPLATE/new_suite.md b/.github/ISSUE_TEMPLATE/new_suite.md new file mode 100644 index 00000000..4c358a18 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/new_suite.md @@ -0,0 +1,75 @@ +--- +name: Propose a new suite +about: Propose a new benchmark suite (new model, scenario mix, or scaling axis) +title: "[Suite] " +labels: suite-proposal +assignees: '' +--- + + + +## Why this suite? + + + +## Suite contract (draft) + +| Field | Proposed value | +|---|---| +| **Suite ID** | `suite_` | +| **Model** | `` | +| **Model revision** | `` | +| **Chip count** | `1` / `auto` / specific number | +| **Precision** | `BF16` / `FP16` / list of allowed precisions | +| **Dataset** | existing (`sharegpt_standard_v1`, `sharegpt_edge_v1`, `sharegpt_longctx_v1`) or new | +| **Max model length** | tokens | +| **Output tokens (max)** | tokens | +| **Concurrency levels** | e.g. `[8, 32, 128]` | +| **Default scenarios** | subset of `accuracy / offline / online / interactive / sustained` | +| **Extra scenarios** | optional: `sustained / speculative / burst / …` | +| **Primary metric** | `offline_throughput`, `max_valid_qps`, … | +| **Expected run time on A100** | minutes | + +## Accuracy baseline + + + +- [ ] I will provide an A100 (or equivalent reference) BF16 baseline score + to add to `schema/accuracy_baselines.json`. +- [ ] If a new dataset is required, I will submit it under + `datasets/_v1/` with a `README.md` that documents the source + and upstream license (see [`datasets/README.md`](../../datasets/README.md)). + +## Custom orchestration? + + + +- [ ] Standard scenario dispatch is enough — no `suite.py` needed. +- [ ] A `suite.py` plugin is required. 
Reason: + +## Reference result plan + + + +- Reference hardware: +- Runner: `` +- Who will run it: <@your-handle / vendor / community member> + +## Open questions + + diff --git a/.github/workflows/generate_leaderboard.yml b/.github/workflows/generate_leaderboard.yml index 31dec72d..04d51173 100644 --- a/.github/workflows/generate_leaderboard.yml +++ b/.github/workflows/generate_leaderboard.yml @@ -11,8 +11,9 @@ on: paths: - 'results/**' - 'leaderboard/**' + - 'suites/**' + - 'schema/**' - 'tools/generate_platforms_matrix.py' - - 'schema/platforms.json' - 'runners/*/meta.json' # Allow manual trigger from Actions tab (useful for first deploy or to @@ -37,6 +38,9 @@ jobs: - name: Validate all runner meta.json files and hashes run: python runners/validate_runners.py + - name: Validate all suite definitions + run: python runners/validate_suites.py + generate: name: Generate and deploy leaderboard runs-on: ubuntu-latest diff --git a/.github/workflows/validate_pr.yml b/.github/workflows/validate_pr.yml index d24bbf16..541af133 100644 --- a/.github/workflows/validate_pr.yml +++ b/.github/workflows/validate_pr.yml @@ -8,7 +8,8 @@ on: paths: - 'results/**' - 'runners/**' - - 'schema/platforms.json' + - 'suites/**' + - 'schema/**' - 'tools/generate_platforms_matrix.py' - 'README.md' - 'leaderboard/site/**' @@ -89,6 +90,29 @@ jobs: python tools/generate_platforms_matrix.py --check echo "::endgroup::" + validate-suites: + name: Validate suite definitions + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Install dependencies + run: pip install jsonschema + + # Always validate every suite (and re-validate on schema changes too). + # This catches drift introduced by shared changes — e.g. a + # suite.schema.json edit that breaks an unrelated existing suite. 
+ - name: Validate all suite folders (drift check) + run: | + echo "::group::Validating every suite folder in the repo" + python runners/validate_suites.py + echo "::endgroup::" + validate: name: Validate result submissions runs-on: ubuntu-latest @@ -225,4 +249,47 @@ jobs: # extra files to leaderboard/site/test/ to widen coverage; the # glob below picks them up automatically. - name: Run leaderboard frontend tests - run: node --test leaderboard/site/test/*.test.mjs \ No newline at end of file + run: node --test leaderboard/site/test/*.test.mjs + + python-tests: + name: Python unit tests (serve + skill) + runs-on: ubuntu-latest + # Lightweight checks for the FastAPI serve layer and the OpenClaw skill + # entry point. No GPU, no real model — everything is mocked. Tests are + # opt-in per package so missing deps in one folder don't take the rest + # of the suite down with them. + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Install test dependencies + # numpy is pulled in transitively by loadgen (imported when serve.server + # touches runners.benchmark_runner). Keep this list lean — these are the + # only packages required to *collect and run* the unit tests; no torch, + # no vendor SDKs, no real runner. + run: | + pip install --quiet pytest pydantic fastapi httpx pyyaml jsonschema numpy + + - name: Run serve unit tests + run: | + if [ -d serve/tests ]; then + echo "::group::pytest serve/tests" + python -m pytest serve/tests -q --no-header --color=no + echo "::endgroup::" + else + echo "serve/tests/ not present — skipping." + fi + + - name: Run OpenClaw skill unit tests + run: | + if [ -d openclaw_skill/tests ]; then + echo "::group::pytest openclaw_skill/tests" + python -m pytest openclaw_skill/tests -q --no-header --color=no + echo "::endgroup::" + else + echo "openclaw_skill/tests/ not present — skipping." 
+ fi \ No newline at end of file diff --git a/.gitignore b/.gitignore index 07e5795e..36e1481f 100644 --- a/.gitignore +++ b/.gitignore @@ -12,11 +12,20 @@ env/ # ── Editor / IDE ──────────────────────────────────────────────────────────── .idea/ .vscode/ +.cursor/ *.swp *.swo *~ *.tmp .DS_Store +.aider* +.envrc +.direnv/ + +# ── Node / frontend tooling ───────────────────────────────────────────────── +node_modules/ +.eslintcache +npm-debug.log* # ── Test / lint caches ────────────────────────────────────────────────────── .pytest_cache/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6ff8d5d1..a3044cdc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -320,6 +320,21 @@ CI then re-runs the schema validator and the runner-folder integrity check. When both pass and a contributor reviews the diff, the PR is merged and your result shows up on the leaderboard on the next site build. +### Optional: preview the leaderboard locally + +The static site is generated from `results/` by `leaderboard/generate.py`. +After dropping your result into `results/community//`, you can +preview the final UI before opening the PR: + +```bash +python leaderboard/generate.py # writes leaderboard/site/leaderboard.js + api/ +python -m http.server -d leaderboard/site 8000 # serve the static site +# open http://localhost:8000 +``` + +Both `leaderboard.js` and `leaderboard/site/api/` are gitignored — the GitHub +Actions workflow regenerates them on every merge to `main`. 
+ ### Alternative: open a submission issue (no git required) If you'd rather not use git, paste your `result.json` into a diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 343a3132..98cddbca 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -32,13 +32,14 @@ AccelMark/ │ ├── loadgen.py ← Shared timing and measurement engine │ └── types.py ← InferenceResult, SampleRecord ├── suites/ -│ ├── suite_A/suite.json + requests.jsonl -│ ├── suite_B/suite.json + requests.jsonl -│ ├── suite_C/suite.json + suite.py + requests.jsonl -│ ├── suite_D/suite.json + requests.jsonl -│ ├── suite_E/suite.json + suite.py + requests.jsonl -│ ├── suite_F/suite.json + requests.jsonl -│ └── suite_G/suite.json + requests.jsonl +│ ├── suite_A/suite.json +│ ├── suite_B/suite.json +│ ├── suite_C/suite.json + suite.py ← suite.py is optional; only C and E ship one +│ ├── suite_D/suite.json +│ ├── suite_E/suite.json + suite.py +│ ├── suite_F/suite.json +│ └── suite_G/suite.json +│ (request data lives in datasets/, referenced by "dataset" in suite.json) ├── datasets/ │ ├── sharegpt_standard_v1/requests.jsonl ← 500 prompts, ~280/310 tok │ ├── sharegpt_longctx_v1/requests.jsonl ← 200 prompts, ~28K input tok (Suite D) @@ -554,12 +555,15 @@ descriptions and distributions. If you need a custom distribution: 1. Create `datasets/{your_dataset}_v1/requests.jsonl` -2. Create `datasets/{your_dataset}_v1/README.md` +2. Create `datasets/{your_dataset}_v1/README.md` (must document source + + upstream license — see `datasets/README.md`) 3. Set `"dataset": "{your_dataset}_v1"` in your suite.json -If your suite needs a custom dataset only used by that suite, you can -also place `requests.jsonl` directly in `suites/suite_X/` — the -benchmark runner checks there as a fallback. +The `dataset` field is **required** — `BenchmarkRunner._resolve_requests_path` +loads `datasets/<dataset>/requests.jsonl` and raises `FileNotFoundError` if it +cannot find the file. 
Earlier versions allowed putting `requests.jsonl` +directly under `suites/suite_X/`; that fallback has been removed in favor +of the immutable, versioned `datasets/` layout. Dataset format (one JSON object per line): ```json @@ -622,6 +626,38 @@ not shown on the main leaderboard. --- +## Adding a new scenario type + +If you need a scenario name that none of `accuracy / offline / online / +interactive / sustained / speculative / burst` covers, you can register +one without forking the dispatch logic: + +1. Open `runners/benchmark_runner.py` and add a row to + `_SCENARIO_REGISTRY` near the top of the file: + + ```python + "your_scenario": ScenarioSpec( + name="your_scenario", + inference_kind="streaming", # or "offline" + needs_streaming=True, # require SUPPORTS_STREAMING? + use_async=True, # passed to load_model() + merge_key="your_scenario", # None = no-merge (e.g. accuracy) + ), + ``` + +2. If the scenario needs special LoadGen behaviour (e.g. `sustained`), + add a branch under "Run benchmark" inside `_run_single_scenario`. + +3. List the new scenario name in your suite's + `scenarios.{default,extra}` array — the merge order is derived from + the registry automatically. + +Without a registry entry the base class falls back to a streaming +inference path with `merge_key = <scenario name>`. Register an entry whenever +you want the scenario to be treated differently (offline, no merge, etc.). 
+ +--- + ## Suite plugin system Suites with custom orchestration logic (multiple subprocesses, special @@ -1098,6 +1134,6 @@ python runners/validate_submission.py --dir /tmp/accelmark_test/ ## Questions and Support - **Bug in LoadGen or schema:** Open a GitHub Issue -- **New suite proposal:** Open a GitHub Issue with the "Request new suite" template +- **New suite proposal:** Open a GitHub Issue with the [**Propose a new suite**](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=new_suite.md) template - **New platform support:** Open a PR with a working platform script and at least one verified result - **Leaderboard question:** Check `leaderboard/generate.py` — it's well-commented \ No newline at end of file diff --git a/NOTICE b/NOTICE new file mode 100644 index 00000000..d904b638 --- /dev/null +++ b/NOTICE @@ -0,0 +1,71 @@ +AccelMark +Copyright 2024-2026 Juhao Liang and The AccelMark Contributors + +This product includes software developed as part of the AccelMark project +(https://github.com/JuhaoLiang1997/AccelMark). + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +================================================================================ +Third-party bundled data +================================================================================ + +The AccelMark source tree includes a small amount of third-party data so that +benchmark runs are fully reproducible without network access. Each bundled +dataset retains its upstream license; the Apache 2.0 license above covers only +the AccelMark code, schemas, and configuration around it. + +-------------------------------------------------------------------------------- +1. 
datasets/sharegpt_standard_v1/requests.jsonl (500 prompts) + datasets/sharegpt_edge_v1/requests.jsonl (500 prompts) + datasets/sharegpt_longctx_v1/requests.jsonl (200 prompts) +-------------------------------------------------------------------------------- + + Derived from the ShareGPT GPT-4 conversational dataset curated by: + + shibing624/sharegpt_gpt4 + https://huggingface.co/datasets/shibing624/sharegpt_gpt4 + License: CC BY 4.0 + (https://creativecommons.org/licenses/by/4.0/) + + The upstream corpus was assembled from publicly shared ChatGPT/GPT-4 + conversations. AccelMark's variants are filtered subsets used as fixed + benchmark inputs; no derivation is intended as the authoritative copy. + + Attribution: shibing624/sharegpt_gpt4 contributors, distributed under CC BY 4.0. + + See datasets//README.md for the per-subset filtering criteria and + token statistics. + +-------------------------------------------------------------------------------- +2. schema/accuracy_subset.jsonl (100 multiple-choice items) +-------------------------------------------------------------------------------- + + A 100-question subset of MMLU (Massive Multitask Language Understanding): + + Hendrycks, D., Burns, C., Basart, S., Zou, A., Mazeika, M., Song, D., + & Steinhardt, J. (2021). "Measuring Massive Multitask Language + Understanding." International Conference on Learning Representations. + https://arxiv.org/abs/2009.03300 + https://github.com/hendrycks/test + + License: MIT + (https://opensource.org/licenses/MIT) + + AccelMark uses this subset purely as an accuracy gate (model-quality + sanity check) — it is NOT a measurement of MMLU performance. The subset + is immutable; see CONTRIBUTING.md "A few rules". 
+ +================================================================================ +Third-party software dependencies +================================================================================ + +AccelMark's Python runtime dependencies (jsonschema, numpy, pyyaml, …) and +the framework backends invoked by each runner (vLLM, SGLang, mlx-lm, +vllm-ascend, vllm-rocm, vllm-tpu, vllm-musa, …) retain their own licenses. +See each runner's requirements.txt for pinned versions; see the upstream +projects for the corresponding license terms. diff --git a/README.md b/README.md index 7fac6641..5fb2f691 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@

Live Leaderboard - License: MIT + License: Apache 2.0 Contributions welcome

@@ -23,6 +23,16 @@ Development

+

+ AccelMark framework pipeline: five stages — workload suites, pinned execution, validation, publish, consume — with a community contribution loop closing back to the suites. +

+ +

+ From workload spec to published result — every row on the leaderboard carries its runner hash, environment fingerprint, and accuracy receipt. +

+ --- ## Why AccelMark? @@ -53,9 +63,11 @@ python run.py --runner nvidia_vllm_47f5d58e --suite suite_A # 4. Submit your result — open a pull request: # git checkout -b submit/ -# cp results/your-result.json results/community//result.json -# git add results/ env_info.json && git commit -m "results: " +# git add results/community// && git commit -m "results: " # gh pr create # or open via the GitHub web UI +# +# is the directory auto-created by run.py — it already contains +# your result.json and env_info.json; no manual file moves are needed. ``` See [CONTRIBUTING.md](CONTRIBUTING.md) for the full guide. If you'd rather skip the PR workflow, [open a submission issue](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md) instead and a bot will draft the PR for you. @@ -80,6 +92,18 @@ See [suites/README.md](suites/README.md) for full specs, time budgets, SLA defin --- +## Currently on the leaderboard + +

+ Chips currently on the AccelMark leaderboard, sized by submission count and coloured by vendor — NVIDIA, Huawei Ascend, Google TPU, Moore Threads, and Apple. +

+ +A snapshot of accelerators that have at least one submission on the leaderboard. Tile size is proportional to submission count; colour denotes vendor. See the [**live leaderboard**](https://juhaoliang1997.github.io/AccelMark) for current rankings, per-suite breakdowns, and the underlying `result.json` files. + +--- + ## Supported platforms Reference runners live under `runners/` (see each folder’s `meta.json`). The table below is **auto-generated** from each runner's `meta.json` — never hand-edited. Add a runner, declare its `suite_support` in `meta.json`, and the matrix updates on its own. @@ -140,8 +164,8 @@ If you use AccelMark results in research, please cite: ```bibtex @misc{accelmark2026, - title = {AccelMark: Open Benchmark Leaderboard for AI Accelerators on LLM Workloads}, - author = {Liang, Juhao and {The AccelMark Contributors}}, + title = {Beyond NVIDIA! A Multi-Regime Framework for Benchmarking Heterogeneous AI Accelerators}, + author = {Liang, Juhao and Zhang, Zhiyuan and Li, Siyu and Lin, Zhihang and Yu, Minchen and Zeng, Li and Chen, Zizhong and Sun, Ruoyu and Wang, Benyou}, year = {2026}, url = {https://github.com/JuhaoLiang1997/AccelMark} } @@ -151,5 +175,6 @@ If you use AccelMark results in research, please cite: ## License -MIT — see [LICENSE](LICENSE). -Submitted benchmark results are contributed under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/). \ No newline at end of file +Apache 2.0 — see [LICENSE](LICENSE). +Submitted benchmark results are contributed under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/). +Bundled third-party data (datasets, accuracy subsets) keeps its upstream license — see [NOTICE](NOTICE). 
\ No newline at end of file diff --git a/datasets/README.md b/datasets/README.md index 737d2318..82c06df4 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -41,3 +41,19 @@ Each line in `requests.jsonl`: "prompt_type": "conversational" } ``` + +## License & attribution + +Bundled prompt data keeps its **upstream license**, not AccelMark's +Apache-2.0. The three ShareGPT-derived datasets shipped here are +redistributed under **[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)** +from [shibing624/sharegpt_gpt4](https://huggingface.co/datasets/shibing624/sharegpt_gpt4). + +If you add a new dataset, its `README.md` **must** include: + +1. The upstream source (URL or HuggingFace ID). +2. The upstream license (link to the canonical text). +3. A citation block if the upstream authors request one. + +See [`../NOTICE`](../NOTICE) for the full third-party attribution that ships +with the repository. diff --git a/datasets/sharegpt_edge_v1/README.md b/datasets/sharegpt_edge_v1/README.md index 2d53ad7e..0626fd46 100644 --- a/datasets/sharegpt_edge_v1/README.md +++ b/datasets/sharegpt_edge_v1/README.md @@ -5,18 +5,30 @@ Short-turn ShareGPT conversational prompts. Used by Suite F (consumer/edge bench Filtered from `shibing624/sharegpt_gpt4` to retain only short-turn exchanges, producing a distribution representative of interactive consumer inference workloads. 
-| Field | Value | -|-------------------|----------------------------------| -| Source | shibing624/sharegpt_gpt4 | -| Prompts | 500 | -| Input tokens p50 | ~95 | -| Input tokens p99 | ~600 | -| Output tokens p50 | ~150 | -| Output tokens p99 | ~400 | -| Type | Conversational, single-turn | +| Field | Value | +|-------------------|--------------------------------------------------------------------------------------| +| Source | [shibing624/sharegpt_gpt4](https://huggingface.co/datasets/shibing624/sharegpt_gpt4) | +| Prompts | 500 | +| Input tokens p50 | ~95 | +| Input tokens p99 | ~600 | +| Output tokens p50 | ~150 | +| Output tokens p99 | ~400 | +| Type | Conversational, single-turn | ## Difference from sharegpt_standard_v1 `sharegpt_standard_v1` (Suites A, B, C, and E) has p50 input ~280 tokens and p50 output ~310 tokens. `sharegpt_edge_v1` uses shorter prompts to keep benchmark runtime practical on consumer GPUs -and to reflect the latency-sensitive interactive use cases they are typically deployed for. \ No newline at end of file +and to reflect the latency-sensitive interactive use cases they are typically deployed for. + +## License & attribution + +The prompts are derived from `shibing624/sharegpt_gpt4` and are redistributed +under the upstream license, **[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)**. + +Apache-2.0 (the AccelMark repository license) covers only the AccelMark code, +schemas, and selection logic — not the prompt text itself. See [`../../NOTICE`](../../NOTICE) +for the full third-party attribution. + +If you use these prompts in research, please cite the upstream dataset and +this repository. 
\ No newline at end of file diff --git a/datasets/sharegpt_longctx_v1/README.md b/datasets/sharegpt_longctx_v1/README.md index 72f86980..6d70a1ef 100644 --- a/datasets/sharegpt_longctx_v1/README.md +++ b/datasets/sharegpt_longctx_v1/README.md @@ -4,8 +4,23 @@ Long-context prompts for Suite D (~28K-token inputs; `max_model_len` 30,208 in ` | Field | Value | |---|---| -| Source | Long-context subset of ShareGPT | +| Source | [shibing624/sharegpt_gpt4](https://huggingface.co/datasets/shibing624/sharegpt_gpt4) (long-context subset) | | Prompts | 200 | | Input tokens p50 | ~28,000 | | Output tokens p50 | ~256 (suite caps generation) | | Type | Document QA, long-form input | + +## License & attribution + +The prompts are derived from the same ShareGPT GPT-4 corpus as +`sharegpt_standard_v1` and are redistributed under the upstream license, +**[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)**. Long-context +items are selected by tokenized input length; no additional editorial +modification beyond filtering is applied. + +Apache-2.0 (the AccelMark repository license) covers only the AccelMark code, +schemas, and selection logic — not the prompt text itself. See [`../../NOTICE`](../../NOTICE) +for the full third-party attribution. + +If you use these prompts in research, please cite the upstream dataset and +this repository. diff --git a/datasets/sharegpt_standard_v1/README.md b/datasets/sharegpt_standard_v1/README.md index 2f1655a6..f2413ee2 100644 --- a/datasets/sharegpt_standard_v1/README.md +++ b/datasets/sharegpt_standard_v1/README.md @@ -4,8 +4,20 @@ Standard ShareGPT conversational prompts. Used by Suite A, B, C, E. 
| Field | Value | |---|---| -| Source | shibing624/sharegpt_gpt4 | +| Source | [shibing624/sharegpt_gpt4](https://huggingface.co/datasets/shibing624/sharegpt_gpt4) | | Prompts | 500 | | Input tokens p50 | ~280 | | Output tokens p50 | ~310 | | Type | Conversational, single-turn | + +## License & attribution + +The prompts are derived from `shibing624/sharegpt_gpt4` and are redistributed +under the upstream license, **[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)**. + +Apache-2.0 (the AccelMark repository license) covers only the AccelMark code, +schemas, and selection logic — not the prompt text itself. See [`../../NOTICE`](../../NOTICE) +for the full third-party attribution. + +If you use these prompts in research, please cite the upstream dataset and +this repository. diff --git a/docs/assets/chip-cloud.png b/docs/assets/chip-cloud.png new file mode 100644 index 00000000..e3ccccec Binary files /dev/null and b/docs/assets/chip-cloud.png differ diff --git a/docs/assets/framework-overview.png b/docs/assets/framework-overview.png new file mode 100644 index 00000000..ddd06573 Binary files /dev/null and b/docs/assets/framework-overview.png differ diff --git a/leaderboard/site/assets/data/suite-meta.js b/leaderboard/site/assets/data/suite-meta.js new file mode 100644 index 00000000..b0430207 --- /dev/null +++ b/leaderboard/site/assets/data/suite-meta.js @@ -0,0 +1,230 @@ +// suite-meta.js — editorial metadata for each suite shown on the leaderboard. +// +// This file is pure copy + display rules ("editorial"). The runtime +// benchmark contract (model_id, dataset, scenario list, …) lives in +// `suites//suite.json` on the Python side and is injected into +// the page as `window.SUITE_SPECS`. Splitting the two keeps editorial +// edits (taglines, descriptions, primary-metric units) out of the diff +// when the actual benchmark contract changes — and vice versa. 
+// +// Consumers should keep importing `SUITE_META` from `../data.js`; this +// file is its single source of truth and only exists to keep `data.js` +// at a manageable size. + +export const SUITE_META = { + suite_A: { + letter: "A", + title: "Single-chip throughput", + tagline: "How fast can one accelerator serve an 8B model?", + description: + "The canonical bandwidth-bound regime. 8B Llama on a single accelerator is small enough to fit comfortably in HBM, large enough that decode is memory-bandwidth-bound rather than compute-bound. This is the bread-and-butter serving workload that anchors most other LLM benchmarks, and the suite where vendor marketing numbers usually land.", + primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" }, + workload: { + model: "meta-llama/Meta-Llama-3-8B-Instruct", + chips: "1", + precision: "BF16", + dataset: "sharegpt_standard_v1", + inputTokens: "~280", + outputTokens: "~310", + }, + scenarios: [ + { name: "accuracy", isExtra: false, + desc: "MMLU subset score against the baseline. Gate for a valid submission." }, + { name: "offline", isExtra: false, + desc: "Max throughput with all requests batched at once.", + metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } }, + { name: "online", isExtra: false, + desc: "Highest QPS that meets the 500 ms p99 TTFT SLA under Poisson arrivals.", + metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } }, + { name: "interactive", isExtra: true, + desc: "Single-stream first-token latency. No concurrency.", + metric: { key: "interactive_ttft_p99", label: "TTFT p99", direction: "asc", unit: "ms", decimals: 0 } }, + { name: "sustained", isExtra: true, + desc: "30 min fixed-concurrency load. 
Reports throughput stability and throttle ratio.", + metric: { key: "sustained_throughput", label: "sustained throughput", direction: "desc", unit: "tokens/sec" } }, + { name: "speculative", isExtra: true, + desc: "Offline workload with a 1B draft model loaded. Reports acceptance rate." }, + { name: "burst", isExtra: true, + desc: "TTFT p99 during 5x burst windows versus steady. KV pressure test." }, + ], + }, + suite_B: { + letter: "B", + title: "Multi-chip throughput", + tagline: "Large-model serving across multiple chips.", + description: + "70B Llama distributed across multiple accelerators. Two effects compound: the model itself no longer fits on one chip (capacity-bound) and tensor-parallel inference shards KV cache, activations, and all-reduce traffic over the interconnect. Both the framework's TP path and the chip's NVLink / Infinity Fabric / scale-out fabric come under test here.", + primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" }, + workload: { + model: "meta-llama/Meta-Llama-3-70B-Instruct", + chips: "flexible (typ. 4 / 8)", + precision: "BF16", + dataset: "sharegpt_standard_v1", + inputTokens: "~280", + outputTokens: "~310", + }, + scenarios: [ + { name: "accuracy", isExtra: false, + desc: "MMLU subset score against the 70B baseline." }, + { name: "offline", isExtra: false, + desc: "Aggregate throughput across N chips serving the 70B model.", + metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } }, + { name: "online", isExtra: false, + desc: "Highest QPS that meets the 500 ms p99 TTFT SLA at 70B scale.", + metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } }, + { name: "interactive", isExtra: true, + desc: "Single-stream TTFT at 70B. Decode-bound." 
}, + { name: "sustained", isExtra: true, + desc: "30 min fixed load; concurrency 4 (70B leaves less KV headroom than 8B).", + metric: { key: "sustained_throughput", label: "sustained throughput", direction: "desc", unit: "tokens/sec" } }, + { name: "burst", isExtra: true, + desc: "Burst vs steady TTFT p99 at 70B scale." }, + ], + }, + suite_C: { + letter: "C", + title: "Quantization efficiency", + tagline: "Quality-adjusted throughput across precision formats.", + description: + "The bandwidth-to-compute transition. The same 8B model is run at five precision formats (BF16, FP8, W8A8, W8A16, W4A16); quality efficiency multiplies throughput speedup by the accuracy drop so a chip can't trade quality for speed silently. Reveals which chips have working low-precision tensor cores and which fall back to BF16 on the same instruction.", + primary: { key: "quant_quality_eff", label: "quality efficiency", direction: "desc", unit: "" }, + workload: { + model: "meta-llama/Llama-3.1-8B-Instruct", + chips: "1", + precision: "BF16, FP8, W8A8, W8A16, W4A16", + dataset: "sharegpt_standard_v1", + inputTokens: "~280", + outputTokens: "~310", + }, + scenarios: [ + { name: "accuracy", isExtra: false, + desc: "Per-format accuracy gate (each format has its own threshold)." }, + { name: "offline (×5 formats)", isExtra: false, + desc: "Offline throughput at each precision. Quality efficiency = throughput × accuracy.", + metric: { key: "quant_quality_eff", label: "quality efficiency", direction: "desc", unit: "" } }, + { name: "online", isExtra: true, + desc: "Online QPS sweep per format. Extra: 5 formats × QPS levels is expensive." }, + { name: "sustained", isExtra: true, + desc: "15 min sustained load per format." }, + ], + }, + suite_D: { + letter: "D", + title: "Long-context inference", + tagline: "28K-token prefill, compute-bound regime.", + description: + "Compute-bound prefill. 
~28K-token prompts push arithmetic intensity past the roofline knee, so chips with more raw FLOPS pull ahead of bandwidth-rich ones. The output cap (256 tokens) keeps decode short on purpose; this suite isolates the prefill side and is where Suite A's bandwidth-bound rankings begin to invert.", + primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" }, + workload: { + model: "meta-llama/Llama-3.1-8B-Instruct", + chips: "1", + precision: "BF16; max_model_len 30,208", + dataset: "sharegpt_longctx_v1", + inputTokens: "~28K", + outputTokens: "≤256", + }, + scenarios: [ + { name: "accuracy", isExtra: false, + desc: "MMLU gate against the 8B Llama-3.1 baseline." }, + { name: "offline", isExtra: false, + desc: "Offline throughput at ~28K input tokens. Prefill-bound, tests raw FLOPS.", + metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } }, + { name: "interactive", isExtra: true, + desc: "Long-context TTFT (~11 s per request at 28K). p90 is primary." }, + { name: "online", isExtra: true, + desc: "Sub-QPS levels (0.5 / 1 / 2). Rate-bound at long context." }, + { name: "sustained", isExtra: true, + desc: "30 min sustained at concurrency 8. Throttle ratio is the headline." }, + { name: "speculative", isExtra: true, + desc: "Long-context offline with 1B draft model. Prefill-bound speculative." }, + ], + }, + suite_E: { + letter: "E", + title: "Multi-chip scaling efficiency", + tagline: "How well does 8B throughput scale to 2 / 4 / 8 chips?", + description: + "The Amdahl penalty in numbers. The same 8B model runs at 1×, 2×, and (optionally) 4× / 8× chip counts; the headline metric is 2× scaling efficiency = T_2× / (2 · T_1×). 
Reveals NVLink / Infinity Fabric / PCIe ceilings, and exposes flagships whose per-chip throughput grew faster than the interconnect did.", + primary: { key: "scaling_efficiency_2x", label: "2× scaling efficiency", direction: "desc", unit: "%", scale: 100, decimals: 1 }, + workload: { + model: "meta-llama/Meta-Llama-3-8B-Instruct", + chips: "1× / 2× required; 4× / 8× optional", + precision: "BF16", + dataset: "sharegpt_standard_v1", + inputTokens: "~280", + outputTokens: "~310", + }, + scenarios: [ + { name: "offline (1× / 2×)", isExtra: false, + desc: "Two-chip scaling efficiency vs single chip. Required for a valid submission.", + metric: { key: "scaling_efficiency_2x", label: "2× scaling efficiency", direction: "desc", unit: "%", scale: 100, decimals: 1 } }, + { name: "offline (4×)", isExtra: false, + desc: "Four-chip scaling efficiency. Optional but commonly reported.", + metric: { key: "scaling_efficiency_4x", label: "4× scaling efficiency", direction: "desc", unit: "%", scale: 100, decimals: 1 } }, + { name: "offline (8×)", isExtra: false, + desc: "Eight-chip scaling. Communication overhead is the binding constraint here." }, + ], + }, + suite_F: { + letter: "F", + title: "Edge / consumer hardware", + tagline: "Small models on single-GPU edge hardware.", + description: + "The pure-bandwidth lower bound. Qwen2.5-0.5B with ~95-token prompts strips away residual compute interference and short-circuits prefill, exposing raw HBM headroom and software overhead. 
Commodity GPUs (RTX 4090, A6000) tend to be most competitive per dollar here, and the suite doubles as a regression check for low-VRAM deployments.", + primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" }, + workload: { + model: "Qwen/Qwen2.5-0.5B-Instruct", + chips: "1 (≥4 GB VRAM)", + precision: "BF16", + dataset: "sharegpt_edge_v1", + inputTokens: "~95", + outputTokens: "~150", + }, + scenarios: [ + { name: "accuracy", isExtra: false, + desc: "MMLU gate against the 0.5B baseline." }, + { name: "offline", isExtra: false, + desc: "Offline throughput on the edge dataset (~95 tok prompts).", + metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } }, + { name: "online", isExtra: false, + desc: "Max QPS at the standard 500 ms p99 TTFT SLA.", + metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } }, + { name: "interactive", isExtra: false, + desc: "Single-stream TTFT on consumer hardware.", + metric: { key: "interactive_ttft_p99", label: "TTFT p99", direction: "asc", unit: "ms", decimals: 0 } }, + { name: "sustained", isExtra: true, + desc: "15 min sustained load (shorter than datacenter suites)." }, + ], + }, + suite_G: { + letter: "G", + title: "Mixture-of-Experts (MoE)", + tagline: "Sparse routing; bandwidth-bound multi-chip serving.", + description: + "Sparse activation. Mixtral 8×7B activates only 2 of 8 experts per token, which keeps arithmetic intensity below dense 8B inference even at multi-chip scale. 
Chips with high aggregate HBM bandwidth (HBM3e generation) pay off here; pure-FLOPS advantages from compute-bound suites don't translate.", + primary: { key: "sustained_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" }, + workload: { + model: "mistralai/Mixtral-8x7B-Instruct-v0.1", + chips: "≥2 (auto)", + precision: "BF16", + dataset: "sharegpt_standard_v1", + inputTokens: "~280", + outputTokens: "~310", + }, + scenarios: [ + { name: "accuracy", isExtra: false, + desc: "MMLU gate against the Mixtral baseline." }, + { name: "offline", isExtra: false, + desc: "Aggregate MoE throughput. Only 2 of 8 experts activate per token.", + metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } }, + { name: "online", isExtra: false, + desc: "Max QPS under the 500 ms p99 TTFT SLA on MoE serving.", + metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } }, + { name: "interactive", isExtra: true, + desc: "Single-stream TTFT on MoE inference." }, + { name: "sustained", isExtra: true, + desc: "30 min sustained MoE load. Several chips show thermal onset on this suite.", + metric: { key: "sustained_throughput", label: "sustained throughput", direction: "desc", unit: "tokens/sec" } }, + ], + }, +}; diff --git a/leaderboard/site/assets/js/data.js b/leaderboard/site/assets/js/data.js index 470af671..8c7bcba2 100644 --- a/leaderboard/site/assets/js/data.js +++ b/leaderboard/site/assets/js/data.js @@ -9,232 +9,22 @@ import { groupBy, chipSlug, toTitleCase } from "./utils.js"; -// Each suite has a "primary metric" most relevant to a buyer's question. -// This drives default sort on the rankings page and the top-3 podium on home. +// Editorial copy and per-suite display rules live in +// `../data/suite-meta.js`. We re-export so existing consumers +// ("import { SUITE_META } from './data.js'") keep working unchanged. +// +// Why split? 
`data.js` is the runtime / view-state hub; `suite-meta.js` +// is pure editorial content (titles, taglines, descriptions, primary- +// metric units). Keeping them separate lets copy edits land without +// touching the data-processing diff, and vice versa. // // `primary.scale` multiplies raw value at display (e.g. 0.945 → 94.5 %). // `primary.decimals` overrides automatic decimal selection. -// Suite workload constants — fixed per suite definition (suites/README.md). // `inputTokens` / `outputTokens` are the dataset p50s used at benchmark // time and are NOT derived from data files; they're part of the suite // contract and only change with a suite revision. -export const SUITE_META = { - suite_A: { - letter: "A", - title: "Single-chip throughput", - tagline: "How fast can one accelerator serve an 8B model?", - description: - "The canonical bandwidth-bound regime. 8B Llama on a single accelerator is small enough to fit comfortably in HBM, large enough that decode is memory-bandwidth-bound rather than compute-bound. This is the bread-and-butter serving workload that anchors most other LLM benchmarks, and the suite where vendor marketing numbers usually land.", - primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" }, - workload: { - model: "meta-llama/Meta-Llama-3-8B-Instruct", - chips: "1", - precision: "BF16", - dataset: "sharegpt_standard_v1", - inputTokens: "~280", - outputTokens: "~310", - }, - scenarios: [ - { name: "accuracy", isExtra: false, - desc: "MMLU subset score against the baseline. Gate for a valid submission." 
}, - { name: "offline", isExtra: false, - desc: "Max throughput with all requests batched at once.", - metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } }, - { name: "online", isExtra: false, - desc: "Highest QPS that meets the 500 ms p99 TTFT SLA under Poisson arrivals.", - metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } }, - { name: "interactive", isExtra: true, - desc: "Single-stream first-token latency. No concurrency.", - metric: { key: "interactive_ttft_p99", label: "TTFT p99", direction: "asc", unit: "ms", decimals: 0 } }, - { name: "sustained", isExtra: true, - desc: "30 min fixed-concurrency load. Reports throughput stability and throttle ratio.", - metric: { key: "sustained_throughput", label: "sustained throughput", direction: "desc", unit: "tokens/sec" } }, - { name: "speculative", isExtra: true, - desc: "Offline workload with a 1B draft model loaded. Reports acceptance rate." }, - { name: "burst", isExtra: true, - desc: "TTFT p99 during 5x burst windows versus steady. KV pressure test." }, - ], - }, - suite_B: { - letter: "B", - title: "Multi-chip throughput", - tagline: "Large-model serving across multiple chips.", - description: - "70B Llama distributed across multiple accelerators. Two effects compound: the model itself no longer fits on one chip (capacity-bound) and tensor-parallel inference shards KV cache, activations, and all-reduce traffic over the interconnect. Both the framework's TP path and the chip's NVLink / Infinity Fabric / scale-out fabric come under test here.", - primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" }, - workload: { - model: "meta-llama/Meta-Llama-3-70B-Instruct", - chips: "flexible (typ. 
4 / 8)", - precision: "BF16", - dataset: "sharegpt_standard_v1", - inputTokens: "~280", - outputTokens: "~310", - }, - scenarios: [ - { name: "accuracy", isExtra: false, - desc: "MMLU subset score against the 70B baseline." }, - { name: "offline", isExtra: false, - desc: "Aggregate throughput across N chips serving the 70B model.", - metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } }, - { name: "online", isExtra: false, - desc: "Highest QPS that meets the 500 ms p99 TTFT SLA at 70B scale.", - metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } }, - { name: "interactive", isExtra: true, - desc: "Single-stream TTFT at 70B. Decode-bound." }, - { name: "sustained", isExtra: true, - desc: "30 min fixed load; concurrency 4 (70B leaves less KV headroom than 8B).", - metric: { key: "sustained_throughput", label: "sustained throughput", direction: "desc", unit: "tokens/sec" } }, - { name: "burst", isExtra: true, - desc: "Burst vs steady TTFT p99 at 70B scale." }, - ], - }, - suite_C: { - letter: "C", - title: "Quantization efficiency", - tagline: "Quality-adjusted throughput across precision formats.", - description: - "The bandwidth-to-compute transition. The same 8B model is run at five precision formats (BF16, FP8, W8A8, W8A16, W4A16); quality efficiency multiplies throughput speedup by the accuracy drop so a chip can't trade quality for speed silently. 
Reveals which chips have working low-precision tensor cores and which fall back to BF16 on the same instruction.", - primary: { key: "quant_quality_eff", label: "quality efficiency", direction: "desc", unit: "" }, - workload: { - model: "meta-llama/Llama-3.1-8B-Instruct", - chips: "1", - precision: "BF16, FP8, W8A8, W8A16, W4A16", - dataset: "sharegpt_standard_v1", - inputTokens: "~280", - outputTokens: "~310", - }, - scenarios: [ - { name: "accuracy", isExtra: false, - desc: "Per-format accuracy gate (each format has its own threshold)." }, - { name: "offline (×5 formats)", isExtra: false, - desc: "Offline throughput at each precision. Quality efficiency = throughput × accuracy.", - metric: { key: "quant_quality_eff", label: "quality efficiency", direction: "desc", unit: "" } }, - { name: "online", isExtra: true, - desc: "Online QPS sweep per format. Extra: 5 formats × QPS levels is expensive." }, - { name: "sustained", isExtra: true, - desc: "15 min sustained load per format." }, - ], - }, - suite_D: { - letter: "D", - title: "Long-context inference", - tagline: "28K-token prefill, compute-bound regime.", - description: - "Compute-bound prefill. ~28K-token prompts push arithmetic intensity past the roofline knee, so chips with more raw FLOPS pull ahead of bandwidth-rich ones. The output cap (256 tokens) keeps decode short on purpose; this suite isolates the prefill side and is where Suite A's bandwidth-bound rankings begin to invert.", - primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" }, - workload: { - model: "meta-llama/Llama-3.1-8B-Instruct", - chips: "1", - precision: "BF16; max_model_len 30,208", - dataset: "sharegpt_longctx_v1", - inputTokens: "~28K", - outputTokens: "≤256", - }, - scenarios: [ - { name: "accuracy", isExtra: false, - desc: "MMLU gate against the 8B Llama-3.1 baseline." }, - { name: "offline", isExtra: false, - desc: "Offline throughput at ~28K input tokens. 
Prefill-bound, tests raw FLOPS.", - metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } }, - { name: "interactive", isExtra: true, - desc: "Long-context TTFT (~11 s per request at 28K). p90 is primary." }, - { name: "online", isExtra: true, - desc: "Sub-QPS levels (0.5 / 1 / 2). Rate-bound at long context." }, - { name: "sustained", isExtra: true, - desc: "30 min sustained at concurrency 8. Throttle ratio is the headline." }, - { name: "speculative", isExtra: true, - desc: "Long-context offline with 1B draft model. Prefill-bound speculative." }, - ], - }, - suite_E: { - letter: "E", - title: "Multi-chip scaling efficiency", - tagline: "How well does 8B throughput scale to 2 / 4 / 8 chips?", - description: - "The Amdahl penalty in numbers. The same 8B model runs at 1×, 2×, and (optionally) 4× / 8× chip counts; the headline metric is 2× scaling efficiency = T_2× / (2 · T_1×). Reveals NVLink / Infinity Fabric / PCIe ceilings, and exposes flagships whose per-chip throughput grew faster than the interconnect did.", - primary: { key: "scaling_efficiency_2x", label: "2× scaling efficiency", direction: "desc", unit: "%", scale: 100, decimals: 1 }, - workload: { - model: "meta-llama/Meta-Llama-3-8B-Instruct", - chips: "1× / 2× required; 4× / 8× optional", - precision: "BF16", - dataset: "sharegpt_standard_v1", - inputTokens: "~280", - outputTokens: "~310", - }, - scenarios: [ - { name: "offline (1× / 2×)", isExtra: false, - desc: "Two-chip scaling efficiency vs single chip. Required for a valid submission.", - metric: { key: "scaling_efficiency_2x", label: "2× scaling efficiency", direction: "desc", unit: "%", scale: 100, decimals: 1 } }, - { name: "offline (4×)", isExtra: false, - desc: "Four-chip scaling efficiency. 
Optional but commonly reported.", - metric: { key: "scaling_efficiency_4x", label: "4× scaling efficiency", direction: "desc", unit: "%", scale: 100, decimals: 1 } }, - { name: "offline (8×)", isExtra: false, - desc: "Eight-chip scaling. Communication overhead is the binding constraint here." }, - ], - }, - suite_F: { - letter: "F", - title: "Edge / consumer hardware", - tagline: "Small models on single-GPU edge hardware.", - description: - "The pure-bandwidth lower bound. Qwen2.5-0.5B with ~95-token prompts strips away residual compute interference and short-circuits prefill, exposing raw HBM headroom and software overhead. Commodity GPUs (RTX 4090, A6000) tend to be most competitive per dollar here, and the suite doubles as a regression check for low-VRAM deployments.", - primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" }, - workload: { - model: "Qwen/Qwen2.5-0.5B-Instruct", - chips: "1 (≥4 GB VRAM)", - precision: "BF16", - dataset: "sharegpt_edge_v1", - inputTokens: "~95", - outputTokens: "~150", - }, - scenarios: [ - { name: "accuracy", isExtra: false, - desc: "MMLU gate against the 0.5B baseline." }, - { name: "offline", isExtra: false, - desc: "Offline throughput on the edge dataset (~95 tok prompts).", - metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } }, - { name: "online", isExtra: false, - desc: "Max QPS at the standard 500 ms p99 TTFT SLA.", - metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } }, - { name: "interactive", isExtra: false, - desc: "Single-stream TTFT on consumer hardware.", - metric: { key: "interactive_ttft_p99", label: "TTFT p99", direction: "asc", unit: "ms", decimals: 0 } }, - { name: "sustained", isExtra: true, - desc: "15 min sustained load (shorter than datacenter suites)." 
}, - ], - }, - suite_G: { - letter: "G", - title: "Mixture-of-Experts (MoE)", - tagline: "Sparse routing; bandwidth-bound multi-chip serving.", - description: - "Sparse activation. Mixtral 8×7B activates only 2 of 8 experts per token, which keeps arithmetic intensity below dense 8B inference even at multi-chip scale. Chips with high aggregate HBM bandwidth (HBM3e generation) pay off here; pure-FLOPS advantages from compute-bound suites don't translate.", - primary: { key: "sustained_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" }, - workload: { - model: "mistralai/Mixtral-8x7B-Instruct-v0.1", - chips: "≥2 (auto)", - precision: "BF16", - dataset: "sharegpt_standard_v1", - inputTokens: "~280", - outputTokens: "~310", - }, - scenarios: [ - { name: "accuracy", isExtra: false, - desc: "MMLU gate against the Mixtral baseline." }, - { name: "offline", isExtra: false, - desc: "Aggregate MoE throughput. Only 2 of 8 experts activate per token.", - metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } }, - { name: "online", isExtra: false, - desc: "Max QPS under the 500 ms p99 TTFT SLA on MoE serving.", - metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } }, - { name: "interactive", isExtra: true, - desc: "Single-stream TTFT on MoE inference." }, - { name: "sustained", isExtra: true, - desc: "30 min sustained MoE load. 
Several chips show thermal onset on this suite.", - metric: { key: "sustained_throughput", label: "sustained throughput", direction: "desc", unit: "tokens/sec" } }, - ], - }, -}; +export { SUITE_META } from "../data/suite-meta.js"; +import { SUITE_META } from "../data/suite-meta.js"; // House style: headline-style Title Case for all suite titles so they // look correct everywhere they surface (home cards, rankings hero, diff --git a/pyproject.toml b/pyproject.toml index ea8d476d..bbdcf567 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "accelmark" version = "0.1.0" description = "Open benchmark leaderboard for AI accelerators on LLM workloads" readme = "README.md" -license = "MIT" +license = "Apache-2.0" license-files = ["LICENSE"] requires-python = ">=3.10" authors = [ @@ -31,7 +31,7 @@ classifiers = [ "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: System :: Benchmark", - "License :: OSI Approved :: MIT License", + "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -60,4 +60,20 @@ requires = ["setuptools>=68"] build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] -include = ["loadgen*"] +# AccelMark is primarily a clone-and-run repository (see `python run.py` +# in README.md). `pip install -e .` is supported so contributors can pick +# up the shared deps and import the helper packages (`loadgen`, the base +# `runners.benchmark_runner` class, `serve.adapter` / `serve.capacity`, +# and the `openclaw_skill` entry points) from anywhere — but there is no +# top-level `accelmark` package to invoke via `python -m`. 
+include = [ + "loadgen*", + "runners*", + "serve*", + "openclaw_skill*", +] +exclude = [ + "tests*", + "*.tests", + "*.tests.*", +] diff --git a/runners/README.md b/runners/README.md index aaf4d812..c5180111 100644 --- a/runners/README.md +++ b/runners/README.md @@ -89,13 +89,15 @@ class MyFrameworkRunner(BenchmarkRunner): # You almost never need to restrict this below ["bf16", "fp16", "fp32"]. SUPPORTED_PRECISIONS = ["bf16", "fp16", "fp32"] - # Declare supported quantization formats for Suite C. - # BF16 is always included. List only formats your framework can load. - # FP8 requires native FP8 hardware (H100, MI300X). - SUPPORTED_QUANTIZATIONS = ["fp8", "w8a8", "w8a16", "w4a16"] # H100 full support - # SUPPORTED_QUANTIZATIONS = ["w8a8", "w8a16", "w4a16"] # A100 (no FP8) - # SUPPORTED_QUANTIZATIONS = ["w8a8", "w4a16"] # ROCm example - # SUPPORTED_QUANTIZATIONS = [] # Apple MLX + # Declare the framework's quantization backends. Suite C cross-references + # each precision_model_map entry's engine_kwargs.quantization against this + # list to decide which formats to run on this runner. The strings must + # match the engine's own backend names (e.g. vLLM's `quantization=` kwarg), + # NOT suite-level precision tags like W8A8/FP8. 
+ SUPPORTED_QUANTIZATION_BACKENDS = ["fp8", "compressed-tensors", "gptq_marlin"] # vLLM full + # SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors", "gptq_marlin"] # A100 (no FP8) + # SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors", "gptq_marlin"] # ROCm + # SUPPORTED_QUANTIZATION_BACKENDS = [] # Apple MLX def load_model(self, model_path: str, parallelism: dict) -> None: from myframework import Engine @@ -268,7 +270,7 @@ Override these class attributes in your runner to declare what the framework sup | `SUPPORTS_ONLINE` | `True` | Set `False` if framework cannot handle concurrent requests | | `SUPPORTS_MULTI_CHIP` | `True` | Set `False` if no tensor parallelism — tensor_parallel_size from runner config and CLI is ignored; runner always uses 1 chip | | `SUPPORTED_PRECISIONS` | `["bf16", "fp16", "fp32"]` | Maximum compute precisions on capable hardware. Hardware detection automatically restricts this (V100 → FP16, MI100 → FP16, M1 → FP16). Only restrict below the default if your framework genuinely cannot use a precision regardless of hardware. | -| `SUPPORTED_QUANTIZATIONS` | `[]` | Quantization formats supported for Suite C. Use uppercase strings: `"FP8"`, `"W8A8"`, `"W8A16"`, `"W4A16"`. BF16 is always supported and does not need to be listed. Empty list means this runner skips all quantized formats in Suite C. | +| `SUPPORTED_QUANTIZATION_BACKENDS` | `[]` | Framework-level quantization backends Suite C can use, named after the engine's own identifiers (vLLM examples: `"fp8"`, `"compressed-tensors"`, `"gptq_marlin"`, `"awq"`). NOT the suite precision tags (`W8A8`, `FP8`, `W4A16` …). BF16/FP16/FP32 are always allowed and must not be listed. Empty list means this runner skips every quantized entry in Suite C's `precision_model_map`. 
| --- diff --git a/runners/benchmark_runner.py b/runners/benchmark_runner.py index 5b0c2747..747071af 100644 --- a/runners/benchmark_runner.py +++ b/runners/benchmark_runner.py @@ -82,12 +82,43 @@ class InferenceRequest: extra: dict = dataclass_field(default_factory=dict) -# ── Scenario constants ──────────────────────────────────────────────────────── +# ── Scenario registry ──────────────────────────────────────────────────────── +# +# Each ScenarioSpec describes how the base class should drive a scenario name +# at runtime. Adding a new scenario means appending one row to +# ``_SCENARIO_REGISTRY`` (and, if needed, implementing a new inference method +# on the runner) — no edits to the if/elif ladders or merge order constants. + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class ScenarioSpec: + """Declarative contract for one scenario name.""" + + name: str + inference_kind: str # "offline" | "streaming" + needs_streaming: bool # raise an error if SUPPORTS_STREAMING is False + use_async: bool # passed to load_model() as use_async + merge_key: Optional[str] # key under metrics dict to merge (None = no-merge, e.g. accuracy) + + +_SCENARIO_REGISTRY: "dict[str, ScenarioSpec]" = { + "accuracy": ScenarioSpec("accuracy", "offline", False, False, None), + "offline": ScenarioSpec("offline", "offline", False, False, "offline"), + "online": ScenarioSpec("online", "streaming", True, True, "online"), + "interactive": ScenarioSpec("interactive", "streaming", True, True, "interactive"), + "sustained": ScenarioSpec("sustained", "streaming", True, True, "sustained"), + "speculative": ScenarioSpec("speculative", "offline", False, False, "speculative"), + "burst": ScenarioSpec("burst", "streaming", True, True, "burst"), + "training": ScenarioSpec("training", "offline", False, False, "training"), +} # Canonical order in which scenario metrics are merged into a suite result. 
-_MERGE_SCENARIO_KEYS = [ - "offline", "online", "interactive", "sustained", "training", - "speculative", "burst", +# Derived from the registry so adding a new scenario only requires editing +# the registry above. +_MERGE_SCENARIO_KEYS: list[str] = [ + spec.merge_key for spec in _SCENARIO_REGISTRY.values() if spec.merge_key ] # ── Base class ──────────────────────────────────────────────────────────────── @@ -315,13 +346,9 @@ def get_peak_memory_gb(self) -> Optional[float]: """ return None - def format_prompt(self, prompt: str) -> str: - """ - Apply chat template or other prompt formatting. - Override if the platform requires specific prompt formatting. - Default: return prompt unchanged. - """ - return prompt + # ``format_prompt`` is defined further down (it depends on self.tokenizer + # which subclasses populate during load_model). Keeping it as a single + # source of truth avoids two definitions on the same class. def get_supported_precisions( self, chip_name: str, env_info: dict @@ -933,6 +960,152 @@ def parse_args(self) -> argparse.Namespace: return args + # ── Scenario dispatch helpers ──────────────────────────────────────────── + + @classmethod + def _scenario_spec(cls, scenario: str) -> ScenarioSpec: + """Return the ScenarioSpec for ``scenario``. + + Falls back to a synthetic streaming spec for names not declared in + the global registry — this preserves the historical behaviour where + unknown scenarios defaulted to streaming inference. New scenarios + SHOULD register themselves in ``_SCENARIO_REGISTRY`` so the merge + order and use_async flag are picked up automatically. + """ + spec = _SCENARIO_REGISTRY.get(scenario) + if spec is not None: + return spec + return ScenarioSpec( + name=scenario, + inference_kind="streaming", + needs_streaming=False, + use_async=True, + merge_key=scenario, + ) + + def _resolve_inference_fn(self, scenario: str): + """Pick the runner's inference function for the given scenario name. 
+ + Dispatch rules are derived from the scenario registry: + - ``inference_kind == "offline"`` → ``inference_fn_offline`` + - ``inference_kind == "streaming"`` → ``inference_fn_streaming`` + (requires ``SUPPORTS_STREAMING = True``; aborts otherwise when + the scenario explicitly demands streaming) + - Unknown / non-streaming runners fall back to a sync wrapper + around ``inference_fn_offline``. + """ + spec = self._scenario_spec(scenario) + + if spec.inference_kind == "offline": + return self.inference_fn_offline + + if spec.inference_kind == "streaming": + if self.SUPPORTS_STREAMING: + return self.inference_fn_streaming + if spec.needs_streaming: + print( + f"Error: scenario '{scenario}' requires " + f"SUPPORTS_STREAMING = True." + ) + sys.exit(1) + def _sync_wrapper(request: InferenceRequest) -> InferenceResult: + results = self.inference_fn_offline([request]) + return results[0] + return _sync_wrapper + + raise ValueError( + f"Unknown inference_kind '{spec.inference_kind}' for scenario " + f"'{scenario}'. Update _SCENARIO_REGISTRY in benchmark_runner.py." + ) + + # ── Shared load-context preparation ─────────────────────────────────────── + + def _prepare_load_context(self, args, suite: dict, output_dir: Path) -> dict: + """ + Common pre-`load_model` plumbing shared by accuracy and benchmark + scenarios. Resolves the precision-aware model id, model path, + parallelism, env_info, and configures the precision-related instance + variables (``_precision_dtype_override``, ``_precision_engine_kwargs``, + ``_effective_precision``). Returns a dict of locally useful values. + + Centralising this avoids the precision_model_map / dtype-override / + engine_kwargs glue being copy-pasted between branches. + """ + # For Suite C subprocesses, --precision is set and precision_model_map + # holds the actual checkpoint being loaded. Use it for the display + # label so the log doesn't show "Loading meta-llama/..." when in fact + # loading the FP8/W8A8/... variant. 
+ _precision_arg = getattr(args, "precision", None) + _precision_model_map = suite.get("precision_model_map", {}) + _fmt_entry = _precision_model_map.get((_precision_arg or "").upper(), {}) + + model_id = _fmt_entry.get("model_id") or suite.get("model_id", "unknown") + effective_model_path = self._resolve_model_path( + model_id, getattr(args, "model_path", None) + ) + + if getattr(args, "model_note", None): + self._model_note_override = args.model_note + if getattr(args, "model_name", None): + self._model_name_override = args.model_name + + _par = getattr(self, "_parallelism", {}) + parallelism = { + "tensor_parallel_size": _par.get("tensor_parallel_size", 1), + "pipeline_parallel_size": _par.get("pipeline_parallel_size", 1), + "expert_parallel_size": _par.get("expert_parallel_size", 1), + "data_parallel_size": _par.get("data_parallel_size", 1), + } + + # Read env_info.json from task directory. For standalone runs it's in + # output_dir; for --scenario all it's in the parent. For deeply nested + # subprocess runs it may be two levels up — search up the tree. + env_info: dict = {} + for _candidate in (output_dir, output_dir.parent, output_dir.parent.parent): + _p = _candidate / "env_info.json" + if _p.exists(): + with open(_p) as _f: + env_info = json.load(_f) + break + + # Resolve precision — explicit --precision (e.g. set by a suite + # subprocess) takes priority over hardware-derived selection. + if getattr(args, "precision", None): + effective_precision = args.precision.upper() + else: + effective_precision = self._resolve_precision(suite, env_info) + self._effective_precision = effective_precision + + # Inject dtype_override and engine_kwargs from precision_model_map + # so the runner can apply the correct quantization kernel and dtype. 
+ self._precision_dtype_override = _fmt_entry.get("dtype_override") + self._precision_engine_kwargs = dict(_fmt_entry.get("engine_kwargs") or {}) + + # If the precision_model_map entry declares a quantization + # engine_kwarg, the runner will use dtype="auto", which lets vLLM + # default the compute dtype to BF16 internally. On pre-Ampere + # hardware (V100/T4) that does not support BF16 this silently + # produces wrong results — force float16 when no dtype_override was + # already set and the hardware can't do BF16. + _entry_has_quantization = bool( + (_fmt_entry.get("engine_kwargs") or {}).get("quantization") + ) + if ( + not self._precision_dtype_override + and _entry_has_quantization + and "BF16" not in self._detect_supported_precisions(env_info) + ): + self._precision_dtype_override = "float16" + + return { + "model_id": model_id, + "effective_model_path": effective_model_path, + "parallelism": parallelism, + "env_info": env_info, + "effective_precision": effective_precision, + "fmt_entry": _fmt_entry, + } + # ── Single scenario ─────────────────────────────────────────────────────── def _run_single_scenario(self, args, suite: dict) -> dict: @@ -956,96 +1129,23 @@ def _run_single_scenario(self, args, suite: dict) -> dict: output_dir.mkdir(parents=True, exist_ok=True) self._setup_logging(str(output_dir)) - # Resolve and load model - # For Suite C subprocesses, --precision is set — use precision_model_map - # to get the actual checkpoint model_id for display and metadata. 
- _precision_arg = getattr(args, "precision", None) - _precision_model_map = suite.get("precision_model_map", {}) - _fmt_entry = _precision_model_map.get((_precision_arg or "").upper(), {}) - model_id = ( - _fmt_entry.get("model_id") - or suite.get("model_id", "unknown") - ) - effective_model_path = self._resolve_model_path( - model_id, getattr(args, "model_path", None) - ) - if getattr(args, "model_note", None): - self._model_note_override = args.model_note - if getattr(args, "model_name", None): - self._model_name_override = args.model_name - _par = getattr(self, "_parallelism", {}) - tp_size = _par.get("tensor_parallel_size", 1) - pp_size = _par.get("pipeline_parallel_size", 1) - ep_size = _par.get("expert_parallel_size", 1) - dp_size = _par.get("data_parallel_size", 1) - - # Load env_info for precision resolution (search up to 2 levels) - _acc_env_info: dict = {} - for _c in [output_dir, output_dir.parent, output_dir.parent.parent]: - _p = _c / "env_info.json" - if _p.exists(): - with open(_p) as _f: - _acc_env_info = json.load(_f) - break - - if getattr(args, "precision", None): - effective_precision = args.precision.upper() - else: - effective_precision = self._resolve_precision(suite, _acc_env_info) - self._effective_precision = effective_precision - - # Inject dtype_override and engine_kwargs from precision_model_map entry - # so the runner can apply the correct quantization kernel and dtype. - self._precision_dtype_override = _fmt_entry.get("dtype_override") - self._precision_engine_kwargs = dict(_fmt_entry.get("engine_kwargs") or {}) - - # If the precision_model_map entry declares a quantization engine_kwarg, the - # runner will use dtype="auto", which lets vLLM default the compute dtype to - # BF16 internally. On pre-Ampere hardware (V100/T4) that doesn't support BF16 - # this silently produces wrong results. If no dtype_override was already set - # by the suite entry and the hardware doesn't support BF16, force float16. 
- _entry_has_quantization = bool( - (_fmt_entry.get("engine_kwargs") or {}).get("quantization") - ) - if (not self._precision_dtype_override - and _entry_has_quantization - and "BF16" not in self._detect_supported_precisions(_acc_env_info)): - self._precision_dtype_override = "float16" - - if (args.scenario == "speculative" - and "speculative_model" not in self._precision_engine_kwargs): - _draft_id = suite.get("speculative_draft_model_id") - if _draft_id: - _saved = ( - getattr(self, "_model_source", None), - getattr(self, "_model_name_override", None), - getattr(self, "_model_note_override", None), - ) - _draft_path = self._resolve_model_path(_draft_id, None) - (self._model_source, - self._model_name_override, - self._model_note_override) = _saved - self._precision_engine_kwargs["speculative_model"] = _draft_path - self._precision_engine_kwargs.setdefault( - "num_speculative_tokens", - suite.get("speculative_num_tokens", 4), - ) - self._precision_engine_kwargs.setdefault( - "speculative_draft_tensor_parallel_size", 1, - ) + # Resolve precision-aware model_id, parallelism, env_info, and + # configure self._precision_* via the shared helper. Accuracy is + # always plain decode, so no speculative-draft injection here. 
+ _ctx = self._prepare_load_context(args, suite, output_dir) + model_id = _ctx["model_id"] + effective_model_path = _ctx["effective_model_path"] + parallelism = _ctx["parallelism"] print(f"Loading {model_id} for accuracy check...") t_load = time.perf_counter() self._current_scenario = "accuracy" self._advance_dist_port() self.load_model(effective_model_path, { - "tensor_parallel_size": tp_size, - "pipeline_parallel_size": pp_size, - "expert_parallel_size": ep_size, - "data_parallel_size": dp_size, - "max_tokens": suite.get("output_tokens_max", 512), - "max_model_len": suite.get("max_model_len"), - "use_async": False, + **parallelism, + "max_tokens": suite.get("output_tokens_max", 512), + "max_model_len": suite.get("max_model_len"), + "use_async": False, }) print(f"Model loaded in {round(time.perf_counter() - t_load, 1)}s") @@ -1072,74 +1172,17 @@ def _run_single_scenario(self, args, suite: dict) -> dict: # Load submitter profile profile = self._load_submitter_profile() - # Resolve model path - # For Suite C subprocesses, --precision is set and precision_model_map holds - # the actual checkpoint being loaded. Use it for the display label so the log - # doesn't show "Loading meta-llama/Llama-3.1-8B-Instruct..." when loading FP8. - _precision_arg = getattr(args, "precision", None) - _precision_model_map = suite.get("precision_model_map", {}) - _fmt_entry = _precision_model_map.get((_precision_arg or "").upper(), {}) - model_id = ( - _fmt_entry.get("model_id") - or suite.get("model_id", "unknown") - ) - effective_model_path = self._resolve_model_path( - model_id, getattr(args, "model_path", None) - ) - if getattr(args, "model_note", None): - self._model_note_override = args.model_note - if getattr(args, "model_name", None): - self._model_name_override = args.model_name - - # Read env_info.json from task directory. - # For standalone runs it's in output_dir; for --scenario all it's in the parent. 
- # For deeply nested subprocess runs it may be two levels up — search up the tree. - env_info = {} - for _candidate in [output_dir, output_dir.parent, output_dir.parent.parent]: - _p = _candidate / "env_info.json" - if _p.exists(): - with open(_p) as f: - env_info = json.load(f) - break - - # Load model - _par = getattr(self, "_parallelism", {}) - tp_size = _par.get("tensor_parallel_size", 1) - pp_size = _par.get("pipeline_parallel_size", 1) - ep_size = _par.get("expert_parallel_size", 1) - dp_size = _par.get("data_parallel_size", 1) - - print(f"Loading {model_id}...") - t_load_start = time.perf_counter() - self._current_scenario = args.scenario - self._advance_dist_port() - - # Resolve precision — handles BF16→FP16 fallback for older hardware. - # Explicit --precision (e.g. set by a suite subprocess) takes priority. - if getattr(args, "precision", None): - effective_precision = args.precision.upper() - else: - effective_precision = self._resolve_precision(suite, env_info) - self._effective_precision = effective_precision - - # Inject dtype_override and engine_kwargs from precision_model_map entry - # so the runner can apply the correct quantization kernel and dtype. - self._precision_dtype_override = _fmt_entry.get("dtype_override") - self._precision_engine_kwargs = dict(_fmt_entry.get("engine_kwargs") or {}) - - # If the precision_model_map entry declares a quantization engine_kwarg, the - # runner will use dtype="auto", which lets vLLM default the compute dtype to - # BF16 internally. On pre-Ampere hardware (V100/T4) that doesn't support BF16 - # this silently produces wrong results. If no dtype_override was already set - # by the suite entry and the hardware doesn't support BF16, force float16. 
- _entry_has_quantization = bool( - (_fmt_entry.get("engine_kwargs") or {}).get("quantization") - ) - if (not self._precision_dtype_override - and _entry_has_quantization - and "BF16" not in self._detect_supported_precisions(env_info)): - self._precision_dtype_override = "float16" - + # Resolve precision-aware model_id, parallelism, env_info, and + # configure self._precision_* via the shared helper. + _ctx = self._prepare_load_context(args, suite, output_dir) + model_id = _ctx["model_id"] + effective_model_path = _ctx["effective_model_path"] + parallelism = _ctx["parallelism"] + env_info = _ctx["env_info"] + + # Inject speculative-decoding draft model (only relevant in the + # ``speculative`` scenario branch — accuracy / offline / online never + # need a draft model and the suite contract may not declare one). if (args.scenario == "speculative" and "speculative_model" not in self._precision_engine_kwargs): _draft_id = suite.get("speculative_draft_model_id") @@ -1162,14 +1205,16 @@ def _run_single_scenario(self, args, suite: dict) -> dict: "speculative_draft_tensor_parallel_size", 1, ) + print(f"Loading {model_id}...") + t_load_start = time.perf_counter() + self._current_scenario = args.scenario + self._advance_dist_port() + self.load_model(effective_model_path, { - "tensor_parallel_size": tp_size, - "pipeline_parallel_size": pp_size, - "expert_parallel_size": ep_size, - "data_parallel_size": dp_size, - "max_tokens": suite.get("output_tokens_max", 512), - "max_model_len": suite.get("max_model_len"), - "use_async": args.scenario not in ("offline", "accuracy", "speculative"), + **parallelism, + "max_tokens": suite.get("output_tokens_max", 512), + "max_model_len": suite.get("max_model_len"), + "use_async": self._scenario_spec(args.scenario).use_async, }) model_load_seconds = round(time.perf_counter() - t_load_start, 1) print(f"Model loaded in {model_load_seconds}s") @@ -1204,29 +1249,10 @@ def _run_single_scenario(self, args, suite: dict) -> dict: 
chip_count=chip_count, ) - # Select inference function - if args.scenario == "offline": - inference_fn = self.inference_fn_offline - elif args.scenario == "speculative": - inference_fn = self.inference_fn_offline - elif args.scenario == "sustained": - if not self.SUPPORTS_STREAMING: - print(f"Error: sustained scenario requires SUPPORTS_STREAMING = True.") - sys.exit(1) - inference_fn = self.inference_fn_streaming - elif args.scenario == "burst": - if not self.SUPPORTS_STREAMING: - print(f"Error: burst scenario requires SUPPORTS_STREAMING = True.") - sys.exit(1) - inference_fn = self.inference_fn_streaming - elif self.SUPPORTS_STREAMING: - inference_fn = self.inference_fn_streaming - else: - # Fallback for platforms without streaming - def _sync_wrapper(request: InferenceRequest) -> InferenceResult: - results = self.inference_fn_offline([request]) - return results[0] - inference_fn = _sync_wrapper + # Select inference function via the scenario registry. Unknown + # scenarios fall through with a sensible default — streaming when + # supported, otherwise a sync wrapper around inference_fn_offline. + inference_fn = self._resolve_inference_fn(args.scenario) # Run benchmark benchmark_start = datetime.now(timezone.utc) @@ -1458,7 +1484,6 @@ def _run_all_scenarios(self, args, suite: dict) -> None: else: print(" --skip-accuracy-gate set -- continuing anyway.\n") acc_result = None - acc_result = None else: # Subprocess succeeded — read accuracy.json written by the child with open(acc_json_path) as f: @@ -1746,33 +1771,19 @@ def _load_accuracy_baseline_for_format( except Exception: return None - def _run_accuracy_scenario( - self, - suite: dict, - output_dir: Path, - ) -> dict: + def _score_accuracy_questions(self, questions: list) -> tuple: """ - Run accuracy check as a proper scenario. - Uses inference_fn_offline() — same model, framework, precision as the benchmark. 
- - Args: - suite: Parsed suite.json dict - output_dir: Where to write accuracy.json + Run the accuracy question bank through ``inference_fn_offline`` and + score the answers. - Returns: - Accuracy dict with subset_score, baseline_delta, valid fields. + Returns ``(score, correct, total, wrong_examples, scored_outputs)`` + — shared by both :meth:`_run_accuracy_scenario` and + :meth:`_run_accuracy_scenario_for_format` so the inference/scoring + path stays identical (only baseline policy differs between callers). """ - questions = self._load_accuracy_questions() - - print(f"\n{'='*60}") - print(f" Accuracy Check ({len(questions)} questions)") - print(f" Framework: {self._get_framework_name()}") - print(f" Precision: {getattr(self, '_effective_precision', None) or suite.get('precision_required', 'BF16')}") - print(f"{'='*60}\n") - # Build InferenceRequest objects with raw (unformatted) prompts. - # format_prompt() is called by the runner's inference_fn_offline internally — - # passing raw prompts here avoids double-formatting. + # format_prompt() is called by the runner's inference_fn_offline + # internally — passing raw prompts here avoids double-formatting. 
accuracy_requests = [] for i, q in enumerate(questions): raw = ( @@ -1788,7 +1799,6 @@ def _run_accuracy_scenario( request_id=i, )) - # Run through inference_fn_offline — same model, framework, precision t_start = time.perf_counter() try: results = self.inference_fn_offline(accuracy_requests) @@ -1797,10 +1807,9 @@ def _run_accuracy_scenario( elapsed = round(time.perf_counter() - t_start, 1) print(f"Completed in {elapsed}s") - # Score answers correct = 0 - wrong_examples = [] - scored_outputs = [] + wrong_examples: list[str] = [] + scored_outputs: list[dict] = [] for i, result in enumerate(results): text = (result.output_text or "").strip() match = re.search(r"\b([ABCD])\b", text.upper()) @@ -1817,15 +1826,58 @@ def _run_accuracy_scenario( ) scored_outputs.append({ "question_id": questions[i].get("question_id", i), - "question": questions[i]["question"], - "choices": questions[i]["choices"], - "expected": expected, - "predicted": predicted, - "correct": is_correct, - "raw_output": text[:500], + "question": questions[i]["question"], + "choices": questions[i]["choices"], + "expected": expected, + "predicted": predicted, + "correct": is_correct, + "raw_output": text[:500], }) score = round(correct / len(questions), 4) if questions else 0.0 + return score, correct, len(questions), wrong_examples, scored_outputs + + @staticmethod + def _write_accuracy_artifacts( + output_dir: Path, acc: dict, scored_outputs: list + ) -> None: + """Persist accuracy.json and accuracy_outputs.jsonl for one scenario.""" + acc_path = output_dir / "accuracy.json" + with open(acc_path, "w") as f: + json.dump(acc, f, indent=2) + print(f"Saved to: {acc_path}") + + outputs_path = output_dir / "accuracy_outputs.jsonl" + with open(outputs_path, "w") as f: + for row in scored_outputs: + f.write(json.dumps(row) + "\n") + + def _run_accuracy_scenario( + self, + suite: dict, + output_dir: Path, + ) -> dict: + """ + Run accuracy check as a proper scenario. 
+ Uses inference_fn_offline() — same model, framework, precision as the benchmark. + + Args: + suite: Parsed suite.json dict + output_dir: Where to write accuracy.json + + Returns: + Accuracy dict with subset_score, baseline_delta, valid fields. + """ + questions = self._load_accuracy_questions() + + print(f"\n{'='*60}") + print(f" Accuracy Check ({len(questions)} questions)") + print(f" Framework: {self._get_framework_name()}") + print(f" Precision: {getattr(self, '_effective_precision', None) or suite.get('precision_required', 'BF16')}") + print(f"{'='*60}\n") + + score, correct, total, wrong_examples, scored_outputs = \ + self._score_accuracy_questions(questions) # Compare to baseline — one-sided: score must not drop more than threshold # below baseline. Scoring ABOVE baseline is always valid. @@ -1844,7 +1896,7 @@ def _run_accuracy_scenario( valid = (delta >= -threshold) if delta is not None else True # Print results - print(f"Score: {correct}/{len(questions)} = {score:.4f}") + print(f"Score: {correct}/{total} = {score:.4f}") if baseline_score is not None: sign = "+" if delta >= 0 else "" print(f"Baseline: {baseline_score:.4f}") @@ -1859,30 +1911,18 @@ def _run_accuracy_scenario( f"(threshold: {threshold}) — submission will be flagged") acc = { - "subset_score": score, + "subset_score": score, "baseline_delta": delta, - "valid": valid, - "framework": self._get_framework_name(), - "precision": getattr(self, "_effective_precision", None) or suite.get("precision_required", "BF16"), + "valid": valid, + "framework": self._get_framework_name(), + "precision": getattr(self, "_effective_precision", None) or suite.get("precision_required", "BF16"), "notes": ( f"Integrated accuracy check — used same " f"{self._get_framework_name()} instance as benchmark." 
), } - # Save accuracy.json to submission directory - acc_path = output_dir / "accuracy.json" - with open(acc_path, "w") as f: - json.dump(acc, f, indent=2) - print(f"Saved to: {acc_path}") - - # Save per-question outputs (gitignored — for local debugging only) - outputs_path = output_dir / "accuracy_outputs.jsonl" - with open(outputs_path, "w") as f: - for row in scored_outputs: - f.write(json.dumps(row) + "\n") - print(f"Per-question outputs saved to: {outputs_path}") - + self._write_accuracy_artifacts(output_dir, acc, scored_outputs) return acc def _run_accuracy_scenario_for_format( @@ -1918,60 +1958,8 @@ def _run_accuracy_scenario_for_format( print(f" Framework: {self._get_framework_name()}") print(f"{'='*60}\n") - # Build InferenceRequest objects with raw (unformatted) prompts. - # format_prompt() is called by the runner's inference_fn_offline internally. - accuracy_requests = [] - for i, q in enumerate(questions): - raw = ( - f"Question: {q['question']}\n" - f"A) {q['choices'][0]}\n" - f"B) {q['choices'][1]}\n" - f"C) {q['choices'][2]}\n" - f"D) {q['choices'][3]}\n" - f"Answer:" - ) - accuracy_requests.append(InferenceRequest( - prompt=raw, - request_id=i, - )) - - t_start = time.perf_counter() - try: - results = self.inference_fn_offline(accuracy_requests) - except Exception as e: - raise RuntimeError(f"Accuracy inference failed: {e}") from e - elapsed = round(time.perf_counter() - t_start, 1) - print(f"Completed in {elapsed}s") - - # Score answers - correct = 0 - wrong_examples = [] - scored_outputs = [] - for i, result in enumerate(results): - text = (result.output_text or "").strip() - match = re.search(r"\b([ABCD])\b", text.upper()) - predicted = match.group(1) if match else "?" 
- expected = questions[i].get("answer", "") - is_correct = (predicted == expected) - if is_correct: - correct += 1 - elif len(wrong_examples) < 3: - wrong_examples.append( - f" Q: {questions[i]['question'][:65]}\n" - f" Expected: {expected}, Got: {predicted} " - f"(raw: '{text[:20]}')" - ) - scored_outputs.append({ - "question_id": questions[i].get("question_id", i), - "question": questions[i]["question"], - "choices": questions[i]["choices"], - "expected": expected, - "predicted": predicted, - "correct": is_correct, - "raw_output": text[:500], - }) - - score = round(correct / len(questions), 4) if questions else 0.0 + score, correct, total, wrong_examples, scored_outputs = \ + self._score_accuracy_questions(questions) # Per-format baseline and threshold baseline_score = self._load_accuracy_baseline_for_format(model_id, precision) @@ -1982,7 +1970,7 @@ def _run_accuracy_scenario_for_format( # None = baseline not set yet (placeholder) — not a failure # Print results - print(f"Score: {correct}/{len(questions)} = {score:.4f}") + print(f"Score: {correct}/{total} = {score:.4f}") if baseline_score is not None: sign = "+" if delta >= 0 else "" print(f"Baseline ({precision}): {baseline_score:.4f}") @@ -2010,18 +1998,7 @@ def _run_accuracy_scenario_for_format( "notes": f"Suite C per-format accuracy check. 
Threshold: {threshold}", } - # Write accuracy.json - acc_path = output_dir / "accuracy.json" - with open(acc_path, "w") as f: - json.dump(acc, f, indent=2) - print(f"Saved to: {acc_path}") - - # Write per-question outputs (gitignored) - outputs_path = output_dir / "accuracy_outputs.jsonl" - with open(outputs_path, "w") as f: - for row in scored_outputs: - f.write(json.dumps(row) + "\n") - + self._write_accuracy_artifacts(output_dir, acc, scored_outputs) return acc # ── GPU memory release ──────────────────────────────────────────────────── @@ -2111,28 +2088,10 @@ def _build_result_json( ep_size = _par.get("expert_parallel_size", 1) dp_size = _par.get("data_parallel_size", 1) - # For Suite C subprocesses, --precision is set and precision_model_map holds - # the actual quantized checkpoint. Use it so each per-format result.json records - # the real model_id/revision (e.g. RedHatAI/...-FP8), not the suite-level model_id. - _result_precision = ( - getattr(self, "_effective_precision", None) - or getattr(args, "precision", None) - ) - _pm_entry = suite.get("precision_model_map", {}).get( - (_result_precision or "").upper(), {} - ) - _result_model_id = ( - _pm_entry.get("model_id") - or suite.get("model_id", "unknown") - ) - _result_model_revision = ( - _pm_entry.get("model_revision") - or suite.get("model_revision", "unknown") - ) - - # For Suite C subprocesses, --precision is set and precision_model_map holds - # the actual quantized checkpoint. Use it so each per-format result.json records - # the real model_id/revision (e.g. RedHatAI/...-FP8), not the suite-level model_id. + # For Suite C subprocesses, --precision is set and precision_model_map + # holds the actual quantized checkpoint. Use it so each per-format + # result.json records the real model_id/revision (e.g. + # RedHatAI/...-FP8), not the suite-level model_id. 
_result_precision = ( getattr(self, "_effective_precision", None) or getattr(args, "precision", None) @@ -2196,7 +2155,7 @@ def _build_result_json( "subset_score": None, "baseline_delta": None, "valid": False, - "notes": "Run --scenario accuracy to check model accuracy.", + "notes": "Run --scenario accuracy to check model accuracy.", }, "meta": { "submitted_by": profile.get("submitted_by", ""), diff --git a/runners/template/runner.py b/runners/template/runner.py index 2b502d32..7e797770 100644 --- a/runners/template/runner.py +++ b/runners/template/runner.py @@ -65,11 +65,24 @@ class TemplateRunner(BenchmarkRunner): BenchmarkRunner auto-detects hardware limits and intersects with this list. """ - SUPPORTED_QUANTIZATIONS = [] + SUPPORTED_QUANTIZATION_BACKENDS = [] """ - Quantization formats for Suite C. List any of: "fp8", "w8a8", "w8a16", "w4a16" - BF16 is always supported — do not list it here. - Empty list = this runner skips all quantized formats in Suite C. + Framework-level quantization backends supported by this runner. The + values are passed directly to the engine (e.g. vLLM's `quantization=` + kwarg), so the names mirror the engine's vocabulary — NOT the suite-level + precision tags (W8A8, FP8, W4A16, …). + + Suite C cross-references each precision_model_map entry's + engine_kwargs.quantization against this list to decide which formats to + run on this runner. Adding a new quantized format becomes a pure suite + edit — no runner change is needed if the backend is already supported. + + Examples (vLLM names): + SUPPORTED_QUANTIZATION_BACKENDS = ["fp8", "compressed-tensors", "gptq_marlin"] + SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors", "gptq_marlin"] + SUPPORTED_QUANTIZATION_BACKENDS = [] # BF16/FP16/FP32 only + + BF16/FP16/FP32 are always allowed — do not list them here. 
""" # ── Initializer ─────────────────────────────────────────────────────────── diff --git a/runners/validate_suites.py b/runners/validate_suites.py new file mode 100644 index 00000000..a7b60c79 --- /dev/null +++ b/runners/validate_suites.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +""" +Validate suite folders under suites/. + +Checks per folder: + - suite.json exists and parses as JSON + - suite.json validates against schema/suite.schema.json + - suite.suite_id matches the folder name + - suite.dataset resolves to datasets//requests.jsonl + +Usage: + # Validate every suite + python runners/validate_suites.py + + # Validate a specific suite folder (name or path) + python runners/validate_suites.py --dir suite_A + python runners/validate_suites.py --dir suites/suite_A + python runners/validate_suites.py --dir /abs/path/to/suite_A +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +try: + import jsonschema + HAS_JSONSCHEMA = True +except ImportError: + HAS_JSONSCHEMA = False + print("Warning: jsonschema not installed — schema validation skipped") + +REPO_ROOT = Path(__file__).resolve().parent.parent +SUITES_DIR = REPO_ROOT / "suites" +SCHEMA_PATH = REPO_ROOT / "schema" / "suite.schema.json" +DATASETS_DIR = REPO_ROOT / "datasets" + +# Files / folders that live flat under suites/ — not suite folders +_NON_SUITE_NAMES = {"README.md", "__pycache__", ".DS_Store"} + + +def _load_schema() -> dict | None: + if not HAS_JSONSCHEMA: + return None + if not SCHEMA_PATH.exists(): + print(f"Error: schema not found at {SCHEMA_PATH}") + sys.exit(1) + return json.loads(SCHEMA_PATH.read_text()) + + +def _iter_suite_folders() -> list[Path]: + if not SUITES_DIR.exists(): + return [] + out = [] + for entry in sorted(SUITES_DIR.iterdir()): + if not entry.is_dir() or entry.name in _NON_SUITE_NAMES or entry.name.startswith("."): + continue + out.append(entry) + return out + + +def _resolve_target(target: str) -> Path: + p = 
Path(target) + if p.is_absolute(): + return p + # Allow "suite_A" or "suites/suite_A" + if (SUITES_DIR / target).exists(): + return SUITES_DIR / target + return REPO_ROOT / target + + +def validate_suite(folder: Path, schema: dict | None) -> list[str]: + errors: list[str] = [] + name = folder.name + suite_json = folder / "suite.json" + + if not suite_json.exists(): + errors.append(f"missing suite.json at {suite_json}") + return errors + + try: + data = json.loads(suite_json.read_text()) + except json.JSONDecodeError as exc: + errors.append(f"suite.json is not valid JSON: {exc}") + return errors + + declared_id = data.get("suite_id") + if declared_id != name: + errors.append( + f"suite_id mismatch: folder is '{name}' but suite.suite_id is " + f"'{declared_id}'." + ) + + if schema is not None: + validator = jsonschema.Draft7Validator(schema) + for err in validator.iter_errors(data): + path = ".".join(str(p) for p in err.absolute_path) or "" + errors.append(f"schema: {path}: {err.message}") + + dataset = data.get("dataset") + if dataset: + dataset_path = DATASETS_DIR / dataset / "requests.jsonl" + if not dataset_path.exists(): + errors.append( + f"dataset '{dataset}' referenced by suite.json does not exist " + f"at {dataset_path}. Add the dataset to datasets/ or fix the " + f"'dataset' field." + ) + + return errors + + +def _print_result(folder: Path, errors: list[str]) -> None: + if errors: + print(f"FAIL {folder.name}") + for err in errors: + print(f" - {err}") + else: + print(f"OK {folder.name}") + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Validate suite folders under suites/." 
+ ) + parser.add_argument( + "--dir", + default=None, + help="Validate a single suite folder (name, relative, or absolute path).", + ) + args = parser.parse_args() + + schema = _load_schema() + + if args.dir: + target = _resolve_target(args.dir) + if not target.exists() or not target.is_dir(): + print(f"Error: '{args.dir}' is not an existing directory.") + return 2 + folders = [target] + else: + folders = _iter_suite_folders() + if not folders: + print("No suite folders found under suites/.") + return 0 + + total_errors = 0 + for folder in folders: + errs = validate_suite(folder, schema) + _print_result(folder, errs) + total_errors += len(errs) + + print() + if total_errors: + print(f"Found {total_errors} problem(s) across {len(folders)} suite folder(s).") + return 1 + print(f"All {len(folders)} suite folder(s) valid.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/schema/accuracy_subset.README.md b/schema/accuracy_subset.README.md new file mode 100644 index 00000000..4705f0c2 --- /dev/null +++ b/schema/accuracy_subset.README.md @@ -0,0 +1,58 @@ +# `accuracy_subset.jsonl` — accuracy gate question bank + +100 multiple-choice items drawn from +[MMLU](https://github.com/hendrycks/test) (Massive Multitask Language +Understanding). Every benchmark run executes this subset against the loaded +model as a "model-quality sanity check" before measuring throughput or +latency. The subset is **immutable** — see `CONTRIBUTING.md` "A few rules" +and `benchmark_runner.py::_run_accuracy_scenario`. 
+ +## File format + +One JSON object per line: + +```json +{ + "question_id": "mmlu_0096", + "subject": "machine_learning", + "question": "Which of the following statements about Naive Bayes is incorrect?", + "choices": ["...", "...", "...", "..."], + "answer": "B" +} +``` + +| Field | Notes | +|---------------|---------------------------------------------------| +| `question_id` | Stable identifier (`mmlu_<NNNN>`) — never reused | +| `subject` | MMLU subject tag (e.g. `machine_learning`) | +| `question` | Plain-text prompt | +| `choices` | List of exactly 4 strings | +| `answer` | Letter in `{"A", "B", "C", "D"}` | + +## How AccelMark uses it + +- Loaded by `runners/benchmark_runner.py` (`_run_accuracy_scenario`, ~line 1700). +- Scored as `correct / total`; compared against per-suite baselines in + [`accuracy_baselines.json`](accuracy_baselines.json). +- A failed gate aborts the benchmark unless the user passes + `--skip-accuracy-gate` (the resulting submission is permanently flagged). + +This is **not** a measurement of MMLU performance — the subset is too small. +It exists only to catch grossly broken model weights / quantization configs +before runtime measurements waste hours of compute. + +## License & attribution + +The questions are a 100-item subset of MMLU: + +> Hendrycks, D., Burns, C., Basart, S., Zou, A., Mazeika, M., Song, D., & +> Steinhardt, J. (2021). **Measuring Massive Multitask Language +> Understanding.** *International Conference on Learning Representations.* +> arXiv:[2009.03300](https://arxiv.org/abs/2009.03300) +> Source: <https://github.com/hendrycks/test> + +MMLU is distributed under the **MIT License**. AccelMark redistributes +this subset under the same license; the AccelMark Apache-2.0 license +covers only the surrounding evaluation code, not the question content. + +See [`../NOTICE`](../NOTICE) for the full third-party attribution. 
diff --git a/schema/suite.schema.json b/schema/suite.schema.json new file mode 100644 index 00000000..1367fe0d --- /dev/null +++ b/schema/suite.schema.json @@ -0,0 +1,215 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "title": "AccelMark Suite", + "description": "Contract for suites//suite.json. Validates the fields BenchmarkRunner and the leaderboard generator depend on. Inline notes (keys prefixed with '_') are intentionally allowed.", + "type": "object", + + "required": [ + "suite_id", + "description", + "model_id", + "model_revision", + "dataset", + "scenarios", + "precision_required", + "allowed_precisions", + "max_model_len", + "output_tokens_max", + "concurrency_levels", + "num_runs", + "warmup_runs", + "request_count" + ], + + "additionalProperties": true, + + "properties": { + "suite_id": { + "type": "string", + "pattern": "^suite_[A-Z][A-Za-z0-9_]*$", + "description": "Folder name under suites/. Must match the directory name." + }, + "description": { + "type": "string", + "minLength": 1 + }, + "model_id": { + "type": "string", + "minLength": 1, + "description": "Canonical model identifier (typically a HuggingFace repo id)." + }, + "model_revision": { + "type": "string", + "minLength": 1, + "description": "Pinned model revision (commit SHA or tag) — never 'main'." + }, + "dataset": { + "type": "string", + "pattern": "^[a-z0-9][a-z0-9_]*_v[0-9]+$", + "description": "Dataset folder name under datasets/. Must exist as datasets//requests.jsonl." + }, + + "scenarios": { + "type": "object", + "required": ["default", "extra"], + "additionalProperties": false, + "properties": { + "default": { + "type": "array", + "items": { "$ref": "#/definitions/scenarioName" }, + "uniqueItems": true, + "minItems": 1, + "description": "Scenarios executed when `--scenario default` (or no --scenario) is passed." 
+ }, + "extra": { + "type": "array", + "items": { "$ref": "#/definitions/scenarioName" }, + "uniqueItems": true, + "description": "Opt-in scenarios runnable with --scenario all or --scenario ." + } + } + }, + + "precision_required": { "$ref": "#/definitions/precisionTag" }, + "allowed_precisions": { + "type": "array", + "items": { "$ref": "#/definitions/precisionTag" }, + "uniqueItems": true, + "minItems": 1 + }, + + "max_model_len": { "type": "integer", "minimum": 128 }, + "output_tokens_max": { "type": "integer", "minimum": 1 }, + + "concurrency_levels": { + "type": "array", + "items": { "type": "integer", "minimum": 1 }, + "minItems": 1, + "uniqueItems": true + }, + + "num_runs": { "type": "integer", "minimum": 1 }, + "warmup_runs": { "type": "integer", "minimum": 0 }, + "warmup_minutes": { "type": "number", "minimum": 0 }, + + "request_count": { "type": "integer", "minimum": 1 }, + + "request_distribution": { + "type": "object", + "additionalProperties": true, + "properties": { + "input_tokens_p25": { "type": "number", "minimum": 0 }, + "input_tokens_p50": { "type": "number", "minimum": 0 }, + "input_tokens_p75": { "type": "number", "minimum": 0 }, + "input_tokens_p99": { "type": "number", "minimum": 0 }, + "output_tokens_p50": { "type": "number", "minimum": 0 }, + "output_tokens_p99": { "type": "number", "minimum": 0 }, + "source": { "type": "string" } + } + }, + + "online_qps_levels": { + "type": ["array", "null"], + "items": { "type": "number", "exclusiveMinimum": 0 }, + "minItems": 1, + "uniqueItems": true + }, + "online_sla_ttft_ms": { "type": ["integer", "null"], "minimum": 1 }, + "online_sla_ttft_ms_relaxed": { "type": ["integer", "null"], "minimum": 1 }, + "online_request_count": { "type": ["integer", "null"], "minimum": 1 }, + "online_warmup_runs": { "type": "integer", "minimum": 0 }, + + "interactive_request_count": { "type": ["integer", "null"], "minimum": 1 }, + "interactive_warmup_runs": { "type": "integer", "minimum": 0 }, + + 
"sustained_concurrency": { "type": "integer", "minimum": 1 }, + "duration_minutes": { "type": "number", "minimum": 0 }, + "sample_interval_seconds": { "type": "number", "minimum": 0 }, + + "accuracy_threshold_delta": { "type": "number" }, + + "required_chips": { + "oneOf": [ + { "type": "integer", "minimum": 1 }, + { "type": "string", "enum": ["auto"] } + ], + "description": "Either an explicit chip count or the literal string 'auto'." + }, + + "chip_counts_required": { + "type": "array", + "items": { "type": "integer", "minimum": 1 }, + "uniqueItems": true + }, + "chip_counts_optional": { + "type": "array", + "items": { "type": "integer", "minimum": 1 }, + "uniqueItems": true + }, + "chip_counts_all": { + "type": "array", + "items": { "type": "integer", "minimum": 1 }, + "uniqueItems": true + }, + + "speculative_draft_model_id": { "type": "string", "minLength": 1 }, + "speculative_draft_model_revision": { "type": "string", "minLength": 1 }, + "speculative_num_tokens": { "type": "integer", "minimum": 1 }, + + "burst_steady_qps": { "type": "number", "exclusiveMinimum": 0 }, + "burst_peak_qps": { "type": "number", "exclusiveMinimum": 0 }, + "burst_duration_seconds": { "type": "number", "exclusiveMinimum": 0 }, + "burst_interval_seconds": { "type": "number", "exclusiveMinimum": 0 }, + + "precision_model_map": { + "type": "object", + "description": "Suite C only — maps each precision tag to a fixed quantized checkpoint.", + "patternProperties": { + "^[A-Z][A-Z0-9]*$": { + "type": "object", + "required": ["model_id", "model_revision"], + "additionalProperties": true, + "properties": { + "model_id": { "type": "string", "minLength": 1 }, + "model_revision": { "type": "string", "minLength": 1 }, + "dtype_override": { "type": "string" }, + "engine_kwargs": { "type": "object" } + } + } + } + }, + "precision_levels": { + "type": "array", + "items": { "$ref": "#/definitions/precisionTag" }, + "uniqueItems": true, + "description": "Suite C only — ordered list of precisions to 
evaluate." + }, + "accuracy_thresholds": { + "type": "object", + "description": "Suite C only — per-format accuracy delta thresholds.", + "patternProperties": { + "^[A-Z][A-Z0-9]*$": { "type": "number" } + } + } + }, + + "definitions": { + "scenarioName": { + "type": "string", + "enum": [ + "accuracy", + "offline", + "online", + "interactive", + "sustained", + "speculative", + "burst" + ] + }, + "precisionTag": { + "type": "string", + "pattern": "^[A-Z][A-Z0-9]*$", + "description": "Uppercase precision tag (BF16, FP16, FP32, FP8, W8A8, W8A16, W4A16, …)." + } + } +} diff --git a/serve/server.py b/serve/server.py index 59d85036..a8482c7b 100644 --- a/serve/server.py +++ b/serve/server.py @@ -24,7 +24,6 @@ from contextlib import asynccontextmanager from typing import Optional, Union -import uvicorn from fastapi import Depends, FastAPI, Header, HTTPException, Request, status from fastapi.responses import StreamingResponse @@ -397,6 +396,10 @@ def start_server( logger.info("=" * 60) # ── Launch uvicorn ───────────────────────────────────────────────────── + # Imported lazily so importing `serve.server` (e.g. from tests, or to + # build the ASGI `app` for an external runner) does not require uvicorn. + import uvicorn + uvicorn.run( app, host=host, diff --git a/serve/tests/mock_runner.py b/serve/tests/mock_runner.py index 9d2c42c5..c2d7d9b1 100644 --- a/serve/tests/mock_runner.py +++ b/serve/tests/mock_runner.py @@ -73,10 +73,14 @@ async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceRe ) async def inference_fn_token_stream(self, request: InferenceRequest): - """Yield response word by word to simulate token streaming.""" - for word in self._response_text.split(): - await asyncio.sleep(0.001) - yield word + " " + """ + Per RunnerProtocol, true token streaming is optional. MockRunner + declares "not supported" by raising NotImplementedError so the + serve layer exercises its single-chunk fallback path. 
Use + TokenStreamingMockRunner below to test the true-streaming path. + """ + raise NotImplementedError("MockRunner does not implement true token streaming") + yield # pragma: no cover - keeps this an async generator for the protocol shape def format_prompt(self, prompt: str) -> str: return prompt # pass through unchanged @@ -93,4 +97,22 @@ def _compute_implementation_id(self) -> Optional[str]: class NoStreamingMockRunner(MockRunner): """Mock runner that declares SUPPORTS_STREAMING = False.""" - SUPPORTS_STREAMING = False \ No newline at end of file + SUPPORTS_STREAMING = False + + +class TokenStreamingMockRunner(MockRunner): + """ + Mock runner that *does* implement true token streaming — yields the + response text word by word with a small async delay. Used by tests + that exercise the multi-chunk SSE path in serve/server.py. + + Spaces are emitted as a leading separator before each word *after* + the first, so concatenating every delta reconstructs the original + response text exactly (no trailing space) — matching how real + tokenizers stream BPE / SentencePiece pieces. + """ + + async def inference_fn_token_stream(self, request: InferenceRequest): + for i, word in enumerate(self._response_text.split()): + await asyncio.sleep(0.001) + yield (" " + word) if i else word \ No newline at end of file diff --git a/suites/README.md b/suites/README.md index e3cdbfb6..aedb5652 100644 --- a/suites/README.md +++ b/suites/README.md @@ -309,8 +309,10 @@ not model version differences. Each format runs against the same 100 prompts with concurrency levels `[1, 4, 16, 64]` from `suite_C/suite.json` (not the same sweep as Suite A’s -`[8, 32, 128]`). Format availability depends on the runner's `SUPPORTED_QUANTIZATIONS` -declaration — unsupported formats are skipped automatically. +`[8, 32, 128]`). 
Format availability depends on the runner's +`SUPPORTED_QUANTIZATION_BACKENDS` declaration — unsupported formats are +skipped automatically by matching each entry's `engine_kwargs.quantization` +against the runner's backend list. ### Metrics @@ -341,13 +343,15 @@ On H100, FP8 would show ~1.5-1.8× speedup. ### Runner requirements -Declare which formats your runner supports: +Declare which quantization backends your runner's framework supports. The +strings are the engine's own backend identifiers (vLLM names shown), NOT +suite precision tags such as W8A8/FP8/W4A16: ```python # In your runner class: -SUPPORTED_QUANTIZATIONS = ["fp8", "w8a8", "w8a16", "w4a16"] # H100 -SUPPORTED_QUANTIZATIONS = ["w8a8", "w8a16", "w4a16"] # A100 (no native FP8) -SUPPORTED_QUANTIZATIONS = [] # BF16 only +SUPPORTED_QUANTIZATION_BACKENDS = ["fp8", "compressed-tensors", "gptq_marlin"] # vLLM full +SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors", "gptq_marlin"] # No native FP8 +SUPPORTED_QUANTIZATION_BACKENDS = [] # BF16 only ``` Each format's checkpoint must be available locally. Add to @@ -712,7 +716,7 @@ submissions. ## Adding a new suite -1. Open a GitHub Issue using the "Request new suite" template +1. Open a GitHub Issue using the [**Propose a new suite**](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=new_suite.md) template 2. Specify: model, chip count, scenarios, and rationale 3. Discuss the proposal in the issue thread — interested contributors weigh in 4. Create `suites/suite_X/suite.json` referencing a shared dataset diff --git a/suites/suite_C/suite.json b/suites/suite_C/suite.json index 8adbb45c..fa14a3c5 100644 --- a/suites/suite_C/suite.json +++ b/suites/suite_C/suite.json @@ -47,7 +47,7 @@ }, "precision_levels": ["BF16", "FP16", "FP8", "W8A8", "W8A16", "W4A16"], - "_precision_levels_note": "FP16 runs on all hardware including pre-Ampere. 
FP8 requires Ampere+ and is skipped automatically on FP16-only runners via SUPPORTED_QUANTIZATIONS.", + "_precision_levels_note": "FP16 runs on all hardware including pre-Ampere. FP8 requires Ampere+ and is skipped automatically on runners (e.g. FP16-only ones) whose SUPPORTED_QUANTIZATION_BACKENDS does not list the backend named in that entry's engine_kwargs.quantization.", "accuracy_thresholds": { "BF16": 0.03,