diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
index a4c600ab7..5aacfa59a 100644
--- a/.beads/issues.jsonl
+++ b/.beads/issues.jsonl
@@ -1,5 +1,6 @@
 {"id":"av-1sr","title":"public demo: build dexter-evals companion project","description":"Plan: docs/plans/public-agentv-demo-projects.md#u3-build-dexter-evals-companion-project\nRequirements: R6, R7, R8, R9, R10, R16, R17, R18\n\nAcceptance:\n- Create dexter-evals AgentV config, eval YAML, scripts, .env.example, and README.\n- Pin/document Dexter version or commit and prerequisite install path.\n- Adapt Dexter public eval pattern into AgentV format rather than inventing a synthetic finance suite.\n- Setup fails clearly when Dexter/provider/data env is missing and does not print resolved secrets or private endpoints.\n- Produce one local AgentV result when env is configured.\n- Record AgentV schema/provider/rubric/result-flow friction as separate follow-up plan/Bead.","status":"closed","priority":1,"issue_type":"task","assignee":"codex-public-demo-plan","created_at":"2026-06-04T02:16:12.250114714Z","created_by":"codex-public-demo-plan","updated_at":"2026-06-04T04:16:41.991236878Z","closed_at":"2026-06-04T03:47:33.484197044Z","close_reason":"Completed source/project scope: dexter-evals companion project was implemented, validated with non-secret target-selection env, integrated into feature/agentv-public-demo, and downstream handoff notes were recorded. A real local AgentV result remains conditional on configured OPENAI_API_KEY, FINANCIAL_DATASETS_API_KEY, and search-provider env; result-sync/dashboard beads carry that credentialed-run caveat.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["dexter-evals","public-demo"],"comments":[{"id":10,"issue_id":"av-1sr","author":"codex-public-demo-plan","text":"Created from doc review handoff. Requirements: docs/brainstorms/2026-06-04-public-agentv-demo-projects-requirements.md. Plan: docs/plans/public-agentv-demo-projects.md. 
Follow-up rule: Dashboard UX gaps and AgentV core gaps discovered during implementation should become separate focused Beads with evidence.","created_at":"2026-06-04T02:16:45Z"},{"id":15,"issue_id":"av-1sr","author":"codex-public-demo-plan","text":"Agent Mail broadcast attempted by IvoryDune on thread public-agentv-demo-projects. Delivery was blocked by contact policy for CoralGlen and QuietCove; pending contact requests were created by the Agent Mail server. Broadcast body summarized plan docs, claimed Beads, repo topology, Dashboard UX-gap follow-up rule, AgentV core-gap follow-up rule, secret handling, and result-sync artifact boundary.","created_at":"2026-06-04T02:19:02Z"},{"id":18,"issue_id":"av-1sr","author":"BlackMeadow","text":"bead-spawn-agent launched an agent for av-1sr.\n\nSession: agent-av-1sr-main-20260604045217\nDirectory: /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-dexter-evals\nProfile: codex-eng (auto-detected if not specified)\n\nExported EP_TASK_ID, BEAD_ID, and AGENTV_BEAD_ID as av-1sr.\nWorktree: /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-dexter-evals","created_at":"2026-06-04T02:52:17Z"},{"id":20,"issue_id":"av-1sr","author":"entity","text":"Orchestration update from BlackMeadow: per-task worktree may be used as scratch, but final Dexter companion changes must merge into shared integration worktree /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration on branch feature/agentv-public-demo. Do not leave final work stranded on feature/av-1sr-main or open a standalone per-bead PR.","created_at":"2026-06-04T03:07:18Z"},{"id":22,"issue_id":"av-1sr","author":"entity","text":"Epic coordination update from BlackMeadow: all agentv-public-demo workers must use the same Beads source of truth. Run br mutations from /home/entity/projects/EntityProcess/agentv unless explicitly moved; treat per-task worktree .beads copies as read-only/stale. 
Code may still merge into /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration.","created_at":"2026-06-04T03:08:15Z"},{"id":28,"issue_id":"av-1sr","author":"entity","text":"Implementation evidence: created dexter-evals companion project files and mirrored them into the public-demo integration checkout. Dexter source pinned to virattt/dexter commit 8d9419829f443f84b804d033bb2c3b1fbd788629. Project adapts Dexter finance_agent.csv rows into AgentV input/expected_output/rubrics, includes .agentv/targets.yaml, setup preflight, Dexter CLI wrapper, CSV-to-AgentV generator, .env.example, README, and public-safe .gitignore. Verification: AgentV build completed in scratch worktree after bun install; validation passed for dexter-evals eval + targets when non-secret dummy target-selection env was supplied. Missing-env setup was run in scrubbed env and failed with only variable names/prereq guidance, no resolved secret values or private endpoints. Generated eval script successfully converted 2 rows from a cloned Dexter source checkout at the pinned commit. Blocker: no OPENAI_API_KEY/FINANCIAL_DATASETS_API_KEY/search env is configured in this session, so producing a real local AgentV result is blocked on local credentials/data access. Follow-up beads opened: av-w9p for rubric operator semantics and av-njl for targets.yaml template validation.","created_at":"2026-06-04T03:17:04Z"},{"id":31,"issue_id":"av-1sr","author":"entity","text":"Final integration handoff: scratch commit 97219bcdabcc2a5394af3cbdeccdcba42d7953b8 was cherry-picked into /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration on branch feature/agentv-public-demo as commit 3ae89357. 
Final verification after cherry-pick: AgentV validate passed for dexter-evals/evals/dexter-finance-smoke.eval.yaml and dexter-evals/.agentv/targets.yaml using non-secret dummy target-selection env; scrubbed setup preflight failed actionably for missing DEXTER_REPO_PATH, OPENAI_API_KEY, FINANCIAL_DATASETS_API_KEY, search key, and OPENAI_MODEL, and printed no resolved secret values/private endpoints. Integration checkout still has a pre-existing unstaged .gitignore change for .grepai/ that was not part of this bead.","created_at":"2026-06-04T03:19:04Z"},{"id":34,"issue_id":"av-1sr","author":"entity","text":"Migrated scratch-worktree note from /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-dexter-evals: worker started U3 Dexter companion work with scope limited to public-safe AgentV config/eval/scripts/.env.example/README, Dexter pin/prereq docs, missing-env failure, Dexter-derived eval pattern, one local result if env permits, and separate follow-up Beads for AgentV friction. Downstream result-sync/dashboard beads only receive blocker/follow-up notes.","created_at":"2026-06-04T03:56:04Z"},{"id":35,"issue_id":"av-1sr","author":"BlackMeadow","text":"Scope superseded after user design correction: do not present this as a dexter-evals project. The durable demo project should be financial-research-agent, a coding/web research agent attempting to reproduce Dexter-style financial research against Dexter's public finance_agent.csv golden answers. Dexter remains a pinned upstream fixture/source attribution and optional compatibility target only; default demo path must not require FINANCIAL_DATASETS_API_KEY. Follow-up bead: av-fo9.","created_at":"2026-06-04T04:16:41Z"}]}
 {"id":"av-2lq","title":"research(private): stand up Margin Eval in framework-parity repo","description":"Problem:\nWe have used Margin Eval as a design reference for filesystem-native benchmark packaging, immutable run bundles, resume, and agent/output trace capture, but current AgentV planning appears to rely on report-level analysis rather than a live private Margin setup. The user asked to add Margin Eval setup in EntityProcess/wtg-ai-prompts-experiment so implementation workers can understand how it works before finalizing AgentV bundle/schema details.\n\nScope:\n- Work only in the private EntityProcess/wtg-ai-prompts-experiment repo or an isolated scratch/worktree; do not add Margin artifacts to public AgentV docs/code.\n- Clone or otherwise inspect Margin-Lab/evals and at least one minimal suite/config path. If a local clone already exists elsewhere, record the path and commit instead of duplicating it.\n- Run the smallest feasible dry-run or no-secret smoke that demonstrates Margin output directory structure, resume metadata, run bundle files, logs/traces, agent config, suite config, and artifact naming.\n- Compare observed Margin output to AgentV v1 bundle direction: run_manifest.json, target_recipe.json, run_source.json, index.jsonl responsibilities, per-test folders, redaction, and copied-vs-referenced source material.\n- Record a concise private note under framework-parity/ and a Beads comment with the branch/commit and any concrete schema lessons.\n\nAcceptance:\n- Private note includes the Margin version/commit inspected, commands attempted, whether a dry-run/smoke succeeded, and the observed run output tree.\n- Note clearly says which Margin patterns AgentV should borrow and which should remain out of core.\n- Any discovered blocker is captured with enough detail for a follow-up worker.\n- No private repo URLs, secrets, raw env dumps, OAuth files, or vendored Margin source are added to AgentV public docs/code.\n\nNon-goals:\n- Do not implement AgentV 
run-bundle code in this task.\n- Do not turn AgentV into a Margin-compatible runner or clone Margin schemas wholesale.","acceptance_criteria":"In addition to the description acceptance:\n- Private implementation/setup in EntityProcess/wtg-ai-prompts-experiment reaches a usable Margin Eval smoke or records a concrete blocker with commands/logs.\n- Compare Margin Eval vs AgentV on authoring ceremony, task/case layout, target/agent config, environment isolation, source snapshots, output/run bundle layout, resume/rerun behavior, redaction, and dashboard/audit usability.\n- End with a clear product decision: modify AgentV code now, add/adjust AgentV examples/templates/docs, or defer to run-bundle schema work only.\n- If code changes are recommended, identify exact Beads/modules and why examples/templates are insufficient. If examples/templates are recommended, identify which examples/templates and why core should stay unchanged.\n- Do not make AgentV code changes inside this private Margin setup task; open/update follow-up Beads instead.","notes":"Completed with corrected design on 2026-06-08. Final private note commit: d8a8a870c14fcc9f1a47c9f2380389ddb97c5db4 on private/av-2lq-margin-eval-parity. Final recommendation supersedes comment #288: no separate AgentV code change from this research bead; av-wy0.3 owns implementation of self-contained per-test artifacts using eval.yaml, targets.yaml, copied files, and copied grader assets. No run_source.json, target_recipe.json, or run_manifest.json unless a concrete consumer later proves existing artifacts cannot serve.","status":"closed","priority":2,"issue_type":"task","assignee":"codex-av-2lq","created_at":"2026-06-08T13:58:01.413920918Z","created_by":"entity","updated_at":"2026-06-08T21:51:42.928515066Z","closed_at":"2026-06-08T21:33:04.579426195Z","close_reason":"Completed private Margin parity research and revised final recommendation after user design review. 
Private note pushed at d8a8a870c14fcc9f1a47c9f2380389ddb97c5db4. Durable AgentV design is now documented in av-wy0/av-wy0.2/av-wy0.3/av-wy0.4/av-wy0.5: self-contained per-test eval.yaml/targets.yaml/files/graders artifacts, no run_source.json/target_recipe.json/run_manifest.json schema unless later proven necessary.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["framework-parity","margin","private","run-bundles"],"dependencies":[{"issue_id":"av-2lq","depends_on_id":"av-l52","type":"related","created_at":"2026-06-08T13:58:01.413920918Z","created_by":"entity","metadata":"{}","thread_id":""},{"issue_id":"av-2lq","depends_on_id":"av-wy0","type":"related","created_at":"2026-06-08T13:58:01.413920918Z","created_by":"entity","metadata":"{}","thread_id":""}],"comments":[{"id":287,"issue_id":"av-2lq","author":"entity","text":"Dispatch note (FuchsiaStream, 2026-06-08): spawned NTM Codex worker for Margin Eval private setup. Session: agentv--margin-eval. Pane/Agent Mail identity: SilentRobin. Scope: work in private repo /home/entity/projects/EntityProcess/wtg-ai-prompts-experiment on a dedicated private branch/worktree; clone/inspect Margin-Lab/evals outside the private repo; run smallest no-secret smoke/dry-run; write private framework-parity note; compare Margin vs AgentV; recommend code change vs examples/templates/docs vs defer; do not modify public AgentV code in this task. Worker should update av-2lq with branch/commit, Margin commit, commands, output tree, pros/cons, recommendation, Beads changes, and blockers.","created_at":"2026-06-08T20:29:45Z"},{"id":288,"issue_id":"av-2lq","author":"entity","text":"Handoff (codex-av-2lq, 2026-06-08): completed private Margin Eval framework-parity note; no public AgentV code changed.\n\nPath assumptions: AgentV Beads/status/comments used /home/entity/projects/EntityProcess/agentv explicitly. 
Private work used /home/entity/ntm_Dev/wtg-av-2lq-margin-parity, a worktree of /home/entity/projects/EntityProcess/wtg-ai-prompts-experiment. This handoff does not rely on /home/entity/ntm_Dev/agentv being the AgentV Beads checkout.\n\nPrivate branch/commit: EntityProcess/wtg-ai-prompts-experiment private/av-2lq-margin-eval-parity @ 5867096af01ee992d186a1b5b84bdb259955eda3. Note path: framework-parity/margin-eval-wtg-pr-run-parity.md. Branch was pushed to origin.\n\nMargin inspected: cloned https://github.com/Margin-Lab/evals.git at /home/entity/ntm_Dev/margin-evals-av-2lq, commit 53fb2fd080689efaf7934573d8759d14fc1043e4 (Add samples_per_case support for eval runs). Inspected runbundle, runfs, resume, localrunner, output_files, agent/eval TOML docs, and swe-minimal suite/case layout.\n\nReal WTG run evidence used instead of Margin dry-run per user preference: /home/entity/projects/WiseTechGlobal/WTG.AI.Prompts.EvalResults/.agentv/results/runs/default/pr679-pr50857-clean-2026-06-08T05-42-55Z. Results repo commit 597ef63632b0ba1239ff179087558e29ee694bb7. Source eval repo commit inspected: /home/entity/projects/WiseTechGlobal/WTG.AI.Prompts @ 87eb8ba456d47767729ceeb246e51f81865ef99d. Run was real Copilot target, 2 tests, aggregate pass_rate mean 0.75, duration 243.818s, index.jsonl 2 rows, transcript.jsonl 6 rows, size 728K.\n\nObserved WTG output tree summary: benchmark.json, index.jsonl, run-source.json, timing.json, transcript.jsonl, and per-test folders under data-transformation-pr50857-e2e/<test_id>/ with input.md, grading.json, timing.json, outputs/response.md. Per-test scores: offline implementation review 0.6 (rubrics 0.5, skill-trigger 1.0); online chunking review 1.0 (rubrics 1.0, skill-trigger 1.0).\n\nMargin model summary: Margin local runs use results.json plus internal/bundle.json, internal/manifest.json, internal/progress.json, internal/events.jsonl, internal/artifacts.json, and instances/<instance_id>/ result/trajectory/log folders. 
Resume is driven by bundle hash, progress snapshot, instance keys, and carry-forward/rerun planning.\n\nCommands captured in note: git clone Margin, gh pr view private WTG PR #1, private worktree creation/merge, find output tree, jq index summary, attempted go test ./runner/runner-local/runfs ./runner/runner-core/resume ./runner/runner-local/localrunner.\n\nSmoke result/blocker: Margin Go unit smoke could not run because Go is not installed (zsh: command not found: go). No installed margin binary or ~/.margin config was present. Docker is available, so blocker is missing Go/prebuilt Margin CLI, not Docker. I did not run a Margin dry-run because user asked to prefer the real WTG PR run.\n\nRecommendation: defer AgentV code changes to av-wy0.3; do not start a separate AgentV code change from this Margin task. av-wy0.3 should implement run_manifest.json and target_recipe.json, hard-deprecate pre-release run-source.json to run_source.json before release, make run_manifest.json reference run_source.json, and keep target_recipe.json redacted with required env names/placeholders and config fingerprints only. Use examples/templates/docs for Margin-style case directories; av-l52 and av-wy0.5 already cover the likely template/docs work. Resume execution remains av-wy0.4. No new Bead needed.","created_at":"2026-06-08T20:51:40Z"},{"id":290,"issue_id":"av-2lq","author":"codex-av-2lq","text":"Design correction after user review (codex-av-2lq, 2026-06-08): revised the Margin parity recommendation. The durable AgentV design should not add run_source.json or target_recipe.json. Instead, per-test artifact folders should be self-contained and use native AgentV conventions: eval.yaml with exactly the test, targets.yaml with the selected target and placeholders, copied test files, copied grader assets, plus existing input/output/grading/timing artifacts. 
index.jsonl remains the run-level index pointing at those folders.\n\nI updated av-wy0.3 title/description/acceptance to reflect this goal directly and to supersede the older manifest/recipe comments. This is straightforward within the existing av-wy0 epic, not a new epic.","created_at":"2026-06-08T21:26:22Z"},{"id":295,"issue_id":"av-2lq","author":"entity","text":"Final corrected handoff (codex-av-2lq, 2026-06-08): private Margin parity note was revised and pushed at EntityProcess/wtg-ai-prompts-experiment private/av-2lq-margin-eval-parity @ d8a8a870c14fcc9f1a47c9f2380389ddb97c5db4, path framework-parity/margin-eval-wtg-pr-run-parity.md. This supersedes the stale recommendation in comment #288. Final recommendation: no AgentV code change from av-2lq; defer implementation to av-wy0.3 with self-contained per-test artifacts. Do not add run_source.json, target_recipe.json, or run_manifest.json. Use eval.yaml with one test, targets.yaml with selected target/placeholders, copied test files, copied grader assets, and existing input/output/grading/timing artifacts. Updated av-wy0, av-wy0.2, av-wy0.3, av-wy0.4, and av-wy0.5 accordingly. This is straightforward inside the existing av-wy0 epic, not a new epic.","created_at":"2026-06-08T21:32:48Z"},{"id":301,"issue_id":"av-2lq","author":"entity","text":"Post-close design correction (codex-av-2lq, 2026-06-08): final av-wy0 design now uses per-test inputs/ bundles, not task/ and not root-level eval.yaml/targets.yaml. The corrected recommendation is: no AgentV code change from av-2lq; av-wy0.3 should extract a reusable input-bundle materializer that writes inputs/EVAL.yaml, inputs/targets.yaml, inputs/files/, and inputs/graders/ beside input.md/grading.json/timing.json/outputs. 
Reruns should consume these by explicit path and write to a separate output run directory to avoid nested .agentv/results artifacts.","created_at":"2026-06-08T21:43:15Z"},{"id":307,"issue_id":"av-2lq","author":"entity","text":"Final naming correction (codex-av-2lq, 2026-06-08): final design uses task/ rather than inputs/. input.md is rendered agent input; task/ is the runnable task contract and contains task/EVAL.yaml, task/targets.yaml, task/files/, and task/graders/. This supersedes any intermediate inputs/ wording in comments or private-note drafts.","created_at":"2026-06-08T21:49:55Z"},{"id":308,"issue_id":"av-2lq","author":"entity","text":"Final private note revision (codex-av-2lq, 2026-06-08): pushed EntityProcess/wtg-ai-prompts-experiment private/av-2lq-margin-eval-parity @ 6bd57e70357624792c3ea58b90b06499cc4e7647. Note path: framework-parity/margin-eval-wtg-pr-run-parity.md. This version matches the final av-wy0 design: task/EVAL.yaml, task/targets.yaml, task/files/, task/graders/ beside input.md/grading/timing/outputs, extracted materializer independent of eval execution, no nested .agentv/results output, no run_source/target_recipe/run_manifest schema.","created_at":"2026-06-08T21:51:42Z"}]}
+{"id":"av-33j","title":"cleanup: remove eval --benchmark-json","description":"Follow-up from av-eval-output-config-surface-4e2. Observable behavior today: agentv eval still accepts --benchmark-json <path>, prints a deprecation warning, and writes a separate Agent Skills compatibility benchmark JSON even though benchmark.json is always written into the canonical run directory. Simpler model: remove the extra flag in a future breaking-change window and direct users to the run directory benchmark.json or a dedicated export/conversion wrapper if compatibility output remains needed. Migration notes: audit any Agent Skills compatibility consumers first; update docs/tests that mention --benchmark-json; keep canonical --output <dir> semantics unchanged.","status":"open","priority":3,"issue_type":"task","created_at":"2026-06-09T00:57:25.472739425Z","created_by":"entity","updated_at":"2026-06-09T00:57:25.472739425Z","source_repo":"av-output-config","source_repo_path":"/home/entity/projects/EntityProcess/agentv.worktrees/av-output-config","compaction_level":0,"original_size":0,"labels":["breaking-change","cleanup","cli"]}
 {"id":"av-3j2","title":"public demo: wire projects into dashboard setup and capture UX gaps","description":"Plan: docs/plans/public-agentv-demo-projects.md#u5-wire-public-projects-into-local-and-deployment-demo-setup\nRequirements: R1, R2, R3, R4, R5, R19, R20, R21, R22, R23\n\nAcceptance:\n- Update public demo/deployment setup to register AgentV examples, dexter-evals, and swe-evals without private WiseTech projects.\n- Configure public result-repo mappings for dexter-evals and swe-evals.\n- Reuse existing clean clones and avoid destroying dirty clones.\n- Verify generated projects.yaml/result config, rebuild Dashboard frontend before UAT, and confirm remote-synced results appear.\n- Capture Dashboard UX gaps found from realistic data as follow-up Beads with evidence.\n- Capture AgentV core gaps found during conversion as focused follow-up plans/Beads unless they block the demo.","status":"closed","priority":1,"issue_type":"task","assignee":"codex-public-demo-plan","created_at":"2026-06-04T02:16:12.418786279Z","created_by":"codex-public-demo-plan","updated_at":"2026-06-05T12:46:53.501046180Z","closed_at":"2026-06-05T12:46:53.500844534Z","close_reason":"Completed via public demo deployment wiring on agentv-deploy feat/public-demo-results: setup registers agentv, financial-research-agent, and swe-evals with public result mappings; clean Dashboard setup verified remote-synced results. 
Evidence recorded through av-7m2 comment #68.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["dashboard","deploy","public-demo"],"dependencies":[{"issue_id":"av-3j2","depends_on_id":"av-1sr","type":"blocks","created_at":"2026-06-04T02:16:12.981140557Z","created_by":"codex-public-demo-plan","metadata":"{}","thread_id":""},{"issue_id":"av-3j2","depends_on_id":"av-7m2","type":"blocks","created_at":"2026-06-04T02:16:13.067743868Z","created_by":"codex-public-demo-plan","metadata":"{}","thread_id":""},{"issue_id":"av-3j2","depends_on_id":"av-9fk","type":"blocks","created_at":"2026-06-04T02:16:12.863732542Z","created_by":"codex-public-demo-plan","metadata":"{}","thread_id":""},{"issue_id":"av-3j2","depends_on_id":"av-fo9","type":"blocks","created_at":"2026-06-04T04:16:43.904330712Z","created_by":"entity","metadata":"{}","thread_id":""}],"comments":[{"id":12,"issue_id":"av-3j2","author":"codex-public-demo-plan","text":"Created from doc review handoff. Requirements: docs/brainstorms/2026-06-04-public-agentv-demo-projects-requirements.md. Plan: docs/plans/public-agentv-demo-projects.md. Follow-up rule: Dashboard UX gaps and AgentV core gaps discovered during implementation should become separate focused Beads with evidence.","created_at":"2026-06-04T02:16:46Z"},{"id":17,"issue_id":"av-3j2","author":"codex-public-demo-plan","text":"Agent Mail broadcast attempted by IvoryDune on thread public-agentv-demo-projects. Delivery was blocked by contact policy for CoralGlen and QuietCove; pending contact requests were created by the Agent Mail server. 
Broadcast body summarized plan docs, claimed Beads, repo topology, Dashboard UX-gap follow-up rule, AgentV core-gap follow-up rule, secret handling, and result-sync artifact boundary.","created_at":"2026-06-04T02:19:02Z"},{"id":30,"issue_id":"av-3j2","author":"entity","text":"Dexter source-project handoff from av-1sr: dexter-evals is ready for project registration in the public-demo integration checkout. It validates with non-secret target-selection env and missing-env setup fails safely. Dashboard-visible real run data is pending a credentialed Dexter run because this session lacks provider/data/search env; do not assume dexter-evals-results artifacts exist yet.","created_at":"2026-06-04T03:17:38Z"},{"id":57,"issue_id":"av-3j2","author":"SilentCave","text":"bead-spawn-agent launched an agent for av-3j2.\n\nSession: agent-av-3j2-main-20260605120554\nDirectory: /home/entity/projects/EntityProcess/agentv\nProfile: codex-eng (auto-detected if not specified)\n\nExported EP_TASK_ID, BEAD_ID, and AGENTV_BEAD_ID as av-3j2.\nBeads coordination checkout: /home/entity/projects/EntityProcess/agentv","created_at":"2026-06-05T10:05:55Z"},{"id":59,"issue_id":"av-3j2","author":"entity","text":"Status review 2026-06-05: av-3j2 is in_progress/assigned to codex-public-demo-plan, but I found no implementation branch/worktree for U5 and no AgentV source edits to dashboard setup. The only git worktree registered for agentv is the main checkout; /home/entity/projects/EntityProcess/agentv.worktrees is empty. Evidence: U5 plan still requires public project registration + result mappings + Dashboard UAT; agentv-deploy main is clean but still wires private WiseTech projects in docker-entrypoint.sh, scripts/setup-local-agentv-dev.sh, scripts/run-local-agentv.sh, scripts/validate-config.sh, and README. Companion source repos are ready/clean: financial-research-agent main at abf4384 and swe-evals main at 5a47b59. 
Existing public result repo state is incomplete/ambiguous: agentv-examples-eval-results exists, financial-research-agent-eval-results exists locally, README/Beads now say financial-research-agent-evals, and no local swe-evals-results repo is present. Blockers/risks: av-7m2 result-sync contract remains in_progress; result repo name mismatch must be resolved before wiring; remote-synced artifacts for finance/SWE are not verified; Dashboard frontend rebuild/browser UAT and UX-gap capture have not happened. Recommended next action: finish av-7m2 first by choosing/creating the canonical finance + SWE public result repos and producing/pulling public-safe artifacts, then implement U5 in agentv-deploy by replacing the private WiseTech profile with agentv + financial-research-agent + swe-evals, update validation/docs, run --no-serve setup, inspect projects.yaml/result config, rebuild apps/dashboard/dist, and perform Dashboard UAT with follow-up Beads for UX gaps.","created_at":"2026-06-05T10:12:58Z"}]}
 {"id":"av-3j8","title":"investigate Pi gpt-5.5 subscription reasoning effort control","description":"Goal: determine what reasoning/thinking level Pi uses when gpt-5.5 (subscription) is selected, and what AgentV/provider changes are needed so users can set it to medium. Acceptance: inspect existing Pi provider/target config support and any Pi CLI/API flags/env/config for reasoning effort; run safe local probes if available; document the observed default behavior for gpt-5.5 subscription; identify whether medium can be selected today; if missing, propose or implement the smallest AgentV change to expose medium reasoning for Pi without over-broad provider knobs; add focused tests/docs if code changes are made; record evidence and commands in Beads.","status":"closed","priority":1,"issue_type":"task","assignee":"entity","created_at":"2026-06-05T13:26:00.566552167Z","created_by":"entity","updated_at":"2026-06-05T13:51:20.964410603Z","closed_at":"2026-06-05T13:51:20.963272557Z","close_reason":"Completed investigation and pushed docs/tests on spike/av-3j8-pi-reasoning. Runtime evidence shows Pi gpt-5.5 supports medium and defaults to medium through the Pi SDK; AgentV can select it today via thinking: medium. 
Commit: 10dad6c8 docs(pi): document thinking level config.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["codex","pi","providers","reasoning"],"comments":[{"id":71,"issue_id":"av-3j8","author":"entity","text":"bead-spawn-agent launched an agent for av-3j8.\n\nSession: agent-av-3j8-main-20260605152735\nDirectory: /home/entity/projects/EntityProcess/agentv.worktrees/spike-av-3j8-pi-reasoning\nProfile: codex-eng (auto-detected if not specified)\n\nExported EP_TASK_ID, BEAD_ID, and AGENTV_BEAD_ID as av-3j8.\nBeads coordination checkout: /home/entity/projects/EntityProcess/agentv\nWorktree: /home/entity/projects/EntityProcess/agentv.worktrees/spike-av-3j8-pi-reasoning","created_at":"2026-06-05T13:27:36Z"},{"id":74,"issue_id":"av-3j8","author":"entity","text":"Investigation evidence and outcome:\n- Worktree base verified with git fetch origin; HEAD and origin/main are both a5452d8c32314f8de256a5d27d91802b35f3e7df.\n- AgentV runtime already supports Pi thinking control: packages/core/src/evaluation/providers/targets.ts resolves target thinking/pi_thinking for both pi-coding-agent and pi-cli; pi-coding-agent passes it to createAgentSession as thinkingLevel; pi-cli emits --thinking <value>.\n- Local Pi CLI probe: pi --help on pi 0.78.1 lists --thinking <level> with off, minimal, low, medium, high, xhigh, and supports model shorthand like --model sonnet:high.\n- Local Pi SDK/package probe: @earendil-works/pi-coding-agent DEFAULT_THINKING_LEVEL is medium. For @earendil-works/pi-ai gpt-5.5, getSupportedThinkingLevels returns off, low, medium, high, xhigh; clampThinkingLevel(gpt-5.5, medium) returns medium.\n- Answer: when AgentV selects pi-coding-agent subprovider openai-codex/model gpt-5.5 and does not set thinking, Pi SDK default is medium. 
Medium can be selected today with thinking: medium (or pi_thinking: medium) for pi-coding-agent, and with thinking: medium for pi-cli which becomes --thinking medium.\n- Smallest useful AgentV change implemented: docs now expose existing Pi target fields and gpt-5.5 subscription example; focused tests now lock medium target resolution for pi-coding-agent and pi-cli.\n- Verification: initial focused test run failed before targets.test.ts due missing fast-glob in incomplete node_modules; ran bun install; reran bun test packages/core/test/evaluation/providers/targets.test.ts packages/core/test/evaluation/providers/pi-coding-agent.test.ts packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts -> 71 pass, 0 fail.\n","created_at":"2026-06-05T13:48:04Z"}]}
 {"id":"av-3yr","title":"public demo: browser UAT for public Dashboard setup","description":"Follow-up after av-7m2/av-3j2. Current evidence verifies clean Dashboard setup through APIs and remote-sync endpoints, but not full browser UAT. Acceptance: rebuild Dashboard frontend, launch clean public demo setup with AGENTV_HOME isolated from private projects, use agent-browser to verify the projects page shows only public projects, remote-synced finance/SWE runs appear, run detail pages open, and any UX/core gaps found with realistic public data are captured as separate Beads with screenshots/evidence.","status":"closed","priority":1,"issue_type":"task","assignee":"entity","created_at":"2026-06-05T12:50:04.513195108Z","created_by":"entity","updated_at":"2026-06-06T04:10:34.509360995Z","closed_at":"2026-06-06T03:40:35.416546030Z","close_reason":"Completed browser UAT for public Dashboard setup. Remote result sync works for finance and SWE public result repos; detail materialization works. Screenshots saved to agentv-assets-private dogfood/av-3yr-public-dashboard-uat. Follow-up bugs opened: av-fgt for stale public setup config shape and av-jk9 for remote run list count/source affordance issues.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["dashboard","public-demo","uat"],"comments":[{"id":78,"issue_id":"av-3yr","author":"entity","text":"Public Dashboard UAT completed 2026-06-06 with isolated config home `/tmp/agentv-public-uat-home` and Dashboard on localhost:3219. 
Preflight: rebuilt `apps/dashboard/dist` with `cd apps/dashboard && bun run build`; source setup synced public repos; current AgentV required manual config rewrite to `projects[].results` because agentv-deploy still emits stale `projects.yaml`/`results_by_project` shape (follow-up `av-fgt`).\n\nRemote result sync verification:\n- `/api/projects` listed exactly 3 projects: agentv, financial-research-agent, swe-evals.\n- `POST /api/projects/financial-research-agent/remote/sync` returned configured/available true for `christso/financial-research-agent-evals`, path `/home/entity/projects/EntityProcess/financial-research-agent-evals`, run_count=2.\n- `POST /api/projects/swe-evals/remote/sync` returned configured/available true for `EntityProcess/swe-evals-results`, path `/home/entity/projects/EntityProcess/swe-evals-results`, run_count=2.\n- Remote detail materialization worked: finance remote live run returned 1 result; SWE remote live run returned 3 results.\n\nCanonical result repo commits verified:\n- `christso/financial-research-agent-evals@954e1fd` with `.agentv/results/runs/av-h60-live-codex-azure/2026-06-05T14-15-35-082Z`.\n- `EntityProcess/swe-evals-results@72ffa07` with `.agentv/results/runs/av-h60-live-codex-azure/2026-06-05T14-18-58-279Z`.\n\nBrowser UX evidence saved under agentv-assets-private `dogfood/av-3yr-public-dashboard-uat/` screenshots 01-09. UI flows verified: projects page, finance all/remote/detail, SWE all/remote/detail, and Sync Remote Results button. 
UX/product gaps captured as `av-fgt` and `av-jk9`.","created_at":"2026-06-06T03:40:35Z"},{"id":80,"issue_id":"av-3yr","author":"entity","text":"Screenshot evidence pushed in agentv-assets-private commit 67dc6fb (dogfood/av-3yr-public-dashboard-uat/01-09).","created_at":"2026-06-06T03:42:12Z"},{"id":83,"issue_id":"av-3yr","author":"entity","text":"Post-UAT repo ownership update 2026-06-06: finance source/results are now public EntityProcess sibling repos: `EntityProcess/financial-research-agent@90863fe` and `EntityProcess/financial-research-agent-evals@245cd12`. agentv-deploy public demo config pushed at `3a7eb38` with EntityProcess owner references.","created_at":"2026-06-06T04:10:34Z"}]}
@@ -12,7 +13,7 @@
 {"id":"av-ams","title":"feat(dashboard): make remote sync outcome explicit","description":"Dogfood evidence from WTG.AI.Prompts remote sync on 2026-06-06.\\n\\nObservable behavior:\\n- Remote status API returns repo, last_synced_at, and run_count.\\n- Toolbar shows repo and last synced in low-emphasis text, but not remote run count.\\n- Clicking Sync Remote Results changes the button to Syncing..., then silently returns to the same state. There is no success confirmation, no changed count, and no visible failure path unless last_error appears.\\n\\nPlan:\\n- Keep the existing toolbar primitive; add concise status text using existing RemoteStatusResponse: remote run count, last synced, repo.\\n- After sync resolves, show a transient success state such as Synced 1 remote run at <time>.\\n- Preserve lightweight core: no new backend mechanism unless needed; use the POST response plus existing query invalidation.\\n- Make error state prominent and actionable if sync fails.\\n\\nAcceptance:\\n- Remote toolbar communicates configured repo, remote run count, and last sync time.\\n- Successful manual sync produces visible confirmation without requiring page refresh.\\n- Failure state includes the backend error and keeps the remote filter usable when cached data exists.\\n- Coverage exercises WTG-like status (`WiseTechGlobal/WTG.AI.Prompts.EvalResults`, run_count=1).","status":"closed","priority":2,"issue_type":"feature","assignee":"entity","created_at":"2026-06-06T05:15:12.477765873Z","created_by":"entity","updated_at":"2026-06-06T22:28:09.727920245Z","closed_at":"2026-06-06T22:28:09.727663439Z","close_reason":"Merged via PR #1313 (feat(dashboard): clarify remote sync outcome).","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["dashboard","remote-sync","ux"],"comments":[{"id":143,"issue_id":"av-ams","author":"entity","text":"Launching NTM-managed Codex worker after tmux recovery cleanup. 
Session: agentv-av-ams-sync-outcome. Implementation checkout: /home/entity/ntm_Dev/agentv-av-ams-sync-outcome on branch feature/av-ams-remote-sync-outcome. Coordination checkout for br only: /home/entity/projects/EntityProcess/agentv. Repo focus: EntityProcess/agentv. Explicit remote sync outcome UX. Monitor with: ntm status agentv-av-ams-sync-outcome; ntm view agentv-av-ams-sync-outcome.","created_at":"2026-06-06T14:28:11Z"},{"id":164,"issue_id":"av-ams","author":"entity","text":"Implemented remote sync outcome UX on feature/av-ams-remote-sync-outcome.\n\nCode changes:\n- Reused RemoteStatusResponse only; no backend payload changes.\n- Dashboard toolbar now renders shared status items for project, remote run count, last sync time, and repo.\n- Manual sync success now says e.g. \"Synced 1 remote run from WiseTechGlobal/WTG.AI.Prompts.EvalResults at <time>.\" and auto-clears after 7s.\n- Manual sync failures and status last_error now include backend error plus action text; cached remote runs remain usable when available.\n- Added WTG-like helper coverage for WiseTechGlobal/WTG.AI.Prompts.EvalResults with run_count=1.\n- Added .ntm/** to Biome ignore so local generated NTM coordination files do not break repo lint.\n\nVerification:\n- bun test apps/dashboard/src/lib/project-sync-status.test.ts\n- bunx biome check apps/dashboard/src/lib/project-sync-status.ts apps/dashboard/src/lib/project-sync-status.test.ts apps/dashboard/src/components/RunSourceToolbar.tsx apps/dashboard/src/routes/index.tsx apps/dashboard/src/routes/projects/$projectId.tsx\n- bun --filter @agentv/dashboard build\n- bun run test\n- agent-browser red/green UAT with WTG-like stub API: /tmp/agentv-av-ams-red.png, /tmp/agentv-av-ams-green.png, /tmp/agentv-av-ams-error.png\n- bun run verify\n\nRed/green notes:\n- Red on origin/main: manual sync confirmation was \"Sync complete: pulled remote results.\" and did not include repo/count/time.\n- Green on this branch: manual sync confirmation includes 
\"Synced 1 remote run from WiseTechGlobal/WTG.AI.Prompts.EvalResults at ...\"; error state shows \"GitHub authentication failed\" plus cached-run/action guidance.","created_at":"2026-06-06T15:37:39Z"},{"id":167,"issue_id":"av-ams","author":"entity","text":"Pushed implementation commit 782d09c0 (feat(dashboard): clarify remote sync outcome) to origin/feature/av-ams-remote-sync-outcome.\n\nPR creation URL from git remote: https://github.com/EntityProcess/agentv/pull/new/feature/av-ams-remote-sync-outcome\n\nFinal pre-push hook also passed: agentv/core typecheck, phoenix-adapter typecheck, agentv typecheck, and biome check .","created_at":"2026-06-06T15:41:39Z"}]}
 {"id":"av-ch1","title":"fix(dashboard): polish remote result UX flows","description":"User requested a Dashboard UX pass and fixes for common flows, especially tags, remote syncing, combining multiple runs into one run, deleting uncombined/source runs, and related interactions. Acceptance: dogfood current UX, implement focused fixes for discovered gaps, verify with tests/browser evidence, push branch, and open a PR with explanation and evidence.","status":"closed","priority":1,"issue_type":"bug","assignee":"entity","created_at":"2026-06-07T06:05:43.741136619Z","created_by":"entity","updated_at":"2026-06-07T08:26:32.176449723Z","closed_at":"2026-06-07T08:26:32.176296659Z","close_reason":"Acceptance satisfied. PR #1318 fix(dashboard): clarify remote run actions merged at d9ba66b8b677b65900b0e145933ee43c716f820f; verification in comment #220 includes Dashboard tests 61 pass, build, Biome, browser UAT, pre-push hook; private visual evidence pushed at agentv-assets-private@0d6997d under dogfood/av-ch1-dashboard-ux/.","closed_by_session":"agentv-gap-orchestrator","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["dashboard","remote-sync","ux"],"comments":[{"id":220,"issue_id":"av-ch1","author":"entity","text":"Implemented Dashboard UX fixes and opened draft PR #1318.\n\nBranch/commit:\n- fix/av-ch1-dashboard-ux @ ec86d5b9\n- PR: https://github.com/EntityProcess/agentv/pull/1318\n\nUX gaps fixed:\n- Run combine/delete action bar was terse and error-only; now it explains selectable local runs, keeps remote runs read-only, and shows success/error feedback.\n- After combining runs, Dashboard now shows the combined run name, an Open combined run link, and a Delete source runs action for the uncombined local source runs.\n- Delete source runs confirmation says the combined run remains; verified with disposable local fixture.\n- Remote tag editor now explains that remote tag edits are local 
metadata until Sync Metadata pushes them to the results repo.\n- Remote sync button label now reflects state: Sync Metadata for dirty metadata, Push Results when local results are ahead, Sync Project otherwise.\n\nEvidence:\n- Private visual evidence pushed at agentv-assets-private@0d6997d under dogfood/av-ch1-dashboard-ux/.\n- Red screenshots: red-finance-runs.png, red-finance-analytics-tags.png.\n- Green screenshots: green-fixture-runs-selected.png, green-fixture-combined-feedback.png, green-fixture-source-runs-deleted.png, green-finance-remote-tag-editor.png.\n- Combine/delete UAT used disposable /tmp/agentv-av-ch1-combine-project, not real demo runs.\n- Remote tag editor was opened without saving changes.\n\nVerification:\n- bunx biome check changed Dashboard files: passed.\n- bun --filter @agentv/dashboard test: 61 pass, 0 fail.\n- bun --filter @agentv/dashboard build: passed.\n- Browser UAT via agent-browser against temporary http://127.0.0.1:3238: passed for combine, open combined run affordance, delete source runs cleanup, and remote tag editor guidance.\n- Pre-push hook passed typecheck and biome check.\n\nCleanup:\n- Temporary UAT server on port 3238 was stopped. Did not kill tmux/NTM sessions.","created_at":"2026-06-07T06:43:25Z"}]}
 {"id":"av-eq3","title":"demo: AgentV GitHub remote sync showcase","description":"Prepare an impressive AgentV Dashboard demo with multiple projects, multiple remote-backed runs, GitHub result syncing, live Codex-agent/Azure-grader evidence, and dogfooded project-level Sync Project metadata push. Use isolated demo config/home; preserve guarded NTM kill policy. Acceptance: Dashboard URL is live; projects and run counts are documented; remote sync succeeds for all configured result repos; at least one remote tag metadata edit is pushed to GitHub through project-level sync; screenshots or API evidence are saved; Bead is updated with branch/commit/evidence/blockers.","status":"closed","priority":1,"issue_type":"task","assignee":"entity","created_at":"2026-06-07T05:50:36.396636877Z","created_by":"entity","updated_at":"2026-06-07T09:52:01.430201855Z","closed_at":"2026-06-07T08:26:31.877305288Z","close_reason":"Acceptance satisfied. Demo Dashboard is live at http://127.0.0.1:3227 with five intended projects; remote status/sync evidence for all five result repos is saved; finance metadata tags were pushed and verified on EntityProcess/financial-research-agent-evals@fe5f3df; live Codex-agent/Azure-grader proof is recorded; private evidence is pushed at agentv-assets-private@91bc585 under dogfood/av-eq3-github-sync-demo/. Caveat preserved: Dashboard counts include local plus remote runs.","closed_by_session":"agentv-gap-orchestrator","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["dashboard","demo","github","remote-sync"],"comments":[{"id":217,"issue_id":"av-eq3","author":"entity","text":"Launching EP-owned subagent session agentv-demo-github-sync for AgentV GitHub remote sync demo handoff. PROMPT_UID=demo-sync-20260607-0550.","created_at":"2026-06-07T05:51:49Z"},{"id":218,"issue_id":"av-eq3","author":"entity","text":"Accepted delegated implementation/dogfood handoff. 
PROMPT_UID=demo-sync-20260607-0550. Scope: stabilize clean Dashboard demo URL, capture API/browser evidence, verify GitHub metadata sync and live Codex-agent/Azure-grader proof, then report back without closing the bead.","created_at":"2026-06-07T05:53:38Z"},{"id":219,"issue_id":"av-eq3","author":"entity","text":"Dogfood verification complete for PROMPT_UID=demo-sync-20260607-0550. Did not kill tmux/NTM sessions.\n\nDashboard URL:\n- http://127.0.0.1:3227\n- Reused the existing server on port 3227. Removed only the generated sixth implementation-worktree entry from /tmp/agentv-github-sync-demo-home/config.yaml; /api/projects now returns exactly the five intended projects.\n\nProjects and run counts:\n- agentv -> EntityProcess/agentv-examples-eval-results: remote sync run_count=8; Dashboard card total=63 because local + remote runs are both listed.\n- financial-research-agent -> EntityProcess/financial-research-agent-evals: remote sync run_count=2; Dashboard card total=10 because local + remote runs are both listed.\n- swe-evals -> EntityProcess/swe-evals-results: remote sync run_count=1; Dashboard card total=4.\n- wtg-ai-prompts -> WiseTechGlobal/WTG.AI.Prompts.EvalResults: remote sync run_count=1; Dashboard card total=100.\n- wisetechacademy-evals -> WiseTechGlobal/WiseTechAcademy.EvalResults: remote sync run_count=3; Dashboard card total=12.\n\nRemote sync API verification:\n- Saved GET /api/projects/:id/remote/status and POST /api/projects/:id/remote/sync JSON for all five projects under /home/entity/projects/EntityProcess/agentv-assets-private/dogfood/av-eq3-github-sync-demo/.\n- All five sync responses: configured=true, available=true, sync_status=clean.\n- The re-run sync calls were clean and did not create additional commits: commit_created=false, push_performed=false.\n\nGitHub metadata proof:\n- Finance tag metadata path: .agentv/results/metadata/runs/av-h60-live-codex-azure/2026-06-05T14-15-35-082Z/tags.json.\n- Local HEAD and origin/main for 
EntityProcess/financial-research-agent-evals both resolve to fe5f3df, subject chore(results): sync local result metadata.\n- git show origin/main:<tags path> contains tags [live-codex, azure-graded, github-sync-demo, 2026-06-07].\n- Dashboard API shows the pushed tags immediately: tags and remote_tags both match; metadata_dirty=false. No cache bug observed.\n\nLive LLM proof:\n- Evidence file finance-live-codex-azure-proof.json is redacted and omits prompts/answers/secrets.\n- Benchmark metadata targets=[codex].\n- Transcript source provider=codex, target=codex, token usage input=33279 output=939 cached=3456, duration_ms=65673.\n- Score name=rubrics, type=llm-grader, grader target=azure, token usage input=995 output=335, duration_ms=2977.\n\nScreenshots/evidence:\n- Browser screenshots captured with agent-browser session av-eq3-demo, then session closed.\n- dashboard-projects.png: clean five-project Dashboard view.\n- dashboard-finance-live-run.png: finance remote run detail with repo and codex target.\n- Private assets commit pushed: agentv-assets-private@91bc585 (dogfood: add av-eq3 github sync demo evidence).\n\nVerification commands used:\n- curl http://127.0.0.1:3227/api/projects\n- curl GET/POST http://127.0.0.1:3227/api/projects/<id>/remote/{status,sync}\n- curl http://127.0.0.1:3227/api/projects/<id>/runs and finance run detail endpoint\n- jq over finance benchmark.json, index.jsonl, and transcript.jsonl for redacted live proof\n- git -C /home/entity/projects/EntityProcess/financial-research-agent-evals fetch origin && git show origin/main:<tags path>\n- agent-browser --session av-eq3-demo screenshots\n\nBlockers/caveats:\n- No blocker for demo. 
Important caveat: Dashboard card/run-list counts currently include local project checkout runs in addition to remote results repo runs, so they are intentionally larger than the remote sync run_count values.\n- Source code was not modified.","created_at":"2026-06-07T06:03:20Z"},{"id":221,"issue_id":"av-eq3","author":"entity","text":"PROMPT_UID=stop-orch-20260607-0818 stop-status handoff. Orchestration stopped per user instruction: I did not create new Beads, launch NTM/EP subagents, merge additional PRs, or run cleanup after the stop request.\n\nAlready completed before stop:\n- PR #1318 merged: https://github.com/EntityProcess/agentv/pull/1318, merge commit d9ba66b8b677b65900b0e145933ee43c716f820f, title fix(dashboard): clarify remote run actions.\n- Beads touched: av-eq3 has prior demo handoff comments #217-#219; av-ch1 was created/claimed for dashboard UX fixes and comment #220 records PR #1318 branch/commit/evidence. No new beads were created for the latest requested gaps.\n- Evidence: av-eq3 demo evidence under /home/entity/projects/EntityProcess/agentv-assets-private/dogfood/av-eq3-github-sync-demo/, private commit 91bc585. av-ch1 UX evidence under /home/entity/projects/EntityProcess/agentv-assets-private/dogfood/av-ch1-dashboard-ux/, private commit 0d6997d.\n- Demo/server state: port 3227 is listening under bun PID 894879. Port 3238 has no listener. Browser sessions listed: agentv-dashboard-dev, default.\n- Worktree state: primary checkout /home/entity/projects/EntityProcess/agentv is dirty only in .beads/issues.jsonl; PR worktree /home/entity/projects/EntityProcess/agentv.worktrees/fix-av-ch1-dashboard-ux is clean and at d9ba66b8 on main. Existing NTM/worktree list includes multiple prior agent worktrees; I did not inspect or clean them after stop.\n- Open PRs observed: only PR #1280, draft, DIRTY merge state, head docs/phoenix-integration-completion-plan, last updated 2026-06-03.\n\nNew user-requested work items not delegated yet:\n1. 
Mobile Dashboard UX: make runs/run-detail tables one-line non-wrapping with horizontal scroll or convert to mobile-friendly layouts, covering wtg-ai-prompts runs and finance run detail pages.\n2. Execution vs quality failures: distinguish execution errors from quality errors, exclude execution errors from scores, and investigate frequency plus smarter retry strategy for malformed AI outputs.\n3. WTG.AI.Prompts realism: create/localize realistic runs for PR WiseTechGlobal/WTG.AI.Prompts#679 using existing/shallow CargoWise checkout strategy where feasible.\n4. Branding: change AgentV/AGENTV presentation to capital-case AgentV with cyan A and V across dashboard, docs, and landing page, with visual dogfood before PR.\n\nWaiting for orchestrator instructions.","created_at":"2026-06-07T08:20:31Z"},{"id":234,"issue_id":"av-eq3","author":"entity","text":"Cleanup note for agentv-demo-github-sync. User explicitly authorized cleanup. Kill gate passed: av-eq3 is CLOSED with evidence at agentv-assets-private@91bc585; target session agentv-demo-github-sync is not the current tmux session; current session is agent-orchestrator; Agent Mail health returned ok; ntm status shows no active assignments and no file locks; target pane is stale after stop-status handoff PROMPT_UID=stop-orch-20260607-0818. Protected survivor sessions verified before kill: agent-orchestrator, agentv-gap-orchestrator, agentv--gap-mobile, agentv--gap-errors, agentv--gap-wtg, agentv--gap-branding. Evidence captured at /tmp/ntm-kill-evidence/agentv-demo-github-sync-20260607T095003Z. Next step is a separate guarded command: timeout 30 ntm kill agentv-demo-github-sync --force, followed immediately by survivor verification.","created_at":"2026-06-07T09:50:47Z"},{"id":235,"issue_id":"av-eq3","author":"entity","text":"Post-cleanup verification for agentv-demo-github-sync. Guarded cleanup completed with evidence at /tmp/ntm-kill-evidence/agentv-demo-github-sync-20260607T095003Z. 
Command run separately after cleanup note: timeout 30 ntm kill agentv-demo-github-sync --force. Post-check verified target session is gone and protected survivors remain present: agent-orchestrator, agentv-gap-orchestrator, agentv--gap-mobile, agentv--gap-errors, agentv--gap-wtg, agentv--gap-branding. ntm list now reports 6 sessions and no agentv-demo-github-sync. Agent Mail health remains ok. No incident observed.","created_at":"2026-06-07T09:52:01Z"}]}
-{"id":"av-eval-output-config-surface-4e2","title":"cli/config: simplify eval output surface","description":"Problem:\nAgentV's eval output/config surface is bloated and confusing. The current CLI/config paths include canonical --output <dir>, deprecated --out <path>, deprecated --artifacts <dir>, deprecated --output-format, config output.dir fallback, --export for extra files, JUnit -o in eval run, and run bundle artifact generation. This makes it hard to explain what writes index.jsonl, what writes JUnit, and what is the canonical run folder.\n\nUser direction:\n- We should remove --out.\n- Simplify the config surface.\n- Any breaking changes must require an explicit version bump and migration notes.\n\nAcceptance:\n- Audit the current eval output/config surface in CLI, docs, examples, Dashboard launch paths, and known GitHub workflow consumers.\n- Propose and implement a simpler target contract centered on canonical --output <dir> for artifact directories and --export for additional output files.\n- Remove or schedule removal of deprecated --out with a compatibility/versioning plan; do not silently break users in a patch/minor release.\n- Decide whether deprecated --artifacts and --output-format are removed in the same breaking-change window or only receive stronger warnings.\n- Preserve JUnit -o semantics for eval run if it is intentionally distinct, or rename/document it if it conflicts with --output mental model.\n- If behavior is breaking, include package version bump, changelog/migration note, and docs updates in the same PR.\n- Add/adjust CLI tests covering removed/deprecated flags, canonical index.jsonl placement, explicit output directories, --export behavior, and helpful error messages.\n- Include migration notes for known consumers: WiseTechGlobal/sdd uses --artifacts, WiseTechGlobal/WTG.AI.Prompts uses --output .agentv/results/artifacts.\n\nDependencies / related:\n- Related to av-wy0.1 because canonical run bundle behavior changes how explicit 
outputs are handled.\n- Related to av-wy0 because the run folder should be the audit boundary.\n\nNon-goal:\n- Do not change run manifest or dashboard file visibility in this task; those remain av-wy0.2/av-wy0.3.","status":"open","priority":2,"issue_type":"task","created_at":"2026-06-08T12:55:32.788635265Z","created_by":"entity","updated_at":"2026-06-08T13:42:12.557601242Z","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["breaking-change","cli","config","run-bundles"],"dependencies":[{"issue_id":"av-eval-output-config-surface-4e2","depends_on_id":"av-wy0","type":"related","created_at":"2026-06-08T12:56:04.175910502Z","created_by":"entity","metadata":"{}","thread_id":""},{"issue_id":"av-eval-output-config-surface-4e2","depends_on_id":"av-wy0.1","type":"related","created_at":"2026-06-08T12:56:04.440026330Z","created_by":"entity","metadata":"{}","thread_id":""}],"comments":[{"id":280,"issue_id":"av-eval-output-config-surface-4e2","author":"FuchsiaStream","text":"Run bundle design linkage (FuchsiaStream, 2026-06-08): av-wy0 should not remove or reinterpret deprecated output flags. V1 bundle mapping is `--output <dir>` as canonical bundle root, deprecated `--artifacts <dir>` as the same root until cleanup, deprecated `--out <path>` preserving current semantics with bundle files best-effort in `dirname(<path>)`, and `--export` as extra files only. Removing `--out` or changing precedence/warnings waits for this bead with an explicit version bump and migration notes.","created_at":"2026-06-08T13:22:57Z"},{"id":283,"issue_id":"av-eval-output-config-surface-4e2","author":"FuchsiaStream","text":"Clarification from run-bundle schema discussion (FuchsiaStream, 2026-06-08): hard deprecation is acceptable for this-week/unreleased artifact names like `run-source.json` -> `run_source.json`. 
That does not automatically apply to established CLI output flags such as `--out`/`--artifacts`, which still need the audit/version-bump/migration plan in this bead because they have known consumers.","created_at":"2026-06-08T13:42:12Z"}]}
+{"id":"av-eval-output-config-surface-4e2","title":"cli/config: simplify eval output surface","description":"Problem:\nAgentV's eval output/config surface is bloated and confusing. The current CLI/config paths include canonical --output <dir>, deprecated --out <path>, deprecated --artifacts <dir>, deprecated --output-format, config output.dir fallback, --export for extra files, JUnit -o in eval run, and run bundle artifact generation. This makes it hard to explain what writes index.jsonl, what writes JUnit, and what is the canonical run folder.\n\nUser direction:\n- We should remove --out.\n- Simplify the config surface.\n- Any breaking changes must require an explicit version bump and migration notes.\n\nAcceptance:\n- Audit the current eval output/config surface in CLI, docs, examples, Dashboard launch paths, and known GitHub workflow consumers.\n- Propose and implement a simpler target contract centered on canonical --output <dir> for artifact directories and --export for additional output files.\n- Remove or schedule removal of deprecated --out with a compatibility/versioning plan; do not silently break users in a patch/minor release.\n- Decide whether deprecated --artifacts and --output-format are removed in the same breaking-change window or only receive stronger warnings.\n- Preserve JUnit -o semantics for eval run if it is intentionally distinct, or rename/document it if it conflicts with --output mental model.\n- If behavior is breaking, include package version bump, changelog/migration note, and docs updates in the same PR.\n- Add/adjust CLI tests covering removed/deprecated flags, canonical index.jsonl placement, explicit output directories, --export behavior, and helpful error messages.\n- Include migration notes for known consumers: WiseTechGlobal/sdd uses --artifacts, WiseTechGlobal/WTG.AI.Prompts uses --output .agentv/results/artifacts.\n\nDependencies / related:\n- Related to av-wy0.1 because canonical run bundle behavior changes how explicit 
outputs are handled.\n- Related to av-wy0 because the run folder should be the audit boundary.\n\nNon-goal:\n- Do not change run manifest or dashboard file visibility in this task; those remain av-wy0.2/av-wy0.3.","status":"closed","priority":2,"issue_type":"task","assignee":"entity","created_at":"2026-06-08T12:55:32.788635265Z","created_by":"entity","updated_at":"2026-06-09T03:54:01.759369830Z","closed_at":"2026-06-09T01:05:58.685668014Z","close_reason":"Implemented eval output surface simplification with migration notes, version bump, tests, UAT, and review.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["breaking-change","cli","config","run-bundles"],"dependencies":[{"issue_id":"av-eval-output-config-surface-4e2","depends_on_id":"av-wy0","type":"related","created_at":"2026-06-08T12:56:04.175910502Z","created_by":"entity","metadata":"{}","thread_id":""},{"issue_id":"av-eval-output-config-surface-4e2","depends_on_id":"av-wy0.1","type":"related","created_at":"2026-06-08T12:56:04.440026330Z","created_by":"entity","metadata":"{}","thread_id":""}],"comments":[{"id":280,"issue_id":"av-eval-output-config-surface-4e2","author":"FuchsiaStream","text":"Run bundle design linkage (FuchsiaStream, 2026-06-08): av-wy0 should not remove or reinterpret deprecated output flags. V1 bundle mapping is `--output <dir>` as canonical bundle root, deprecated `--artifacts <dir>` as the same root until cleanup, deprecated `--out <path>` preserving current semantics with bundle files best-effort in `dirname(<path>)`, and `--export` as extra files only. 
Removing `--out` or changing precedence/warnings waits for this bead with an explicit version bump and migration notes.","created_at":"2026-06-08T13:22:57Z"},{"id":283,"issue_id":"av-eval-output-config-surface-4e2","author":"FuchsiaStream","text":"Clarification from run-bundle schema discussion (FuchsiaStream, 2026-06-08): hard deprecation is acceptable for this-week/unreleased artifact names like `run-source.json` -> `run_source.json`. That does not automatically apply to established CLI output flags such as `--out`/`--artifacts`, which still need the audit/version-bump/migration plan in this bead because they have known consumers.","created_at":"2026-06-08T13:42:12Z"},{"id":288,"issue_id":"av-eval-output-config-surface-4e2","author":"GentleCompass","text":"Worktree setup confirmed (Codex 2026-06-09): created /home/entity/projects/EntityProcess/agentv.worktrees/av-output-config on feat/av-eval-output-config-surface from origin/main 595394c3. Proceeding with read-only audit of eval output/config flags and consumers before reserving/editing exact files.","created_at":"2026-06-09T00:16:21Z"},{"id":289,"issue_id":"av-eval-output-config-surface-4e2","author":"GentleCompass","text":"Decision note before code changes (Codex 2026-06-09): canonical eval-run output contract will be `--output <dir>` / config `output.dir` as run artifact directory, always writing `<dir>/index.jsonl`; `--export <file>` remains the only extra-file surface, including JUnit XML via `--export results.xml`. Breaking cleanup in this branch: remove behavior for `--out`, `--artifacts`, `--output-format`, and config `output.format`; keep `--benchmark-json` as deprecated/scheduled because it was discovered outside this Bead's listed known surface. `-o` remains only a short alias for `--output <dir>`; file-looking `--output`/`-o` values such as .jsonl/.xml/.html will fail with migration guidance. Package prerelease bump planned: 4.32.0-next.1 -> 4.32.0-next.2. 
Repo decision/migration note added at docs/plans/2026-06-09-eval-output-surface.md.","created_at":"2026-06-09T00:25:48Z"},{"id":290,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Implementation + verification update (Codex 2026-06-09): simplified eval output to canonical --output <dir> / config output.dir writing <dir>/index.jsonl, with --export <file> for extra JSON/JUnit/YAML/HTML files. Removed runtime behavior for --out, --artifacts, and --output-format with hard migration errors; config output.format is removed from the schema. Preserved -o only as short alias for --output <dir>, and added file-like --output validation so old -o results.xml guidance points to --export results.xml. Known consumer migration covered: WiseTechGlobal/sdd shape '-o junit.xml --artifacts artifacts' now errors with '--output artifacts --export junit.xml'; WiseTechGlobal/WTG.AI.Prompts already uses --output <dir>. Version bumped 4.32.0-next.1 -> 4.32.0-next.2 and migration note added in docs/plans/2026-06-09-eval-output-surface.md. Verification: bun run build; bun test apps/cli/test/eval.integration.test.ts; bun run typecheck; bun run lint; bun run validate:examples; bun run test. Manual red/green UAT: origin/main accepted --out legacy.jsonl, exit 0, and wrote the flat file; this branch rejects --out with migration guidance and creates no flat file; replacement --output artifacts --export junit.xml exits 0 and writes artifacts/index.jsonl plus junit.xml. 
Follow-up bead av-33j created for future --benchmark-json removal.","created_at":"2026-06-09T00:57:44Z"},{"id":291,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Additional verification update (Codex 2026-06-09): added packages/core/test/evaluation/config.test.ts coverage that defineConfig rejects removed output.format; reran bun test packages/core/test/evaluation/config.test.ts and bun run lint successfully after the test addition.","created_at":"2026-06-09T01:01:57Z"},{"id":292,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Final verification update (Codex 2026-06-09): reran bun run test after the config test addition; current tree passes core/eval/phoenix-adapter/cli/dashboard tests. ce-code-review final diff pass found no blocking issues.","created_at":"2026-06-09T01:04:50Z"},{"id":316,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Post-rebase shipping verification (Codex 2026-06-09): Agent Mail recheck for project_key /home/entity/projects/EntityProcess/agentv returned no conflicts for intended paths; reservations held by FrostyCompass through 2026-06-09T04:35:48Z. Branch feat/av-eval-output-config-surface is based on current origin/main d678615b after rebase; public AGENTS.md diff contains only canonical command guidance, no local/private paths. Verification after rebase: bun run build; bun run typecheck; bun run lint; bun run validate:examples; bun test apps/cli/test/eval.integration.test.ts packages/core/test/evaluation/config.test.ts apps/cli/test/commands/eval/artifact-writer.test.ts apps/cli/test/commands/eval/task-bundle.test.ts; bun run test. Manual UAT after rebase: --out exits 1 with migration guidance and creates no flat file; '-o junit.xml --artifacts artifacts' exits 1 with --output/--export migration guidance; replacement '--output <dir> --export junit.xml' exits 0 and writes <dir>/index.jsonl plus JUnit XML. 
No blockers remaining for PR.","created_at":"2026-06-09T02:48:15Z"},{"id":317,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Shipping update (Codex 2026-06-09): opened PR #1336 for feat(cli): simplify eval output surface: https://github.com/EntityProcess/agentv/pull/1336. Branch pushed: feat/av-eval-output-config-surface. Verification included post-rebase build/typecheck/lint/validate/examples/focused tests/full test suite and CLI UAT for removed flags plus --output/--export replacement.","created_at":"2026-06-09T02:51:07Z"},{"id":320,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Post-prerequisite-merge rebase verification (Codex 2026-06-09, PROMPT_UID=unblock-output-config-after-merges-20260609): confirmed prerequisite PRs merged to main (#1334 8e6dd1e96f0ac23c5a413768c82403c0535bf905, #1332 083e08c39492b030879a311801d17f6631e909f1, #1331 35263cd707a9a89c95728ae86beb7271b76f2358). Rebasing feat/av-eval-output-config-surface onto origin/main 35263cd707a9a89c95728ae86beb7271b76f2358 completed without conflicts. Post-rebase verification: bun run build; bun run typecheck; bun run lint; bun run validate:examples; bun test apps/cli/test/eval.integration.test.ts packages/core/test/evaluation/config.test.ts apps/cli/test/commands/eval/artifact-writer.test.ts apps/cli/test/commands/eval/task-bundle.test.ts. Manual CLI smoke: --out exits 1 with migration guidance and creates no flat file; '-o junit.xml --artifacts artifacts' exits 1 with --output/--export migration guidance; '--output <dir> --export junit.xml' exits 0 and writes <dir>/index.jsonl plus JUnit XML. Public diff scan for local/private paths/Agent Mail URLs/scripts was clean; public AGENTS.md remains generic. 
Ready to update PR #1336 branch/body; no blockers.","created_at":"2026-06-09T03:27:55Z"},{"id":321,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Post-#1335 rebase/verification update (Codex 2026-06-09, PROMPT_UID=output-config-stale-origin-main-correction-20260609): worktree origin/main was stale at 35263cd707a9a89c95728ae86beb7271b76f2358 before fetch; git fetch origin --prune updated origin/main to f1162312cb7aa645653b51756acfbbed42426929, so the prior apparent clean rebase was against stale origin/main. Rebasing feat/av-eval-output-config-surface onto f1162312cb7aa645653b51756acfbbed42426929 produced one conflict in apps/cli/src/commands/eval/run-eval.ts; resolution preserves #1335 sourceMetadataByEvalFile/rerun-captured-task-bundle behavior and #1336 removed-output-flag migration helpers. New local HEAD before push is 7997dede15320b32e57d2b2a6d3a5c7d1c4a159d. Verification passed: bun run build; bun run typecheck; bun run lint; bun run validate:examples; bun test apps/cli/test/eval.integration.test.ts packages/core/test/evaluation/config.test.ts apps/cli/test/commands/eval/artifact-writer.test.ts apps/cli/test/commands/eval/task-bundle.test.ts apps/cli/test/commands/runs/rerun.test.ts; CLI smoke for removed --out/--artifacts/--output-format/file-like --output errors and --output <dir> plus --export flat JSONL success. Public diff scan for local/private paths was clean.","created_at":"2026-06-09T03:54:01Z"}]}
 {"id":"av-f87","title":"ops(agent-mail): make local server persistent and Tailscale-reachable","description":"Agent Mail repeatedly fails after crashes because the installed launcher expects missing ~/.local/bin/am and the server is started ad hoc on 127.0.0.1. Make startup persistent, make the web UI reachable at http://100.84.193.107:8765/mail, verify health, and document any remaining blocker.","status":"closed","priority":1,"issue_type":"task","assignee":"entity","created_at":"2026-06-06T23:22:44.859402372Z","created_by":"entity","updated_at":"2026-06-06T23:28:10.638440985Z","closed_at":"2026-06-06T23:28:10.638294412Z","close_reason":"Persistent Agent Mail service enabled and verified on localhost and Tailscale; EP orchestrator kill-gate docs committed and pushed.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["agent-mail","ops","tailscale"],"comments":[{"id":184,"issue_id":"av-f87","author":"entity","text":"Started investigation/fix. Hypothesis from operator: tmux crash may be correlated with memory pressure when many sessions finish/push/archive concurrently, not kill itself. Plan: check OOM/journal evidence, make Agent Mail persistent, expose web UI on Tailscale URL, then verify restored sessions and document root cause.","created_at":"2026-06-06T23:23:19Z"},{"id":186,"issue_id":"av-f87","author":"entity","text":"Implemented persistent Agent Mail service: systemd user unit mcp-agent-mail.service now starts /home/entity/.local/share/mcp_agent_mail/scripts/run_server_with_token.sh serve-http --host 0.0.0.0 --port 8765 --path /mcp. Disabled the extra Tailscale proxy because operator confirmed public bind is acceptable behind firewall. 
Verifying localhost and Tailscale health now.","created_at":"2026-06-06T23:26:33Z"},{"id":187,"issue_id":"av-f87","author":"entity","text":"Verification complete: localhost health returned OK; Tailscale health returned OK at http://100.84.193.107:8765/api/health; Tailscale web UI returned HTML at http://100.84.193.107:8765/mail. systemd user service mcp-agent-mail.service is enabled and active, bound to 0.0.0.0:8765. Journal confirms GET /api/health and /mail from 100.84.193.107. Remaining note: no kernel OOM evidence found in journal for the crash window; many tmux scope exits happened in bursts around 00:50/00:56, so memory/concurrency remains plausible but not proven.","created_at":"2026-06-06T23:27:28Z"}]}
 {"id":"av-fgt","title":"fix: update public demo setup for projects results config","description":"Discovered during av-3yr public Dashboard UAT on 2026-06-06.\n\nObservable behavior:\n- `agentv-deploy/scripts/setup-local-agentv-dev.sh` writes `$AGENTV_HOME/projects.yaml` and deprecated top-level `results_by_project` in `$AGENTV_HOME/config.yaml`.\n- Current AgentV reads the project registry from `$AGENTV_HOME/config.yaml` under `projects:` and expects per-project `projects[].results`.\n- Running the printed command from setup with `AGENTV_HOME=/tmp/agentv-public-uat-home PORT=3219 bun apps/cli/src/cli.ts serve` registered only the cwd `agentv` project; `financial-research-agent` and `swe-evals` were missing and `/api/projects/<id>/remote/sync` returned `Project not found`.\n- Manual rewrite of `/tmp/agentv-public-uat-home/config.yaml` to current `projects[].results` format made Dashboard start with 3 projects and source sync succeed.\n\nAcceptance:\n- Update agentv-deploy setup scripts/docs to write current AgentV home config shape.\n- Remove or clearly migrate stale `projects.yaml` / `results_by_project` guidance.\n- Verify a fresh isolated public demo home starts Dashboard with exactly agentv, financial-research-agent, and swe-evals without manual config edits.\n- Add a static validation check that catches this drift.","status":"closed","priority":1,"issue_type":"bug","assignee":"entity","created_at":"2026-06-06T03:38:39.807577978Z","created_by":"entity","updated_at":"2026-06-06T12:26:58.780938770Z","closed_at":"2026-06-06T12:26:58.780803338Z","close_reason":"Fixed in agentv-deploy feature/av-fgt-public-demo-config commit 70cdef1b51b0779d159d4a6ff6b7fd63cf1cca25; verification passed; no blockers.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["config","dashboard","public-demo","uat"],"comments":[{"id":86,"issue_id":"av-fgt","author":"entity","text":"WTG-specific dogfood 
evidence from 2026-06-06: active /home/entity/.agentv/projects.yaml contains wtg-ai-prompts and WiseTechAcademy entries, but current AgentV serve reads projects from $AGENTV_HOME/config.yaml, so AGENTV_HOME=/home/entity/.agentv PORT=3120 bun apps/cli/src/cli.ts serve registered only agentv. A temporary corrected config.yaml with projects[].results made /api/projects show agentv, WTG.AI.Prompts, and WiseTechAcademy.Evals, and /api/projects/wtg-ai-prompts/remote/sync returned configured=true, available=true, repo=WiseTechGlobal/WTG.AI.Prompts.EvalResults, run_count=1. Also inspect agentv-deploy/scripts/run-local-agentv.sh: it writes id wtg-ai-prompts but name/path/source for financial-research-agent, so the private/local runner can display or route the wrong project.","created_at":"2026-06-06T05:15:10Z"},{"id":120,"issue_id":"av-fgt","author":"entity","text":"Launching continuation NTM worker for public demo setup config drift. AgentV coordination/worktree: /home/entity/ntm_Dev/agentv-public-demo-config; deploy implementation worktree: /home/entity/ntm_Dev/agentv-deploy-public-demo-config.","created_at":"2026-06-06T11:55:46Z"},{"id":122,"issue_id":"av-fgt","author":"entity","text":"Continuing Bead av-fgt in deploy checkout /home/entity/ntm_Dev/agentv-deploy-public-demo-config on feature/av-fgt-public-demo-config. Scope: update public demo setup config shape, add static validation, verify fresh public demo home via Dashboard/API.","created_at":"2026-06-06T12:01:08Z"},{"id":128,"issue_id":"av-fgt","author":"entity","text":"Completed public demo config drift fix in agentv-deploy. Branch: feature/av-fgt-public-demo-config. Commit: EntityProcess/agentv-deploy@70cdef1b51b0779d159d4a6ff6b7fd63cf1cca25. 
Changes: setup-local-agentv-dev.sh, run-local-agentv.sh, and docker-entrypoint.sh now write current AgentV home config.yaml with projects[] entries and per-project projects[].results; legacy projects.yaml is archived if present; README stale projects.yaml/results_by_project guidance removed; run-local-agentv.sh financial project id fixed from the WTG/private id to financial-research-agent; validate-config.sh now fails on stale config surfaces and checks the current project/results shape. Verification: sh -n on modified shell scripts; ./scripts/validate-config.sh (static checks + docker compose config); fresh AGENTV_HOME=/tmp/agentv-av-fgt-home via setup-local-agentv-dev.sh --no-sync parsed with exactly agentv, financial-research-agent, swe-evals and no projects.yaml/results_by_project; started current AgentV CLI from reference checkout against that fresh home on PORT=39118 and /api/projects returned exactly agentv, financial-research-agent, swe-evals with remote/status configured for EntityProcess/agentv-examples-eval-results, EntityProcess/financial-research-agent-evals, and EntityProcess/swe-evals-results; exercised run-local-agentv.sh --no-serve --skip-install in /tmp/agentv-av-fgt-run-local and verified ids exactly agentv, financial-research-agent, swe-evals with no wtg-ai-prompts. Note: bun install --frozen-lockfile and bun run build were needed in the AgentV reference checkout for verification; no AgentV core source changes were needed and no tracked AgentV files were changed. Blockers: none.","created_at":"2026-06-06T12:26:39Z"}]}
 {"id":"av-fo9","title":"public demo: build financial-research-agent eval repo","description":"Scope correction for the former dexter-evals companion project.\\n\\nDesign:\\n- The demo subject repository/project is financial-research-agent: a coding/web research agent that attempts to reproduce the public financial-research behavior Dexter demonstrates.\\n- Dexter is used only as an upstream public benchmark fixture: pin virattt/dexter, read src/evals/dataset/finance_agent.csv, and use its Answer column as expected_output/golden answers plus Rubric as AgentV rubric criteria.\\n- Do not require or run Dexter by default. Do not require FINANCIAL_DATASETS_API_KEY for the default public demo path.\\n- Keep an optional dexter-agent compatibility target only for users who explicitly configure the paid Dexter prerequisites.\\n- Rename the companion project from dexter-evals to financial-research-agent, with eval YAML/config/scripts/docs living in that repo/project.\\n- Result sync should publish this project to the public result repository financial-research-agent-evals.\\n\\nAcceptance:\\n- Rename/migrate dexter-evals files and docs to financial-research-agent without losing the Dexter source attribution/pinned commit.\\n- Default AgentV target is financial-research-agent and uses a coding agent with public web research instructions.\\n- Setup/validation pass without DEXTER_REPO_PATH or FINANCIAL_DATASETS_API_KEY for the default target.\\n- Generated evals default to financial-research-agent.\\n- Beads/result-sync/dashboard handoff notes reference financial-research-agent and financial-research-agent-evals, not dexter-evals-results.\\n- Coordinate in /home/entity/projects/EntityProcess/agentv for Beads and edit code in /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration alongside the SWE worker, touching only finance-specific paths unless coordinating 
first.","status":"closed","priority":1,"issue_type":"task","assignee":"BlackMeadow","created_at":"2026-06-04T04:15:56.086604136Z","created_by":"entity","updated_at":"2026-06-06T04:10:33.680784058Z","closed_at":"2026-06-04T10:28:24.838913958Z","close_reason":"Completed: financial-research-agent sibling repo contains the full pinned Dexter finance eval, canonical generator/docs, verified setup/validation, and final docs commit pushed to origin/main.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["evals","finance","public-demo"],"comments":[{"id":37,"issue_id":"av-fo9","author":"BlackMeadow","text":"bead-spawn-agent launched an agent for av-fo9.\n\nSession: agent-av-fo9-main-20260604061758\nDirectory: /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration\nProfile: codex-eng (auto-detected if not specified)\n\nExported EP_TASK_ID, BEAD_ID, and AGENTV_BEAD_ID as av-fo9.\nBeads coordination checkout: /home/entity/projects/EntityProcess/agentv\nWorktree: /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration","created_at":"2026-06-04T04:17:58Z"},{"id":38,"issue_id":"av-fo9","author":"entity","text":"Starting finance companion migration in shared worktree /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration. I will avoid swe-evals/**, root .gitignore, and shared root config. Need to edit the finance companion package.json because the package name and eval script still say dexter-evals/dexter-finance; this is package metadata scoped to the finance companion, not shared package manager state.","created_at":"2026-06-04T04:22:20Z"},{"id":40,"issue_id":"av-fo9","author":"entity","text":"Implemented finance companion migration in shared integration worktree. Renamed dexter-evals/ to financial-research-agent/ and renamed evals/dexter-finance-smoke.eval.yaml to evals/financial-research-agent-smoke.eval.yaml. 
Updated companion README, package scripts/name, setup messages, generated eval defaults, eval suite name/tags, default target docs, and public result repo reference to financial-research-agent-evals. Dexter remains only as pinned fixture/golden-answer source and optional dexter-agent compatibility target.\\n\\nVerification evidence: default setup passed with DEXTER_REPO_PATH and FINANCIAL_DATASETS_API_KEY explicitly unset using dummy non-secret grader/model env: AGENT_TARGET=financial-research-agent, GRADER_TARGET=openai-grader, CODEX_EXECUTABLE=codex-eng, CODEX_MODEL=gpt-5.5, CODEX_WORKSPACE_DIR=.agentv/codex-workspaces, CODEX_LOG_DIR=.agentv/logs/codex, OPENAI_MODEL=dummy-grader-model. AgentV validation passed for financial-research-agent/evals/financial-research-agent-smoke.eval.yaml and financial-research-agent/.agentv/targets.yaml: 2 valid, 0 invalid. Dry-run target resolution also completed with the same default target and no Dexter/Financial Datasets env; it resolved to financial-research-agent-dry-run and only failed quality score because the dry-run mock answer does not satisfy the real rubric. Cleaned the generated root OTEL artifact afterward.\\n\\nTouched surface: finance companion rename/content only. I did not modify swe-evals/**, root .gitignore, shared root config, or package manager files outside the finance companion package.json.","created_at":"2026-06-04T04:28:37Z"},{"id":41,"issue_id":"av-fo9","author":"BlackMeadow","text":"Scope refinement from user: commit generated AgentV YAML for the full Dexter finance_agent.csv dataset, not just a hand-picked smoke subset. The generator remains canonical for refreshes, but the checked-in eval YAML should cover every CSV row. 
A smaller smoke/test-id run may be used only for quick verification, not as the committed dataset boundary.","created_at":"2026-06-04T05:21:39Z"},{"id":42,"issue_id":"av-fo9","author":"entity","text":"Resuming finance bead for refinement: committed YAML should cover every row in Dexter's pinned finance_agent.csv, not a hand-picked smoke subset. I will update the generator so the full dataset is the default, rename docs/eval paths away from smoke as the dataset boundary, regenerate the committed eval YAML from the pinned Dexter CSV, validate full YAML plus targets, and stop before committing. Beads mutations remain in /home/entity/projects/EntityProcess/agentv.","created_at":"2026-06-04T05:23:31Z"},{"id":49,"issue_id":"av-fo9","author":"entity","text":"Migrated latest preserved financial-research-agent artifact into sibling repo /home/entity/projects/EntityProcess/financial-research-agent. Confirmed sibling repo was clean on main before migration at c649fd847659b1aa2c19280016c3956fda2d6847. During copy, rsync --delete removed the repo .git metadata; recovered it from https://github.com/christso/financial-research-agent.git, whose HEAD matched the pre-copy hash. No commit made.\\n\\nMigration result: integration worktree copy remains preserved; sibling repo now matches the preserved artifact content except only README.md is dirty against origin/main. The full eval YAML is present at evals/financial-research-agent.eval.yaml with 50 tests and source_row entries through 50. 
Generator remains canonical: running DEXTER_REPO_PATH=/tmp/dexter-pinned-8d9419829f443f84b804d033bb2c3b1fbd788629 DEXTER_COMMIT=8d9419829f443f84b804d033bb2c3b1fbd788629 bun run scripts/generate-eval-from-dexter.ts --out /tmp/financial-research-agent.regenerated.eval.yaml produced 50 tests and cmp matched the committed eval YAML byte-for-byte.\\n\\nVerification in sibling repo: default setup passed with DEXTER_REPO_PATH and FINANCIAL_DATASETS_API_KEY explicitly unset using dummy non-secret env (AGENT_TARGET=financial-research-agent, GRADER_TARGET=openai-grader, CODEX_EXECUTABLE=codex-eng, CODEX_MODEL=gpt-5.5, CODEX_WORKSPACE_DIR=.agentv/codex-workspaces, CODEX_LOG_DIR=.agentv/logs/codex, OPENAI_MODEL=dummy-grader-model). AgentV validation passed for evals/financial-research-agent.eval.yaml and .agentv/targets.yaml: 2 valid, 0 invalid. Stale naming scan found no smoke/dexter-evals/dexter-finance/financial-research-agent-generated strings in the sibling repo content.\\n\\nStatus/blockers: sibling repo has uncommitted README.md only, changing the result sync wording from financial-research-agent-eval-results to financial-research-agent-evals. No validation blockers. 
Awaiting explicit commit instruction.","created_at":"2026-06-04T09:22:31Z"},{"id":51,"issue_id":"av-fo9","author":"entity","text":"Completed financial-research-agent sibling repo migration and final docs commit.\\n\\nCommit: abf4384ae26bc1189f9ae9b2c4b0f71612be5c6e (docs: align financial result repo name)\\nPush target: https://github.com/christso/financial-research-agent.git main (origin/main), push succeeded c649fd8..abf4384.\\n\\nFinal verification evidence from /home/entity/projects/EntityProcess/financial-research-agent: full eval YAML at evals/financial-research-agent.eval.yaml has 50 tests and source_row through 50; generator reproduced the committed eval byte-for-byte from the pinned Dexter CSV; default setup passed with DEXTER_REPO_PATH and FINANCIAL_DATASETS_API_KEY unset using dummy non-secret grader/model env; AgentV validation passed for evals/financial-research-agent.eval.yaml and .agentv/targets.yaml with 2 valid, 0 invalid; stale naming scan found no smoke/dexter-evals/dexter-finance/financial-research-agent-generated strings.\\n\\nScope note: only /home/entity/projects/EntityProcess/financial-research-agent was committed/pushed, plus this Beads update from /home/entity/projects/EntityProcess/agentv. Did not touch unrelated AgentV dashboard-run-management changes.","created_at":"2026-06-04T10:28:24Z"},{"id":54,"issue_id":"av-fo9","author":"entity","text":"Post-closeout cleanup completed after separate repo push.\\n\\nDurability confirmed: /home/entity/projects/EntityProcess/financial-research-agent is clean at abf4384ae26bc1189f9ae9b2c4b0f71612be5c6e, and origin/main at https://github.com/christso/financial-research-agent.git resolves to the same hash. 
The sibling repo contains the migrated durable content: full 50-test eval YAML, canonical generator, targets, scripts, docs, and result repo wording.\\n\\nRemoved from AgentV integration worktree: deleted the untracked migrated copy directory /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration/financial-research-agent/ because financial-research-agent now lives as its own sibling repository and AgentV should not carry that separate eval repo copy. Also removed temporary verification artifacts I created under /tmp: dexter-pinned-8d9419829f443f84b804d033bb2c3b1fbd788629, financial-research-agent.regenerated.eval.yaml, and financial-research-agent-dry-run.jsonl.\\n\\nLeft untouched: unrelated AgentV worktree changes including .gitignore and SWE/dashboard-run-management state. The existing tracked dexter-evals/** deletion state remains in the AgentV integration worktree as the AgentV-side removal of the old embedded companion content; I did not restore it because that would reintroduce separate eval repo content into AgentV, and I did not commit it because this closeout only requested the separate repo commit/push plus cleanup.\\n\\nAgent Mail/resources: this Codex session did not register an Agent Mail identity and did not create file reservations, so there was nothing to deregister or release. No subagents were spawned. Per user instruction, after this final note I will kill the tmux session agent-agentv-public-demo-financial-research-agent-fo9-main-20260604061758.","created_at":"2026-06-04T10:39:02Z"},{"id":81,"issue_id":"av-fo9","author":"entity","text":"Repo ownership update 2026-06-06: moved financial-research-agent from `christso/financial-research-agent` to public sibling repo `EntityProcess/financial-research-agent`. Local origin updated to `https://github.com/EntityProcess/financial-research-agent.git`; main is `90863fe`.","created_at":"2026-06-06T04:10:33Z"}]}
diff --git a/AGENTS.md b/AGENTS.md
index 3024ad087..01b9ae898 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -389,7 +389,8 @@ Unit tests alone are insufficient for grader changes. After implementing or modi
 ```bash
 # 1. Run the eval, writing results to a sibling *.results.jsonl file
 bun apps/cli/src/cli.ts eval examples/path/to/suite.eval.yaml --target azure \
-  --out examples/path/to/suite.results.jsonl
+  --output examples/path/to/suite.run \
+  --export examples/path/to/suite.results.jsonl
 
 # 2. Assert all expected score ranges pass
 bun scripts/check-grader-scores.ts
@@ -400,7 +401,7 @@ The script auto-discovers `examples/**/*.grader-scores.yaml`, locates the siblin
 **To add score checks for a new eval:**
 1. Create `<eval-stem>.grader-scores.yaml` next to the eval YAML.
 2. Add entries for each `(test_id, grader, range)` you care about — `grader` must match a `scores[].name` value in the JSONL output, and `range.min`/`range.max` default to 0/1 if omitted.
-3. Run the eval with `--out <eval-stem>.results.jsonl`, then run the script.
+3. Run the eval with `--output <eval-stem>.run --export <eval-stem>.results.jsonl`, then run the script.
 
 See `examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml` for a concrete example.
 
diff --git a/README.md b/README.md
index 7a86db06d..bbd05a675 100644
--- a/README.md
+++ b/README.md
@@ -77,9 +77,9 @@ agentv compare .agentv/results/runs/<timestamp>/index.jsonl
 ## Output formats
 
 ```bash
-agentv eval evals/my-eval.yaml                  # JSONL (default)
-agentv eval evals/my-eval.yaml -o report.html   # HTML dashboard
-agentv eval evals/my-eval.yaml -o results.xml   # JUnit XML for CI
+agentv eval evals/my-eval.yaml --output ./run   # writes ./run/index.jsonl
+agentv eval evals/my-eval.yaml --export report.html   # HTML dashboard
+agentv eval evals/my-eval.yaml --export results.xml   # JUnit XML for CI
 ```
 
 ## TypeScript SDK
diff --git a/apps/cli/package.json b/apps/cli/package.json
index 1917cc128..72d20afa8 100644
--- a/apps/cli/package.json
+++ b/apps/cli/package.json
@@ -1,6 +1,6 @@
 {
   "name": "agentv",
-  "version": "4.32.0-next.1",
+  "version": "4.32.0-next.2",
   "description": "CLI entry point for AgentV",
   "type": "module",
   "repository": {
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index fedd959d5..c0e7f045f 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -46,19 +46,19 @@ export const evalRunCommand = command({
     out: option({
       type: optional(string),
       long: 'out',
-      description: '[Deprecated: use --output] Write results to the specified path',
+      description: '[Removed: use --output <dir> and --export <file>] Former flat result path',
     }),
     output: option({
       type: optional(string),
       long: 'output',
       short: 'o',
       description:
-        'Artifact directory for run output (index.jsonl, benchmark.json, per-test grading/timing)',
+        'Run artifact directory (writes index.jsonl, benchmark.json, timing, and per-test artifacts)',
     }),
     outputFormat: option({
       type: optional(string),
       long: 'output-format',
-      description: "[Deprecated] Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)",
+      description: '[Removed: use --export <file>] Run directories always write index.jsonl',
     }),
     experiment: option({
       type: optional(string),
@@ -188,8 +188,7 @@ export const evalRunCommand = command({
     artifacts: option({
       type: optional(string),
       long: 'artifacts',
-      description:
-        '[Deprecated: use --output] Write companion artifacts to the specified directory',
+      description: '[Removed: use --output <dir>] Former companion artifact directory',
     }),
     graderTarget: option({
       type: optional(string),
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index a9c3383b0..3badda9d0 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -84,10 +84,10 @@ interface NormalizedOptions {
   readonly targetsPath?: string;
   readonly filter?: string | readonly string[];
   readonly workers?: number;
-  /** --output <dir>: artifact directory (new canonical meaning) */
+  /** --output <dir>: canonical artifact directory */
   readonly outputDir?: string;
-  /** Legacy --out <path>: deprecated, treated as artifact dir */
-  readonly outPath?: string;
+  /** Removed --out <path>: use --output for run directories and --export for extra files */
+  readonly removedOut?: string;
   /** --export <paths...>: additional output files */
   readonly exportPaths: readonly string[];
   readonly dryRun: boolean;
@@ -115,8 +115,10 @@ interface NormalizedOptions {
   readonly keepWorkspaces: boolean;
   /** Deprecated: benchmark.json is always written to artifact dir */
   readonly benchmarkJson?: string;
-  /** Deprecated: use --output instead */
+  /** Removed --artifacts <dir>: use --output instead */
   readonly artifacts?: string;
+  /** Removed --output-format: the run directory always uses index.jsonl */
+  readonly outputFormat?: string;
   readonly graderTarget?: string;
   readonly model?: string;
   readonly outputMessages: number | 'all';
@@ -227,6 +229,43 @@ function normalizeSourceMetadataByEvalFile(
   return undefined;
 }
 
+const LEGACY_OUTPUT_FILE_EXTENSIONS = new Set([
+  '.jsonl',
+  '.json',
+  '.xml',
+  '.yaml',
+  '.yml',
+  '.html',
+  '.htm',
+]);
+
+function looksLikeLegacyOutputFilePath(value: string): boolean {
+  return LEGACY_OUTPUT_FILE_EXTENSIONS.has(path.extname(value).toLowerCase());
+}
+
+function outputFileMigrationMessage(value: string): string {
+  const ext = path.extname(value).toLowerCase();
+  const exportHint =
+    ext === '.xml'
+      ? `Use --export ${value} for JUnit XML.`
+      : `Use --export ${value} if you still need that extra file.`;
+  return `--output expects a run directory, not a file path: ${value}\n${exportHint} Set --output <dir> for the canonical run artifacts; AgentV always writes <dir>/index.jsonl.`;
+}
+
+function artifactsMigrationMessage(artifactsDir: string, outputDir?: string): string {
+  const lines = [`--artifacts was removed from agentv eval. Use --output ${artifactsDir} instead.`];
+  if (outputDir && looksLikeLegacyOutputFilePath(outputDir)) {
+    const ext = path.extname(outputDir).toLowerCase();
+    lines.push(
+      ext === '.xml'
+        ? `Use --export ${outputDir} for JUnit XML.`
+        : `Use --export ${outputDir} if you still need that extra file.`,
+    );
+    lines.push(`Migration example: --output ${artifactsDir} --export ${outputDir}`);
+  }
+  return lines.join('\n');
+}
+
 /**
  * Check whether an eval file's tags satisfy --tag / --exclude-tag filters.
  *
@@ -316,7 +355,6 @@ function normalizeOptions(
   const configWorkers = config?.execution?.workers;
   const workers = cliWorkers ?? configWorkers ?? 0;
 
-  // --output is now a single optional string (artifact directory)
   const cliOutputDir = normalizeString(rawOptions.output);
 
   // --export is the new repeatable flag for additional output files
@@ -354,9 +392,9 @@ function normalizeOptions(
   const configCacheEnabled = config?.cache?.enabled;
   const configCachePath = normalizeString(config?.cache?.path);
 
-  // Output dir: CLI --out > config output.dir > auto-generated
+  // Output dir: --output > config output.dir > auto-generated; --out is read only to reject it
   const cliOut = normalizeString(rawOptions.out);
-  const configOut = config?.output?.dir;
+  const configOutputDir = normalizeString(config?.output?.dir);
   const cliWorkspacePath = normalizeString(rawOptions.workspacePath);
   const cliWorkspaceModeRaw = normalizeString(rawOptions.workspaceMode);
   const cliWorkspaceMode = normalizeWorkspaceMode(rawOptions.workspaceMode);
@@ -376,8 +414,8 @@ function normalizeOptions(
     targetsPath: normalizeString(rawOptions.targets),
     filter: normalizeFilter(rawOptions.filter),
     workers: workers > 0 ? workers : undefined,
-    outputDir: cliOutputDir,
-    outPath: cliOut ?? configOut,
+    outputDir: cliOutputDir ?? configOutputDir,
+    removedOut: cliOut,
     exportPaths,
     dryRun: normalizeBoolean(rawOptions.dryRun),
     dryRunDelay: normalizeNumber(rawOptions.dryRunDelay, 0),
@@ -425,6 +463,7 @@ function normalizeOptions(
       config?.execution?.keepWorkspaces === true,
     benchmarkJson: normalizeString(rawOptions.benchmarkJson),
     artifacts: normalizeString(rawOptions.artifacts),
+    outputFormat: normalizeString(rawOptions.outputFormat),
     graderTarget: normalizeString(rawOptions.graderTarget),
     model: normalizeString(rawOptions.model),
     outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
@@ -1096,6 +1135,27 @@ export async function runEvalCommand(
     throw new Error('--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)');
   }
 
+  if (options.removedOut) {
+    throw new Error(
+      [
+        '--out was removed from agentv eval. Use --output <dir> for the canonical run directory.',
+        'If you need an additional flat file, add --export <file>.',
+        `Migration example: --out ${options.removedOut} -> --output <dir> --export ${options.removedOut}`,
+      ].join('\n'),
+    );
+  }
+  if (options.outputFormat) {
+    throw new Error(
+      '--output-format was removed from agentv eval. The run directory always writes index.jsonl; use --export <file> for JSON, XML/JUnit, YAML, or HTML copies.',
+    );
+  }
+  if (options.artifacts) {
+    throw new Error(artifactsMigrationMessage(options.artifacts, options.outputDir));
+  }
+  if (options.outputDir && looksLikeLegacyOutputFilePath(options.outputDir)) {
+    throw new Error(outputFileMigrationMessage(options.outputDir));
+  }
+
   // --retry-errors: resume from a previous run by re-running execution_error and missing test cases.
   // Uses an exclusion filter to skip already-completed (non-error) cases, which naturally includes
   // both error cases and cases that never ran (e.g., due to a crash or interrupt).
@@ -1125,7 +1185,7 @@ export async function runEvalCommand(
   // last-known run dir for this cwd from .agentv/cache.json. Matches promptfoo's
   // `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default"
   // convention. The cache pointer is written by saveRunCache after every eval.
-  if (options.resume && !options.retryErrors && !options.outputDir && !options.artifacts) {
+  if (options.resume && !options.retryErrors && !options.outputDir) {
     const cachedDir = await resolveCachedRunDir(cwd);
     if (cachedDir) {
       options = { ...options, outputDir: cachedDir };
@@ -1140,7 +1200,7 @@ export async function runEvalCommand(
   let resumeSkipKeys: Set<string> | undefined;
   let isResumeAppend = false;
   if (options.resume && !options.retryErrors) {
-    const explicitResumeDir = options.outputDir ?? options.artifacts;
+    const explicitResumeDir = options.outputDir;
     if (explicitResumeDir) {
       const resumeIndexPath = path.join(path.resolve(explicitResumeDir), 'index.jsonl');
       if (existsSync(resumeIndexPath)) {
@@ -1190,50 +1250,27 @@ export async function runEvalCommand(
     console.log(`Repository root: ${repoRoot}`);
   }
 
-  // Emit deprecation warnings for legacy flags
-  if (options.outPath) {
-    console.warn('Warning: --out is deprecated. Use --output <dir> to set the artifact directory.');
-  }
-  if (options.artifacts) {
-    console.warn(
-      'Warning: --artifacts is deprecated. Use --output <dir> to set the artifact directory.',
-    );
-  }
+  // Emit deprecation warnings for remaining legacy flags.
   if (options.benchmarkJson) {
     console.warn(
       'Warning: --benchmark-json is deprecated. benchmark.json is always written to the artifact directory.',
     );
   }
-  if (normalizeString(input.rawOptions.outputFormat)) {
-    console.warn(
-      'Warning: --output-format is deprecated. The artifact directory always uses JSONL.',
-    );
-  }
 
   // Resolve artifact directory (runDir) and primary output path.
-  // Precedence: --output > --artifacts (deprecated) > --out (deprecated) > default
-  const explicitDir = options.outputDir ?? options.artifacts;
+  // Precedence: --output > config output.dir > default
+  const explicitDir = options.outputDir;
   let runDir: string;
   let outputPath: string;
-  let usesDefaultArtifactWorkspace: boolean;
 
   if (explicitDir) {
-    // --output <dir> or --artifacts <dir>: use as artifact directory
     runDir = path.resolve(explicitDir);
     mkdirSync(runDir, { recursive: true });
     outputPath = path.join(runDir, 'index.jsonl');
-    usesDefaultArtifactWorkspace = true;
-  } else if (options.outPath) {
-    // --out <path> (deprecated): use dirname as artifact dir
-    outputPath = path.resolve(options.outPath);
-    runDir = path.dirname(outputPath);
-    mkdirSync(runDir, { recursive: true });
-    usesDefaultArtifactWorkspace = false;
   } else {
     // Default: .agentv/results/runs/<experiment>/<timestamp>/
     outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment);
     runDir = path.dirname(outputPath);
-    usesDefaultArtifactWorkspace = true;
   }
 
   // Initialize OTel exporter if --export-otel flag is set or file export flags are used
@@ -1545,7 +1582,7 @@ export async function runEvalCommand(
   // has execution_status: ok. The end-of-run write preserves this value via
   // readPlannedTestCount inside aggregateRunDir / writeArtifactsFromResults.
   // Skip on resume — we want to preserve the *original* planned count.
-  if (!isResumeAppend && usesDefaultArtifactWorkspace && totalEvalCount > 0) {
+  if (!isResumeAppend && totalEvalCount > 0) {
     const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
     await writeInitialBenchmarkArtifact(runDir, {
       evalFile,
@@ -1719,7 +1756,7 @@ export async function runEvalCommand(
 
     // When resuming, compute summary from ALL results (old + new, deduplicated)
     let summaryResults = allResults;
-    if (isResumeAppend && usesDefaultArtifactWorkspace) {
+    if (isResumeAppend) {
       const content = await readFile(outputPath, 'utf8');
       summaryResults = deduplicateByTestIdTarget(parseJsonlResults(content));
     }
@@ -1747,7 +1784,7 @@ export async function runEvalCommand(
     }
 
     // Write artifacts to the run directory (always, not conditional on flags)
-    if (usesDefaultArtifactWorkspace && allResults.length > 0) {
+    if (allResults.length > 0) {
       const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
       const sourceTests = activeTestFiles.flatMap(
         (activeTestFile) => fileMetadata.get(activeTestFile)?.testCases ?? [],
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index ab6382935..e1117b17e 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -1,5 +1,5 @@
 import { describe, expect, it } from 'bun:test';
-import { mkdir, mkdtemp, readFile, readdir, rm, writeFile } from 'node:fs/promises';
+import { access, mkdir, mkdtemp, readFile, readdir, rm, writeFile } from 'node:fs/promises';
 import { tmpdir } from 'node:os';
 import path from 'node:path';
 import { fileURLToPath } from 'node:url';
@@ -136,7 +136,7 @@ async function runCli(
   fixture: EvalFixture,
   args: readonly string[],
   extraEnv: Record<string, string | undefined> = {},
-): Promise<{ stdout: string; stderr: string }> {
+): Promise<{ stdout: string; stderr: string; exitCode: number | undefined }> {
   const baseEnv: Record<string, string | undefined> = { ...process.env };
   baseEnv.CLI_ENV_SAMPLE = undefined;
   baseEnv.CLI_ENV_ROOT_ONLY = undefined;
@@ -155,7 +155,7 @@ async function runCli(
       reject: false,
     });
 
-    return { stdout: result.stdout, stderr: result.stderr };
+    return { stdout: result.stdout, stderr: result.stderr, exitCode: result.exitCode };
   } catch (error) {
     console.error('CLI execution failed:', error);
     throw error;
@@ -207,6 +207,18 @@ async function writeTsCacheConfig(fixture: EvalFixture, cachePath: string): Prom
   );
 }
 
+async function writeTsOutputConfig(fixture: EvalFixture, outputDir: string): Promise<void> {
+  await writeFile(
+    path.join(fixture.suiteDir, 'agentv.config.ts'), // config file is written into the suite directory
+    `export default { output: { dir: ${JSON.stringify(outputDir)} } };\n`, // JSON.stringify quotes/escapes the path into a valid string literal
+    'utf8',
+  );
+}
+
+async function expectFileExists(filePath: string): Promise<void> {
+  await access(filePath); // fs.promises.access rejects when the path is not accessible, which fails the calling test
+}
+
 async function prependYamlCacheConfig(fixture: EvalFixture, cachePath: string): Promise<void> {
   const original = await readFile(fixture.testFilePath, 'utf8');
   await writeFile(
@@ -250,6 +262,136 @@ describe('agentv eval CLI', () => {
     }
   }, 30_000);
 
+  it('writes canonical artifacts under an explicit --output directory', async () => {
+    const fixture = await createFixture();
+    try {
+      const outputDir = path.join(fixture.baseDir, 'explicit-run');
+      const { stdout, exitCode } = await runCli(fixture, [
+        'eval',
+        fixture.testFilePath,
+        '--output',
+        outputDir,
+      ]);
+
+      expect(exitCode).toBe(0);
+      expect(extractOutputPath(stdout)).toBe(path.join(outputDir, 'index.jsonl')); // manifest sits directly inside the run directory
+      expect(stdout).toContain(`Artifact directory: ${outputDir}`);
+
+      const results = await readJsonLines(path.join(outputDir, 'index.jsonl'));
+      expect(results).toHaveLength(2); // presumably one record per fixture case (case-alpha, case-beta checked below)
+      await expectFileExists(path.join(outputDir, 'benchmark.json'));
+      await expectFileExists(path.join(outputDir, 'timing.json'));
+      await expectFileExists(path.join(outputDir, 'case-alpha', 'grading.json')); // per-test artifacts live in a subdirectory per case
+      await expectFileExists(path.join(outputDir, 'case-beta', 'grading.json'));
+    } finally {
+      await rm(fixture.baseDir, { recursive: true, force: true }); // always remove the temp fixture, even when an assertion fails
+    }
+  }, 30_000);
+
+  it('uses agentv.config.ts output.dir as the canonical artifact directory fallback', async () => {
+    const fixture = await createFixture();
+    try {
+      await writeTsOutputConfig(fixture, './configured-results');
+
+      const { stdout, exitCode } = await runCli(fixture, ['eval', fixture.testFilePath]); // no --output flag: the config value must be used
+
+      const outputDir = path.join(fixture.suiteDir, 'configured-results'); // relative output.dir is expected to resolve against the config file's directory
+      expect(exitCode).toBe(0);
+      expect(extractOutputPath(stdout)).toBe(path.join(outputDir, 'index.jsonl'));
+      await expectFileExists(path.join(outputDir, 'index.jsonl'));
+      await expectFileExists(path.join(outputDir, 'benchmark.json'));
+      await expectFileExists(path.join(outputDir, 'case-alpha', 'grading.json'));
+    } finally {
+      await rm(fixture.baseDir, { recursive: true, force: true }); // always remove the temp fixture, even when an assertion fails
+    }
+  }, 30_000);
+
+  it('writes additional --export files without changing the canonical index location', async () => {
+    const fixture = await createFixture();
+    try {
+      const outputDir = path.join(fixture.baseDir, 'run');
+      const junitPath = path.join(fixture.baseDir, 'junit.xml'); // export targets sit outside the run directory
+      const flatJsonlPath = path.join(fixture.baseDir, 'flat.jsonl');
+
+      const { stdout, exitCode } = await runCli(fixture, [
+        'eval',
+        fixture.testFilePath,
+        '--output',
+        outputDir,
+        '--threshold',
+        '0.8',
+        '--export', // flag is repeated: one occurrence per export file
+        junitPath,
+        '--export',
+        flatJsonlPath,
+      ]);
+
+      expect(exitCode).toBe(1); // presumably non-zero because one case (score=0.600 in the JUnit check below) is under --threshold 0.8
+      expect(extractOutputPath(stdout)).toBe(path.join(outputDir, 'index.jsonl')); // exports must not move the canonical manifest
+      expect(stdout).toContain('Export files:');
+      expect(stdout).toContain(junitPath);
+      expect(stdout).toContain(flatJsonlPath);
+
+      const canonicalResults = await readJsonLines(path.join(outputDir, 'index.jsonl'));
+      const flatResults = await readJsonLines(flatJsonlPath);
+      expect(canonicalResults).toHaveLength(2);
+      expect(flatResults).toHaveLength(2); // the flat export carries the same number of records as the manifest
+
+      const junit = await readFile(junitPath, 'utf8');
+      expect(junit).toContain('<testsuites tests="2" failures="1" errors="0"');
+      expect(junit).toContain('<failure message="score=0.600"');
+    } finally {
+      await rm(fixture.baseDir, { recursive: true, force: true }); // always remove the temp fixture, even when an assertion fails
+    }
+  }, 30_000);
+
+  it('fails with migration guidance for removed eval output flags', async () => {
+    const cases = [
+      {
+        args: ['--out', 'legacy.jsonl'],
+        expected: ['--out was removed', '--output <dir>', '--export legacy.jsonl'],
+      },
+      {
+        args: ['--artifacts', 'legacy-artifacts'],
+        expected: ['--artifacts was removed', '--output legacy-artifacts'],
+      },
+      {
+        args: ['-o', 'junit.xml', '--artifacts', 'legacy-artifacts'], // legacy CI shape: -o as a JUnit file plus --artifacts as the directory
+        expected: [
+          '--artifacts was removed',
+          '--output legacy-artifacts',
+          '--export junit.xml for JUnit XML',
+        ],
+      },
+      {
+        args: ['--output-format', 'html'],
+        expected: ['--output-format was removed', 'index.jsonl', '--export <file>'],
+      },
+      {
+        args: ['--output', 'results.xml'], // file-looking --output value must be rejected rather than used as a directory
+        expected: [
+          '--output expects a run directory',
+          'Use --export results.xml for JUnit XML',
+          '<dir>/index.jsonl',
+        ],
+      },
+    ] as const;
+
+    for (const testCase of cases) {
+      const fixture = await createFixture(); // fresh fixture per case so runs cannot affect each other
+      try {
+        const result = await runCli(fixture, ['eval', fixture.testFilePath, ...testCase.args]);
+        expect(result.exitCode).toBe(1);
+        const output = `${result.stdout}\n${result.stderr}`; // guidance may be printed on either stream, so search both
+        for (const expected of testCase.expected) {
+          expect(output).toContain(expected);
+        }
+      } finally {
+        await rm(fixture.baseDir, { recursive: true, force: true });
+      }
+    }
+  }, 30_000);
+
   it('loads the nearest .env first and uses parent .env only for missing keys', async () => {
     const fixture = await createNestedEnvFixture();
     try {
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index d041bd992..2bf3e5615 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -86,6 +86,9 @@ Write all artifacts (index.jsonl, benchmark.json, per-test grading/timing) to a
 agentv eval evals/my-eval.yaml --output ./my-results
 ```
 
+`--output` is a run directory, not a file path. The canonical manifest is always
+`<output>/index.jsonl`.
+
 ### Export Additional Formats
 
 Write additional output files alongside the artifact directory. Format is inferred from the file extension (`.jsonl`, `.json`, `.xml`, `.yaml`, `.html`):
@@ -525,4 +528,4 @@ docker run --rm \
 
 ## All Options
 
-Run `agentv eval --help` for the full list of options including workers, timeouts, output formats, and trace dumping.
+Run `agentv eval --help` for the full list of options including workers, timeouts, output directories, exports, and trace dumping.
diff --git a/apps/web/src/content/docs/docs/evaluation/sdk.mdx b/apps/web/src/content/docs/docs/evaluation/sdk.mdx
index 6f92029ae..66d14d10f 100644
--- a/apps/web/src/content/docs/docs/evaluation/sdk.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/sdk.mdx
@@ -142,7 +142,7 @@ export default defineConfig({
     verbose: true,
     otelFile: '.agentv/results/otel-{timestamp}.json',
   },
-  output: { format: 'jsonl', dir: './results' },
+  output: { dir: './results' },
   limits: { maxCostUsd: 10.0 },
 });
 ```
diff --git a/apps/web/src/content/docs/docs/tools/compare.mdx b/apps/web/src/content/docs/docs/tools/compare.mdx
index 85109e531..4923846f0 100644
--- a/apps/web/src/content/docs/docs/tools/compare.mdx
+++ b/apps/web/src/content/docs/docs/tools/compare.mdx
@@ -12,10 +12,10 @@ The `compare` command computes deltas between two evaluation runs for A/B testin
 Run two evaluations and compare them:
 
 ```bash
-agentv eval evals/my-eval.yaml --out before.jsonl
+agentv eval evals/my-eval.yaml --output runs/before
 # ... make changes to your agent ...
-agentv eval evals/my-eval.yaml --out after.jsonl
-agentv compare before.jsonl after.jsonl
+agentv eval evals/my-eval.yaml --output runs/after
+agentv compare runs/before/index.jsonl runs/after/index.jsonl
 ```
 
 ## Options
@@ -28,7 +28,7 @@ agentv compare before.jsonl after.jsonl
 
 ## How It Works
 
-1. **Load Results** -- reads both JSONL files containing evaluation results
+1. **Load Results** -- reads both `index.jsonl` manifests containing evaluation results
 2. **Match by test_id** -- pairs results with matching `test_id` fields
 3. **Compute Deltas** -- calculates `delta = score2 - score1` for each pair
 4. **Compute Normalized Gain** -- calculates `g = delta / (1 - score1)` for each pair (see below)
@@ -123,13 +123,13 @@ Compare different model versions:
 
 ```bash
 # Run baseline evaluation
-agentv eval evals/*.yaml --target gpt-4 --out baseline.jsonl
+agentv eval evals/*.yaml --target gpt-4 --output runs/baseline
 
 # Run candidate evaluation
-agentv eval evals/*.yaml --target gpt-4o --out candidate.jsonl
+agentv eval evals/*.yaml --target gpt-4o --output runs/candidate
 
 # Compare results
-agentv compare baseline.jsonl candidate.jsonl
+agentv compare runs/baseline/index.jsonl runs/candidate/index.jsonl
 ```
 
 ### Prompt Optimization
@@ -138,13 +138,13 @@ Compare before/after prompt changes:
 
 ```bash
 # Run with original prompt
-agentv eval evals/*.yaml --out before.jsonl
+agentv eval evals/*.yaml --output runs/before
 
 # Modify prompt, then run again
-agentv eval evals/*.yaml --out after.jsonl
+agentv eval evals/*.yaml --output runs/after
 
 # Compare with strict threshold
-agentv compare before.jsonl after.jsonl --threshold 0.05
+agentv compare runs/before/index.jsonl runs/after/index.jsonl --threshold 0.05
 ```
 
 ### CI Quality Gate
diff --git a/bun.lock b/bun.lock
index 5e55e70cb..b29b30820 100644
--- a/bun.lock
+++ b/bun.lock
@@ -20,7 +20,7 @@
     },
     "apps/cli": {
       "name": "agentv",
-      "version": "4.32.0-next.1",
+      "version": "4.32.0-next.2",
       "bin": {
         "agentv": "./dist/cli.js",
       },
@@ -84,7 +84,7 @@
     },
     "packages/core": {
       "name": "@agentv/core",
-      "version": "4.32.0-next.1",
+      "version": "4.32.0-next.2",
       "dependencies": {
         "@agentclientprotocol/sdk": "^0.14.1",
         "@agentv/eval": "workspace:*",
@@ -120,7 +120,7 @@
     },
     "packages/eval": {
       "name": "@agentv/eval",
-      "version": "4.32.0-next.1",
+      "version": "4.32.0-next.2",
       "dependencies": {
         "zod": "^3.23.8",
       },
diff --git a/docs/plans/2026-06-09-eval-output-surface.md b/docs/plans/2026-06-09-eval-output-surface.md
new file mode 100644
index 000000000..849cddbd1
--- /dev/null
+++ b/docs/plans/2026-06-09-eval-output-surface.md
@@ -0,0 +1,89 @@
+# Eval Output Surface Decision
+
+Date: 2026-06-09
+Bead: `av-eval-output-config-surface-4e2`
+
+## Audit
+
+Before this cleanup, the eval run command exposed several overlapping ways to choose where results go ("currently" in the bullets below refers to that pre-cleanup state):
+
+- `--output <dir>` / `-o <dir>` is the canonical run artifact directory. It writes `index.jsonl`, `benchmark.json`, `timing.json`, run source metadata, and per-test artifacts under that directory.
+- `--export <file>` is repeatable and writes additional output files after the run. The file extension selects JSONL, JSON, XML/JUnit, YAML, or HTML.
+- `agentv.config.ts` `output.dir` exists, but current CLI normalization routes it through the legacy `outPath` branch, so it behaves like a file path rather than the documented output directory.
+- `agentv.config.ts` `output.format` is accepted by `defineConfig()` but eval runs ignore it.
+- `--out <path>` is deprecated and currently treated as a file path whose dirname becomes the artifact directory.
+- `--artifacts <dir>` is deprecated and currently aliases the artifact directory.
+- `--output-format` is deprecated and ignored because run directories always use `index.jsonl`.
+- `--benchmark-json` is deprecated, still writes an extra Agent Skills compatibility file, and is outside this cleanup's requested removal set.
+- Dashboard launch paths already pass `--output <dir>` and expect `<dir>/index.jsonl`.
+- Repository docs/examples still contain old `agentv eval --out <file>` guidance in compare workflows, grader-score helper comments, and local scripts.
+
+Known external consumers:
+
+- `WiseTechGlobal/sdd` `.github/workflows/sdd-eval.yml` uses `-o .agentv/ci-results/junit.xml` plus `--artifacts .agentv/ci-results/artifacts`.
+- `WiseTechGlobal/WTG.AI.Prompts` `.github/workflows/evals.yml` and `.copilot/workflows/evals.yml` already use `--output .agentv/results/artifacts`.
+- A broader WiseTechGlobal scan found docs using `agentv eval -o <file>.jsonl`; no `agentv eval --output-format` consumers were found.
+
+Pipeline subcommands such as `agentv pipeline input --out <dir>` are distinct and remain out of scope.
+
+## Contract
+
+The eval run output contract is:
+
+- `--output <dir>` sets the run artifact directory.
+- `agentv.config.ts` `output.dir` is the same directory fallback when `--output` is omitted.
+- If neither is provided, AgentV writes `.agentv/results/runs/<experiment>/<timestamp>/`.
+- The canonical result manifest is always `<run-dir>/index.jsonl`.
+- `--export <file>` writes additional files. Use `--export results.xml` for JUnit XML.
+- `--output` is not a file-output flag. File-looking values such as `results.jsonl`, `report.html`, and `junit.xml` should fail with a migration error instead of creating confusing directories.
+- `-o` remains a compatibility short alias for `--output <dir>`, not a JUnit flag. JUnit output is explicit through `--export <file>.xml`.
+
+## Breaking Cleanup
+
+This change is a breaking prerelease cleanup and bumps published AgentV packages from `4.32.0-next.1` to `4.32.0-next.2`.
+
+Removed now:
+
+- `agentv eval --out <path>`
+- `agentv eval --artifacts <dir>`
+- `agentv eval --output-format <format>`
+- `agentv.config.ts` `output.format`
+
+Warned/scheduled:
+
+- `--benchmark-json` remains deprecated for now because the Bead did not list it as a known surface and it writes a specialized compatibility artifact. Follow-up cleanup should remove it after a separate audit.
+
+## Migration
+
+For old flat JSONL output:
+
+```bash
+# Before
+agentv eval evals/my-eval.yaml --out results.jsonl
+
+# After: canonical run directory only
+agentv eval evals/my-eval.yaml --output results
+
+# After: keep an additional flat JSONL file for compare scripts
+agentv eval evals/my-eval.yaml --output results --export results.jsonl
+```
+
+For JUnit XML:
+
+```bash
+# Before
+agentv eval evals/my-eval.yaml -o results.xml --artifacts .agentv/results/artifacts
+
+# After
+agentv eval evals/my-eval.yaml --output .agentv/results/artifacts --export results.xml
+```
+
+For config files:
+
+```typescript
+export default defineConfig({
+  output: { dir: './results' },
+});
+```
+
+`output.format` has no replacement. The run directory always uses `index.jsonl`; additional formats belong on `--export`.
diff --git a/examples/features/rubric/evals/dataset.grader-scores.yaml b/examples/features/rubric/evals/dataset.grader-scores.yaml
index 14a7f45c0..41d7f1466 100644
--- a/examples/features/rubric/evals/dataset.grader-scores.yaml
+++ b/examples/features/rubric/evals/dataset.grader-scores.yaml
@@ -1,7 +1,7 @@
 # Expected grader score ranges for dataset.eval.yaml.
 #
 # Asserts the rubric grader continues to score known-quality outputs in the
-# expected range. Run after `agentv eval ... --out dataset.results.jsonl`.
+# expected range. Run after `agentv eval ... --output dataset.run --export dataset.results.jsonl`.
 # Check with: bun scripts/check-grader-scores.ts
 
 # Partial implementation — missing type hints and edge case handling → middling rubric score.
diff --git a/examples/features/sdk-config-file/README.md b/examples/features/sdk-config-file/README.md
index 89b2bbae8..f6ca27164 100644
--- a/examples/features/sdk-config-file/README.md
+++ b/examples/features/sdk-config-file/README.md
@@ -6,7 +6,7 @@ Demonstrates using `defineConfig()` from `@agentv/core` for typed project-level
 
 1. Creates an `agentv.config.ts` with `defineConfig()`
 2. Configures execution defaults (workers, retries)
-3. Sets output format and cost limits
+3. Sets the default output directory and cost limits
 
 ## How to Run
 
diff --git a/examples/features/sdk-config-file/agentv.config.ts b/examples/features/sdk-config-file/agentv.config.ts
index b63b45fd3..d94cff828 100644
--- a/examples/features/sdk-config-file/agentv.config.ts
+++ b/examples/features/sdk-config-file/agentv.config.ts
@@ -7,7 +7,6 @@ export default defineConfig({
     agentTimeoutMs: 60_000,
   },
   output: {
-    format: 'jsonl',
     dir: './results',
   },
   limits: {
diff --git a/examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml b/examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml
index 69340e07d..84bd10e98 100644
--- a/examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml
+++ b/examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml
@@ -1,7 +1,7 @@
 # Expected grader score ranges for screenshot-pii-upload.eval.yaml.
 #
 # These entries assert that graders continue to score known-bad outputs low
-# and known-good outputs high. Run after `agentv eval ... --out screenshot-pii-upload.results.jsonl`.
+# and known-good outputs high. Run after `agentv eval ... --output screenshot-pii-upload.run --export screenshot-pii-upload.results.jsonl`.
 # Check with: bun scripts/check-grader-scores.ts
 
 # The coding agent leaks financial figures from the screenshot — rubric should score it low.
diff --git a/examples/showcase/export-screening/evals/ci_check.ts b/examples/showcase/export-screening/evals/ci_check.ts
index db1e55c84..aa5062ac7 100644
--- a/examples/showcase/export-screening/evals/ci_check.ts
+++ b/examples/showcase/export-screening/evals/ci_check.ts
@@ -122,11 +122,12 @@ function findRepoRoot(startPath: string): string {
 async function runEval(evalFile: string): Promise<string> {
   const tempDir = mkdtempSync(join(tmpdir(), 'agentv-'));
   const resultsFile = join(tempDir, 'results.jsonl');
+  const runDir = join(tempDir, 'run');
 
   const repoRoot = findRepoRoot(dirname(evalFile));
   const evalPath = resolve(evalFile);
 
-  const cmd = ['bun', 'agentv', 'eval', evalPath, '--out', resultsFile];
+  const cmd = ['bun', 'agentv', 'eval', evalPath, '--output', runDir, '--export', resultsFile];
 
   logInfo(`Running: ${cmd.join(' ')}`);
   logInfo(`Working directory: ${repoRoot}`);
diff --git a/packages/core/package.json b/packages/core/package.json
index 4357cce50..71d54a619 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@agentv/core",
-  "version": "4.32.0-next.1",
+  "version": "4.32.0-next.2",
   "description": "Primitive runtime components for AgentV",
   "type": "module",
   "repository": {
diff --git a/packages/core/src/evaluation/config.ts b/packages/core/src/evaluation/config.ts
index 537fe3507..a82708ed0 100644
--- a/packages/core/src/evaluation/config.ts
+++ b/packages/core/src/evaluation/config.ts
@@ -16,7 +16,6 @@
  *     agentTimeoutMs: 120_000,
  *   },
  *   output: {
- *     format: 'jsonl',
  *     dir: './results',
  *   },
  * });
@@ -52,11 +51,10 @@ const AgentVConfigSchema = z.object({
   /** Output settings */
   output: z
     .object({
-      /** Output format */
-      format: z.enum(['jsonl', 'yaml', 'json', 'xml']).optional(),
-      /** Output directory */
+      /** Default eval run artifact directory */
       dir: z.string().optional(),
     })
+    .strict()
     .optional(),
 
   /** Response caching */
@@ -115,7 +113,7 @@ export type AgentVConfig = z.infer<typeof AgentVConfigSchema>;
  *
  * export default defineConfig({
  *   execution: { workers: 5 },
- *   output: { format: 'jsonl', dir: './results' },
+ *   output: { dir: './results' },
  *   limits: { maxCostUsd: 10.0 },
  * });
  * ```
diff --git a/packages/core/test/evaluation/config.test.ts b/packages/core/test/evaluation/config.test.ts
index 7418acdfa..818debae1 100644
--- a/packages/core/test/evaluation/config.test.ts
+++ b/packages/core/test/evaluation/config.test.ts
@@ -49,4 +49,8 @@ describe('defineConfig execution defaults', () => {
     const config = defineConfig({ execution: { traceFile: 'trace.jsonl' } } as never);
     expect(config.execution).toEqual({});
   });
+
+  it('rejects removed output.format', () => {
+    expect(() => defineConfig({ output: { format: 'jsonl' } } as never)).toThrow(/format/); // `as never` bypasses the type check so the runtime validation is what gets exercised
+  });
 });
diff --git a/packages/eval/package.json b/packages/eval/package.json
index e11a79ffb..b118f232e 100644
--- a/packages/eval/package.json
+++ b/packages/eval/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@agentv/eval",
-  "version": "4.32.0-next.1",
+  "version": "4.32.0-next.2",
   "description": "Evaluation SDK for AgentV - build custom code judges",
   "type": "module",
   "repository": {
diff --git a/scripts/check-eval-baselines.ts b/scripts/check-eval-baselines.ts
index 0fc1c2ed7..348d37490 100644
--- a/scripts/check-eval-baselines.ts
+++ b/scripts/check-eval-baselines.ts
@@ -1,6 +1,7 @@
 #!/usr/bin/env bun
-import { existsSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs';
+import { existsSync, mkdtempSync, readFileSync, rmSync, unlinkSync, writeFileSync } from 'node:fs';
 import { readdir, rename } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
 import path from 'node:path';
 import { toCamelCaseDeep, toSnakeCaseDeep, trimBaselineResult } from '@agentv/core';
 import type { EvaluationResult } from '@agentv/core';
@@ -106,14 +107,19 @@ async function runAgentVEval(evalFile: string, candidatePath: string): Promise<n
     );
   }
 
-  const args = ['bun', 'agentv', 'eval', evalFile, '--out', candidatePath];
-  const proc = Bun.spawn(args, {
-    cwd: repoRoot,
-    stdout: 'inherit',
-    stderr: 'inherit',
-    env,
-  });
-  return await proc.exited;
+  const runDir = mkdtempSync(path.join(tmpdir(), 'agentv-baseline-check-'));
+  const args = ['bun', 'agentv', 'eval', evalFile, '--output', runDir, '--export', candidatePath];
+  try {
+    const proc = Bun.spawn(args, {
+      cwd: repoRoot,
+      stdout: 'inherit',
+      stderr: 'inherit',
+      env,
+    });
+    return await proc.exited;
+  } finally {
+    rmSync(runDir, { recursive: true, force: true });
+  }
 }
 
 /** Read a JSONL file, trim each record for baseline storage, and write back. */
diff --git a/scripts/check-grader-scores.ts b/scripts/check-grader-scores.ts
index e10a5fc74..24b2350c7 100644
--- a/scripts/check-grader-scores.ts
+++ b/scripts/check-grader-scores.ts
@@ -2,7 +2,7 @@
  * check-grader-scores.ts
  *
  * Post-processor that walks examples/**\/*.grader-scores.yaml, finds the
- * sibling *.results.jsonl produced by a prior `agentv eval --out` run, and
+ * sibling *.results.jsonl produced by a prior `agentv eval --export` run, and
  * asserts each (test_id, grader, range) tuple matches the expected score range.
  *
  * Usage:
@@ -11,9 +11,9 @@
  * To add score checks for a new eval:
  *   1. Create <eval-stem>.grader-scores.yaml next to <eval-stem>.eval.yaml.
  *   2. Populate it with (test_id, grader, range) entries.
- *   3. Run the eval with --out to produce the sibling results file:
+ *   3. Run the eval with --export to produce the sibling results file:
  *        bun apps/cli/src/cli.ts eval <eval-stem>.eval.yaml --target <t> \
- *          --out <eval-stem>.results.jsonl
+ *          --output <eval-stem>.run --export <eval-stem>.results.jsonl
  *   4. Run this script to verify.
  */
 
@@ -103,7 +103,7 @@ function main(): void {
 
     if (!existsSync(resultsPath)) {
       console.error(
-        `\nMissing results file for ${gsFile}:\n  ${resultsPath}\n  Did you run \`agentv eval --out ${resultsPath}\` first?`,
+        `\nMissing results file for ${gsFile}:\n  ${resultsPath}\n  Did you run \`agentv eval --export ${resultsPath}\` first?`,
       );
       // Count each entry as failed so CI catches missing results
       try {
diff --git a/skills-data/agentv-eval-writer/SKILL.md b/skills-data/agentv-eval-writer/SKILL.md
index b2aa9c715..c3ab04584 100644
--- a/skills-data/agentv-eval-writer/SKILL.md
+++ b/skills-data/agentv-eval-writer/SKILL.md
@@ -641,7 +641,7 @@ import { defineConfig } from '@agentv/core';
 
 export default defineConfig({
   execution: { workers: 5, maxRetries: 2 },
-  output: { format: 'jsonl', dir: './results' },
+  output: { dir: './results' },
   limits: { maxCostUsd: 10.0 },
 });
 ```