From c7a7276aaa67a95a44f5247bcfa3737de70b4650 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 9 Jun 2026 08:19:27 +0200 Subject: [PATCH] fix(results): supersede stale remote sync errors --- .beads/issues.jsonl | 5 ++ packages/core/src/evaluation/results-repo.ts | 25 +++++++++ .../core/test/evaluation/results-repo.test.ts | 52 +++++++++++++++++++ 3 files changed, 82 insertions(+) diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index 5aacfa59..655f385c 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -16,6 +16,10 @@ {"id":"av-eval-output-config-surface-4e2","title":"cli/config: simplify eval output surface","description":"Problem:\nAgentV's eval output/config surface is bloated and confusing. The current CLI/config paths include canonical --output , deprecated --out , deprecated --artifacts , deprecated --output-format, config output.dir fallback, --export for extra files, JUnit -o in eval run, and run bundle artifact generation. This makes it hard to explain what writes index.jsonl, what writes JUnit, and what is the canonical run folder.\n\nUser direction:\n- We should remove --out.\n- Simplify the config surface.\n- Any breaking changes must require an explicit version bump and migration notes.\n\nAcceptance:\n- Audit the current eval output/config surface in CLI, docs, examples, Dashboard launch paths, and known GitHub workflow consumers.\n- Propose and implement a simpler target contract centered on canonical --output for artifact directories and --export for additional output files.\n- Remove or schedule removal of deprecated --out with a compatibility/versioning plan; do not silently break users in a patch/minor release.\n- Decide whether deprecated --artifacts and --output-format are removed in the same breaking-change window or only receive stronger warnings.\n- Preserve JUnit -o semantics for eval run if it is intentionally distinct, or rename/document it if it conflicts with --output mental model.\n- If behavior is breaking, include package version bump, changelog/migration note, and docs updates in the same PR.\n- Add/adjust CLI tests covering removed/deprecated flags, canonical index.jsonl placement, explicit output directories, --export behavior, and helpful error messages.\n- Include migration notes for known consumers: WiseTechGlobal/sdd uses --artifacts, WiseTechGlobal/WTG.AI.Prompts uses --output .agentv/results/artifacts.\n\nDependencies / related:\n- Related to av-wy0.1 because canonical run bundle behavior changes how explicit outputs are handled.\n- Related to av-wy0 because the run folder should be the audit boundary.\n\nNon-goal:\n- Do not change run manifest or dashboard file visibility in this task; those remain av-wy0.2/av-wy0.3.","status":"closed","priority":2,"issue_type":"task","assignee":"entity","created_at":"2026-06-08T12:55:32.788635265Z","created_by":"entity","updated_at":"2026-06-09T03:54:01.759369830Z","closed_at":"2026-06-09T01:05:58.685668014Z","close_reason":"Implemented eval output surface simplification with migration notes, version bump, tests, UAT, and review.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["breaking-change","cli","config","run-bundles"],"dependencies":[{"issue_id":"av-eval-output-config-surface-4e2","depends_on_id":"av-wy0","type":"related","created_at":"2026-06-08T12:56:04.175910502Z","created_by":"entity","metadata":"{}","thread_id":""},{"issue_id":"av-eval-output-config-surface-4e2","depends_on_id":"av-wy0.1","type":"related","created_at":"2026-06-08T12:56:04.440026330Z","created_by":"entity","metadata":"{}","thread_id":""}],"comments":[{"id":280,"issue_id":"av-eval-output-config-surface-4e2","author":"FuchsiaStream","text":"Run bundle design linkage (FuchsiaStream, 2026-06-08): av-wy0 should not remove or reinterpret deprecated output flags. V1 bundle mapping is `--output ` as canonical bundle root, deprecated `--artifacts ` as the same root until cleanup, deprecated `--out ` preserving current semantics with bundle files best-effort in `dirname()`, and `--export` as extra files only. Removing `--out` or changing precedence/warnings waits for this bead with an explicit version bump and migration notes.","created_at":"2026-06-08T13:22:57Z"},{"id":283,"issue_id":"av-eval-output-config-surface-4e2","author":"FuchsiaStream","text":"Clarification from run-bundle schema discussion (FuchsiaStream, 2026-06-08): hard deprecation is acceptable for this-week/unreleased artifact names like `run-source.json` -> `run_source.json`. That does not automatically apply to established CLI output flags such as `--out`/`--artifacts`, which still need the audit/version-bump/migration plan in this bead because they have known consumers.","created_at":"2026-06-08T13:42:12Z"},{"id":288,"issue_id":"av-eval-output-config-surface-4e2","author":"GentleCompass","text":"Worktree setup confirmed (Codex 2026-06-09): created /home/entity/projects/EntityProcess/agentv.worktrees/av-output-config on feat/av-eval-output-config-surface from origin/main 595394c3. Proceeding with read-only audit of eval output/config flags and consumers before reserving/editing exact files.","created_at":"2026-06-09T00:16:21Z"},{"id":289,"issue_id":"av-eval-output-config-surface-4e2","author":"GentleCompass","text":"Decision note before code changes (Codex 2026-06-09): canonical eval-run output contract will be `--output ` / config `output.dir` as run artifact directory, always writing `/index.jsonl`; `--export ` remains the only extra-file surface, including JUnit XML via `--export results.xml`. Breaking cleanup in this branch: remove behavior for `--out`, `--artifacts`, `--output-format`, and config `output.format`; keep `--benchmark-json` as deprecated/scheduled because it was discovered outside this Bead's listed known surface. `-o` remains only a short alias for `--output `; file-looking `--output`/`-o` values such as .jsonl/.xml/.html will fail with migration guidance. Package prerelease bump planned: 4.32.0-next.1 -> 4.32.0-next.2. Repo decision/migration note added at docs/plans/2026-06-09-eval-output-surface.md.","created_at":"2026-06-09T00:25:48Z"},{"id":290,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Implementation + verification update (Codex 2026-06-09): simplified eval output to canonical --output / config output.dir writing /index.jsonl, with --export for extra JSON/JUnit/YAML/HTML files. Removed runtime behavior for --out, --artifacts, and --output-format with hard migration errors; config output.format is removed from the schema. Preserved -o only as short alias for --output , and added file-like --output validation so old -o results.xml guidance points to --export results.xml. Known consumer migration covered: WiseTechGlobal/sdd shape '-o junit.xml --artifacts artifacts' now errors with '--output artifacts --export junit.xml'; WiseTechGlobal/WTG.AI.Prompts already uses --output . Version bumped 4.32.0-next.1 -> 4.32.0-next.2 and migration note added in docs/plans/2026-06-09-eval-output-surface.md. Verification: bun run build; bun test apps/cli/test/eval.integration.test.ts; bun run typecheck; bun run lint; bun run validate:examples; bun run test. Manual red/green UAT: origin/main accepted --out legacy.jsonl, exit 0, and wrote the flat file; this branch rejects --out with migration guidance and creates no flat file; replacement --output artifacts --export junit.xml exits 0 and writes artifacts/index.jsonl plus junit.xml. Follow-up bead av-33j created for future --benchmark-json removal.","created_at":"2026-06-09T00:57:44Z"},{"id":291,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Additional verification update (Codex 2026-06-09): added packages/core/test/evaluation/config.test.ts coverage that defineConfig rejects removed output.format; reran bun test packages/core/test/evaluation/config.test.ts and bun run lint successfully after the test addition.","created_at":"2026-06-09T01:01:57Z"},{"id":292,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Final verification update (Codex 2026-06-09): reran bun run test after the config test addition; current tree passes core/eval/phoenix-adapter/cli/dashboard tests. ce-code-review final diff pass found no blocking issues.","created_at":"2026-06-09T01:04:50Z"},{"id":316,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Post-rebase shipping verification (Codex 2026-06-09): Agent Mail recheck for project_key /home/entity/projects/EntityProcess/agentv returned no conflicts for intended paths; reservations held by FrostyCompass through 2026-06-09T04:35:48Z. Branch feat/av-eval-output-config-surface is based on current origin/main d678615b after rebase; public AGENTS.md diff contains only canonical command guidance, no local/private paths. Verification after rebase: bun run build; bun run typecheck; bun run lint; bun run validate:examples; bun test apps/cli/test/eval.integration.test.ts packages/core/test/evaluation/config.test.ts apps/cli/test/commands/eval/artifact-writer.test.ts apps/cli/test/commands/eval/task-bundle.test.ts; bun run test. Manual UAT after rebase: --out exits 1 with migration guidance and creates no flat file; '-o junit.xml --artifacts artifacts' exits 1 with --output/--export migration guidance; replacement '--output --export junit.xml' exits 0 and writes /index.jsonl plus JUnit XML. No blockers remaining for PR.","created_at":"2026-06-09T02:48:15Z"},{"id":317,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Shipping update (Codex 2026-06-09): opened PR #1336 for feat(cli): simplify eval output surface: https://github.com/EntityProcess/agentv/pull/1336. Branch pushed: feat/av-eval-output-config-surface. Verification included post-rebase build/typecheck/lint/validate/examples/focused tests/full test suite and CLI UAT for removed flags plus --output/--export replacement.","created_at":"2026-06-09T02:51:07Z"},{"id":320,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Post-prerequisite-merge rebase verification (Codex 2026-06-09, PROMPT_UID=unblock-output-config-after-merges-20260609): confirmed prerequisite PRs merged to main (#1334 8e6dd1e96f0ac23c5a413768c82403c0535bf905, #1332 083e08c39492b030879a311801d17f6631e909f1, #1331 35263cd707a9a89c95728ae86beb7271b76f2358). Rebasing feat/av-eval-output-config-surface onto origin/main 35263cd707a9a89c95728ae86beb7271b76f2358 completed without conflicts. Post-rebase verification: bun run build; bun run typecheck; bun run lint; bun run validate:examples; bun test apps/cli/test/eval.integration.test.ts packages/core/test/evaluation/config.test.ts apps/cli/test/commands/eval/artifact-writer.test.ts apps/cli/test/commands/eval/task-bundle.test.ts. Manual CLI smoke: --out exits 1 with migration guidance and creates no flat file; '-o junit.xml --artifacts artifacts' exits 1 with --output/--export migration guidance; '--output --export junit.xml' exits 0 and writes /index.jsonl plus JUnit XML. Public diff scan for local/private paths/Agent Mail URLs/scripts was clean; public AGENTS.md remains generic. Ready to update PR #1336 branch/body; no blockers.","created_at":"2026-06-09T03:27:55Z"},{"id":321,"issue_id":"av-eval-output-config-surface-4e2","author":"entity","text":"Post-#1335 rebase/verification update (Codex 2026-06-09, PROMPT_UID=output-config-stale-origin-main-correction-20260609): worktree origin/main was stale at 35263cd707a9a89c95728ae86beb7271b76f2358 before fetch; git fetch origin --prune updated origin/main to f1162312cb7aa645653b51756acfbbed42426929, so the prior apparent clean rebase was against stale origin/main. Rebasing feat/av-eval-output-config-surface onto f1162312cb7aa645653b51756acfbbed42426929 produced one conflict in apps/cli/src/commands/eval/run-eval.ts; resolution preserves #1335 sourceMetadataByEvalFile/rerun-captured-task-bundle behavior and #1336 removed-output-flag migration helpers. New local HEAD before push is 7997dede15320b32e57d2b2a6d3a5c7d1c4a159d. Verification passed: bun run build; bun run typecheck; bun run lint; bun run validate:examples; bun test apps/cli/test/eval.integration.test.ts packages/core/test/evaluation/config.test.ts apps/cli/test/commands/eval/artifact-writer.test.ts apps/cli/test/commands/eval/task-bundle.test.ts apps/cli/test/commands/runs/rerun.test.ts; CLI smoke for removed --out/--artifacts/--output-format/file-like --output errors and --output plus --export flat JSONL success. Public diff scan for local/private paths was clean.","created_at":"2026-06-09T03:54:01Z"}]} {"id":"av-f87","title":"ops(agent-mail): make local server persistent and Tailscale-reachable","description":"Agent Mail repeatedly fails after crashes because the installed launcher expects missing ~/.local/bin/am and the server is started ad hoc on 127.0.0.1. Make startup persistent, make the web UI reachable at http://100.84.193.107:8765/mail, verify health, and document any remaining blocker.","status":"closed","priority":1,"issue_type":"task","assignee":"entity","created_at":"2026-06-06T23:22:44.859402372Z","created_by":"entity","updated_at":"2026-06-06T23:28:10.638440985Z","closed_at":"2026-06-06T23:28:10.638294412Z","close_reason":"Persistent Agent Mail service enabled and verified on localhost and Tailscale; EP orchestrator kill-gate docs committed and pushed.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["agent-mail","ops","tailscale"],"comments":[{"id":184,"issue_id":"av-f87","author":"entity","text":"Started investigation/fix. Hypothesis from operator: tmux crash may be correlated with memory pressure when many sessions finish/push/archive concurrently, not kill itself. Plan: check OOM/journal evidence, make Agent Mail persistent, expose web UI on Tailscale URL, then verify restored sessions and document root cause.","created_at":"2026-06-06T23:23:19Z"},{"id":186,"issue_id":"av-f87","author":"entity","text":"Implemented persistent Agent Mail service: systemd user unit mcp-agent-mail.service now starts /home/entity/.local/share/mcp_agent_mail/scripts/run_server_with_token.sh serve-http --host 0.0.0.0 --port 8765 --path /mcp. Disabled the extra Tailscale proxy because operator confirmed public bind is acceptable behind firewall. Verifying localhost and Tailscale health now.","created_at":"2026-06-06T23:26:33Z"},{"id":187,"issue_id":"av-f87","author":"entity","text":"Verification complete: localhost health returned OK; Tailscale health returned OK at http://100.84.193.107:8765/api/health; Tailscale web UI returned HTML at http://100.84.193.107:8765/mail. systemd user service mcp-agent-mail.service is enabled and active, bound to 0.0.0.0:8765. Journal confirms GET /api/health and /mail from 100.84.193.107. Remaining note: no kernel OOM evidence found in journal for the crash window; many tmux scope exits happened in bursts around 00:50/00:56, so memory/concurrency remains plausible but not proven.","created_at":"2026-06-06T23:27:28Z"}]} {"id":"av-fgt","title":"fix: update public demo setup for projects results config","description":"Discovered during av-3yr public Dashboard UAT on 2026-06-06.\n\nObservable behavior:\n- `agentv-deploy/scripts/setup-local-agentv-dev.sh` writes `$AGENTV_HOME/projects.yaml` and deprecated top-level `results_by_project` in `$AGENTV_HOME/config.yaml`.\n- Current AgentV reads the project registry from `$AGENTV_HOME/config.yaml` under `projects:` and expects per-project `projects[].results`.\n- Running the printed command from setup with `AGENTV_HOME=/tmp/agentv-public-uat-home PORT=3219 bun apps/cli/src/cli.ts serve` registered only the cwd `agentv` project; `financial-research-agent` and `swe-evals` were missing and `/api/projects//remote/sync` returned `Project not found`.\n- Manual rewrite of `/tmp/agentv-public-uat-home/config.yaml` to current `projects[].results` format made Dashboard start with 3 projects and source sync succeed.\n\nAcceptance:\n- Update agentv-deploy setup scripts/docs to write current AgentV home config shape.\n- Remove or clearly migrate stale `projects.yaml` / `results_by_project` guidance.\n- Verify a fresh isolated public demo home starts Dashboard with exactly agentv, financial-research-agent, and swe-evals without manual config edits.\n- Add a static validation check that catches this drift.","status":"closed","priority":1,"issue_type":"bug","assignee":"entity","created_at":"2026-06-06T03:38:39.807577978Z","created_by":"entity","updated_at":"2026-06-06T12:26:58.780938770Z","closed_at":"2026-06-06T12:26:58.780803338Z","close_reason":"Fixed in agentv-deploy feature/av-fgt-public-demo-config commit 70cdef1b51b0779d159d4a6ff6b7fd63cf1cca25; verification passed; no blockers.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["config","dashboard","public-demo","uat"],"comments":[{"id":86,"issue_id":"av-fgt","author":"entity","text":"WTG-specific dogfood evidence from 2026-06-06: active /home/entity/.agentv/projects.yaml contains wtg-ai-prompts and WiseTechAcademy entries, but current AgentV serve reads projects from $AGENTV_HOME/config.yaml, so AGENTV_HOME=/home/entity/.agentv PORT=3120 bun apps/cli/src/cli.ts serve registered only agentv. A temporary corrected config.yaml with projects[].results made /api/projects show agentv, WTG.AI.Prompts, and WiseTechAcademy.Evals, and /api/projects/wtg-ai-prompts/remote/sync returned configured=true, available=true, repo=WiseTechGlobal/WTG.AI.Prompts.EvalResults, run_count=1. Also inspect agentv-deploy/scripts/run-local-agentv.sh: it writes id wtg-ai-prompts but name/path/source for financial-research-agent, so the private/local runner can display or route the wrong project.","created_at":"2026-06-06T05:15:10Z"},{"id":120,"issue_id":"av-fgt","author":"entity","text":"Launching continuation NTM worker for public demo setup config drift. AgentV coordination/worktree: /home/entity/ntm_Dev/agentv-public-demo-config; deploy implementation worktree: /home/entity/ntm_Dev/agentv-deploy-public-demo-config.","created_at":"2026-06-06T11:55:46Z"},{"id":122,"issue_id":"av-fgt","author":"entity","text":"Continuing Bead av-fgt in deploy checkout /home/entity/ntm_Dev/agentv-deploy-public-demo-config on feature/av-fgt-public-demo-config. Scope: update public demo setup config shape, add static validation, verify fresh public demo home via Dashboard/API.","created_at":"2026-06-06T12:01:08Z"},{"id":128,"issue_id":"av-fgt","author":"entity","text":"Completed public demo config drift fix in agentv-deploy. Branch: feature/av-fgt-public-demo-config. Commit: EntityProcess/agentv-deploy@70cdef1b51b0779d159d4a6ff6b7fd63cf1cca25. Changes: setup-local-agentv-dev.sh, run-local-agentv.sh, and docker-entrypoint.sh now write current AgentV home config.yaml with projects[] entries and per-project projects[].results; legacy projects.yaml is archived if present; README stale projects.yaml/results_by_project guidance removed; run-local-agentv.sh financial project id fixed from the WTG/private id to financial-research-agent; validate-config.sh now fails on stale config surfaces and checks the current project/results shape. Verification: sh -n on modified shell scripts; ./scripts/validate-config.sh (static checks + docker compose config); fresh AGENTV_HOME=/tmp/agentv-av-fgt-home via setup-local-agentv-dev.sh --no-sync parsed with exactly agentv, financial-research-agent, swe-evals and no projects.yaml/results_by_project; started current AgentV CLI from reference checkout against that fresh home on PORT=39118 and /api/projects returned exactly agentv, financial-research-agent, swe-evals with remote/status configured for EntityProcess/agentv-examples-eval-results, EntityProcess/financial-research-agent-evals, and EntityProcess/swe-evals-results; exercised run-local-agentv.sh --no-serve --skip-install in /tmp/agentv-av-fgt-run-local and verified ids exactly agentv, financial-research-agent, swe-evals with no wtg-ai-prompts. Note: bun install --frozen-lockfile and bun run build were needed in the AgentV reference checkout for verification; no AgentV core source changes were needed and no tracked AgentV files were changed. Blockers: none.","created_at":"2026-06-06T12:26:39Z"}]} +{"id":"av-fis","title":"dogfood: adversarial remote result sync before production","description":"Production dogfood request:\nBefore deploying AgentV to production, adversarially dogfood remote result repository sync across mutable dashboard and CLI workflows, then implement focused fixes for production-blocking bugs.\n\nScope and scenarios:\n- Add, edit, and remove run/result tags locally, sync to remote, pull into a clean clone, and verify persistence and dirty-state behavior.\n- Combine runs, sync combined runs, delete original/constituent/combined runs in different orders, and verify indexes, dashboard lists, and remote state stay coherent.\n- Delete local runs after remote sync, re-sync/pull remote-only state, delete remote runs while local has metadata edits, and verify conflict/dirty messaging is explicit.\n- Exercise local-only, remote-only, missing metadata, empty remote repo, partial/corrupt index.jsonl, interrupted sync/retry, idempotent repeated sync, auth failure/offline remote, branch/default-branch mismatch, and multiple result repo/project cases where supported.\n- Cover both dashboard UX and CLI/API paths for sync/status/mutation flows.\n- Confirm no data loss, no silent overwrite of user metadata, clear recovery guidance, and safe defaults for production.\n\nWorker requirements:\n- Use repo-local AGENTS.md, Beads, Agent Mail reservations, and a dedicated worktree unless the worker proves a shared checkout is clean/current and the change is tiny.\n- Use ep-engineering:ntm for subagent orchestration, agentv-dev skills when eval fixtures/runs are involved, ce-dogfood-beta/ce-debug/ce-code-review/ce-frontend-design as appropriate, and GitHub skills for PR/CI if needed.\n- Coordinate with active run-bundle work, especially av-wy0.3/av-wy0.2 paths. Do not edit files reserved by another worker without Agent Mail coordination.\n- Produce a durable dogfood report with scenario matrix, evidence paths, fixes, residual risks, and a production readiness verdict.\n- Implement small, unambiguous fixes discovered by dogfood. For larger design changes, create linked follow-up Beads with reproduction evidence.\n\nAcceptance:\n- A committed dogfood report covers the scenario matrix above and records exact commands, dashboard routes, remote repo setup, screenshots/log evidence, and pass/fail outcomes.\n- Production-blocking remote-sync bugs found during the sweep are fixed with focused tests and documented verification.\n- Any unfixed behavior is converted into linked Beads with severity, reproduction steps, and recommended owner.\n- Final Bead/PR notes state whether AgentV remote repo sync is production-ready for mutable tags, combine/delete, and remote sync workflows.","status":"in_review","priority":0,"issue_type":"task","assignee":"agentv--remote-sync-dogfood","created_at":"2026-06-08T23:44:22.359476381Z","created_by":"entity","updated_at":"2026-06-09T05:53:39.280389061Z","external_ref":"https://github.com/EntityProcess/agentv/pull/1332","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["dashboard","dogfood","production","remote-sync","results"],"comments":[{"id":310,"issue_id":"av-fis","author":"entity","text":"Spawned NTM session agentv--remote-sync-dogfood. Assignment: orchestrate adversarial remote result repo sync dogfood before production, spawn subagents as needed, implement focused production-blocking fixes, and record a durable report/evidence plus follow-up Beads for larger findings.","created_at":"2026-06-08T23:46:10Z"},{"id":311,"issue_id":"av-fis","author":"agentv--remote-sync-dogfood","text":"Initial plan: create an isolated worktree from current origin/main because the primary checkout has dirty Beads/NTM state; reserve docs/dogfood-reports, remote-sync implementation/tests, and any dashboard sync UX files before edits; inspect current CLI/API/dashboard remote result sync code and existing tests; draft the adversarial matrix before fixes; dogfood against throwaway local/file remotes and temporary clones only; implement only small production-blocking fixes with focused regression tests; create linked follow-up Beads for larger or ambiguous design issues; keep this Bead updated with evidence and final readiness verdict.","created_at":"2026-06-08T23:48:58Z"},{"id":312,"issue_id":"av-fis","author":"agentv--remote-sync-dogfood","text":"Matrix checkpoint: created docs/dogfood-reports/2026-06-08-av-fis-remote-sync-dogfood.md in the dedicated worktree with 35 pending adversarial scenarios before any fixes. Initial inspection found remote sync is Dashboard/API-first; CLI coverage is eval auto-export plus local results combine/delete, with no direct agentv results remote sync/status command exposed.","created_at":"2026-06-08T23:58:57Z"},{"id":313,"issue_id":"av-fis","author":"entity","text":"Dogfood checkpoint/final: completed adversarial remote-result sync matrix in dedicated worktree /home/entity/projects/EntityProcess/agentv.worktrees/av-fis-remote-sync-dogfood. Durable report: docs/dogfood-reports/2026-06-08-av-fis-remote-sync-dogfood.md with evidence under docs/dogfood-reports/evidence/2026-06-08-av-fis-remote-sync/. Implemented two focused fixes: (1) sync failure now preserves cached remote run_count instead of reporting 0; (2) API merged run listing dedupes synced local+remote copies in favor of local runs. Verification: focused suite passed 121 tests / 0 failures / 381 expectations; bun run build passed. Readiness verdict: remote result data path is production-ready for controlled rollout after these fixes, but Dashboard production UX still needs follow-up/waiver for av-fis.1 and av-xqm; av-fis.3 remains a regression-test follow-up for interrupted retry.","created_at":"2026-06-09T00:45:10Z"},{"id":314,"issue_id":"av-fis","author":"entity","text":"Commit created in dogfood worktree: 0c3ae8b5 fix(results): harden remote sync dogfood. Includes remote.ts fixes, serve regression tests, dogfood report, and evidence artifacts. Note: Bead JSONL state in the primary checkout was already dirty with unrelated orchestration changes, so it was intentionally not staged into this worktree commit.","created_at":"2026-06-09T00:47:41Z"},{"id":315,"issue_id":"av-fis","author":"entity","text":"Final commit hash after evidence whitespace cleanup: e64364ce fix(results): harden remote sync dogfood. Post-commit check: git show --check HEAD passed; dogfood worktree is clean and ahead of origin/main by 1.","created_at":"2026-06-09T00:49:34Z"},{"id":316,"issue_id":"av-fis","author":"entity","text":"Shipping handoff complete: pushed dogfood/av-fis-remote-sync and opened PR #1332 against main: https://github.com/EntityProcess/agentv/pull/1332. PR is open/not draft; merge not attempted. Initial status: mergeable, CI running with Build/Typecheck/Lint/Test/Validate Evals in progress and Check Links/Validate Marketplace/Cloudflare Pages passing at first poll.","created_at":"2026-06-09T00:55:27Z"},{"id":317,"issue_id":"av-fis","author":"entity","text":"PR #1332 CI update: initial Test job failed on unrelated timing-threshold flake in packages/core/test/evaluation/orchestrator.test.ts (expected duration >=20ms, received 19ms). Reproduced the single test locally on the PR branch and it passed; reran the failed GitHub Actions job. All PR checks now pass. No merge attempted.","created_at":"2026-06-09T00:59:18Z"},{"id":318,"issue_id":"av-fis","author":"entity","text":"Private evidence cleanup for PR #1332 complete. AgentV public branch dogfood/av-fis-remote-sync was rebased/force-pushed at c81c89ca fix(results): harden remote sync dogfood; GitHub PR diff now contains only apps/cli/src/commands/results/remote.ts and apps/cli/test/commands/results/serve.test.ts, with no docs/dogfood-reports artifacts. Durable dogfood report/evidence moved to EntityProcess/agentv-private branch dogfood/av-fis-remote-sync-evidence at 5d81c8d, path dogfood/av-fis/2026-06-08-remote-sync/. PR body updated with private branch/commit/path. Verification after cleanup: bun test apps/cli/test/commands/results/serve.test.ts passed 83/83; git show --check HEAD passed; PR #1332 checks all pass. Commit 9bff9024ebad5d569e90efa20aa652677e281b7a was not resolvable from local refs, origin, or GitHub commit API during cleanup.","created_at":"2026-06-09T02:48:12Z"},{"id":319,"issue_id":"av-fis","author":"OliveHeron","text":"Reconciled NTM-only remote sync follow-up Beads into the canonical AgentV graph from /home/entity/ntm_Dev/agentv. Preserved issue IDs av-fis, av-fis.1, av-fis.2, av-fis.3, and av-xqm; parent-child dependencies imported cleanly. Backup before import: /tmp/agentv-beads-backups-20260609T054751Z. Acceptance criteria sharpened from ao-rl9.4 follow-up guidance.","created_at":"2026-06-09T05:53:39Z"}]} +{"id":"av-fis.1","title":"bug: Dashboard sync button can remain syncing after concurrent remote sync","description":"Dogfood evidence from av-fis: while Dashboard project /projects/av-fis-dashboard had dirty remote metadata, agent-browser clicked Sync Metadata and a concurrent POST /api/projects/av-fis-dashboard/remote/sync was issued. The API sync completed cleanly, remote status returned sync_status=clean with run_count=1, dirty_paths=[], and the metadata overlay was committed to the throwaway remote. The browser row cleared Pending sync, but the sync button stayed disabled with label 'Syncing...' until page reload. Evidence in dogfood report worktree: docs/dogfood-reports/evidence/2026-06-08-av-fis-remote-sync/dashboard-project-after-sync-settled.png and dashboard-project-after-reload.png. Recommended follow-up: add a focused browser/API test for concurrent sync responses and make the UI clear in-flight state after a blocked/syncing response or after status refetch returns clean.","acceptance_criteria":"- Reproduce the dogfood failure where a Dashboard-triggered sync plus a concurrent API sync can leave the sync control disabled with a `Syncing...` label after the backend status has settled clean.\n- Fix the Dashboard in-flight state so the sync button clears after the current sync response settles, after a blocked/syncing response is superseded by a status refetch, or when the latest status is clean with no pending dirty paths.\n- Keep this bead scoped to UI/request state cleanup. If investigation shows a persistent stale sync marker or backend lock-cleanup problem, create and link a child bead instead of expanding this one.\n- Add focused regression coverage and, for visible Dashboard changes, browser UAT evidence under the private dogfood evidence path.","status":"open","priority":1,"issue_type":"bug","assignee":"agentv","created_at":"2026-06-09T00:23:52.033802077Z","created_by":"entity","updated_at":"2026-06-09T05:53:38.507144505Z","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["dashboard","dogfood","follow-up","remote-sync"],"dependencies":[{"issue_id":"av-fis.1","depends_on_id":"av-fis","type":"parent-child","created_at":"2026-06-09T00:23:52.033802077Z","created_by":"entity","metadata":"{}","thread_id":""}]} +{"id":"av-fis.2","title":"feature: decide CLI contract for remote results sync/status","description":"Dogfood evidence from av-fis: remote results sync/status is exposed through Dashboard/API routes (/api/remote/status, /api/remote/sync, and project-scoped variants), while 'agentv results' exposes combine/delete/export/report/summary/failures/show/validate and has no direct remote sync/status subcommand. CLI-adjacent paths are covered by eval auto-export plus local results combine/delete. Recommended follow-up: decide whether production needs an explicit 'agentv results remote status/sync' command or whether docs should state that manual sync is Dashboard/API-only.","acceptance_criteria":"- Expose first-class CLI commands for remote result repositories, specifically `agentv results remote status` and `agentv results remote sync` unless implementation discovers an already-equivalent command surface.\n- Reuse the existing core remote status/sync implementation and existing project/config resolution; do not introduce a new remote-sync primitive or provider-specific knobs.\n- Support both concise human output and `--json` output. JSON must use snake_case boundary keys and the existing core status/sync fields rather than a CLI-only shape.\n- Return actionable nonzero failures for blocked, conflicted, auth/offline, rejected push, or missing-configuration states while preserving cached run/status information when available.\n- Add focused CLI tests plus help/docs updates for command names, JSON output, exit behavior, and relationship to Dashboard/API sync.","status":"open","priority":2,"issue_type":"feature","assignee":"agentv","created_at":"2026-06-09T00:23:52.184047052Z","created_by":"entity","updated_at":"2026-06-09T05:53:38.696553073Z","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["cli","dogfood","follow-up","remote-sync"],"dependencies":[{"issue_id":"av-fis.2","depends_on_id":"av-fis","type":"parent-child","created_at":"2026-06-09T00:23:52.184047052Z","created_by":"entity","metadata":"{}","thread_id":""}]} +{"id":"av-fis.3","title":"test: add interrupted remote sync retry coverage","description":"Dogfood gap from av-fis: idempotent repeated sync, concurrent sync, offline failure, dirty/behind/diverged/conflicted states were covered with throwaway remotes, but a true interrupted sync/retry case was not deterministically exercised. Recommended follow-up: build a focused harness that interrupts Dashboard/API sync between fetch/commit/push phases using a controllable git remote or injectable git runner, then verifies retry leaves the results repo clean or explicitly blocked without data loss.","acceptance_criteria":"- Add deterministic interrupted-sync retry coverage for git states that dogfood did not fully exercise: `.git/index.lock`, interrupted merge/rebase markers, rejected push after fetch, and retry after cleanup.\n- Verify cached remote run/status information remains available when sync is blocked and that block reasons identify the actionable cleanup or retry step.\n- Verify retry either leaves the results repository clean after the interrupted state is removed or remains explicitly blocked without data loss or silent overwrite.\n- Prefer a throwaway local remote or injectable git runner test harness; keep production changes minimal and generic to AgentV remote results sync.","status":"open","priority":2,"issue_type":"task","assignee":"agentv","created_at":"2026-06-09T00:24:36.809871416Z","created_by":"entity","updated_at":"2026-06-09T05:53:38.869128106Z","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["dogfood","follow-up","remote-sync","test"],"dependencies":[{"issue_id":"av-fis.3","depends_on_id":"av-fis","type":"parent-child","created_at":"2026-06-09T00:24:36.809871416Z","created_by":"entity","metadata":"{}","thread_id":""}]} {"id":"av-fo9","title":"public demo: build financial-research-agent eval repo","description":"Scope correction for the former dexter-evals companion project.\\n\\nDesign:\\n- The demo subject repository/project is financial-research-agent: a coding/web research agent that attempts to reproduce the public financial-research behavior Dexter demonstrates.\\n- Dexter is used only as an upstream public benchmark fixture: pin virattt/dexter, read src/evals/dataset/finance_agent.csv, and use its Answer column as expected_output/golden answers plus Rubric as AgentV rubric criteria.\\n- Do not require or run Dexter by default. Do not require FINANCIAL_DATASETS_API_KEY for the default public demo path.\\n- Keep an optional dexter-agent compatibility target only for users who explicitly configure the paid Dexter prerequisites.\\n- Rename the companion project from dexter-evals to financial-research-agent, with eval YAML/config/scripts/docs living in that repo/project.\\n- Result sync should publish this project to the public result repository financial-research-agent-evals.\\n\\nAcceptance:\\n- Rename/migrate dexter-evals files and docs to financial-research-agent without losing the Dexter source attribution/pinned commit.\\n- Default AgentV target is financial-research-agent and uses a coding agent with public web research instructions.\\n- Setup/validation pass without DEXTER_REPO_PATH or FINANCIAL_DATASETS_API_KEY for the default target.\\n- Generated evals default to financial-research-agent.\\n- Beads/result-sync/dashboard handoff notes reference financial-research-agent and financial-research-agent-evals, not dexter-evals-results.\\n- Coordinate in /home/entity/projects/EntityProcess/agentv for Beads and edit code in /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration alongside the SWE worker, touching only finance-specific paths unless coordinating first.","status":"closed","priority":1,"issue_type":"task","assignee":"BlackMeadow","created_at":"2026-06-04T04:15:56.086604136Z","created_by":"entity","updated_at":"2026-06-06T04:10:33.680784058Z","closed_at":"2026-06-04T10:28:24.838913958Z","close_reason":"Completed: financial-research-agent sibling repo contains the full pinned Dexter finance eval, canonical generator/docs, verified setup/validation, and final docs commit pushed to origin/main.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["evals","finance","public-demo"],"comments":[{"id":37,"issue_id":"av-fo9","author":"BlackMeadow","text":"bead-spawn-agent launched an agent for av-fo9.\n\nSession: agent-av-fo9-main-20260604061758\nDirectory: /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration\nProfile: codex-eng (auto-detected if not specified)\n\nExported EP_TASK_ID, BEAD_ID, and AGENTV_BEAD_ID as av-fo9.\nBeads coordination checkout: /home/entity/projects/EntityProcess/agentv\nWorktree: /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration","created_at":"2026-06-04T04:17:58Z"},{"id":38,"issue_id":"av-fo9","author":"entity","text":"Starting finance companion migration in shared worktree /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration. I will avoid swe-evals/**, root .gitignore, and shared root config. Need to edit the finance companion package.json because the package name and eval script still say dexter-evals/dexter-finance; this is package metadata scoped to the finance companion, not shared package manager state.","created_at":"2026-06-04T04:22:20Z"},{"id":40,"issue_id":"av-fo9","author":"entity","text":"Implemented finance companion migration in shared integration worktree. Renamed dexter-evals/ to financial-research-agent/ and renamed evals/dexter-finance-smoke.eval.yaml to evals/financial-research-agent-smoke.eval.yaml. Updated companion README, package scripts/name, setup messages, generated eval defaults, eval suite name/tags, default target docs, and public result repo reference to financial-research-agent-evals. Dexter remains only as pinned fixture/golden-answer source and optional dexter-agent compatibility target.\\n\\nVerification evidence: default setup passed with DEXTER_REPO_PATH and FINANCIAL_DATASETS_API_KEY explicitly unset using dummy non-secret grader/model env: AGENT_TARGET=financial-research-agent, GRADER_TARGET=openai-grader, CODEX_EXECUTABLE=codex-eng, CODEX_MODEL=gpt-5.5, CODEX_WORKSPACE_DIR=.agentv/codex-workspaces, CODEX_LOG_DIR=.agentv/logs/codex, OPENAI_MODEL=dummy-grader-model. AgentV validation passed for financial-research-agent/evals/financial-research-agent-smoke.eval.yaml and financial-research-agent/.agentv/targets.yaml: 2 valid, 0 invalid. Dry-run target resolution also completed with the same default target and no Dexter/Financial Datasets env; it resolved to financial-research-agent-dry-run and only failed quality score because the dry-run mock answer does not satisfy the real rubric. Cleaned the generated root OTEL artifact afterward.\\n\\nTouched surface: finance companion rename/content only. I did not modify swe-evals/**, root .gitignore, shared root config, or package manager files outside the finance companion package.json.","created_at":"2026-06-04T04:28:37Z"},{"id":41,"issue_id":"av-fo9","author":"BlackMeadow","text":"Scope refinement from user: commit generated AgentV YAML for the full Dexter finance_agent.csv dataset, not just a hand-picked smoke subset. The generator remains canonical for refreshes, but the checked-in eval YAML should cover every CSV row. A smaller smoke/test-id run may be used only for quick verification, not as the committed dataset boundary.","created_at":"2026-06-04T05:21:39Z"},{"id":42,"issue_id":"av-fo9","author":"entity","text":"Resuming finance bead for refinement: committed YAML should cover every row in Dexter's pinned finance_agent.csv, not a hand-picked smoke subset. I will update the generator so the full dataset is the default, rename docs/eval paths away from smoke as the dataset boundary, regenerate the committed eval YAML from the pinned Dexter CSV, validate full YAML plus targets, and stop before committing. Beads mutations remain in /home/entity/projects/EntityProcess/agentv.","created_at":"2026-06-04T05:23:31Z"},{"id":49,"issue_id":"av-fo9","author":"entity","text":"Migrated latest preserved financial-research-agent artifact into sibling repo /home/entity/projects/EntityProcess/financial-research-agent. Confirmed sibling repo was clean on main before migration at c649fd847659b1aa2c19280016c3956fda2d6847. During copy, rsync --delete removed the repo .git metadata; recovered it from https://github.com/christso/financial-research-agent.git, whose HEAD matched the pre-copy hash. No commit made.\\n\\nMigration result: integration worktree copy remains preserved; sibling repo now matches the preserved artifact content except only README.md is dirty against origin/main. The full eval YAML is present at evals/financial-research-agent.eval.yaml with 50 tests and source_row entries through 50. Generator remains canonical: running DEXTER_REPO_PATH=/tmp/dexter-pinned-8d9419829f443f84b804d033bb2c3b1fbd788629 DEXTER_COMMIT=8d9419829f443f84b804d033bb2c3b1fbd788629 bun run scripts/generate-eval-from-dexter.ts --out /tmp/financial-research-agent.regenerated.eval.yaml produced 50 tests and cmp matched the committed eval YAML byte-for-byte.\\n\\nVerification in sibling repo: default setup passed with DEXTER_REPO_PATH and FINANCIAL_DATASETS_API_KEY explicitly unset using dummy non-secret env (AGENT_TARGET=financial-research-agent, GRADER_TARGET=openai-grader, CODEX_EXECUTABLE=codex-eng, CODEX_MODEL=gpt-5.5, CODEX_WORKSPACE_DIR=.agentv/codex-workspaces, CODEX_LOG_DIR=.agentv/logs/codex, OPENAI_MODEL=dummy-grader-model). AgentV validation passed for evals/financial-research-agent.eval.yaml and .agentv/targets.yaml: 2 valid, 0 invalid. Stale naming scan found no smoke/dexter-evals/dexter-finance/financial-research-agent-generated strings in the sibling repo content.\\n\\nStatus/blockers: sibling repo has uncommitted README.md only, changing the result sync wording from financial-research-agent-eval-results to financial-research-agent-evals. No validation blockers. Awaiting explicit commit instruction.","created_at":"2026-06-04T09:22:31Z"},{"id":51,"issue_id":"av-fo9","author":"entity","text":"Completed financial-research-agent sibling repo migration and final docs commit.\\n\\nCommit: abf4384ae26bc1189f9ae9b2c4b0f71612be5c6e (docs: align financial result repo name)\\nPush target: https://github.com/christso/financial-research-agent.git main (origin/main), push succeeded c649fd8..abf4384.\\n\\nFinal verification evidence from /home/entity/projects/EntityProcess/financial-research-agent: full eval YAML at evals/financial-research-agent.eval.yaml has 50 tests and source_row through 50; generator reproduced the committed eval byte-for-byte from the pinned Dexter CSV; default setup passed with DEXTER_REPO_PATH and FINANCIAL_DATASETS_API_KEY unset using dummy non-secret grader/model env; AgentV validation passed for evals/financial-research-agent.eval.yaml and .agentv/targets.yaml with 2 valid, 0 invalid; stale naming scan found no smoke/dexter-evals/dexter-finance/financial-research-agent-generated strings.\\n\\nScope note: only /home/entity/projects/EntityProcess/financial-research-agent was committed/pushed, plus this Beads update from /home/entity/projects/EntityProcess/agentv. Did not touch unrelated AgentV dashboard-run-management changes.","created_at":"2026-06-04T10:28:24Z"},{"id":54,"issue_id":"av-fo9","author":"entity","text":"Post-closeout cleanup completed after separate repo push.\\n\\nDurability confirmed: /home/entity/projects/EntityProcess/financial-research-agent is clean at abf4384ae26bc1189f9ae9b2c4b0f71612be5c6e, and origin/main at https://github.com/christso/financial-research-agent.git resolves to the same hash. The sibling repo contains the migrated durable content: full 50-test eval YAML, canonical generator, targets, scripts, docs, and result repo wording.\\n\\nRemoved from AgentV integration worktree: deleted the untracked migrated copy directory /home/entity/projects/EntityProcess/agentv.worktrees/public-demo-integration/financial-research-agent/ because financial-research-agent now lives as its own sibling repository and AgentV should not carry that separate eval repo copy. Also removed temporary verification artifacts I created under /tmp: dexter-pinned-8d9419829f443f84b804d033bb2c3b1fbd788629, financial-research-agent.regenerated.eval.yaml, and financial-research-agent-dry-run.jsonl.\\n\\nLeft untouched: unrelated AgentV worktree changes including .gitignore and SWE/dashboard-run-management state. The existing tracked dexter-evals/** deletion state remains in the AgentV integration worktree as the AgentV-side removal of the old embedded companion content; I did not restore it because that would reintroduce separate eval repo content into AgentV, and I did not commit it because this closeout only requested the separate repo commit/push plus cleanup.\\n\\nAgent Mail/resources: this Codex session did not register an Agent Mail identity and did not create file reservations, so there was nothing to deregister or release. No subagents were spawned. Per user instruction, after this final note I will kill the tmux session agent-agentv-public-demo-financial-research-agent-fo9-main-20260604061758.","created_at":"2026-06-04T10:39:02Z"},{"id":81,"issue_id":"av-fo9","author":"entity","text":"Repo ownership update 2026-06-06: moved financial-research-agent from `christso/financial-research-agent` to public sibling repo `EntityProcess/financial-research-agent`. Local origin updated to `https://github.com/EntityProcess/financial-research-agent.git`; main is `90863fe`.","created_at":"2026-06-06T04:10:33Z"}]} {"id":"av-g56","title":"feat: delete local eval runs","description":"Goal:\nAdd a delete-runs capability for local AgentV result run workspaces so users can remove stale or accidental runs after creating/combining runs.\n\nContext:\n- av-l5n added combined runs and explicitly excluded delete/broad run-management behavior.\n- Current code has tag deletion and project deletion only; no run workspace deletion surface was found.\n\nScope:\n- Delete local run workspaces from CLI and Dashboard/API.\n- Do not delete remote/synced runs.\n- Keep the implementation primitive and deterministic: remove the selected local run workspace directory and refresh listings.\n- Avoid new run-management abstractions beyond what deletion needs.\n\nAcceptance:\n- CLI supports deleting one or more local runs by run ID/path with an explicit confirmation or force flag.\n- Dashboard/API supports deleting selected local completed runs and rejects remote runs.\n- Deleting a run removes its workspace directory and associated sidecars within that workspace.\n- Tests cover local deletion, remote rejection, missing run handling, and user-facing CLI/API behavior.\n- Red/green UAT evidence is recorded before handoff.","status":"closed","priority":2,"issue_type":"feature","assignee":"entity","created_at":"2026-06-05T13:10:14.254947559Z","created_by":"entity","updated_at":"2026-06-06T02:10:42.098237601Z","closed_at":"2026-06-06T02:10:42.098063466Z","close_reason":"Completed and pushed for review on origin/feat/av-g56-delete-runs at 7870929a. Browser dogfood screenshots pushed to agentv-assets-private at 351e76a; focused tests/lint/typecheck passed.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["cli","dashboard","runs"],"comments":[{"id":72,"issue_id":"av-g56","author":"entity","text":"Implementation note: Added local run deletion as a primitive rather than broad run management. CLI: agentv results delete --yes deletes local run IDs, workspace directories, or index.jsonl manifests after validation. API: DELETE /api/runs/:filename and DELETE /api/projects/:projectId/runs/:filename reject remote/active runs and remove the local run workspace. Dashboard Recent Runs selection now offers Delete alongside Combine for local completed runs. Docs updated in dashboard tool page.\n\nVerification: Red UAT on main: `bun apps/cli/src/cli.ts results delete --help` listed results subcommands without delete. Green UAT on branch: same help shows `agentv results delete` with --yes; synthetic run `demo::2026-06-01T10-00-00-000Z` was deleted from /tmp and confirmed RUN_DIR_DELETED. Automated: `bun test apps/cli/test/commands/results/delete.test.ts apps/cli/test/commands/results/serve.test.ts` passed 72 tests; `bun --filter agentv typecheck` passed; `bun --filter agentv build` passed; `bun --filter @agentv/dashboard build` passed; `bun run lint` passed. Core build was run first in the fresh worktree so CLI tests could resolve @agentv/core.","created_at":"2026-06-05T13:40:15Z"},{"id":73,"issue_id":"av-g56","author":"entity","text":"Follow-up polish: tightened CLI missing-run handling so unknown run IDs report `Run not found` instead of a path-shape validation error. Added regression coverage. Final focused test run after this patch: `bun test apps/cli/test/commands/results/delete.test.ts apps/cli/test/commands/results/serve.test.ts` passed 73 tests / 208 assertions.","created_at":"2026-06-05T13:44:31Z"},{"id":76,"issue_id":"av-g56","author":"entity","text":"Dogfood screenshot evidence saved and pushed to agentv-assets-private commit 351e76a. Paths:\n- dogfood/av-g56-delete-runs/01-recent-runs-before-delete.png\n- dogfood/av-g56-delete-runs/02-run-selected-delete-enabled.png\n- dogfood/av-g56-delete-runs/03-after-delete.png\n\nBrowser UAT used a temp Dashboard project on localhost:3217. The UI showed two local completed runs, selecting Candidate enabled Delete, confirming Delete removed Candidate from the table, and the run workspace was absent from disk afterward; only the baseline index.jsonl remained.","created_at":"2026-06-05T21:29:49Z"}]} {"id":"av-goc","title":"EPIC: AgentV demo gap follow-up work","description":"Project-scope grouping Epic for AgentV demo follow-up gaps discovered after the GitHub remote-sync demo and Dashboard UX polish. This Epic groups portable implementation Beads for: mobile Dashboard run-table/detail UX; execution-error vs quality-failure scoring and UI semantics; realistic local WTG.AI.Prompts PR 679 runs; and AgentV branding treatment. Orchestration state, session topology, prompt receipt summaries, and coordinator handoffs live in Agent Mail thread agentv-demo-gap-orchestration-20260607, not in this Epic. Each child Bead must carry its own acceptance criteria, URLs, paths, constraints, and verification expectations.","status":"closed","priority":1,"issue_type":"epic","assignee":"entity","created_at":"2026-06-07T08:22:02.468085266Z","created_by":"entity","updated_at":"2026-06-07T12:29:40.383559089Z","closed_at":"2026-06-07T12:29:40.383376680Z","close_reason":"Completed: all child project tasks closed; remaining PR #1323 is open, CLEAN, and ready for review/merge.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["dashboard","demo","frontend","orchestration","remote-sync"],"comments":[{"id":222,"issue_id":"av-goc","author":"entity","text":"Launching EP-owned tmux orchestrator session agentv-gap-orchestrator for follow-up demo gap delegation. PROMPT_UID=orch-gap-handoff-20260607-0822.","created_at":"2026-06-07T08:23:10Z"},{"id":223,"issue_id":"av-goc","author":"entity","text":"Accepted orchestration handoff as replacement coordinator. PROMPT_UID=orch-gap-handoff-20260607-0822. Standing constraints: coordinate through Beads/NTM only, do not implement feature work directly, preserve guarded NTM kill policy, and do not kill agentv-demo-github-sync without explicit user request or fully documented kill-gate evidence.","created_at":"2026-06-07T08:24:59Z"},{"id":226,"issue_id":"av-goc","author":"agentv-gap-orchestrator","text":"PROMPT_UID=beads-agentmail-correction-20260607-0829. Policy correction recorded: live orchestration state is moving to Agent Mail thread agentv-demo-gap-orchestration-20260607. Beads are only for portable task units or true Epics; this Bead is no longer an orchestration ledger and is being treated only as a grouping Epic for the four project tasks.","created_at":"2026-06-07T08:32:43Z"},{"id":246,"issue_id":"av-goc","author":"agentv-gap-orchestrator","text":"Grouping Epic closeout. All child Beads are closed: av-goc.1 mobile Dashboard UX (#1319 merged), av-goc.2 execution errors vs quality failures (#1321 merged), av-goc.3 WTG PR679 realistic local runs (local demo evidence complete, no AgentV PR needed), av-goc.4 AgentV wordmark (#1320 merged), and av-goc.5 uppercase AGENTV wordmark (#1323 open, CLEAN, ready for review/merge). Live orchestration/session state remains in Agent Mail thread agentv-demo-gap-orchestration-20260607. No sessions killed.","created_at":"2026-06-07T12:29:40Z"}]} @@ -80,4 +84,5 @@ {"id":"av-wy0.3","title":"results: materialize self-contained per-test task bundles","description":"Problem:\nPortable runs should not introduce parallel source schemas such as run_source.json or target_recipe.json when AgentV already has native eval and target config formats. The portable/auditable source unit is a single test task bundle, but it must not make the result artifact directory itself behave like an AgentV project root. Otherwise reruns can accidentally write nested output such as /.agentv/results/runs//...\n\nDesign goal:\nExtract a reusable materializer that can create self-contained per-test task bundles independent of running evals. The eval runner should call that function after/while writing results, but tests and future export/rerun flows should be able to call it directly from eval/target source inputs. For every result row, the materializer writes native AgentV task source under /task/: a single-test EVAL.yaml, a selected-target targets.yaml, copied test-referenced files, and copied grader-referenced assets. Existing result artifacts stay beside task/: input.md, grading.json, timing.json, and outputs/response.md. index.jsonl remains the run-level result index and points at both the result artifacts and the task bundle.\n\nWhy task/ rather than inputs/:\ninput.md is already the rendered model/agent input artifact. The source bundle also contains targets and graders, so calling it inputs/ is misleading. task/ better describes the runnable eval task contract and is more consistent with existing eval and benchmark frameworks.\n\nWhy this is not a new epic:\nThis is a straightforward scope correction inside the existing av-wy0 portable run-bundles epic. It replaces the stale manifest/recipe/source-JSON idea in this child bead with a smaller convention-over-configuration artifact layout and a reusable implementation boundary. Rerun execution remains in av-wy0.4 and docs/dogfood remain in av-wy0.5.\n\nSupersedes earlier av-wy0.3 comments that proposed run_manifest.json, target_recipe.json, and run_source.json as new schema artifacts, and supersedes the temporary inputs/ folder wording from the 2026-06-08 design discussion.\n\nNon-goals:\n- Do not execute reruns in this bead; that belongs to av-wy0.4.\n- Do not add a Margin-compatible runner or clone Margin schemas.\n- Do not add a benchmark registry/catalog abstraction.\n- Do not copy whole repositories into result artifacts.\n- Do not persist .env values, OAuth files, raw env dumps, or secret material.\n- Do not rename filenames to snake_case by analogy with field naming; snake_case applies to wire keys, not necessarily artifact filenames.\n- Do not require running a provider/eval just to test task-bundle materialization.","acceptance_criteria":"- Extract a reusable task-bundle materializer function/module that can be called without running an eval provider. It should accept source eval/test metadata, selected target/targets-file metadata, referenced file/grader metadata, and an output test artifact directory.\n- The eval runner calls the materializer for completed result rows when source metadata is available, instead of embedding task-bundle construction only inside end-of-run result writing.\n- Unit tests can invoke the materializer directly from fixture eval/targets inputs and assert the output tree without executing model/provider calls.\n- For each completed result row, the test artifact directory contains existing result artifacts beside a task/ folder: input.md, grading.json, timing.json, outputs/response.md when output exists, and task/.\n- input.md remains the rendered model/agent input for human inspection; task/ contains the native rerunnable task contract, including eval source, target config, files, and graders.\n- task/EVAL.yaml contains exactly the test case that produced that row, using normal AgentV eval YAML shape and snake_case keys inside the file.\n- task/EVAL.yaml references local files inside task/files/ and local grader assets inside task/graders/, not original workspace paths.\n- task/targets.yaml contains the selected target needed for that test, preserving variable-substitution placeholders as authored and never resolving or copying .env values.\n- task/files/ copies every file input or fixture referenced by the test, preserving enough relative path information for EVAL.yaml to reference those local copies.\n- task/graders/ copies every referenced grader asset needed by that test, including code-grader scripts, LLM-grader prompt files, and grader-side fixtures. Inline graders may remain inline in EVAL.yaml.\n- The materialized task bundle must not create or depend on a nested output root under the test artifact directory. In particular, tests should assert no /task/.agentv/results and no /.agentv/results are produced by materialization.\n- index.jsonl remains the run-level result index. It points at the per-test artifact folder and relevant files, and should add focused path fields such as artifact_dir, task_dir, eval_path, targets_path, files_path, and graders_path if needed by consumers.\n- benchmark.json remains the run-level aggregate summary. Add run-level metadata there only when a concrete Dashboard/API/rerun consumer needs it.\n- Do not introduce run_source.json, target_recipe.json, or a new run_manifest.json unless implementation proves an existing artifact cannot serve a concrete consumer.\n- Add focused tests for single-test and multi-test runs, direct materializer invocation, target placeholder preservation, file-reference copying, grader-asset copying, path rewriting, no-secret persistence, no nested .agentv/results creation, and backward-compatible reading of historical runs without these task bundles.","notes":"Design correction recorded from av-2lq Margin parity review and user review on 2026-06-08, with anti-nesting and final naming corrections later the same day. The goal is convention over configuration: use test-level task/EVAL.yaml and task/targets.yaml artifacts plus copied files/graders, with index.jsonl as the run-level index. input.md stays the rendered agent input artifact; task/ is the runnable task contract. Existing partial pieces include writeArtifactsFromResults() for result artifacts and buildRunSourceArtifact() for source capture, but av-wy0.3 should extract a task-bundle materializer that can run independently of eval execution. This bead should not implement target_recipe.json, run_source.json, run_manifest.json, inputs/ bundles, or filename snake_case churn.","status":"in_progress","priority":1,"issue_type":"feature","assignee":"codex-av-wy0.3","created_at":"2026-06-08T10:22:03.047804636Z","created_by":"entity","updated_at":"2026-06-08T23:51:29.606824631Z","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["auditability","rerun","results","run-bundles","targets"],"dependencies":[{"issue_id":"av-wy0.3","depends_on_id":"av-n75","type":"related","created_at":"2026-06-08T10:23:06.760050213Z","created_by":"entity","metadata":"{}","thread_id":""},{"issue_id":"av-wy0.3","depends_on_id":"av-wy0","type":"parent-child","created_at":"2026-06-08T10:22:03.047804636Z","created_by":"entity","metadata":"{}","thread_id":""}],"comments":[{"id":278,"issue_id":"av-wy0.3","author":"FuchsiaStream","text":"Design handoff from av-wy0 brainstorming (FuchsiaStream, 2026-06-08): start av-wy0.3 with schema/types/fixtures for `run_manifest.json` and `target_recipe.json`, then wire the writer. `run_manifest.json` is the bundle table of contents and links to existing artifacts; `target_recipe.json` is the redacted rerun-facing target/workspace recipe. Use snake_case keys, derive `required_env` names/placeholders from target config placeholders or known provider requirements when available, and never persist secret/env values or raw env dumps. No separate design bead is needed; this bead should still be design-first in implementation order. Rerun execution and deprecated output flag removal remain out of scope.","created_at":"2026-06-08T13:22:57Z"},{"id":282,"issue_id":"av-wy0.3","author":"FuchsiaStream","text":"Scope correction (FuchsiaStream, 2026-06-08): `run_manifest.json` should reference `run_source.json` only. Do not include a legacy `run-source.json` fallback in the new manifest/recipe schema. Apply the same rule to other new/unreleased bundle artifact names: hard-deprecate to the snake_case v1 name before release instead of carrying aliases.","created_at":"2026-06-08T13:42:12Z"},{"id":289,"issue_id":"av-wy0.3","author":"codex-av-2lq","text":"Design correction (codex-av-2lq, 2026-06-08): supersedes earlier comments proposing run_manifest.json, target_recipe.json, and run_source.json. The goal is straightforward and stays inside this existing av-wy0 child, not a new epic: make each per-test artifact folder the portable/auditable unit using existing AgentV conventions.\n\nImplementation direction: for every result row, write a self-contained test folder containing a single-test eval.yaml, selected-target targets.yaml, copied test-referenced files, copied grader-referenced assets, input.md, grading.json, timing.json, and outputs/response.md when present. index.jsonl remains the run-level result index and points at these test folders; benchmark.json remains the run-level aggregate. Preserve targets.yaml placeholders as authored and never persist .env values, OAuth files, raw env dumps, or secrets. Do not implement target_recipe.json/run_source.json or filename snake_case churn unless a concrete consumer proves existing artifacts cannot serve the use case.","created_at":"2026-06-08T21:26:22Z"},{"id":297,"issue_id":"av-wy0.3","author":"entity","text":"Design correction (codex-av-2lq, 2026-06-08): replace the temporary task/ wording with inputs/. av-wy0.3 should extract a reusable input-bundle materializer independent of eval execution. It writes inputs/EVAL.yaml, inputs/targets.yaml, inputs/files/, and inputs/graders/ beside existing result artifacts input.md, grading.json, timing.json, and outputs/response.md. Tests should invoke the materializer directly and assert it creates no nested .agentv/results under the test artifact directory or inputs/.","created_at":"2026-06-08T21:42:57Z"},{"id":303,"issue_id":"av-wy0.3","author":"entity","text":"Final naming correction (codex-av-2lq, 2026-06-08): av-wy0.3 should materialize task/ bundles, not inputs/. Extract a reusable task-bundle materializer independent of eval execution. It writes task/EVAL.yaml, task/targets.yaml, task/files/, and task/graders/ beside existing result artifacts input.md, grading.json, timing.json, and outputs/response.md. input.md remains the rendered agent input. Tests should invoke the materializer directly and assert no nested .agentv/results is created under the test artifact directory or task/.","created_at":"2026-06-08T21:48:14Z"},{"id":309,"issue_id":"av-wy0.3","author":"codex-orchestrator","text":"Dispatch note (codex-orchestrator, 2026-06-08): spawned NTM Codex worker for final task-bundle implementation. Session: agentv--task-bundles. Pane/Agent Mail identity: GentleForest. Scope: implement av-wy0.3 only using /home/entity/projects/EntityProcess/agentv for Beads/status/comments and a dedicated worktree under /home/entity/projects/EntityProcess/agentv.worktrees. Base implementation branch on origin/chore/av-wy0-task-bundle-design so the worker sees the corrected Beads design; that branch is origin/main plus Beads-only design/coordination commits. Do not use /home/entity/ntm_Dev/agentv as the Beads coordination checkout. Build self-contained per-test task/ bundles with task/EVAL.yaml, task/targets.yaml, task/files/, task/graders/; preserve placeholders; do not persist secrets; do not implement run_source.json, target_recipe.json, run_manifest.json, or inputs/ bundles; assert no nested .agentv/results under test artifact directories or task/.","created_at":"2026-06-08T22:25:42Z"},{"id":310,"issue_id":"av-wy0.3","author":"codex-av-wy0.3","text":"Implemented self-contained per-test task bundles in feat/av-wy0.3-task-bundles. Key decisions: hard-deprecated the unreleased run-source surface; task/EVAL.yaml and task/targets.yaml are the native portable contract; index.jsonl carries artifact_dir/task_dir/eval_path/targets_path/files_path/graders_path links. Verification: focused Bun tests passed (128 pass) for task-bundle materializer, artifact writer, results serve, and combine; bun --filter agentv typecheck passed; red/green dry-run UAT confirmed origin/main lacks task_dir/task/EVAL.yaml while this branch writes task_dir, selected target mock-target in task/EVAL.yaml, copied input_files under task/files, and no nested .agentv/results.","created_at":"2026-06-08T23:51:29Z"}]} {"id":"av-wy0.4","title":"cli: rerun a run bundle with local target environment","description":"Problem:\nUsers should be able to rerun a captured AgentV run bundle with the same single-test eval contract and target shape while supplying their own local env vars/secrets. This is distinct from replay: replay reuses captured outputs; rerun executes the captured eval again.\n\nThe rerun input should be the native per-test task bundles written by av-wy0.3: task/EVAL.yaml, task/targets.yaml, task/files/, and task/graders/. Rerun must avoid the nested-output trap where running from the task/test folder with defaults creates /.agentv/results/runs//... It should always pass or choose an explicit output run directory outside the captured task bundle.","acceptance_criteria":"- Add a CLI workflow such as agentv runs rerun that uses index.jsonl to select test artifact folders and reads task/EVAL.yaml and task/targets.yaml.\n- Invoke the captured eval by explicit file path, e.g. equivalent to agentv eval /task/EVAL.yaml --targets /task/targets.yaml --output , rather than relying on cwd/default output discovery inside task/.\n- Support rerunning one test, a subset of tests, or all captured tests when the required per-test task artifacts are present.\n- Support supplying local env via env file or ambient environment and fail clearly when target placeholders or provider requirements are missing. Do not require a bundled required_env artifact.\n- Support overriding targets with a compatible targets.yaml when safe.\n- Never read secret values from the bundle; use placeholders in targets.yaml and local env only.\n- Preserve test_id mapping and emit a new normal AgentV run with links back to the source run and per-test task bundle.\n- Fail loudly when the bundle lacks required task/EVAL.yaml, task/targets.yaml, copied files, copied grader assets, or when target config is incompatible.\n- Tests must assert rerun output is written to the chosen output run directory and not under /.agentv/results or /task/.agentv/results.\n- Add CLI tests covering happy path, missing env, incompatible target override, missing per-test task artifacts, subset rerun, explicit output location, no nested output, and distinction from replay output fixtures.\n\nDependencies:\n- Requires the self-contained per-test task-bundle contract from av-wy0.3. av-wy0.1 is closed as not planned and is not a prerequisite.","notes":"Design corrected on 2026-06-08 after av-2lq/user review, anti-nesting follow-up, and final task/ naming correction. Rerun should consume native per-test task/EVAL.yaml and task/targets.yaml artifacts instead of a manifest/recipe/source JSON contract. Always write rerun output to an explicit separate run directory.","status":"open","priority":2,"issue_type":"feature","created_at":"2026-06-08T10:22:23.327272098Z","created_by":"entity","updated_at":"2026-06-08T22:23:43.852833863Z","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["auditability","cli","rerun","run-bundles","targets"],"dependencies":[{"issue_id":"av-wy0.4","depends_on_id":"av-vwa.2","type":"related","created_at":"2026-06-08T10:23:06.964086759Z","created_by":"entity","metadata":"{}","thread_id":""},{"issue_id":"av-wy0.4","depends_on_id":"av-wy0","type":"parent-child","created_at":"2026-06-08T10:22:23.327272098Z","created_by":"entity","metadata":"{}","thread_id":""},{"issue_id":"av-wy0.4","depends_on_id":"av-wy0.3","type":"blocks","created_at":"2026-06-08T10:23:06.139647051Z","created_by":"entity","metadata":"{}","thread_id":""}],"comments":[{"id":293,"issue_id":"av-wy0.4","author":"entity","text":"Design correction (codex-av-2lq, 2026-06-08): aligned rerun scope with the corrected per-test artifact design. Rerun should use index.jsonl to locate test folders and consume test-local eval.yaml and targets.yaml plus copied files/graders. It should not depend on run_source.json, run_manifest.json, target_recipe.json, or a bundled required_env artifact.","created_at":"2026-06-08T21:32:26Z"},{"id":299,"issue_id":"av-wy0.4","author":"entity","text":"Design correction (codex-av-2lq, 2026-06-08): rerun should consume inputs/EVAL.yaml and inputs/targets.yaml by explicit path and always choose/pass a separate output run directory. It must not run from inside the captured inputs/ folder with default output behavior, because that risks nested /.agentv/results/... artifacts.","created_at":"2026-06-08T21:42:57Z"},{"id":305,"issue_id":"av-wy0.4","author":"entity","text":"Final naming correction (codex-av-2lq, 2026-06-08): rerun should consume task/EVAL.yaml and task/targets.yaml by explicit path and always choose/pass a separate output run directory. Do not run from inside task/ with default output behavior. Earlier inputs/ wording is superseded.","created_at":"2026-06-08T21:48:41Z"}]} {"id":"av-wy0.5","title":"docs: dogfood run bundle auditability workflow","description":"Problem:\nRun bundle auditability needs a user-facing workflow and dogfood evidence, especially around Dashboard Files visibility and rerun planning. Users should understand how to inspect run-level summaries plus per-test task source, test_id, target config, grader assets, copied files, rendered input.md, outputs, traces, and scoring files from a run folder without accidentally treating the captured task folder as the rerun output root.","acceptance_criteria":"- Document the run bundle/auditability workflow: run folder as audit boundary, index.jsonl as run-level result index, benchmark.json/timing/transcript as run-level summaries, per-test task/EVAL.yaml, task/targets.yaml, task/files/, task/graders/, rendered input.md, Dashboard Files, replay vs rerun semantics, and no-secret guarantees.\n- Documentation must distinguish input.md as the rendered model/agent input from task/ as the captured native runnable task contract.\n- Documentation must call out the anti-nesting rule: task/ is the captured source bundle; reruns must write to an explicit separate output run directory, not /.agentv/results or task/.agentv/results.\n- Include a small public-safe example or fixture that demonstrates run-level files and self-contained per-test task bundles in the Dashboard.\n- Add dogfood evidence from Dashboard UAT for a real or representative run: open the Dashboard, navigate to the run/result Files viewer, and visually confirm run-level files plus per-test task/EVAL.yaml, task/targets.yaml, task/files/, task/graders/, and input.md are visible.\n- Save browser screenshots/evidence under /home/entity/projects/EntityProcess/agentv-private/dogfood//, commit and push to agentv-private, and record the commit/path in the bead/PR.\n- Cross-link eval source traceability docs from av-n75 and trace/replay docs from av-vwa where relevant, while making clear that this design does not require run_source.json, run_manifest.json, or target_recipe.json.\n\nDependencies:\n- Should follow the self-contained per-test task-bundle contract from av-wy0.3, Dashboard Files implementation from av-wy0.2, and rerun anti-nesting rule from av-wy0.4/design notes. av-wy0.1 is closed as not planned and is not a prerequisite.","notes":"Design corrected on 2026-06-08 after av-2lq/user review, anti-nesting follow-up, and final task/ naming correction. Docs and dogfood should teach per-test native task artifacts, not manifest/target-recipe/source JSON artifacts, and should tell users to write rerun output outside the captured task folder. Dogfood requires Dashboard visual confirmation in the Files viewer.","status":"open","priority":2,"issue_type":"task","created_at":"2026-06-08T10:22:42.578431238Z","created_by":"entity","updated_at":"2026-06-08T22:24:06.995372040Z","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["auditability","dashboard","docs","dogfood","run-bundles"],"dependencies":[{"issue_id":"av-wy0.5","depends_on_id":"av-wy0","type":"parent-child","created_at":"2026-06-08T10:22:42.578431238Z","created_by":"entity","metadata":"{}","thread_id":""},{"issue_id":"av-wy0.5","depends_on_id":"av-wy0.2","type":"blocks","created_at":"2026-06-08T10:23:06.430348748Z","created_by":"entity","metadata":"{}","thread_id":""},{"issue_id":"av-wy0.5","depends_on_id":"av-wy0.3","type":"blocks","created_at":"2026-06-08T10:23:06.582670098Z","created_by":"entity","metadata":"{}","thread_id":""}],"comments":[{"id":294,"issue_id":"av-wy0.5","author":"entity","text":"Design correction (codex-av-2lq, 2026-06-08): aligned docs/dogfood scope with the corrected per-test artifact design. Documentation should teach run-level index/summary files and per-test native artifacts: eval.yaml, targets.yaml, files/, graders/, input/output/grading/timing. It should explicitly avoid presenting manifest/source/target-recipe JSON as required v1 artifacts.","created_at":"2026-06-08T21:32:26Z"},{"id":300,"issue_id":"av-wy0.5","author":"entity","text":"Design correction (codex-av-2lq, 2026-06-08): dogfood must use Dashboard UAT, navigate to the run/result Files viewer, and visually confirm run-level files plus per-test inputs/EVAL.yaml, inputs/targets.yaml, inputs/files/, inputs/graders/, and input.md are visible. Save screenshots/evidence to agentv-private and record commit/path in the bead/PR.","created_at":"2026-06-08T21:42:57Z"},{"id":306,"issue_id":"av-wy0.5","author":"entity","text":"Final naming correction (codex-av-2lq, 2026-06-08): dogfood must use Dashboard UAT and visually confirm the Files viewer shows run-level files plus per-test task/EVAL.yaml, task/targets.yaml, task/files/, task/graders/, and input.md. input.md is rendered input; task/ is the runnable task contract. Save screenshots/evidence to agentv-private. Earlier inputs/ wording is superseded.","created_at":"2026-06-08T21:48:54Z"}]} +{"id":"av-xqm","title":"bug: Dashboard remote sync status can show stale last_error after later conflict state","description":"Adversarial remote-sync Dashboard dogfood found that after a failed unsafe dirty sync, a later conflicted results-clone state shows the current Conflicted status and disables Sync Project, but still renders the previous last_error block below the current status. Evidence came from throwaway file remote/session /tmp/agentv-av-fis-remote-sync-9L2U3A, screenshots dashboard-unsafe-dirty-blocked-desktop.png and dashboard-conflicted-desktop.png. Likely surface: apps/dashboard/src/components/RunSourceToolbar.tsx renders remoteStatus.last_error unconditionally, so stale error text can survive a current status summary. Recommended fix: only show last_error when it describes the current blocked/sync-failure response, or clear/derive it at the API boundary; add a focused project-sync-status/toolbar regression test.","acceptance_criteria":"- Ensure a stale `last_error` from an earlier failed remote sync is cleared or superseded when the current remote status changes to a different blocked/conflicted/clean state.\n- Prefer deriving or clearing stale error state at the API/status boundary when possible; otherwise gate Dashboard rendering so `last_error` is shown only when it describes the current sync/status result.\n- Preserve actionable current errors for real failed sync responses; do not hide the active block reason.\n- Add focused regression coverage for an unsafe dirty sync failure followed by a conflicted status, asserting that the current conflict/status copy is visible and the older error text is absent.","status":"in_review","priority":2,"issue_type":"bug","assignee":"OliveHeron","created_at":"2026-06-09T00:30:52.567084593Z","created_by":"entity","updated_at":"2026-06-09T06:22:24.154734573Z","external_ref":"https://github.com/EntityProcess/agentv/pull/1338","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"dependencies":[{"issue_id":"av-xqm","depends_on_id":"av-fis","type":"parent-child","created_at":"2026-06-09T00:30:52.567084593Z","created_by":"entity","metadata":"{}","thread_id":""}],"comments":[{"id":332,"issue_id":"av-xqm","author":"OliveHeron","text":"Claimed for implementation in worktree /home/entity/projects/EntityProcess/agentv.worktrees/fix-av-xqm-stale-last-error on branch fix/av-xqm-stale-last-error. Scope: clear or supersede stale remote sync last_error at the status/API boundary with focused core regression coverage.","created_at":"2026-06-09T05:58:47Z"},{"id":333,"issue_id":"av-xqm","author":"OliveHeron","text":"Implementation verification before PR: red check was the new focused regression `bun test packages/core/test/evaluation/results-repo.test.ts -t \"supersedes stale sync errors\"`, which failed before the core change with stale `Results repo has uncommitted changes and auto_push is disabled` while current status was conflicted.\n\nGreen checks after implementation:\n- `bun test packages/core/test/evaluation/results-repo.test.ts -t \"supersedes stale sync errors\"` passed.\n- `bun test packages/core/test/evaluation/results-repo.test.ts` passed 12/12.\n- `bun run build` passed after `bun install` in the fresh worktree, with only the existing Dashboard large-chunk warning.\n- `bun test apps/cli/test/commands/results/serve.test.ts` passed 83/83.\n- `bunx biome check packages/core/src/evaluation/results-repo.ts packages/core/test/evaluation/results-repo.test.ts` passed.\n- `bun run test` passed across core/eval/phoenix/cli/dashboard packages.\n","created_at":"2026-06-09T06:18:30Z"},{"id":334,"issue_id":"av-xqm","author":"OliveHeron","text":"PR opened for av-xqm: https://github.com/EntityProcess/agentv/pull/1338\n\nThe PR keeps the fix at the core status boundary and does not add Dashboard-specific state. Remaining remote-sync follow-ups stay separate: av-fis.1 for stuck Dashboard Syncing UI, av-fis.2 for CLI contract follow-through, and av-fis.3 for interrupted retry/cleanup coverage.\n","created_at":"2026-06-09T06:22:24Z"}]} {"id":"av-z74","title":"cleanup: simplify result repo configuration","description":"Problem:\n`results_by_project` is an antipattern. It stores per-project result repo bindings in global AGENTV_HOME config while also coexisting with project-local `.agentv/config.yaml` `results`, creating ambiguous precedence: global config is supposed to be machine-global, but the field is project-specific and competes with source-controlled project config. PR #1297 exposed the ambiguity by proposing that global `results_by_project.` should override project-local `results` for registered projects. That may fix one Dashboard deployment case but makes the model harder to reason about.\n\nObservable behavior today:\n- Project-local `.agentv/config.yaml` may contain top-level `results`.\n- `$AGENTV_HOME/config.yaml` may contain top-level `results` and `results_by_project`.\n- `loadConfig()` conditionally attaches global `results_by_project` to project-local config only when project-local `results` is absent.\n- `resolveResultsConfigForProject()` then resolves `results_by_project[project_id] ?? results`.\n- Dashboard docs describe `results_by_project` as the multi-project Dashboard mechanism.\n\nSimpler model to explore:\n- Keep project-local `.agentv/config.yaml` as the source-controlled default for a single project.\n- Move per-registered-project machine-local settings into the project registry entry in `$AGENTV_HOME/projects.yaml`, e.g. each project can carry its own optional `results` binding alongside `id`, `name`, and `path`.\n- Keep `$AGENTV_HOME/config.yaml` top-level `results` only as a true global default/fallback, not as a project map.\n- Remove new precedence rules between `results_by_project` and project-local `results`; resolve from an explicit project entry when the operation is project-scoped.\n\nMigration notes:\n- Support reading existing `$AGENTV_HOME/config.yaml results_by_project` temporarily with a deprecation warning, converting each entry to the corresponding registered project entry when possible.\n- Preserve snake_case on disk and camelCase internally at the boundary.\n- Update Dashboard docs and tests around remote status/sync.\n- Re-evaluate PR #1297 against this redesign rather than merging it as the long-term fix.\n\nReferences:\n- packages/core/src/evaluation/loaders/config-loader.ts\n- packages/core/src/projects.ts\n- apps/cli/src/commands/results/remote.ts\n- apps/web/src/content/docs/docs/tools/dashboard.mdx\n- PR #1297: fix(results): prefer project mappings for registered projects\n","status":"closed","priority":1,"issue_type":"task","created_at":"2026-06-05T22:00:07.545298704Z","created_by":"entity","updated_at":"2026-06-06T02:12:30.898625042Z","closed_at":"2026-06-06T02:12:30.898476846Z","close_reason":"Completed by PR #1299 (fix(config): complete layered home config migration), merged to main at 809750403b521d7d52938d7f237db66f5edc515f. CI green; dogfood smokes recorded in PR body.","source_repo":"agentv","source_repo_path":"/home/entity/projects/EntityProcess/agentv","compaction_level":0,"original_size":0,"labels":["cleanup","config","dashboard","results"]} diff --git a/packages/core/src/evaluation/results-repo.ts b/packages/core/src/evaluation/results-repo.ts index 743c99e0..4826de0c 100644 --- a/packages/core/src/evaluation/results-repo.ts +++ b/packages/core/src/evaluation/results-repo.ts @@ -498,9 +498,34 @@ function withGitInspection( conflicted_paths: inspection.conflictedPaths, git_status: inspection.gitStatus, git_diff_summary: inspection.gitDiffSummary, + last_error: lastErrorForGitInspection(status, inspection), }; } +function lastErrorForGitInspection( + status: ResultsRepoStatus, + inspection: ResultsRepoGitInspection, +): string | undefined { + if (inspection.syncStatus === 'conflicted') { + return 'Results repo has unresolved git conflicts'; + } + + if (inspection.syncStatus === 'diverged') { + return 'Results repo local and remote histories have diverged'; + } + + if (inspection.syncStatus === 'dirty') { + if (status.auto_push === false) { + return 'Results repo has uncommitted changes and auto_push is disabled'; + } + if (!areSafeResultsRepoPaths(inspection.dirtyPaths)) { + return 'Results repo has non-results working tree changes'; + } + } + + return undefined; +} + function withBlockedStatus( status: ResultsRepoStatus, blockReason: string, diff --git a/packages/core/test/evaluation/results-repo.test.ts b/packages/core/test/evaluation/results-repo.test.ts index c1febc38..286feb9e 100644 --- a/packages/core/test/evaluation/results-repo.test.ts +++ b/packages/core/test/evaluation/results-repo.test.ts @@ -518,4 +518,56 @@ describe('results repo write path', () => { expect(status.git_diff_summary).toContain('local-only'); expect(status.git_diff_summary).toContain('benchmark.json'); }, 20000); + + it('supersedes stale sync errors with the current conflicted status', async () => { + const { remoteDir, seedDir } = initializeRemoteRepo(rootDir); + const cloneDir = path.join(rootDir, 'results-clone'); + const config = { ...createResultsConfig(remoteDir, cloneDir), auto_push: false }; + const relativeMetadataPath = path.join( + '.agentv', + 'results', + 'metadata', + 'runs', + 'stale-error', + '2026-05-26T10-00-00-000Z', + 'tags.json', + ); + const writeTags = (repoDir: string, tags: string[]) => { + const tagPath = path.join(repoDir, relativeMetadataPath); + mkdirSync(path.dirname(tagPath), { recursive: true }); + writeFileSync(tagPath, `${JSON.stringify({ tags }, null, 2)}\n`); + }; + + await ensureResultsRepoClone(config); + git('git config user.email "test@example.com"', cloneDir); + git('git config user.name "Test User"', cloneDir); + + writeTags(cloneDir, ['dirty']); + const dirtyStatus = await syncResultsRepoForProject(config); + expect(dirtyStatus.sync_status).toBe('dirty'); + expect(dirtyStatus.block_reason).toContain('auto_push is disabled'); + + git('git reset --hard --quiet', cloneDir); + git('git clean -fd --quiet .agentv', cloneDir); + + writeTags(seedDir, ['base']); + git('git add .agentv && git commit --quiet -m "seed tag metadata"', seedDir); + git('git push --quiet origin main', seedDir); + git('git pull --ff-only --quiet', cloneDir); + + writeTags(cloneDir, ['local']); + git('git add .agentv && git commit --quiet -m "local tag metadata"', cloneDir); + writeTags(seedDir, ['remote']); + git('git add .agentv && git commit --quiet -m "remote tag metadata"', seedDir); + git('git push --quiet origin main', seedDir); + git('git fetch --quiet origin --prune', cloneDir); + git('git merge origin/main || true', cloneDir); + + const status = await getResultsRepoSyncStatus(config); + + expect(status.sync_status).toBe('conflicted'); + expect(status.last_error).toBe('Results repo has unresolved git conflicts'); + expect(status.last_error).not.toContain('auto_push is disabled'); + expect(status.conflicted_paths).toEqual([relativeMetadataPath]); + }, 20000); });