bd: backup 2026-03-09 16:28

sjarmak · sjarmak · commit 5bbe6752e0f8 · 2026-03-09T16:28:57.000Z
diff --git a/.beads/backup/backup_state.json b/.beads/backup/backup_state.json
@@ -1,10 +1,10 @@
 {
-  "last_dolt_commit": "j4nqo4c9sm7s238f7rmg2e7jf0idnt9r",
+  "last_dolt_commit": "9lp64kdd2mq6lb7fcsfr6r3dv7r39p19",
   "last_event_id": 0,
-  "timestamp": "2026-03-09T16:11:44.531183368Z",
+  "timestamp": "2026-03-09T16:28:56.687035219Z",
   "counts": {
     "issues": 15,
-    "events": 32,
+    "events": 36,
     "comments": 0,
     "dependencies": 10,
     "labels": 0,
diff --git a/.beads/backup/events.jsonl b/.beads/backup/events.jsonl
@@ -30,3 +30,7 @@
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:05:19Z","event_type":"created","id":30,"issue_id":"CodeScaleBench-25b.4","new_value":"","old_value":""}
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:05:19Z","event_type":"created","id":31,"issue_id":"CodeScaleBench-25b.5","new_value":"","old_value":""}
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:11:44Z","event_type":"claimed","id":32,"issue_id":"CodeScaleBench-25b.1","new_value":"{\"assignee\":\"sjarmak\",\"status\":\"in_progress\"}","old_value":"{\"id\":\"CodeScaleBench-25b.1\",\"title\":\"Audit canonical task evaluator families and output contracts\",\"description\":\"Goal\\nProduce a canonical audit of the 275 selected tasks so follow-on work is driven by facts instead of assumptions.\\n\\nScope\\n- Classify each task by verifier family (oracle-checks, checklist, repo-state heuristic, test-ratio, F1-hybrid, etc.).\\n- Record expected agent output contract (solution.json, answer.json, review.json bridge, patched repo, report markdown, etc.).\\n- Record whether Dockerfile.artifact_only and Dockerfile.artifact_baseline exist.\\n- Record whether the verifier already emits validation_result-style structured output.\\n\\nWhy\\nCurrent support is uneven across SDLC suites, and we need a canonical source of truth before standardizing contracts or closing gaps.\",\"acceptance_criteria\":\"1. All 275 canonical tasks are classified by evaluator family, expected output artifact(s), and current artifact-mode support. 2. The audit identifies exact tasks missing artifact_only or answer.json bridge support. 3. Results are stored in repo-visible metadata or a generated audit artifact that can drive follow-on work.\",\"status\":\"open\",\"priority\":1,\"issue_type\":\"task\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-09T16:05:19Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-09T16:05:19Z\"}"}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:23:04Z","event_type":"updated","id":33,"issue_id":"CodeScaleBench-25b.1","new_value":"{\"notes\":\"-\"}","old_value":"{\"id\":\"CodeScaleBench-25b.1\",\"title\":\"Audit canonical task evaluator families and output contracts\",\"description\":\"Goal\\nProduce a canonical audit of the 275 selected tasks so follow-on work is driven by facts instead of assumptions.\\n\\nScope\\n- Classify each task by verifier family (oracle-checks, checklist, repo-state heuristic, test-ratio, F1-hybrid, etc.).\\n- Record expected agent output contract (solution.json, answer.json, review.json bridge, patched repo, report markdown, etc.).\\n- Record whether Dockerfile.artifact_only and Dockerfile.artifact_baseline exist.\\n- Record whether the verifier already emits validation_result-style structured output.\\n\\nWhy\\nCurrent support is uneven across SDLC suites, and we need a canonical source of truth before standardizing contracts or closing gaps.\",\"acceptance_criteria\":\"1. All 275 canonical tasks are classified by evaluator family, expected output artifact(s), and current artifact-mode support. 2. The audit identifies exact tasks missing artifact_only or answer.json bridge support. 3. Results are stored in repo-visible metadata or a generated audit artifact that can drive follow-on work.\",\"status\":\"in_progress\",\"priority\":1,\"issue_type\":\"task\",\"assignee\":\"sjarmak\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-09T16:05:19Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-09T16:11:44Z\"}"}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:23:21Z","event_type":"updated","id":34,"issue_id":"CodeScaleBench-25b.1","new_value":"{\"notes\":\"Canonical evaluation audit landed in configs/canonical_evaluation_audit.json, generated by scripts/audit_canonical_evaluation_contract.py.\\n\\nHeadline counts from the checked-in audit:\\n- 275 tasks audited (264 active, 11 excluded)\\n- answer_json modes: 136 native, 89 bridge, 50 none\\n- artifact primary outputs among artifact-capable tasks: 217 /workspace/answer.json, 8 /app/solution.json, 7 repo-state only, 1 unspecified\\n- 42 tasks missing Dockerfile.artifact_only\\n- 16 artifact-capable tasks do not actually use answer.json\\n- 217 tasks do not yet emit validation_result-style structured verifier output\\n\\nThis confirms the canonical benchmark is hybrid at the benchmark level, but not uniformly dual-mode or uniformly structured-output across all 275 tasks.\"}","old_value":"{\"id\":\"CodeScaleBench-25b.1\",\"title\":\"Audit canonical task evaluator families and output contracts\",\"description\":\"Goal\\nProduce a canonical audit of the 275 selected tasks so follow-on work is driven by facts instead of assumptions.\\n\\nScope\\n- Classify each task by verifier family (oracle-checks, checklist, repo-state heuristic, test-ratio, F1-hybrid, etc.).\\n- Record expected agent output contract (solution.json, answer.json, review.json bridge, patched repo, report markdown, etc.).\\n- Record whether Dockerfile.artifact_only and Dockerfile.artifact_baseline exist.\\n- Record whether the verifier already emits validation_result-style structured output.\\n\\nWhy\\nCurrent support is uneven across SDLC suites, and we need a canonical source of truth before standardizing contracts or closing gaps.\",\"acceptance_criteria\":\"1. All 275 canonical tasks are classified by evaluator family, expected output artifact(s), and current artifact-mode support. 2. The audit identifies exact tasks missing artifact_only or answer.json bridge support. 3. Results are stored in repo-visible metadata or a generated audit artifact that can drive follow-on work.\",\"notes\":\"-\",\"status\":\"in_progress\",\"priority\":1,\"issue_type\":\"task\",\"assignee\":\"sjarmak\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-09T16:05:19Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-09T16:23:05Z\"}"}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:24:17Z","event_type":"closed","id":35,"issue_id":"CodeScaleBench-25b.1","new_value":"Done","old_value":""}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:28:56Z","event_type":"claimed","id":36,"issue_id":"CodeScaleBench-25b.2","new_value":"{\"assignee\":\"sjarmak\",\"status\":\"in_progress\"}","old_value":"{\"id\":\"CodeScaleBench-25b.2\",\"title\":\"Define standard validation_result schema for canonical verifiers\",\"description\":\"Goal\\nDefine the common verifier output contract that all canonical tasks should converge on in addition to reward.txt.\\n\\nScope\\n- Specify required and optional fields.\\n- Distinguish continuous reward from pass/fail semantics.\\n- Cover both deterministic verifiers and answer.json-derived artifact verifiers.\\n- Ensure the schema can represent partial credit, verifier failures, and missing output cleanly.\\n\\nWhy\\nToday reward.txt is universal, but the richer semantics are inconsistent and often lost.\",\"acceptance_criteria\":\"1. A single validation_result schema is defined for canonical tasks. 2. The schema includes scorer_family, reward, pass_threshold, passed, sub_scores, output_contract, and failure/error context. 3. The schema is documented in reference docs and is implementable from shell/Python verifiers without ambiguity.\",\"status\":\"open\",\"priority\":1,\"issue_type\":\"task\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-09T16:05:19Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-09T16:05:19Z\"}"}
diff --git a/.beads/backup/issues.jsonl b/.beads/backup/issues.jsonl