bd: backup 2026-03-09 18:00

sjarmak · sjarmak · commit 8f18d9664af3 · 2026-03-09T18:00:36.000Z
diff --git a/.beads/backup/backup_state.json b/.beads/backup/backup_state.json
@@ -1,10 +1,10 @@
 {
-  "last_dolt_commit": "1mado597vug6o8vk9voqcinlvk6rv1h0",
+  "last_dolt_commit": "jff7k6nv2jf6659908fg7birkmpfrs5o",
   "last_event_id": 0,
-  "timestamp": "2026-03-09T17:27:55.748463217Z",
+  "timestamp": "2026-03-09T18:00:36.185914301Z",
   "counts": {
     "issues": 15,
-    "events": 39,
+    "events": 41,
     "comments": 0,
     "dependencies": 10,
     "labels": 0,
diff --git a/.beads/backup/events.jsonl b/.beads/backup/events.jsonl
@@ -37,3 +37,5 @@
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-09T17:09:30Z","event_type":"claimed","id":37,"issue_id":"CodeScaleBench-25b.3","new_value":"{\"assignee\":\"sjarmak\",\"status\":\"in_progress\"}","old_value":"{\"id\":\"CodeScaleBench-25b.3\",\"title\":\"Close artifact-mode coverage gaps in the 275 canonical tasks\",\"description\":\"Goal\\nBring the canonical set closer to the intended hybrid evaluation model by closing answer.json/artifact support gaps where feasible.\\n\\nKnown baseline\\nA local audit found 42 canonical tasks without Dockerfile.artifact_only support. SDLC suites are the main source of gaps.\\n\\nScope\\n- Add artifact-mode Dockerfiles and verifier bridges where appropriate.\\n- Reuse answer_json_verifier_lib.sh when it fits; avoid bespoke one-offs.\\n- For tasks that should remain deterministic-only, document the reason explicitly.\\n\\nWhy\\nThe canonical benchmark should not imply universal hybrid evaluation if a non-trivial subset cannot actually run that way.\",\"acceptance_criteria\":\"1. Every canonical task either supports artifact_only evaluation or is explicitly marked as a documented exception. 2. Missing artifact-mode tasks are either remediated or tracked individually with rationale. 3. Representative smoke coverage exists for each family touched by the remediation work.\",\"status\":\"open\",\"priority\":1,\"issue_type\":\"task\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-09T16:05:19Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-09T16:05:19Z\"}"}
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-09T17:10:17Z","event_type":"closed","id":38,"issue_id":"CodeScaleBench-25b.2","new_value":"Pushed a227d2eb0 on main with the canonical validation_result contract docs, audit, and maintained verifier script updates.","old_value":""}
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-09T17:27:55Z","event_type":"closed","id":39,"issue_id":"CodeScaleBench-25b.3","new_value":"Completed artifact-mode coverage migration in d055a6b6a (Dockerfile.artifact_only coverage closed, answer_json verifier bridges added, no_artifact smoke checks passing).","old_value":""}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T17:34:50Z","event_type":"claimed","id":40,"issue_id":"CodeScaleBench-25b.4","new_value":"{\"assignee\":\"sjarmak\",\"status\":\"in_progress\"}","old_value":"{\"id\":\"CodeScaleBench-25b.4\",\"title\":\"Update reporting to separate reward, pass status, and scorer family\",\"description\":\"Goal\\nMake downstream analysis reflect the real semantics of the verifier outputs instead of collapsing everything into one comparable-looking scalar.\\n\\nScope\\n- Thread validation_result metadata into report generation.\\n- Expose pass_threshold and passed alongside reward.\\n- Label evaluator families in summaries and comparisons.\\n- Add caveats or partitioned views where scorer families are not directly comparable.\\n\\nWhy\\nA 0.6 from oracle F1 is not the same construct as a 0.6 from a checklist or repo-grep verifier.\",\"acceptance_criteria\":\"1. Reports surface scorer family and output contract for canonical tasks. 2. Continuous reward and solved/pass status are reported separately. 3. Aggregate reporting avoids direct cross-family comparisons unless calibrated or clearly caveated.\",\"status\":\"open\",\"priority\":2,\"issue_type\":\"task\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-09T16:05:19Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-09T16:05:19Z\"}"}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T18:00:36Z","event_type":"closed","id":41,"issue_id":"CodeScaleBench-25b.4","new_value":"Completed reporting update in 53b2b8694: validation_result metadata now threads into task metrics and reporting, pass status is separated from reward, scorer family/output contract are surfaced, and mixed-family aggregate views are caveated.","old_value":""}
diff --git a/.beads/backup/issues.jsonl b/.beads/backup/issues.jsonl