|
30 | 30 | {"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:05:19Z","event_type":"created","id":30,"issue_id":"CodeScaleBench-25b.4","new_value":"","old_value":""} |
31 | 31 | {"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:05:19Z","event_type":"created","id":31,"issue_id":"CodeScaleBench-25b.5","new_value":"","old_value":""} |
32 | 32 | {"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:11:44Z","event_type":"claimed","id":32,"issue_id":"CodeScaleBench-25b.1","new_value":"{\"assignee\":\"sjarmak\",\"status\":\"in_progress\"}","old_value":"{\"id\":\"CodeScaleBench-25b.1\",\"title\":\"Audit canonical task evaluator families and output contracts\",\"description\":\"Goal\\nProduce a canonical audit of the 275 selected tasks so follow-on work is driven by facts instead of assumptions.\\n\\nScope\\n- Classify each task by verifier family (oracle-checks, checklist, repo-state heuristic, test-ratio, F1-hybrid, etc.).\\n- Record expected agent output contract (solution.json, answer.json, review.json bridge, patched repo, report markdown, etc.).\\n- Record whether Dockerfile.artifact_only and Dockerfile.artifact_baseline exist.\\n- Record whether the verifier already emits validation_result-style structured output.\\n\\nWhy\\nCurrent support is uneven across SDLC suites, and we need a canonical source of truth before standardizing contracts or closing gaps.\",\"acceptance_criteria\":\"1. All 275 canonical tasks are classified by evaluator family, expected output artifact(s), and current artifact-mode support. 2. The audit identifies exact tasks missing artifact_only or answer.json bridge support. 3. Results are stored in repo-visible metadata or a generated audit artifact that can drive follow-on work.\",\"status\":\"open\",\"priority\":1,\"issue_type\":\"task\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-09T16:05:19Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-09T16:05:19Z\"}"} |
| 33 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:23:04Z","event_type":"updated","id":33,"issue_id":"CodeScaleBench-25b.1","new_value":"{\"notes\":\"-\"}","old_value":"{\"id\":\"CodeScaleBench-25b.1\",\"title\":\"Audit canonical task evaluator families and output contracts\",\"description\":\"Goal\\nProduce a canonical audit of the 275 selected tasks so follow-on work is driven by facts instead of assumptions.\\n\\nScope\\n- Classify each task by verifier family (oracle-checks, checklist, repo-state heuristic, test-ratio, F1-hybrid, etc.).\\n- Record expected agent output contract (solution.json, answer.json, review.json bridge, patched repo, report markdown, etc.).\\n- Record whether Dockerfile.artifact_only and Dockerfile.artifact_baseline exist.\\n- Record whether the verifier already emits validation_result-style structured output.\\n\\nWhy\\nCurrent support is uneven across SDLC suites, and we need a canonical source of truth before standardizing contracts or closing gaps.\",\"acceptance_criteria\":\"1. All 275 canonical tasks are classified by evaluator family, expected output artifact(s), and current artifact-mode support. 2. The audit identifies exact tasks missing artifact_only or answer.json bridge support. 3. Results are stored in repo-visible metadata or a generated audit artifact that can drive follow-on work.\",\"status\":\"in_progress\",\"priority\":1,\"issue_type\":\"task\",\"assignee\":\"sjarmak\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-09T16:05:19Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-09T16:11:44Z\"}"} |
| 34 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:23:21Z","event_type":"updated","id":34,"issue_id":"CodeScaleBench-25b.1","new_value":"{\"notes\":\"Canonical evaluation audit landed in configs/canonical_evaluation_audit.json, generated by scripts/audit_canonical_evaluation_contract.py.\\n\\nHeadline counts from the checked-in audit:\\n- 275 tasks audited (264 active, 11 excluded)\\n- answer_json modes: 136 native, 89 bridge, 50 none\\n- artifact primary outputs among artifact-capable tasks: 217 /workspace/answer.json, 8 /app/solution.json, 7 repo-state only, 1 unspecified\\n- 42 tasks missing Dockerfile.artifact_only\\n- 16 artifact-capable tasks do not actually use answer.json\\n- 217 tasks do not yet emit validation_result-style structured verifier output\\n\\nThis confirms the canonical benchmark is hybrid at the benchmark level, but not uniformly dual-mode or uniformly structured-output across all 275 tasks.\"}","old_value":"{\"id\":\"CodeScaleBench-25b.1\",\"title\":\"Audit canonical task evaluator families and output contracts\",\"description\":\"Goal\\nProduce a canonical audit of the 275 selected tasks so follow-on work is driven by facts instead of assumptions.\\n\\nScope\\n- Classify each task by verifier family (oracle-checks, checklist, repo-state heuristic, test-ratio, F1-hybrid, etc.).\\n- Record expected agent output contract (solution.json, answer.json, review.json bridge, patched repo, report markdown, etc.).\\n- Record whether Dockerfile.artifact_only and Dockerfile.artifact_baseline exist.\\n- Record whether the verifier already emits validation_result-style structured output.\\n\\nWhy\\nCurrent support is uneven across SDLC suites, and we need a canonical source of truth before standardizing contracts or closing gaps.\",\"acceptance_criteria\":\"1. All 275 canonical tasks are classified by evaluator family, expected output artifact(s), and current artifact-mode support. 2. The audit identifies exact tasks missing artifact_only or answer.json bridge support. 3. Results are stored in repo-visible metadata or a generated audit artifact that can drive follow-on work.\",\"notes\":\"-\",\"status\":\"in_progress\",\"priority\":1,\"issue_type\":\"task\",\"assignee\":\"sjarmak\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-09T16:05:19Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-09T16:23:05Z\"}"} |
| 35 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:24:17Z","event_type":"closed","id":35,"issue_id":"CodeScaleBench-25b.1","new_value":"Done","old_value":""} |
| 36 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T16:28:56Z","event_type":"claimed","id":36,"issue_id":"CodeScaleBench-25b.2","new_value":"{\"assignee\":\"sjarmak\",\"status\":\"in_progress\"}","old_value":"{\"id\":\"CodeScaleBench-25b.2\",\"title\":\"Define standard validation_result schema for canonical verifiers\",\"description\":\"Goal\\nDefine the common verifier output contract that all canonical tasks should converge on in addition to reward.txt.\\n\\nScope\\n- Specify required and optional fields.\\n- Distinguish continuous reward from pass/fail semantics.\\n- Cover both deterministic verifiers and answer.json-derived artifact verifiers.\\n- Ensure the schema can represent partial credit, verifier failures, and missing output cleanly.\\n\\nWhy\\nToday reward.txt is universal, but the richer semantics are inconsistent and often lost.\",\"acceptance_criteria\":\"1. A single validation_result schema is defined for canonical tasks. 2. The schema includes scorer_family, reward, pass_threshold, passed, sub_scores, output_contract, and failure/error context. 3. The schema is documented in reference docs and is implementable from shell/Python verifiers without ambiguity.\",\"status\":\"open\",\"priority\":1,\"issue_type\":\"task\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-09T16:05:19Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-09T16:05:19Z\"}"} |
0 commit comments