|
83 | 83 | {"actor":"sjarmak","comment":null,"created_at":"2026-03-11T01:49:07Z","event_type":"closed","id":83,"issue_id":"CodeScaleBench-6or","new_value":"Added DUAL-SCORE ANALYSIS and DUAL-SCORE BY SUITE sections to extract_v2_report_data.py output. Shows direct vs artifact means, gap, and Pearson correlation. breakdown_by() now includes per-dimension stats (bl_mean_direct, mcp_mean_direct, delta_direct, etc.) when data available.","old_value":""} |
84 | 84 | {"actor":"sjarmak","comment":null,"created_at":"2026-03-11T01:49:07Z","event_type":"closed","id":84,"issue_id":"CodeScaleBench-zrs","new_value":"Epic complete. 275 tasks in benchmarks/csb/ across 9 merged suites, all with dual-score verifiers. Agent instructions updated to always produce both direct edits and answer.json. Extraction and reporting pipelines extended for dual scores.","old_value":""} |
85 | 85 | {"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:15:55Z","event_type":"created","id":85,"issue_id":"CodeScaleBench-wn8","new_value":"","old_value":""} |
| 86 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:15:56Z","event_type":"created","id":86,"issue_id":"CodeScaleBench-fv1","new_value":"","old_value":""} |
| 87 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:15:58Z","event_type":"created","id":87,"issue_id":"CodeScaleBench-y0r","new_value":"","old_value":""} |
| 88 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:15:59Z","event_type":"created","id":88,"issue_id":"CodeScaleBench-eee","new_value":"","old_value":""} |
| 89 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:16:06Z","event_type":"created","id":89,"issue_id":"CodeScaleBench-41a","new_value":"","old_value":""} |
| 90 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:16:11Z","event_type":"created","id":90,"issue_id":"CodeScaleBench-b61","new_value":"","old_value":""} |
| 91 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:16:16Z","event_type":"created","id":91,"issue_id":"CodeScaleBench-0l6","new_value":"","old_value":""} |
| 92 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:24:00Z","event_type":"closed","id":92,"issue_id":"CodeScaleBench-y0r","new_value":"Fixed in 4b457d601: 16 files updated (/app/solution.json → /workspace/answer.json)","old_value":""} |
| 93 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:24:00Z","event_type":"closed","id":93,"issue_id":"CodeScaleBench-fv1","new_value":"Fixed in 4b457d601: 3 mkdir + 371 chown additions","old_value":""} |
| 94 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:24:00Z","event_type":"closed","id":94,"issue_id":"CodeScaleBench-eee","new_value":"Fixed in 49dafd161 (132 files) — remaining 274 already had chmod or don't use USER claude","old_value":""} |
| 95 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:24:00Z","event_type":"closed","id":95,"issue_id":"CodeScaleBench-41a","new_value":"Fixed in 4b457d601: 4 sg_only files + 4 linux kernel files from prior commit","old_value":""} |
| 96 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:24:00Z","event_type":"closed","id":96,"issue_id":"CodeScaleBench-b61","new_value":"Fixed in 4b457d601: timeout 600 added to k8s and servo test.sh","old_value":""} |
| 97 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:24:00Z","event_type":"closed","id":97,"issue_id":"CodeScaleBench-0l6","new_value":"Fixed in 4b457d601: 4 task.toml updated to memory_mb=8192","old_value":""} |
| 98 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:29:01Z","event_type":"status_changed","id":98,"issue_id":"CodeScaleBench-wn8","new_value":"{\"notes\":\"Script ready at scripts/migrate_sweap_to_ghcr.py. Needs manual GHCR auth: (1) gh auth refresh -h github.com -s write:packages, (2) gh auth token | docker login ghcr.io -u sjarmak --password-stdin, (3) python3 scripts/migrate_sweap_to_ghcr.py --push --update. 11 images, 33 Dockerfiles.\",\"status\":\"in_progress\"}","old_value":"{\"id\":\"CodeScaleBench-wn8\",\"title\":\"Migrate 24 SWEAP tasks from jefzda/ Docker Hub to ghcr.io/sg-evals/\",\"description\":\"96 Dockerfiles (24 tasks x 4 variants) reference jefzda/sweap-images on personal Docker Hub. These fail in cloud environments without Docker Hub credentials. Migrate all to ghcr.io/sg-evals/sweap-images. Affected suites: csb_sdlc_debug (ansible, qutebrowser, teleport, vuls, flipt), csb_sdlc_fix (ansible, nodebb, element-web).\",\"status\":\"open\",\"priority\":1,\"issue_type\":\"bug\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-11T23:15:55Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-11T23:15:55Z\"}"} |
| 99 | +{"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:56:33Z","event_type":"closed","id":99,"issue_id":"CodeScaleBench-wn8","new_value":"11 images pushed to ghcr.io/sg-evals/sweap-images, 33 Dockerfiles updated. Commit b391b3a8d.","old_value":""} |
0 commit comments