bd: backup 2026-03-12 00:32

sjarmak · sjarmak · commit fffd26cbb30c · 2026-03-12T00:32:34.000Z
diff --git a/.beads/backup/backup_state.json b/.beads/backup/backup_state.json
@@ -1,10 +1,10 @@
 {
-  "last_dolt_commit": "0932sp2bg17edjo33tahm70kkn40qbq6",
+  "last_dolt_commit": "likejnd3mvcscvju0me71jgnqsao72q5",
   "last_event_id": 0,
-  "timestamp": "2026-03-11T23:56:33.349403044Z",
+  "timestamp": "2026-03-12T00:32:34.430087241Z",
   "counts": {
-    "issues": 34,
-    "events": 99,
+    "issues": 35,
+    "events": 100,
     "comments": 0,
     "dependencies": 16,
     "labels": 0,
diff --git a/.beads/backup/events.jsonl b/.beads/backup/events.jsonl
@@ -97,3 +97,4 @@
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:24:00Z","event_type":"closed","id":97,"issue_id":"CodeScaleBench-0l6","new_value":"Fixed in 4b457d601: 4 task.toml updated to memory_mb=8192","old_value":""}
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:29:01Z","event_type":"status_changed","id":98,"issue_id":"CodeScaleBench-wn8","new_value":"{\"notes\":\"Script ready at scripts/migrate_sweap_to_ghcr.py. Needs manual GHCR auth: (1) gh auth refresh -h github.com -s write:packages, (2) gh auth token | docker login ghcr.io -u sjarmak --password-stdin, (3) python3 scripts/migrate_sweap_to_ghcr.py --push --update. 11 images, 33 Dockerfiles.\",\"status\":\"in_progress\"}","old_value":"{\"id\":\"CodeScaleBench-wn8\",\"title\":\"Migrate 24 SWEAP tasks from jefzda/ Docker Hub to ghcr.io/sg-evals/\",\"description\":\"96 Dockerfiles (24 tasks x 4 variants) reference jefzda/sweap-images on personal Docker Hub. These fail in cloud environments without Docker Hub credentials. Migrate all to ghcr.io/sg-evals/sweap-images. Affected suites: csb_sdlc_debug (ansible, qutebrowser, teleport, vuls, flipt), csb_sdlc_fix (ansible, nodebb, element-web).\",\"status\":\"open\",\"priority\":1,\"issue_type\":\"bug\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-11T23:15:55Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-11T23:15:55Z\"}"}
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-11T23:56:33Z","event_type":"closed","id":99,"issue_id":"CodeScaleBench-wn8","new_value":"11 images pushed to ghcr.io/sg-evals/sweap-images, 33 Dockerfiles updated. Commit b391b3a8d.","old_value":""}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-12T00:32:34Z","event_type":"created","id":100,"issue_id":"CodeScaleBench-82e","new_value":"","old_value":""}
diff --git a/.beads/backup/issues.jsonl b/.beads/backup/issues.jsonl
@@ -13,6 +13,7 @@
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Defined Org→SDLC mapping for all 11 org suites. Selected 67 promotion candidates: all multi-repo, 84% 2M+ LOC, balanced across 6 target SDLC suites. configs/org_promotion_manifest.json.","closed_at":"2026-03-07T23:01:01Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"e57ed0ffb8999cc5708e3fbe9fa45f6a2e6461b45004c38ec33b54abfd14e753","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Analyze current SDLC coverage gaps: multi-repo (only 15/171), large codebases (only 2 tasks in 8M-40M, 0 in \u003e40M), and task-type balance. Select ~60-80 Org tasks for promotion that maximize: (1) multi-repo representation across all SDLC suites, (2) large codebase coverage (prioritize 2M+ LOC), (3) task-type balance across comprehension/implementation/quality. Produce a promotion manifest with target suite, verifier approach, and priority ranking.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-5p1","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Select Org→SDLC promotion candidates optimized for coverage gaps","updated_at":"2026-03-07T23:01:01Z","waiters":"","wisp_type":"","work_type":""}
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"All 131 SDLC tasks in benchmarks/csb/ integrated via integrate_dual_score.py. 123 already had answer_json_verifier_lib, 8 skip tasks got it added. All 131 have dual_score_lib.sh sourced at end of test.sh.","closed_at":"2026-03-11T01:46:25Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"85d5cbd5654d214ca772ce8378eae07d4cb7d04b056146bb8e4ca9fd4d345f5e","created_at":"2026-03-11T01:18:17Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Update all 131 SDLC test.sh files to source dual_score_lib.sh. The 123 tasks that already have answer_json_verifier_lib.sh need the lib swapped for the new dual-score version. The 8 skip tasks (4 design, 1 feature bustub, 3 understand) need answer.json support added with appropriate format bridging. Each task produces both reward_direct.txt and reward_artifact.txt.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-6cv","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Integrate dual-score into all 131 SDLC task verifiers","updated_at":"2026-03-11T01:46:25Z","waiters":"","wisp_type":"","work_type":""}
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Added DUAL-SCORE ANALYSIS and DUAL-SCORE BY SUITE sections to extract_v2_report_data.py output. Shows direct vs artifact means, gap, and Pearson correlation. breakdown_by() now includes per-dimension stats (bl_mean_direct, mcp_mean_direct, delta_direct, etc.) when data available.","closed_at":"2026-03-11T01:49:08Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"02df8de04d043cbcd8bd36d38f841484dfd51ebb151b0ff5b32c11cbf9ba40ba","created_at":"2026-03-11T01:18:27Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Extend reporting to show both score dimensions: (1) compute_paired_stats produces bl_reward_direct, mcp_reward_direct, delta_direct (and same for artifact); (2) breakdown_by generates per-language, per-difficulty, per-suite stats for each dimension; (3) Add correlation analysis between direct and artifact scores (do agents that edit well also describe well?). Output unified report with both dimensions.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-6or","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":3,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Dual-score reporting: paired stats and breakdowns for both dimensions","updated_at":"2026-03-11T01:49:08Z","waiters":"","wisp_type":"","work_type":""}
+{"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"","closed_at":null,"closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"cbadb6bc4cd5a2b13149d1d486b66374a68bab6b919506aa1f7b732d83a684cd","created_at":"2026-03-12T00:32:34Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Both baseline runs failed in openhands_sonnet46_20260311_174751. kafka: Harbor timestamp collision (FileExistsError). typescript: DaytonaError sandbox already exists. MCP results exist and are valid (kafka=0.89). Need baseline rerun via openhands_2config.sh --baseline-only.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-82e","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"open","target":"","timeout_ns":0,"title":"Rerun 2 missing OH baselines: kafka-contributor-workflow-001, typescript-type-narrowing-secure-001","updated_at":"2026-03-12T00:32:34Z","waiters":"","wisp_type":"","work_type":""}
 {"acceptance_criteria":"All remaining tasks in configs/canonical_evaluation_audit.json migrate from structured_output_mode=none to validation_result; representative contract-only preflight checks pass for each remaining family; python3 scripts/repo_health.py passes; changes are committed and pushed on main.","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"All 264 active tasks now emit validation_result.json (v1alpha1). 50 tasks migrated across 6 families: ir_checklist(17), checklist(16), f1_hybrid(7), continuous(5), test_ratio(3), f1(2). Commit be8bff87f.","closed_at":"2026-03-09T20:31:38Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"7ff98ccdedca19c23d711519126d71f933e5a549f92587aecf5f8964f143e22a","created_at":"2026-03-09T20:19:06Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Finish the canonical verifier-contract migration for the remaining 61 tasks that still emit only reward.txt.\n\nCurrent remaining families from configs/canonical_evaluation_audit.json:\n- ir_checklist: 17\n- checklist: 16\n- find_and_prove: 8\n- f1_hybrid: 7\n- test_ratio: 6\n- continuous: 5\n- f1: 2\n\nPlanned approach:\n- Migrate family-by-family using the same validation_result.v1alpha1 contract now used by oracle_checks and repo_state_heuristic.\n- Preserve reward.txt compatibility while adding /logs/verifier/validation_result.json.\n- Keep reward separate from pass semantics and emit invalid_output / verifier_error when the task contract requires it.\n- Regenerate configs/canonical_evaluation_audit.json after each landed batch.\n- Run representative python3 scripts/validate_tasks_preflight.py --task ... --contract-only --format json checks per family and python3 scripts/repo_health.py before commit/push.\n\nSuggested execution order:\n1. ir_checklist\n2. checklist\n3. find_and_prove\n4. f1_hybrid\n5. test_ratio\n6. continuous\n7. f1\n","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-aa9","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":1,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Complete canonical validation_result migration for remaining verifier families","updated_at":"2026-03-09T20:31:38Z","waiters":"","wisp_type":"","work_type":""}
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Defined Org→SDLC mapping for all 11 org suites. Selected 67 promotion candidates: all multi-repo, 84% 2M+ LOC, balanced across 6 target SDLC suites. configs/org_promotion_manifest.json.","closed_at":"2026-03-07T23:01:01Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"d581391bafd28d416539191f5b91d255b0832d75fccc535e206157b820ddbeec","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Select Org tasks that naturally map to SDLC phases and add deterministic verifiers. Priority: multi-repo tasks from large codebases that fill gaps in SDLC coverage. Natural mappings: incident→debug, security/compliance→secure, migration→refactor, onboarding/domain→understand, crossrepo/crossrepo_tracing→design. For each promoted task, identify the most straightforward deterministic verifier approach matching the target SDLC suite's pattern. Focus on tasks where the oracle_checks.py already does structured validation that can be made deterministic.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-aav","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Map and promote Org tasks to SDLC categories","updated_at":"2026-03-07T23:01:01Z","waiters":"","wisp_type":"","work_type":""}
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Taxonomy defined: comprehension/implementation/quality. Mapped all 20 suites and 477 tasks. Manifest: 40/37/23% split. configs/task_type_taxonomy.json + task_type field on all tasks.","closed_at":"2026-03-07T22:59:37Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"cac4323aa5802e3e8dca37694c0f3c50c9dacf7ab21a04cf5e65a0bd3b7712a2","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Formalize the three task-type buckets that cut across suites: Comprehension (understand, design, document, onboarding, domain), Implementation (feature, fix, refactor, migration), Quality (test, debug, secure, compliance, incident). Add task_type field to selected_benchmark_tasks.json. Map existing SUITE_TO_PROFILE curator profiles to these three buckets. This taxonomy enables power analysis and balanced selection across task types, not just suites.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-abl","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Define task-type taxonomy: comprehension / implementation / quality","updated_at":"2026-03-07T22:59:37Z","waiters":"","wisp_type":"","work_type":""}