bd: backup 2026-03-07 23:33

sjarmak · sjarmak · commit 59989a75f019 · 2026-03-07T23:33:04.000Z
diff --git a/.beads/backup/backup_state.json b/.beads/backup/backup_state.json
@@ -1,10 +1,10 @@
 {
-  "last_dolt_commit": "u8iske93hleno1888n8cjtvo91c8u8tg",
+  "last_dolt_commit": "lljnnv6vgdfsda7fvcuqhp8jr31vvqte",
   "last_event_id": 0,
-  "timestamp": "2026-03-07T23:13:50.285366219Z",
+  "timestamp": "2026-03-07T23:33:04.618114095Z",
   "counts": {
     "issues": 7,
-    "events": 14,
+    "events": 18,
     "comments": 0,
     "dependencies": 5,
     "labels": 0,
diff --git a/.beads/backup/events.jsonl b/.beads/backup/events.jsonl
@@ -12,3 +12,7 @@
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-07T23:01:00Z","event_type":"closed","id":12,"issue_id":"CodeScaleBench-aav","new_value":"Defined Org→SDLC mapping for all 11 org suites. Selected 67 promotion candidates: all multi-repo, 84% 2M+ LOC, balanced across 6 target SDLC suites. configs/org_promotion_manifest.json.","old_value":""}
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-07T23:01:01Z","event_type":"closed","id":13,"issue_id":"CodeScaleBench-5p1","new_value":"Defined Org→SDLC mapping for all 11 org suites. Selected 67 promotion candidates: all multi-repo, 84% 2M+ LOC, balanced across 6 target SDLC suites. configs/org_promotion_manifest.json.","old_value":""}
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-07T23:03:36Z","event_type":"closed","id":14,"issue_id":"CodeScaleBench-ggy","new_value":"IR scoring infrastructure already exists: 402/477 tasks have ground truth (171 SDLC + 265 Org), scripts/ir_analysis.py + retrieval_eval_pipeline.py + csb_metrics/ir_metrics.py handle computation. Pipeline currently gets 0 runs due to transcript path scanning issue in ir_analysis.py vs _raw/ directory format — that's a bug fix, not new feature work.","old_value":""}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-07T23:18:12Z","event_type":"status_changed","id":15,"issue_id":"CodeScaleBench-c17","new_value":"{\"status\":\"in_progress\"}","old_value":"{\"id\":\"CodeScaleBench-c17\",\"title\":\"Build deterministic verifiers for promoted Org tasks\",\"description\":\"For each Org task promoted to an SDLC category, create a deterministic verifier (test.sh) following the target suite's verification pattern. Approaches by suite: debug/fix → patch validation or test-pass checks, secure → grep for vulnerability patterns + fix verification, understand/design → F1 scoring against ground truth file/symbol sets, refactor → diff-based structural checks. Reuse existing oracle_checks.py logic where possible, converting soft scoring to deterministic pass/fail thresholds.\",\"status\":\"open\",\"priority\":3,\"issue_type\":\"task\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-07T22:56:46Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-07T22:56:46Z\"}"}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-07T23:24:00Z","event_type":"closed","id":16,"issue_id":"CodeScaleBench-c17","new_value":"Generated deterministic SDLC-quality test.sh + promoted_verifier.py for all 67 promoted Org tasks. Suite-specific composite weights (understand/design/debug/secure/refactor/test). Multiple assertion patterns per verifier. Original test.sh backed up as test.sh.org_backup.","old_value":""}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-07T23:26:58Z","event_type":"status_changed","id":17,"issue_id":"CodeScaleBench-utv","new_value":"{\"status\":\"in_progress\"}","old_value":"{\"id\":\"CodeScaleBench-utv\",\"title\":\"Rebuild unified manifest with power-optimized task-type balance\",\"description\":\"Rebuild the core benchmark manifest as a single unified set (no SDLC vs Org split). Optimize selection for: (1) 80% power for overall retrieval effect, (2) balanced task-type representation (comprehension/implementation/quality), (3) multi-repo coverage in every task type, (4) LOC band diversity with emphasis on large codebases (2M+ LOC). Target ~280-300 tasks based on power analysis. Every task has both deterministic reward and IR retrieval scoring.\",\"status\":\"open\",\"priority\":3,\"issue_type\":\"task\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-07T22:56:46Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-07T22:56:46Z\"}"}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-07T23:33:04Z","event_type":"closed","id":18,"issue_id":"CodeScaleBench-utv","new_value":"Built unified 280-task manifest (schema v2.0). comprehension=100, implementation=90, quality=90. Overall power=84.1% at sigma=0.20. Large codebase 58.6%, multi-repo 31.8%, 20 suites, 11 languages. LOC fallback chain eliminates all unknowns.","old_value":""}
diff --git a/.beads/backup/issues.jsonl b/.beads/backup/issues.jsonl
@@ -1,7 +1,7 @@
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Defined Org→SDLC mapping for all 11 org suites. Selected 67 promotion candidates: all multi-repo, 84% 2M+ LOC, balanced across 6 target SDLC suites. configs/org_promotion_manifest.json.","closed_at":"2026-03-07T23:01:01Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"e57ed0ffb8999cc5708e3fbe9fa45f6a2e6461b45004c38ec33b54abfd14e753","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Analyze current SDLC coverage gaps: multi-repo (only 15/171), large codebases (only 2 tasks in 8M-40M, 0 in \u003e40M), and task-type balance. Select ~60-80 Org tasks for promotion that maximize: (1) multi-repo representation across all SDLC suites, (2) large codebase coverage (prioritize 2M+ LOC), (3) task-type balance across comprehension/implementation/quality. Produce a promotion manifest with target suite, verifier approach, and priority ranking.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-5p1","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Select Org→SDLC promotion candidates optimized for coverage gaps","updated_at":"2026-03-07T23:01:01Z","waiters":"","wisp_type":"","work_type":""}
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Defined Org→SDLC mapping for all 11 org suites. Selected 67 promotion candidates: all multi-repo, 84% 2M+ LOC, balanced across 6 target SDLC suites. configs/org_promotion_manifest.json.","closed_at":"2026-03-07T23:01:01Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"d581391bafd28d416539191f5b91d255b0832d75fccc535e206157b820ddbeec","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Select Org tasks that naturally map to SDLC phases and add deterministic verifiers. Priority: multi-repo tasks from large codebases that fill gaps in SDLC coverage. Natural mappings: incident→debug, security/compliance→secure, migration→refactor, onboarding/domain→understand, crossrepo/crossrepo_tracing→design. For each promoted task, identify the most straightforward deterministic verifier approach matching the target SDLC suite's pattern. Focus on tasks where the oracle_checks.py already does structured validation that can be made deterministic.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-aav","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Map and promote Org tasks to SDLC categories","updated_at":"2026-03-07T23:01:01Z","waiters":"","wisp_type":"","work_type":""}
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Taxonomy defined: comprehension/implementation/quality. Mapped all 20 suites and 477 tasks. Manifest: 40/37/23% split. configs/task_type_taxonomy.json + task_type field on all tasks.","closed_at":"2026-03-07T22:59:37Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"cac4323aa5802e3e8dca37694c0f3c50c9dacf7ab21a04cf5e65a0bd3b7712a2","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Formalize the three task-type buckets that cut across suites: Comprehension (understand, design, document, onboarding, domain), Implementation (feature, fix, refactor, migration), Quality (test, debug, secure, compliance, incident). Add task_type field to selected_benchmark_tasks.json. Map existing SUITE_TO_PROFILE curator profiles to these three buckets. This taxonomy enables power analysis and balanced selection across task types, not just suites.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-abl","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Define task-type taxonomy: comprehension / implementation / quality","updated_at":"2026-03-07T22:59:37Z","waiters":"","wisp_type":"","work_type":""}
-{"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"","closed_at":null,"closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"014d196f64f7b7deb5bfbcfde49ed3ab45243bc4abfb8a71f9421de598b3a71d","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"For each Org task promoted to an SDLC category, create a deterministic verifier (test.sh) following the target suite's verification pattern. Approaches by suite: debug/fix → patch validation or test-pass checks, secure → grep for vulnerability patterns + fix verification, understand/design → F1 scoring against ground truth file/symbol sets, refactor → diff-based structural checks. Reuse existing oracle_checks.py logic where possible, converting soft scoring to deterministic pass/fail thresholds.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-c17","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":3,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"open","target":"","timeout_ns":0,"title":"Build deterministic verifiers for promoted Org tasks","updated_at":"2026-03-07T22:56:46Z","waiters":"","wisp_type":"","work_type":""}
+{"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Generated deterministic SDLC-quality test.sh + promoted_verifier.py for all 67 promoted Org tasks. Suite-specific composite weights (understand/design/debug/secure/refactor/test). Multiple assertion patterns per verifier. Original test.sh backed up as test.sh.org_backup.","closed_at":"2026-03-07T23:24:00Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"014d196f64f7b7deb5bfbcfde49ed3ab45243bc4abfb8a71f9421de598b3a71d","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"For each Org task promoted to an SDLC category, create a deterministic verifier (test.sh) following the target suite's verification pattern. Approaches by suite: debug/fix → patch validation or test-pass checks, secure → grep for vulnerability patterns + fix verification, understand/design → F1 scoring against ground truth file/symbol sets, refactor → diff-based structural checks. Reuse existing oracle_checks.py logic where possible, converting soft scoring to deterministic pass/fail thresholds.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-c17","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":3,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Build deterministic verifiers for promoted Org tasks","updated_at":"2026-03-07T23:24:00Z","waiters":"","wisp_type":"","work_type":""}
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"IR scoring infrastructure already exists: 402/477 tasks have ground truth (171 SDLC + 265 Org), scripts/ir_analysis.py + retrieval_eval_pipeline.py + csb_metrics/ir_metrics.py handle computation. Pipeline currently gets 0 runs due to transcript path scanning issue in ir_analysis.py vs _raw/ directory format — that's a bug fix, not new feature work.","closed_at":"2026-03-07T23:03:36Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"0bb08efd6b2745ce275e5212f3de9244dae8f6f448e07636df30a836c98e0861","created_at":"2026-03-07T22:56:23Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Add answer.json or ground_truth-based IR scoring to all SDLC tasks so they have dual scores: deterministic reward + IR recall/precision. The curator agent already produces ground_truth.json with files/symbols for every task. Add an extraction step that captures the agent's file-level retrieval from workspace activity and scores it against curator ground truth. This makes SDLC tasks directly comparable with Org tasks on retrieval metrics.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-ggy","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Add IR/retrieval scoring layer to SDLC tasks","updated_at":"2026-03-07T23:03:36Z","waiters":"","wisp_type":"","work_type":""}
-{"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"","closed_at":null,"closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"e464c7d5aa11f02b2eac40dc12bfbee707add98b6882dc3f11c7d9410edd7b71","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Rebuild the core benchmark manifest as a single unified set (no SDLC vs Org split). Optimize selection for: (1) 80% power for overall retrieval effect, (2) balanced task-type representation (comprehension/implementation/quality), (3) multi-repo coverage in every task type, (4) LOC band diversity with emphasis on large codebases (2M+ LOC). Target ~280-300 tasks based on power analysis. Every task has both deterministic reward and IR retrieval scoring.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-utv","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":3,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"open","target":"","timeout_ns":0,"title":"Rebuild unified manifest with power-optimized task-type balance","updated_at":"2026-03-07T22:56:46Z","waiters":"","wisp_type":"","work_type":""}
+{"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Built unified 280-task manifest (schema v2.0). comprehension=100, implementation=90, quality=90. Overall power=84.1% at sigma=0.20. Large codebase 58.6%, multi-repo 31.8%, 20 suites, 11 languages. LOC fallback chain eliminates all unknowns.","closed_at":"2026-03-07T23:33:05Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"e464c7d5aa11f02b2eac40dc12bfbee707add98b6882dc3f11c7d9410edd7b71","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Rebuild the core benchmark manifest as a single unified set (no SDLC vs Org split). Optimize selection for: (1) 80% power for overall retrieval effect, (2) balanced task-type representation (comprehension/implementation/quality), (3) multi-repo coverage in every task type, (4) LOC band diversity with emphasis on large codebases (2M+ LOC). Target ~280-300 tasks based on power analysis. Every task has both deterministic reward and IR retrieval scoring.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-utv","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":3,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Rebuild unified manifest with power-optimized task-type balance","updated_at":"2026-03-07T23:33:05Z","waiters":"","wisp_type":"","work_type":""}
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"","closed_at":null,"closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"e3d9bf86e6f520ab604c0c7d317b708e8814f4e5505b5d360caf4591b3428e2d","created_at":"2026-03-07T22:56:15Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Converge the two halves of CodeScaleBench (SDLC with deterministic verifiers + Org with answer.json verifiers) into a single unified benchmark. Three phases: (1) add IR scoring to SDLC tasks via curator ground truth, (2) promote select Org tasks to SDLC categories with deterministic verifiers, (3) rebuild manifest optimized for multi-repo, large codebase, and task-type balance (comprehension/implementation/quality).","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-xjg","is_template":0,"issue_type":"feature","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":1,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"open","target":"","timeout_ns":0,"title":"[Epic] Unify SDLC + Org into single balanced benchmark","updated_at":"2026-03-07T22:56:15Z","waiters":"","wisp_type":"","work_type":""}