From 11210d7dbca1b2cf8cb5cbea2358df6b3e3306bb Mon Sep 17 00:00:00 2001 From: madgegja Date: Sun, 7 Jun 2026 04:01:12 +0200 Subject: [PATCH] feat: require global review debug gate --- README.md | 9 +-- packages/web/content/docs/skills.md | 6 +- packages/web/content/docs/start-work.md | 3 +- packages/web/lib/commands.ts | 5 +- packages/web/lib/docs-content.generated.ts | 4 +- packages/web/lib/site-config.ts | 2 +- .../rules/bundled-rules/hephaestus.md | 22 ++++++++ .../start-work-continuation/directive.md | 12 +++- .../test/codex-hook.test.ts | 26 +++++++++ plugins/omo/scripts/sync-skills.mjs | 55 ++++++++++++++++++- plugins/omo/skills/review-work/SKILL.md | 13 +++++ plugins/omo/skills/start-work/SKILL.md | 13 ++++- plugins/omo/test/start-work-skill.test.mjs | 34 +++++++++++- plugins/omo/test/sync-skills.test.mjs | 41 +++++++++++++- .../readme-feature-workflows-content.test.mjs | 2 + test/web-workflow-copy-content.test.mjs | 32 +++++++++++ 16 files changed, 258 insertions(+), 21 deletions(-) create mode 100644 test/web-workflow-copy-content.test.mjs diff --git a/README.md b/README.md index d4dd215..7eec486 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ LazyCodex installs these as OmO commands for Codex. Invoke them with the | --- | --- | --- | | `$ulw-loop` | `$ulw-loop "task" [--completion-promise=TEXT] [--strategy=reset\|continue]` | Self-referential loop that runs until Oracle-verified completion. Caps at 500 iterations in ultrawork mode, 100 in normal mode. | | `$ulw-plan` | `$ulw-plan "what to build"` | Prometheus strategic planner. Writes a plan to `plans/.md`. Never writes product code. | -| `$start-work` | `$start-work [plan-name] [--worktree ]` | Executes a plan until every checkbox is done. Prints **ORCHESTRATION COMPLETE**. | +| `$start-work` | `$start-work [plan-name] [--worktree ]` | Executes a plan until every checkbox is done, then requires the global post-implementation review and debugging gate before **ORCHESTRATION COMPLETE**. | Full documentation lives at [lazycodex.ai/docs](https://lazycodex.ai/docs). @@ -86,7 +86,8 @@ Use `$ulw-plan` when the work needs decisions before implementation. It writes a plan to `plans/.md` and does not touch product code. Use `$start-work` when a plan is ready. It executes the checklist with durable -Boulder progress and stops only when the plan is complete. +Boulder progress and stops only when the plan is complete and the global +post-implementation review plus debugging gate has passed. Use `$ulw-loop` when the task should keep moving until the result is verified by evidence instead of a hopeful status update. @@ -100,9 +101,9 @@ actual work: | --- | --- | | `/init-deep` | Hierarchical project memory through `AGENTS.md` | | `$ulw-plan` | Decision-complete planning before code changes | -| `$start-work` | Durable plan execution with Boulder progress | +| `$start-work` | Durable plan execution with Boulder progress, post-implementation review, and debugging gate | | `$ulw-loop` | Verified completion for open-ended tasks | -| `review-work` | Multi-angle post-implementation review | +| `review-work` | Multi-angle post-implementation review that blocks completion when any lane fails or is inconclusive | | `remove-ai-slops` | Behavior-preserving cleanup of AI-looking code | | `frontend-ui-ux` | Polished UI surfaces | | `programming` | Strict TypeScript, Rust, Python, or Go discipline | diff --git a/packages/web/content/docs/skills.md b/packages/web/content/docs/skills.md index 297e354..c40f546 100644 --- a/packages/web/content/docs/skills.md +++ b/packages/web/content/docs/skills.md @@ -4,7 +4,7 @@ LazyCodex is most useful as a harness for complex codebases: project memory, pla Start with `/init-deep` when the repository is too large or too old to explain from memory. It generates hierarchical `AGENTS.md` context so agents can find the right files before they change code. -Use `$ulw-plan` when the work needs decisions before implementation, `$start-work` when a plan should be executed, and `$ulw-loop` when you want the agent to keep going until the result is verified. +Use `$ulw-plan` when the work needs decisions before implementation, `$start-work` when a plan should be executed through a final review/debugging gate, and `$ulw-loop` when you want the agent to keep going until the result is verified. ### Feature coverage @@ -12,13 +12,13 @@ The three command pillars stay simple: - `$ulw-loop` keeps moving until verified completion - `$ulw-plan` turns fuzzy work into a decision-complete plan -- `$start-work` executes a plan with durable Boulder progress +- `$start-work` executes a plan with durable Boulder progress, post-implementation review, and a debugging gate Skills add specialist judgment around those pillars: | Skill | Use it for | | --- | --- | -| `review-work` | Multi-angle post-implementation review | +| `review-work` | Multi-angle post-implementation review that blocks completion when any lane fails or is inconclusive | | `remove-ai-slops` | Behavior-preserving cleanup of AI-looking code | | `frontend-ui-ux` | Designed UI work instead of generic layout filling | | `programming` | Strict TypeScript, Rust, Python, or Go discipline | diff --git a/packages/web/content/docs/start-work.md b/packages/web/content/docs/start-work.md index f9979e7..0fd3598 100644 --- a/packages/web/content/docs/start-work.md +++ b/packages/web/content/docs/start-work.md @@ -6,6 +6,7 @@ - A Stop-hook re-injects the next turn until the plan is complete - Independent sub-tasks fan out to parallel subagents - Strict TDD plus five evidence gates: plan reread, automated verification, manual-QA, adversarial QA, cleanup +- A final Global Review and Debugging Gate runs `review-work`, records a debugging audit, and blocks completion or PR handoff on failed or inconclusive lanes - Progress is recorded to a ledger ### Syntax @@ -16,4 +17,4 @@ $start-work [plan-name] [--worktree ] ### Done -It prints an `ORCHESTRATION COMPLETE` block when every checkbox is checked. +It prints an `ORCHESTRATION COMPLETE` block only when every checkbox is checked and the global post-implementation review plus debugging gate has passed. diff --git a/packages/web/lib/commands.ts b/packages/web/lib/commands.ts index 66c554d..32b41cb 100644 --- a/packages/web/lib/commands.ts +++ b/packages/web/lib/commands.ts @@ -33,11 +33,12 @@ export const COMMANDS: readonly LazyCommand[] = [ name: "$start-work", glyph: "work", syntax: "$start-work [plan-name] [--worktree ]", - summary: "Executes a Prometheus plan until every checkbox is done.", + summary: "Executes a Prometheus plan through every checkbox and the final review/debugging gate.", facts: [ "Durable Boulder state survives across turns", "Parallel subagents, strict TDD + 5 evidence gates", - "Prints ORCHESTRATION COMPLETE when finished", + "Global review + debugging gate blocks completion and PR handoff", + "Prints ORCHESTRATION COMPLETE only after the gate passes", ], }, ] as const; diff --git a/packages/web/lib/docs-content.generated.ts b/packages/web/lib/docs-content.generated.ts index 98317d8..6810c8b 100644 --- a/packages/web/lib/docs-content.generated.ts +++ b/packages/web/lib/docs-content.generated.ts @@ -2,9 +2,9 @@ export const DOC_SOURCES: Record = { "overview.md": "

LazyCodex packages oh-my-openagent (OmO) inside Codex as the agent harness for complex codebases. Think LazyVim for lazy.nvim, but for Codex.

\n

What you get

\n

OmO gives Codex a full agent harness: discipline agents (Sisyphus orchestrates Hephaestus, Oracle, and Librarian), parallel execution, multi-model routing, a skills system, hooks and lifecycle, and verification defaults. LazyCodex packages that harness as a repeatable Codex setup.

\n

The harness workflow

\n

Use {your prompt} ultrawork when the job needs project memory, planning, parallel agents, and verified completion to run as one coordinated loop.

\n

How it fits together

\n

LazyCodex is a thin distribution layer. The core engine is OmO. LazyCodex is maintained by Sisyphus Labs.

\n

Credit: The LazyCodex name idea is inspired by LazyVim. The Ultragoal and UltraQA ideas are inspired by oh-my-codex, reimplemented from concept for this Codex setup.

\n\n", "installation.md": "

One command installs the OmO agent harness for Codex without a global package install.

\n

Install

\n
npx lazycodex-ai install\n
\n

This is exactly equivalent to npx --yes --package oh-my-openagent omo install --platform=codex.

\n

Autonomous one-liner

\n
npx lazycodex-ai install --no-tui --codex-autonomous\n
\n

Prerequisites

\n\n
\n

Do NOT use npm install -g or bun add -g. Always invoke via npx.

\n
\n

Let an agent do it

\n

It is strongly recommended to let an LLM agent run the install and walk the setup for you. The agent handles subscription detection, model selection, and provider auth automatically.

\n", - "skills.md": "

LazyCodex is most useful as a harness for complex codebases: project memory, planning, execution, verified completion, skills, hooks, model routing, and diagnostics.

\n

Built-in workflows

\n

Start with /init-deep when the repository is too large or too old to explain from memory. It generates hierarchical AGENTS.md context so agents can find the right files before they change code.

\n

Use $ulw-plan when the work needs decisions before implementation, $start-work when a plan should be executed, and $ulw-loop when you want the agent to keep going until the result is verified.

\n

Feature coverage

\n

The three command pillars stay simple:

\n
    \n
  • $ulw-loop keeps moving until verified completion
  • \n
  • $ulw-plan turns fuzzy work into a decision-complete plan
  • \n
  • $start-work executes a plan with durable Boulder progress
  • \n
\n

Skills add specialist judgment around those pillars:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
SkillUse it for
review-workMulti-angle post-implementation review
remove-ai-slopsBehavior-preserving cleanup of AI-looking code
frontend-ui-uxDesigned UI work instead of generic layout filling
programmingStrict TypeScript, Rust, Python, or Go discipline
LSPDiagnostics, definitions, references, symbols, and renames
AST-grepStructural search and rewrite across code
rulesProject instructions from AGENTS, rules, and instruction files
comment-checkerFeedback after edit-like operations
\n

Where skills live

\n

OmO can load skills from project and user locations such as .opencode/skills, ~/.config/opencode/skills, .claude/skills, .agents/skills, and ~/.agents/skills.

\n

LazyCodex installs the Codex Light setup with:

\n
npx lazycodex-ai install\n
\n

That installer wires the Codex marketplace plugin as omo@sisyphuslabs while keeping the public package alias easy to remember.

\n", + "skills.md": "

LazyCodex is most useful as a harness for complex codebases: project memory, planning, execution, verified completion, skills, hooks, model routing, and diagnostics.

\n

Built-in workflows

\n

Start with /init-deep when the repository is too large or too old to explain from memory. It generates hierarchical AGENTS.md context so agents can find the right files before they change code.

\n

Use $ulw-plan when the work needs decisions before implementation, $start-work when a plan should be executed through a final review/debugging gate, and $ulw-loop when you want the agent to keep going until the result is verified.

\n

Feature coverage

\n

The three command pillars stay simple:

\n
    \n
  • $ulw-loop keeps moving until verified completion
  • \n
  • $ulw-plan turns fuzzy work into a decision-complete plan
  • \n
  • $start-work executes a plan with durable Boulder progress, post-implementation review, and a debugging gate
  • \n
\n

Skills add specialist judgment around those pillars:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
SkillUse it for
review-workMulti-angle post-implementation review that blocks completion when any lane fails or is inconclusive
remove-ai-slopsBehavior-preserving cleanup of AI-looking code
frontend-ui-uxDesigned UI work instead of generic layout filling
programmingStrict TypeScript, Rust, Python, or Go discipline
LSPDiagnostics, definitions, references, symbols, and renames
AST-grepStructural search and rewrite across code
rulesProject instructions from AGENTS, rules, and instruction files
comment-checkerFeedback after edit-like operations
\n

Where skills live

\n

OmO can load skills from project and user locations such as .opencode/skills, ~/.config/opencode/skills, .claude/skills, .agents/skills, and ~/.agents/skills.

\n

LazyCodex installs the Codex Light setup with:

\n
npx lazycodex-ai install\n
\n

That installer wires the Codex marketplace plugin as omo@sisyphuslabs while keeping the public package alias easy to remember.

\n", "ultrawork.md": "

ultrawork is the headline mode. Typing ultrawork (or the short alias ulw) anywhere in your prompt activates maximum-precision, outcome-first, evidence-driven orchestration.

\n
\n

"Plan, execute, verify, and keep the evidence attached."

\n
\n

Usage

\n
ulw add authentication\n
\n

What it enforces

\n
    \n
  • Strict TDD: RED → GREEN → SURFACE → CLEAN
  • \n
  • At least 3 realistic QA scenarios
  • \n
  • Real manual-QA channels (HTTP call, tmux, browser)
  • \n
  • A binding verification gate that loops until the work is genuinely done
  • \n
\n", "ulw-loop.md": "

$ulw-loop is a self-referential development loop that runs until verified completion.

\n

How it works

\n

The agent works continuously and emits <promise>DONE</promise> when it believes the task is complete, but that does NOT end the loop. An Oracle must verify the result first. The loop ends only after the system confirms Oracle verified it. If verification fails, it continues with the message: "Oracle verification failed. Continuing ULTRAWORK loop."

\n

Syntax

\n
$ulw-loop "task description" [--completion-promise=TEXT] [--strategy=reset|continue]\n
\n

Limits

\n

The iteration cap is 500 in ultrawork mode (100 in normal mode).

\n", "ulw-plan.md": "

$ulw-plan is the strategic planning consultant (Prometheus). It turns an idea into a decision-complete work plan. It is a planner, NOT an implementer. When you say "do X" it produces a plan for X and never writes product code.

\n

The flow

\n
    \n
  1. Socratic interview
  2. \n
  3. Parallel codebase exploration
  4. \n
  5. Metis gap analysis
  6. \n
  7. Writes the plan to plans/<slug>.md
  8. \n
  9. Optional Momus high-accuracy review
  10. \n
\n

Output

\n

Questions, research, and a work plan you can hand to $start-work.

\n", - "start-work.md": "

$start-work executes a Prometheus work plan until every top-level checkbox is done.

\n

How it works

\n
    \n
  • Durable Boulder state in .omo/boulder.json survives across turns and sessions
  • \n
  • A Stop-hook re-injects the next turn until the plan is complete
  • \n
  • Independent sub-tasks fan out to parallel subagents
  • \n
  • Strict TDD plus five evidence gates: plan reread, automated verification, manual-QA, adversarial QA, cleanup
  • \n
  • Progress is recorded to a ledger
  • \n
\n

Syntax

\n
$start-work [plan-name] [--worktree <absolute-path>]\n
\n

Done

\n

It prints an ORCHESTRATION COMPLETE block when every checkbox is checked.

\n" + "start-work.md": "

$start-work executes a Prometheus work plan until every top-level checkbox is done.

\n

How it works

\n
    \n
  • Durable Boulder state in .omo/boulder.json survives across turns and sessions
  • \n
  • A Stop-hook re-injects the next turn until the plan is complete
  • \n
  • Independent sub-tasks fan out to parallel subagents
  • \n
  • Strict TDD plus five evidence gates: plan reread, automated verification, manual-QA, adversarial QA, cleanup
  • \n
  • A final Global Review and Debugging Gate runs review-work, records a debugging audit, and blocks completion or PR handoff on failed or inconclusive lanes
  • \n
  • Progress is recorded to a ledger
  • \n
\n

Syntax

\n
$start-work [plan-name] [--worktree <absolute-path>]\n
\n

Done

\n

It prints an ORCHESTRATION COMPLETE block only when every checkbox is checked and the global post-implementation review plus debugging gate has passed.

\n" }; diff --git a/packages/web/lib/site-config.ts b/packages/web/lib/site-config.ts index 1e52901..67798e7 100644 --- a/packages/web/lib/site-config.ts +++ b/packages/web/lib/site-config.ts @@ -33,7 +33,7 @@ export const SITE_CONFIG = { }, { label: "Plans before edits", - text: "$ulw-plan turns ambiguous work into a decision-complete plan, then $start-work executes it with durable Boulder progress.", + text: "$ulw-plan turns ambiguous work into a decision-complete plan, then $start-work executes it with durable Boulder progress and blocks completion on the global review/debugging gate.", }, { label: "Evidence at the end", diff --git a/plugins/omo/components/rules/bundled-rules/hephaestus.md b/plugins/omo/components/rules/bundled-rules/hephaestus.md index 13f0d24..1931876 100644 --- a/plugins/omo/components/rules/bundled-rules/hephaestus.md +++ b/plugins/omo/components/rules/bundled-rules/hephaestus.md @@ -98,6 +98,27 @@ omo-codex bundles three read-only Codex subagent roles in `CODEX_HOME/agents/`: - **Verify.** Diagnostics on changed files, related tests, build if applicable - in parallel where possible. - **Manually QA.** Drive the artifact through its surface (Manual QA Gate). Then write the final message. +# Post-implementation Global Review and Debugging Gate + +For significant implementation work, PR creation, or PR handoff, completion is +blocked until a global review and debugging gate passes. + +1. Run `review-work` after implementation verification. All five lanes must + PASS. Failed, timed-out, missing-deliverable, ack-only, `BLOCKED:`, or + inconclusive lanes block completion. +2. Run a debugging-oriented audit against the changed surface: name at least + three plausible runtime failure hypotheses, run distinguishing checks, and + record the evidence that each was ruled out or confirmed. +3. If review or debugging finds a real issue, use the `debugging` skill to + confirm root cause with runtime evidence, add a failing test or reproduction, + fix minimally, and rerun the gate. +4. Redact or mask secrets and sensitive user data before writing evidence to a + ledger, PR body, or handoff. Never include raw tokens, credentials, auth + headers, cookies, API keys, env dumps, private logs, or PII; use concise + summaries, lengths, hashes, or short non-sensitive prefixes instead. +5. For PR work, refresh branch/PR state after the gate and include only + redacted review/debugging evidence in the PR body or handoff. + # Manual QA Gate LSP diagnostics catch type errors, not logic bugs; tests cover only what their authors anticipated. **"Done" requires you have personally used the deliverable through its matching surface and observed it working** within this turn. The surface determines the tool: @@ -170,6 +191,7 @@ Done when ALL of: - LSP diagnostics clean on every file you changed. - Build (if applicable) exits 0; tests pass, or pre-existing failures are explicitly named with the reason. - The artifact has been driven through its matching surface in this turn (Manual QA Gate). +- Significant implementation work, PR creation, and PR handoff have passed the Post-implementation Global Review and Debugging Gate. - The final message reports what you did, what you verified, what you could not verify (with the reason), and any pre-existing issues you noticed but did not touch. When you think you are done: re-read the original request and your intent line. Did every committed action complete? Run verification once more on changed files in parallel. Then report. diff --git a/plugins/omo/components/start-work-continuation/directive.md b/plugins/omo/components/start-work-continuation/directive.md index 788d6e8..31588c9 100644 --- a/plugins/omo/components/start-work-continuation/directive.md +++ b/plugins/omo/components/start-work-continuation/directive.md @@ -40,7 +40,17 @@ You are mid-flight on a Prometheus work plan. The turn just ended without finish - A top-level checkbox flipped to `- [x]` after the 5-phase QA gate (Phase 1 read, Phase 2 automated, Phase 3 channel scenario, Phase 4 adversarial-class probing, Phase 5 gate decision). Then the Stop hook will re-evaluate; if more checkboxes remain you will be continued again. - 3 same-failure cycles on one sub-task → escalate via `spawn_agent(agent_type="codex-ultrawork-reviewer", fork_turns="none", ...)` and stop dispatch. - Safety boundary (destructive command, secret exfiltration, production write) → stop and surface a safe substitute. -- All top-level checkboxes `- [x]` AND (if gate triggered) `codex-ultrawork-reviewer` approved unconditionally → print the ORCHESTRATION COMPLETE block and end. +- All top-level checkboxes `- [x]` AND the Global Review and Debugging Gate passed with recorded evidence → print the ORCHESTRATION COMPLETE block and end. + +# Final gate + +Before `ORCHESTRATION COMPLETE`, final response, PR creation, PR handoff, or branch handoff: + +1. Invoke the `review-work` skill with the final diff, changed files, user goal, constraints, run command, and verification evidence. All five lanes must PASS. Failed, timed-out, missing-deliverable, ack-only, `BLOCKED:`, or inconclusive lanes block completion. +2. Run a debugging-oriented runtime audit against the changed surface: name at least three plausible failure hypotheses, run distinguishing checks against the actual artifact, and append the ruled-out or confirmed result to `{{LEDGER_PATH}}`. +3. If review or debugging finds a real issue, invoke the `debugging` skill, confirm root cause with runtime evidence, add the minimal failing test or reproduction, fix it, rerun affected verification, then rerun this gate. +4. Redact or mask secrets and sensitive user data before writing evidence to the ledger, PR body, or handoff. Never include raw tokens, credentials, auth headers, cookies, API keys, env dumps, private logs, or PII; use concise summaries, lengths, hashes, or short non-sensitive prefixes instead. +5. For PR work, refresh `git status` and PR/branch state after the gate, then include only redacted review/debugging evidence in the PR body or handoff. # Output discipline diff --git a/plugins/omo/components/start-work-continuation/test/codex-hook.test.ts b/plugins/omo/components/start-work-continuation/test/codex-hook.test.ts index 120cf60..7f76da8 100644 --- a/plugins/omo/components/start-work-continuation/test/codex-hook.test.ts +++ b/plugins/omo/components/start-work-continuation/test/codex-hook.test.ts @@ -88,6 +88,32 @@ describe("start-work Stop hook", () => { expect(parsed.reason).toMatch(/single `list_agents`/); }); + it("#given active codex work #when continuation directive is emitted #then completion requires global review and debugging", () => { + // given + const fs = createMemoryFs({ + [BOULDER_PATH]: createBoulderJson({ + sessionIds: ["codex:sess_abc"], + status: "active", + }), + [PLAN_PATH]: ["# Plan", "", "## TODOs", "- [ ] First"].join("\n"), + }); + + // when + const output = runStopHook(createStopInput(), fs); + + // then + const parsed = parseBlockOutput(output); + expect(parsed.reason).toMatch(/Global Review and Debugging Gate/); + expect(parsed.reason).toMatch(/\breview-work\b/); + expect(parsed.reason).toMatch(/\bdebugging\b/); + expect(parsed.reason).toMatch(/three plausible failure hypotheses/); + expect(parsed.reason).toMatch(/redact|mask/i); + expect(parsed.reason).toMatch(/raw tokens/i); + expect(parsed.reason).toMatch(/PR creation/); + expect(parsed.reason).toMatch(/PR handoff/); + expect(parsed.reason).not.toMatch(/codex-ultrawork-reviewer` approved unconditionally/); + }); + it("#given active work belongs to another harness #when hook runs #then returns empty output", () => { // given const fs = createMemoryFs({ diff --git a/plugins/omo/scripts/sync-skills.mjs b/plugins/omo/scripts/sync-skills.mjs index ace5bb8..64a90fc 100644 --- a/plugins/omo/scripts/sync-skills.mjs +++ b/plugins/omo/scripts/sync-skills.mjs @@ -49,10 +49,63 @@ function insertCodexCompatibilityGuidance(content) { return `${frontmatterMatch[0]}${codexHarnessToolCompatibility}${content.slice(frontmatterMatch[0].length)}`; } +const startWorkOriginalCompletion = `When all top-level checkboxes in \`## TODOs\` and \`## Final Verification Wave\` are complete: + +1. Run the plan's final verification commands. +2. If worktree mode was used, sync \`.omo/\` state back to the main repo, merge or hand off exactly as requested, and remove the worktree only after successful merge or explicit handoff. +3. Remove or mark the Boulder work as completed. +4. Print an \`ORCHESTRATION COMPLETE\` block with the plan path, verification commands, artifacts, and cleanup receipts.`; + +const startWorkCodexCompletion = `When all top-level checkboxes in \`## TODOs\` and \`## Final Verification Wave\` are complete: + +1. Run the plan's final verification commands. +2. Complete the **Global Review and Debugging Gate** before any completion claim, PR handoff, or branch handoff: + - Invoke the \`review-work\` skill with the final diff, changed files, user goal, constraints, run command, and verification evidence. All five review lanes must return PASS. A timeout, missing deliverable, ack-only child, \`BLOCKED:\`, or inconclusive lane is a gate failure, not approval. + - Run a debugging-oriented runtime audit even when the review passes: name at least three plausible failure hypotheses for the changed surface, run the distinguishing checks against the actual artifact, and append the ruled-out or confirmed result to \`.omo/start-work/ledger.jsonl\`. + - If any review lane or debugging hypothesis fails, invoke the \`debugging\` skill, confirm root cause with runtime evidence, add the minimal failing test or reproduction, fix it, rerun the affected verification, then rerun the Global Review and Debugging Gate. + - Evidence hygiene is mandatory: redact or mask secrets and sensitive user data before writing \`.omo/start-work/ledger.jsonl\`, a PR body, or a handoff. Never include raw tokens, credentials, auth headers, cookies, API keys, env dumps, private logs, or PII; use concise summaries, lengths, hashes, or short non-sensitive prefixes instead. + - If the work includes creating, updating, or handing off a PR, refresh \`git status\` and the PR/branch state after the gate, and include only redacted review/debugging evidence in the PR body or handoff. +3. If worktree mode was used, sync \`.omo/\` state back to the main repo, merge or hand off exactly as requested, and remove the worktree only after successful merge or explicit handoff. +4. Remove or mark the Boulder work as completed. +5. Print an \`ORCHESTRATION COMPLETE\` block with the plan path, verification commands, Global Review and Debugging Gate verdict, artifacts, and cleanup receipts.`; + +const startWorkOriginalHardRule = "- No completion claim while an applicable ultraqa adversarial class was never probed. Each applicable class needs a captured observable result; each skipped class needs a one-line not-applicable reason in the ledger.\n- No unprefixed session ids in Boulder state. Codex sessions are always `codex:`."; + +const startWorkCodexHardRule = "- No completion claim while an applicable ultraqa adversarial class was never probed. Each applicable class needs a captured observable result; each skipped class needs a one-line not-applicable reason in the ledger.\n- No `ORCHESTRATION COMPLETE`, final response, PR creation, or PR handoff before the Global Review and Debugging Gate passes with recorded evidence.\n- No unprefixed session ids in Boulder state. Codex sessions are always `codex:`."; + +const reviewWorkAnchor = "Launch 5 specialized sub-agents in parallel to review completed implementation work from every angle. All 5 must pass for the review to pass. If even ONE fails, the review fails.\n"; + +const reviewWorkCodexGate = ` +When \`review-work\` is used as a final implementation, PR, or \`$start-work\` +gate, it is blocking. A timeout, missing deliverable, ack-only response, +explicit \`BLOCKED:\`, or inconclusive lane is not a pass. Treat that lane as +failed, investigate the underlying uncertainty with the \`debugging\` skill when +runtime behavior may be wrong, fix with evidence, and rerun the affected lane +before claiming completion or handing off a PR. + +Review evidence must be safe to share. Redact or mask secrets and sensitive +user data before including evidence in logs, PR bodies, or handoffs. Never +include raw tokens, credentials, auth headers, cookies, API keys, env dumps, +private logs, or PII; summarize with lengths, hashes, and short non-sensitive +prefixes when identity is needed. +`; + +function applyCodexSkillOverlays(skillName, content) { + if (skillName === "start-work") { + return content + .replace(startWorkOriginalCompletion, startWorkCodexCompletion) + .replace(startWorkOriginalHardRule, startWorkCodexHardRule); + } + if (skillName === "review-work" && !content.includes("When `review-work` is used as a final implementation")) { + return content.replace(reviewWorkAnchor, `${reviewWorkAnchor}${reviewWorkCodexGate}`); + } + return content; +} + async function adaptSkillForCodex(skillName) { const skillPath = join(skillsRoot, skillName, "SKILL.md"); const content = await readFile(skillPath, "utf8"); - const adapted = insertCodexCompatibilityGuidance(content); + const adapted = applyCodexSkillOverlays(skillName, insertCodexCompatibilityGuidance(content)); if (adapted !== content) { await writeFile(skillPath, adapted, "utf8"); } diff --git a/plugins/omo/skills/review-work/SKILL.md b/plugins/omo/skills/review-work/SKILL.md index 4b0f294..68fdf1b 100644 --- a/plugins/omo/skills/review-work/SKILL.md +++ b/plugins/omo/skills/review-work/SKILL.md @@ -53,6 +53,19 @@ deliverable. Launch 5 specialized sub-agents in parallel to review completed implementation work from every angle. All 5 must pass for the review to pass. If even ONE fails, the review fails. +When `review-work` is used as a final implementation, PR, or `$start-work` +gate, it is blocking. A timeout, missing deliverable, ack-only response, +explicit `BLOCKED:`, or inconclusive lane is not a pass. Treat that lane as +failed, investigate the underlying uncertainty with the `debugging` skill when +runtime behavior may be wrong, fix with evidence, and rerun the affected lane +before claiming completion or handing off a PR. + +Review evidence must be safe to share. Redact or mask secrets and sensitive +user data before including evidence in logs, PR bodies, or handoffs. Never +include raw tokens, credentials, auth headers, cookies, API keys, env dumps, +private logs, or PII; summarize with lengths, hashes, and short non-sensitive +prefixes when identity is needed. + The 5 agents cover complementary concerns - together they form a comprehensive review that no single reviewer could match: | # | Agent | Type | Role | Focus Level | diff --git a/plugins/omo/skills/start-work/SKILL.md b/plugins/omo/skills/start-work/SKILL.md index eedc8f3..cb6feec 100644 --- a/plugins/omo/skills/start-work/SKILL.md +++ b/plugins/omo/skills/start-work/SKILL.md @@ -192,9 +192,15 @@ Only after verification passes: When all top-level checkboxes in `## TODOs` and `## Final Verification Wave` are complete: 1. Run the plan's final verification commands. -2. If worktree mode was used, sync `.omo/` state back to the main repo, merge or hand off exactly as requested, and remove the worktree only after successful merge or explicit handoff. -3. Remove or mark the Boulder work as completed. -4. Print an `ORCHESTRATION COMPLETE` block with the plan path, verification commands, artifacts, and cleanup receipts. +2. Complete the **Global Review and Debugging Gate** before any completion claim, PR handoff, or branch handoff: + - Invoke the `review-work` skill with the final diff, changed files, user goal, constraints, run command, and verification evidence. All five review lanes must return PASS. A timeout, missing deliverable, ack-only child, `BLOCKED:`, or inconclusive lane is a gate failure, not approval. + - Run a debugging-oriented runtime audit even when the review passes: name at least three plausible failure hypotheses for the changed surface, run the distinguishing checks against the actual artifact, and append the ruled-out or confirmed result to `.omo/start-work/ledger.jsonl`. + - If any review lane or debugging hypothesis fails, invoke the `debugging` skill, confirm root cause with runtime evidence, add the minimal failing test or reproduction, fix it, rerun the affected verification, then rerun the Global Review and Debugging Gate. + - Evidence hygiene is mandatory: redact or mask secrets and sensitive user data before writing `.omo/start-work/ledger.jsonl`, a PR body, or a handoff. Never include raw tokens, credentials, auth headers, cookies, API keys, env dumps, private logs, or PII; use concise summaries, lengths, hashes, or short non-sensitive prefixes instead. + - If the work includes creating, updating, or handing off a PR, refresh `git status` and the PR/branch state after the gate, and include only redacted review/debugging evidence in the PR body or handoff. +3. If worktree mode was used, sync `.omo/` state back to the main repo, merge or hand off exactly as requested, and remove the worktree only after successful merge or explicit handoff. +4. Remove or mark the Boulder work as completed. +5. Print an `ORCHESTRATION COMPLETE` block with the plan path, verification commands, Global Review and Debugging Gate verdict, artifacts, and cleanup receipts. ## Hard rules @@ -202,5 +208,6 @@ When all top-level checkboxes in `## TODOs` and `## Final Verification Wave` are - No `--dry-run` as completion evidence. - No tests-only completion claim. A Manual-QA artifact is required. - No completion claim while an applicable ultraqa adversarial class was never probed. Each applicable class needs a captured observable result; each skipped class needs a one-line not-applicable reason in the ledger. +- No `ORCHESTRATION COMPLETE`, final response, PR creation, or PR handoff before the Global Review and Debugging Gate passes with recorded evidence. - No unprefixed session ids in Boulder state. Codex sessions are always `codex:`. - No stale-memory execution. The plan and ledger are the durable source of truth. diff --git a/plugins/omo/test/start-work-skill.test.mjs b/plugins/omo/test/start-work-skill.test.mjs index 029e15c..53559bf 100644 --- a/plugins/omo/test/start-work-skill.test.mjs +++ b/plugins/omo/test/start-work-skill.test.mjs @@ -5,10 +5,17 @@ import test from "node:test"; import { fileURLToPath } from "node:url"; const pluginRoot = dirname(dirname(fileURLToPath(import.meta.url))); -const repoRoot = dirname(dirname(dirname(pluginRoot))); const startWorkSkillPaths = [ - join(repoRoot, "packages", "shared-skills", "skills", "start-work", "SKILL.md"), + join(pluginRoot, "skills", "start-work", "SKILL.md"), ]; +const reviewWorkSkillPath = join(pluginRoot, "skills", "review-work", "SKILL.md"); +const hephaestusRulePath = join( + pluginRoot, + "components", + "rules", + "bundled-rules", + "hephaestus.md", +); const stopHookPath = join( pluginRoot, "components", @@ -67,6 +74,29 @@ test("#given worker done claim #when start-work contract is inspected #then adve assert.deepEqual(missing, []); }); +test("#given start-work completion surfaces #when inspected #then global review and debugging gate completion", async () => { + // given + const [startWorkSkill, reviewWorkSkill, hephaestusRule] = await Promise.all([ + readFile(startWorkSkillPaths[0], "utf8"), + readFile(reviewWorkSkillPath, "utf8"), + readFile(hephaestusRulePath, "utf8"), + ]); + + // then + assert.match(startWorkSkill, /Global Review and Debugging Gate/); + assert.match(startWorkSkill, /\breview-work\b/); + assert.match(startWorkSkill, /\bdebugging\b/); + assert.match(startWorkSkill, /inconclusive/i); + assert.match(startWorkSkill, /redact|mask/i); + assert.match(startWorkSkill, /raw (?:tokens|credentials|auth headers|cookies)/i); + assert.match(startWorkSkill, /ORCHESTRATION COMPLETE/); + assert.match(reviewWorkSkill, /debugging/i); + assert.match(reviewWorkSkill, /inconclusive/i); + assert.match(hephaestusRule, /Post-implementation Global Review and Debugging Gate/); + assert.match(hephaestusRule, /redact|mask/i); + assert.match(hephaestusRule, /raw (?:tokens|credentials|auth headers|cookies)/i); +}); + test("#given start-work continuation hook #when inspected #then it remains Boulder-only without planning bootstrap logic", async () => { // given const hook = await readFile(stopHookPath, "utf8"); diff --git a/plugins/omo/test/sync-skills.test.mjs b/plugins/omo/test/sync-skills.test.mjs index ee9ecfb..7fcd600 100644 --- a/plugins/omo/test/sync-skills.test.mjs +++ b/plugins/omo/test/sync-skills.test.mjs @@ -53,6 +53,45 @@ function removeCodexCompatibilityGuidance(content) { return `${content.slice(0, start)}${content.slice(end + endMarker.length)}`; } +const startWorkOriginalCompletion = `When all top-level checkboxes in \`## TODOs\` and \`## Final Verification Wave\` are complete: + +1. Run the plan's final verification commands. +2. If worktree mode was used, sync \`.omo/\` state back to the main repo, merge or hand off exactly as requested, and remove the worktree only after successful merge or explicit handoff. +3. Remove or mark the Boulder work as completed. +4. Print an \`ORCHESTRATION COMPLETE\` block with the plan path, verification commands, artifacts, and cleanup receipts.`; + +const startWorkCodexCompletion = `When all top-level checkboxes in \`## TODOs\` and \`## Final Verification Wave\` are complete: + +1. Run the plan's final verification commands. +2. Complete the **Global Review and Debugging Gate** before any completion claim, PR handoff, or branch handoff: + - Invoke the \`review-work\` skill with the final diff, changed files, user goal, constraints, run command, and verification evidence. All five review lanes must return PASS. A timeout, missing deliverable, ack-only child, \`BLOCKED:\`, or inconclusive lane is a gate failure, not approval. + - Run a debugging-oriented runtime audit even when the review passes: name at least three plausible failure hypotheses for the changed surface, run the distinguishing checks against the actual artifact, and append the ruled-out or confirmed result to \`.omo/start-work/ledger.jsonl\`. + - If any review lane or debugging hypothesis fails, invoke the \`debugging\` skill, confirm root cause with runtime evidence, add the minimal failing test or reproduction, fix it, rerun the affected verification, then rerun the Global Review and Debugging Gate. + - Evidence hygiene is mandatory: redact or mask secrets and sensitive user data before writing \`.omo/start-work/ledger.jsonl\`, a PR body, or a handoff. Never include raw tokens, credentials, auth headers, cookies, API keys, env dumps, private logs, or PII; use concise summaries, lengths, hashes, or short non-sensitive prefixes instead. + - If the work includes creating, updating, or handing off a PR, refresh \`git status\` and the PR/branch state after the gate, and include only redacted review/debugging evidence in the PR body or handoff. +3. If worktree mode was used, sync \`.omo/\` state back to the main repo, merge or hand off exactly as requested, and remove the worktree only after successful merge or explicit handoff. +4. Remove or mark the Boulder work as completed. +5. Print an \`ORCHESTRATION COMPLETE\` block with the plan path, verification commands, Global Review and Debugging Gate verdict, artifacts, and cleanup receipts.`; + +const startWorkOriginalHardRule = "- No completion claim while an applicable ultraqa adversarial class was never probed. Each applicable class needs a captured observable result; each skipped class needs a one-line not-applicable reason in the ledger.\n- No unprefixed session ids in Boulder state. Codex sessions are always `codex:`."; + +const startWorkCodexHardRule = "- No completion claim while an applicable ultraqa adversarial class was never probed. Each applicable class needs a captured observable result; each skipped class needs a one-line not-applicable reason in the ledger.\n- No `ORCHESTRATION COMPLETE`, final response, PR creation, or PR handoff before the Global Review and Debugging Gate passes with recorded evidence.\n- No unprefixed session ids in Boulder state. Codex sessions are always `codex:`."; + +const reviewWorkCodexGatePattern = + /\nWhen `review-work` is used as a final implementation, PR, or `\$start-work`\ngate, it is blocking\. A timeout, missing deliverable, ack-only response,\nexplicit `BLOCKED:`, or inconclusive lane is not a pass\. Treat that lane as\nfailed, investigate the underlying uncertainty with the `debugging` skill when\nruntime behavior may be wrong, fix with evidence, and rerun the affected lane\nbefore claiming completion or handing off a PR\.\n\nReview evidence must be safe to share\. Redact or mask secrets and sensitive\nuser data before including evidence in logs, PR bodies, or handoffs\. Never\ninclude raw tokens, credentials, auth headers, cookies, API keys, env dumps,\nprivate logs, or PII; summarize with lengths, hashes, and short non-sensitive\nprefixes when identity is needed\.\n/; + +function removeCodexSkillOverlays(skillName, content) { + if (skillName === "start-work") { + return content + .replace(startWorkCodexCompletion, startWorkOriginalCompletion) + .replace(startWorkCodexHardRule, startWorkOriginalHardRule); + } + if (skillName === "review-work") { + return content.replace(reviewWorkCodexGatePattern, "\n"); + } + return content; +} + async function listSkillFiles(dir) { const entries = await readdir(dir, { withFileTypes: true }); const files = []; @@ -133,7 +172,7 @@ test("#given shared skill package source #when aggregate Codex shared skills are const sharedContent = await readFile(join(sharedSkillsRoot, skillName, "SKILL.md"), "utf8"); const aggregateContent = await readFile(join(aggregateSkillsRoot, skillName, "SKILL.md"), "utf8"); assert.equal( - removeCodexCompatibilityGuidance(aggregateContent), + removeCodexSkillOverlays(skillName, removeCodexCompatibilityGuidance(aggregateContent)), removeCodexCompatibilityGuidance(sharedContent), `${skillName} drifted from shared-skills`, ); diff --git a/test/readme-feature-workflows-content.test.mjs b/test/readme-feature-workflows-content.test.mjs index 5f7dbf0..d0cd5a7 100644 --- a/test/readme-feature-workflows-content.test.mjs +++ b/test/readme-feature-workflows-content.test.mjs @@ -14,6 +14,8 @@ test("README documents built-in LazyCodex workflows without Hangul", () => { "$start-work", "$ulw-loop", "review-work", + "post-implementation review", + "debugging gate", "remove-ai-slops", "https://lazycodex.ai", ] diff --git a/test/web-workflow-copy-content.test.mjs b/test/web-workflow-copy-content.test.mjs new file mode 100644 index 0000000..c25b5ec --- /dev/null +++ b/test/web-workflow-copy-content.test.mjs @@ -0,0 +1,32 @@ +import { readFileSync } from "node:fs" +import test from "node:test" +import assert from "node:assert/strict" + +const SITE_CONFIG_SOURCE = readFileSync("packages/web/lib/site-config.ts", "utf8") +const COMMANDS_SOURCE = readFileSync("packages/web/lib/commands.ts", "utf8") + +test("web workflow copy documents the start-work final gate", () => { + const requiredSiteConfigSnippets = [ + "$start-work executes", + "durable Boulder progress", + "blocks completion", + "global review/debugging gate", + ] + const requiredCommandSnippets = [ + "final review/debugging gate", + "Global review + debugging gate blocks completion and PR handoff", + "ORCHESTRATION COMPLETE only after the gate passes", + ] + + for (const snippet of requiredSiteConfigSnippets) { + assert.match(SITE_CONFIG_SOURCE, escapedPattern(snippet)) + } + + for (const snippet of requiredCommandSnippets) { + assert.match(COMMANDS_SOURCE, escapedPattern(snippet)) + } +}) + +function escapedPattern(snippet) { + return new RegExp(snippet.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")) +}