diff --git a/cli/lib/launch.js b/cli/lib/launch.js index 4d42249..4b3ea33 100644 --- a/cli/lib/launch.js +++ b/cli/lib/launch.js @@ -78,35 +78,45 @@ function launchInteractive(contentDir, cliName) { ); } + // Preserve the user's working directory before staging content. + const originalCwd = process.cwd(); + // Copy content to a temp directory so the LLM can read the files const tmpDir = copyContentToTemp(contentDir); console.log(`PromptKit content staged at: ${tmpDir}`); console.log(`Launching ${cli}...\n`); - const bootstrapPrompt = "Read and execute bootstrap.md"; + // Use an absolute path so the LLM can locate bootstrap.md regardless of + // which directory it treats as its working directory. + const bootstrapPrompt = `Read and execute ${path.join(tmpDir, "bootstrap.md")}`; let cmd, args; switch (cli) { case "copilot": cmd = "copilot"; - args = ["-i", bootstrapPrompt]; + // --add-dir grants file access to the staging directory. + args = ["--add-dir", tmpDir, "-i", bootstrapPrompt]; break; case "gh-copilot": cmd = "gh"; - args = ["copilot", "-i", bootstrapPrompt]; + args = ["copilot", "--add-dir", tmpDir, "-i", bootstrapPrompt]; break; case "claude": + // --add-dir grants file access to the staging directory. cmd = "claude"; - args = [bootstrapPrompt]; + args = ["--add-dir", tmpDir, bootstrapPrompt]; break; default: console.error(`Unknown CLI: ${cli}`); process.exit(1); } + // All CLIs are spawned from the user's original directory so the LLM + // session reflects the directory the user was working in. 
const child = spawn(cmd, args, { - cwd: tmpDir, + cwd: originalCwd, stdio: "inherit", + shell: true, }); child.on("error", (err) => { diff --git a/cli/package-lock.json b/cli/package-lock.json index 07a9451..80db0f5 100644 --- a/cli/package-lock.json +++ b/cli/package-lock.json @@ -1,12 +1,12 @@ { "name": "@alan-jowett/promptkit", - "version": "0.2.0", + "version": "0.5.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@alan-jowett/promptkit", - "version": "0.2.0", + "version": "0.5.0", "license": "MIT", "dependencies": { "commander": "^12.0.0", diff --git a/cli/specs/requirements.md b/cli/specs/requirements.md index e61ef6a..5c0602c 100644 --- a/cli/specs/requirements.md +++ b/cli/specs/requirements.md @@ -1,8 +1,8 @@ --- title: "PromptKit CLI — Requirements Specification" project: "PromptKit CLI (@alan-jowett/promptkit)" -version: "0.3.0" -date: "2025-07-17" +version: "0.4.0" +date: "2026-03-31" status: draft source_files: - cli/bin/cli.js @@ -19,6 +19,8 @@ source_files: |-----|------|--------|-------------| | 0.1 | 2025-07-17 | Spec-extraction-workflow | Initial draft extracted from source code | | 0.2 | 2025-07-18 | Engineering-workflow Phase 2 | Retired assemble command (REQ-CLI-030–037), assembly engine (REQ-CLI-040–051), manifest resolution module (REQ-CLI-060–069). Kept list command with inline manifest parsing. Modified REQ-CLI-002, 004, 011, 012, 020–023, 080, 091, 094. Retired REQ-CLI-092, CON-005, ASSUMPTION-002, ASSUMPTION-006. Added REQ-CLI-100, 101, 103. | +| 0.3 | 2026-03-31 | Bug-fix | Added REQ-CLI-024 (cwd preservation for claude). Updated REQ-CLI-015 and REQ-CLI-017 to reflect per-CLI spawn cwd behaviour. | +| 0.4 | 2026-03-31 | Bug-fix | Extended cwd fix to all CLIs. Added REQ-CLI-025 (--add-dir for staging directory). Updated REQ-CLI-015, 016, 017, 024 to be CLI-agnostic. | --- @@ -111,25 +113,50 @@ directory to a temporary directory before launching the LLM CLI. content files. 
 **REQ-CLI-015**: The `interactive` command MUST spawn the LLM CLI process
-with the working directory set to the temporary content directory and
-`stdio: "inherit"` so the user can interact directly.
-- *Source*: `launch.js` lines 107–110.
-- *Acceptance*: The child process has `cwd` equal to the temp directory
-  and inherits stdin/stdout/stderr.
+with `cwd` set to the user's working directory at the time the interactive
+session is launched (captured when launching) and `stdio: "inherit"` so the
+user can interact directly.
+- *Source*: `launch.js` (`launchInteractive()`).
+- *Acceptance*: The spawned process has `cwd` equal to the directory from
+  which `promptkit` was invoked. The process inherits stdin/stdout/stderr.
 
 **REQ-CLI-016**: The `interactive` command MUST pass the bootstrap prompt
-`"Read and execute bootstrap.md"` as the initial instruction to the LLM CLI.
-- *Source*: `launch.js` line 86.
-- *Acceptance*: The spawned process receives this string as an argument.
+`"Read and execute <bootstrap-path>"` as the initial instruction
+to the LLM CLI, where `<bootstrap-path>` is the absolute path to
+`bootstrap.md` inside the temporary staging directory. The absolute path
+allows the LLM to locate the file regardless of which directory it treats
+as its working directory.
+- *Source*: `launch.js` (`launchInteractive()`).
+- *Acceptance*: The spawned process receives a string argument that contains
+  an absolute path ending in `bootstrap.md`.
 
 **REQ-CLI-017**: The CLI MUST construct the correct command and arguments
-for each supported LLM CLI:
-- `copilot`: `copilot -i "Read and execute bootstrap.md"`
-- `gh-copilot`: `gh copilot -i "Read and execute bootstrap.md"`
-- `claude`: `claude "Read and execute bootstrap.md"`
-- *Source*: `launch.js` lines 89–105.
+for each supported LLM CLI.
All CLIs receive `--add-dir <tmpDir>` and an
+absolute path to `bootstrap.md`:
+- `copilot`: `copilot --add-dir <tmpDir> -i "Read and execute <tmpDir>/bootstrap.md"`
+- `gh-copilot`: `gh copilot --add-dir <tmpDir> -i "Read and execute <tmpDir>/bootstrap.md"`
+- `claude`: `claude --add-dir <tmpDir> "Read and execute <tmpDir>/bootstrap.md"`
+- *Source*: `launch.js` (`launchInteractive()`).
 - *Acceptance*: Spawn is called with the documented cmd/args for each CLI.
 
+**REQ-CLI-024**: The `interactive` command MUST preserve the user's original
+working directory for ALL supported LLM CLIs. Every LLM CLI child process
+MUST be spawned with `cwd` equal to the directory from which `promptkit`
+was invoked, not the temporary staging directory.
+- *Source*: `launch.js` (`launchInteractive()`).
+- *Acceptance*: When `promptkit --cli <cli>` is run from directory `D`,
+  the spawned process reports `cwd = D` for every supported CLI. The cwd
+  is NOT the temporary `promptkit-*` staging directory.
+
+**REQ-CLI-025**: The `interactive` command MUST grant the LLM CLI file
+access to the temporary staging directory by passing `--add-dir <tmpDir>`
+at launch, for every supported LLM CLI. This ensures the LLM can read
+PromptKit content files from the staging directory even though the process
+cwd is the user's original working directory.
+- *Source*: `launch.js` (`launchInteractive()`).
+- *Acceptance*: The spawn args for every supported CLI contain `--add-dir`
+  followed by the path of the temporary staging directory.
+
 **REQ-CLI-018**: When the child process exits, the CLI MUST clean up the
 temporary directory (best-effort) and then exit with the child's exit code.
If the child was killed by a signal, the CLI MUST re-send that signal to diff --git a/cli/specs/validation.md b/cli/specs/validation.md index 36aac2c..bbc61fd 100644 --- a/cli/specs/validation.md +++ b/cli/specs/validation.md @@ -1,8 +1,8 @@ --- title: "PromptKit CLI — Validation Plan" project: "PromptKit CLI (@alan-jowett/promptkit)" -version: "0.3.0" -date: "2025-07-17" +version: "0.4.0" +date: "2026-03-31" status: draft related: - requirements: cli/specs/requirements.md @@ -17,6 +17,8 @@ related: |-----|------|--------|-------------| | 0.1 | 2025-07-17 | Spec-extraction-workflow | Initial draft extracted from source code | | 0.2 | 2025-07-18 | Engineering-workflow Phase 2 | Retired test cases for assembly engine (TC-CLI-010–024), manifest resolution (TC-CLI-030–042), assemble command (TC-CLI-060–067), Windows frontmatter (TC-CLI-113). Updated TC-CLI-001, TC-CLI-003, TC-CLI-053. Added TC-CLI-120–122 for new requirements. Updated traceability matrix. | +| 0.3 | 2026-03-31 | Bug-fix | Added TC-CLI-082 for REQ-CLI-024 (claude cwd preservation). Updated TC-CLI-080/081 notes. Updated traceability matrix. | +| 0.4 | 2026-03-31 | Bug-fix | Extended TC-CLI-082 to all CLIs. Added TC-CLI-083 (--add-dir for staging dir). Updated TC-CLI-080/081/082. Added REQ-CLI-025 to traceability. | --- @@ -267,16 +269,44 @@ See REQ-CLI-100.* - *Requirement*: REQ-CLI-016 - *Type*: Unit - *Steps*: Inspect the spawn arguments for each CLI type. -- *Expected*: `"Read and execute bootstrap.md"` appears in args. +- *Expected*: For every CLI, the bootstrap prompt argument is an absolute + path ending in `bootstrap.md` (e.g. `"Read and execute /tmp/promptkit-xxx/bootstrap.md"`). **TC-CLI-081**: Correct command construction for each CLI. - *Requirement*: REQ-CLI-017 - *Type*: Unit - *Steps*: Verify spawn cmd/args for `copilot`, `gh-copilot`, `claude`. 
 - *Expected*:
-  - copilot: `cmd="copilot"`, `args=["-i", "Read and execute bootstrap.md"]`
-  - gh-copilot: `cmd="gh"`, `args=["copilot", "-i", "Read and execute bootstrap.md"]`
-  - claude: `cmd="claude"`, `args=["Read and execute bootstrap.md"]`
+  - copilot: `cmd="copilot"`, args include `"--add-dir"`, `<tmpDir>`,
+    `"-i"`, `"Read and execute <tmpDir>/bootstrap.md"`
+  - gh-copilot: `cmd="gh"`, args include `"copilot"`, `"--add-dir"`,
+    `<tmpDir>`, `"-i"`, `"Read and execute <tmpDir>/bootstrap.md"`
+  - claude: `cmd="claude"`, args include `"--add-dir"`, `<tmpDir>`,
+    `"Read and execute <tmpDir>/bootstrap.md"`
+
+**TC-CLI-082**: All CLIs are spawned with the user's original working directory.
+- *Requirement*: REQ-CLI-024
+- *Type*: Integration (uses mock CLI executables)
+- *Steps*:
+  1. Create a mock executable (for each CLI under test) that records
+     `process.cwd()` to a JSON file and exits.
+  2. Run `promptkit interactive --cli <cli>` from a known directory `D`
+     with the mock on PATH.
+  3. Read the recorded cwd from the file.
+- *Expected*: The recorded cwd equals `D` for every supported CLI. It does
+  NOT equal the temporary `promptkit-*` staging directory.
+
+**TC-CLI-083**: All CLIs receive `--add-dir <tmpDir>` in their spawn arguments.
+- *Requirement*: REQ-CLI-025
+- *Type*: Integration (uses mock CLI executables)
+- *Steps*:
+  1. Create a mock executable (for each CLI under test) that records
+     `process.argv.slice(2)` to a JSON file and exits.
+  2. Run `promptkit interactive --cli <cli>` with the mock on PATH.
+  3. Read the recorded args from the file.
+- *Expected*: For every supported CLI, `"--add-dir"` appears in the recorded
+  args and the following argument is the path of the temporary staging
+  directory (a directory under `os.tmpdir()` with a `promptkit-` prefix).
### 2.7 Content Bundling (copy-content.js) @@ -397,7 +427,7 @@ concern.* | REQ-CLI-012 | TC-CLI-076 | High | Active | | REQ-CLI-013 | TC-CLI-077 | Low | Active | | REQ-CLI-014 | TC-CLI-078 | High | Active | -| REQ-CLI-015 | TC-CLI-078, TC-CLI-081 | High | Active | +| REQ-CLI-015 | TC-CLI-078, TC-CLI-081, TC-CLI-082 | High | Active | | REQ-CLI-016 | TC-CLI-080 | High | Active | | REQ-CLI-017 | TC-CLI-081 | High | Active | | REQ-CLI-018 | TC-CLI-079 | High | Active | @@ -406,6 +436,8 @@ concern.* | REQ-CLI-021 | TC-CLI-051 | Medium | Active | | REQ-CLI-022 | TC-CLI-052 | Medium | Active | | REQ-CLI-023 | TC-CLI-053 | Low | Active | +| REQ-CLI-024 | TC-CLI-082 | High | Active | +| REQ-CLI-025 | TC-CLI-083 | High | Active | | REQ-CLI-030 | ~~TC-CLI-060~~ | — | RETIRED | | REQ-CLI-031 | ~~TC-CLI-060, TC-CLI-061~~ | — | RETIRED | | REQ-CLI-032 | ~~TC-CLI-062~~ | — | RETIRED | diff --git a/cli/tests/launch.test.js b/cli/tests/launch.test.js index 8fc4133..4dfc470 100644 --- a/cli/tests/launch.test.js +++ b/cli/tests/launch.test.js @@ -204,13 +204,23 @@ describe("Launch Module", () => { }); describe("Module exports and bootstrap prompt", () => { - it("TC-CLI-080/081: launch module exports expected functions and contains bootstrap prompt", () => { + // Note: TC-CLI-080 (bootstrap prompt arg) and TC-CLI-081 (cmd/args per CLI) + // are validated by the integration tests in "CWD preservation and staging + // directory access" (TC-CLI-082/083), which assert --add-dir, absolute + // bootstrap path, and correct spawn cwd for each CLI. + it("launch module exports expected functions and source references bootstrap prompt", () => { const launchSrc = fs.readFileSync(launchModulePath, "utf8"); - const bootstrapPrompt = "Read and execute bootstrap.md"; + // The bootstrap prompt now uses an absolute path, so check for the + // constant prefix ("Read and execute ") rather than the exact string. 
+ const bootstrapPrefix = "Read and execute "; assert.ok( - launchSrc.includes(bootstrapPrompt), - "launch.js should contain the bootstrap prompt" + launchSrc.includes(bootstrapPrefix), + "launch.js should contain the bootstrap prompt prefix" + ); + assert.ok( + launchSrc.includes("bootstrap.md"), + "launch.js should reference bootstrap.md" ); // Verify command construction by checking source contains expected patterns. @@ -230,4 +240,120 @@ describe("Launch Module", () => { ); }); }); + + describe("CWD preservation and staging directory access", () => { + let cwdTestTmpDir; + + before(() => { + cwdTestTmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "pk-cwdtest-")); + }); + + after(() => { + try { + fs.rmSync(cwdTestTmpDir, { recursive: true, force: true }); + } catch { + // best effort + } + }); + + // Creates a mock CLI executable that records { cwd, args } to a JSON + // file at captureFile, then exits. + function createCapturingMock(mockBinDir, binName, captureFile) { + const implScript = path.join(cwdTestTmpDir, `${binName}-impl.js`); + fs.writeFileSync( + implScript, + [ + `const fs = require('fs');`, + `fs.writeFileSync(`, + ` ${JSON.stringify(captureFile)},`, + ` JSON.stringify({ cwd: process.cwd(), args: process.argv.slice(2) })`, + `);`, + ].join("\n") + ); + + if (process.platform === "win32") { + fs.writeFileSync( + path.join(mockBinDir, `${binName}.cmd`), + `@"${process.execPath}" "${implScript}" %*\r\n` + ); + } else { + const p = path.join(mockBinDir, binName); + fs.writeFileSync(p, `#!/bin/sh\n${JSON.stringify(process.execPath)} "${implScript}" "$@"\n`); + fs.chmodSync(p, 0o755); + } + } + + // Run promptkit interactive --cli from userCwd with mockBinDir + // prepended to PATH. Returns the parsed JSON capture written by the mock. 
+ function runAndCapture(cliName, mockBinDir, captureFile, userCwd) { + const newPath = `${mockBinDir}${path.delimiter}${process.env.PATH || ""}`; + try { + execFileSync( + process.execPath, + [cliPath, "interactive", "--cli", cliName], + { + env: envWithPath(newPath), + cwd: userCwd, + encoding: "utf8", + timeout: 15000, + } + ); + } catch { + // The mock exits 0, so errors here are unexpected but we still want + // to read whatever was captured. + } + assert.ok( + fs.existsSync(captureFile), + `mock ${cliName} should have written capture file` + ); + return JSON.parse(fs.readFileSync(captureFile, "utf8")); + } + + for (const cliName of ["claude", "copilot", "gh-copilot"]) { + // TC-CLI-082 and TC-CLI-083 combined — run once per CLI + it(`TC-CLI-082/083: ${cliName} spawned with originalCwd and --add-dir for staging dir`, () => { + const mockBinDir = path.join(cwdTestTmpDir, `mock-bin-${cliName}`); + fs.mkdirSync(mockBinDir, { recursive: true }); + const captureFile = path.join(cwdTestTmpDir, `${cliName}-capture.json`); + + // For gh-copilot the binary is "gh"; for others it matches cliName. + const binName = cliName === "gh-copilot" ? "gh" : cliName; + createCapturingMock(mockBinDir, binName, captureFile); + + // Use cwdTestTmpDir as the simulated user working directory. + const userCwd = cwdTestTmpDir; + const result = runAndCapture(cliName, mockBinDir, captureFile, userCwd); + + // TC-CLI-082: verify cwd is the user's original directory. + const actualCwd = fs.realpathSync(result.cwd); + const expectedCwd = fs.realpathSync(userCwd); + assert.strictEqual( + actualCwd, + expectedCwd, + `${cliName} should be spawned with the user's original cwd` + ); + + // TC-CLI-083: verify --add-dir is present and points to a + // promptkit-* staging directory under os.tmpdir(). 
+ const addDirIdx = result.args.indexOf("--add-dir"); + assert.ok( + addDirIdx !== -1, + `${cliName} args should include --add-dir` + ); + const addDirValue = result.args[addDirIdx + 1]; + assert.ok( + addDirValue && addDirValue.startsWith(os.tmpdir()) && + path.basename(addDirValue).startsWith("promptkit-"), + `${cliName} --add-dir value should point to a promptkit-* staging dir under tmpdir` + ); + + // Also verify the bootstrap prompt uses an absolute path. + const bootstrapArg = result.args.find((a) => a.includes("bootstrap.md")); + assert.ok( + bootstrapArg && path.isAbsolute(bootstrapArg.replace("Read and execute ", "")), + `${cliName} bootstrap prompt should contain an absolute path to bootstrap.md` + ); + }); + } + }); }); diff --git a/docs/case-studies/ebpf_epoch.md b/docs/case-studies/ebpf-epoch.md similarity index 97% rename from docs/case-studies/ebpf_epoch.md rename to docs/case-studies/ebpf-epoch.md index b41271a..77c291e 100644 --- a/docs/case-studies/ebpf_epoch.md +++ b/docs/case-studies/ebpf-epoch.md @@ -1,261 +1,261 @@ - - - -# Case Study: Spec Extraction for the eBPF Epoch Module - -## Context - -**Project**: [eBPF for Windows](https://github.com/microsoft/ebpf-for-windows) -**Module**: `ebpf_epoch` — epoch-based memory reclamation (EBR) -**Date**: 2026-03-30 -**Duration**: ~45 minutes wall-clock (human interaction), ~7 minutes agent compute -**Tool**: GitHub Copilot CLI (Claude Opus 4.6, 1M context) - -The `ebpf_epoch` module is a kernel-level subsystem that provides safe, deferred -memory reclamation for concurrent data structures in the eBPF for Windows runtime. -It implements a per-CPU architecture with inter-CPU DPC messaging, epoch-stamped -free lists, and a three-phase computation protocol — the kind of intricate systems -code that typically resists documentation efforts because the complexity lives -entirely in the implementation. - -The module had **zero formal specifications**. 
The only documentation was code -comments and a brief design overview in `docs/EpochBasedMemoryManagement.md`. - -## Objective - -Bootstrap a **semantic baseline** — structured requirements, design, and validation -specifications — extracted entirely from the existing codebase and test suite, -following the project's `spec-extraction-workflow.md`. - -## Input Files - -| File | Lines | Role | -|------|-------|------| -| `libs/runtime/ebpf_epoch.h` | 141 | Public API (13 functions, 2 types) | -| `libs/runtime/ebpf_epoch.c` | 1,062 | Implementation | -| `libs/runtime/unit/platform_unit_test.cpp` | ~1,150 (epoch portion) | 6 test cases | - -**Total input**: ~2,350 lines of C/C++ code. - -## Process - -### Phase 1 — Repository Scan (~3 minutes) - -The agent read all three source files in full and produced a structured analysis -summary: - -- Identified the 13-function public API surface -- Cataloged the per-CPU architecture, inter-CPU messaging protocol, and free-list - release algorithm -- Mapped all 6 existing test cases with their concurrency requirements (CPU count - thresholds, thread affinity patterns, stress test durations) -- Proposed the specification scope - -**Human interaction**: User confirmed the scope with a single "yes." 
- -### Phase 2 — Draft Extraction (~7 minutes) - -Three background agents were launched **in parallel**, each producing one -specification document: - -| Agent | Output | Size | Time | -|-------|--------|------|------| -| draft-requirements | `specs/epoch/requirements.md` | 32 KB | 7m 27s | -| draft-design | `specs/epoch/design.md` | 32 KB | 6m 55s | -| draft-validation | `specs/epoch/validation.md` | 34 KB | 6m 29s | - -Each agent received a comprehensive prompt containing: -- The full API surface extracted from Phase 1 -- Key behavioral details from the implementation -- A document format template with section skeletons -- Anti-hallucination rules (cite sources, mark confidence, no invented behaviors) - -The parallel execution cut wall-clock time from ~21 minutes (sequential) to ~7.5 -minutes. - -### Phase 3 — Human Clarification (~1 minute) - -The agent presented 5 targeted questions about ambiguous items: -1. Is the 1ms timer delay a requirement or implementation detail? -2. Is CPU 0 as coordinator essential or incidental? -3. Are work item callbacks contractually at PASSIVE_LEVEL? -4. Is the cross-CPU DISPATCH fail-fast a formal requirement? -5. Is stack-allocated synchronization a requirement or optimization? - -**Human interaction**: User said "proceed to audit" — accepting the drafts as-is -and deferring the clarifications to future review. - -### Phase 4 — Consistency Audit (~5 minutes) - -The agent performed an adversarial traceability audit by: -1. Reading all three documents in full (~2,000 lines) -2. Extracting every REQ-ID from requirements.md (57 total) -3. Cross-referencing against design.md and validation.md -4. 
Checking for orphaned IDs, phantom references, and coverage gaps - -**7 findings** were identified: - -| ID | Severity | Issue | -|----|----------|-------| -| F-001 | Medium | Design doc invented parallel REQ-IDs instead of using canonical ones | -| F-002 | Medium | Validation plan referenced phantom REQ-IDs | -| F-003 | Medium | 21/57 requirements had no explicit test traceability | -| F-004 | Low | Validation defined REQ-DIAG-002 not in requirements | -| F-005 | Low | Date inconsistency across documents | -| F-006 | Low | "Released epoch" phrasing varied | -| F-007 | Low | Design referenced out-of-scope files | - -The agent immediately applied fixes for F-001 (replaced design §2 with canonical -REQ-ID references) and F-005 (normalized dates). - -**Root cause**: The three documents were drafted by independent agents without a -shared REQ-ID registry. This is a known limitation of parallel extraction. - -**Verdict**: REVISE (with critical corrections already applied). - -### Phase 5 — Human Approval (~instant) - -User said "looks good to me." - -### Phase 6 — Deliverable (~2 minutes) - -The agent: -1. Created branch `alanjo/specs-epoch-baseline` -2. Committed 4 spec files + 4 workflow files with sign-off and detailed commit message -3. 
Pushed and created draft PR #5150 - -## Output - -### Deliverables - -| File | Content | -|------|---------| -| `specs/epoch/requirements.md` | 57 requirements, 5 dependencies, 5 assumptions, 4 risks | -| `specs/epoch/design.md` | Architecture diagram, 10 design subsections, 7 tradeoff analyses, 5 open questions | -| `specs/epoch/validation.md` | 17 test cases (6 existing + 11 proposed), 10 coverage gaps, risk prioritization | -| `specs/epoch/audit.md` | 7 findings, root cause analysis, remediation plan | - -### Quantitative Summary - -| Metric | Value | -|--------|-------| -| Input (source code) | 2,350 lines | -| Output (specifications) | 2,265 lines | -| Requirements extracted | 57 | -| Acceptance criteria | 57+ (at least 1 per requirement) | -| Design sections | 10 detailed + 7 tradeoff analyses | -| Test cases documented | 17 (6 existing, 11 proposed) | -| Coverage gaps identified | 10 | -| API coverage by tests | 85% (11/13 functions) | -| Audit findings | 7 (2 fixed inline) | -| Source citations | Every requirement cites file:line | -| Confidence tagging | 100% of items tagged [High]/[Medium]/[Low] | - -## What Worked Well - -### 1. Parallel agent execution - -Launching three independent agents cut the extraction phase from ~21 minutes to -~7.5 minutes. The agents produced high-quality, structurally consistent documents -despite working independently. - -### 2. Anti-hallucination guardrails - -Every requirement cites specific source locations (e.g., `ebpf_epoch.c:704-706`). -No invented behaviors were found during the audit. The confidence tagging -([High]/[Medium]/[Low]) made it easy to identify items needing human review. - -### 3. Adversarial audit caught real issues - -The audit found that the independently-drafted documents used conflicting REQ-ID -schemes — a genuine traceability problem that would have confused future readers. -The automated fix (replacing the design doc's ad-hoc IDs with canonical references) -was surgical and correct. - -### 4. 
Structured workflow - -The 6-phase workflow with explicit human gates prevented the agent from -hallucinating requirements or auto-approving its own work. Each phase had clear -inputs, outputs, and stop conditions. - -### 5. Deep technical understanding - -The agent correctly identified and documented: -- The epoch skew hazard and how the global published epoch prevents it -- The `released_epoch = proposed - 1` safety margin and why it exists -- The stack-allocated synchronization entry design and its OOM-avoidance rationale -- The cross-CPU exit path's DISPATCH_LEVEL fail-fast invariant - -## What Could Be Improved - -### 1. Shared REQ-ID registry for parallel agents - -The primary audit finding (F-001/F-002) stemmed from parallel agents independently -inventing requirement IDs. A future improvement would be to: -- Run Phase 2a (requirements) first -- Extract the REQ-ID list -- Feed it to the design and validation agents - -This trades ~2 minutes of parallelism for perfect traceability. - -### 2. Validation coverage granularity - -The validation plan mapped 6 existing tests to ~30 requirement groups, but the -requirements document defined 57 atomic requirements. The 21 "untested" -requirements are mostly internal mechanism requirements (timer flags, KEVENT -signaling) that are implicitly exercised by integration tests. A "verified by -implicit coverage" category would reduce false coverage-gap noise. - -### 3. Out-of-scope file references - -The design agent discovered and referenced `docs/EpochBasedMemoryManagement.md` -and `include/ebpf_extension.h`, which were not in the specified input set. While -this added useful context, it violated the scoping rules. Future runs should either -explicitly include these files or instruct agents to stay within scope. - -### 4. Proposed test case specificity - -The 11 proposed test cases vary in specificity. Some (like TC-ECS-003: thread -migration) describe a clear, implementable test. 
Others (like TC-LIFE-002: -initialization failure) would require fault injection infrastructure that may -not exist. Tagging proposed tests with implementation feasibility would help -prioritize. - -## Lessons Learned - -1. **Spec extraction is viable for systems code.** Even a complex kernel module - with per-CPU state, inter-CPU messaging, and subtle concurrency invariants can - be systematically specified from its implementation. - -2. **The audit phase is essential, not optional.** Without it, the three documents - would have shipped with conflicting requirement IDs — a traceability defect - that compounds over time as specs are maintained. - -3. **Parallel extraction is a net win despite traceability cost.** The 3x speedup - is worth the alignment fix, especially since the fix is mechanical (search and - replace IDs). - -4. **Human gates prevent runaway fabrication.** The user's involvement was minimal - (~3 interactions totaling ~10 words) but each gate forced the agent to pause, - present evidence, and wait — preventing the common failure mode of LLMs - generating plausible-but-unsupported content. - -5. **Coverage gap analysis has immediate value.** The 10 identified gaps - (e.g., `ebpf_epoch_cancel_work_item` never tested, thread migration path - untested) are actionable items that can drive the next round of test - development, independent of whether the specs themselves are adopted. - -## Reproduction - -To reproduce this extraction on another module: - -``` -Read and execute the spec-extraction-workflow in the workflows directory. -``` - -Then specify: -- The source files to analyze -- The output directory for specs -- Any additional context files (design docs, related headers) - -The workflow is interactive and will pause at each phase gate for confirmation. 
+ + + +# Case Study: Spec Extraction for the eBPF Epoch Module + +## Context + +**Project**: [eBPF for Windows](https://github.com/microsoft/ebpf-for-windows) +**Module**: `ebpf_epoch` — epoch-based memory reclamation (EBR) +**Date**: 2026-03-30 +**Duration**: ~45 minutes wall-clock (human interaction), ~7 minutes agent compute +**Tool**: GitHub Copilot CLI (Claude Opus 4.6, 1M context) + +The `ebpf_epoch` module is a kernel-level subsystem that provides safe, deferred +memory reclamation for concurrent data structures in the eBPF for Windows runtime. +It implements a per-CPU architecture with inter-CPU DPC messaging, epoch-stamped +free lists, and a three-phase computation protocol — the kind of intricate systems +code that typically resists documentation efforts because the complexity lives +entirely in the implementation. + +The module had **zero formal specifications**. The only documentation was code +comments and a brief design overview in `docs/EpochBasedMemoryManagement.md`. + +## Objective + +Bootstrap a **semantic baseline** — structured requirements, design, and validation +specifications — extracted entirely from the existing codebase and test suite, +following the project's `spec-extraction-workflow.md`. + +## Input Files + +| File | Lines | Role | +|------|-------|------| +| `libs/runtime/ebpf_epoch.h` | 141 | Public API (13 functions, 2 types) | +| `libs/runtime/ebpf_epoch.c` | 1,062 | Implementation | +| `libs/runtime/unit/platform_unit_test.cpp` | ~1,150 (epoch portion) | 6 test cases | + +**Total input**: ~2,350 lines of C/C++ code. 
+ +## Process + +### Phase 1 — Repository Scan (~3 minutes) + +The agent read all three source files in full and produced a structured analysis +summary: + +- Identified the 13-function public API surface +- Cataloged the per-CPU architecture, inter-CPU messaging protocol, and free-list + release algorithm +- Mapped all 6 existing test cases with their concurrency requirements (CPU count + thresholds, thread affinity patterns, stress test durations) +- Proposed the specification scope + +**Human interaction**: User confirmed the scope with a single "yes." + +### Phase 2 — Draft Extraction (~7 minutes) + +Three background agents were launched **in parallel**, each producing one +specification document: + +| Agent | Output | Size | Time | +|-------|--------|------|------| +| draft-requirements | `specs/epoch/requirements.md` | 32 KB | 7m 27s | +| draft-design | `specs/epoch/design.md` | 32 KB | 6m 55s | +| draft-validation | `specs/epoch/validation.md` | 34 KB | 6m 29s | + +Each agent received a comprehensive prompt containing: +- The full API surface extracted from Phase 1 +- Key behavioral details from the implementation +- A document format template with section skeletons +- Anti-hallucination rules (cite sources, mark confidence, no invented behaviors) + +The parallel execution cut wall-clock time from ~21 minutes (sequential) to ~7.5 +minutes. + +### Phase 3 — Human Clarification (~1 minute) + +The agent presented 5 targeted questions about ambiguous items: +1. Is the 1ms timer delay a requirement or implementation detail? +2. Is CPU 0 as coordinator essential or incidental? +3. Are work item callbacks contractually at PASSIVE_LEVEL? +4. Is the cross-CPU DISPATCH fail-fast a formal requirement? +5. Is stack-allocated synchronization a requirement or optimization? + +**Human interaction**: User said "proceed to audit" — accepting the drafts as-is +and deferring the clarifications to future review. 
+ +### Phase 4 — Consistency Audit (~5 minutes) + +The agent performed an adversarial traceability audit by: +1. Reading all three documents in full (~2,000 lines) +2. Extracting every REQ-ID from requirements.md (57 total) +3. Cross-referencing against design.md and validation.md +4. Checking for orphaned IDs, phantom references, and coverage gaps + +**7 findings** were identified: + +| ID | Severity | Issue | +|----|----------|-------| +| F-001 | Medium | Design doc invented parallel REQ-IDs instead of using canonical ones | +| F-002 | Medium | Validation plan referenced phantom REQ-IDs | +| F-003 | Medium | 21/57 requirements had no explicit test traceability | +| F-004 | Low | Validation defined REQ-DIAG-002 not in requirements | +| F-005 | Low | Date inconsistency across documents | +| F-006 | Low | "Released epoch" phrasing varied | +| F-007 | Low | Design referenced out-of-scope files | + +The agent immediately applied fixes for F-001 (replaced design §2 with canonical +REQ-ID references) and F-005 (normalized dates). + +**Root cause**: The three documents were drafted by independent agents without a +shared REQ-ID registry. This is a known limitation of parallel extraction. + +**Verdict**: REVISE (with critical corrections already applied). + +### Phase 5 — Human Approval (~instant) + +User said "looks good to me." + +### Phase 6 — Deliverable (~2 minutes) + +The agent: +1. Created branch `alanjo/specs-epoch-baseline` +2. Committed 4 spec files + 4 workflow files with sign-off and detailed commit message +3. 
Pushed and created draft PR #5150 + +## Output + +### Deliverables + +| File | Content | +|------|---------| +| `specs/epoch/requirements.md` | 57 requirements, 5 dependencies, 5 assumptions, 4 risks | +| `specs/epoch/design.md` | Architecture diagram, 10 design subsections, 7 tradeoff analyses, 5 open questions | +| `specs/epoch/validation.md` | 17 test cases (6 existing + 11 proposed), 10 coverage gaps, risk prioritization | +| `specs/epoch/audit.md` | 7 findings, root cause analysis, remediation plan | + +### Quantitative Summary + +| Metric | Value | +|--------|-------| +| Input (source code) | 2,350 lines | +| Output (specifications) | 2,265 lines | +| Requirements extracted | 57 | +| Acceptance criteria | 57+ (at least 1 per requirement) | +| Design sections | 10 detailed + 7 tradeoff analyses | +| Test cases documented | 17 (6 existing, 11 proposed) | +| Coverage gaps identified | 10 | +| API coverage by tests | 85% (11/13 functions) | +| Audit findings | 7 (2 fixed inline) | +| Source citations | Every requirement cites file:line | +| Confidence tagging | 100% of items tagged [High]/[Medium]/[Low] | + +## What Worked Well + +### 1. Parallel agent execution + +Launching three independent agents cut the extraction phase from ~21 minutes to +~7.5 minutes. The agents produced high-quality, structurally consistent documents +despite working independently. + +### 2. Anti-hallucination guardrails + +Every requirement cites specific source locations (e.g., `ebpf_epoch.c:704-706`). +No invented behaviors were found during the audit. The confidence tagging +([High]/[Medium]/[Low]) made it easy to identify items needing human review. + +### 3. Adversarial audit caught real issues + +The audit found that the independently-drafted documents used conflicting REQ-ID +schemes — a genuine traceability problem that would have confused future readers. +The automated fix (replacing the design doc's ad-hoc IDs with canonical references) +was surgical and correct. + +### 4. 
Structured workflow
+
+The 6-phase workflow with explicit human gates prevented the agent from
+hallucinating requirements or auto-approving its own work. Each phase had clear
+inputs, outputs, and stop conditions.
+
+### 5. Deep technical understanding
+
+The agent correctly identified and documented:
+- The epoch skew hazard and how the global published epoch prevents it
+- The `released_epoch = proposed - 1` safety margin and why it exists
+- The stack-allocated synchronization entry design and its OOM-avoidance rationale
+- The cross-CPU exit path's DISPATCH_LEVEL fail-fast invariant
+
+## What Could Be Improved
+
+### 1. Shared REQ-ID registry for parallel agents
+
+The primary audit finding (F-001/F-002) stemmed from parallel agents independently
+inventing requirement IDs. A future improvement would be to:
+- Run Phase 2a (requirements) first
+- Extract the REQ-ID list
+- Feed it to the design and validation agents
+
+This trades roughly seven minutes of wall-clock time (the ~7.5-minute
+requirements draft must finish before the other two agents can start) for
+perfect traceability.
+
+### 2. Validation coverage granularity
+
+The validation plan mapped 6 existing tests to ~30 requirement groups, but the
+requirements document defined 57 atomic requirements. The 21 "untested"
+requirements are mostly internal mechanism requirements (timer flags, KEVENT
+signaling) that are implicitly exercised by integration tests. A "verified by
+implicit coverage" category would reduce false coverage-gap noise.
+
+### 3. Out-of-scope file references
+
+The design agent discovered and referenced `docs/EpochBasedMemoryManagement.md`
+and `include/ebpf_extension.h`, which were not in the specified input set. While
+this added useful context, it violated the scoping rules. Future runs should either
+explicitly include these files or instruct agents to stay within scope.
+
+### 4. Proposed test case specificity
+
+The 11 proposed test cases vary in specificity. Some (like TC-ECS-003: thread
+migration) describe a clear, implementable test. 
Others (like TC-LIFE-002: +initialization failure) would require fault injection infrastructure that may +not exist. Tagging proposed tests with implementation feasibility would help +prioritize. + +## Lessons Learned + +1. **Spec extraction is viable for systems code.** Even a complex kernel module + with per-CPU state, inter-CPU messaging, and subtle concurrency invariants can + be systematically specified from its implementation. + +2. **The audit phase is essential, not optional.** Without it, the three documents + would have shipped with conflicting requirement IDs — a traceability defect + that compounds over time as specs are maintained. + +3. **Parallel extraction is a net win despite traceability cost.** The 3x speedup + is worth the alignment fix, especially since the fix is mechanical (search and + replace IDs). + +4. **Human gates prevent runaway fabrication.** The user's involvement was minimal + (~3 interactions totaling ~10 words) but each gate forced the agent to pause, + present evidence, and wait — preventing the common failure mode of LLMs + generating plausible-but-unsupported content. + +5. **Coverage gap analysis has immediate value.** The 10 identified gaps + (e.g., `ebpf_epoch_cancel_work_item` never tested, thread migration path + untested) are actionable items that can drive the next round of test + development, independent of whether the specs themselves are adopted. + +## Reproduction + +To reproduce this extraction on another module: + +``` +Read and execute the spec-extraction-workflow in the workflows directory. +``` + +Then specify: +- The source files to analyze +- The output directory for specs +- Any additional context files (design docs, related headers) + +The workflow is interactive and will pause at each phase gate for confirmation.