rivet-dev · NathanFlurry · Mar 23, 2026 · Mar 23, 2026
diff --git a/.github/workflows/posix-conformance.yml b/.github/workflows/posix-conformance.yml
@@ -0,0 +1,122 @@
+name: POSIX Conformance
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "native/wasmvm/**"
+      - "packages/wasmvm/**"
+      - "scripts/validate-posix-exclusions.ts"
+      - "scripts/generate-posix-report.ts"
+      - "scripts/import-os-test.ts"
+      - "scripts/posix-exclusion-schema.ts"
+      - ".github/workflows/posix-conformance.yml"
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "native/wasmvm/**"
+      - "packages/wasmvm/**"
+      - "scripts/validate-posix-exclusions.ts"
+      - "scripts/generate-posix-report.ts"
+      - "scripts/import-os-test.ts"
+      - "scripts/posix-exclusion-schema.ts"
+      - ".github/workflows/posix-conformance.yml"
+
+jobs:
+  posix-conformance:
+    name: POSIX Conformance (os-test)
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      # --- Rust / WASM build ---
+      - name: Set up Rust toolchain
+        uses: dtolnay/rust-toolchain@nightly
+        with:
+          toolchain: nightly-2026-03-01
+          targets: wasm32-wasip1
+          components: rust-src
+
+      - name: Install wasm-opt (binaryen)
+        run: sudo apt-get update && sudo apt-get install -y binaryen
+
+      - name: Cache WASM build artifacts
+        uses: actions/cache@v4
+        with:
+          path: |
+            native/wasmvm/target
+            native/wasmvm/vendor
+          key: wasm-${{ runner.os }}-${{ hashFiles('native/wasmvm/Cargo.lock', 'native/wasmvm/rust-toolchain.toml') }}
+
+      - name: Build WASM binaries
+        run: cd native/wasmvm && make wasm
+
+      # --- C toolchain (wasi-sdk + patched sysroot) ---
+      - name: Cache wasi-sdk
+        id: cache-wasi-sdk
+        uses: actions/cache@v4
+        with:
+          path: native/wasmvm/c/vendor/wasi-sdk
+          key: wasi-sdk-25-${{ runner.os }}-${{ runner.arch }}
+
+      - name: Download wasi-sdk
+        if: steps.cache-wasi-sdk.outputs.cache-hit != 'true'
+        run: make -C native/wasmvm/c wasi-sdk
+
+      - name: Cache patched wasi-libc sysroot
+        id: cache-sysroot
+        uses: actions/cache@v4
+        with:
+          path: |
+            native/wasmvm/c/sysroot
+            native/wasmvm/c/vendor/wasi-libc
+          key: wasi-libc-sysroot-${{ runner.os }}-${{ hashFiles('native/wasmvm/patches/wasi-libc/*.patch', 'native/wasmvm/scripts/patch-wasi-libc.sh') }}
+
+      - name: Build patched wasi-libc sysroot
+        if: steps.cache-sysroot.outputs.cache-hit != 'true'
+        run: make -C native/wasmvm/c sysroot
+
+      # --- Build os-test (WASM + native) ---
+      - name: Build os-test binaries (WASM + native)
+        run: make -C native/wasmvm/c os-test os-test-native
+
+      # --- Node.js / TypeScript ---
+      - name: Set up pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 8.15.6
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 22
+          cache: pnpm
+          cache-dependency-path: pnpm-lock.yaml
+
+      - name: Install dependencies
+        run: pnpm install --frozen-lockfile
+
+      # --- Run conformance tests ---
+      - name: Run POSIX conformance tests
+        run: pnpm vitest run packages/wasmvm/test/posix-conformance.test.ts
+
+      - name: Validate exclusion list
+        run: pnpm tsx scripts/validate-posix-exclusions.ts
+
+      # --- Generate report ---
+      - name: Generate conformance report MDX
+        if: always()
+        run: pnpm tsx scripts/generate-posix-report.ts
+
+      # --- Upload artifacts ---
+      - name: Upload conformance report
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: posix-conformance-report
+          path: |
+            posix-conformance-report.json
+            docs/posix-conformance-report.mdx
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -17,30 +17,17 @@
 
 - NEVER mock external services in tests — use real implementations (Docker containers for databases/services, real HTTP servers for network tests, real binaries for CLI tool tests)
 - tests that validate sandbox behavior MUST run code through the secure-exec sandbox (NodeRuntime/proc.exec()), never directly on the host
-- NOTHING runs on the host except Docker containers for e2e-docker tests — all CLI tool code executes inside the sandbox VM
-- Pi, Claude Code, and OpenCode are ALL pure JavaScript/TypeScript — they ALL run in-VM via `import()` through `kernel.openShell()` or `kernel.spawn()`
-- Claude Code is a bundled ESM Node.js script (`@anthropic-ai/claude-code/cli.js`), not a native binary — its `.node` addons (tree-sitter, audio-capture) are optional and gracefully degrade
-- OpenCode is TypeScript (https://github.com/anomalyco/opencode) — the distributed `opencode` command is a Bun-compiled binary but the source is vanilla TS/JS; build the JS bundle from source and run it in-VM
-- if the sandbox can't run something, that is a secure-exec bug to fix, not a reason to spawn on the host
-- NEVER work around sandbox limitations with host-side execution — this is the #1 rule for CLI tool tests:
-  - do NOT use `child_process.spawn` or `child_process.spawnSync` from INSIDE sandbox code to run a tool binary on the host (e.g. `spawnSync('claude', [...args])` routing through the child_process bridge) — this is still host execution, the tool's JS runs on the host not in the V8 isolate
-  - do NOT use `node:child_process.spawn` from TEST code to run tools on the host
-  - do NOT create `HostBinaryDriver` classes that spawn binaries on the host
-  - do NOT use `script -qefc` or `python3 pty.spawn` to give host processes a PTY
-  - do NOT add `sandboxSkip` / probe-based skip logic that silently skips when the sandbox can't do something
-  - do NOT mark a story as passing if the tool runs on the host instead of in the V8 isolate
-  - the ONLY correct pattern is: `kernel.spawn('node', ['-e', 'import("tool-entry.js")'])` or equivalent — the tool's JavaScript executes inside the V8 sandbox isolate
-  - if `import()` hangs, if ESM loading fails, if the TUI crashes — those are secure-exec bugs to fix in packages/nodejs/src/, packages/core/src/, or native/v8-runtime/src/
+- CLI tool tests (Pi, Claude Code, OpenCode) must execute inside the sandbox: Pi runs as JS in the VM, Claude Code and OpenCode spawn their binaries via the sandbox's child_process.spawn bridge
 - e2e-docker fixtures connect to real Docker containers (Postgres, MySQL, Redis, SSH/SFTP) — skip gracefully via `skipUnlessDocker()` when Docker is unavailable
 - interactive/PTY tests must use `kernel.openShell()` with `@xterm/headless`, not host PTY via `script -qefc`
-- CLI tool tests (Pi, Claude Code, OpenCode) must support both mock and real LLM API tokens:
-  - check `ANTHROPIC_API_KEY` and `OPENAI_API_KEY` env vars at test startup
-  - if a real token is present, use it instead of the mock LLM server — this validates true e2e behavior
-  - Pi supports both Anthropic and OpenAI tokens; OpenCode uses OpenAI; Claude Code uses Anthropic
-  - log which mode each test suite is using at startup: `"Using real ANTHROPIC_API_KEY"`, `"Using real OPENAI_API_KEY"`, or `"Using mock LLM server"`
-  - tests must pass with both mock and real tokens — mock is the fallback, real is preferred
-  - to run with real tokens locally: `source ~/misc/env.txt` before running tests
-  - real-token tests may use longer timeouts (up to 60s) since they hit external APIs
+
+### POSIX Conformance Test Integrity
+
+- **no test-only workarounds** — if a C override fixes broken libc behavior (fcntl, realloc, strfmon, etc.), it MUST go in the patched sysroot (`native/wasmvm/patches/wasi-libc/`) so all WASM programs get the fix; never link overrides only into test binaries — that inflates conformance numbers while real users still hit the bug
+- **never replace upstream test source files** — if an os-test `.c` file fails due to a platform difference (e.g. `sizeof(long)`), exclude it via `posix-exclusions.json` with the real reason; do not swap in a rewritten version that changes what the test validates
+- **kernel behavior belongs in the kernel, not the test runner** — if a test requires runtime state (POSIX directories like `/tmp`, `/usr`, device nodes, etc.), implement it in the kernel/device-layer so all users get it; the test runner should not create kernel state that real users won't have
+- **no suite-specific VFS special-casing** — the test runner must not branch on suite name to inject different filesystem state; if a test needs files to exist, either the kernel should provide them or the test should be excluded
+- **categorize exclusions honestly** — if a failure is fixable with a patch or build flag, it's `implementation-gap`, not `wasm-limitation`; reserve `wasm-limitation` for things genuinely impossible in wasm32-wasip1 (no 80-bit long double, no fork, no mmap)
 
 ## Tooling
 
@@ -54,6 +41,12 @@
 - check GitHub Actions test/typecheck status per commit to identify when a failure first appeared
 - do not use `contract` in test filenames; use names like `suite`, `behavior`, `parity`, `integration`, or `policy` instead
 
+## GitHub Issues
+
+- when fixing a bug or implementation gap tracked by a GitHub issue, close the issue in the same PR using `gh issue close <number> --comment "Fixed in <commit-hash>"`
+- when removing a test from `posix-exclusions.json` because the fix landed, close the linked issue
+- do not leave resolved issues open — verify with `gh issue view <number>` if unsure
+
 ## Tool Integration Policy
 
 - NEVER implement a from-scratch reimplementation of a tool when the PRD specifies using an existing upstream project (e.g., codex, curl, git, make)