diff --git a/.env.local.example b/.env.local.example
index d3c8c50..e542e1b 100644
--- a/.env.local.example
+++ b/.env.local.example
@@ -1,5 +1,9 @@
# Local development configuration
# Copy this to .env.local and fill in values
-# WorkOS Client ID for local development
-WORKOS_CLIENT_ID=client_xxx
+# Required for running evals
+ANTHROPIC_API_KEY=sk-ant-...
+
+# WorkOS credentials (optional for evals - placeholders used if missing)
+WORKOS_API_KEY=sk_test_...
+WORKOS_CLIENT_ID=client_...
diff --git a/.gitignore b/.gitignore
index 785f59f..92aa0ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,6 @@ src/version.ts
.idea
*.sublime-*
dist/
+
+# Eval results
+tests/eval-results/
diff --git a/package.json b/package.json
index 6ac44c4..98163b6 100644
--- a/package.json
+++ b/package.json
@@ -63,6 +63,7 @@
"@vitest/coverage-v8": "^4.0.18",
"@vitest/ui": "^4.0.18",
"dotenv": "^17.2.3",
+ "p-limit": "^7.2.0",
"prettier": "^3.8.0",
"tsx": "^4.20.3",
"typescript": "^5.9.3",
@@ -71,7 +72,7 @@
"engines": {
"node": ">=20.20"
},
- "packageManager": "pnpm@10.23.0+sha512.21c4e5698002ade97e4efe8b8b4a89a8de3c85a37919f957e7a0f30f38fbc5bbdd05980ffe29179b2fb6e6e691242e098d945d1601772cad0fef5fb6411e2a4b",
+ "packageManager": "pnpm@10.28.2",
"scripts": {
"clean": "rm -rf ./dist",
"prebuild": "pnpm clean",
@@ -85,7 +86,12 @@
"test": "vitest run",
"test:watch": "vitest",
"test:coverage": "vitest run --coverage",
- "typecheck": "tsc --noEmit"
+ "typecheck": "tsc --noEmit",
+ "eval": "tsx tests/evals/index.ts",
+ "eval:history": "tsx tests/evals/index.ts history",
+ "eval:compare": "tsx tests/evals/index.ts compare",
+ "eval:logs": "tsx tests/evals/index.ts logs",
+ "eval:show": "tsx tests/evals/index.ts show"
},
"author": "WorkOS",
"license": "MIT"
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 8e55291..5d4b02c 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -81,6 +81,9 @@ importers:
dotenv:
specifier: ^17.2.3
version: 17.2.3
+ p-limit:
+ specifier: ^7.2.0
+ version: 7.2.0
prettier:
specifier: ^3.8.0
version: 3.8.0
@@ -492,56 +495,66 @@ packages:
resolution: {integrity: sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA==}
cpu: [arm64]
os: [linux]
+ libc: [glibc]
'@img/sharp-libvips-linux-arm@1.0.5':
resolution: {integrity: sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g==}
cpu: [arm]
os: [linux]
+ libc: [glibc]
'@img/sharp-libvips-linux-x64@1.0.4':
resolution: {integrity: sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw==}
cpu: [x64]
os: [linux]
+ libc: [glibc]
'@img/sharp-libvips-linuxmusl-arm64@1.0.4':
resolution: {integrity: sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA==}
cpu: [arm64]
os: [linux]
+ libc: [musl]
'@img/sharp-libvips-linuxmusl-x64@1.0.4':
resolution: {integrity: sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw==}
cpu: [x64]
os: [linux]
+ libc: [musl]
'@img/sharp-linux-arm64@0.33.5':
resolution: {integrity: sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA==}
engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
cpu: [arm64]
os: [linux]
+ libc: [glibc]
'@img/sharp-linux-arm@0.33.5':
resolution: {integrity: sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ==}
engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
cpu: [arm]
os: [linux]
+ libc: [glibc]
'@img/sharp-linux-x64@0.33.5':
resolution: {integrity: sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA==}
engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
cpu: [x64]
os: [linux]
+ libc: [glibc]
'@img/sharp-linuxmusl-arm64@0.33.5':
resolution: {integrity: sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g==}
engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
cpu: [arm64]
os: [linux]
+ libc: [musl]
'@img/sharp-linuxmusl-x64@0.33.5':
resolution: {integrity: sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw==}
engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
cpu: [x64]
os: [linux]
+ libc: [musl]
'@img/sharp-win32-x64@0.33.5':
resolution: {integrity: sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg==}
@@ -853,66 +866,79 @@ packages:
resolution: {integrity: sha512-Rn3n+FUk2J5VWx+ywrG/HGPTD9jXNbicRtTM11e/uorplArnXZYsVifnPPqNNP5BsO3roI4n8332ukpY/zN7rQ==}
cpu: [arm]
os: [linux]
+ libc: [glibc]
'@rollup/rollup-linux-arm-musleabihf@4.55.1':
resolution: {integrity: sha512-grPNWydeKtc1aEdrJDWk4opD7nFtQbMmV7769hiAaYyUKCT1faPRm2av8CX1YJsZ4TLAZcg9gTR1KvEzoLjXkg==}
cpu: [arm]
os: [linux]
+ libc: [musl]
'@rollup/rollup-linux-arm64-gnu@4.55.1':
resolution: {integrity: sha512-a59mwd1k6x8tXKcUxSyISiquLwB5pX+fJW9TkWU46lCqD/GRDe9uDN31jrMmVP3feI3mhAdvcCClhV8V5MhJFQ==}
cpu: [arm64]
os: [linux]
+ libc: [glibc]
'@rollup/rollup-linux-arm64-musl@4.55.1':
resolution: {integrity: sha512-puS1MEgWX5GsHSoiAsF0TYrpomdvkaXm0CofIMG5uVkP6IBV+ZO9xhC5YEN49nsgYo1DuuMquF9+7EDBVYu4uA==}
cpu: [arm64]
os: [linux]
+ libc: [musl]
'@rollup/rollup-linux-loong64-gnu@4.55.1':
resolution: {integrity: sha512-r3Wv40in+lTsULSb6nnoudVbARdOwb2u5fpeoOAZjFLznp6tDU8kd+GTHmJoqZ9lt6/Sys33KdIHUaQihFcu7g==}
cpu: [loong64]
os: [linux]
+ libc: [glibc]
'@rollup/rollup-linux-loong64-musl@4.55.1':
resolution: {integrity: sha512-MR8c0+UxAlB22Fq4R+aQSPBayvYa3+9DrwG/i1TKQXFYEaoW3B5b/rkSRIypcZDdWjWnpcvxbNaAJDcSbJU3Lw==}
cpu: [loong64]
os: [linux]
+ libc: [musl]
'@rollup/rollup-linux-ppc64-gnu@4.55.1':
resolution: {integrity: sha512-3KhoECe1BRlSYpMTeVrD4sh2Pw2xgt4jzNSZIIPLFEsnQn9gAnZagW9+VqDqAHgm1Xc77LzJOo2LdigS5qZ+gw==}
cpu: [ppc64]
os: [linux]
+ libc: [glibc]
'@rollup/rollup-linux-ppc64-musl@4.55.1':
resolution: {integrity: sha512-ziR1OuZx0vdYZZ30vueNZTg73alF59DicYrPViG0NEgDVN8/Jl87zkAPu4u6VjZST2llgEUjaiNl9JM6HH1Vdw==}
cpu: [ppc64]
os: [linux]
+ libc: [musl]
'@rollup/rollup-linux-riscv64-gnu@4.55.1':
resolution: {integrity: sha512-uW0Y12ih2XJRERZ4jAfKamTyIHVMPQnTZcQjme2HMVDAHY4amf5u414OqNYC+x+LzRdRcnIG1YodLrrtA8xsxw==}
cpu: [riscv64]
os: [linux]
+ libc: [glibc]
'@rollup/rollup-linux-riscv64-musl@4.55.1':
resolution: {integrity: sha512-u9yZ0jUkOED1BFrqu3BwMQoixvGHGZ+JhJNkNKY/hyoEgOwlqKb62qu+7UjbPSHYjiVy8kKJHvXKv5coH4wDeg==}
cpu: [riscv64]
os: [linux]
+ libc: [musl]
'@rollup/rollup-linux-s390x-gnu@4.55.1':
resolution: {integrity: sha512-/0PenBCmqM4ZUd0190j7J0UsQ/1nsi735iPRakO8iPciE7BQ495Y6msPzaOmvx0/pn+eJVVlZrNrSh4WSYLxNg==}
cpu: [s390x]
os: [linux]
+ libc: [glibc]
'@rollup/rollup-linux-x64-gnu@4.55.1':
resolution: {integrity: sha512-a8G4wiQxQG2BAvo+gU6XrReRRqj+pLS2NGXKm8io19goR+K8lw269eTrPkSdDTALwMmJp4th2Uh0D8J9bEV1vg==}
cpu: [x64]
os: [linux]
+ libc: [glibc]
'@rollup/rollup-linux-x64-musl@4.55.1':
resolution: {integrity: sha512-bD+zjpFrMpP/hqkfEcnjXWHMw5BIghGisOKPj+2NaNDuVT+8Ds4mPf3XcPHuat1tz89WRL+1wbcxKY3WSbiT7w==}
cpu: [x64]
os: [linux]
+ libc: [musl]
'@rollup/rollup-openbsd-x64@4.55.1':
resolution: {integrity: sha512-eLXw0dOiqE4QmvikfQ6yjgkg/xDM+MdU9YJuP4ySTibXU0oAvnEWXt7UDJmD4UkYialMfOGFPJnIHSe/kdzPxg==}
@@ -1576,6 +1602,10 @@ packages:
outvariant@1.4.3:
resolution: {integrity: sha512-+Sl2UErvtsoajRDKCE5/dBz4DIvHXQQnAxtQTF04OJxY0+DyZXSo5P5Bb7XYWOh81syohlYL24hbDwxedPUJCA==}
+ p-limit@7.2.0:
+ resolution: {integrity: sha512-ATHLtwoTNDloHRFFxFJdHnG6n2WUeFjaR8XQMFdKIv0xkXjrER8/iG9iu265jOM95zXHAfv9oTkqhrfbIzosrQ==}
+ engines: {node: '>=20'}
+
partysocket@0.0.25:
resolution: {integrity: sha512-1oCGA65fydX/FgdnsiBh68buOvfxuteoZVSb3Paci2kRp/7lhF0HyA8EDb5X/O6FxId1e+usPTQNRuzFEvkJbQ==}
@@ -2006,6 +2036,10 @@ packages:
resolution: {integrity: sha512-4UEqdc2RYGHZc7Doyqkrqiln3p9X2DZVxaGbwhn2pi7MrRagKaOcIKe8L3OxYcbhXLgLFUS3zAYuQjKBQgmuNg==}
engines: {node: ^20.19.0 || ^22.12.0 || >=23}
+ yocto-queue@1.2.2:
+ resolution: {integrity: sha512-4LCcse/U2MHZ63HAJVE+v71o7yOdIe4cZ70Wpf8D/IyjDKYQLV5GD46B+hSTjJsvV5PztjvHoU580EftxjDZFQ==}
+ engines: {node: '>=12.20'}
+
yoctocolors-cjs@2.1.2:
resolution: {integrity: sha512-cYVsTjKl8b+FrnidjibDWskAv7UKOfcwaVZdp/it9n1s9fU3IkgDbhdIRKCW4JDsAlECJY0ytoVPT3sK6kideA==}
engines: {node: '>=18'}
@@ -3397,6 +3431,10 @@ snapshots:
outvariant@1.4.3:
optional: true
+ p-limit@7.2.0:
+ dependencies:
+ yocto-queue: 1.2.2
+
partysocket@0.0.25:
dependencies:
event-target-shim: 6.0.2
@@ -3878,6 +3916,8 @@ snapshots:
y18n: 5.0.8
yargs-parser: 22.0.0
+ yocto-queue@1.2.2: {}
+
yoctocolors-cjs@2.1.2:
optional: true
diff --git a/skills/workos-authkit-tanstack-start/SKILL.md b/skills/workos-authkit-tanstack-start/SKILL.md
index a98d7c6..5d0e531 100644
--- a/skills/workos-authkit-tanstack-start/SKILL.md
+++ b/skills/workos-authkit-tanstack-start/SKILL.md
@@ -12,9 +12,9 @@ description: Integrate WorkOS AuthKit with TanStack Start applications. Full-sta
├── Extract package name from install command
└── README is source of truth for ALL code patterns
-2. Verify TanStack Start project
- ├── @tanstack/start or @tanstack/react-start in package.json
- └── app.config.ts exists (vinxi)
+2. Detect directory structure
+ ├── src/ (TanStack Start v1.132+, default)
+ └── app/ (legacy vinxi-based projects)
3. Follow README install/setup exactly
└── Do not invent commands or patterns
@@ -28,7 +28,7 @@ WebFetch: `https://github.com/workos/authkit-tanstack-start/blob/main/README.md`
From README, extract:
-1. Package name from install command (e.g., `pnpm add @workos/...`)
+1. Package name: `@workos/authkit-tanstack-react-start`
2. Use that exact name for all imports
**README overrides this skill if conflict.**
@@ -37,9 +37,35 @@ From README, extract:
- [ ] README fetched and package name extracted
- [ ] `@tanstack/start` or `@tanstack/react-start` in package.json
-- [ ] `app.config.ts` exists
+- [ ] Identify directory structure: `src/` (modern) or `app/` (legacy)
- [ ] Environment variables set (see below)
+## Directory Structure Detection
+
+**Modern TanStack Start (v1.132+)** uses `src/`:
+```
+src/
+├── start.ts # Middleware config (CRITICAL)
+├── router.tsx # Router setup
+├── routes/
+│ ├── __root.tsx # Root layout
+│ ├── api.auth.callback.tsx # OAuth callback (flat route)
+│ └── ...
+```
+
+**Legacy (vinxi-based)** uses `app/`:
+```
+app/
+├── start.ts or router.tsx
+├── routes/
+│ └── api/auth/callback.tsx # OAuth callback (nested route)
+```
+
+**Detection:**
+```bash
+ls src/routes 2>/dev/null && echo "Modern (src/)" || echo "Legacy (app/)"
+```
+
## Environment Variables
| Variable | Format | Required |
@@ -51,56 +77,179 @@ From README, extract:
Generate password if missing: `openssl rand -base64 32`
+Default redirect URI: `http://localhost:3000/api/auth/callback`
+
## Middleware Configuration (CRITICAL)
-**authkitMiddleware MUST be configured or auth will fail.**
+**authkitMiddleware MUST be configured or auth will fail silently.**
-Find file with `createRouter` (typically `app/router.tsx` or `app.tsx`).
+Create or update `src/start.ts` (or `app/start.ts` for legacy):
+
+```typescript
+import { authkitMiddleware } from '@workos/authkit-tanstack-react-start';
+
+export default {
+ requestMiddleware: [authkitMiddleware()],
+};
+```
+
+Alternative pattern with createStart:
+```typescript
+import { createStart } from '@tanstack/react-start';
+import { authkitMiddleware } from '@workos/authkit-tanstack-react-start';
+
+export default createStart({
+ requestMiddleware: [authkitMiddleware()],
+});
+```
### Verification Checklist
-- [ ] `authkitMiddleware` imported from SDK package
-- [ ] `middleware: [authkitMiddleware()]` in createRouter config
-- [ ] Array syntax used: `[authkitMiddleware()]` not `authkitMiddleware()`
+- [ ] `authkitMiddleware` imported from `@workos/authkit-tanstack-react-start`
+- [ ] Middleware in `requestMiddleware` array
+- [ ] File exports the config (default export or named `startInstance`)
+
+Verify: `grep -r "authkitMiddleware" src/ app/ 2>/dev/null`
-Verify: `grep "authkitMiddleware" app/router.tsx app.tsx src/router.tsx`
+## Callback Route (CRITICAL)
-## Logout Route Pattern
+Path must match `WORKOS_REDIRECT_URI`. For `/api/auth/callback`:
-Logout requires `signOut()` followed by redirect in a route loader. See README for exact implementation.
+**Modern (flat routes):** `src/routes/api.auth.callback.tsx`
+**Legacy (nested routes):** `app/routes/api/auth/callback.tsx`
-## Callback Route
+```typescript
+import { createFileRoute } from '@tanstack/react-router';
+import { handleCallbackRoute } from '@workos/authkit-tanstack-react-start';
-Path must match `WORKOS_REDIRECT_URI`. If URI is `/api/auth/callback`:
+export const Route = createFileRoute('/api/auth/callback')({
+ server: {
+ handlers: {
+ GET: handleCallbackRoute(),
+ },
+ },
+});
+```
+
+**Key points:**
+- Use `handleCallbackRoute()` - do not write custom OAuth logic
+- Route path string must match the URI path exactly
+- This is a server-only route (no component needed)
+
+## Protected Routes
+
+Use `getAuth()` in route loaders to check authentication:
+
+```typescript
+import { createFileRoute, redirect } from '@tanstack/react-router';
+import { getAuth, getSignInUrl } from '@workos/authkit-tanstack-react-start';
+
+export const Route = createFileRoute('/dashboard')({
+ loader: async () => {
+ const { user } = await getAuth();
+ if (!user) {
+ const signInUrl = await getSignInUrl();
+ throw redirect({ href: signInUrl });
+ }
+ return { user };
+ },
+ component: Dashboard,
+});
+```
-- File: `app/routes/api/auth/callback.tsx`
-- Use `handleAuth()` from SDK - do not write custom OAuth logic
+## Sign Out Route
+
+```typescript
+import { createFileRoute, redirect } from '@tanstack/react-router';
+import { signOut } from '@workos/authkit-tanstack-react-start';
+
+export const Route = createFileRoute('/signout')({
+ loader: async () => {
+ await signOut();
+ throw redirect({ href: '/' });
+ },
+});
+```
+
+## Client-Side Hooks (Optional)
+
+Only needed if you want reactive auth state in components.
+
+**1. Add AuthKitProvider to root:**
+
+```typescript
+// src/routes/__root.tsx
+import { AuthKitProvider } from '@workos/authkit-tanstack-react-start/client';
+
+function RootComponent() {
+  return (
+    <AuthKitProvider>
+      <Outlet />
+    </AuthKitProvider>
+  );
+}
+```
+
+**2. Use hooks in components:**
+
+```typescript
+import { useAuth } from '@workos/authkit-tanstack-react-start/client';
+
+function Profile() {
+ const { user, isLoading } = useAuth();
+ // ...
+}
+```
+
+**Note:** Server-side `getAuth()` is preferred for most use cases.
## Error Recovery
### "AuthKit middleware is not configured"
-**Cause:** `authkitMiddleware()` not added to router
-**Fix:** Add `middleware: [authkitMiddleware()]` to createRouter config
-**Verify:** `grep "authkitMiddleware" app/router.tsx app.tsx`
+**Cause:** `authkitMiddleware()` not in start.ts
+**Fix:** Create/update `src/start.ts` with middleware config
+**Verify:** `grep -r "authkitMiddleware" src/ app/ 2>/dev/null`
### "Module not found" for SDK
**Cause:** Wrong package name or not installed
-**Fix:** Re-read README, extract correct package name, reinstall
-**Verify:** `ls node_modules/` + package name from README
+**Fix:** `pnpm add @workos/authkit-tanstack-react-start`
+**Verify:** `ls node_modules/@workos/authkit-tanstack-react-start`
### Callback 404
-**Cause:** Route path doesn't match WORKOS_REDIRECT_URI
-**Fix:** File path must mirror URI path under `app/routes/`
+**Cause:** Route file path doesn't match WORKOS_REDIRECT_URI
+**Fix:**
+- URI `/api/auth/callback` → file `src/routes/api.auth.callback.tsx` (flat) or `app/routes/api/auth/callback.tsx` (nested)
+- Route path string in `createFileRoute()` must match exactly
-### getAuth returns undefined
+### getAuth returns undefined user
-**Cause:** Middleware not configured
-**Fix:** Same as "AuthKit middleware not configured" above
+**Cause:** Middleware not configured or not running
+**Fix:** Ensure `authkitMiddleware()` is in start.ts requestMiddleware array
### "Cookie password too short"
**Cause:** WORKOS_COOKIE_PASSWORD < 32 chars
**Fix:** `openssl rand -base64 32`, update .env
+
+### Build fails with route type errors
+
+**Cause:** Route tree not regenerated after adding routes
+**Fix:** `pnpm dev` to regenerate `routeTree.gen.ts`
+
+## SDK Exports Reference
+
+**Server (main export):**
+- `authkitMiddleware()` - Request middleware
+- `handleCallbackRoute()` - OAuth callback handler
+- `getAuth()` - Get current session
+- `signOut()` - Sign out user
+- `getSignInUrl()` / `getSignUpUrl()` - Auth URLs
+- `switchToOrganization()` - Change org context
+
+**Client (`/client` subpath):**
+- `AuthKitProvider` - Context provider
+- `useAuth()` - Auth state hook
+- `useAccessToken()` - Token management
diff --git a/src/utils/exec-file.ts b/src/utils/exec-file.ts
new file mode 100644
index 0000000..38d5e46
--- /dev/null
+++ b/src/utils/exec-file.ts
@@ -0,0 +1,55 @@
+import { spawn } from 'node:child_process';
+
+/** Outcome of a command run through `execFileNoThrow`. */
+export interface ExecResult {
+ /** Exit code of the child; 1 when no exit code is reported or the child emits an `error` event. */
+ status: number;
+ /** Everything the child wrote to stdout, accumulated as a string. */
+ stdout: string;
+ /** Everything the child wrote to stderr; carries the error message when the child emits an `error` event. */
+ stderr: string;
+}
+
+/** Optional settings that `execFileNoThrow` forwards to `spawn`. */
+export interface ExecOptions {
+ /** Working directory for the child process. */
+ cwd?: string;
+ /** Forwarded as `spawn`'s `timeout` option (milliseconds, per the Node.js documentation). */
+ timeout?: number;
+ /** Environment for the child; `process.env` is used when omitted. */
+ env?: NodeJS.ProcessEnv;
+}
+
+/**
+ * Execute a command without throwing on non-zero exit codes.
+ * Returns { status, stdout, stderr } for all outcomes; the promise never rejects.
+ *
+ * The command is spawned directly (`shell: false`), so `args` are passed
+ * verbatim and are not subject to shell interpretation.
+ *
+ * @param command - Executable to run.
+ * @param args - Arguments passed to the executable.
+ * @param options - Optional cwd, timeout and environment for the child.
+ * @returns The exit status and captured output. `status` is 1 when the child
+ *   reports no exit code (e.g. it was terminated by a signal) or could not be
+ *   started; in the latter case `stderr` contains the error message.
+ */
+export function execFileNoThrow(command: string, args: string[], options: ExecOptions = {}): Promise<ExecResult> {
+  return new Promise<ExecResult>((resolve) => {
+    const child = spawn(command, args, {
+      cwd: options.cwd,
+      env: options.env ?? process.env,
+      timeout: options.timeout,
+      shell: false,
+    });
+
+    let stdout = '';
+    let stderr = '';
+    let settled = false;
+
+    // Both 'error' and 'close' can fire for the same child; only the first one counts.
+    const settle = (result: ExecResult): void => {
+      if (settled) return;
+      settled = true;
+      resolve(result);
+    };
+
+    // Decode at the stream level so a multi-byte UTF-8 character that straddles
+    // two chunks is not corrupted by per-chunk toString().
+    child.stdout?.setEncoding('utf8');
+    child.stderr?.setEncoding('utf8');
+
+    child.stdout?.on('data', (chunk: string) => {
+      stdout += chunk;
+    });
+
+    child.stderr?.on('data', (chunk: string) => {
+      stderr += chunk;
+    });
+
+    child.on('close', (code) => {
+      settle({ status: code ?? 1, stdout, stderr });
+    });
+
+    child.on('error', (err) => {
+      // Keep whatever the child already wrote to stderr and append the failure reason.
+      settle({
+        status: 1,
+        stdout,
+        stderr: stderr ? `${stderr}\n${err.message}` : err.message,
+      });
+    });
+  });
+}
diff --git a/tests/evals/README.md b/tests/evals/README.md
new file mode 100644
index 0000000..71009ee
--- /dev/null
+++ b/tests/evals/README.md
@@ -0,0 +1,175 @@
+# Installer Evaluations
+
+Automated evaluation framework for testing WorkOS AuthKit installer skills against realistic project scenarios.
+
+## Quick Start
+
+```bash
+# Run all evaluations
+pnpm eval
+
+# Run specific framework
+pnpm eval --framework=nextjs
+
+# Run specific scenario
+pnpm eval --framework=react --state=example-auth0
+```
+
+## Test Matrix
+
+The framework tests 10 scenarios (5 frameworks × 2 project states):
+
+| State | Description |
+| --------------- | ---------------------------------------------------- |
+| `example` | Project with routes, components, custom config |
+| `example-auth0` | Project with Auth0 authentication already integrated |
+
+| Framework | Skill | Key Checks |
+| ---------------- | ----------------------------- | ---------------------------------------------- |
+| `nextjs` | workos-authkit-nextjs | middleware.ts, callback route, AuthKitProvider |
+| `react` | workos-authkit-react | AuthKitProvider, callback component, useAuth |
+| `react-router` | workos-authkit-react-router | Auth loader, protected routes |
+| `tanstack-start` | workos-authkit-tanstack-start | Server functions, callback route |
+| `vanilla-js` | workos-authkit-vanilla-js | Auth script, callback page |
+
+## CLI Options
+
+```
+--framework=<name>  Filter by framework
+--state=<state>     Filter by project state
+--verbose, -v       Show agent tool calls and detailed output
+--debug             Extra verbose, preserve temp dirs on failure
+--keep-on-fail      Don't cleanup temp directory when scenario fails
+--retry=<n>         Number of retry attempts (default: 2)
+--no-retry          Disable retries
+--json              Output results as JSON
+--help, -h          Show help
+```
+
+## Debugging Failures
+
+### 1. Inspect the failure details
+
+```bash
+pnpm eval --framework=react --state=example-auth0 --verbose
+```
+
+### 2. Preserve the temp directory
+
+```bash
+pnpm eval --framework=react --state=example-auth0 --keep-on-fail
+# Output will show: "Temp directory preserved: /tmp/eval-react-xxxxx"
+```
+
+### 3. Manually inspect the project state
+
+```bash
+cd /tmp/eval-react-xxxxx
+ls -la
+cat middleware.ts
+```
+
+### 4. Compare with previous runs
+
+```bash
+# List recent runs
+pnpm eval:history
+
+# Compare two runs
+pnpm eval:compare 2024-01-15T10-30-00 2024-01-16T14-45-00
+```
+
+## Adding a New Fixture
+
+1. Create directory: `tests/fixtures/{framework}/{state}/`
+
+2. Add minimal project files:
+ - `package.json` with dependencies
+ - `tsconfig.json` (if TypeScript)
+ - Framework config file
+ - Basic app structure
+
+3. Verify fixture works standalone:
+
+ ```bash
+ cd tests/fixtures/{framework}/{state}
+ pnpm install
+ pnpm build
+ ```
+
+4. Add scenario to `tests/evals/runner.ts` SCENARIOS array
+
+## Adding/Modifying Graders
+
+Graders live in `tests/evals/graders/{framework}.grader.ts`.
+
+Each grader implements:
+
+```typescript
+interface Grader {
+  grade(): Promise<{ passed: boolean; checks: GradeCheck[] }>;
+}
+```
+
+Use the helper classes:
+
+- `FileGrader` - Check file existence and content patterns
+- `BuildGrader` - Run build commands and check exit codes
+
+Example:
+
+```typescript
+const checks: GradeCheck[] = [];
+
+// File must exist
+checks.push(await this.fileGrader.checkFileExists('middleware.ts'));
+
+// File must contain patterns
+checks.push(
+ ...(await this.fileGrader.checkFileContains('middleware.ts', ['@workos-inc/authkit', 'authkitMiddleware'])),
+);
+
+// Build must succeed
+checks.push(await this.buildGrader.checkBuild());
+
+return { passed: checks.every((c) => c.passed), checks };
+```
+
+## Results Storage
+
+Results are saved to `tests/eval-results/`:
+
+- Each run creates `{timestamp}.json`
+- `latest.json` symlinks to most recent
+- Use `pnpm eval:history` to list runs
+- Use `pnpm eval:compare` to diff runs
+
+## Troubleshooting
+
+### "pnpm install failed"
+
+The fixture's dependencies may have version conflicts. Check:
+
+```bash
+cd tests/fixtures/{framework}/{state}
+pnpm install
+```
+
+### "Build failed" but files look correct
+
+The agent may have created correct files but with syntax errors. Use `--keep-on-fail` to inspect:
+
+```bash
+pnpm eval --framework=nextjs --keep-on-fail
+# Then run build manually in temp dir to see full error
+```
+
+### Flaky passes/failures
+
+LLM responses vary. Use `--retry=3` for more attempts:
+
+```bash
+pnpm eval --retry=3
+```
+
+If a scenario is consistently flaky, check if the skill instructions are ambiguous.
diff --git a/tests/evals/agent-executor.ts b/tests/evals/agent-executor.ts
new file mode 100644
index 0000000..897d2e6
--- /dev/null
+++ b/tests/evals/agent-executor.ts
@@ -0,0 +1,184 @@
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { Integration } from '../../src/lib/constants.js';
+import { loadCredentials } from './env-loader.js';
+import { writeEnvLocal } from '../../src/lib/env-writer.js';
+import { getConfig } from '../../src/lib/settings.js';
+import type { ToolCall } from './types.js';
+
+/** Result of one `AgentExecutor.run()` invocation. */
+export interface AgentResult {
+ /** True when the agent message stream was consumed without an exception; false when one was caught. */
+ success: boolean;
+ /** Assistant text blocks (and any `Error: ...` lines from the result message) joined with newlines. */
+ output: string;
+ /** Every `tool_use` block seen in assistant messages, in arrival order. */
+ toolCalls: ToolCall[];
+ /** Message of the caught exception; only set when `success` is false. */
+ error?: string;
+}
+
+/** Optional behaviour switches for `AgentExecutor`. */
+export interface AgentExecutorOptions {
+ /** When true, progress (initialisation, truncated agent text, tool names) is logged to the console. */
+ verbose?: boolean;
+ /** When set, console lines are prefixed with `[scenarioName]`. */
+ scenarioName?: string;
+}
+
+// Skill name mapping for each framework.
+// NOTE(review): typed as Record<string, string> because the full set of
+// `Integration` members is not visible here; tighten to
+// Record<Integration, string> if these five are the only members.
+const SKILL_NAMES: Record<string, string> = {
+  [Integration.nextjs]: 'workos-authkit-nextjs',
+  [Integration.react]: 'workos-authkit-react',
+  [Integration.reactRouter]: 'workos-authkit-react-router',
+  [Integration.tanstackStart]: 'workos-authkit-tanstack-start',
+  [Integration.vanillaJs]: 'workos-authkit-vanilla-js',
+};
+
+/**
+ * Drives a single agent run against a prepared project directory.
+ *
+ * Construction loads credentials via `loadCredentials()`. `run()` writes the
+ * WorkOS credentials into `.env.local` inside `workDir`, builds a prompt that
+ * names the framework's skill, streams the Claude Agent SDK `query()` response
+ * and collects the assistant text and tool calls it sees.
+ */
+export class AgentExecutor {
+  private options: AgentExecutorOptions;
+  private credentials: ReturnType<typeof loadCredentials>;
+
+  /**
+   * @param workDir - Directory the agent operates in (used as the SDK `cwd`).
+   * @param framework - Framework key, e.g. `nextjs` or `react-router`.
+   * @param options - Logging options; see `AgentExecutorOptions`.
+   */
+  constructor(
+    private workDir: string,
+    private framework: string,
+    options: AgentExecutorOptions = {},
+  ) {
+    this.options = options;
+    this.credentials = loadCredentials();
+  }
+
+  /**
+   * Run the agent once.
+   *
+   * @returns `success: false` with an `error` message when the framework is
+   *   unknown or when the SDK import/stream throws; otherwise `success: true`
+   *   with the collected output and tool calls.
+   */
+  async run(): Promise<AgentResult> {
+    const toolCalls: ToolCall[] = [];
+    const collectedOutput: string[] = [];
+
+    const integration = this.getIntegration();
+    if (integration === undefined) {
+      // Fail fast instead of prompting the agent to use a skill named "undefined".
+      return {
+        success: false,
+        output: '',
+        toolCalls,
+        error: `Unknown framework: ${this.framework}`,
+      };
+    }
+
+    const label = this.options.scenarioName ? `[${this.options.scenarioName}]` : '';
+    if (this.options.verbose) {
+      console.log(`${label} Initializing agent for ${integration}...`);
+    }
+
+    // Write .env.local with credentials (agent configures redirect URI per framework)
+    writeEnvLocal(this.workDir, {
+      WORKOS_API_KEY: this.credentials.workosApiKey,
+      WORKOS_CLIENT_ID: this.credentials.workosClientId,
+    });
+
+    // Build prompt
+    const skillName = SKILL_NAMES[integration];
+    const prompt = this.buildPrompt(skillName);
+
+    // Initialize and run agent
+    try {
+      const { query } = await import('@anthropic-ai/claude-agent-sdk');
+
+      // Build SDK environment for direct mode
+      const sdkEnv: Record<string, string | undefined> = {
+        ...process.env,
+        ANTHROPIC_API_KEY: this.credentials.anthropicApiKey,
+        CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: 'true',
+        CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: 'true',
+      };
+      // Remove gateway config to use direct API
+      delete sdkEnv.ANTHROPIC_BASE_URL;
+      delete sdkEnv.ANTHROPIC_AUTH_TOKEN;
+
+      // Get plugin path for skills (two levels above this file)
+      const __filename = fileURLToPath(import.meta.url);
+      const __dirname = path.dirname(__filename);
+      const pluginPath = path.join(__dirname, '../..');
+
+      const response = query({
+        prompt: prompt,
+        options: {
+          model: getConfig().model,
+          cwd: this.workDir,
+          permissionMode: 'acceptEdits',
+          mcpServers: {
+            workos: {
+              command: 'npx',
+              args: ['-y', '@workos/mcp-docs-server'],
+            },
+          },
+          env: sdkEnv,
+          tools: { type: 'preset', preset: 'claude_code' },
+          allowedTools: ['Skill', 'Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep', 'WebFetch'],
+          plugins: [{ type: 'local', path: pluginPath }],
+        },
+      });
+
+      // Process message stream
+      for await (const message of response) {
+        this.handleMessage(message, toolCalls, collectedOutput, label);
+      }
+
+      // NOTE(review): a result message with a non-success subtype only adds an
+      // "Error: ..." line to the output; `success` is still true here. Confirm
+      // that callers do not rely on `success` to detect such runs.
+      return {
+        success: true,
+        output: collectedOutput.join('\n'),
+        toolCalls,
+      };
+    } catch (error) {
+      return {
+        success: false,
+        output: collectedOutput.join('\n'),
+        toolCalls,
+        error: error instanceof Error ? error.message : String(error),
+      };
+    }
+  }
+
+  /** Build the agent prompt that names the framework, working directory and skill to invoke. */
+  private buildPrompt(skillName: string): string {
+    return `You are integrating WorkOS AuthKit into this application.
+
+## Project Context
+- Framework: ${this.framework}
+- Working directory: ${this.workDir}
+
+## Environment
+The following environment variables have been configured in .env.local:
+- WORKOS_API_KEY
+- WORKOS_CLIENT_ID
+
+## Your Task
+Use the \`${skillName}\` skill to integrate WorkOS AuthKit into this application.
+
+Begin by invoking the ${skillName} skill.`;
+  }
+
+  /**
+   * Record the interesting parts of one streamed SDK message.
+   *
+   * Assistant `text` blocks are appended to `collectedOutput`, `tool_use`
+   * blocks to `toolCalls`; a non-success `result` message with errors adds an
+   * `Error: ...` line to `collectedOutput`. Other messages are ignored.
+   */
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- the SDK message type is not imported in this file
+  private handleMessage(message: any, toolCalls: ToolCall[], collectedOutput: string[], label: string): void {
+    if (message.type === 'assistant') {
+      const content = message.message?.content;
+      if (Array.isArray(content)) {
+        for (const block of content) {
+          // Capture text output
+          if (block.type === 'text' && typeof block.text === 'string') {
+            collectedOutput.push(block.text);
+            if (this.options.verbose) {
+              // Only the first 100 characters are echoed to keep the console readable.
+              console.log(`${label} Agent: ${block.text.slice(0, 100)}...`);
+            }
+          }
+          // Capture tool calls
+          if (block.type === 'tool_use') {
+            const call: ToolCall = {
+              tool: block.name,
+              input: block.input as Record<string, unknown>,
+            };
+            toolCalls.push(call);
+            if (this.options.verbose) {
+              console.log(`${label} Tool: ${block.name}`);
+            }
+          }
+        }
+      }
+    }
+
+    if (message.type === 'result') {
+      if (message.subtype !== 'success' && message.errors?.length > 0) {
+        collectedOutput.push(`Error: ${message.errors.join(', ')}`);
+      }
+    }
+  }
+
+  /** Map the framework key to its `Integration` value, or `undefined` when the key is not recognised. */
+  private getIntegration(): Integration | undefined {
+    const map: Record<string, Integration | undefined> = {
+      nextjs: Integration.nextjs,
+      react: Integration.react,
+      'react-router': Integration.reactRouter,
+      'tanstack-start': Integration.tanstackStart,
+      'vanilla-js': Integration.vanillaJs,
+    };
+    return map[this.framework];
+  }
+}
diff --git a/tests/evals/cli.ts b/tests/evals/cli.ts
new file mode 100644
index 0000000..c248144
--- /dev/null
+++ b/tests/evals/cli.ts
@@ -0,0 +1,155 @@
+/** Parsed command-line options for the eval CLI, as produced by `parseArgs`. */
+export interface CliOptions {
+ /** Framework filter from `--framework=`; validated against the known framework list. */
+ framework?: string;
+ /** Project-state filter from `--state=`; validated against the known state list. */
+ state?: string;
+ /** Set by `--verbose` / `-v`, and also by `--debug`. */
+ verbose: boolean;
+ /** Set by `--debug`. */
+ debug: boolean;
+ /** Set by `--json`. */
+ json: boolean;
+ /** Set by `--help` / `-h`. */
+ help: boolean;
+ /** Set by `--keep`. */
+ keep: boolean;
+ /** Set by `--keep-on-fail`, and also by `--debug`. */
+ keepOnFail: boolean;
+ /** Retry attempts; defaults to 2, overridden by `--retry=`, forced to 0 by `--no-retry`. */
+ retry: number;
+ /** Set by `--no-retry`. */
+ noRetry: boolean;
+ /** Set by `--sequential`. */
+ sequential: boolean;
+ /** Set by `--no-dashboard`. */
+ noDashboard: boolean;
+ /** Selected subcommand; `run` unless a subcommand is given as the first argument. */
+ command?: 'run' | 'history' | 'compare' | 'logs' | 'show';
+ /** The two run IDs following `compare`. */
+ compareIds?: [string, string];
+ /** The argument following `show`. */
+ logFile?: string;
+}
+
+/** Framework identifiers accepted by `--framework=`. */
+const FRAMEWORKS = ['nextjs', 'react', 'react-router', 'tanstack-start', 'vanilla-js'];
+/** Project-state identifiers accepted by `--state=`. */
+const STATES = ['example', 'example-auth0'];
+
+/**
+ * Parse the eval CLI arguments (without the node/script prefix).
+ *
+ * A subcommand (`history`, `compare`, `logs`, `show`) is recognised only as the
+ * first argument and returns immediately; anything else is treated as a `run`
+ * with flags. Unrecognised flags are ignored.
+ *
+ * @param args - Raw arguments, e.g. `process.argv.slice(2)`.
+ * @returns The parsed options with defaults filled in.
+ * @throws Error when `compare` lacks two run IDs, `show` lacks a log file,
+ *   `--retry=` is not a non-negative integer, or `--framework=` / `--state=`
+ *   name an unknown value.
+ */
+export function parseArgs(args: string[]): CliOptions {
+  const options: CliOptions = {
+    verbose: false,
+    debug: false,
+    json: false,
+    help: false,
+    keep: false,
+    keepOnFail: false,
+    retry: 2,
+    noRetry: false,
+    sequential: false,
+    noDashboard: false,
+  };
+
+  // Everything after the first '=' is the value, so values may themselves contain '='.
+  const valueOf = (arg: string): string => arg.slice(arg.indexOf('=') + 1);
+
+  // Check for subcommands
+  const [subcommand, firstOperand, secondOperand] = args;
+
+  if (subcommand === 'history') {
+    options.command = 'history';
+    return options;
+  }
+
+  if (subcommand === 'compare') {
+    // Do not fall through to a full (and expensive) eval run when IDs are missing.
+    if (firstOperand === undefined || secondOperand === undefined) {
+      throw new Error('compare requires two run IDs. Usage: pnpm eval:compare <id1> <id2>');
+    }
+    options.command = 'compare';
+    options.compareIds = [firstOperand, secondOperand];
+    return options;
+  }
+
+  if (subcommand === 'logs') {
+    options.command = 'logs';
+    return options;
+  }
+
+  if (subcommand === 'show') {
+    if (!firstOperand) {
+      throw new Error('show requires a log file. Usage: pnpm eval:show <log-file>');
+    }
+    options.command = 'show';
+    options.logFile = firstOperand;
+    return options;
+  }
+
+  options.command = 'run';
+
+  for (const arg of args) {
+    if (arg === '--help' || arg === '-h') {
+      options.help = true;
+    } else if (arg === '--verbose' || arg === '-v') {
+      options.verbose = true;
+    } else if (arg === '--debug') {
+      // --debug implies verbose output and keeping failed temp directories.
+      options.debug = true;
+      options.verbose = true;
+      options.keepOnFail = true;
+    } else if (arg === '--json') {
+      options.json = true;
+    } else if (arg === '--keep') {
+      options.keep = true;
+    } else if (arg === '--keep-on-fail') {
+      options.keepOnFail = true;
+    } else if (arg === '--no-retry') {
+      options.noRetry = true;
+    } else if (arg.startsWith('--retry=')) {
+      const raw = valueOf(arg);
+      if (!/^\d+$/.test(raw)) {
+        throw new Error(`Invalid retry count: ${raw}. Expected a non-negative integer.`);
+      }
+      options.retry = Number.parseInt(raw, 10);
+    } else if (arg.startsWith('--framework=')) {
+      const framework = valueOf(arg);
+      if (!FRAMEWORKS.includes(framework)) {
+        throw new Error(`Unknown framework: ${framework}. Valid: ${FRAMEWORKS.join(', ')}`);
+      }
+      options.framework = framework;
+    } else if (arg.startsWith('--state=')) {
+      const state = valueOf(arg);
+      if (!STATES.includes(state)) {
+        throw new Error(`Unknown state: ${state}. Valid: ${STATES.join(', ')}`);
+      }
+      options.state = state;
+    } else if (arg === '--sequential') {
+      options.sequential = true;
+    } else if (arg === '--no-dashboard') {
+      options.noDashboard = true;
+    }
+  }
+
+  // --no-retry wins regardless of where it appears relative to --retry=.
+  if (options.noRetry) {
+    options.retry = 0;
+  }
+
+  return options;
+}
+
+/**
+ * Print CLI usage to stdout: commands, options (with the valid framework and
+ * state values interpolated from FRAMEWORKS and STATES) and examples.
+ */
+export function printHelp(): void {
+  console.log(`
+Usage: pnpm eval [command] [options]
+
+Commands:
+  run (default)          Run evaluations
+  history                List recent eval runs
+  compare <id1> <id2>    Compare two eval runs
+  logs                   List recent detailed log files
+  show <file>            Display formatted log summary
+
+Options:
+  --framework=<name>     Run only scenarios for this framework
+                         Valid: ${FRAMEWORKS.join(', ')}
+
+  --state=<state>        Run only scenarios for this project state
+                         Valid: ${STATES.join(', ')}
+
+  --verbose, -v          Show detailed output including agent tool calls
+
+  --debug                Extra verbose, preserve temp dirs on failure
+
+  --keep                 Always preserve temp directory (for manual testing)
+
+  --keep-on-fail         Don't cleanup temp directory when scenario fails
+
+  --retry=<n>            Number of retry attempts (default: 2)
+
+  --no-retry             Disable retries
+
+  --sequential           Run scenarios sequentially (disable parallelism)
+
+  --no-dashboard         Disable live dashboard, use sequential logging
+
+  --json                 Output results as JSON (for scripting)
+
+  --help, -h             Show this help message
+
+Examples:
+  pnpm eval                                   # Run all 10 scenarios
+  pnpm eval --framework=nextjs                # Run only Next.js scenarios
+  pnpm eval --state=example                   # Run only example app scenarios
+  pnpm eval --framework=react --state=example-auth0
+                                              # Run specific scenario
+  pnpm eval --debug                           # Verbose output, keep failed dirs
+  pnpm eval --retry=3                         # More retry attempts
+  pnpm eval:history                           # List recent runs
+  pnpm eval:compare <id1> <id2>               # Compare two runs
+`);
+}
diff --git a/tests/evals/concurrency.ts b/tests/evals/concurrency.ts
new file mode 100644
index 0000000..a69e7f9
--- /dev/null
+++ b/tests/evals/concurrency.ts
@@ -0,0 +1,25 @@
+import os from 'node:os';
+
+/** Concurrency decision returned by `detectConcurrency`. */
+export interface ConcurrencyInfo {
+ /** Number of entries reported by `os.cpus()`. */
+ detected: number;
+ /** Concurrency to use: `detected - 1`, clamped to the range 2..8. */
+ effective: number;
+ /** Human-readable explanation of how `effective` was chosen. */
+ reason: string;
+}
+
+/**
+ * Choose how many scenarios to run in parallel based on the CPU count.
+ *
+ * @returns The detected CPU count, the effective concurrency (CPU count minus
+ *   one, never below 2 and never above 8) and a short reason string.
+ */
+export function detectConcurrency(): ConcurrencyInfo {
+  const detected = os.cpus().length;
+
+  // Leave 1 core for system, cap at 8 to avoid Claude SDK rate limits
+  const capped = Math.min(detected - 1, 8);
+  const effective = Math.max(2, capped);
+
+  let reason: string;
+  if (detected <= 2) {
+    reason = 'Low core count, using minimum concurrency';
+  } else if (detected > 8) {
+    reason = 'Capped at 8 to avoid rate limits';
+  } else {
+    reason = 'Using CPU cores minus 1';
+  }
+
+  return { detected, effective, reason };
+}
diff --git a/tests/evals/dashboard/EvalDashboard.tsx b/tests/evals/dashboard/EvalDashboard.tsx
new file mode 100644
index 0000000..d5ad2e9
--- /dev/null
+++ b/tests/evals/dashboard/EvalDashboard.tsx
@@ -0,0 +1,130 @@
+import React, { useState, useEffect } from 'react';
+import { Box, Text, useApp } from 'ink';
+import {
+ evalEvents,
+ type ScenarioStartEvent,
+ type ScenarioCompleteEvent,
+ type RunProgressEvent,
+} from '../events.js';
+import { Header } from './Header.js';
+import { ScenarioRow } from './ScenarioRow.js';
+
+/**
+ * Status record for a single scenario.
+ * NOTE(review): presumably the per-row state tracked by the dashboard; the code
+ * that reads and writes it is outside this view, so field semantics (including
+ * the units of `duration` and `startTime`) should be confirmed there.
+ */
+interface ScenarioStatus {
+ scenario: string;
+ framework: string;
+ state: string;
+ status: 'pending' | 'running' | 'retrying' | 'passed' | 'failed';
+ attempt: number;
+ duration?: number;
+ error?: string;
+ startTime?: number;
+}
+
+/** Props accepted by the `EvalDashboard` component. */
+interface DashboardProps {
+ /** Framework/state pairs, one per scenario. */
+ scenarios: Array<{ framework: string; state: string }>;
+ /** NOTE(review): presumably the number of scenarios run in parallel — confirm against the caller. */
+ concurrency: number;
+}
+
+export function EvalDashboard({ scenarios, concurrency }: DashboardProps) {
+ const { exit } = useApp();
+
+ const [statuses, setStatuses] = useState