Add unit tests for evalbuff (35 tests across 4 files)

jahooma · claude · jahooma · commit f411a3f1b6e2 · 2026-03-26T11:30:39.000-07:00
Tests for criteria (promotion logic, level accumulation), docs-optimizer
(apply/overwrite/reject/AGENTS.md creation, compareScores, readCurrentDocs),
cli-runner (happy path, diff capture, crash, timeout, CLI not found),
and morning-report (normal/empty/error reports, score trajectory, JSONL append).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/evals/evalbuff/__tests__/cli-runner.test.ts b/evals/evalbuff/__tests__/cli-runner.test.ts
@@ -0,0 +1,107 @@
+import fs from 'fs'
+import os from 'os'
+import path from 'path'
+import { execSync } from 'child_process'
+
+import { afterEach, beforeEach, describe, expect, it } from 'bun:test'
+
+import { runCliAgent } from '../cli-runner'
+
+let tmpDir: string
+
+beforeEach(() => {
+  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-cli-test-'))
+  // Initialize a git repo so git diff works. The commit identity is passed inline
+  // via `git -c`, so setup does not depend on a global user.name/user.email.
+  const identity = '-c user.name=evalbuff-test -c user.email=evalbuff-test@example.com'
+  const initRepo = `git init && git add . && git ${identity} commit --allow-empty -m "init"`
+  execSync(initRepo, { cwd: tmpDir, stdio: 'ignore' })
+})
+
+afterEach(() => {
+  fs.rmSync(tmpDir, { recursive: true, force: true })
+})
+
+describe('runCliAgent', () => {
+  it('happy path: captures stdout and exit code 0', async () => {
+    const result = await runCliAgent({
+      command: 'echo',
+      prompt: 'hello world',
+      cwd: tmpDir,
+      timeoutMs: 10_000,
+    })
+
+    expect(result.exitCode).toBe(0)
+    expect(result.stdout.trim()).toBe('hello world')
+    expect(result.durationMs).toBeGreaterThan(0)
+  })
+
+  it('captures git diff when agent creates a file', async () => {
+    // Stand-in agent: an executable bash script that writes "new content" to newfile.txt
+    const scriptPath = path.join(tmpDir, 'agent.sh')
+    fs.writeFileSync(
+      scriptPath,
+      '#!/bin/bash\necho "new content" > newfile.txt\n',
+    )
+    fs.chmodSync(scriptPath, '755')
+
+    const result = await runCliAgent({
+      command: scriptPath,
+      prompt: 'create a file',
+      cwd: tmpDir,
+      timeoutMs: 10_000,
+    })
+
+    expect(result.exitCode).toBe(0)
+    expect(result.diff).toContain('newfile.txt')
+    expect(result.diff).toContain('new content')
+  })
+
+  it('handles agent crash with non-zero exit code', async () => {
+    const result = await runCliAgent({
+      command: 'bash -c',
+      prompt: 'exit 42',
+      cwd: tmpDir,
+      timeoutMs: 10_000,
+    })
+
+    expect(result.exitCode).toBe(42)
+  })
+
+  it('returns empty diff when agent makes no changes', async () => {
+    const result = await runCliAgent({
+      command: 'echo',
+      prompt: 'do nothing',
+      cwd: tmpDir,
+      timeoutMs: 10_000,
+    })
+
+    expect(result.diff).toBe('')
+  })
+
+  it('rejects when agent CLI is not found', async () => {
+    const promise = runCliAgent({
+      command: 'nonexistent-agent-binary-xyz',
+      prompt: 'test',
+      cwd: tmpDir,
+      timeoutMs: 10_000,
+    })
+
+    await expect(promise).rejects.toThrow('CLI agent failed to start')
+    await expect(promise).rejects.toThrow('nonexistent-agent-binary-xyz')
+  })
+
+  it('kills agent on timeout', async () => {
+    const result = await runCliAgent({
+      command: 'sleep',
+      prompt: '30',
+      cwd: tmpDir,
+      timeoutMs: 500, // far shorter than the requested 30s sleep, so the timeout has to fire
+    })
+
+    // Returning in under 5s shows the sleep was cut short instead of running its full 30s
+    expect(result.durationMs).toBeLessThan(5000)
+    // Only "non-zero" is asserted; a signal kill gives a null exit code, presumably mapped to 1 by the runner (confirm in cli-runner)
+    expect(result.exitCode).not.toBe(0)
+  })
+})
diff --git a/evals/evalbuff/__tests__/criteria.test.ts b/evals/evalbuff/__tests__/criteria.test.ts
@@ -0,0 +1,111 @@
+import { describe, expect, it } from 'bun:test'
+
+import {
+  formatCriteriaForPrompt,
+  getCriteriaForLevel,
+  maybePromoteCriteria,
+} from '../criteria'
+
+import type { QualityCriteria } from '../criteria'
+
+// Test fixture builder: the criteria list comes from getCriteriaForLevel(level);
+// threshold and window (defaults 8.0 and 10) populate the promotion* fields.
+function makeCriteria(
+  level: number,
+  threshold = 8.0,
+  window = 10,
+): QualityCriteria {
+  const criteria = getCriteriaForLevel(level)
+  const promotionThreshold = threshold
+  const promotionWindow = window
+  return { level, criteria, promotionThreshold, promotionWindow }
+}
+
+describe('getCriteriaForLevel', () => {
+  it('returns only L1 criteria at level 1', () => {
+    const criteria = getCriteriaForLevel(1)
+    expect(criteria).toHaveLength(3)
+    expect(criteria.map((c) => c.name)).toEqual([
+      'Correctness',
+      'Completeness',
+      'Basic Style',
+    ])
+  })
+
+  it('accumulates criteria up to level 3', () => {
+    const criteria = getCriteriaForLevel(3)
+    expect(criteria.map((c) => c.name)).toEqual([
+      'Correctness',
+      'Completeness',
+      'Basic Style',
+      'Pattern Consistency',
+      'Test Quality',
+    ])
+  })
+
+  it('includes all criteria at level 5', () => {
+    const criteria = getCriteriaForLevel(5)
+    expect(criteria).toHaveLength(7)
+    expect(criteria[criteria.length - 1].name).toBe('Fluency')
+  })
+
+  it('caps at level 5 even if higher number passed', () => {
+    const criteria = getCriteriaForLevel(10)
+    expect(criteria).toHaveLength(7)
+  })
+})
+
+describe('maybePromoteCriteria', () => {
+  it('promotes when avg above threshold over window', () => {
+    const criteria = makeCriteria(1, 8.0, 5)
+    const scores = [8.5, 9.0, 8.2, 8.8, 8.6]
+    const newLevel = maybePromoteCriteria(criteria, scores)
+    expect(newLevel).toBe(2)
+  })
+
+  it('does NOT promote when avg below threshold', () => {
+    const criteria = makeCriteria(1, 8.0, 5)
+    const scores = [7.0, 6.5, 8.0, 7.5, 7.0]
+    const newLevel = maybePromoteCriteria(criteria, scores)
+    expect(newLevel).toBe(1)
+  })
+
+  it('does NOT promote when already at max level (5)', () => {
+    const criteria = makeCriteria(5, 8.0, 3)
+    const scores = [9.0, 9.5, 9.0]
+    const newLevel = maybePromoteCriteria(criteria, scores)
+    expect(newLevel).toBe(5)
+  })
+
+  it('does NOT promote when fewer iterations than window size', () => {
+    const criteria = makeCriteria(1, 8.0, 10)
+    const scores = [9.0, 9.5, 9.0]
+    const newLevel = maybePromoteCriteria(criteria, scores)
+    expect(newLevel).toBe(1)
+  })
+
+  it('uses only the last N scores in the window', () => {
+    const criteria = makeCriteria(2, 8.0, 3)
+    // Old scores are low, but last 3 are high
+    const scores = [3.0, 4.0, 5.0, 8.5, 9.0, 8.5]
+    const newLevel = maybePromoteCriteria(criteria, scores)
+    expect(newLevel).toBe(3)
+  })
+})
+
+describe('formatCriteriaForPrompt', () => {
+  it('includes level and all criteria names', () => {
+    const criteria = makeCriteria(2)
+    const prompt = formatCriteriaForPrompt(criteria)
+    expect(prompt).toContain('Level 2/5')
+    expect(prompt).toContain('Correctness')
+    expect(prompt).toContain('Pattern Consistency')
+  })
+
+  it('includes weights', () => {
+    const criteria = makeCriteria(1)
+    const prompt = formatCriteriaForPrompt(criteria)
+    expect(prompt).toContain('weight: 3')
+    expect(prompt).toContain('weight: 1')
+  })
+})
diff --git a/evals/evalbuff/__tests__/docs-optimizer.test.ts b/evals/evalbuff/__tests__/docs-optimizer.test.ts
@@ -0,0 +1,126 @@
+import fs from 'fs'
+import os from 'os'
+import path from 'path'
+
+import { afterEach, beforeEach, describe, expect, it } from 'bun:test'
+
+import { applyDocEdit, compareScores, readCurrentDocs } from '../docs-optimizer'
+
+let tmpDir: string
+
+beforeEach(() => {
+  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-test-'))
+})
+
+afterEach(() => {
+  fs.rmSync(tmpDir, { recursive: true, force: true })
+})
+
+describe('applyDocEdit', () => {
+  it('creates new file under docs/ and updates AGENTS.md TOC', () => {
+    const result = applyDocEdit(
+      tmpDir,
+      'patterns/error-handling.md',
+      '# Error Handling\n\nAlways use try/catch.',
+    )
+    expect(result).toBe(true)
+
+    const docPath = path.join(tmpDir, 'docs', 'patterns', 'error-handling.md')
+    expect(fs.existsSync(docPath)).toBe(true)
+    expect(fs.readFileSync(docPath, 'utf-8')).toContain('Error Handling')
+
+    const agentsMd = fs.readFileSync(
+      path.join(tmpDir, 'AGENTS.md'),
+      'utf-8',
+    )
+    expect(agentsMd).toContain('docs/patterns/error-handling.md')
+  })
+
+  it('overwrites existing file content', () => {
+    // Create initial doc
+    applyDocEdit(tmpDir, 'conventions/naming.md', 'Original content')
+
+    // Overwrite
+    applyDocEdit(tmpDir, 'conventions/naming.md', 'Updated content')
+
+    const content = fs.readFileSync(
+      path.join(tmpDir, 'docs', 'conventions', 'naming.md'),
+      'utf-8',
+    )
+    expect(content).toBe('Updated content')
+  })
+
+  it('does not duplicate AGENTS.md entry on overwrite', () => {
+    applyDocEdit(tmpDir, 'test.md', 'v1')
+    applyDocEdit(tmpDir, 'test.md', 'v2')
+
+    const agentsMd = fs.readFileSync(
+      path.join(tmpDir, 'AGENTS.md'),
+      'utf-8',
+    )
+    // An entry "- [docs/test.md](docs/test.md)" contains the path twice, so match only the "- [path]" link text to count entries
+    const entryMatches = agentsMd.match(/- \[docs\/test\.md\]/g)
+    expect(entryMatches).toHaveLength(1)
+  })
+
+  it('rejects path starting with /', () => {
+    const result = applyDocEdit(tmpDir, '/etc/passwd', 'bad')
+    expect(result).toBe(false)
+  })
+
+  it('rejects path with ..', () => {
+    const result = applyDocEdit(tmpDir, '../outside/file.md', 'bad')
+    expect(result).toBe(false)
+  })
+
+  it('creates AGENTS.md if it does not exist', () => {
+    expect(fs.existsSync(path.join(tmpDir, 'AGENTS.md'))).toBe(false)
+    applyDocEdit(tmpDir, 'new-doc.md', 'content')
+    expect(fs.existsSync(path.join(tmpDir, 'AGENTS.md'))).toBe(true)
+
+    const agentsMd = fs.readFileSync(
+      path.join(tmpDir, 'AGENTS.md'),
+      'utf-8',
+    )
+    expect(agentsMd).toContain('# Documentation')
+    expect(agentsMd).toContain('docs/new-doc.md')
+  })
+})
+
+describe('compareScores', () => {
+  it('returns improved when new > old', () => {
+    expect(compareScores(5.0, 7.0)).toBe('improved')
+  })
+
+  it('returns same when new == old', () => {
+    expect(compareScores(5.0, 5.0)).toBe('same')
+  })
+
+  it('returns worse when new < old', () => {
+    expect(compareScores(7.0, 5.0)).toBe('worse')
+  })
+})
+
+describe('readCurrentDocs', () => {
+  it('returns empty object when docs/ does not exist', () => {
+    const docs = readCurrentDocs(tmpDir)
+    expect(docs).toEqual({})
+  })
+
+  it('reads all markdown files recursively', () => {
+    const docsDir = path.join(tmpDir, 'docs')
+    fs.mkdirSync(path.join(docsDir, 'patterns'), { recursive: true })
+    fs.writeFileSync(path.join(docsDir, 'intro.md'), 'intro content')
+    fs.writeFileSync(
+      path.join(docsDir, 'patterns', 'api.md'),
+      'api patterns',
+    )
+    // Non-markdown file: must be left out of the result (keys asserted below are paths relative to docs/)
+    fs.writeFileSync(path.join(docsDir, 'notes.txt'), 'ignored')
+
+    const docs = readCurrentDocs(tmpDir)
+    expect(Object.keys(docs).sort()).toEqual(['intro.md', 'patterns/api.md'])
+    expect(docs['intro.md']).toBe('intro content')
+    expect(docs['patterns/api.md']).toBe('api patterns')
+  })
+})
diff --git a/evals/evalbuff/__tests__/morning-report.test.ts b/evals/evalbuff/__tests__/morning-report.test.ts