diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml
index de3f4cf0..99954180 100644
--- a/.agentv/targets.yaml
+++ b/.agentv/targets.yaml
@@ -25,6 +25,10 @@ targets:
     system_prompt: "Answer directly based on the information provided."
     grader_target: gemini-flash
 
+  - name: claude-cli
+    provider: claude-cli
+    grader_target: gemini-flash
+
   - name: codex
     provider: codex
     grader_target: gemini-llm
diff --git a/evals/hivespec/hs-claim.eval.yaml b/evals/hivespec/hs-claim.eval.yaml
new file mode 100644
index 00000000..01ddff6b
--- /dev/null
+++ b/evals/hivespec/hs-claim.eval.yaml
@@ -0,0 +1,75 @@
+description: Evaluates that the hs-claim skill reads repo guidelines, extracts issue details, and assesses scope
+
+execution:
+  targets:
+    - pi-cli
+
+workspace:
+  template: ./workspace-template
+  hooks:
+    before_all:
+      command:
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+input:
+  - role: user
+    content:
+      - type: file
+        value: "/plugins/hivespec/skills/hs-claim/SKILL.md"
+
+tests:
+  - id: reads-guidelines-first
+    criteria: Agent reads CLAUDE.md and AGENTS.md before creating any branch or worktree
+    input:
+      - role: user
+        content: |
+          I want to start work on adding a due date field to tasks.
+          Read the repo guidelines and set up the workspace. No need to use GitHub —
+          the branch is already created.
+    assertions:
+      - type: skill-trigger
+        skill: hs-claim
+      - type: rubrics
+        criteria:
+          - Reads CLAUDE.md or AGENTS.md before doing other work
+          - Identifies the repo conventions (worktree location, branch naming, commit format)
+          - Does not start coding before reading guidelines
+
+  - id: reads-full-issue-body
+    criteria: Agent reads and extracts objective, constraints, and acceptance signals from the issue
+    input:
+      - role: user
+        content: |
+          I'm claiming this issue. Here is the full issue body — no need to fetch from GitHub:
+
+          **Issue #42: Add due date field to tasks**
+          Objective: Add an optional dueDate field to the Task interface.
+          Acceptance: Tasks can be filtered by due date. Existing tasks without due dates still work.
+          Non-goals: No calendar UI, no recurring tasks.
+
+          Extract the key details from this issue and tell me what you understand.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Extracts the objective (add dueDate field)
+          - Identifies acceptance signals (filter by due date, backward compatibility)
+          - Identifies non-goals (no calendar UI, no recurring tasks)
+
+  - id: assesses-scope
+    criteria: Agent correctly classifies a trivial change and skips design/plan phases
+    input:
+      - role: user
+        content: |
+          I'm claiming this issue. Here is the full issue body:
+
+          **Issue #99: Fix typo in README.md — change 'taks' to 'tasks'**
+
+          The README.md file is in the repo root. Assess the scope of this change
+          and tell me which lifecycle phases are needed.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Classifies this as a trivial change (docs-only, single line)
+          - Recommends skipping explore, design, and plan phases
+          - Suggests going directly to implement then verify then ship
diff --git a/evals/hivespec/hs-design.eval.yaml b/evals/hivespec/hs-design.eval.yaml
new file mode 100644
index 00000000..65f618ec
--- /dev/null
+++ b/evals/hivespec/hs-design.eval.yaml
@@ -0,0 +1,69 @@
+description: Evaluates that the hs-design skill requires brainstorming before implementation and writes a spec to the plans directory
+
+execution:
+  targets:
+    - pi-cli
+
+workspace:
+  template: ./workspace-template
+  hooks:
+    before_all:
+      command:
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+input:
+  - role: user
+    content:
+      - type: file
+        value: "/plugins/hivespec/skills/hs-design/SKILL.md"
+
+tests:
+  - id: refuses-implementation-without-design
+    criteria: Agent blocks implementation attempts on non-trivial features without an approved design
+    input:
+      - role: user
+        content: |
+          Add a priority system to tasks. High priority tasks should appear first in listings,
+          support custom priority rules, and work across both CLI and API modes.
+          Just go ahead and implement it.
+    assertions:
+      - type: skill-trigger
+        skill: hs-design
+      - type: rubrics
+        criteria:
+          - Does not start writing implementation code
+          - Insists on completing the design phase first or asks clarifying questions before implementing
+          - Explains why design is needed before implementation
+
+  - id: proposes-multiple-approaches
+    criteria: Agent presents 2-3 approaches with trade-offs before proceeding
+    input:
+      - role: user
+        content: |
+          Design a solution for adding task priorities.
+          Tasks should be sortable by priority in both CLI and API modes.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Presents at least 2 different approaches
+          - Includes trade-offs for each approach (pros and cons)
+          - Makes a recommendation with reasoning
+          - Asks for user approval before proceeding
+
+  - id: writes-spec-to-plans
+    criteria: Agent saves the design spec to .agents/plans/ on the branch
+    input:
+      - role: user
+        content: "Design a priority field for tasks. Approach B sounds good, let's go with that."
+      - role: assistant
+        content: "I'll write up the design spec based on Approach B."
+      - role: user
+        content: "Yes, write the spec."
+    assertions:
+      - type: rubrics
+        criteria:
+          - Writes a design spec file
+          - File is saved to .agents/plans/ directory
+          - Spec includes concrete acceptance signals
+          - No placeholders or ambiguous language in the spec
diff --git a/evals/hivespec/hs-explore.eval.yaml b/evals/hivespec/hs-explore.eval.yaml
new file mode 100644
index 00000000..29602eb6
--- /dev/null
+++ b/evals/hivespec/hs-explore.eval.yaml
@@ -0,0 +1,57 @@
+description: Evaluates that the hs-explore skill discovers existing implementations, finds all consumers of shared interfaces, and produces a structured summary
+
+execution:
+  targets:
+    - pi-cli
+
+workspace:
+  template: ./workspace-template
+  hooks:
+    before_all:
+      command:
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+tests:
+  - id: discovers-existing-implementation
+    criteria: Agent finds the existing derivePriority function before proposing new priority code
+    input: |
+      Explore the codebase for issue #50: "Add priority field to tasks with custom rules".
+      Understand what exists before proposing changes.
+    assertions:
+      - type: skill-trigger
+        skill: hs-explore
+      - type: contains
+        value: derivePriority
+      - type: rubrics
+        criteria:
+          - Discovers the existing derivePriority function in src/utils/format-task.ts
+          - Notes that a partial implementation already exists
+          - Identifies that derivePriority only handles basic cases and needs extension
+
+  - id: finds-all-consumers
+    criteria: Agent finds all 4 consumers of the formatTask shared utility
+    input: |
+      Explore the codebase to understand the impact of changing the formatTask function signature.
+      Find all files that use formatTask.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Finds formatTask usage in src/cli/index.ts
+          - Finds formatTask usage in src/api/index.ts
+          - Finds formatTask usage in src/reports/summary.ts
+          - Finds formatTask usage in src/services/task-service.ts (getFormattedTasks)
+          - Lists all 4 consumers in the summary and notes a signature change affects every one
+
+  - id: structured-summary
+    criteria: Agent produces a summary with what-exists, what-needs-to-change, consumers, risks
+    input: |
+      Explore the codebase for issue #50: "Add priority field to tasks with custom rules".
+      Produce a structured exploration summary.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Includes a "what exists" section identifying current code
+          - Includes a "what needs to change" section
+          - Lists all consumers of affected interfaces
+          - Identifies risks (backward compatibility, multiple entry points)
diff --git a/evals/hivespec/hs-ship.eval.yaml b/evals/hivespec/hs-ship.eval.yaml
new file mode 100644
index 00000000..cd09124c
--- /dev/null
+++ b/evals/hivespec/hs-ship.eval.yaml
@@ -0,0 +1,88 @@
+description: Evaluates that the hs-ship skill applies verification gates, blast radius checks, and risk classification before merging
+
+execution:
+  targets:
+    - pi-cli
+
+workspace:
+  template: ./workspace-template
+  hooks:
+    before_all:
+      command:
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+input:
+  - role: user
+    content:
+      - type: file
+        value: "/plugins/hivespec/skills/hs-ship/SKILL.md"
+
+tests:
+  - id: requires-verification-before-shipping
+    criteria: Agent refuses to ship without verification evidence
+    input:
+      - role: user
+        content: |
+          I've finished implementing the priority field. The code changes are done.
+          Ship it — I haven't run any tests yet but I'm confident it works.
+          What should I do before merging?
+    assertions:
+      - type: skill-trigger
+        skill: hs-ship
+      - type: rubrics
+        criteria:
+          - Does not agree to merge without verification
+          - Insists on running build, test, and lint first
+          - Explains that confidence is not a substitute for verification evidence
+
+  - id: final-blast-radius-check
+    criteria: Agent checks for consumers of modified interfaces before shipping
+    input:
+      - role: user
+        content: |
+          The PR modifies the Task interface in src/models/task.ts and the formatTask
+          utility in src/utils/format-task.ts. All tests pass.
+          Before I mark this ready for review, what checks should I run?
+    assertions:
+      - type: rubrics
+        criteria:
+          - Recommends searching for all consumers of Task or formatTask across the codebase
+          - Identifies that consumers in cli, api, reports, and services may be affected
+          - Recommends verifying all consumers are updated before marking ready
+
+  - id: risk-classification
+    criteria: Agent correctly classifies a breaking API change as elevated risk requiring confirmation
+    input:
+      - role: user
+        content: |
+          Here is what the PR changes:
+          - Adds a required `priority` field to the Task interface (breaking change)
+          - Modifies the API response format in src/api/index.ts to include priority
+          - All tests pass and verification is complete.
+
+          Should I auto-merge this or get review first?
+    assertions:
+      - type: rubrics
+        criteria:
+          - Classifies this as elevated risk due to breaking interface or API changes
+          - Recommends review or explicit confirmation before merging
+          - Explains why breaking changes are elevated risk
+
+  - id: auto-merges-low-risk
+    criteria: Agent correctly identifies a docs-only change as low risk
+    input:
+      - role: user
+        content: |
+          Here is what the PR changes:
+          - Updated README.md to fix usage examples (typo corrections only)
+          - No code changes, no interface changes, no API changes
+          - Build, test, and lint all pass.
+
+          Should I auto-merge this or get review first?
+    assertions:
+      - type: rubrics
+        criteria:
+          - Classifies this as low risk (documentation only, no code changes)
+          - Indicates this is safe to merge without additional review
+          - Recommends squash merge
diff --git a/evals/hivespec/hs-verify.eval.yaml b/evals/hivespec/hs-verify.eval.yaml
new file mode 100644
index 00000000..494e7de7
--- /dev/null
+++ b/evals/hivespec/hs-verify.eval.yaml
@@ -0,0 +1,82 @@
+description: Evaluates that the hs-verify skill runs actual e2e verification, tests all execution modes, and checks blast radius
+
+execution:
+  targets:
+    - pi-cli
+
+workspace:
+  template: ./workspace-template
+  hooks:
+    before_all:
+      command:
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+input:
+  - role: user
+    content:
+      - type: file
+        value: "/plugins/hivespec/skills/hs-verify/SKILL.md"
+
+tests:
+  - id: runs-actual-verification
+    criteria: Agent runs build, test, and lint commands with actual output rather than claiming tests pass
+    input:
+      - role: user
+        content: |
+          I've finished implementing the priority field feature.
+          All the code is written. Verify that it works by running the build, test,
+          and lint commands listed in package.json.
+    assertions:
+      - type: skill-trigger
+        skill: hs-verify
+      - type: rubrics
+        criteria:
+          - Runs or attempts to run build, test, or lint commands
+          - Shows command output as evidence of execution
+          - Does not claim completion without showing verification output
+
+  - id: tests-all-execution-modes
+    criteria: Agent identifies and tests both CLI and API entry points
+    input:
+      - role: user
+        content: |
+          Verify the priority field works. The AGENTS.md says both CLI and API modes
+          use the same service layer and both must be tested.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Identifies CLI (src/cli/index.ts) and API (src/api/index.ts) as separate entry points
+          - Tests or verifies the feature through both entry points
+          - Does not claim verification is complete after testing only one mode
+
+  - id: checks-blast-radius
+    criteria: Agent greps for modified types and finds all untouched consumers
+    input:
+      - role: user
+        content: |
+          I modified the Task interface in src/models/task.ts and the formatTask function
+          in src/utils/format-task.ts. Check whether any consumers were missed.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Greps for Task interface usage across the codebase
+          - Greps for formatTask usage across the codebase
+          - Identifies all 4 consumers of formatTask (cli, api, reports, services/task-service)
+          - Flags any consumer that was not updated as a potential regression
+
+  - id: rejects-claims-without-evidence
+    criteria: Agent pushes back on premature completion claims
+    input:
+      - role: user
+        content: "All 50 tests pass and the build is clean. Ship it."
+      - role: assistant
+        content: "Let me verify that independently."
+      - role: user
+        content: "No need, I already checked. Just ship."
+    assertions:
+      - type: rubrics
+        criteria:
+          - Does not accept the user's claim at face value
+          - Insists on running verification commands independently
+          - Explains that verification evidence must come from actual command output
diff --git a/evals/hivespec/workspace-template/AGENTS.md b/evals/hivespec/workspace-template/AGENTS.md
new file mode 100644
index 00000000..efc4e0a4
--- /dev/null
+++ b/evals/hivespec/workspace-template/AGENTS.md
@@ -0,0 +1,22 @@
+# Task Tracker
+
+A minimal task management CLI and API.
+
+## Conventions
+
+- **Worktree location:** `../task-tracker.worktrees/<branch-name>`
+- **Branch naming:** `<type>/<issue-number>-<short-desc>`
+- **Commit format:** Conventional commits — `type(scope): description`
+- **Test command:** `npx vitest run`
+- **Lint command:** `npx biome check .`
+- **Build command:** `npx tsc --noEmit`
+
+## Architecture
+
+- `src/models/` — Data types and interfaces
+- `src/services/` — Business logic
+- `src/cli/` — CLI entry point
+- `src/api/` — API entry point (plain request handler; no HTTP framework)
+- `src/utils/` — Shared utilities
+
+Both CLI and API modes use the same service layer. Changes to services or models must be tested through both entry points.
diff --git a/evals/hivespec/workspace-template/CLAUDE.md b/evals/hivespec/workspace-template/CLAUDE.md
new file mode 100644
index 00000000..08f755ba
--- /dev/null
+++ b/evals/hivespec/workspace-template/CLAUDE.md
@@ -0,0 +1 @@
+**FIRST ACTION**: Read @AGENTS.md before any task.
diff --git a/evals/hivespec/workspace-template/README.md b/evals/hivespec/workspace-template/README.md
new file mode 100644
index 00000000..7ee755b9
--- /dev/null
+++ b/evals/hivespec/workspace-template/README.md
@@ -0,0 +1,22 @@
+# Task Tracker
+
+A minimal taks management CLI and API.
+
+## Usage
+
+```bash
+# Add a task
+task-tracker add "Buy groceries"
+
+# List tasks
+task-tracker list
+```
+
+## Development
+
+```bash
+npm install
+npm run build
+npm test
+npm run lint
+```
diff --git a/evals/hivespec/workspace-template/biome.json b/evals/hivespec/workspace-template/biome.json
new file mode 100644
index 00000000..a1591f5b
--- /dev/null
+++ b/evals/hivespec/workspace-template/biome.json
@@ -0,0 +1,17 @@
+{
+  "$schema": "https://biomejs.dev/schemas/1.9.0/schema.json",
+  "organizeImports": {
+    "enabled": true
+  },
+  "linter": {
+    "enabled": true,
+    "rules": {
+      "recommended": true
+    }
+  },
+  "formatter": {
+    "enabled": true,
+    "indentStyle": "space",
+    "indentWidth": 2
+  }
+}
diff --git a/evals/hivespec/workspace-template/package.json b/evals/hivespec/workspace-template/package.json
new file mode 100644
index 00000000..1d058465
--- /dev/null
+++ b/evals/hivespec/workspace-template/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "task-tracker",
+  "version": "1.0.0",
+  "type": "module",
+  "scripts": {
+    "build": "tsc --noEmit",
+    "test": "vitest run",
+    "lint": "biome check ."
+  },
+  "devDependencies": {
+    "typescript": "^5.8.0",
+    "vitest": "^3.0.0",
+    "@biomejs/biome": "^1.9.0"
+  }
+}
diff --git a/evals/hivespec/workspace-template/scripts/setup.mjs b/evals/hivespec/workspace-template/scripts/setup.mjs
new file mode 100644
index 00000000..696a216f
--- /dev/null
+++ b/evals/hivespec/workspace-template/scripts/setup.mjs
@@ -0,0 +1,103 @@
+#!/usr/bin/env node
+/**
+ * Workspace before_all hook: copy hivespec skills into the workspace
+ * for agent discovery, then initialize a git repo there.
+ *
+ * AgentV passes a JSON context on stdin; its `workspace_path` field selects
+ * the workspace. When stdin is a terminal, is not valid JSON, or lacks that
+ * field, the current working directory is used instead.
+ */
+
+import { execSync } from 'node:child_process';
+import { cpSync, mkdirSync, readFileSync, readdirSync } from 'node:fs';
+import { join } from 'node:path';
+import { isatty } from 'node:tty';
+
+// Read workspace_path from stdin (provided by AgentV orchestrator).
+// Skip the read when stdin is a TTY: readFileSync(0) would otherwise block
+// waiting for input during a manual run.
+let workspacePath;
+if (!isatty(0)) {
+  try {
+    const context = JSON.parse(readFileSync(0, 'utf8'));
+    workspacePath = context?.workspace_path;
+  } catch {
+    // Empty or malformed stdin: fall through to the cwd default below.
+  }
+}
+if (typeof workspacePath !== 'string' || workspacePath.length === 0) {
+  // Valid JSON without a usable workspace_path would otherwise leave this
+  // undefined and crash in join(); treat it like missing stdin.
+  workspacePath = process.cwd();
+}
+
+// Resolve repo root from cwd (eval dir is inside the repo)
+let repoRoot;
+try {
+  repoRoot = execSync('git rev-parse --show-toplevel', {
+    encoding: 'utf8',
+  }).trim();
+} catch {
+  console.error('Failed to resolve repo root from cwd:', process.cwd());
+  process.exit(1);
+}
+
+console.log(`Workspace: ${workspacePath}`);
+console.log(`Repo root: ${repoRoot}`);
+
+// Copy to skill discovery directories in the workspace.
+// Providers discover skills from different paths:
+//   Claude CLI: .claude/skills/
+//   Pi CLI / Pi Coding Agent: .agents/skills/
+//   Codex: .agents/skills/ or .codex/skills/ (only .agents/skills/ is populated)
+// .pi/skills/ is populated as well.
+// NOTE(review): confirm which provider reads .pi/skills/.
+const skillDirs = [
+  join(workspacePath, '.claude', 'skills'),
+  join(workspacePath, '.agents', 'skills'),
+  join(workspacePath, '.pi', 'skills'),
+];
+for (const dir of skillDirs) {
+  mkdirSync(dir, { recursive: true });
+}
+
+// Copy all hivespec skills
+const pluginSkillsDir = join(repoRoot, 'plugins', 'hivespec', 'skills');
+const skillNames = readdirSync(pluginSkillsDir);
+
+for (const name of skillNames) {
+  const src = join(pluginSkillsDir, name);
+  for (const dir of skillDirs) {
+    cpSync(src, join(dir, name), { recursive: true });
+  }
+  console.log(`Copied ${name}`);
+}
+
+for (const dir of skillDirs) {
+  console.log(`Skills in ${dir}: ${readdirSync(dir).join(', ')}`);
+}
+
+// Initialize git repo in workspace so ship/claim tests can use git commands.
+// A fixed identity is supplied through the environment because `git commit`
+// can abort when no user.name/user.email is configured (e.g. on a clean CI
+// runner), which would leave the workspace without a commit or branch.
+// Best-effort: a failure is logged but does not fail the hook.
+const gitOptions = {
+  cwd: workspacePath,
+  encoding: 'utf8',
+  stdio: 'pipe',
+  env: {
+    ...process.env,
+    GIT_AUTHOR_NAME: 'AgentV Eval',
+    GIT_AUTHOR_EMAIL: 'agentv-eval@example.invalid',
+    GIT_COMMITTER_NAME: 'AgentV Eval',
+    GIT_COMMITTER_EMAIL: 'agentv-eval@example.invalid',
+  },
+};
+try {
+  execSync('git init && git add -A && git commit -m "initial commit"', gitOptions);
+  execSync('git checkout -b feat/42-add-priority', gitOptions);
+  console.log('Git repo initialized with feat branch');
+} catch (e) {
+  console.error('Git init failed:', e.message);
+}
diff --git a/evals/hivespec/workspace-template/src/api/index.ts b/evals/hivespec/workspace-template/src/api/index.ts
new file mode 100644
index 00000000..fed44208
--- /dev/null
+++ b/evals/hivespec/workspace-template/src/api/index.ts
@@ -0,0 +1,41 @@
+import { addTask, listTasks } from '../services/task-service';
+import { formatTask } from '../utils/format-task';
+
+/**
+ * API handler — mirrors CLI functionality over HTTP.
+ * Both entry points share the same service layer.
+ *
+ * Routes:
+ * - `GET /tasks`  → 200 with every task rendered through `formatTask`.
+ * - `POST /tasks` → 201 with the created task, or 400 when `body.title`
+ *   is missing, empty, or not a string.
+ * - anything else → 404.
+ *
+ * @param method HTTP method; compared case-sensitively.
+ * @param path Request path; compared exactly.
+ * @param body Parsed request body, if any.
+ * @returns The status code and response body to send.
+ */
+export function handleRequest(
+  method: string,
+  path: string,
+  body?: Record<string, unknown>,
+): { status: number; body: unknown } {
+  if (method === 'GET' && path === '/tasks') {
+    const tasks = listTasks();
+    return { status: 200, body: tasks.map(formatTask) };
+  }
+
+  if (method === 'POST' && path === '/tasks') {
+    // `body` is untyped input: check the runtime type instead of casting, so a
+    // non-string title (e.g. a number) is rejected rather than stored.
+    const title = body?.title;
+    if (typeof title !== 'string' || title.length === 0) {
+      return { status: 400, body: { error: 'title is required' } };
+    }
+    const task = addTask(title);
+    return { status: 201, body: task };
+  }
+
+  return { status: 404, body: { error: 'not found' } };
+}
diff --git a/evals/hivespec/workspace-template/src/cli/index.ts b/evals/hivespec/workspace-template/src/cli/index.ts
new file mode 100644
index 00000000..f7cf249c
--- /dev/null
+++ b/evals/hivespec/workspace-template/src/cli/index.ts
@@ -0,0 +1,33 @@
+import { addTask, getFormattedTasks } from '../services/task-service';
+import { formatTask } from '../utils/format-task';
+
+/**
+ * CLI entry point: dispatches on the first argument and returns the text to print.
+ *
+ * - `add <title...>` joins the remaining arguments with spaces into the title,
+ *   creates the task, and returns a confirmation line. An empty title is
+ *   rejected, matching the API's `title is required` check.
+ * - `list` returns the formatted tasks, one per line, or a placeholder message
+ *   when there are none.
+ * - Any other command (or none) returns the usage string.
+ *
+ * @param args Command-line arguments, command name first.
+ * @returns The text to print.
+ */
+export function runCli(args: string[]): string {
+  const [command, ...rest] = args;
+
+  switch (command) {
+    case 'add': {
+      const title = rest.join(' ');
+      // Without this guard `task-tracker add` stored a task with an empty title.
+      if (!title) return 'Error: title is required';
+      const task = addTask(title);
+      return `Created: ${formatTask(task)}`;
+    }
+    case 'list':
+      return getFormattedTasks().join('\n') || 'No tasks found.';
+    default:
+      return 'Usage: task-tracker [add|list] [args...]';
+  }
+}
diff --git a/evals/hivespec/workspace-template/src/models/task.ts b/evals/hivespec/workspace-template/src/models/task.ts
new file mode 100644
index 00000000..3fd8d2eb
--- /dev/null
+++ b/evals/hivespec/workspace-template/src/models/task.ts
@@ -0,0 +1,17 @@
+/** Lifecycle states a task can be in. */
+type TaskStatus = 'todo' | 'in_progress' | 'done';
+
+/** A single tracked task; all fields are read-only. */
+export interface Task {
+  readonly id: string;
+  readonly title: string;
+  readonly status: TaskStatus;
+  readonly createdAt: Date;
+  readonly updatedAt: Date;
+}
+
+/** Optional criteria for narrowing a task list. */
+export interface TaskFilter {
+  readonly status?: TaskStatus;
+  readonly search?: string;
+}
diff --git a/evals/hivespec/workspace-template/src/reports/summary.ts b/evals/hivespec/workspace-template/src/reports/summary.ts
new file mode 100644
index 00000000..dcbd9cd7
--- /dev/null
+++ b/evals/hivespec/workspace-template/src/reports/summary.ts
@@ -0,0 +1,18 @@
+import { listTasks } from '../services/task-service';
+import { formatTask } from '../utils/format-task';
+
+/**
+ * Builds a plain-text summary report of every task.
+ *
+ * The report is a header line with the task count, a `---` separator, and
+ * then one `formatTask` line per task, joined with newlines. This module is
+ * one of the consumers of the shared `formatTask` utility.
+ *
+ * @returns The report text.
+ */
+export function generateSummary(): string {
+  const allTasks = listTasks();
+  const header = `Task Summary (${allTasks.length} total)`;
+  const body = allTasks.map(formatTask);
+  return [header, '---', ...body].join('\n');
+}
diff --git a/evals/hivespec/workspace-template/src/services/task-service.ts b/evals/hivespec/workspace-template/src/services/task-service.ts
new file mode 100644
index 00000000..88ea4544
--- /dev/null
+++ b/evals/hivespec/workspace-template/src/services/task-service.ts
@@ -0,0 +1,56 @@
+import type { Task, TaskFilter } from '../models/task';
+import { formatTask } from '../utils/format-task';
+
+// In-memory task store; lives for the lifetime of the module.
+const tasks: Task[] = [];
+
+/**
+ * Creates a task in the `todo` state and appends it to the store.
+ *
+ * The id is `task-<n>`, where n is the store size after insertion.
+ *
+ * @param title Task title, stored as given.
+ * @returns The newly created task.
+ */
+export function addTask(title: string): Task {
+  // Take one timestamp so createdAt and updatedAt are equal on a new task;
+  // two separate `new Date()` calls can straddle a millisecond boundary.
+  const now = Date.now();
+  const task: Task = {
+    id: `task-${tasks.length + 1}`,
+    title,
+    status: 'todo',
+    createdAt: new Date(now),
+    updatedAt: new Date(now),
+  };
+  tasks.push(task);
+  return task;
+}
+
+/**
+ * Returns a copy of the stored task list, narrowed by the optional filter.
+ *
+ * @param filter `status` keeps tasks with exactly that status; `search` keeps
+ *   tasks whose title contains the text, case-insensitively. Both may be
+ *   combined, and empty or missing fields are ignored.
+ * @returns A new array (the task objects themselves are shared with the store).
+ */
+export function listTasks(filter?: TaskFilter): Task[] {
+  const wantedStatus = filter?.status;
+  const needle = filter?.search ? filter.search.toLowerCase() : undefined;
+  return tasks.filter(
+    (t) =>
+      (!wantedStatus || t.status === wantedStatus) &&
+      (needle === undefined || t.title.toLowerCase().includes(needle)),
+  );
+}
+
+/**
+ * Lists tasks like `listTasks` and renders each through `formatTask`.
+ *
+ * @param filter Optional filter, passed through to `listTasks`.
+ * @returns One display string per matching task.
+ */
+export function getFormattedTasks(filter?: TaskFilter): string[] {
+  return listTasks(filter).map(formatTask);
+}
diff --git a/evals/hivespec/workspace-template/src/utils/format-task.ts b/evals/hivespec/workspace-template/src/utils/format-task.ts
new file mode 100644
index 00000000..4212b31d
--- /dev/null
+++ b/evals/hivespec/workspace-template/src/utils/format-task.ts
@@ -0,0 +1,29 @@
+import type { Task } from '../models/task';
+
+/**
+ * Shared utility used by the CLI, the API, reports, and the task service.
+ * Formats a task for display output as `<icon> [<id>] <title>`.
+ */
+export function formatTask(task: Task): string {
+  let icon: string;
+  switch (task.status) {
+    case 'done':
+      icon = '✓';
+      break;
+    case 'in_progress':
+      icon = '→';
+      break;
+    default:
+      icon = '○';
+  }
+  return `${icon} [${task.id}] ${task.title}`;
+}
+
+/**
+ * Partial implementation of priority derivation.
+ * Currently only handles basic cases — does not support custom priority rules:
+ * in-progress tasks are 'high' and every other task is 'medium'.
+ */
+export function derivePriority(task: Task): 'high' | 'medium' | 'low' {
+  return task.status === 'in_progress' ? 'high' : 'medium';
+}
diff --git a/evals/hivespec/workspace-template/tsconfig.json b/evals/hivespec/workspace-template/tsconfig.json
new file mode 100644
index 00000000..9b274f2a
--- /dev/null
+++ b/evals/hivespec/workspace-template/tsconfig.json
@@ -0,0 +1,12 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ESNext",
+    "moduleResolution": "bundler",
+    "strict": true,
+    "noEmit": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true
+  },
+  "include": ["src/**/*.ts"]
+}
diff --git a/plugins/hivespec/skills/hs-claim/SKILL.md b/plugins/hivespec/skills/hs-claim/SKILL.md
new file mode 100644
index 00000000..7ace30aa
--- /dev/null
+++ b/plugins/hivespec/skills/hs-claim/SKILL.md
@@ -0,0 +1,81 @@
+---
+name: hs-claim
+description: >-
+  Use when starting work on a GitHub issue, setting up a development workspace,
+  creating a worktree and branch, or when asked to "claim an issue", "start work on
+  issue #N", "set up a branch for this", or "create a worktree". Handles issue
+  assignment, worktree creation, branch setup, and draft PR creation.
+---
+
+# Claim
+
+## Overview
+
+Claim a GitHub issue and set up the full development workspace: worktree, branch, initial commit, and draft PR. This is the "I'm starting work" signal.
+
+## Process
+
+### Step 1: Read repo guidelines
+
+Read ALL convention sources before writing any code:
+
+- CLAUDE.md
+- AGENTS.md
+- CONTRIBUTING.md (if present)
+- `.editorconfig`, `biome.json`, `.eslintrc`, pre-commit hook configs
+
+Extract: worktree location, branch naming, commit format, lint rules, test commands.
+
+If no conventions are specified, use these defaults:
+
+| Convention | Default |
+|---|---|
+| Worktree location | `../<repo>.worktrees/<type>-<short-desc>` |
+| Branch naming | `<type>/<issue-number>-<short-desc>` |
+| Commit format | Conventional commits: `type(scope): description` |
+
+### Step 2: Check issue state
+
+- Verify the issue exists and is open
+- Check it is unassigned or already assigned to self
+- Read the full issue body — extract objective, constraints, acceptance signals, non-goals
+
+### Step 3: Self-assign
+
+```bash
+gh issue edit <number> --add-assignee @me
+```
+
+### Step 4: Create worktree
+
+```bash
+git fetch origin
+git worktree add <worktree-path> -b <branch-name> origin/main
+```
+
+### Step 5: Initial commit and draft PR
+
+```bash
+cd <worktree-path>
+git commit --allow-empty -m "chore(<scope>): claim issue #<number>"
+git push -u origin <branch-name>
+gh pr create --draft --title "<type>(<scope>): <description>" --body "Closes #<number>"
+```
+
+### Step 6: Assess scope
+
+After claiming, determine which phases to run next:
+
+- **Trivial** (< 5 lines, docs, config): skip to hs-implement
+- **Bug fix with clear root cause**: proceed to hs-explore, then hs-implement
+- **Feature or complex change**: proceed to hs-explore → hs-design → hs-plan
+
+## Hard Gates
+
+- Must have an issue number — no untracked work
+- Must read repo guidelines before any other action
+- Must read the full issue body before assessing scope
+
+## Multi-Repo Work
+
+For changes spanning multiple repositories, the plugin defers to the repo's CLAUDE.md/AGENTS.md for coordination instructions. The plugin's only responsibility: read the repo guidelines and follow them. When dispatching a subagent to a target repo, the subagent must read that repo's guidelines as its first action.
diff --git a/plugins/hivespec/skills/hs-design/SKILL.md b/plugins/hivespec/skills/hs-design/SKILL.md
new file mode 100644
index 00000000..55c4db81
--- /dev/null
+++ b/plugins/hivespec/skills/hs-design/SKILL.md
@@ -0,0 +1,84 @@
+---
+name: hs-design
+description: >-
+  Use when a feature or change needs a design before implementation, when the scope
+  is non-trivial, when asked to "brainstorm", "design this", "write a spec", "propose
+  approaches", or when multiple valid implementation strategies exist. Produces an
+  approved design spec through collaborative dialogue.
+---
+
+# Design
+
+## Overview
+
+Turn exploration findings into a validated design through collaborative dialogue. Prevents implementation without an approved spec.
+
+## Process
+
+### Step 1: Assess scope
+
+If the change is too large for a single design session, decompose into sub-projects first. Each sub-project gets its own design cycle.
+
+### Step 2: Clarifying questions
+
+Ask questions one at a time. Prefer multiple choice over open-ended:
+
+```
+Should this support both CLI and API modes?
+a) CLI only (simpler, covers current use cases)
+b) Both CLI and API (more flexible, more work)
+c) API only (if CLI is being deprecated)
+
+Recommendation: (b) — the API mode is needed for pipeline integration.
+```
+
+### Step 3: Propose approaches
+
+Present 2-3 approaches with trade-offs and a recommendation:
+
+| Approach | Pros | Cons |
+|---|---|---|
+| A: Extend existing | Low risk, fast | Limited flexibility |
+| B: New abstraction | Clean design | More code to maintain |
+| C: External library | Battle-tested | New dependency |
+
+**Recommendation:** Approach A — explain why.
+
+### Step 4: Section-by-section approval
+
+Present the design in sections. Get user approval after each before proceeding:
+
+1. Data model / types
+2. API / interface changes
+3. Implementation approach
+4. Migration / backward compatibility
+
+### Step 5: Write spec
+
+Write the design agreed section-by-section in Step 4 to a spec file on the worktree branch:
+
+```
+.agents/plans/YYYY-MM-DD-<topic>-design.md
+```
+
+### Step 6: Self-review
+
+Before presenting the spec to the user, check:
+
+- No placeholders ("TBD", "to be determined", "similar to above")
+- No ambiguous language ("might", "could potentially", "if needed")
+- Consistent naming throughout
+- All decisions from the dialogue are captured
+- Acceptance signals are concrete and testable
+
+### Step 7: User reviews written spec
+
+The user reviews the written spec file. Design is not approved until the user confirms.
+
+## Hard Gate
+
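+No implementation until design is approved. This applies to every change routed to this skill, regardless of perceived simplicity. The thought "this is too simple to need a design" is the strongest signal that a design is needed. The only exception is a change that hs-claim has classified as trivial (< 5 lines, docs, config), which skips the design phase entirely.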
+
+## Visual Companions
+
+For UI changes, include mockups or wireframes (ASCII, Mermaid, or browser-based) alongside the spec. Visual designs prevent misalignment more effectively than prose.
diff --git a/plugins/hivespec/skills/hs-explore/SKILL.md b/plugins/hivespec/skills/hs-explore/SKILL.md
new file mode 100644
index 00000000..b08c1a1f
--- /dev/null
+++ b/plugins/hivespec/skills/hs-explore/SKILL.md
@@ -0,0 +1,77 @@
+---
+name: hs-explore
+description: >-
+  Use when starting work on a feature or bug fix to understand the codebase before
+  proposing changes, when asked to "explore the code", "understand the problem",
+  "find related code", or "check for existing implementations". Produces a structured
+  summary of what exists, what needs to change, and key risks.
+---
+
+# Explore
+
+## Overview
+
+Structured codebase exploration before design. Understand the problem space, existing code, and prior art before proposing any changes.
+
+## Process
+
+### Step 1: Read the issue
+
+Read the full issue body. Extract:
+
+- Objective — what is the desired end state?
+- Constraints — what must not change?
+- Acceptance signals — how do we know it works?
+- Non-goals — what is explicitly out of scope?
+
+### Step 2: Check for existing implementations
+
+Before proposing new code, search for prior art within the codebase:
+
+```
+Grep for related function names, types, and concepts.
+```
+
+Existing partial implementations change the design. A function that already does 80% of what you need is not a greenfield task — it is an extension.
+
+### Step 3: Fan-out exploration
+
+Use this tool priority (fastest first):
+
+1. **Local filesystem** — Glob, Grep, Read for file patterns and code search
+2. **Semantic search** — DeepWiki MCP or LSP for understanding dependency relationships
+3. **Web** — WebFetch/WebSearch for external API docs or prior art
+
+### Step 4: Find all consumers of shared interfaces
+
+When the change touches a shared function, type, or interface:
+
+```
+Grep for all import sites and call sites.
+```
+
+Every consumer must be understood before design begins. A change to a shared interface with 3 consumers is a different problem than one with 30.
+
+### Step 5: Check for duplicates
+
+Search open and recently closed issues/PRs for related work. Avoid duplicating effort.
+
+### Step 6: Produce summary
+
+Output a structured summary to conversation context (not a file):
+
+- **What exists** — relevant code, partial implementations, related utilities
+- **What needs to change** — files to modify, new files needed
+- **All consumers** — every call site of shared interfaces being modified
+- **Key decisions** — design choices that need resolution
+- **Risks** — backward compatibility, performance, blast radius
+
+## Hard Gates
+
+- Must understand existing code before proposing changes
+- Must check for duplicate/related issues
+- Must find all consumers of any shared interface being modified
+
+## Resumability
+
+Before re-exploring, check what is already in conversation context. Do not repeat work that has already been done in this session.
diff --git a/plugins/hivespec/skills/hs-implement/SKILL.md b/plugins/hivespec/skills/hs-implement/SKILL.md
new file mode 100644
index 00000000..6e551cf1
--- /dev/null
+++ b/plugins/hivespec/skills/hs-implement/SKILL.md
@@ -0,0 +1,83 @@
+---
+name: hs-implement
+description: >-
+  Use when executing an implementation plan task-by-task, writing code with TDD
+  discipline, dispatching subagents for independent tasks, or debugging failures.
+  Triggers when asked to "implement the plan", "start coding", "write the code",
+  "execute the tasks", or when a plan exists and implementation has not started.
+---
+
+# Implement
+
+## Overview
+
+Execute the plan task-by-task with TDD discipline. Dispatch subagents for independent tasks. Debug systematically when things break.
+
+## Hard Gate
+
+Must have a plan to execute. If no plan exists at `.agents/plans/*-plan.md` on the current branch, stop and tell the user to run hs-plan first. Exception: trivial changes (< 5 lines, docs, config) may proceed without a plan.
+
+## TDD Protocol
+
+For every task:
+
+1. **Write the failing test first**
+2. **Run it — must fail (red).** If it passes, the test is wrong or the feature already exists. Investigate.
+3. **Write minimal implementation** to make the test pass
+4. **Run it — must pass (green).** If it fails, debug (see Systematic Debugging below).
+5. **Refactor** if needed — tests must still pass after refactoring
+6. **Run full pre-commit checks:** build → test → lint
+7. **Commit**
+
+Before each commit, run the full pre-commit check chain. If the repo has pre-commit hooks, run them explicitly rather than discovering failures on push:
+
+```bash
+bun run build && bun run test && bun run lint
+```
+
+Adapt the commands to the repo's actual toolchain (read from CLAUDE.md/AGENTS.md/package.json).
+
+## Subagent Dispatch
+
+When 2+ tasks are independent with no shared state, dispatch them in parallel:
+
+- **Fresh subagent per task** — no context pollution between tasks
+- **Each subagent gets:** the plan, the specific task, and any relevant context files
+- **Model selection:** use cheaper models for mechanical tasks (rename, move, format), capable models for judgment tasks (architecture, complex logic)
+
+### Subagent Review Protocol
+
+After each subagent completes, review in two stages:
+
+1. **Spec compliance** — does the output match the plan's requirements?
+2. **Code quality** — is the code clean, tested, and consistent with the codebase?
+
+Load `references/spec-reviewer-prompt.md` and `references/code-quality-reviewer-prompt.md` for reviewer instructions.
+
+### Subagent Status Handling
+
+| Status | Action |
+|---|---|
+| DONE | Accept, move to next task |
+| DONE_WITH_CONCERNS | Review concerns, fix if valid, accept if not |
+| NEEDS_CONTEXT | Provide missing context, re-dispatch |
+| BLOCKED | Investigate blocker, unblock or escalate to user |
+
+## Systematic Debugging
+
+When something breaks:
+
+1. **Read the error** — the full error message, not just the first line
+2. **Check assumptions** — is the file where you think it is? Is the function signature what you expect?
+3. **Try a focused fix** — one change at a time, re-run after each
+4. **Do not retry blindly** — if the same command fails twice, the problem is not transient
+5. **Do not abandon a viable approach after one failure** — diagnose before switching tactics
+6. **Escalate to user** only when genuinely stuck after investigation
+
+**Iron law:** No fixes without root cause investigation first. "It might be X" is not a root cause. Read the code, trace the execution, find the actual cause.
+
+## Skill Resources
+
+- `references/implementer-prompt.md` — Subagent prompt template for implementation tasks
+- `references/spec-reviewer-prompt.md` — Subagent prompt template for spec compliance review
+- `references/code-quality-reviewer-prompt.md` — Subagent prompt template for code quality review
diff --git a/plugins/hivespec/skills/hs-implement/references/code-quality-reviewer-prompt.md b/plugins/hivespec/skills/hs-implement/references/code-quality-reviewer-prompt.md
new file mode 100644
index 00000000..d32d1b9d
--- /dev/null
+++ b/plugins/hivespec/skills/hs-implement/references/code-quality-reviewer-prompt.md
@@ -0,0 +1,26 @@
+# Code Quality Reviewer Prompt
+
+You are reviewing a completed implementation task for code quality. Spec compliance has already been verified — focus on code quality, maintainability, and correctness.
+
+## Review Checklist
+
+1. **No dead code** — no commented-out code, unused imports, unreachable branches
+2. **No placeholders** — no TODO comments, no "implement later" stubs
+3. **Consistent style** — matches the surrounding codebase conventions
+4. **Error handling** — errors are handled at system boundaries, not over-handled internally
+5. **No security vulnerabilities** — no injection risks, no hardcoded secrets, no unsafe deserialization
+6. **No unnecessary complexity** — no premature abstractions, no speculative features, no over-engineering
+7. **Test quality** — tests verify behavior (not implementation details), names describe what is being tested
+
+## Output Format
+
+For each finding:
+
+```
+[severity] file:line — description
+Fix: <specific suggestion>
+```
+
+Severity: CRITICAL (must fix), MEDIUM (should fix), LOW (consider fixing)
+
+End with: **APPROVED** or **CHANGES_REQUESTED** (with specific fixes)
diff --git a/plugins/hivespec/skills/hs-implement/references/implementer-prompt.md b/plugins/hivespec/skills/hs-implement/references/implementer-prompt.md
new file mode 100644
index 00000000..cf19d8a3
--- /dev/null
+++ b/plugins/hivespec/skills/hs-implement/references/implementer-prompt.md
@@ -0,0 +1,24 @@
+# Implementer Subagent Prompt
+
+You are implementing a specific task from an implementation plan. Follow the task exactly as written.
+
+## Your Task
+
+{{task_description}}
+
+## Rules
+
+1. **TDD discipline** — write the failing test first, run it (must fail), then implement, run it (must pass)
+2. **Minimal implementation** — write only what is needed to pass the test. No speculative features.
+3. **Exact code** — follow the plan's code exactly unless you find a bug in it. If you deviate, explain why.
+4. **Pre-commit checks** — run build, test, and lint before committing
+5. **One commit per task** — use the commit message from the plan
+
+## Status Reporting
+
+When done, report one of:
+
+- **DONE** — task completed as specified
+- **DONE_WITH_CONCERNS** — task completed but you noticed potential issues (describe them)
+- **NEEDS_CONTEXT** — you need information not provided (describe what you need)
+- **BLOCKED** — you cannot proceed (describe the blocker)
diff --git a/plugins/hivespec/skills/hs-implement/references/spec-reviewer-prompt.md b/plugins/hivespec/skills/hs-implement/references/spec-reviewer-prompt.md
new file mode 100644
index 00000000..7850e2d0
--- /dev/null
+++ b/plugins/hivespec/skills/hs-implement/references/spec-reviewer-prompt.md
@@ -0,0 +1,22 @@
+# Spec Compliance Reviewer Prompt
+
+You are reviewing a completed implementation task for spec compliance. Your job is to verify that the implementation matches what the plan and design spec require.
+
+## Review Checklist
+
+1. **Does the implementation match the plan's task description?** — Compare the diff against the specific task requirements
+2. **Are all acceptance signals addressed?** — Check the design spec's acceptance signals
+3. **Are types consistent?** — Do new types match the design spec's type definitions?
+4. **Are all consumers updated?** — If a shared interface changed, verify every call site was updated
+5. **Are tests present and meaningful?** — Do tests verify the actual requirement, not just that code runs?
+
+## Output Format
+
+For each finding:
+
+```
+[PASS/FAIL] <checklist item>
+Reason: <specific evidence from the diff>
+```
+
+End with: **APPROVED** or **CHANGES_REQUESTED** (with specific file:line references)
diff --git a/plugins/hivespec/skills/hs-plan/SKILL.md b/plugins/hivespec/skills/hs-plan/SKILL.md
new file mode 100644
index 00000000..2d0aa5cf
--- /dev/null
+++ b/plugins/hivespec/skills/hs-plan/SKILL.md
@@ -0,0 +1,106 @@
+---
+name: hs-plan
+description: >-
+  Use when converting an approved design spec into an implementation plan, when the
+  design is ready and you need step-by-step tasks with exact code and commands, or
+  when asked to "write a plan", "break this into tasks", "create implementation steps",
+  or "plan the implementation".
+---
+
+# Plan
+
+## Overview
+
+Convert an approved design into a bite-sized implementation plan with exact file paths, complete code, and test commands. Every step is a 2-5 minute task.
+
+## Hard Gate
+
+Must reference an approved design spec. If no spec exists at `.agents/plans/*-design.md` on the current branch, stop and tell the user to run hs-design first.
+
+## Process
+
+### Step 1: Plan header
+
+Start the plan with:
+
+```markdown
+# Implementation Plan: <topic>
+
+**Design spec:** `.agents/plans/YYYY-MM-DD-<topic>-design.md`
+**Goal:** <one-line summary>
+**Architecture:** <key technical decisions>
+**Tech stack:** <languages, frameworks, tools>
+```
+
+### Step 2: Enumerate all consumers
+
+When modifying a shared function, type, or interface: grep for all import sites and call sites. Every consumer must appear in the plan as a file to modify. Missing a consumer is a guaranteed bug.
+
+### Step 3: Define tasks
+
+Each task is a 2-5 minute step following TDD:
+
+````markdown
+## Task N: <description>
+
+**Files:** `path/to/file.ts` (modify), `path/to/file.test.ts` (create)
+
+### Test
+```typescript
+// Exact test code
+```
+
+### Run
+```bash
+bun run test -- path/to/file.test.ts
+```
+
+### Expected: FAIL (function does not exist yet)
+
+### Implementation
+```typescript
+// Exact implementation code
+```
+
+### Run
+```bash
+bun run test -- path/to/file.test.ts
+```
+
+### Expected: PASS
+
+### Commit
+```bash
+git add path/to/file.ts path/to/file.test.ts
+git commit -m "feat(scope): add <description>"
+```
+````
+
+### Step 4: Self-review checklist
+
+Before presenting the plan:
+
+- [ ] Every file in the design spec has a corresponding task
+- [ ] Every consumer of modified interfaces has a task
+- [ ] No placeholders ("TBD", "implement later", "similar to Task N")
+- [ ] Every task has actual code, not pseudocode
+- [ ] Every task has exact commands with expected output
+- [ ] Tasks are ordered by dependency (tests before implementation)
+- [ ] Types are consistent across all tasks
+
+### Step 5: Save plan
+
+Save to the worktree branch:
+
+```
+.agents/plans/YYYY-MM-DD-<topic>-plan.md
+```
+
+## Plan Quality Rules
+
+- **No placeholders** — every task has real code
+- **No hand-waving** — "similar to Task N" is not a task
+- **DRY** — if two tasks touch the same file, consider merging
+- **YAGNI** — do not plan tasks for hypothetical future requirements
+- **Frequent commits** — each task ends with a commit
+- **TDD order** — write test, run it (red), implement, run it (green), commit
diff --git a/plugins/hivespec/skills/hs-ship/SKILL.md b/plugins/hivespec/skills/hs-ship/SKILL.md
new file mode 100644
index 00000000..aea6efe0
--- /dev/null
+++ b/plugins/hivespec/skills/hs-ship/SKILL.md
@@ -0,0 +1,95 @@
+---
+name: hs-ship
+description: >-
+  Use when implementation and verification are complete and you need to integrate the
+  work, when asked to "ship it", "merge the PR", "mark ready for review", "clean up
+  the branch", or "finish this work". Handles final verification, PR management, merge,
+  and worktree cleanup.
+---
+
+# Ship
+
+## Overview
+
+Complete the development branch and integrate the work. Final verification gate, PR management, merge, and worktree cleanup.
+
+## Hard Gate
+
+Must have passing verification evidence from hs-verify before shipping. If no verification has been done in this session, stop and tell the user to run hs-verify first.
+
+## Process
+
+### Step 1: Final verification gate
+
+Run the full check chain one last time:
+
+```bash
+bun run build && bun run test && bun run lint
+```
+
+All must pass with output as evidence. Do not skip this even if hs-verify ran recently — code may have changed since. Adapt the commands to the repo's actual toolchain (read from CLAUDE.md/AGENTS.md/package.json).
+
+### Step 2: Final blast radius check
+
+Before marking the PR ready, grep for the primary types and functions changed in this PR across the entire codebase:
+
+```bash
+git diff main...HEAD --name-only  # Files changed in this PR
+# For each modified shared type/function (adjust --include to the repo's source file extensions):
+grep -r "TypeName\|functionName" --include="*.ts" .
+```
+
+Any consumer not touched by this PR is a potential miss. This is the last line of defense.
+
+### Step 3: Push and mark ready
+
+```bash
+git push
+gh pr ready <number>
+```
+
+### Step 4: Risk classification
+
+Assess the PR for merge risk:
+
+| Auto-merge (low risk) | Confirm before merge (elevated risk) |
+|---|---|
+| Documentation changes | Breaking API changes |
+| Config file updates | Feature deletion |
+| Additive features (new files only) | Schema or data model changes |
+| Isolated bug fixes | Security-sensitive changes |
+| Style/formatting | Cross-repo coordination |
+| Test additions | Changes to shared types with many consumers |
+
+### Step 5: Merge
+
+For auto-merge candidates, squash merge:
+
+```bash
+gh pr merge <number> --squash
+```
+
+For elevated risk, present the risk assessment and wait for explicit user confirmation before merging.
+
+### Step 6: Worktree cleanup
+
+After merge:
+
+```bash
+cd <main-worktree>
+git worktree remove <worktree-path>
+git pull origin main
+```
+
+### Step 7: Close tracking
+
+- Verify the linked issue was closed by the merge
+- For multi-repo work: link related PRs, update tracking issues
+
+## Cross-Repo Work
+
+For changes spanning multiple repositories, follow the coordination instructions in the repo's CLAUDE.md/AGENTS.md. The plugin's additions:
+
+- Link related PRs across repos in PR descriptions
+- Cross-repo PRs require explicit user confirmation before merging
+- Update tracking issues with links to all related PRs
diff --git a/plugins/hivespec/skills/hs-using-hivespec/SKILL.md b/plugins/hivespec/skills/hs-using-hivespec/SKILL.md
new file mode 100644
index 00000000..6d12067f
--- /dev/null
+++ b/plugins/hivespec/skills/hs-using-hivespec/SKILL.md
@@ -0,0 +1,86 @@
+---
+name: hs-using-hivespec
+description: >-
+  Use when starting any conversation or session to establish the agentic delivery
+  lifecycle. Determines which phase skills to invoke and prevents rationalization
+  ("this is too simple", "I'll just do this one thing first"). Skip this skill if
+  dispatched as a subagent to execute a specific task.
+---
+
+# Using HiveSpec
+
+## Overview
+
+Entry point skill that establishes the phase-based delivery lifecycle and enforces skill invocation discipline.
+
+## Subagent Stop
+
+If you were dispatched as a subagent to execute a specific task, skip this skill entirely.
+
+## Lifecycle
+
+```
+hs-claim → hs-explore → hs-design → hs-plan → hs-implement → hs-verify → hs-ship
+```
+
+| Phase | Skill | What Happens |
+|---|---|---|
+| Claim | hs-claim | Claim issue, create worktree + branch + draft PR |
+| Explore | hs-explore | Understand the codebase and problem space |
+| Design | hs-design | Brainstorm approaches, write approved spec |
+| Plan | hs-plan | Convert spec into bite-sized implementation plan |
+| Implement | hs-implement | TDD execution with subagent dispatch |
+| Verify | hs-verify | E2E red/green testing, code review, blast radius check |
+| Ship | hs-ship | Mark PR ready, merge, clean up worktree |
+
+## Phase Skip Rules
+
+Not every change needs every phase:
+
+- **Trivial changes** (< 5 lines, docs-only, config-only): claim → implement → verify → ship
+- **Bug fixes with clear root cause**: claim → explore → implement → verify → ship
+- **Well-specified issues** (full spec in issue body): claim → explore → plan → implement → verify → ship
+
+When in doubt, do not skip phases. Skipping design on a "simple" change that turns out to be complex is more expensive than spending 5 minutes on design.
+
+## The 1% Rule
+
+If there is even a 1% chance a phase skill applies to the current task, invoke it. Check for applicable skills BEFORE any response or action — including clarifying questions.
+
+## Red Flags
+
+These thoughts mean STOP — you are rationalizing your way out of a phase:
+
+| Thought | Reality |
+|---|---|
+| "This is just a simple question" | Questions are tasks. Check the lifecycle. |
+| "I need more context first" | That is what hs-explore does. |
+| "Let me just write the code quickly" | That is what hs-implement does, with TDD. |
+| "I can skip the design for this" | Skip design only when a Phase Skip Rule applies. Perceived simplicity alone is never a reason. |
+| "Tests are passing, we're done" | Unit tests ≠ verified. That is what hs-verify does. |
+| "I'll clean up the PR later" | That is what hs-ship does, with blast radius checks. |
+| "Let me explore the code first" | Use hs-explore — it has structured output. |
+| "I know what needs to change" | Verify with hs-explore. Partial implementations may already exist. |
+
+## Skill Priority
+
+When multiple skills could apply:
+
+1. **Process skills first** (hs-explore, hs-design) — determine HOW to approach
+2. **Execution skills second** (hs-implement, hs-verify) — guide what to do
+
+"Let's build X" → hs-explore first, then hs-design, then hs-plan.
+"Fix this bug" → hs-explore first, then hs-implement.
+
+## Artifact Locations
+
+All plan and design artifacts live on the worktree branch, not in the main repo tree:
+
+| Artifact | Location |
+|---|---|
+| Design specs | `.agents/plans/YYYY-MM-DD-<topic>-design.md` |
+| Implementation plans | `.agents/plans/YYYY-MM-DD-<topic>-plan.md` |
+
+## Configuration
+
+The plugin reads conventions from the repo's CLAUDE.md, AGENTS.md, and contributing guides. Repo guidelines always override plugin defaults. The plugin provides workflow discipline — project-specific concerns belong in the project's guidelines.
diff --git a/plugins/hivespec/skills/hs-verify/SKILL.md b/plugins/hivespec/skills/hs-verify/SKILL.md
new file mode 100644
index 00000000..1542707b
--- /dev/null
+++ b/plugins/hivespec/skills/hs-verify/SKILL.md
@@ -0,0 +1,85 @@
+---
+name: hs-verify
+description: >-
+  Use when implementation is complete and you need to prove it works before claiming
+  completion, when asked to "verify", "test end-to-end", "run e2e", "check the blast
+  radius", "review the code", or before any claim that the work is "done", "complete",
+  "ready", or "passing".
+---
+
+# Verify
+
+## Overview
+
+Prove the implementation works before claiming completion. E2E red/green testing, code review, blast radius check, and verification evidence. No completion claims without fresh evidence.
+
+## Iron Law
+
+**No completion claims without verification evidence.** Run the command, read the output, THEN claim the result. "Should work", "I'm confident", and "tests are passing" (without output) are not evidence.
+
+## Process
+
+### Step 1: Build, test, lint
+
+Run the full check chain and capture the output:
+
+```bash
+bun run build && bun run test && bun run lint
+```
+
+All three must pass. If any fails, fix it before proceeding. Adapt the commands to the repo's actual toolchain (read from CLAUDE.md/AGENTS.md/package.json).
+
+### Step 2: E2E red/green protocol
+
+Unit tests prove units work. E2E verification proves the feature works. Both are required.
+
+1. **Red E2E** — verify current behavior before your changes (establish baseline). If on a worktree branch, check out main temporarily or use the main worktree.
+2. **Green E2E** — verify new behavior matches expectations after your changes.
+3. **All modes** — identify all user-facing entry points that exercise the change. Test each one. A feature that works in mode A but not mode B is a bug.
+
+Run the actual feature as a user would — create real test data, exercise the real pipeline, hit the real API. Do not substitute unit test output for e2e verification.
+
+### Step 3: Blast radius check
+
+After implementation, grep for the modified type/function across the entire codebase:
+
+```bash
+# For each modified type, interface, or shared function (adjust --include to the repo's source file extensions):
+grep -r "TypeName\|functionName" --include="*.ts" .
+```
+
+Any untouched consumer is a potential regression. This check is mandatory for changes to types, interfaces, or shared utilities.
+
+### Step 4: Code review
+
+Dispatch an isolated reviewer subagent with:
+
+- The diff (`git diff main...HEAD`)
+- The design spec (if one exists)
+- The implementation plan (if one exists)
+- Instructions to review for spec compliance first, then code quality
+
+**Receiving review feedback:**
+
+- Verify feedback is technically correct before implementing
+- Push back on incorrect suggestions with evidence
+- Do not performatively agree — technical rigor over politeness
+
+### Step 5: Final evidence
+
+Before proceeding to hs-ship, confirm:
+
+- [ ] Build passes (with output)
+- [ ] All tests pass (with output showing test count)
+- [ ] Lint passes (with output)
+- [ ] E2E red/green completed (with evidence of both states)
+- [ ] All execution modes tested
+- [ ] Blast radius check completed (every consumer of modified interfaces updated or confirmed unaffected)
+- [ ] Code review feedback addressed
+
+## Hard Gates
+
+- Must run build, tests, and lint before claiming completion
+- Must have verification command output as evidence
+- E2E must show red-then-green (not just green)
+- Must check blast radius for any change to types, interfaces, or shared utilities