EntityProcess · christso · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026
diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml
@@ -25,6 +25,10 @@ targets:
     system_prompt: "Answer directly based on the information provided."
     grader_target: gemini-flash
 
+  - name: claude-cli  # new target; NOTE(review): the evals added in this change all list pi-cli, not this target
+    provider: claude-cli  # provider id equals the target name, like the codex entry that follows
+    grader_target: gemini-flash  # same grader as the preceding entry; the codex entry uses gemini-llm
+
   - name: codex
     provider: codex
     grader_target: gemini-llm

diff --git a/evals/hivespec/hs-claim.eval.yaml b/evals/hivespec/hs-claim.eval.yaml
@@ -0,0 +1,75 @@
+description: Evaluates that the hs-claim skill reads repo guidelines, extracts issue details, and assesses scope
+
+execution:  # where the suite runs
+  targets:
+    - pi-cli  # the only target listed for this file
+
+workspace:
+  template: ./workspace-template  # relative path; presumably resolved against this eval file's directory — confirm
+  hooks:
+    before_all:  # name suggests it runs once before the tests — confirm against the runner
+      command:  # argv-style list: node plus the setup script under the templated workspace path
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+input:  # declared at suite level, outside tests; presumably shared by every test — confirm against the runner
+  - role: user
+    content:
+      - type: file
+        value: "/plugins/hivespec/skills/hs-claim/SKILL.md"  # the hs-claim skill definition file
+
+tests:
+  - id: reads-guidelines-first  # scenario: user asks for workspace setup and says the branch already exists
+    criteria: Agent reads CLAUDE.md or AGENTS.md before creating any branch or worktree
+    input:
+      - role: user
+        content: |
+          I want to start work on adding a due date field to tasks.
+          Read the repo guidelines and set up the workspace. No need to use GitHub —
+          the branch is already created.
+    assertions:
+      - type: skill-trigger  # presumably passes only if the named skill is invoked — confirm against the runner
+        skill: hs-claim
+      - type: rubrics  # free-text checks; the first accepts either guideline file, matching criteria above
+        criteria:
+          - Reads CLAUDE.md or AGENTS.md before doing other work
+          - Identifies the repo conventions (worktree location, branch naming, commit format)
+          - Does not start coding before reading guidelines
+
+  - id: reads-full-issue-body  # issue text is pasted inline; the prompt says no GitHub fetch is needed
+    criteria: Agent reads and extracts objective, constraints, and acceptance signals from the issue
+    input:
+      - role: user
+        content: |
+          I'm claiming this issue. Here is the full issue body — no need to fetch from GitHub:
+
+          **Issue #42: Add due date field to tasks**
+          Objective: Add an optional dueDate field to the Task interface.
+          Acceptance: Tasks can be filtered by due date. Existing tasks without due dates still work.
+          Non-goals: No calendar UI, no recurring tasks.
+
+          Extract the key details from this issue and tell me what you understand.
+    assertions:
+      - type: rubrics  # one rubric per labelled part of the pasted issue: Objective, Acceptance, Non-goals
+        criteria:
+          - Extracts the objective (add dueDate field)
+          - Identifies acceptance signals (filter by due date, backward compatibility)
+          - Identifies non-goals (no calendar UI, no recurring tasks)
+
+  - id: assesses-scope  # scenario: a one-word typo fix in README.md
+    criteria: Agent correctly classifies a trivial change and skips design/plan phases
+    input:
+      - role: user
+        content: |
+          I'm claiming this issue. Here is the full issue body:
+
+          **Issue #99: Fix typo in README.md — change 'taks' to 'tasks'**
+
+          The README.md file is in the repo root. Assess the scope of this change
+          and tell me which lifecycle phases are needed.
+    assertions:
+      - type: rubrics  # NOTE(review): also expects explore to be skipped, which criteria above does not mention
+        criteria:
+          - Classifies this as a trivial change (docs-only, single line)
+          - Recommends skipping explore, design, and plan phases
+          - Suggests going directly to implement then verify then ship
diff --git a/evals/hivespec/hs-design.eval.yaml b/evals/hivespec/hs-design.eval.yaml
@@ -0,0 +1,69 @@
+description: Evaluates that the hs-design skill requires brainstorming before implementation and writes a spec to the plans directory
+
+execution:  # where the suite runs
+  targets:
+    - pi-cli  # the only target listed for this file
+
+workspace:
+  template: ./workspace-template  # relative path; presumably resolved against this eval file's directory — confirm
+  hooks:
+    before_all:  # name suggests it runs once before the tests — confirm against the runner
+      command:  # argv-style list: node plus the setup script under the templated workspace path
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+input:  # declared at suite level, outside tests; presumably shared by every test — confirm against the runner
+  - role: user
+    content:
+      - type: file
+        value: "/plugins/hivespec/skills/hs-design/SKILL.md"  # the hs-design skill definition file
+
+tests:
+  - id: refuses-implementation-without-design  # prompt explicitly tells the agent to go ahead and implement
+    criteria: Agent blocks implementation attempts on non-trivial features without an approved design
+    input:
+      - role: user
+        content: |
+          Add a priority system to tasks. High priority tasks should appear first in listings,
+          support custom priority rules, and work across both CLI and API modes.
+          Just go ahead and implement it.
+    assertions:
+      - type: skill-trigger  # presumably passes only if the named skill is invoked — confirm against the runner
+        skill: hs-design
+      - type: rubrics  # the second rubric accepts either insisting on design or asking clarifying questions
+        criteria:
+          - Does not start writing implementation code
+          - Insists on completing the design phase first or asks clarifying questions before implementing
+          - Explains why design is needed before implementation
+
+  - id: proposes-multiple-approaches  # open-ended design request; the prompt does not pick an approach
+    criteria: Agent presents 2-3 approaches with trade-offs before proceeding
+    input:
+      - role: user
+        content: |
+          Design a solution for adding task priorities.
+          Tasks should be sortable by priority in both CLI and API modes.
+    assertions:
+      - type: rubrics  # the rubric floor is "at least 2" approaches; no upper bound is checked here
+        criteria:
+          - Presents at least 2 different approaches
+          - Includes trade-offs for each approach (pros and cons)
+          - Makes a recommendation with reasoning
+          - Asks for user approval before proceeding
+
+  - id: writes-spec-to-plans  # multi-turn input: user request, canned assistant reply, user confirmation
+    criteria: Agent saves the design spec to .agents/plans/ on the branch
+    input:  # NOTE(review): "Approach B" is referenced but never defined in these turns — confirm intended
+      - role: user
+        content: "Design a priority field for tasks. Approach B sounds good, let's go with that."
+      - role: assistant
+        content: "I'll write up the design spec based on Approach B."
+      - role: user
+        content: "Yes, write the spec."
+    assertions:
+      - type: rubrics  # checks both the file location (.agents/plans/) and the quality of the spec text
+        criteria:
+          - Writes a design spec file
+          - File is saved to .agents/plans/ directory
+          - Spec includes concrete acceptance signals
+          - No placeholders or ambiguous language in the spec
diff --git a/evals/hivespec/hs-explore.eval.yaml b/evals/hivespec/hs-explore.eval.yaml
@@ -0,0 +1,57 @@
+description: Evaluates that the hs-explore skill discovers existing implementations, finds all consumers of shared interfaces, and produces a structured summary
+
+execution:  # where the suite runs
+  targets:
+    - pi-cli  # the only target listed for this file
+
+workspace:
+  template: ./workspace-template  # relative path; presumably resolved against this eval file's directory — confirm
+  hooks:
+    before_all:  # name suggests it runs once before the tests — confirm against the runner
+      command:  # argv-style list: node plus the setup script under the templated workspace path
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+tests:  # NOTE(review): no suite-level input (SKILL.md file) is declared in this file — confirm that is intended
+  - id: discovers-existing-implementation  # input is a plain block string here, not a role/content list
+    criteria: Agent finds the existing derivePriority function before proposing new priority code
+    input: |
+      Explore the codebase for issue #50: "Add priority field to tasks with custom rules".
+      Understand what exists before proposing changes.
+    assertions:
+      - type: skill-trigger  # presumably passes only if the named skill is invoked — confirm against the runner
+        skill: hs-explore
+      - type: contains  # presumably a literal substring check on the agent output — confirm against the runner
+        value: derivePriority
+      - type: rubrics  # names src/utils/format-task.ts; presumably that file comes from the workspace template
+        criteria:
+          - Discovers the existing derivePriority function in src/utils/format-task.ts
+          - Notes that a partial implementation already exists
+          - Identifies that derivePriority only handles basic cases and needs extension
+
+  - id: finds-all-consumers  # impact analysis for a formatTask signature change
+    criteria: Agent finds all 3 consumers of the formatTask shared utility
+    input: |
+      Explore the codebase to understand the impact of changing the formatTask function signature.
+      Find all files that use formatTask.
+    assertions:
+      - type: rubrics  # one rubric per expected consumer file, then two covering the overall summary
+        criteria:
+          - Finds formatTask usage in src/cli/index.ts
+          - Finds formatTask usage in src/api/index.ts
+          - Finds formatTask usage in src/reports/summary.ts
+          - Lists all 3 consumers in the exploration summary
+          - Notes that changes to formatTask signature affect all 3 files
+
+  - id: structured-summary  # same issue #50 as discovers-existing-implementation, but asks for a structured summary
+    criteria: Agent produces a summary with what-exists, what-needs-to-change, consumers, risks
+    input: |
+      Explore the codebase for issue #50: "Add priority field to tasks with custom rules".
+      Produce a structured exploration summary.
+    assertions:
+      - type: rubrics  # one rubric per summary part named in criteria above
+        criteria:
+          - Includes a "what exists" section identifying current code
+          - Includes a "what needs to change" section
+          - Lists all consumers of affected interfaces
+          - Identifies risks (backward compatibility, multiple entry points)
diff --git a/evals/hivespec/hs-ship.eval.yaml b/evals/hivespec/hs-ship.eval.yaml
@@ -0,0 +1,88 @@
+description: Evaluates that the hs-ship skill applies verification gates, blast radius checks, and risk classification before merging
+
+execution:  # where the suite runs
+  targets:
+    - pi-cli  # the only target listed for this file
+
+workspace:
+  template: ./workspace-template  # relative path; presumably resolved against this eval file's directory — confirm
+  hooks:
+    before_all:  # name suggests it runs once before the tests — confirm against the runner
+      command:  # argv-style list: node plus the setup script under the templated workspace path
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+input:  # declared at suite level, outside tests; presumably shared by every test — confirm against the runner
+  - role: user
+    content:
+      - type: file
+        value: "/plugins/hivespec/skills/hs-ship/SKILL.md"  # the hs-ship skill definition file
+
+tests:
+  - id: requires-verification-before-shipping  # user admits no tests were run yet still asks to ship
+    criteria: Agent refuses to ship without verification evidence
+    input:
+      - role: user
+        content: |
+          I've finished implementing the priority field. The code changes are done.
+          Ship it — I haven't run any tests yet but I'm confident it works.
+          What should I do before merging?
+    assertions:
+      - type: skill-trigger  # presumably passes only if the named skill is invoked — confirm against the runner
+        skill: hs-ship
+      - type: rubrics  # all three rubrics concern holding the merge until verification has been run
+        criteria:
+          - Does not agree to merge without verification
+          - Insists on running build, test, and lint first
+          - Explains that confidence is not a substitute for verification evidence
+
+  - id: final-blast-radius-check  # tests already pass in this scenario; the focus is consumers of changed code
+    criteria: Agent checks for consumers of modified interfaces before shipping
+    input:
+      - role: user
+        content: |
+          The PR modifies the Task interface in src/models/task.ts and the formatTask
+          utility in src/utils/format-task.ts. All tests pass.
+          Before I mark this ready for review, what checks should I run?
+    assertions:
+      - type: rubrics  # expects cli, api, and reports to be named as possibly affected consumers
+        criteria:
+          - Recommends searching for all consumers of Task or formatTask across the codebase
+          - Identifies that consumers in cli, api, and reports may be affected
+          - Recommends verifying all consumers are updated before marking ready
+
+  - id: risk-classification  # breaking-change scenario; ends with the same question as auto-merges-low-risk
+    criteria: Agent correctly classifies a breaking API change as elevated risk requiring confirmation
+    input:
+      - role: user
+        content: |
+          Here is what the PR changes:
+          - Adds a required `priority` field to the Task interface (breaking change)
+          - Modifies the API response format in src/api/index.ts to include priority
+          - All tests pass and verification is complete.
+
+          Should I auto-merge this or get review first?
+    assertions:
+      - type: rubrics  # expected answer: elevated risk, so review or confirmation rather than auto-merge
+        criteria:
+          - Classifies this as elevated risk due to breaking interface or API changes
+          - Recommends review or explicit confirmation before merging
+          - Explains why breaking changes are elevated risk
+
+  - id: auto-merges-low-risk  # docs-only scenario; the prompt states build, test, and lint all pass
+    criteria: Agent correctly identifies a docs-only change as low risk
+    input:
+      - role: user
+        content: |
+          Here is what the PR changes:
+          - Updated README.md to fix usage examples (typo corrections only)
+          - No code changes, no interface changes, no API changes
+          - Build, test, and lint all pass.
+
+          Should I auto-merge this or get review first?
+    assertions:
+      - type: rubrics  # NOTE(review): squash merge is not mentioned in the prompt — presumably from SKILL.md; confirm
+        criteria:
+          - Classifies this as low risk (documentation only, no code changes)
+          - Indicates this is safe to merge without additional review
+          - Recommends squash merge
diff --git a/evals/hivespec/hs-verify.eval.yaml b/evals/hivespec/hs-verify.eval.yaml
@@ -0,0 +1,82 @@
+description: Evaluates that the hs-verify skill runs actual e2e verification, tests all execution modes, and checks blast radius
+
+execution:  # where the suite runs
+  targets:
+    - pi-cli  # the only target listed for this file
+
+workspace:
+  template: ./workspace-template  # relative path; presumably resolved against this eval file's directory — confirm
+  hooks:
+    before_all:  # name suggests it runs once before the tests — confirm against the runner
+      command:  # argv-style list: node plus the setup script under the templated workspace path
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+input:  # declared at suite level, outside tests; presumably shared by every test — confirm against the runner
+  - role: user
+    content:
+      - type: file
+        value: "/plugins/hivespec/skills/hs-verify/SKILL.md"  # the hs-verify skill definition file
+
+tests:
+  - id: runs-actual-verification  # prompt points the agent at the commands listed in package.json
+    criteria: Agent runs build, test, and lint commands with actual output rather than claiming tests pass
+    input:
+      - role: user
+        content: |
+          I've finished implementing the priority field feature.
+          All the code is written. Verify that it works by running the build, test,
+          and lint commands listed in package.json.
+    assertions:
+      - type: skill-trigger  # presumably passes only if the named skill is invoked — confirm against the runner
+        skill: hs-verify
+      - type: rubrics  # the first rubric is met by running, or attempting, any one of build, test, or lint
+        criteria:
+          - Runs or attempts to run build, test, or lint commands
+          - Shows command output as evidence of execution
+          - Does not claim completion without showing verification output
+
+  - id: tests-all-execution-modes  # prompt cites AGENTS.md as requiring both CLI and API modes to be tested
+    criteria: Agent identifies and tests both CLI and API entry points
+    input:
+      - role: user
+        content: |
+          Verify the priority field works. The AGENTS.md says both CLI and API modes
+          use the same service layer and both must be tested.
+    assertions:
+      - type: rubrics  # rubrics name src/cli/index.ts and src/api/index.ts as the two entry points
+        criteria:
+          - Identifies CLI (src/cli/index.ts) and API (src/api/index.ts) as separate entry points
+          - Tests or verifies the feature through both entry points
+          - Does not claim verification is complete after testing only one mode
+
+  - id: checks-blast-radius  # user names the two changed files and asks whether consumers were missed
+    criteria: Agent greps for modified types and finds all untouched consumers
+    input:
+      - role: user
+        content: |
+          I modified the Task interface in src/models/task.ts and the formatTask function
+          in src/utils/format-task.ts. Check whether any consumers were missed.
+    assertions:
+      - type: rubrics  # expects all three formatTask consumers to be found: cli, api, reports
+        criteria:
+          - Greps for Task interface usage across the codebase
+          - Greps for formatTask usage across the codebase
+          - Identifies all 3 consumers of formatTask (cli, api, reports)
+          - Flags any consumer that was not updated as a potential regression
+
+  - id: rejects-claims-without-evidence  # multi-turn input: user claim, canned assistant reply, user pushback
+    criteria: Agent pushes back on premature completion claims
+    input:
+      - role: user
+        content: "All 50 tests pass and the build is clean. Ship it."
+      - role: assistant
+        content: "Let me verify that independently."
+      - role: user
+        content: "No need, I already checked. Just ship."
+    assertions:
+      - type: rubrics  # presumably graded on the agent's reply to the last user turn — confirm against the runner
+        criteria:
+          - Does not accept the user's claim at face value
+          - Insists on running verification commands independently
+          - Explains that verification evidence must come from actual command output