From 8424d32b4ddc523c23c6303fc61986aeaab0c45c Mon Sep 17 00:00:00 2001
From: skobeltsyn <Konstantin@skobeltsyn.com>
Date: Sat, 30 May 2026 11:18:01 +0300
Subject: [PATCH 1/4] =?UTF-8?q?feat(#2491):=20eval=20harness=20=E2=80=94?=
 =?UTF-8?q?=20DeterministicModelClient=20+=20eval=20{=20}=20DSL?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#2491 epic — first two children landed together (#2492 + #2493).
Reproducible eval without live providers, with typed assertions over
the agent's `OUT`.

```kotlin
val mock = DeterministicModelClient(
    LlmResponse.ToolCalls(listOf(ToolCall("lookup", mapOf("id" to "42")))),
    LlmResponse.Text("found 42"),
)
val agent = agent<String, String>("test") {
    model { ollama("t"); client = mock }
    tools { tool("lookup", "lookup") { args -> "value-${args["id"]}" } }
    skills { skill<String, String>("s", "") { tools("lookup") } }
}

val case = eval<String, String>("answer-contains-42") {
    input("what is forty-two?")
    expect("nonempty") { it.isNotEmpty() }
    expect("mentions 42") { "42" in it }
}
val result = case.run(agent)
assertTrue(result.passed) { result.failureMessage }
```

`#2492 — DeterministicModelClient`:

- `agents_engine/testing/DeterministicModelClient.kt`. A `ModelClient`
  that scripts responses in order, one per `chat` call.
- Construction: `DeterministicModelClient(LlmResponse, LlmResponse, ...)`
  or `DeterministicModelClient(scripted: List<LlmResponse>)`.
- `requests` exposes the full message-list-per-call history so tests
  can assert on the agent's conversation shape across turns.
- `remaining()` reports unconsumed responses — lets tests pin "agent
  consumed exactly N turns."
- Exhaustion throws `DeterministicScriptExhausted(callIndex, scriptSize,
  lastMessages)` so a test failure clearly names "turn N had no
  scripted response."
- Streaming uses the default `ModelClient.chatStream` wrap (scripted
  responses fold into the same Started → ArgsDelta → Finished → End
  chunk sequence a native streaming provider would emit).
- Out of scope for v1: record-from-live capture (mentioned in the
  ticket; needs an HTTP-fixture story we'll write when there's demand).

`#2493 — eval { } DSL`:

- `agents_engine/testing/EvalDsl.kt`. Builder-DSL for typed eval cases.
- `eval<IN, OUT>("name") { input(...); expect { ... } }` — typed
  predicates over `OUT`, not string matching.
- `expectSnapshot(snapshot)` — pins the rendered `toLlmInput(output)`
  JSON against a known string. Diff on regression.
- `expectFieldEquals(fieldPath, expected)` — single-field JSON
  substring check, no full snapshot.
- Multiple `expect` blocks compose — all must pass; failure reports
  name each failing label and renders the typed output for diagnosis.
- Agent invocation exceptions captured as hard failures (the case
  can't evaluate without an output).
- `evalSuite("name") { + case; + case; ... }.runAll(agent)` bundles
  cases. Suites are meant to be type-homogeneous over the agent type,
  but this is not compiler-enforced: cases are stored as
  `EvalCase<*, *>` and `runAll` casts unchecked, so a case whose types
  do not match the agent only fails at run time.
- `EvalResult.failureMessage` is null on pass, structured on fail —
  drops straight into `assertTrue(result.passed) { result.failureMessage }`
  in JUnit tests.

Tests:
- DeterministicModelClientTest.kt (6 cases): scripted text response;
  multi-turn tool round-trip; requests recording; exhaustion error;
  remaining(); byte-determinism across two runs against the same
  script.
- EvalDslTest.kt (9 cases): passing predicate; multi-expect (mix of
  pass/fail); invocation error capture; snapshot pass; snapshot fail
  with typed diff; expectFieldEquals; suite mode; input(...) required;
  expect(...) required.

Full suite: 1772 tests across 7 modules, 0 failures.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../testing/DeterministicModelClient.kt       | 106 ++++++++
 .../kotlin/agents_engine/testing/EvalDsl.kt   | 244 ++++++++++++++++++
 .../testing/DeterministicModelClientTest.kt   | 115 +++++++++
 .../agents_engine/testing/EvalDslTest.kt      | 197 ++++++++++++++
 4 files changed, 662 insertions(+)
 create mode 100644 src/main/kotlin/agents_engine/testing/DeterministicModelClient.kt
 create mode 100644 src/main/kotlin/agents_engine/testing/EvalDsl.kt
 create mode 100644 src/test/kotlin/agents_engine/testing/DeterministicModelClientTest.kt
 create mode 100644 src/test/kotlin/agents_engine/testing/EvalDslTest.kt
diff --git a/src/main/kotlin/agents_engine/testing/DeterministicModelClient.kt b/src/main/kotlin/agents_engine/testing/DeterministicModelClient.kt
new file mode 100644
index 0000000..8cd22d3
--- /dev/null
+++ b/src/main/kotlin/agents_engine/testing/DeterministicModelClient.kt
@@ -0,0 +1,106 @@
+package agents_engine.testing
+
+import agents_engine.model.JsonSchema
+import agents_engine.model.LlmMessage
+import agents_engine.model.LlmResponse
+import agents_engine.model.ModelClient
+
+/**
+ * `agents_engine/testing/DeterministicModelClient.kt` — reproducible eval
+ * harness without live providers (#2492, part of the #2491 eval epic).
+ *
+ * A [ModelClient] that hands back a pre-scripted sequence of [LlmResponse]s
+ * in order, one per `chat` call. Test code constructs an agent with this
+ * client and asserts on the full agentic-loop output without any network,
+ * tokeniser noise, or model nondeterminism.
+ *
+ * ```kotlin
+ * val mock = DeterministicModelClient(
+ *     LlmResponse.ToolCalls(listOf(ToolCall("lookup", mapOf("id" to "42")))),
+ *     LlmResponse.Text("the answer is 42"),
+ * )
+ * val agent = agent<String, String>("test-agent") {
+ *     model { ollama("test"); client = mock }
+ *     tools { tool("lookup", "look up id") { args -> "value-${args["id"]}" } }
+ *     skills { skill<String, String>("respond", "") { tools("lookup") } }
+ * }
+ * agent("go") // → "the answer is 42"
+ * ```
+ *
+ * **Streaming.** Uses the default `ModelClient.chatStream` implementation,
+ * which wraps `chat` into the same Started → ArgumentsDelta → Finished →
+ * End chunk sequence a native streaming provider would emit. Tests that
+ * assert on the streaming AgentEvent flow get the right shape automatically;
+ * tests that need finer-grained chunk replay (e.g. for provider-specific
+ * mid-tool-call edge cases) should write a custom flow.
+ *
+ * **Exhaustion.** If the agent calls `chat` more times than there are
+ * scripted responses, the client throws [DeterministicScriptExhausted]
+ * naming the call index. The overrunning call is still recorded in
+ * [requests]; the schema-taking `chat` overload ignores its schema.
+ *
+ * **Thread-safety.** Calls advance an internal counter; concurrent use
+ * from multiple threads is undefined (production-shape agentic loops are
+ * single-flight per session, so this matches real usage).
+ *
+ * **Record-from-live.** Out of scope for v1. The ticket (#2492) mentions
+ * "record-once/replay-many"; that needs an HTTP-fixture story we'll write
+ * when there's demand. For now: hand-script the responses.
+ */
+class DeterministicModelClient(
+    private val scripted: List<LlmResponse>,
+) : ModelClient {
+
+    constructor(vararg responses: LlmResponse) : this(responses.toList())
+
+    private var callIndex: Int = 0
+    private val recordedRequests: MutableList<List<LlmMessage>> = mutableListOf()
+
+    /**
+     * Every `messages` list passed to `chat` so far, in call order — all
+     * turns, including a call that overran the script. Each read returns
+     * a fresh copy, so a list already obtained is not changed by later calls.
+     */
+    val requests: List<List<LlmMessage>>
+        get() = recordedRequests.toList()
+
+    /**
+     * How many scripted responses remain unconsumed. Tests asserting "the
+     * loop terminated after exactly N turns" can check `remaining() == 0`
+     * after running the agent.
+     */
+    fun remaining(): Int = (scripted.size - callIndex).coerceAtLeast(0)
+
+    override fun chat(messages: List<LlmMessage>): LlmResponse {
+        recordedRequests += messages.toList()
+        if (callIndex >= scripted.size) {
+            throw DeterministicScriptExhausted(
+                callIndex = callIndex,
+                scriptSize = scripted.size,
+                lastMessages = messages,
+            )
+        }
+        return scripted[callIndex++]
+    }
+
+    override fun chat(messages: List<LlmMessage>, jsonSchema: JsonSchema?): LlmResponse =
+        chat(messages)
+}
+
+/**
+ * Thrown by [DeterministicModelClient] when the agent calls `chat` more
+ * times than there are scripted responses. [callIndex] is the zero-based
+ * index of the overrunning call, [scriptSize] the number of scripted
+ * responses, and [lastMessages] the message list of that call. The message
+ * reports the index, the script size, the list size, and the last role.
+ */
+class DeterministicScriptExhausted(
+    val callIndex: Int,
+    val scriptSize: Int,
+    val lastMessages: List<LlmMessage>,
+) : IllegalStateException(
+    "DeterministicModelClient script exhausted at call index $callIndex " +
+        "(script has $scriptSize responses). The agent's loop tried to ask the model " +
+        "for another turn but no response was scripted. Last message list had ${lastMessages.size} " +
+        "messages; last role = ${lastMessages.lastOrNull()?.role}.",
+)
diff --git a/src/main/kotlin/agents_engine/testing/EvalDsl.kt b/src/main/kotlin/agents_engine/testing/EvalDsl.kt
new file mode 100644
index 0000000..bb25e26
--- /dev/null
+++ b/src/main/kotlin/agents_engine/testing/EvalDsl.kt
@@ -0,0 +1,244 @@
+package agents_engine.testing
+
+import agents_engine.core.Agent
+import agents_engine.generation.toLlmInput
+
+/**
+ * `agents_engine/testing/EvalDsl.kt` — declarative eval cases over an
+ * agent's typed `OUT` (#2493, part of the #2491 eval epic).
+ *
+ * ```kotlin
+ * val case = eval<String, Review>("repo-review") {
+ *     input("review this repository")
+ *     expect { it.risks.size >= 3 }
+ *     expectFieldEquals("approved", true)  // rendered JSON has "approved":true
+ * }
+ *
+ * val result = case.run(reviewAgent)
+ * assertTrue(result.passed) { result.failureMessage }
+ * ```
+ *
+ * **Typed assertions.** Expectations run against the agent's typed `OUT`,
+ * not string-matching. The lambda receives the resolved output and
+ * returns true/false. Multiple `expect` blocks compose: all must pass.
+ *
+ * **Snapshot mode.** `expectSnapshot(snapshot = "...")` compares the
+ * rendered `toLlmInput(output)` JSON with a string the caller keeps in
+ * source. Nothing is recorded on first run; the harness never writes a
+ * snapshot. Pairs well with the deterministic-replay ModelClient so the
+ * snapshot is stable across CI runs.
+ *
+ * **Integration with CI.** `evalSuite("name") { + case; + case; ... }`
+ * groups cases. `runAll` returns an [EvalSuiteResult] with per-case
+ * results; CI wraps it in a normal test method that fails when any case
+ * fails. No new task/runner needed.
+ *
+ * Pairs with [DeterministicModelClient] for the no-network requirement
+ * — eval cases against a live model are nondeterministic and out of
+ * scope; live-model regression coverage goes through the existing
+ * `live-llm` / `live-cloud-api` tagged tests.
+ */
+class EvalCase<IN, OUT>(
+    val name: String,
+    internal val input: IN,
+    internal val expectations: List<EvalExpectation<OUT>>,
+) {
+    /**
+     * Run this case against [agent]. A throwing agent yields a result with
+     * `invocationError` set and no outcomes; a throwing expectation becomes
+     * a failed [EvalOutcome] naming the exception, and later ones still run.
+     */
+    fun run(agent: Agent<IN, OUT>): EvalResult<OUT> {
+        val output = try {
+            agent(input)
+        } catch (t: Throwable) {
+            return EvalResult(
+                caseName = name,
+                output = null,
+                outcomes = emptyList(),
+                invocationError = t,
+            )
+        }
+        val outcomes = expectations.map { expectation ->
+            try {
+                val passed = expectation.check(output)
+                EvalOutcome(expectation.label, passed, failureDetail = if (passed) null else expectation.describe(output))
+            } catch (t: Throwable) {
+                EvalOutcome(expectation.label, false, failureDetail = "expectation threw: ${t::class.simpleName}: ${t.message}")
+            }
+        }
+        return EvalResult(caseName = name, output = output, outcomes = outcomes, invocationError = null)
+    }
+}
+
+/** A labelled predicate over an agent's `OUT`, plus a describer that renders the failure text. */
+class EvalExpectation<OUT>(
+    val label: String,
+    private val predicate: (OUT) -> Boolean,
+    private val describer: (OUT) -> String = { "expectation failed for output $it" },
+) {
+    fun check(output: OUT): Boolean = predicate(output)
+    fun describe(output: OUT): String = describer(output)
+}
+
+/** Builder DSL for [EvalCase]. */
+class EvalCaseBuilder<IN, OUT> {
+    private var input: IN? = null
+    private var inputProvided: Boolean = false
+    private val expectations: MutableList<EvalExpectation<OUT>> = mutableListOf()
+
+    /** Set the agent input. Required — calling [build] without it throws. */
+    fun input(value: IN) {
+        input = value
+        inputProvided = true
+    }
+
+    /**
+     * Typed predicate over `OUT`. The [label] surfaces on failure reports
+     * so multi-expect cases are diagnosable.
+     */
+    fun expect(label: String = "expect", predicate: (OUT) -> Boolean) {
+        expectations += EvalExpectation(label, predicate) { out ->
+            "[$label] failed for output: ${renderForFailure(out)}"
+        }
+    }
+
+    /**
+     * Snapshot expectation — renders `toLlmInput(output)` and compares it
+     * for exact string equality with [snapshot]. Useful for pinning a
+     * known-good typed output structurally without spelling out every field.
+     *
+     * The harness never writes or updates a snapshot: keep the expected
+     * string in source and edit it by hand when the output is meant to
+     * change. A mismatch reports both the expected and the actual text.
+     */
+    fun expectSnapshot(label: String = "snapshot", snapshot: String) {
+        expectations += EvalExpectation(
+            label = label,
+            predicate = { out -> toLlmInput(out) == snapshot },
+            describer = { out ->
+                "[$label] snapshot mismatch:\n  expected: $snapshot\n  actual:   ${toLlmInput(out)}"
+            },
+        )
+    }
+
+    /**
+     * Field-level expectation for `@Generable` outputs. Passes when the
+     * rendered JSON contains `"fieldPath":value` followed by a JSON
+     * delimiter, so expecting `4` does not match `42`. This is a textual
+     * match, not JSONPath: [fieldPath] is a single key name. For anything
+     * richer use [expect] on the typed `OUT`.
+     */
+    fun expectFieldEquals(fieldPath: String, expected: Any?) {
+        // The literal `"key":value` pair, then a lookahead requiring a JSON
+        // delimiter (or end of input), so a numeric or keyword value cannot
+        // match as a mere prefix of a longer token (4 vs 42, 4 vs 4.5).
+        val literal = Regex.escape("\"$fieldPath\":${renderJsonValue(expected)}")
+        val pair = Regex("$literal(?=[,}\\]\\s]|\\z)")
+        expectations += EvalExpectation(
+            label = "$fieldPath == $expected",
+            // Assumes the canonical JSON has no space after the colon.
+            predicate = { out -> pair.containsMatchIn(toLlmInput(out)) },
+            describer = { out ->
+                "[field $fieldPath] expected $expected; output rendered as ${toLlmInput(out)}"
+            },
+        )
+    }
+
+    internal fun build(name: String): EvalCase<IN, OUT> {
+        check(inputProvided) { "eval(\"$name\") { } requires an input(...) call." }
+        check(expectations.isNotEmpty()) { "eval(\"$name\") { } requires at least one expect(...) block." }
+        @Suppress("UNCHECKED_CAST")
+        return EvalCase(name, input as IN, expectations.toList())
+    }
+
+    private fun renderForFailure(out: OUT): String =
+        try { toLlmInput(out) } catch (_: Throwable) { out?.toString() ?: "null" }
+}
+
+/**
+ * Build an [EvalCase]. `IN`/`OUT` are fixed at this call (e.g. `eval<String, Review>(...)`)
+ * and must match the agent later passed to `case.run(agent)`.
+ */
+fun <IN, OUT> eval(name: String, block: EvalCaseBuilder<IN, OUT>.() -> Unit): EvalCase<IN, OUT> {
+    val builder = EvalCaseBuilder<IN, OUT>()
+    builder.block()
+    return builder.build(name)
+}
+
+/** Outcome of one expectation: its [label], whether it [passed], and a [failureDetail] that is null on pass. */
+data class EvalOutcome(
+    val label: String,
+    val passed: Boolean,
+    val failureDetail: String?,
+)
+
+/** Result of running an [EvalCase]: [output] plus [outcomes], or [invocationError] when the agent threw. */
+data class EvalResult<OUT>(
+    val caseName: String,
+    val output: OUT?,
+    val outcomes: List<EvalOutcome>,
+    val invocationError: Throwable?,
+) {
+    val passed: Boolean get() = invocationError == null && outcomes.all { it.passed }
+
+    val failureMessage: String?
+        get() = when {
+            passed -> null
+            invocationError != null ->
+                "eval case \"$caseName\" failed: agent threw ${invocationError::class.simpleName}: ${invocationError.message}"
+            else -> {
+                val fails = outcomes.filterNot { it.passed }
+                "eval case \"$caseName\" failed:\n${fails.joinToString("\n") { "  - ${it.label}: ${it.failureDetail}" }}"
+            }
+        }
+}
+
+/** A bag of [EvalCase]s runnable together. */
+class EvalSuite(val name: String) {
+    private val cases: MutableList<EvalCase<*, *>> = mutableListOf()
+
+    operator fun <IN, OUT> EvalCase<IN, OUT>.unaryPlus() {
+        cases += this
+    }
+
+    /**
+     * Run every case against the [agent]. Cases are stored as
+     * `EvalCase<*, *>` and cast unchecked here, so a case whose types do not
+     * match the agent still compiles; the mismatch only surfaces at run time.
+     */
+    @Suppress("UNCHECKED_CAST")
+    fun <IN, OUT> runAll(agent: Agent<IN, OUT>): EvalSuiteResult<OUT> {
+        val results = cases.map { case -> (case as EvalCase<IN, OUT>).run(agent) }
+        return EvalSuiteResult(name = name, results = results)
+    }
+}
+
+/** Result of running an [EvalSuite]: per-case [results]; [failureSummary] is null when every case passed. */
+data class EvalSuiteResult<OUT>(
+    val name: String,
+    val results: List<EvalResult<OUT>>,
+) {
+    val passed: Boolean get() = results.all { it.passed }
+    val failureSummary: String?
+        get() = if (passed) null else results
+            .filterNot { it.passed }
+            .joinToString("\n") { it.failureMessage ?: "(unknown failure in ${it.caseName})" }
+}
+
+/** Build an [EvalSuite] named [name]; inside [block], register cases with unary plus (`+ case`). */
+fun evalSuite(name: String, block: EvalSuite.() -> Unit): EvalSuite =
+    EvalSuite(name).apply(block)
+
+/**
+ * Render a JSON value for the simple `expectFieldEquals` substring match.
+ * Booleans / numbers / null render unquoted; everything else is quoted via
+ * `toString()` with `\`, `"`, LF, CR, TAB escaped (assumed to match `toJsonString`).
+ */
+private fun renderJsonValue(value: Any?): String = when (value) {
+    null -> "null"
+    is Boolean, is Number -> value.toString()
+    else -> "\"" + value.toString()
+        .replace("\\", "\\\\").replace("\"", "\\\"")
+        .replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t") + "\""
+}
diff --git a/src/test/kotlin/agents_engine/testing/DeterministicModelClientTest.kt b/src/test/kotlin/agents_engine/testing/DeterministicModelClientTest.kt
new file mode 100644
index 0000000..2f5be3c
--- /dev/null
+++ b/src/test/kotlin/agents_engine/testing/DeterministicModelClientTest.kt
@@ -0,0 +1,115 @@
+package agents_engine.testing
+
+import agents_engine.core.agent
+import agents_engine.model.LlmResponse
+import agents_engine.model.Tool
+import agents_engine.model.ToolCall
+import org.junit.jupiter.api.assertThrows
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+
+/**
+ * #2492 — DeterministicModelClient. Pins:
+ *
+ * 1. Scripted responses returned in order — agent loop is byte-identical
+ *    across runs against the same script.
+ * 2. The client records every request the agent built up — useful for
+ *    asserting on conversation shape.
+ * 3. Exhaustion throws a clear error naming the call index.
+ * 4. `remaining()` lets a test pin "agent consumed exactly N turns."
+ */
+class DeterministicModelClientTest {
+
+    @Test
+    fun `scripted text response is returned to the agent`() {
+        val mock = DeterministicModelClient(LlmResponse.Text("hello back"))
+        // An `implementedBy` skill is non-agentic and would never call the
+        // model, so a tools-driven skill is used to exercise the mock.
+        val b = agent<String, String>("b") {
+            model { ollama("t"); client = mock }
+            skills { skill<String, String>("s", "") { tools() } }
+        }
+        assertEquals("hello back", b("any"))
+
+        // The single scripted response went to exactly one chat call.
+        assertEquals(1, mock.requests.size, "exactly one chat call was made")
+        assertEquals(0, mock.remaining(), "the scripted response was consumed")
+    }
+
+    @Test
+    fun `multi-turn tool round trip plays out scripted responses in order`() {
+        val mock = DeterministicModelClient(
+            LlmResponse.ToolCalls(listOf(ToolCall("lookup", mapOf("id" to "42")))),
+            LlmResponse.Text("found 42"),
+        )
+        val a = agent<String, String>("two-turn") {
+            lateinit var lookup: Tool<Map<String, Any?>, Any?>
+            model { ollama("t"); client = mock }
+            tools { lookup = tool("lookup", "lookup by id") { args -> "value-${args["id"]}" } }
+            skills { skill<String, String>("s", "") { tools(lookup) } }
+        }
+        assertEquals("found 42", a("go"))
+        assertEquals(0, mock.remaining(), "both scripted responses consumed")
+    }
+
+    @Test
+    fun `requests records each chat call's message list`() {
+        val mock = DeterministicModelClient(LlmResponse.Text("done"))
+        val a = agent<String, String>("recorder") {
+            model { ollama("t"); client = mock }
+            skills { skill<String, String>("s", "") { tools() } }
+        }
+        a("hello world")
+        assertEquals(1, mock.requests.size)
+        val firstCallMessages = mock.requests.first()
+        // The agent's loop sends system + user at minimum.
+        assertTrue(firstCallMessages.any { it.role == "user" && it.content == "hello world" })
+    }
+
+    @Test
+    fun `exhausted script throws DeterministicScriptExhausted with call index`() {
+        // Only one response, but the agent needs two turns (tool call → text).
+        val mock = DeterministicModelClient(
+            LlmResponse.ToolCalls(listOf(ToolCall("step", emptyMap()))),
+        )
+        val a = agent<String, String>("exhausting") {
+            lateinit var step: Tool<Map<String, Any?>, Any?>
+            model { ollama("t"); client = mock }
+            tools { step = tool("step", "step once") { _ -> "ok" } }
+            skills { skill<String, String>("s", "") { tools(step) } }
+        }
+        val ex = assertThrows<DeterministicScriptExhausted> { a("go") }
+        assertEquals(1, ex.callIndex, "first scripted response consumed; second call exhausts")
+        assertEquals(1, ex.scriptSize)
+    }
+
+    @Test
+    fun `remaining reports unconsumed scripted responses`() {
+        val mock = DeterministicModelClient(listOf("first", "second", "third").map { LlmResponse.Text(it) })
+        assertEquals(3, mock.remaining(), "nothing consumed before the first call")
+        mock.chat(emptyList())
+        assertEquals(2, mock.remaining(), "each chat call consumes one response")
+        repeat(2) { mock.chat(emptyList()) }
+        assertEquals(0, mock.remaining(), "script fully drained")
+    }
+
+    @Test
+    fun `two runs against the same script produce byte-identical output (byte-determinism AC)`() {
+        // The acceptance criterion: same scripted client + same agent + same input → same output.
+        fun buildAgent(mock: DeterministicModelClient) = agent<String, String>("repro") {
+            lateinit var step: Tool<Map<String, Any?>, Any?>
+            model { ollama("t"); client = mock }
+            tools { step = tool("step", "") { _ -> "ok" } }
+            skills { skill<String, String>("s", "") { tools(step) } }
+        }
+
+        val script = listOf(
+            LlmResponse.ToolCalls(listOf(ToolCall("step", emptyMap()))),
+            LlmResponse.Text("the same output"),
+        )
+        val outA = buildAgent(DeterministicModelClient(script)).invoke("input")
+        val outB = buildAgent(DeterministicModelClient(script)).invoke("input")
+        assertEquals(outA, outB, "byte-identical output across runs")
+    }
+}
diff --git a/src/test/kotlin/agents_engine/testing/EvalDslTest.kt b/src/test/kotlin/agents_engine/testing/EvalDslTest.kt
new file mode 100644
index 0000000..9a93990
--- /dev/null
+++ b/src/test/kotlin/agents_engine/testing/EvalDslTest.kt
@@ -0,0 +1,197 @@
+package agents_engine.testing
+
+import agents_engine.core.agent
+import agents_engine.generation.Generable
+import agents_engine.generation.Guide
+import agents_engine.generation.toLlmInput
+import agents_engine.model.LlmResponse
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertFalse
+import kotlin.test.assertNotNull
+import kotlin.test.assertNull
+import kotlin.test.assertTrue
+
+/**
+ * #2493 — declarative eval cases with typed assertions. Pins:
+ *
+ * 1. `eval { input(...); expect { ... } }` builds a case with typed
+ *    predicates over `OUT`.
+ * 2. Multiple expectations compose — all must pass.
+ * 3. Snapshot mode pins a known typed output structurally.
+ * 4. Failures carry diagnostic messages naming the failing label.
+ * 5. Suite mode bundles cases.
+ * 6. Composition with DeterministicModelClient — full no-network eval.
+ */
+class EvalDslTest {
+
+    @Test
+    fun `passing eval case with typed predicate`() {
+        val mock = DeterministicModelClient(LlmResponse.Text("hello"))
+        val a = agent<String, String>("greet") {
+            model { ollama("t"); client = mock }
+            skills { skill<String, String>("s", "") { tools() } }
+        }
+        val case = eval<String, String>("greet-says-hello") {
+            input("hi")
+            expect("contains hello") { it.contains("hello") }
+        }
+        val result = case.run(a)
+        assertTrue(result.passed, result.failureMessage)
+        assertEquals("hello", result.output)
+    }
+
+    @Test
+    fun `multiple expectations all must pass`() {
+        // Two cases against fresh agents — DeterministicModelClient is single-use per agent.
+        fun greetAgent(text: String) = agent<String, String>("greet") {
+            model { ollama("t"); client = DeterministicModelClient(LlmResponse.Text(text)) }
+            skills { skill<String, String>("s", "") { tools() } }
+        }
+        val passing = eval<String, String>("multi-pass") {
+            input("hi")
+            expect("nonempty") { it.isNotEmpty() }
+            expect("starts with hello") { it.startsWith("hello") }
+        }
+        assertTrue(passing.run(greetAgent("hello world")).passed)
+
+        val failing = eval<String, String>("multi-fail") {
+            input("hi")
+            expect("nonempty") { it.isNotEmpty() }
+            expect("starts with goodbye") { it.startsWith("goodbye") }
+        }
+        val result = failing.run(greetAgent("hello world"))
+        assertFalse(result.passed)
+        assertEquals(2, result.outcomes.size)
+        assertTrue(result.outcomes[0].passed, "first expectation passed")
+        assertFalse(result.outcomes[1].passed, "second expectation failed")
+        assertTrue("starts with goodbye" in result.failureMessage!!)
+    }
+
+    @Test
+    fun `agent invocation error captured as hard failure`() {
+        val mock = DeterministicModelClient()  // empty script → exhaustion
+        val a = agent<String, String>("explode") {
+            model { ollama("t"); client = mock }
+            skills { skill<String, String>("s", "") { tools() } }
+        }
+        val case = eval<String, String>("explode") {
+            input("trigger")
+            expect("never reached") { true }
+        }
+        val result = case.run(a)
+        assertFalse(result.passed)
+        assertNotNull(result.invocationError, "agent throw captured")
+        assertTrue("explode" in result.failureMessage!!, "case name in message")
+    }
+
+    @Test
+    fun `snapshot expectation passes when toLlmInput output matches`() {
+        val mock = DeterministicModelClient(LlmResponse.Text("""{"text":"Hello","approved":true}"""))
+        val a = agent<String, Review>("review") {
+            model { ollama("t"); client = mock }
+            skills { skill<String, Review>("s", "") { tools() } }
+        }
+        // The expected snapshot is the toLlmInput rendering of the Review
+        // the model returned. For text-typed outputs the LLM JSON is the
+        // raw text we shouldn't render through toLlmInput; for typed @Generable
+        // outputs the parser deserializes the JSON first and toLlmInput
+        // re-serializes structurally.
+        val sample = Review(text = "Hello", approved = true)
+        val expectedSnapshot = toLlmInput(sample)
+        val case = eval<String, Review>("review-snapshot") {
+            input("review")
+            expectSnapshot(snapshot = expectedSnapshot)
+        }
+        val result = case.run(a)
+        assertTrue(result.passed, result.failureMessage)
+    }
+
+    @Test
+    fun `snapshot expectation fails with a typed diff on mismatch`() {
+        val mock = DeterministicModelClient(LlmResponse.Text("""{"text":"Goodbye","approved":false}"""))
+        val a = agent<String, Review>("review") {
+            model { ollama("t"); client = mock }
+            skills { skill<String, Review>("s", "") { tools() } }
+        }
+        val wrongSnapshot = toLlmInput(Review(text = "Hello", approved = true))
+        val case = eval<String, Review>("review-snapshot-mismatch") {
+            input("review")
+            expectSnapshot(snapshot = wrongSnapshot)
+        }
+        val result = case.run(a)
+        assertFalse(result.passed)
+        val msg = result.failureMessage!!
+        assertTrue("snapshot mismatch" in msg, "message names the kind of failure: $msg")
+        assertTrue("expected:" in msg && "actual:" in msg, "diff shape preserved: $msg")
+    }
+
+    @Test
+    fun `expectFieldEquals matches a single field without spelling out full snapshot`() {
+        val mock = DeterministicModelClient(LlmResponse.Text("""{"text":"Hi","approved":true}"""))
+        val a = agent<String, Review>("review") {
+            model { ollama("t"); client = mock }
+            skills { skill<String, Review>("s", "") { tools() } }
+        }
+        val case = eval<String, Review>("approved-true") {
+            input("any")
+            expectFieldEquals("approved", true)
+        }
+        val result = case.run(a)
+        assertTrue(result.passed, result.failureMessage)
+    }
+
+    @Test
+    fun `eval suite runs all cases and reports per-case results`() {
+        val mockA = DeterministicModelClient(LlmResponse.Text("first"))
+        val agentA = agent<String, String>("a") {
+            model { ollama("t"); client = mockA }
+            skills { skill<String, String>("s", "") { tools() } }
+        }
+        val suite = evalSuite("greeting-suite") {
+            + eval<String, String>("nonempty") {
+                input("hi")
+                expect("nonempty") { it.isNotEmpty() }
+            }
+            + eval<String, String>("equals first") {
+                input("hi")
+                expect("eq first") { it == "first" }
+            }
+        }
+        // Both cases share one agent whose script holds a single response, so the
+        // first case consumes it and the second hits script exhaustion.
+        val result = suite.runAll(agentA)
+        assertEquals("greeting-suite", result.name)
+        assertEquals(listOf(true, false), result.results.map { it.passed })
+        assertNotNull(result.results[1].invocationError, "second case exhausted the script")
+        assertFalse(result.passed, "suite fails when any case fails")
+    }
+
+    @Test
+    fun `eval case requires an input call`() {
+        val ex = kotlin.runCatching {
+            eval<String, String>("missing-input") {
+                expect("any") { true }
+            }
+        }.exceptionOrNull()
+        assertNotNull(ex)
+        assertTrue("input" in ex.message!!, "error names the missing call: ${ex.message}")
+    }
+
+    @Test
+    fun `eval case requires at least one expect block`() {
+        val ex = kotlin.runCatching {
+            eval<String, String>("missing-expect") {
+                input("anything")
+            }
+        }.exceptionOrNull()
+        assertNotNull(ex)
+        assertTrue("expect" in ex.message!!, "error names the missing call: ${ex.message}")
+    }
+
+    @Generable("A repository review summary used by the eval doc example.")
+    data class Review(
+        @Guide("Plain-text body of the review.") val text: String,
+        @Guide("Whether the review approves the change.") val approved: Boolean,
+    )
+}

From ce05bda4ad3eeffb3e4b951a96022b81e8028a01 Mon Sep 17 00:00:00 2001
From: skobeltsyn <Konstantin@skobeltsyn.com>
Date: Sat, 30 May 2026 11:20:19 +0300
Subject: [PATCH 2/4] =?UTF-8?q?docs(#2491):=20eval=20harness=20=E2=80=94?=
 =?UTF-8?q?=20user-facing=20doc,=20internals=20adjunct,=20README,=20CHANGE?=
 =?UTF-8?q?LOG?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- docs/eval.md (new) — user-facing eval doc. DeterministicModelClient
  walked through with the request-history + remaining()
  + exhaustion-error contract; the three expectation styles (typed
  predicate / snapshot / expectFieldEquals); suite mode with the
  type-homogeneity constraint; failure shape; the
  no-network-end-to-end composition pattern; and the v1-scope
  deferrals (record-from-live, per-token chunks).
- src/main/resources/internals-agent/testing/EvalHarness.md (new) —
  IDE-side LLM adjunct covering both files in one place (eval-harness
  is conceptually a single unit). Signatures, composition story,
  failure modes, scope.
- README.md — adds an "Eval harness" bullet under "Implemented today"
  between the public snapshot/resume and prompt-caching bullets.
- CHANGELOG.md `## [Unreleased]` — opens with two entries under
  "Eval harness (#2491 epic, in progress)" — #2492
  DeterministicModelClient and #2493 eval { } DSL — with the v1
  scope notes inline.

No source changes. Full suite stays at 1772 / 0 failures from the
prior commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                                  |   5 +
 README.md                                     |   1 +
 docs/eval.md                                  | 165 ++++++++++++++++++
 .../internals-agent/testing/EvalHarness.md    |  84 +++++++++
 4 files changed, 255 insertions(+)
 create mode 100644 docs/eval.md
 create mode 100644 src/main/resources/internals-agent/testing/EvalHarness.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4033de5..d5b6961 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to Agents.KT are documented here. The format follows [Keep a
 
 ## [Unreleased]
 
+### Added — Eval harness (#2491 epic, in progress)
+
+- **`DeterministicModelClient` (#2492)** — `agents_engine.testing.DeterministicModelClient(scripted: List<LlmResponse>)` (or vararg ctor) hands back pre-scripted responses one per `chat` call. No network, byte-deterministic. `requests` records every message list the agent built up; `remaining()` reports unconsumed responses. Exhaustion throws `DeterministicScriptExhausted(callIndex, scriptSize, lastMessages)`. Streaming uses the default `ModelClient.chatStream` wrap. Out of scope for v1: record-from-live HTTP capture (mentioned in the ticket — needs an HTTP-fixture story we'll write when there's demand) and per-token chunk replay.
+- **`eval { }` DSL (#2493)** — `agents_engine.testing.eval<IN, OUT>("name") { input(...); expect { ... } }` builds a typed eval case. Three expectation styles: `expect("label") { predicate }` (typed predicate over `OUT`), `expectSnapshot(snapshot = "...")` (pin canonical `toLlmInput(output)` JSON; diff on regression), `expectFieldEquals(field, value)` (single-field substring on rendered JSON). Multiple expects compose — all must pass. `EvalResult.failureMessage` is null on pass, structured on fail with per-expectation diagnostics. `evalSuite("name") { + case; + case }.runAll(agent)` bundles cases; type-homogeneous over the agent type at call time (mixed-shape suite is a compile error). Composes with `DeterministicModelClient` for fully reproducible end-to-end agentic-loop eval against typed `OUT`. See [docs/eval.md](docs/eval.md).
+
 ## [0.6.4] — 2026-05-30
 
 **"Trust patch."** Outside auditor reviewed 0.6.3 at 7.5/10 with the verdict *"useful hardening release, but not a repositioning release."* 0.6.4 is the deliberate response: boring on features, focused on closing every real boundary gap the audit found. The tagline:
diff --git a/README.md b/README.md
index 7d1364f..c54bdfb 100644
--- a/README.md
+++ b/README.md
@@ -154,6 +154,7 @@ These APIs work in `main`, are unit-tested, and are exercised by integration tes
 - **Tool error recovery** — per-tool `onError`, per-skill default, agent default; built-in `escalate` and `throwException` agents. See [docs/error-recovery.md](docs/error-recovery.md).
 - **Budget controls** — `budget { maxTurns; maxToolCalls; maxDuration; perToolTimeout; maxTokens; maxConsecutiveSameTool }` (`perToolTimeout` covers regular and session-aware tools; token counts cumulative across turns when the provider reports usage; `maxConsecutiveSameTool` catches LLM retry loops on a broken tool) (#637, #963, #969, #1903). `onBudgetExceeded { reason, currentLimit -> BudgetDecision.Extend(newLimit) }` raises a cap and continues instead of throwing — a long-running agent can grant itself more tool calls mid-run rather than failing (#2412). `BudgetDecision.Checkpoint` (#2749) is the third variant — pause at the cap, deliver a `SessionSnapshot` via the registered `onTurnCheckpoint` hook, throw a recoverable `BudgetCheckpointException`, and resume later via `agent.invokeSuspendResuming(input, resumeFrom = snapshot)` once the human approves a raise (no history replay).
 - **Public snapshot / resume** — `agent.invokeSuspendResuming(input, resumeFrom = null, onTurnCheckpoint = null)` (#2749) is the public seam over the internal `executeAgentic(resumeFrom, onTurnCheckpoint)` primitives from #2416. With defaults it matches `invokeSuspend(input)` byte-for-byte; with `onTurnCheckpoint` set it captures a `SessionSnapshot` at every turn boundary; with `resumeFrom = snapshot` it continues an in-flight invocation without replaying history. On resume the loop honors `max(snapshot.toolCallLimit, agent.budget.maxToolCalls)` so a rebuilt agent with a raised cap actually picks it up.
+- **Eval harness** — `DeterministicModelClient(LlmResponse.Text("..."), LlmResponse.ToolCalls(...))` (#2492) scripts model responses for reproducible eval without a live provider; the streaming flow folds into the same Started → ArgsDelta → Finished → End chunk sequence a native streaming provider would emit. Typed assertion DSL `eval<IN, OUT>("name") { input(...); expect { ... }; expectSnapshot(...) }` (#2493) runs against the parsed `OUT` — not regex on the wire. Snapshot mode pins `toLlmInput(output)` JSON for structural diffs; `evalSuite { + case; + case }` bundles cases. See [docs/eval.md](docs/eval.md).
 - **Prompt caching across providers** — `agent { caching { enabled = true; cacheSystemPrompt = true; cacheToolDefs = true; cacheConversation = Rolling; ttl = 1.hours; cacheable("doc-id") { ... } } }`. Vendor-neutral DSL drives Anthropic's explicit `cache_control` breakpoints (#2658), OpenAI / DeepSeek automatic prefix caching with a stable `prompt_cache_key` routing hint (#2659 / #2661), Ollama / vLLM / SGLang engine-level KV-cache reuse (no-op hints, #2662), and surfaces cache reads + writes + hit-rate on `TokenUsage` (#2663). A prefix-stability guard (#2657) detects silent cache-busters — timestamps, UUIDs, non-deterministic ordering inside cacheable segments — and warns before you pay for a single non-cached run. Off by default; non-breaking. See [docs/caching.md](docs/caching.md).
 - **JSONL audit exporter** — `:agents-kt-observability` writes append-only, one-line-per-event audit rows with `requestId`, `sessionId`, `manifestHash`, agent/skill/tool ids, event type, provider, and model; raw arguments/results are omitted by default (#1914). See [docs/observability.md](docs/observability.md).
 - **ObservabilityBridge adapters** — `.observe(OtelBridge(tracer))` maps runtime events to OTel spans (#1908), `.observe(LangSmithBridge(apiKey, project))` maps the same events to LangSmith run trees (#1909), and `.observe(LangfuseBridge(publicKey, secretKey))` maps them to Langfuse traces, generations, spans, and events (#1910), while keeping core vendor-free. See [docs/observability.md](docs/observability.md).
diff --git a/docs/eval.md b/docs/eval.md
new file mode 100644
index 0000000..c61a401
--- /dev/null
+++ b/docs/eval.md
@@ -0,0 +1,165 @@
+[← Back to README](../README.md)
+
+# Eval harness
+
+Two pieces ship today, layered:
+
+- **`DeterministicModelClient`** (#2492) — a `ModelClient` that scripts responses, no network. Pairs with any agent so you can run the full agentic loop deterministically.
+- **`eval { }` DSL** (#2493) — declarative cases with typed assertions over the agent's `OUT`. Supports per-field checks, full structural snapshots, and grouped suites.
+
+Both live in package `agents_engine.testing` and ship in the main module — usable from any consumer's test source set without an extra artifact.
+
+---
+
+## `DeterministicModelClient`
+
+Hand back a pre-scripted sequence of `LlmResponse`s, one per `chat` call. The agent's loop runs end-to-end against the script, with the same Started → ArgsDelta → Finished → End chunk sequence on the streaming side (the default `ModelClient.chatStream` wraps `chat`).
+
+```kotlin
+import agents_engine.testing.DeterministicModelClient
+import agents_engine.model.LlmResponse
+import agents_engine.model.ToolCall
+
+val mock = DeterministicModelClient(
+    LlmResponse.ToolCalls(listOf(ToolCall("lookup", mapOf("id" to "42")))),
+    LlmResponse.Text("found 42"),
+)
+val agent = agent<String, String>("test") {
+    model { ollama("t"); client = mock }
+    tools { tool("lookup", "lookup") { args -> "value-${args["id"]}" } }
+    skills { skill<String, String>("s", "") { tools("lookup") } }
+}
+
+agent("what is 42?")    // → "found 42"
+mock.remaining()        // → 0 (both scripted responses consumed)
+mock.requests           // List<List<LlmMessage>> — every `chat` call's input
+```
+
+### What you get
+
+- **Byte-determinism.** Two runs against the same script + same agent + same input produce identical output.
+- **Request history.** `mock.requests` records every message list the agent built up across turns. Useful for asserting on conversation shape.
+- **Clear exhaustion errors.** If the agent calls `chat` more times than there are scripted responses, the client throws `DeterministicScriptExhausted(callIndex, scriptSize, lastMessages)` naming the offending turn.
+
+### Out of scope (v1)
+
+- **Record-from-live.** The #2492 ticket mentions "record-once/replay-many." That needs an HTTP-fixture story we'll write when there's demand. For now: hand-script the responses or compose with a recording-decorator pattern in your own test code.
+- **Per-token streaming chunks.** `chatStream` uses the default chunk-from-chat wrap — good enough for asserting on the streaming `AgentEvent` shape, not useful for testing provider-specific mid-stream edge cases.
+
+---
+
+## `eval { }` DSL
+
+Declarative cases with typed predicates over the agent's `OUT`.
+
+```kotlin
+import agents_engine.testing.eval
+
+val case = eval<String, Review>("repo-review") {
+    input("review this repository")
+    expect("nonempty risks") { it.risks.isNotEmpty() }
+    expect("at least 3 risks") { it.risks.size >= 3 }
+}
+
+val result = case.run(reviewAgent)
+assertTrue(result.passed) { result.failureMessage }
+```
+
+### Three expectation styles
+
+```kotlin
+// 1. Typed predicate — runs against the parsed OUT, not a string.
+expect("approved") { it.approved == true }
+
+// 2. Snapshot — pins the canonical toLlmInput(output) JSON.
+expectSnapshot(snapshot = """{"text":"Hello","approved":true}""")
+
+// 3. Single-field substring on the rendered JSON — quick for one field.
+expectFieldEquals("approved", true)
+```
+
+All three compose: multiple `expect` blocks must all pass for the case to pass. The failure message names every failing label and renders the typed output for diagnosis.
+
+### Suite mode
+
+Group cases:
+
+```kotlin
+import agents_engine.testing.evalSuite
+
+class GreetingEvalTest {
+    @Test
+    fun `greeting suite`() {
+        val suite = evalSuite("greeting") {
+            + eval<String, String>("nonempty") {
+                input("hi")
+                expect("nonempty") { it.isNotEmpty() }
+            }
+            + eval<String, String>("polite") {
+                input("hi")
+                expect("contains hello") { "hello" in it.lowercase() }
+            }
+        }
+        val result = suite.runAll(greetingAgent)
+        assertTrue(result.passed) { result.failureSummary }
+    }
+}
+```
+
+Suites are **type-homogeneous over the agent type at call time** — `EvalSuite.runAll<IN, OUT>(agent: Agent<IN, OUT>)` binds the case types at the call site. A mixed-shape suite is a compile error.
+
+### Failure shape
+
+`EvalResult.failureMessage` is `null` on pass, structured on fail:
+
+```
+eval case "multi-fail" failed:
+  - starts with goodbye: [starts with goodbye] failed for output: "hello world"
+```
+
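+When the agent throws during invocation, the result carries `invocationError` and the message names the exception. Use as `assertTrue(result.passed) { result.failureMessage }` with JUnit 5 (lazy message supplier), or as `assertTrue(result.passed, result.failureMessage)` with kotlin-test, whose `assertTrue` takes the message as a plain `String?`.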
+
+---
+
+## Composition: deterministic eval end-to-end
+
+```kotlin
+class RepoReviewEvalTest {
+    @Test
+    fun `repo review hits the audit criteria`() {
+        val mock = DeterministicModelClient(
+            LlmResponse.Text("""{"text":"All good","approved":true,"risks":[]}"""),
+        )
+        val agent = agent<String, Review>("review") {
+            model { ollama("test"); client = mock }
+            skills { skill<String, Review>("review", "") { tools() } }
+        }
+        val case = eval<String, Review>("approved-no-risks") {
+            input("review the repo")
+            expect("approved") { it.approved }
+            expect("no risks") { it.risks.isEmpty() }
+        }
+        val result = case.run(agent)
+        assertTrue(result.passed, result.failureMessage)
+    }
+}
+```
+
+The combination of `DeterministicModelClient` + `eval { }` gives you:
+
+- No network, no live LLM, no nondeterminism.
+- Typed assertions against the agent's `OUT` (not regex on the wire).
+- Pinning the model's response in source — when the prompt or schema changes, you update the script *and* the snapshot in the same diff.
+
+For real-model regression coverage there's the existing `live-llm` / `live-cloud-api` tagged tests; those are nondeterministic by design and out of scope for the eval harness.
+
+---
+
+## Related docs
+
+- [`docs/testing.md`](testing.md) — existing testing conventions (task names, integration test setup, mutation testing).
+- [`docs/observability.md`](observability.md) — the bridges that consume `AgentEvent` and `PipelineEvent` — useful when you're asserting on the streaming flow during eval.
+
+Sources: `agents_engine/testing/DeterministicModelClient.kt`, `agents_engine/testing/EvalDsl.kt`.
+
+Tests: `DeterministicModelClientTest.kt`, `EvalDslTest.kt`.
diff --git a/src/main/resources/internals-agent/testing/EvalHarness.md b/src/main/resources/internals-agent/testing/EvalHarness.md
new file mode 100644
index 0000000..49b03d4
--- /dev/null
+++ b/src/main/resources/internals-agent/testing/EvalHarness.md
@@ -0,0 +1,84 @@
+---
+description: Source-file knowledge for agents_engine/testing/DeterministicModelClient.kt and agents_engine/testing/EvalDsl.kt — eval harness (#2491 / #2492 / #2493). DeterministicModelClient is a ModelClient that scripts LlmResponses in order, fails fast on exhaustion (DeterministicScriptExhausted), records every requests list for assertions, byte-deterministic. eval<IN,OUT>(name) { input + expect + expectSnapshot + expectFieldEquals } DSL produces a typed EvalCase whose .run(agent) returns EvalResult(output, outcomes, invocationError). evalSuite(name) { + case + case } bundles cases. Composes for no-network eval — DeterministicModelClient + eval together give reproducible end-to-end assertions over Agent<IN,OUT>. Out of scope v1: record-from-live HTTP capture, per-token streaming chunk replay. Call when reasoning about deterministic test patterns or typed-assertion eval cases.
+---
+
+# `agents_engine/testing/*` — eval harness
+
+Two cooperating pieces in package `agents_engine.testing`:
+
+## `DeterministicModelClient`
+
+```kotlin
+class DeterministicModelClient(scripted: List<LlmResponse>) : ModelClient {
+    constructor(vararg responses: LlmResponse)
+    val requests: List<List<LlmMessage>>    // every chat() call's input
+    fun remaining(): Int                    // unconsumed responses
+    override fun chat(messages: List<LlmMessage>): LlmResponse
+}
+
+class DeterministicScriptExhausted(val callIndex: Int, val scriptSize: Int, val lastMessages: List<LlmMessage>)
+    : IllegalStateException(...)
+```
+
+Scripts LlmResponses in order, one per chat() call. Streaming uses the default `ModelClient.chatStream` wrap — single-flow Started → ArgsDelta → Finished → End for tool-call responses, TextDelta + End for text responses. Thread-safety: undefined under concurrent use (production loops are single-flight per session).
+
+## `eval { }` DSL
+
+```kotlin
+fun <IN, OUT> eval(name: String, block: EvalCaseBuilder<IN, OUT>.() -> Unit): EvalCase<IN, OUT>
+
+class EvalCaseBuilder<IN, OUT> {
+    fun input(value: IN)
+    fun expect(label: String = "expect", predicate: (OUT) -> Boolean)
+    fun expectSnapshot(label: String = "snapshot", snapshot: String)
+    fun expectFieldEquals(fieldPath: String, expected: Any?)
+}
+
+class EvalCase<IN, OUT> {
+    fun run(agent: Agent<IN, OUT>): EvalResult<OUT>
+}
+
+data class EvalResult<OUT>(val caseName, val output, val outcomes, val invocationError) {
+    val passed: Boolean
+    val failureMessage: String?
+}
+
+fun evalSuite(name: String, block: EvalSuite.() -> Unit): EvalSuite
+
+class EvalSuite {
+    operator fun <IN, OUT> EvalCase<IN, OUT>.unaryPlus()
+    fun <IN, OUT> runAll(agent: Agent<IN, OUT>): EvalSuiteResult<OUT>
+}
+```
+
+## Composition
+
+`DeterministicModelClient` + `eval { }` ⇒ no-network reproducible eval. The model returns scripted responses; the eval case runs typed predicates on the agent's parsed `OUT`. Both run inside JUnit / kotlin-test alongside the normal suite; no new task or runner needed.
+
+## Three expectation styles
+
+| API | Use when |
+|---|---|
+| `expect("label") { predicate }` | Typed access to the parsed `OUT`. Most general; reflection-free. |
+| `expectSnapshot(snapshot = "...")` | Pin a full `toLlmInput(output)` JSON — diff on regression. |
+| `expectFieldEquals(field, value)` | Quick check on one field's rendered JSON value, no full snapshot. |
+
+All compose — multiple expects in one case must all pass. Failure messages name each failing label and render the typed output.
+
+## Failure modes
+
+- Agent threw mid-invocation: `EvalResult.invocationError` is non-null; `outcomes` is empty. `failureMessage` names the exception class + message + case name.
+- Expectation predicate returned false: per-outcome entry with `failureDetail` set.
+- Predicate itself threw: per-outcome entry with `failureDetail = "expectation threw: ..."`.
+
+## Out of scope (v1)
+
+- **Record-from-live** capture (#2492 ticket mentions it; needs HTTP-fixture infra).
+- **Per-token chunk replay** (current streaming uses default ChatChunk wrap).
+- **JSONPath in `expectFieldEquals`** (substring match on canonical JSON — good enough for typical fields; complex queries go through `expect { }`, which gives direct typed access to `OUT`).
+
+## Related files
+
+- `agents_engine/core/Agent.kt` — the agent that consumes the mock + receives the eval input.
+- `agents_engine/model/ModelClient.kt` — the SAM interface DeterministicModelClient implements.
+- `agents_engine/generation/GenerableSupport.kt` — `toLlmInput` used by snapshot + field expectations to render the typed `OUT`.

From b0e71501ad70a0bdf3b05a99fecabb84f1bb0f5c Mon Sep 17 00:00:00 2001
From: skobeltsyn <Konstantin@skobeltsyn.com>
Date: Sat, 30 May 2026 11:26:50 +0300
Subject: [PATCH 3/4] =?UTF-8?q?feat(#2494):=20LLM-as-judge=20scorer=20?=
 =?UTF-8?q?=E2=80=94=20opt-in,=20advisory,=20never=20gates=20pass/fail?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#2494 — completes the #2491 eval epic. Adds an opt-in judge for
criteria that resist deterministic assertion (tone, relevance,
completeness). Explicitly advisory by design.

```kotlin
val rubric = JudgeRubric(
    criteria = "Tone: warm, professional, no jargon.",
    judgeModel = DeterministicModelClient(
        LlmResponse.Text("""{"score":8,"rationale":"clear and warm"}"""),
    ),
)
val case = eval<String, Review>("repo-review") {
    input(spec)
    expect("approved") { it.approved }          // ← gates pass/fail
    judge("tone", rubric)                       // ← advisory; never gates
}
val result = case.run(agent)
result.passed                                   // ← depends ONLY on expect blocks
result.judgeVerdicts["tone"]                    // ← JudgeOutcome.Scored(JudgeVerdict)
result.judgeSummary                             // ← "[advisory] tone: 8 — clear and warm"
```

Implementation:

- `agents_engine/testing/LlmJudge.kt` (new):
  * `JudgeRubric(criteria, scoreRange = 0..10, judgeModel)` — typed
    rubric config. The judge model is independent of the production
    agent's model — for unit tests use `DeterministicModelClient`;
    for live eval use a pinned cloud model.
  * `JudgeVerdict(score: Int, rationale: String)` — `@Generable` so
    the judge model returns structured JSON that the framework parses
    through the existing `fromLlmOutput` pipeline. No free-text
    judge prompts → free-text verdicts.
  * Internal `LlmJudge(rubric).score(input, output)` — renders a
    system prompt + user message ("Input: X, Output: Y"), invokes
    the judge model, parses the verdict, validates `score` is in
    `rubric.scoreRange`, returns the typed verdict.

- `agents_engine/testing/EvalDsl.kt` (extended):
  * `EvalCaseBuilder.judge(label, rubric)` — registers an advisory
    scorer. Duplicate labels fail fast at builder time.
  * `EvalCase` carries an immutable `judges: List<JudgeBinding>`.
    Runs each after the agent succeeds; judges do NOT run when the
    agent invocation itself fails (no output to score).
  * `EvalResult.judgeVerdicts: Map<String, JudgeOutcome>` — captured
    verdicts keyed by label. Sealed `JudgeOutcome { Scored(verdict)
    | Errored(detail) }` — parse failures or out-of-range scores
    surface as `Errored` but never gate `passed`.
  * `EvalResult.passed` and `EvalResult.failureMessage` consider
    ONLY deterministic `outcomes` and `invocationError`. Judges are
    structurally excluded from the gating contract.
  * `EvalResult.judgeSummary: String` — multi-line `[advisory]
    <label>: <score> — <rationale>` per-judge summary for test
    reports. Marked `[advisory]` so report consumers don't confuse
    judges with the deterministic pass/fail.

Tests (LlmJudgeTest.kt — 8 cases):
- Verdict captured on `judgeVerdicts[label]` as Scored
- Low judge score does NOT fail the case (advisory only)
- Judge parse error surfaces as Errored, doesn't gate pass/fail
- Out-of-range score surfaces as Errored
- Multiple judges per case keyed by label
- judgeSummary renders the [advisory] marker
- Judges do not run on agent invocation failure
- Duplicate judge labels fail fast at builder time

Eval epic (#2491) now feature-complete: deterministic mocks (#2492)
+ typed eval DSL (#2493) + advisory judge (#2494) ship as one
cohesive `agents_engine.testing` package.

Full suite: 1780 tests across 7 modules, 0 failures.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../kotlin/agents_engine/testing/EvalDsl.kt   |  87 +++++++-
 .../kotlin/agents_engine/testing/LlmJudge.kt  | 138 +++++++++++++
 .../agents_engine/testing/LlmJudgeTest.kt     | 190 ++++++++++++++++++
 3 files changed, 413 insertions(+), 2 deletions(-)
 create mode 100644 src/main/kotlin/agents_engine/testing/LlmJudge.kt
 create mode 100644 src/test/kotlin/agents_engine/testing/LlmJudgeTest.kt

diff --git a/src/main/kotlin/agents_engine/testing/EvalDsl.kt b/src/main/kotlin/agents_engine/testing/EvalDsl.kt
index bb25e26..fa0477d 100644
--- a/src/main/kotlin/agents_engine/testing/EvalDsl.kt
+++ b/src/main/kotlin/agents_engine/testing/EvalDsl.kt
@@ -42,11 +42,23 @@ class EvalCase<IN, OUT>(
     val name: String,
     internal val input: IN,
     internal val expectations: List<EvalExpectation<OUT>>,
+    /**
+     * #2494 — optional LLM judges. Run after the agent succeeds; their
+     * verdicts surface on [EvalResult.judgeVerdicts] for the report but
+     * NEVER gate [EvalResult.passed]. Empty by default.
+     */
+    internal val judges: List<JudgeBinding> = emptyList(),
 ) {
     /**
      * Run this case against [agent], collecting expectation results.
      * Captures exceptions from the agent invocation as a hard failure
      * (the eval can't proceed without the output).
+     *
+     * After the deterministic expectations resolve, any registered LLM
+     * judges (#2494) score the output advisory. Their verdicts attach
+     * to [EvalResult.judgeVerdicts] but never affect [EvalResult.passed].
+     * A judge that itself throws (model returned garbage, etc.) records
+     * a failure detail on the result without gating pass/fail.
      */
     fun run(agent: Agent<IN, OUT>): EvalResult<OUT> {
         val output = try {
@@ -67,10 +79,32 @@ class EvalCase<IN, OUT>(
                 EvalOutcome(expectation.label, false, failureDetail = "expectation threw: ${t.message}")
             }
         }
-        return EvalResult(caseName = name, output = output, outcomes = outcomes, invocationError = null)
+        // #2494 — advisory judge pass. Errors from the judge itself
+        // (parse failures, out-of-range scores) are captured on the
+        // verdict map but never gate pass/fail.
+        val judgeVerdicts = LinkedHashMap<String, JudgeOutcome>()
+        for (binding in judges) {
+            val outcome = try {
+                val verdict = LlmJudge(binding.rubric).score(input, output)
+                JudgeOutcome.Scored(verdict)
+            } catch (t: Throwable) {
+                JudgeOutcome.Errored(t.message ?: t::class.simpleName ?: "judge threw")
+            }
+            judgeVerdicts[binding.label] = outcome
+        }
+        return EvalResult(
+            caseName = name,
+            output = output,
+            outcomes = outcomes,
+            invocationError = null,
+            judgeVerdicts = judgeVerdicts,
+        )
     }
 }
 
+/** Binding of a judge label to its rubric — set via [EvalCaseBuilder.judge]. */
+data class JudgeBinding(val label: String, val rubric: JudgeRubric)
+
 /** A typed expectation over an agent's `OUT`. */
 class EvalExpectation<OUT>(
     val label: String,
@@ -86,6 +120,7 @@ class EvalCaseBuilder<IN, OUT> {
     private var input: IN? = null
     private var inputProvided: Boolean = false
     private val expectations: MutableList<EvalExpectation<OUT>> = mutableListOf()
+    private val judges: MutableList<JudgeBinding> = mutableListOf()
 
     /** Set the agent input. Required — calling [build] without it throws. */
     fun input(value: IN) {
@@ -145,11 +180,26 @@ class EvalCaseBuilder<IN, OUT> {
         )
     }
 
+    /**
+     * #2494 — register an advisory LLM-as-judge scorer. The judge runs
+     * AFTER the agent succeeds and produces a typed [JudgeVerdict]. The
+     * verdict surfaces on [EvalResult.judgeVerdicts] for the test
+     * report but does NOT gate the case's pass/fail — judges are
+     * advisory by design, distinct from the deterministic `expect`
+     * blocks. Multiple judges per case are allowed; each is keyed by
+     * its [label] in the result map.
+     */
+    fun judge(label: String, rubric: JudgeRubric) {
+        require(label.isNotBlank()) { "judge label must not be blank" }
+        require(judges.none { it.label == label }) { "duplicate judge label: $label" }
+        judges += JudgeBinding(label, rubric)
+    }
+
     internal fun build(name: String): EvalCase<IN, OUT> {
         check(inputProvided) { "eval(\"$name\") { } requires an input(...) call." }
         check(expectations.isNotEmpty()) { "eval(\"$name\") { } requires at least one expect(...) block." }
         @Suppress("UNCHECKED_CAST")
-        return EvalCase(name, input as IN, expectations.toList())
+        return EvalCase(name, input as IN, expectations.toList(), judges.toList())
     }
 
     private fun renderForFailure(out: OUT): String =
@@ -179,6 +229,13 @@ data class EvalResult<OUT>(
     val output: OUT?,
     val outcomes: List<EvalOutcome>,
     val invocationError: Throwable?,
+    /**
+     * #2494 — advisory LLM judge verdicts keyed by the label passed to
+     * `judge(label, rubric)`. Empty when no judges are registered.
+     * NOT considered by [passed] or [failureMessage] — judges are
+     * advisory; deterministic expectations are the gating contract.
+     */
+    val judgeVerdicts: Map<String, JudgeOutcome> = emptyMap(),
 ) {
     val passed: Boolean get() = invocationError == null && outcomes.all { it.passed }
 
@@ -192,6 +249,32 @@ data class EvalResult<OUT>(
                 "eval case \"$caseName\" failed: ${fails.joinToString("\n") { "  - ${it.label}: ${it.failureDetail}" }}"
             }
         }
+
+    /**
+     * #2494 — multi-line summary of advisory judge verdicts. Format is
+     * one line per judge: `[advisory] <label>: <score> — <rationale>`.
+     * Empty string when no judges ran. Marked clearly as advisory so
+     * report consumers don't confuse judges with the deterministic
+     * pass/fail contract.
+     */
+    val judgeSummary: String
+        get() = judgeVerdicts.entries.joinToString("\n") { (label, outcome) ->
+            when (outcome) {
+                is JudgeOutcome.Scored -> "[advisory] $label: ${outcome.verdict.score} — ${outcome.verdict.rationale}"
+                is JudgeOutcome.Errored -> "[advisory] $label: <judge error: ${outcome.errorDetail}>"
+            }
+        }
+}
+
+/**
+ * #2494 — sealed outcome of a single judge invocation. `Scored` carries
+ * the typed verdict; `Errored` captures parse failures or
+ * out-of-range scores from the judge model. Errors here NEVER gate the
+ * case's pass/fail — they just surface in the report.
+ */
+sealed interface JudgeOutcome {
+    data class Scored(val verdict: JudgeVerdict) : JudgeOutcome
+    data class Errored(val errorDetail: String) : JudgeOutcome
 }
 
 /** A bag of [EvalCase]s runnable together. */
diff --git a/src/main/kotlin/agents_engine/testing/LlmJudge.kt b/src/main/kotlin/agents_engine/testing/LlmJudge.kt
new file mode 100644
index 0000000..3da6a25
--- /dev/null
+++ b/src/main/kotlin/agents_engine/testing/LlmJudge.kt
@@ -0,0 +1,138 @@
+package agents_engine.testing
+
+import agents_engine.generation.Generable
+import agents_engine.generation.Guide
+import agents_engine.generation.fromLlmOutput
+import agents_engine.generation.toLlmInput
+import agents_engine.model.LlmMessage
+import agents_engine.model.LlmResponse
+import agents_engine.model.ModelClient
+
+/**
+ * `agents_engine/testing/LlmJudge.kt` — opt-in LLM-as-judge scorer
+ * (#2494, part of the #2491 eval epic).
+ *
+ * **Advisory, not gating.** Judge verdicts capture qualitative criteria
+ * that resist deterministic assertion — tone, relevance, completeness.
+ * They are explicitly separate from the [EvalCase]'s deterministic
+ * `expect { }` blocks: a judge low score never fails a case. The
+ * verdict surfaces on [EvalResult.judgeVerdicts] for the test report
+ * to display alongside the deterministic pass/fail.
+ *
+ * **Typed rubric + structured verdict.** The judge prompt is typed
+ * config ([JudgeRubric]); the verdict is a `@Generable` ([JudgeVerdict])
+ * so the judge model returns structured JSON that the framework parses.
+ * Free-text judge prompts → free-text verdicts are explicitly avoided.
+ *
+ * ```kotlin
+ * val rubric = JudgeRubric(
+ *     criteria = "Tone: professional, calm, neutral; no jargon.",
+ *     judgeModel = DeterministicModelClient(
+ *         LlmResponse.Text("""{"score":7,"rationale":"slightly informal"}"""),
+ *     ),
+ * )
+ * val case = eval<String, Review>("repo-review") {
+ *     input(spec)
+ *     expect("approved") { it.approved }
+ *     judge("tone", rubric)
+ * }
+ * val result = case.run(reviewAgent)
+ * assertTrue(result.passed) { result.failureMessage }       // gated only by `expect`
+ * println(result.judgeVerdicts["tone"])                     // advisory score visible
+ * ```
+ *
+ * **Pinnable model.** The [JudgeRubric.judgeModel] is a regular
+ * [ModelClient] — most often a [DeterministicModelClient] in unit
+ * tests (so the judge itself is reproducible) or a pinned cloud model
+ * in a `live-cloud-api`-tagged eval suite. Either way the judge model
+ * is independent of the production agent's model.
+ *
+ * Pairs with #2491 (eval epic), #2492 (DeterministicModelClient),
+ * #2493 (eval DSL).
+ */
+
+/**
+ * Typed rubric for an LLM-as-judge scoring pass. The framework renders
+ * this as a system prompt for [judgeModel] when the judge runs.
+ *
+ * @property criteria the rubric text shown to the judge model. Be
+ *   specific about what's being scored ("tone: professional and
+ *   neutral" vs. "good answer").
+ * @property scoreRange the integer range judges score within. Defaults
+ *   to `0..10`. Verdict scores outside this range trip a clear error
+ *   in [LlmJudge.score].
+ * @property judgeModel the [ModelClient] that produces the verdict.
+ *   Independent of the production agent's model — use a pinned model
+ *   here. For reproducible unit tests, use [DeterministicModelClient].
+ */
+data class JudgeRubric(
+    val criteria: String,
+    val scoreRange: IntRange = 0..10,
+    val judgeModel: ModelClient,
+)
+
+/**
+ * Structured judge output. `@Generable` so the judge model returns
+ * JSON the framework parses through the existing `fromLlmOutput`
+ * pipeline — no string parsing in test code.
+ *
+ * @property score the integer score within [JudgeRubric.scoreRange].
+ *   Out-of-range scores are rejected by [LlmJudge.score] after parsing.
+ * @property rationale one sentence justifying the score. Surfaces in
+ *   test reports alongside the deterministic outcomes.
+ */
+@Generable("A structured verdict from an LLM-as-judge scoring pass.")
+data class JudgeVerdict(
+    @Guide("Integer score within the rubric's scoreRange.")
+    val score: Int,
+    @Guide("One sentence justifying the score.")
+    val rationale: String,
+)
+
+/**
+ * Runs a [JudgeRubric] over (input, output) pairs and returns a typed
+ * [JudgeVerdict]. Internal — eval cases use the `judge(label, rubric)`
+ * DSL on [EvalCaseBuilder] rather than calling this directly.
+ *
+ * Prompts are joined line by line, not built with `trimIndent()`, so a
+ * multi-line rubric or rendered value cannot disturb the prompt indentation.
+ */
+internal class LlmJudge(private val rubric: JudgeRubric) {
+    fun score(input: Any?, output: Any?): JudgeVerdict {
+        val messages = listOf(
+            LlmMessage(
+                role = "system",
+                content = listOf(
+                    "You are a strict but fair judge.",
+                    "",
+                    "Rubric: ${rubric.criteria}",
+                    "",
+                    "Score the assistant's response on an integer scale in ${rubric.scoreRange.first}..${rubric.scoreRange.last}.",
+                    "Respond ONLY with JSON of the shape: {\"score\": <integer>, \"rationale\": \"<one sentence>\"}.",
+                ).joinToString("\n"),
+            ),
+            LlmMessage(
+                role = "user",
+                content = "Input: ${toLlmInput(input)}\nOutput: ${toLlmInput(output)}",
+            ),
+        )
+        val response = rubric.judgeModel.chat(messages)
+        val text = when (response) {
+            is LlmResponse.Text -> response.content
+            is LlmResponse.ToolCalls -> error(
+                "Judge model returned tool calls instead of a text verdict. " +
+                    "The judge model must produce a JSON object matching JudgeVerdict.",
+            )
+        }
+        val verdict = JudgeVerdict::class.fromLlmOutput(text) as? JudgeVerdict
+            ?: error(
+                "Judge response did not parse as JudgeVerdict. " +
+                    "Expected JSON like {\"score\":7,\"rationale\":\"...\"}; got: $text",
+            )
+        require(verdict.score in rubric.scoreRange) {
+            "Judge returned score ${verdict.score} outside rubric range ${rubric.scoreRange}. " +
+                "Rationale: ${verdict.rationale}"
+        }
+        return verdict
+    }
+}
diff --git a/src/test/kotlin/agents_engine/testing/LlmJudgeTest.kt b/src/test/kotlin/agents_engine/testing/LlmJudgeTest.kt
new file mode 100644
index 0000000..4dd0db6
--- /dev/null
+++ b/src/test/kotlin/agents_engine/testing/LlmJudgeTest.kt
@@ -0,0 +1,190 @@
+package agents_engine.testing
+
+import agents_engine.core.agent
+import agents_engine.model.LlmResponse
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertFalse
+import kotlin.test.assertNotNull
+import kotlin.test.assertNull
+import kotlin.test.assertTrue
+
+/**
+ * #2494 — LLM-as-judge scorer. Pins:
+ *
+ * 1. `judge(label, rubric)` adds an advisory scorer that runs after the
+ *    agent succeeds.
+ * 2. The verdict surfaces on `EvalResult.judgeVerdicts[label]` as a
+ *    `JudgeOutcome.Scored(JudgeVerdict)`.
+ * 3. A low judge score does NOT fail the case — judges are advisory;
+ *    only deterministic `expect` blocks gate pass/fail.
+ * 4. A judge parse error / out-of-range score surfaces as
+ *    `JudgeOutcome.Errored` but still doesn't gate pass/fail.
+ * 5. Multiple judges per case allowed; each keyed by label.
+ * 6. The `judgeSummary` field renders advisory output cleanly with
+ *    the `[advisory]` marker.
+ * 7. Judges run only if the agent itself succeeded (none on invocation error).
+ * 8. Duplicate judge labels fail fast at builder time.
+ */
+class LlmJudgeTest {
+
+    private fun judgeReturning(json: String) = DeterministicModelClient(LlmResponse.Text(json))
+
+    private fun simpleAgent(text: String) = agent<String, String>("a") {
+        model { ollama("t"); client = DeterministicModelClient(LlmResponse.Text(text)) }
+        skills { skill<String, String>("s", "") { tools() } }
+    }
+
+    @Test
+    fun `judge verdict is captured on EvalResult judgeVerdicts`() {
+        val rubric = JudgeRubric(
+            criteria = "Tone: warm and helpful.",
+            judgeModel = judgeReturning("""{"score":8,"rationale":"clear and warm"}"""),
+        )
+        val case = eval<String, String>("tone-check") {
+            input("hi")
+            expect("nonempty") { it.isNotEmpty() }
+            judge("tone", rubric)
+        }
+        val result = case.run(simpleAgent("hello there"))
+        assertTrue(result.passed, "deterministic check passed")
+        val outcome = kotlin.test.assertIs<JudgeOutcome.Scored>(result.judgeVerdicts["tone"])
+        assertEquals(8, outcome.verdict.score)
+        assertEquals("clear and warm", outcome.verdict.rationale)
+    }
+
+    @Test
+    fun `low judge score does NOT fail the case (advisory only)`() {
+        val rubric = JudgeRubric(
+            criteria = "Tone: warm and helpful.",
+            judgeModel = judgeReturning("""{"score":2,"rationale":"cold and clipped"}"""),
+        )
+        val case = eval<String, String>("low-score") {
+            input("hi")
+            expect("nonempty") { it.isNotEmpty() }
+            judge("tone", rubric)
+        }
+        val result = case.run(simpleAgent("k."))
+        // Deterministic expect passed → case passes despite low score
+        assertTrue(result.passed, "judge score 2 must not gate case pass/fail")
+        assertNull(result.failureMessage)
+        val outcome = kotlin.test.assertIs<JudgeOutcome.Scored>(result.judgeVerdicts["tone"])
+        assertEquals(2, outcome.verdict.score)
+    }
+
+    @Test
+    fun `judge errors do not gate pass-fail and surface as Errored`() {
+        val rubric = JudgeRubric(
+            criteria = "Tone.",
+            judgeModel = judgeReturning("not valid json"),
+        )
+        val case = eval<String, String>("bad-judge") {
+            input("hi")
+            expect("nonempty") { it.isNotEmpty() }
+            judge("tone", rubric)
+        }
+        val result = case.run(simpleAgent("hello"))
+        assertTrue(result.passed, "judge parse error does not gate the deterministic pass")
+        val outcome = result.judgeVerdicts["tone"]
+        assertTrue(outcome is JudgeOutcome.Errored, "non-parseable verdict surfaces as Errored")
+    }
+
+    @Test
+    fun `out-of-range score surfaces as Errored, not as Scored`() {
+        val rubric = JudgeRubric(
+            criteria = "Tone.",
+            scoreRange = 0..10,
+            judgeModel = judgeReturning("""{"score":99,"rationale":"out of range"}"""),
+        )
+        val case = eval<String, String>("out-of-range") {
+            input("hi")
+            expect("nonempty") { it.isNotEmpty() }
+            judge("tone", rubric)
+        }
+        val result = case.run(simpleAgent("hello"))
+        assertTrue(result.passed)
+        val outcome = result.judgeVerdicts["tone"]
+        val errored = kotlin.test.assertIs<JudgeOutcome.Errored>(outcome, "out-of-range score is a judge failure mode")
+        assertTrue("99" in errored.errorDetail, "error detail names the offending score: ${errored.errorDetail}")
+    }
+
+    @Test
+    fun `multiple judges per case are keyed by label`() {
+        val toneRubric = JudgeRubric(
+            criteria = "Tone.",
+            judgeModel = judgeReturning("""{"score":7,"rationale":"warm"}"""),
+        )
+        val relevanceRubric = JudgeRubric(
+            criteria = "Relevance to the question.",
+            judgeModel = judgeReturning("""{"score":9,"rationale":"on topic"}"""),
+        )
+        val case = eval<String, String>("multi-judge") {
+            input("hi")
+            expect("nonempty") { it.isNotEmpty() }
+            judge("tone", toneRubric)
+            judge("relevance", relevanceRubric)
+        }
+        val result = case.run(simpleAgent("yes hello"))
+        assertEquals(2, result.judgeVerdicts.size)
+        assertEquals(7, kotlin.test.assertIs<JudgeOutcome.Scored>(result.judgeVerdicts["tone"]).verdict.score)
+        assertEquals(9, kotlin.test.assertIs<JudgeOutcome.Scored>(result.judgeVerdicts["relevance"]).verdict.score)
+    }
+
+    @Test
+    fun `judgeSummary renders advisory marker on every line`() {
+        val rubric = JudgeRubric(
+            criteria = "Tone.",
+            judgeModel = judgeReturning("""{"score":6,"rationale":"acceptable"}"""),
+        )
+        val case = eval<String, String>("summary") {
+            input("hi")
+            expect("nonempty") { it.isNotEmpty() }
+            judge("tone", rubric)
+        }
+        val result = case.run(simpleAgent("hello"))
+        val summary = result.judgeSummary
+        assertTrue("[advisory]" in summary, "summary marks judges as advisory: $summary")
+        assertTrue("tone" in summary)
+        assertTrue("6" in summary)
+        assertTrue("acceptable" in summary)
+    }
+
+    @Test
+    fun `judges do not run when the agent invocation itself fails`() {
+        val rubric = JudgeRubric(
+            criteria = "Tone.",
+            judgeModel = judgeReturning("""{"score":10,"rationale":"never reached"}"""),
+        )
+        val crashingAgent = agent<String, String>("crash") {
+            model { ollama("t"); client = DeterministicModelClient() /* empty script → exhaustion */ }
+            skills { skill<String, String>("s", "") { tools() } }
+        }
+        val case = eval<String, String>("crash-then-judge") {
+            input("hi")
+            expect("never reached") { true }
+            judge("tone", rubric)
+        }
+        val result = case.run(crashingAgent)
+        assertFalse(result.passed)
+        assertNotNull(result.invocationError)
+        assertEquals(emptyMap(), result.judgeVerdicts, "no judges run when the agent didn't return an output")
+    }
+
+    @Test
+    fun `duplicate judge labels fail fast at builder time`() {
+        val rubric = JudgeRubric(
+            criteria = "X.",
+            judgeModel = judgeReturning("""{"score":5,"rationale":"y"}"""),
+        )
+        val ex = kotlin.test.assertFailsWith<Throwable>("a second judge(\"tone\", ...) must be rejected") {
+            eval<String, String>("dup") {
+                input("hi")
+                expect("nonempty") { it.isNotEmpty() }
+                judge("tone", rubric)
+                judge("tone", rubric)
+            }
+        }
+        // Only the message is pinned; the concrete exception type stays an implementation detail.
+        assertTrue("duplicate" in ex.message.orEmpty().lowercase(), "error names the dup case: ${ex.message}")
+    }
+}

From e17eff5d0ce54a93c8d0407616710674533e388c Mon Sep 17 00:00:00 2001
From: skobeltsyn <Konstantin@skobeltsyn.com>
Date: Sat, 30 May 2026 11:29:11 +0300
Subject: [PATCH 4/4] =?UTF-8?q?docs(#2494):=20LLM-as-judge=20=E2=80=94=20e?=
 =?UTF-8?q?val=20doc=20+=20internals=20adjunct=20+=20README=20+=20CHANGELO?=
 =?UTF-8?q?G?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- docs/eval.md — adds a new "LLM-as-judge (advisory)" section after
  DeterministicModelClient and before the eval DSL. Walks through:
  the example with tone scoring; why advisory + opt-in (LLM judges
  are themselves nondeterministic; gating them imports flakiness);
  pinning the judge model (DeterministicModelClient for unit tests,
  pinned cloud model for live eval); the sealed JudgeOutcome
  Scored/Errored failure modes; what happens when the agent itself
  fails (no judges run). Header summary updated to "three pieces."
- src/main/resources/internals-agent/testing/EvalHarness.md — adjunct
  description string updated to cover all three pieces and the
  judge-doesn't-gate constraint explicitly. Code-shape block adds
  the judge() DSL line + JudgeOutcome / JudgeRubric / JudgeVerdict
  types. Failure-modes section gains the judge errored case.
- README.md — extends the "Eval harness" bullet with the optional
  judge(...) sentence and the explicit "judges never gate" callout.
- CHANGELOG.md — adds a third entry under "Eval harness (#2491 epic)"
  for #2494 with the advisory-only semantics, sealed JudgeOutcome,
  judge-model pinning, and agent-failure interaction. Header changes
  from "in progress" to "feature-complete."

No source changes. Full suite stays at 1780 / 0 failures from the
prior commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                                  |  3 +-
 README.md                                     |  2 +-
 docs/eval.md                                  | 63 ++++++++++++++++++-
 .../internals-agent/testing/EvalHarness.md    | 23 +++++--
 4 files changed, 82 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d5b6961..86f9065 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,10 +4,11 @@ All notable changes to Agents.KT are documented here. The format follows [Keep a
 
 ## [Unreleased]
 
-### Added — Eval harness (#2491 epic, in progress)
+### Added — Eval harness (#2491 epic, feature-complete)
 
 - **`DeterministicModelClient` (#2492)** — `agents_engine.testing.DeterministicModelClient(scripted: List<LlmResponse>)` (or vararg ctor) hands back pre-scripted responses one per `chat` call. No network, byte-deterministic. `requests` records every message list the agent built up; `remaining()` reports unconsumed responses. Exhaustion throws `DeterministicScriptExhausted(callIndex, scriptSize, lastMessages)`. Streaming uses the default `ModelClient.chatStream` wrap. Out of scope for v1: record-from-live HTTP capture (mentioned in the ticket — needs an HTTP-fixture story we'll write when there's demand) and per-token chunk replay.
 - **`eval { }` DSL (#2493)** — `agents_engine.testing.eval<IN, OUT>("name") { input(...); expect { ... } }` builds a typed eval case. Three expectation styles: `expect("label") { predicate }` (typed predicate over `OUT`), `expectSnapshot(snapshot = "...")` (pin canonical `toLlmInput(output)` JSON; diff on regression), `expectFieldEquals(field, value)` (single-field substring on rendered JSON). Multiple expects compose — all must pass. `EvalResult.failureMessage` is null on pass, structured on fail with per-expectation diagnostics. `evalSuite("name") { + case; + case }.runAll(agent)` bundles cases; type-homogeneous over the agent type at call time (mixed-shape suite is a compile error). Composes with `DeterministicModelClient` for fully reproducible end-to-end agentic-loop eval against typed `OUT`. See [docs/eval.md](docs/eval.md).
+- **LLM-as-judge scorer (#2494)** — `agents_engine.testing.JudgeRubric(criteria, scoreRange, judgeModel)` + `@Generable JudgeVerdict(score, rationale)`. Opt-in via `eval { ... judge("tone", rubric) }`. Verdicts surface on `EvalResult.judgeVerdicts: Map<String, JudgeOutcome>` keyed by label; sealed `JudgeOutcome { Scored(verdict) | Errored(errorDetail) }` so parse failures or out-of-range scores surface without aborting. `EvalResult.passed` is structurally restricted to deterministic `outcomes` + `invocationError` — judges NEVER gate pass/fail. `EvalResult.judgeSummary` renders `[advisory] <label>: <score> — <rationale>` lines for test reports. The judge model is independent of the production agent's model: use `DeterministicModelClient` for unit tests (so the judge itself is reproducible) or a pinned cloud model for live eval. Judges don't run when the agent itself fails (no output to score). See [docs/eval.md](docs/eval.md).
 
 ## [0.6.4] — 2026-05-30
 
diff --git a/README.md b/README.md
index c54bdfb..ed99b54 100644
--- a/README.md
+++ b/README.md
@@ -154,7 +154,7 @@ These APIs work in `main`, are unit-tested, and are exercised by integration tes
 - **Tool error recovery** — per-tool `onError`, per-skill default, agent default; built-in `escalate` and `throwException` agents. See [docs/error-recovery.md](docs/error-recovery.md).
 - **Budget controls** — `budget { maxTurns; maxToolCalls; maxDuration; perToolTimeout; maxTokens; maxConsecutiveSameTool }` (`perToolTimeout` covers regular and session-aware tools; token counts cumulative across turns when the provider reports usage; `maxConsecutiveSameTool` catches LLM retry loops on a broken tool) (#637, #963, #969, #1903). `onBudgetExceeded { reason, currentLimit -> BudgetDecision.Extend(newLimit) }` raises a cap and continues instead of throwing — a long-running agent can grant itself more tool calls mid-run rather than failing (#2412). `BudgetDecision.Checkpoint` (#2749) is the third variant — pause at the cap, deliver a `SessionSnapshot` via the registered `onTurnCheckpoint` hook, throw a recoverable `BudgetCheckpointException`, and resume later via `agent.invokeSuspendResuming(input, resumeFrom = snapshot)` once the human approves a raise (no history replay).
 - **Public snapshot / resume** — `agent.invokeSuspendResuming(input, resumeFrom = null, onTurnCheckpoint = null)` (#2749) is the public seam over the internal `executeAgentic(resumeFrom, onTurnCheckpoint)` primitives from #2416. With defaults it matches `invokeSuspend(input)` byte-for-byte; with `onTurnCheckpoint` set it captures a `SessionSnapshot` at every turn boundary; with `resumeFrom = snapshot` it continues an in-flight invocation without replaying history. On resume the loop honors `max(snapshot.toolCallLimit, agent.budget.maxToolCalls)` so a rebuilt agent with a raised cap actually picks it up.
-- **Eval harness** — `DeterministicModelClient(LlmResponse.Text("..."), LlmResponse.ToolCalls(...))` (#2492) scripts model responses for reproducible eval without a live provider; the streaming flow folds into the same Started → ArgsDelta → Finished → End chunk sequence a native streaming provider would emit. Typed assertion DSL `eval<IN, OUT>("name") { input(...); expect { ... }; expectSnapshot(...) }` (#2493) runs against the parsed `OUT` — not regex on the wire. Snapshot mode pins `toLlmInput(output)` JSON for structural diffs; `evalSuite { + case; + case }` bundles cases. See [docs/eval.md](docs/eval.md).
+- **Eval harness** — `DeterministicModelClient(LlmResponse.Text("..."), LlmResponse.ToolCalls(...))` (#2492) scripts model responses for reproducible eval without a live provider; the streaming flow folds into the same Started → ArgsDelta → Finished → End chunk sequence a native streaming provider would emit. Typed assertion DSL `eval<IN, OUT>("name") { input(...); expect { ... }; expectSnapshot(...) }` (#2493) runs against the parsed `OUT` — not regex on the wire. Snapshot mode pins `toLlmInput(output)` JSON for structural diffs; `evalSuite { + case; + case }` bundles cases. Optional `judge("tone", rubric)` (#2494) runs an advisory LLM-as-judge scorer with a typed `@Generable` `JudgeVerdict` — explicitly separate from the deterministic pass/fail contract (judges never gate). See [docs/eval.md](docs/eval.md).
 - **Prompt caching across providers** — `agent { caching { enabled = true; cacheSystemPrompt = true; cacheToolDefs = true; cacheConversation = Rolling; ttl = 1.hours; cacheable("doc-id") { ... } } }`. Vendor-neutral DSL drives Anthropic's explicit `cache_control` breakpoints (#2658), OpenAI / DeepSeek automatic prefix caching with a stable `prompt_cache_key` routing hint (#2659 / #2661), Ollama / vLLM / SGLang engine-level KV-cache reuse (no-op hints, #2662), and surfaces cache reads + writes + hit-rate on `TokenUsage` (#2663). A prefix-stability guard (#2657) detects silent cache-busters — timestamps, UUIDs, non-deterministic ordering inside cacheable segments — and warns before you pay for a single non-cached run. Off by default; non-breaking. See [docs/caching.md](docs/caching.md).
 - **JSONL audit exporter** — `:agents-kt-observability` writes append-only, one-line-per-event audit rows with `requestId`, `sessionId`, `manifestHash`, agent/skill/tool ids, event type, provider, and model; raw arguments/results are omitted by default (#1914). See [docs/observability.md](docs/observability.md).
 - **ObservabilityBridge adapters** — `.observe(OtelBridge(tracer))` maps runtime events to OTel spans (#1908), `.observe(LangSmithBridge(apiKey, project))` maps the same events to LangSmith run trees (#1909), and `.observe(LangfuseBridge(publicKey, secretKey))` maps them to Langfuse traces, generations, spans, and events (#1910), while keeping core vendor-free. See [docs/observability.md](docs/observability.md).
diff --git a/docs/eval.md b/docs/eval.md
index c61a401..e6c3701 100644
--- a/docs/eval.md
+++ b/docs/eval.md
@@ -2,12 +2,13 @@
 
 # Eval harness
 
-Two pieces ship today, layered:
+Three pieces ship today, layered:
 
 - **`DeterministicModelClient`** (#2492) — a `ModelClient` that scripts responses, no network. Pairs with any agent so you can run the full agentic loop deterministically.
 - **`eval { }` DSL** (#2493) — declarative cases with typed assertions over the agent's `OUT`. Supports per-field checks, full structural snapshots, and grouped suites.
+- **LLM-as-judge** (#2494) — opt-in advisory scorer for criteria that resist deterministic assertion (tone, relevance, completeness). Typed rubric, structured `JudgeVerdict`, explicitly separate from the deterministic pass/fail contract.
 
-Both live in package `agents_engine.testing` and ship in the main module — usable from any consumer's test source set without an extra artifact.
+All three live in package `agents_engine.testing` and ship in the main module — usable from any consumer's test source set without an extra artifact.
 
 ---
 
@@ -48,6 +49,64 @@ mock.requests           // List<List<LlmMessage>> — every `chat` call's input
 
 ---
 
+## LLM-as-judge (advisory)
+
+For criteria that resist deterministic assertion — tone, relevance, completeness — opt into a `judge`. The judge runs after the agent succeeds and scores the (input, output) pair, producing a typed `@Generable` verdict that surfaces on `EvalResult.judgeVerdicts`. **Judges never gate the case's pass/fail** — only deterministic `expect { }` blocks do.
+
+```kotlin
+import agents_engine.testing.JudgeRubric
+
+val toneRubric = JudgeRubric(
+    criteria = "Tone: warm, professional, no jargon.",
+    judgeModel = DeterministicModelClient(
+        LlmResponse.Text("""{"score":8,"rationale":"clear and warm"}"""),
+    ),
+)
+
+val case = eval<String, Review>("repo-review") {
+    input(spec)
+    expect("approved") { it.approved }       // ← gates pass/fail
+    judge("tone", toneRubric)                // ← advisory only
+}
+
+val result = case.run(reviewAgent)
+result.passed                                // depends ONLY on `expect` blocks
+result.judgeVerdicts["tone"]                 // JudgeOutcome.Scored(JudgeVerdict)
+println(result.judgeSummary)
+// [advisory] tone: 8 — clear and warm
+```
+
+### Why opt-in and advisory
+
+LLM judges are themselves nondeterministic and prompt-sensitive. Treating them as gating regression checks would import the same flakiness the deterministic harness is designed to eliminate. The split is intentional:
+
+- **Deterministic `expect`** ⇒ pass/fail contract. Reproducible across runs.
+- **`judge`** ⇒ qualitative score for the report. Useful as a quality trend over time; never as a fail signal.
+
+### Pinning the judge model
+
+The `judgeModel` in `JudgeRubric` is a regular `ModelClient`:
+
+- **Unit tests:** use `DeterministicModelClient` with a scripted verdict JSON. The judge call itself becomes reproducible.
+- **Live eval:** use a pinned cloud model — explicit version + low temperature. Even then, drift between runs is expected; that's why the judge is advisory.
+
+### Failure modes
+
+`EvalResult.judgeVerdicts` carries `JudgeOutcome` for each registered judge — a sealed type:
+
+| Variant | When |
+|---|---|
+| `JudgeOutcome.Scored(verdict: JudgeVerdict)` | Judge model returned valid JSON; score in range. |
+| `JudgeOutcome.Errored(errorDetail: String)` | Judge model returned non-JSON, or returned a score outside `rubric.scoreRange`. |
+
+Both surface in the report. Neither affects `EvalResult.passed`.
+
+### Judges and agent failures
+
+If the agent invocation itself throws (`EvalResult.invocationError` is set), no judges run — there's no output to score. The `judgeVerdicts` map is empty in that case.
+
+---
+
 ## `eval { }` DSL
 
 Declarative cases with typed predicates over the agent's `OUT`.
diff --git a/src/main/resources/internals-agent/testing/EvalHarness.md b/src/main/resources/internals-agent/testing/EvalHarness.md
index 49b03d4..7078c99 100644
--- a/src/main/resources/internals-agent/testing/EvalHarness.md
+++ b/src/main/resources/internals-agent/testing/EvalHarness.md
@@ -1,10 +1,10 @@
 ---
-description: Source-file knowledge for agents_engine/testing/DeterministicModelClient.kt and agents_engine/testing/EvalDsl.kt — eval harness (#2491 / #2492 / #2493). DeterministicModelClient is a ModelClient that scripts LlmResponses in order, fails fast on exhaustion (DeterministicScriptExhausted), records every requests list for assertions, byte-deterministic. eval<IN,OUT>(name) { input + expect + expectSnapshot + expectFieldEquals } DSL produces a typed EvalCase whose .run(agent) returns EvalResult(output, outcomes, invocationError). evalSuite(name) { + case + case } bundles cases. Composes for no-network eval — DeterministicModelClient + eval together give reproducible end-to-end assertions over Agent<IN,OUT>. Out of scope v1: record-from-live HTTP capture, per-token streaming chunk replay. Call when reasoning about deterministic test patterns or typed-assertion eval cases.
+description: Source-file knowledge for the eval harness in agents_engine/testing/ — #2491 epic (#2492 + #2493 + #2494). Three cooperating pieces in one package. DeterministicModelClient (#2492): scripts LlmResponses in order, fails fast on exhaustion, records requests for assertions, byte-deterministic, default chatStream wrap. eval<IN,OUT>(name) { input + expect + expectSnapshot + expectFieldEquals + judge } DSL (#2493 + #2494): typed predicates, snapshot diff, single-field check; judge(label, rubric) adds an opt-in advisory LLM scorer that does NOT gate pass/fail. EvalResult carries outcomes + invocationError + judgeVerdicts: Map<String, JudgeOutcome> with sealed Scored/Errored variants. evalSuite(name) { + case } bundles cases. JudgeRubric is typed config + a pinnable ModelClient; JudgeVerdict is @Generable so the judge model returns structured JSON. Pass/fail gating is structurally restricted to deterministic expect blocks — judges never gate. Out of scope v1: record-from-live HTTP capture, per-token chunk replay. Call when reasoning about deterministic test patterns, typed-assertion eval cases, or advisory LLM scoring.
 ---
 
 # `agents_engine/testing/*` — eval harness
 
-Two cooperating pieces in package `agents_engine.testing`:
+Three cooperating pieces in package `agents_engine.testing`:
 
 ## `DeterministicModelClient`
 
@@ -32,17 +32,29 @@ class EvalCaseBuilder<IN, OUT> {
     fun expect(label: String = "expect", predicate: (OUT) -> Boolean)
     fun expectSnapshot(label: String = "snapshot", snapshot: String)
     fun expectFieldEquals(fieldPath: String, expected: Any?)
+    fun judge(label: String, rubric: JudgeRubric)    // #2494 — advisory only
 }
 
 class EvalCase<IN, OUT> {
     fun run(agent: Agent<IN, OUT>): EvalResult<OUT>
 }
 
-data class EvalResult<OUT>(val caseName, val output, val outcomes, val invocationError) {
-    val passed: Boolean
+data class EvalResult<OUT>(val caseName, val output, val outcomes, val invocationError,
+                            val judgeVerdicts: Map<String, JudgeOutcome>) {
+    val passed: Boolean              // gated ONLY by outcomes + invocationError; judges excluded
     val failureMessage: String?
+    val judgeSummary: String         // multi-line "[advisory] label: score — rationale"
 }
 
+sealed interface JudgeOutcome {
+    data class Scored(val verdict: JudgeVerdict) : JudgeOutcome
+    data class Errored(val errorDetail: String) : JudgeOutcome
+}
+
+data class JudgeRubric(val criteria: String, val scoreRange: IntRange = 0..10, val judgeModel: ModelClient)
+
+@Generable data class JudgeVerdict(val score: Int, val rationale: String)
+
 fun evalSuite(name: String, block: EvalSuite.() -> Unit): EvalSuite
 
 class EvalSuite {
@@ -67,9 +79,10 @@ All compose — multiple expects in one case must all pass. Failure messages nam
 
 ## Failure modes
 
-- Agent threw mid-invocation: `EvalResult.invocationError` is non-null; `outcomes` is empty. `failureMessage` names the exception class + message + case name.
+- Agent threw mid-invocation: `EvalResult.invocationError` is non-null; `outcomes` is empty; `judgeVerdicts` is empty (no judges run without an output). `failureMessage` names the exception class + message + case name.
 - Expectation predicate returned false: per-outcome entry with `failureDetail` set.
 - Predicate itself threw: per-outcome entry with `failureDetail = "expectation threw: ..."`.
+- Judge model returned non-JSON or out-of-range score: `JudgeOutcome.Errored(errorDetail)` in `judgeVerdicts[label]`. Does NOT gate `passed`. The judge model is responsible for the structured JSON shape — typically a `DeterministicModelClient` in unit tests or a pinned cloud model in live eval.
 
 ## Out of scope (v1)