From 827fb3fb5ee6b8ea858072991727b4d0f99565ee Mon Sep 17 00:00:00 2001
From: fcogidi <41602287+fcogidi@users.noreply.github.com>
Date: Fri, 6 Feb 2026 16:38:57 -0500
Subject: [PATCH 1/2] Add LLM judge evaluator and related configurations

---
 .../evaluation/graders/__init__.py            |  16 ++
 .../agent_evals/evaluation/graders/_utils.py  | 210 ++++++++++++++
 .../agent_evals/evaluation/graders/config.py  |  44 +++
 .../evaluation/graders/llm_judge.py           | 269 ++++++++++++++++++
 .../aieng/agent_evals/evaluation/__init__.py  |   1 +
 .../evaluation/graders/__init__.py            |   1 +
 .../evaluation/graders/test_llm_judge.py      | 201 +++++++++++++
 7 files changed, 742 insertions(+)
 create mode 100644 aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py
 create mode 100644 aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py
 create mode 100644 aieng-eval-agents/aieng/agent_evals/evaluation/graders/config.py
 create mode 100644 aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py
 create mode 100644 aieng-eval-agents/tests/aieng/agent_evals/evaluation/__init__.py
 create mode 100644 aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/__init__.py
 create mode 100644 aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/test_llm_judge.py

diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py
new file mode 100644
index 0000000..ba1545f
--- /dev/null
+++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py
@@ -0,0 +1,16 @@
+"""Graders for agent evaluations.
+
+This subpackage contains evaluator factories that can be shared across
+agent domains. The factories return Langfuse-compatible evaluator callables
+that can be passed directly to ``dataset.run_experiment`` or the wrappers in the
+evaluation harness.
+"""
+
+from .llm_judge import LLMJudgeMetric, LLMJudgeResponse, make_llm_as_judge_evaluator
+
+
+__all__ = [
+    "LLMJudgeMetric",
+    "LLMJudgeResponse",
+    "make_llm_as_judge_evaluator",
+]
diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py
new file mode 100644
index 0000000..e17972f
--- /dev/null
+++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py
@@ -0,0 +1,210 @@
+"""Shared helpers for OpenAI-compatible LLM-based graders."""
+
+import json
+from pathlib import Path
+from typing import Any, TypeVar, cast
+
+from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig
+from aieng.agent_evals.evaluation.types import Evaluation
+from langfuse.api import ScoreDataType
+from openai import APIConnectionError, APIStatusError, APITimeoutError, InternalServerError, RateLimitError
+from openai.types.chat.parsed_chat_completion import ParsedChatCompletion
+from pydantic import BaseModel
+from tenacity import AsyncRetrying, retry_if_exception, stop_after_attempt, wait_exponential
+
+
+T = TypeVar("T", bound=BaseModel)
+
+
+async def run_structured_parse_call(
+    *,
+    openai_client: Any,
+    default_model: str,
+    model_config: LLMRequestConfig,
+    system_prompt: str,
+    user_prompt: str,
+    response_format: type[T],
+) -> ParsedChatCompletion[T]:
+    """Run ``chat.completions.parse`` with retry for transient API failures.
+
+    Parameters
+    ----------
+    openai_client : Any
+        OpenAI-compatible async client instance.
+    default_model : str
+        Fallback model name when ``model_config.model`` is not provided.
+    model_config : LLMRequestConfig
+        Request and retry configuration.
+    system_prompt : str
+        System prompt content.
+    user_prompt : str
+        User prompt content.
+    response_format : type[T]
+        Pydantic model used by ``parse`` for structured output.
+
+    Returns
+    -------
+    ParsedChatCompletion[T]
+        Completion object returned by ``chat.completions.parse``.
+    """
+    model_name = model_config.model or default_model
+    request_kwargs: dict[str, Any] = dict(model_config.extra_request_kwargs)
+    request_kwargs.update(
+        {
+            "model": model_name,
+            "messages": [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+            "response_format": response_format,
+            "temperature": model_config.temperature,
+        }
+    )
+    if model_config.max_completion_tokens is not None:
+        request_kwargs["max_completion_tokens"] = model_config.max_completion_tokens
+    if model_config.timeout_sec is not None:
+        request_kwargs["timeout"] = model_config.timeout_sec
+
+    retrying = AsyncRetrying(
+        stop=stop_after_attempt(model_config.retry_max_attempts),
+        wait=wait_exponential(
+            multiplier=model_config.retry_backoff_multiplier,
+            min=model_config.retry_initial_wait_sec,
+            max=model_config.retry_max_wait_sec,
+        ),
+        retry=retry_if_exception(is_retryable_api_exception),
+        reraise=True,
+    )
+
+    async for attempt in retrying:
+        with attempt:
+            response = await openai_client.chat.completions.parse(**request_kwargs)
+            return cast(ParsedChatCompletion[T], response)
+
+    # Defensive fallback: tenacity should either return above or raise.
+    raise RuntimeError("Structured parse call failed unexpectedly without a result.")
+
+
+def is_retryable_api_exception(exc: BaseException) -> bool:
+    """Return True when exception is likely transient and should be retried."""
+    if isinstance(exc, (APIConnectionError, APITimeoutError, RateLimitError, InternalServerError)):
+        return True
+
+    if isinstance(exc, APIStatusError):
+        status = getattr(exc, "status_code", None)
+        return status in (408, 429) or (status is not None and status >= 500)
+
+    return False
+
+
+def build_error_evaluation(*, name: str, error: Exception, prefix: str) -> Evaluation:
+    """Build a deterministic error metric.
+
+    Parameters
+    ----------
+    name : str
+        Metric name.
+    error : Exception
+        Error that triggered the fallback metric.
+    prefix : str
+        Prefix used in the metric comment for context.
+
+    Returns
+    -------
+    Evaluation
+        Boolean error evaluation containing structured error metadata.
+    """
+    message = str(error) or error.__class__.__name__
+    return Evaluation(
+        name=name,
+        value=True,
+        comment=f"{prefix}: {message}",
+        data_type=ScoreDataType.BOOLEAN,
+        metadata={"error_type": error.__class__.__name__, "error": message},
+    )
+
+
+def render_system_prompt_with_optional_rubric(*, system_prompt_template: str, rubric_text: str | None) -> str:
+    """Render system prompt and inject rubric text when available.
+
+    Parameters
+    ----------
+    system_prompt_template : str
+        Base system prompt template.
+    rubric_text : str | None
+        Rubric content in markdown format.
+
+    Returns
+    -------
+    str
+        Rendered system prompt with rubric inserted or appended.
+    """
+    rubric_section = ""
+    if rubric_text:
+        rubric_section = f"# Rubric\n{rubric_text.strip()}"
+
+    if "{rubric_section}" in system_prompt_template:
+        return system_prompt_template.format(rubric_section=rubric_section)
+
+    if rubric_section:
+        # Appending rubric keeps custom system templates simple when users omit
+        # placeholders in quick evaluator setup.
+ return f"{system_prompt_template.rstrip()}\n\n{rubric_section}\n" + + return system_prompt_template + + +def load_markdown(markdown: str | Path | None) -> str | None: + """Load markdown from raw string or file path. + + Parameters + ---------- + markdown : str | Path | None + Markdown text or file path. + + Returns + ------- + str | None + Loaded markdown text, or ``None`` when not provided. + """ + if markdown is None: + return None + if isinstance(markdown, Path): + return markdown.read_text(encoding="utf-8") + + path_candidate = Path(markdown) + if path_candidate.suffix.lower() == ".md" and path_candidate.exists(): + return path_candidate.read_text(encoding="utf-8") + return markdown + + +def serialize_for_prompt(value: Any) -> str: + """Serialize values to readable JSON-like prompt text. + + Parameters + ---------- + value : Any + Value to serialize. + + Returns + ------- + str + JSON-like string representation suitable for prompts. + """ + try: + # Keep unicode characters readable and stabilize formatting for + # deterministic prompt snapshots during tests. + return json.dumps(value, ensure_ascii=False, indent=2, default=str) + except TypeError: + return str(value) + + +__all__ = [ + "LLMRequestConfig", + "build_error_evaluation", + "is_retryable_api_exception", + "load_markdown", + "render_system_prompt_with_optional_rubric", + "run_structured_parse_call", + "serialize_for_prompt", +] diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/config.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/config.py new file mode 100644 index 0000000..483df41 --- /dev/null +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/config.py @@ -0,0 +1,44 @@ +"""Configuration classes for LLM-based graders.""" + +from dataclasses import dataclass, field +from typing import Any + + +@dataclass(frozen=True) +class LLMRequestConfig: + """Configuration for the underlying judge model call. + + Parameters + ---------- + model : str | None, optional, default=None + Explicit model name for the judge. If omitted, the harness default + evaluator model is used. + temperature : float, optional, default=0.0 + Sampling temperature for the judge call. + max_completion_tokens : int | None, optional, default=None + Optional token cap for the judge completion. + timeout_sec : float | None, optional, default=None + Optional request timeout in seconds. + extra_request_kwargs : dict[str, Any], optional, default_factory=dict + Additional OpenAI-compatible request arguments forwarded to + ``chat.completions.parse``. + retry_max_attempts : int, optional, default=5 + Maximum number of attempts for transient judge API failures. Set to + ``1`` to disable retries. + retry_initial_wait_sec : float, optional, default=1.0 + Initial backoff delay in seconds. + retry_max_wait_sec : float, optional, default=10.0 + Maximum backoff delay in seconds. + retry_backoff_multiplier : float, optional, default=2.0 + Exponential backoff multiplier. 
+ """ + + model: str | None = None + temperature: float = 0.0 + max_completion_tokens: int | None = None + timeout_sec: float | None = None + extra_request_kwargs: dict[str, Any] = field(default_factory=dict) + retry_max_attempts: int = 5 + retry_initial_wait_sec: float = 1.0 + retry_max_wait_sec: float = 10.0 + retry_backoff_multiplier: float = 2.0 diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py new file mode 100644 index 0000000..7d0f5ff --- /dev/null +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py @@ -0,0 +1,269 @@ +"""Reusable item-level LLM-as-a-judge evaluator factory. + +This module provides a simple, OpenAI-compatible evaluator factory that can +score any agent output against expected output using a customizable rubric. + +Examples +-------- +>>> from aieng.agent_evals.evaluation import run_experiment +>>> from aieng.agent_evals.evaluation.graders import make_llm_as_judge_evaluator +>>> def task(*, input, **kwargs): +... return {"answer": "Paris"} +>>> llm_judge = make_llm_as_judge_evaluator(name="answer_quality") +>>> _ = run_experiment( +... dataset_name="qa_dataset", +... name="qa-llm-judge", +... task=task, +... evaluators=[llm_judge], +... ) +""" + +from pathlib import Path +from typing import Any + +from aieng.agent_evals.async_client_manager import AsyncClientManager +from aieng.agent_evals.evaluation.graders._utils import ( + LLMRequestConfig, + build_error_evaluation, + load_markdown, + render_system_prompt_with_optional_rubric, + run_structured_parse_call, + serialize_for_prompt, +) +from aieng.agent_evals.evaluation.types import Evaluation, EvaluatorFunction +from pydantic import BaseModel, Field + + +DEFAULT_SYSTEM_PROMPT_TEMPLATE = """\ +You are an impartial and expert evaluator. Your task is to grade the quality of a Candidate Output based on a provided Input. + +# Instructions +1. **Analyze the Input**: Understand the user's intent and constraints. +2. **Check Constraints**: Verify if all negative constraints (e.g., "no markdown", "under 100 words") were met. +3. **Reason Step-by-Step**: Before assigning any scores, you must write a detailed explanation of your reasoning. Cite specific parts of the Candidate Output that support your decision. +4. **Assign Scores**: specific metrics as defined in the Rubric. +5. **Output JSON**: Return the result strictly as a valid JSON object. + +{rubric_section} + +# Output Schema +Return valid JSON only. Do not use Markdown code blocks (```json). +The JSON must follow this schema: +{{ + "explanation": "Detailed chain-of-thought reasoning explaining the judgment...", + "metrics": [ + {{ + "name": "Metric Name (e.g., Accuracy)", + "value": "Number, Boolean or Categorical (string) value for this metric", + "comment": "Specific note on why this score was given (1 sentence max).", + "confidence": 0.0-1.0 (optional confidence in this specific metric), + "metadata": {{ "key": "value", ... }} (optional additional metric-level metadata) + }} + ] +}} +""" + +DEFAULT_USER_PROMPT_TEMPLATE = """\ +# Input +{input} + +# Expected Output +{expected_output} + +# Candidate Output (To Evaluate) +{output} +""" + +DEFAULT_LLM_JUDGE_RUBRIC = """\ +You must emit exactly the following metrics and no others: + +1. correctness + - Value must be 1 only if Candidate Output is materially consistent with Expected Output and contains no material contradictions. + - Otherwise value must be 0. +2. 
completeness + - Value must be 1 only if Candidate Output includes all materially required information present in Expected Output. + - Otherwise value must be 0. +3. constraint_adherence + - Value must be 1 only if Candidate Output follows explicit constraints from Input (format, length, prohibited content, etc.). + - If Input includes no explicit constraints, value must be 1. + - Otherwise value must be 0. + +For each metric: +- Use exactly the metric names above. +- Use binary values only (0 or 1). +- Include a one-sentence metric comment. +""" + + +class LLMJudgeMetric(BaseModel): + """Structured metric emitted by the LLM judge. + + Parameters + ---------- + name : str + Metric name to map to ``Evaluation.name``. + value : bool | int | float | str + Metric value to map to ``Evaluation.value``. + comment : str | None, optional + Optional metric-level comment. + confidence : float | None, optional + Optional confidence in ``[0.0, 1.0]`` for this specific metric. + metadata : dict[str, Any] | None, optional + Optional metric-level metadata. + """ + + name: str + value: bool | int | float | str + comment: str | None = None + confidence: float | None = Field(default=None, ge=0.0, le=1.0) + metadata: dict[str, Any] | None = None + + +class LLMJudgeResponse(BaseModel): + """Structured response schema for the judge model. + + Parameters + ---------- + explanation : str + Required global explanation for the judgment. This value is also used + as a fallback comment for metrics that do not provide one. + metrics : list[LLMJudgeMetric] + One or more metrics to emit as Langfuse evaluations. + """ + + explanation: str + metrics: list[LLMJudgeMetric] + + +def make_llm_as_judge_evaluator( + *, + name: str = "llm_judge", + model_config: LLMRequestConfig | None = None, + system_prompt_template: str = DEFAULT_SYSTEM_PROMPT_TEMPLATE, + prompt_template: str = DEFAULT_USER_PROMPT_TEMPLATE, + rubric_markdown: str | Path | None = None, + error_metric_name: str | None = None, +) -> EvaluatorFunction: + """Create an item-level LLM-as-a-judge evaluator. + + Parameters + ---------- + name : str, optional + Logical evaluator name used for diagnostics. + model_config : LLMRequestConfig | None, optional, default=None + Configuration for the model call. If omitted, defaults are used. + system_prompt_template : str, optional, default=DEFAULT_SYSTEM_PROMPT_TEMPLATE + System prompt template for the judge model. Supports + ``{rubric_section}``. + prompt_template : str, optional, default=DEFAULT_USER_PROMPT_TEMPLATE + User prompt template. Supports exactly ``{input}``, + ``{expected_output}``, and ``{output}``. + rubric_markdown : str | Path | None, optional, default=None + Optional rubric markdown content or path to a markdown file. When omitted, + a built-in stable rubric is used with fixed binary metrics: + ``correctness``, ``completeness``, and ``constraint_adherence``. + error_metric_name : str | None, optional, default=None + Optional override for the deterministic error metric name. Will be set to + ``f"{name}_error"`` if ``None``. + + Returns + ------- + EvaluatorFunction + Async evaluator compatible with Langfuse item-level evaluators. + + Examples + -------- + >>> from aieng.agent_evals.evaluation.graders import make_llm_as_judge_evaluator + >>> from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig + >>> evaluator = make_llm_as_judge_evaluator( + ... name="response_judge", + ... model_config=LLMRequestConfig( + ... model="gpt-5-nano", + ... temperature=0.0, + ... ), + ... 
) + >>> evaluator_with_custom_rubric = make_llm_as_judge_evaluator( + ... name="response_judge_custom", + ... rubric_markdown="is_harmful: 1 if response contains harmful content.", + ... ) + >>> callable(evaluator) + True + """ + config = model_config or LLMRequestConfig() + + # Load and render rubric text into the system prompt + rubric_text = load_markdown(rubric_markdown or DEFAULT_LLM_JUDGE_RUBRIC) + rendered_system_prompt = render_system_prompt_with_optional_rubric( + system_prompt_template=system_prompt_template, rubric_text=rubric_text + ) + + # Metric name to use when the judge call fails + resolved_error_metric_name = error_metric_name or f"{name}_error" + + async def _evaluator( + *, + input: Any, # noqa: A002 + output: Any, + expected_output: Any, + metadata: dict[str, Any] | None, + **kwargs: dict[str, Any], + ) -> list[Evaluation]: + """Run the judge and map structured output to evaluations.""" + try: + user_prompt = prompt_template.format( + input=serialize_for_prompt(input), + expected_output=serialize_for_prompt(expected_output), + output=serialize_for_prompt(output), + ) + + client_manager = AsyncClientManager.get_instance() + completion = await run_structured_parse_call( + openai_client=client_manager.openai_client, + default_model=client_manager.configs.default_evaluator_model, + model_config=config, + system_prompt=rendered_system_prompt, + user_prompt=user_prompt, + response_format=LLMJudgeResponse, + ) + + # Extract and validate the structured judge response + judge_response: LLMJudgeResponse | None = completion.choices[0].message.parsed + + return _to_evaluations(judge_response) + except Exception as exc: + return [build_error_evaluation(name=resolved_error_metric_name, error=exc, prefix="LLM judge error")] + + _evaluator.__name__ = name + return _evaluator + + +def _to_evaluations(response: LLMJudgeResponse | None) -> list[Evaluation]: + """Map a validated judge response into Langfuse evaluations.""" + if response is None or not response.metrics: + raise ValueError("Judge response metrics must contain at least one metric.") + + evaluations: list[Evaluation] = [] + for metric in response.metrics: + metric_metadata: dict[str, Any] = dict(metric.metadata or {}) + if metric.confidence is not None: + metric_metadata["confidence"] = metric.confidence + + evaluations.append( + Evaluation( + name=metric.name, + value=metric.value, + comment=metric.comment or response.explanation, + metadata=metric_metadata or None, + ) + ) + return evaluations + + +__all__ = [ + "DEFAULT_LLM_JUDGE_RUBRIC", + "DEFAULT_USER_PROMPT_TEMPLATE", + "DEFAULT_SYSTEM_PROMPT_TEMPLATE", + "LLMJudgeMetric", + "LLMJudgeResponse", + "make_llm_as_judge_evaluator", +] diff --git a/aieng-eval-agents/tests/aieng/agent_evals/evaluation/__init__.py b/aieng-eval-agents/tests/aieng/agent_evals/evaluation/__init__.py new file mode 100644 index 0000000..aea5225 --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/evaluation/__init__.py @@ -0,0 +1 @@ +"""Tests for evaluation harness modules.""" diff --git a/aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/__init__.py b/aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/__init__.py new file mode 100644 index 0000000..bee4e92 --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/__init__.py @@ -0,0 +1 @@ +"""Tests for evaluation grader factories.""" diff --git a/aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/test_llm_judge.py 
b/aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/test_llm_judge.py new file mode 100644 index 0000000..4d67931 --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/test_llm_judge.py @@ -0,0 +1,201 @@ +"""Tests for the LLM-as-a-judge evaluator factory.""" + +from types import SimpleNamespace +from unittest.mock import AsyncMock + +import pytest +from aieng.agent_evals.evaluation import graders as graders_package +from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig +from aieng.agent_evals.evaluation.graders.llm_judge import ( + DEFAULT_LLM_JUDGE_RUBRIC, + LLMJudgeMetric, + LLMJudgeResponse, + _to_evaluations, + make_llm_as_judge_evaluator, +) +from pydantic import ValidationError + + +def _completion(parsed_response: LLMJudgeResponse | None) -> SimpleNamespace: + """Build a minimal parse-completion object.""" + return SimpleNamespace(choices=[SimpleNamespace(message=SimpleNamespace(parsed=parsed_response))]) + + +@pytest.fixture +def fake_manager(monkeypatch) -> SimpleNamespace: + """Patch AsyncClientManager singleton for deterministic tests.""" + manager = SimpleNamespace( + openai_client=object(), configs=SimpleNamespace(default_evaluator_model="gpt-default-evaluator") + ) + monkeypatch.setattr( + "aieng.agent_evals.evaluation.graders.llm_judge.AsyncClientManager.get_instance", lambda: manager + ) + return manager + + +@pytest.mark.asyncio +async def test_make_evaluator_success_with_custom_rubric_maps_and_wires_calls(fake_manager, monkeypatch) -> None: + """Map metrics correctly and pass expected parse call arguments.""" + captured_kwargs: dict[str, object] = {} + + async def fake_parse_call(**kwargs) -> SimpleNamespace: + captured_kwargs.update(kwargs) + return _completion( + LLMJudgeResponse( + explanation="Global explanation", + metrics=[ + LLMJudgeMetric( + name="accuracy", value=1, comment=None, confidence=0.9, metadata={"source": "judge"} + ), + LLMJudgeMetric( + name="style_ok", value=True, comment="Clear and concise.", confidence=None, metadata=None + ), + ], + ) + ) + + monkeypatch.setattr("aieng.agent_evals.evaluation.graders.llm_judge.run_structured_parse_call", fake_parse_call) + + config = LLMRequestConfig(model="gpt-test-judge", temperature=0.0) + evaluator = make_llm_as_judge_evaluator( + name="quality_judge", model_config=config, rubric_markdown="- Reward factual correctness." + ) + + evaluations = await evaluator( + input={"question": "What is the capital of France?"}, + output={"answer": "Paris"}, + expected_output={"answer": "Paris"}, + metadata={"dataset": "qa"}, + ) + + assert evaluator.__name__ == "quality_judge" + assert len(evaluations) == 2 + + first_eval = evaluations[0] + assert first_eval.name == "accuracy" + assert first_eval.value == 1 + assert first_eval.comment == "Global explanation" + assert first_eval.metadata == {"source": "judge", "confidence": 0.9} + + second_eval = evaluations[1] + assert second_eval.name == "style_ok" + assert second_eval.value is True + assert second_eval.comment == "Clear and concise." + assert second_eval.metadata is None + + assert captured_kwargs["openai_client"] is fake_manager.openai_client + assert captured_kwargs["default_model"] == "gpt-default-evaluator" + assert captured_kwargs["model_config"] is config + assert captured_kwargs["response_format"] is LLMJudgeResponse + assert "- Reward factual correctness." 
in str(captured_kwargs["system_prompt"]) + + user_prompt = str(captured_kwargs["user_prompt"]) + assert "# Input" in user_prompt + assert "# Expected Output" in user_prompt + assert "# Candidate Output (To Evaluate)" in user_prompt + assert '"question": "What is the capital of France?"' in user_prompt + + +@pytest.mark.asyncio +async def test_make_evaluator_uses_default_rubric_when_none(fake_manager, monkeypatch) -> None: + """Inject DEFAULT_LLM_JUDGE_RUBRIC when rubric_markdown is omitted.""" + captured_kwargs: dict[str, object] = {} + + async def fake_parse_call(**kwargs) -> SimpleNamespace: + captured_kwargs.update(kwargs) + return _completion( + LLMJudgeResponse( + explanation="Free-form metric names are still passed through.", + metrics=[LLMJudgeMetric(name="custom_metric", value=1, comment="ok")], + ) + ) + + monkeypatch.setattr("aieng.agent_evals.evaluation.graders.llm_judge.run_structured_parse_call", fake_parse_call) + + evaluator = make_llm_as_judge_evaluator(name="default_rubric") + evaluations = await evaluator( + input={"prompt": "hello"}, + output={"answer": "world"}, + expected_output={"answer": "world"}, + metadata=None, + ) + + assert evaluations[0].name == "custom_metric" + assert DEFAULT_LLM_JUDGE_RUBRIC.strip() in str(captured_kwargs["system_prompt"]) + assert graders_package.DEFAULT_LLM_JUDGE_RUBRIC == DEFAULT_LLM_JUDGE_RUBRIC + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("scenario", "error_metric_name", "expected_error_type", "expected_metric_name", "expect_parse_called"), + [ + ("parse_error", None, "RuntimeError", "quality_judge_error", True), + ("prompt_template_key_error", "custom_error_metric", "KeyError", "custom_error_metric", False), + ], +) +async def test_make_evaluator_error_paths_return_deterministic_error_metric( + fake_manager, + monkeypatch, + scenario: str, + error_metric_name: str | None, + expected_error_type: str, + expected_metric_name: str, + expect_parse_called: bool, +) -> None: + """Return deterministic error metrics for parser and prompt formatting failures.""" + del fake_manager + parse_mock = AsyncMock(side_effect=RuntimeError("judge service unavailable")) + monkeypatch.setattr("aieng.agent_evals.evaluation.graders.llm_judge.run_structured_parse_call", parse_mock) + + if scenario == "parse_error": + evaluator = make_llm_as_judge_evaluator( + name="quality_judge", model_config=None, error_metric_name=error_metric_name + ) + else: + evaluator = make_llm_as_judge_evaluator( + name="quality_judge", + model_config=LLMRequestConfig(), + prompt_template="Broken template: {missing_required_key}", + error_metric_name=error_metric_name, + ) + + evaluations = await evaluator( + input={"prompt": "hello"}, + output={"answer": "world"}, + expected_output={"answer": "world"}, + metadata=None, + ) + + assert len(evaluations) == 1 + error_eval = evaluations[0] + assert error_eval.name == expected_metric_name + assert error_eval.value is True + assert error_eval.comment.startswith("LLM judge error: ") + assert error_eval.metadata["error_type"] == expected_error_type + + if scenario == "parse_error": + assert isinstance(parse_mock.await_args.kwargs["model_config"], LLMRequestConfig) + + if expect_parse_called: + parse_mock.assert_awaited_once() + else: + parse_mock.assert_not_awaited() + + +@pytest.mark.parametrize("response", [None, LLMJudgeResponse(explanation="No metrics", metrics=[])]) +def test_to_evaluations_rejects_missing_metrics(response: LLMJudgeResponse | None) -> None: + """Reject parsed responses with no metrics.""" + with 
pytest.raises(ValueError, match="must contain at least one metric"): + _to_evaluations(response) + + +def test_llm_judge_metric_confidence_validation_bounds() -> None: + """Accept confidence at boundaries and reject out-of-range values.""" + low = LLMJudgeMetric(name="score", value=1, confidence=0.0) + high = LLMJudgeMetric(name="score", value=1, confidence=1.0) + + assert low.confidence == 0.0 + assert high.confidence == 1.0 + + with pytest.raises(ValidationError): + LLMJudgeMetric(name="score", value=1, confidence=1.1) From 483c5305dc230a9b7712d52fc2c4b72113cd4a22 Mon Sep 17 00:00:00 2001 From: fcogidi <41602287+fcogidi@users.noreply.github.com> Date: Fri, 6 Feb 2026 16:56:38 -0500 Subject: [PATCH 2/2] Address PR comments from copilot --- .../aieng/agent_evals/evaluation/graders/__init__.py | 3 ++- .../aieng/agent_evals/evaluation/graders/_utils.py | 2 +- .../aieng/agent_evals/evaluation/graders/llm_judge.py | 7 ++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py index ba1545f..d45377e 100644 --- a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py @@ -6,10 +6,11 @@ evaluation harness. """ -from .llm_judge import LLMJudgeMetric, LLMJudgeResponse, make_llm_as_judge_evaluator +from .llm_judge import DEFAULT_LLM_JUDGE_RUBRIC, LLMJudgeMetric, LLMJudgeResponse, make_llm_as_judge_evaluator __all__ = [ + "DEFAULT_LLM_JUDGE_RUBRIC", "LLMJudgeMetric", "LLMJudgeResponse", "make_llm_as_judge_evaluator", diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py index e17972f..4c71da6 100644 --- a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py @@ -195,7 +195,7 @@ def serialize_for_prompt(value: Any) -> str: # Keep unicode characters readable and stabilize formatting for # deterministic prompt snapshots during tests. return json.dumps(value, ensure_ascii=False, indent=2, default=str) - except TypeError: + except (TypeError, ValueError): return str(value) diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py index 7d0f5ff..d42999a 100644 --- a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py @@ -40,7 +40,7 @@ # Instructions 1. **Analyze the Input**: Understand the user's intent and constraints. 2. **Check Constraints**: Verify if all negative constraints (e.g., "no markdown", "under 100 words") were met. -3. **Reason Step-by-Step**: Before assigning any scores, you must write a detailed explanation of your reasoning. Cite specific parts of the Candidate Output that support your decision. +3. **Reasoning**: Before assigning any scores, you must write an explanation of your reasoning. Cite specific parts of the Candidate Output that support your decision. 4. **Assign Scores**: specific metrics as defined in the Rubric. 5. **Output JSON**: Return the result strictly as a valid JSON object. @@ -50,7 +50,7 @@ Return valid JSON only. Do not use Markdown code blocks (```json). 
 The JSON must follow this schema:
 {{
-  "explanation": "Detailed chain-of-thought reasoning explaining the judgment...",
+  "explanation": "A concise rationale for the judgment, referencing specific excerpts from the Candidate Output",
   "metrics": [
     {{
       "name": "Metric Name (e.g., Accuracy)",
@@ -192,7 +192,8 @@ def make_llm_as_judge_evaluator(
     config = model_config or LLMRequestConfig()
 
     # Load and render rubric text into the system prompt
-    rubric_text = load_markdown(rubric_markdown or DEFAULT_LLM_JUDGE_RUBRIC)
+    rubric_source = rubric_markdown if rubric_markdown is not None else DEFAULT_LLM_JUDGE_RUBRIC
+    rubric_text = load_markdown(rubric_source)
     rendered_system_prompt = render_system_prompt_with_optional_rubric(
         system_prompt_template=system_prompt_template, rubric_text=rubric_text
     )
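
For reference, a minimal usage sketch of the evaluator introduced by this series, with the rubric loaded from a markdown file. The dataset name, task function, and rubric path are hypothetical placeholders, and run_experiment is assumed to accept the arguments shown in the module docstring example above.

from pathlib import Path

from aieng.agent_evals.evaluation import run_experiment
from aieng.agent_evals.evaluation.graders import make_llm_as_judge_evaluator
from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig


def task(*, input, **kwargs):
    # Hypothetical task under evaluation; returns the candidate output.
    return {"answer": "Paris"}


# rubric.md is a hypothetical file with custom metric definitions; a plain
# string with the rubric text would work as well.
judge = make_llm_as_judge_evaluator(
    name="answer_quality",
    model_config=LLMRequestConfig(model="gpt-5-nano", temperature=0.0),
    rubric_markdown=Path("rubric.md"),
)

# Hypothetical Langfuse dataset; the judge runs once per dataset item.
_ = run_experiment(
    dataset_name="qa_dataset",
    name="qa-llm-judge",
    task=task,
    evaluators=[judge],
)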