From 827fb3fb5ee6b8ea858072991727b4d0f99565ee Mon Sep 17 00:00:00 2001
From: fcogidi <41602287+fcogidi@users.noreply.github.com>
Date: Fri, 6 Feb 2026 16:38:57 -0500
Subject: [PATCH 1/2] Add LLM judge evaluator and related configurations

---
 .../evaluation/graders/__init__.py            |  16 ++
 .../agent_evals/evaluation/graders/_utils.py  | 210 ++++++++++++++
 .../agent_evals/evaluation/graders/config.py  |  44 +++
 .../evaluation/graders/llm_judge.py           | 269 ++++++++++++++++++
 .../aieng/agent_evals/evaluation/__init__.py  |   1 +
 .../evaluation/graders/__init__.py            |   1 +
 .../evaluation/graders/test_llm_judge.py      | 201 +++++++++++++
 7 files changed, 742 insertions(+)
 create mode 100644 aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py
 create mode 100644 aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py
 create mode 100644 aieng-eval-agents/aieng/agent_evals/evaluation/graders/config.py
 create mode 100644 aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py
 create mode 100644 aieng-eval-agents/tests/aieng/agent_evals/evaluation/__init__.py
 create mode 100644 aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/__init__.py
 create mode 100644 aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/test_llm_judge.py

diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py
new file mode 100644
index 0000000..ba1545f
--- /dev/null
+++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py
@@ -0,0 +1,16 @@
+"""Graders for agent evaluations.
+
+This subpackage contains evaluator factories that can be shared across
+agent domains. The factories return Langfuse-compatible evaluator callables
+that can be passed directly to ``dataset.run_experiment`` or the wrappers in the
+evaluation harness.
+"""
+
+from .llm_judge import LLMJudgeMetric, LLMJudgeResponse, make_llm_as_judge_evaluator
+
+
+__all__ = [
+    "LLMJudgeMetric",
+    "LLMJudgeResponse",
+    "make_llm_as_judge_evaluator",
+]
diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py
new file mode 100644
index 0000000..e17972f
--- /dev/null
+++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py
@@ -0,0 +1,210 @@
+"""Shared helpers for OpenAI-compatible LLM-based graders."""
+
+import json
+from pathlib import Path
+from typing import Any, TypeVar, cast
+
+from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig
+from aieng.agent_evals.evaluation.types import Evaluation
+from langfuse.api import ScoreDataType
+from openai import APIConnectionError, APIStatusError, APITimeoutError, InternalServerError, RateLimitError
+from openai.types.chat.parsed_chat_completion import ParsedChatCompletion
+from pydantic import BaseModel
+from tenacity import AsyncRetrying, retry_if_exception, stop_after_attempt, wait_exponential
+
+
+T = TypeVar("T", bound=BaseModel)
+
+
+async def run_structured_parse_call(
+    *,
+    openai_client: Any,
+    default_model: str,
+    model_config: LLMRequestConfig,
+    system_prompt: str,
+    user_prompt: str,
+    response_format: type[T],
+) -> ParsedChatCompletion[T]:
+    """Run ``chat.completions.parse`` with retry for transient API failures.
+
+    Parameters
+    ----------
+    openai_client : Any
+        OpenAI-compatible async client instance.
+    default_model : str
+        Fallback model name when ``model_config.model`` is not provided.
+    model_config : LLMRequestConfig
+        Request and retry configuration.
+    system_prompt : str
+        System prompt content.
+    user_prompt : str
+        User prompt content.
+    response_format : type[T]
+        Pydantic model used by ``parse`` for structured output.
+
+    Returns
+    -------
+    ParsedChatCompletion[T]
+        Completion object returned by ``chat.completions.parse``.
+    """
+    model_name = model_config.model or default_model
+    request_kwargs: dict[str, Any] = dict(model_config.extra_request_kwargs)
+    request_kwargs.update(
+        {
+            "model": model_name,
+            "messages": [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+            "response_format": response_format,
+            "temperature": model_config.temperature,
+        }
+    )
+    if model_config.max_completion_tokens is not None:
+        request_kwargs["max_completion_tokens"] = model_config.max_completion_tokens
+    if model_config.timeout_sec is not None:
+        request_kwargs["timeout"] = model_config.timeout_sec
+
+    retrying = AsyncRetrying(
+        stop=stop_after_attempt(model_config.retry_max_attempts),
+        wait=wait_exponential(
+            multiplier=model_config.retry_backoff_multiplier,
+            min=model_config.retry_initial_wait_sec,
+            max=model_config.retry_max_wait_sec,
+        ),
+        retry=retry_if_exception(is_retryable_api_exception),
+        reraise=True,
+    )
+
+    async for attempt in retrying:
+        with attempt:
+            response = await openai_client.chat.completions.parse(**request_kwargs)
+            return cast(ParsedChatCompletion[T], response)
+
+    # Defensive fallback: tenacity should either return above or raise.
+    raise RuntimeError("Structured parse call failed unexpectedly without a result.")
+
+
+def is_retryable_api_exception(exc: BaseException) -> bool:
+    """Return True when exception is likely transient and should be retried."""
+    if isinstance(exc, (APIConnectionError, APITimeoutError, RateLimitError, InternalServerError)):
+        return True
+
+    if isinstance(exc, APIStatusError):
+        status = getattr(exc, "status_code", None)
+        return status in (408, 429) or (status is not None and status >= 500)
+
+    return False
+
+
+def build_error_evaluation(*, name: str, error: Exception, prefix: str) -> Evaluation:
+    """Build a deterministic error metric.
+
+    Parameters
+    ----------
+    name : str
+        Metric name.
+    error : Exception
+        Error that triggered the fallback metric.
+    prefix : str
+        Prefix used in the metric comment for context.
+
+    Returns
+    -------
+    Evaluation
+        Boolean error evaluation containing structured error metadata.
+    """
+    message = str(error) or error.__class__.__name__
+    return Evaluation(
+        name=name,
+        value=True,
+        comment=f"{prefix}: {message}",
+        data_type=ScoreDataType.BOOLEAN,
+        metadata={"error_type": error.__class__.__name__, "error": message},
+    )
+
+
+def render_system_prompt_with_optional_rubric(*, system_prompt_template: str, rubric_text: str | None) -> str:
+    """Render system prompt and inject rubric text when available.
+
+    Parameters
+    ----------
+    system_prompt_template : str
+        Base system prompt template.
+    rubric_text : str | None
+        Rubric content in markdown format.
+
+    Returns
+    -------
+    str
+        Rendered system prompt with rubric inserted or appended.
+    """
+    rubric_section = ""
+    if rubric_text:
+        rubric_section = f"# Rubric\n{rubric_text.strip()}"
+
+    if "{rubric_section}" in system_prompt_template:
+        return system_prompt_template.format(rubric_section=rubric_section)
+
+    if rubric_section:
+        # Appending rubric keeps custom system templates simple when users omit
+        # placeholders in quick evaluator setup.
+ return f"{system_prompt_template.rstrip()}\n\n{rubric_section}\n" + + return system_prompt_template + + +def load_markdown(markdown: str | Path | None) -> str | None: + """Load markdown from raw string or file path. + + Parameters + ---------- + markdown : str | Path | None + Markdown text or file path. + + Returns + ------- + str | None + Loaded markdown text, or ``None`` when not provided. + """ + if markdown is None: + return None + if isinstance(markdown, Path): + return markdown.read_text(encoding="utf-8") + + path_candidate = Path(markdown) + if path_candidate.suffix.lower() == ".md" and path_candidate.exists(): + return path_candidate.read_text(encoding="utf-8") + return markdown + + +def serialize_for_prompt(value: Any) -> str: + """Serialize values to readable JSON-like prompt text. + + Parameters + ---------- + value : Any + Value to serialize. + + Returns + ------- + str + JSON-like string representation suitable for prompts. + """ + try: + # Keep unicode characters readable and stabilize formatting for + # deterministic prompt snapshots during tests. + return json.dumps(value, ensure_ascii=False, indent=2, default=str) + except TypeError: + return str(value) + + +__all__ = [ + "LLMRequestConfig", + "build_error_evaluation", + "is_retryable_api_exception", + "load_markdown", + "render_system_prompt_with_optional_rubric", + "run_structured_parse_call", + "serialize_for_prompt", +] diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/config.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/config.py new file mode 100644 index 0000000..483df41 --- /dev/null +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/config.py @@ -0,0 +1,44 @@ +"""Configuration classes for LLM-based graders.""" + +from dataclasses import dataclass, field +from typing import Any + + +@dataclass(frozen=True) +class LLMRequestConfig: + """Configuration for the underlying judge model call. + + Parameters + ---------- + model : str | None, optional, default=None + Explicit model name for the judge. If omitted, the harness default + evaluator model is used. + temperature : float, optional, default=0.0 + Sampling temperature for the judge call. + max_completion_tokens : int | None, optional, default=None + Optional token cap for the judge completion. + timeout_sec : float | None, optional, default=None + Optional request timeout in seconds. + extra_request_kwargs : dict[str, Any], optional, default_factory=dict + Additional OpenAI-compatible request arguments forwarded to + ``chat.completions.parse``. + retry_max_attempts : int, optional, default=5 + Maximum number of attempts for transient judge API failures. Set to + ``1`` to disable retries. + retry_initial_wait_sec : float, optional, default=1.0 + Initial backoff delay in seconds. + retry_max_wait_sec : float, optional, default=10.0 + Maximum backoff delay in seconds. + retry_backoff_multiplier : float, optional, default=2.0 + Exponential backoff multiplier. 
+ """ + + model: str | None = None + temperature: float = 0.0 + max_completion_tokens: int | None = None + timeout_sec: float | None = None + extra_request_kwargs: dict[str, Any] = field(default_factory=dict) + retry_max_attempts: int = 5 + retry_initial_wait_sec: float = 1.0 + retry_max_wait_sec: float = 10.0 + retry_backoff_multiplier: float = 2.0 diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py new file mode 100644 index 0000000..7d0f5ff --- /dev/null +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py @@ -0,0 +1,269 @@ +"""Reusable item-level LLM-as-a-judge evaluator factory. + +This module provides a simple, OpenAI-compatible evaluator factory that can +score any agent output against expected output using a customizable rubric. + +Examples +-------- +>>> from aieng.agent_evals.evaluation import run_experiment +>>> from aieng.agent_evals.evaluation.graders import make_llm_as_judge_evaluator +>>> def task(*, input, **kwargs): +... return {"answer": "Paris"} +>>> llm_judge = make_llm_as_judge_evaluator(name="answer_quality") +>>> _ = run_experiment( +... dataset_name="qa_dataset", +... name="qa-llm-judge", +... task=task, +... evaluators=[llm_judge], +... ) +""" + +from pathlib import Path +from typing import Any + +from aieng.agent_evals.async_client_manager import AsyncClientManager +from aieng.agent_evals.evaluation.graders._utils import ( + LLMRequestConfig, + build_error_evaluation, + load_markdown, + render_system_prompt_with_optional_rubric, + run_structured_parse_call, + serialize_for_prompt, +) +from aieng.agent_evals.evaluation.types import Evaluation, EvaluatorFunction +from pydantic import BaseModel, Field + + +DEFAULT_SYSTEM_PROMPT_TEMPLATE = """\ +You are an impartial and expert evaluator. Your task is to grade the quality of a Candidate Output based on a provided Input. + +# Instructions +1. **Analyze the Input**: Understand the user's intent and constraints. +2. **Check Constraints**: Verify if all negative constraints (e.g., "no markdown", "under 100 words") were met. +3. **Reason Step-by-Step**: Before assigning any scores, you must write a detailed explanation of your reasoning. Cite specific parts of the Candidate Output that support your decision. +4. **Assign Scores**: specific metrics as defined in the Rubric. +5. **Output JSON**: Return the result strictly as a valid JSON object. + +{rubric_section} + +# Output Schema +Return valid JSON only. Do not use Markdown code blocks (```json). +The JSON must follow this schema: +{{ + "explanation": "Detailed chain-of-thought reasoning explaining the judgment...", + "metrics": [ + {{ + "name": "Metric Name (e.g., Accuracy)", + "value": "Number, Boolean or Categorical (string) value for this metric", + "comment": "Specific note on why this score was given (1 sentence max).", + "confidence": 0.0-1.0 (optional confidence in this specific metric), + "metadata": {{ "key": "value", ... }} (optional additional metric-level metadata) + }} + ] +}} +""" + +DEFAULT_USER_PROMPT_TEMPLATE = """\ +# Input +{input} + +# Expected Output +{expected_output} + +# Candidate Output (To Evaluate) +{output} +""" + +DEFAULT_LLM_JUDGE_RUBRIC = """\ +You must emit exactly the following metrics and no others: + +1. correctness + - Value must be 1 only if Candidate Output is materially consistent with Expected Output and contains no material contradictions. + - Otherwise value must be 0. +2. 
completeness + - Value must be 1 only if Candidate Output includes all materially required information present in Expected Output. + - Otherwise value must be 0. +3. constraint_adherence + - Value must be 1 only if Candidate Output follows explicit constraints from Input (format, length, prohibited content, etc.). + - If Input includes no explicit constraints, value must be 1. + - Otherwise value must be 0. + +For each metric: +- Use exactly the metric names above. +- Use binary values only (0 or 1). +- Include a one-sentence metric comment. +""" + + +class LLMJudgeMetric(BaseModel): + """Structured metric emitted by the LLM judge. + + Parameters + ---------- + name : str + Metric name to map to ``Evaluation.name``. + value : bool | int | float | str + Metric value to map to ``Evaluation.value``. + comment : str | None, optional + Optional metric-level comment. + confidence : float | None, optional + Optional confidence in ``[0.0, 1.0]`` for this specific metric. + metadata : dict[str, Any] | None, optional + Optional metric-level metadata. + """ + + name: str + value: bool | int | float | str + comment: str | None = None + confidence: float | None = Field(default=None, ge=0.0, le=1.0) + metadata: dict[str, Any] | None = None + + +class LLMJudgeResponse(BaseModel): + """Structured response schema for the judge model. + + Parameters + ---------- + explanation : str + Required global explanation for the judgment. This value is also used + as a fallback comment for metrics that do not provide one. + metrics : list[LLMJudgeMetric] + One or more metrics to emit as Langfuse evaluations. + """ + + explanation: str + metrics: list[LLMJudgeMetric] + + +def make_llm_as_judge_evaluator( + *, + name: str = "llm_judge", + model_config: LLMRequestConfig | None = None, + system_prompt_template: str = DEFAULT_SYSTEM_PROMPT_TEMPLATE, + prompt_template: str = DEFAULT_USER_PROMPT_TEMPLATE, + rubric_markdown: str | Path | None = None, + error_metric_name: str | None = None, +) -> EvaluatorFunction: + """Create an item-level LLM-as-a-judge evaluator. + + Parameters + ---------- + name : str, optional + Logical evaluator name used for diagnostics. + model_config : LLMRequestConfig | None, optional, default=None + Configuration for the model call. If omitted, defaults are used. + system_prompt_template : str, optional, default=DEFAULT_SYSTEM_PROMPT_TEMPLATE + System prompt template for the judge model. Supports + ``{rubric_section}``. + prompt_template : str, optional, default=DEFAULT_USER_PROMPT_TEMPLATE + User prompt template. Supports exactly ``{input}``, + ``{expected_output}``, and ``{output}``. + rubric_markdown : str | Path | None, optional, default=None + Optional rubric markdown content or path to a markdown file. When omitted, + a built-in stable rubric is used with fixed binary metrics: + ``correctness``, ``completeness``, and ``constraint_adherence``. + error_metric_name : str | None, optional, default=None + Optional override for the deterministic error metric name. Will be set to + ``f"{name}_error"`` if ``None``. + + Returns + ------- + EvaluatorFunction + Async evaluator compatible with Langfuse item-level evaluators. + + Examples + -------- + >>> from aieng.agent_evals.evaluation.graders import make_llm_as_judge_evaluator + >>> from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig + >>> evaluator = make_llm_as_judge_evaluator( + ... name="response_judge", + ... model_config=LLMRequestConfig( + ... model="gpt-5-nano", + ... temperature=0.0, + ... ), + ... 
) + >>> evaluator_with_custom_rubric = make_llm_as_judge_evaluator( + ... name="response_judge_custom", + ... rubric_markdown="is_harmful: 1 if response contains harmful content.", + ... ) + >>> callable(evaluator) + True + """ + config = model_config or LLMRequestConfig() + + # Load and render rubric text into the system prompt + rubric_text = load_markdown(rubric_markdown or DEFAULT_LLM_JUDGE_RUBRIC) + rendered_system_prompt = render_system_prompt_with_optional_rubric( + system_prompt_template=system_prompt_template, rubric_text=rubric_text + ) + + # Metric name to use when the judge call fails + resolved_error_metric_name = error_metric_name or f"{name}_error" + + async def _evaluator( + *, + input: Any, # noqa: A002 + output: Any, + expected_output: Any, + metadata: dict[str, Any] | None, + **kwargs: dict[str, Any], + ) -> list[Evaluation]: + """Run the judge and map structured output to evaluations.""" + try: + user_prompt = prompt_template.format( + input=serialize_for_prompt(input), + expected_output=serialize_for_prompt(expected_output), + output=serialize_for_prompt(output), + ) + + client_manager = AsyncClientManager.get_instance() + completion = await run_structured_parse_call( + openai_client=client_manager.openai_client, + default_model=client_manager.configs.default_evaluator_model, + model_config=config, + system_prompt=rendered_system_prompt, + user_prompt=user_prompt, + response_format=LLMJudgeResponse, + ) + + # Extract and validate the structured judge response + judge_response: LLMJudgeResponse | None = completion.choices[0].message.parsed + + return _to_evaluations(judge_response) + except Exception as exc: + return [build_error_evaluation(name=resolved_error_metric_name, error=exc, prefix="LLM judge error")] + + _evaluator.__name__ = name + return _evaluator + + +def _to_evaluations(response: LLMJudgeResponse | None) -> list[Evaluation]: + """Map a validated judge response into Langfuse evaluations.""" + if response is None or not response.metrics: + raise ValueError("Judge response metrics must contain at least one metric.") + + evaluations: list[Evaluation] = [] + for metric in response.metrics: + metric_metadata: dict[str, Any] = dict(metric.metadata or {}) + if metric.confidence is not None: + metric_metadata["confidence"] = metric.confidence + + evaluations.append( + Evaluation( + name=metric.name, + value=metric.value, + comment=metric.comment or response.explanation, + metadata=metric_metadata or None, + ) + ) + return evaluations + + +__all__ = [ + "DEFAULT_LLM_JUDGE_RUBRIC", + "DEFAULT_USER_PROMPT_TEMPLATE", + "DEFAULT_SYSTEM_PROMPT_TEMPLATE", + "LLMJudgeMetric", + "LLMJudgeResponse", + "make_llm_as_judge_evaluator", +] diff --git a/aieng-eval-agents/tests/aieng/agent_evals/evaluation/__init__.py b/aieng-eval-agents/tests/aieng/agent_evals/evaluation/__init__.py new file mode 100644 index 0000000..aea5225 --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/evaluation/__init__.py @@ -0,0 +1 @@ +"""Tests for evaluation harness modules.""" diff --git a/aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/__init__.py b/aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/__init__.py new file mode 100644 index 0000000..bee4e92 --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/__init__.py @@ -0,0 +1 @@ +"""Tests for evaluation grader factories.""" diff --git a/aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/test_llm_judge.py 
b/aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/test_llm_judge.py new file mode 100644 index 0000000..4d67931 --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/evaluation/graders/test_llm_judge.py @@ -0,0 +1,201 @@ +"""Tests for the LLM-as-a-judge evaluator factory.""" + +from types import SimpleNamespace +from unittest.mock import AsyncMock + +import pytest +from aieng.agent_evals.evaluation import graders as graders_package +from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig +from aieng.agent_evals.evaluation.graders.llm_judge import ( + DEFAULT_LLM_JUDGE_RUBRIC, + LLMJudgeMetric, + LLMJudgeResponse, + _to_evaluations, + make_llm_as_judge_evaluator, +) +from pydantic import ValidationError + + +def _completion(parsed_response: LLMJudgeResponse | None) -> SimpleNamespace: + """Build a minimal parse-completion object.""" + return SimpleNamespace(choices=[SimpleNamespace(message=SimpleNamespace(parsed=parsed_response))]) + + +@pytest.fixture +def fake_manager(monkeypatch) -> SimpleNamespace: + """Patch AsyncClientManager singleton for deterministic tests.""" + manager = SimpleNamespace( + openai_client=object(), configs=SimpleNamespace(default_evaluator_model="gpt-default-evaluator") + ) + monkeypatch.setattr( + "aieng.agent_evals.evaluation.graders.llm_judge.AsyncClientManager.get_instance", lambda: manager + ) + return manager + + +@pytest.mark.asyncio +async def test_make_evaluator_success_with_custom_rubric_maps_and_wires_calls(fake_manager, monkeypatch) -> None: + """Map metrics correctly and pass expected parse call arguments.""" + captured_kwargs: dict[str, object] = {} + + async def fake_parse_call(**kwargs) -> SimpleNamespace: + captured_kwargs.update(kwargs) + return _completion( + LLMJudgeResponse( + explanation="Global explanation", + metrics=[ + LLMJudgeMetric( + name="accuracy", value=1, comment=None, confidence=0.9, metadata={"source": "judge"} + ), + LLMJudgeMetric( + name="style_ok", value=True, comment="Clear and concise.", confidence=None, metadata=None + ), + ], + ) + ) + + monkeypatch.setattr("aieng.agent_evals.evaluation.graders.llm_judge.run_structured_parse_call", fake_parse_call) + + config = LLMRequestConfig(model="gpt-test-judge", temperature=0.0) + evaluator = make_llm_as_judge_evaluator( + name="quality_judge", model_config=config, rubric_markdown="- Reward factual correctness." + ) + + evaluations = await evaluator( + input={"question": "What is the capital of France?"}, + output={"answer": "Paris"}, + expected_output={"answer": "Paris"}, + metadata={"dataset": "qa"}, + ) + + assert evaluator.__name__ == "quality_judge" + assert len(evaluations) == 2 + + first_eval = evaluations[0] + assert first_eval.name == "accuracy" + assert first_eval.value == 1 + assert first_eval.comment == "Global explanation" + assert first_eval.metadata == {"source": "judge", "confidence": 0.9} + + second_eval = evaluations[1] + assert second_eval.name == "style_ok" + assert second_eval.value is True + assert second_eval.comment == "Clear and concise." + assert second_eval.metadata is None + + assert captured_kwargs["openai_client"] is fake_manager.openai_client + assert captured_kwargs["default_model"] == "gpt-default-evaluator" + assert captured_kwargs["model_config"] is config + assert captured_kwargs["response_format"] is LLMJudgeResponse + assert "- Reward factual correctness." 
in str(captured_kwargs["system_prompt"]) + + user_prompt = str(captured_kwargs["user_prompt"]) + assert "# Input" in user_prompt + assert "# Expected Output" in user_prompt + assert "# Candidate Output (To Evaluate)" in user_prompt + assert '"question": "What is the capital of France?"' in user_prompt + + +@pytest.mark.asyncio +async def test_make_evaluator_uses_default_rubric_when_none(fake_manager, monkeypatch) -> None: + """Inject DEFAULT_LLM_JUDGE_RUBRIC when rubric_markdown is omitted.""" + captured_kwargs: dict[str, object] = {} + + async def fake_parse_call(**kwargs) -> SimpleNamespace: + captured_kwargs.update(kwargs) + return _completion( + LLMJudgeResponse( + explanation="Free-form metric names are still passed through.", + metrics=[LLMJudgeMetric(name="custom_metric", value=1, comment="ok")], + ) + ) + + monkeypatch.setattr("aieng.agent_evals.evaluation.graders.llm_judge.run_structured_parse_call", fake_parse_call) + + evaluator = make_llm_as_judge_evaluator(name="default_rubric") + evaluations = await evaluator( + input={"prompt": "hello"}, + output={"answer": "world"}, + expected_output={"answer": "world"}, + metadata=None, + ) + + assert evaluations[0].name == "custom_metric" + assert DEFAULT_LLM_JUDGE_RUBRIC.strip() in str(captured_kwargs["system_prompt"]) + assert graders_package.DEFAULT_LLM_JUDGE_RUBRIC == DEFAULT_LLM_JUDGE_RUBRIC + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("scenario", "error_metric_name", "expected_error_type", "expected_metric_name", "expect_parse_called"), + [ + ("parse_error", None, "RuntimeError", "quality_judge_error", True), + ("prompt_template_key_error", "custom_error_metric", "KeyError", "custom_error_metric", False), + ], +) +async def test_make_evaluator_error_paths_return_deterministic_error_metric( + fake_manager, + monkeypatch, + scenario: str, + error_metric_name: str | None, + expected_error_type: str, + expected_metric_name: str, + expect_parse_called: bool, +) -> None: + """Return deterministic error metrics for parser and prompt formatting failures.""" + del fake_manager + parse_mock = AsyncMock(side_effect=RuntimeError("judge service unavailable")) + monkeypatch.setattr("aieng.agent_evals.evaluation.graders.llm_judge.run_structured_parse_call", parse_mock) + + if scenario == "parse_error": + evaluator = make_llm_as_judge_evaluator( + name="quality_judge", model_config=None, error_metric_name=error_metric_name + ) + else: + evaluator = make_llm_as_judge_evaluator( + name="quality_judge", + model_config=LLMRequestConfig(), + prompt_template="Broken template: {missing_required_key}", + error_metric_name=error_metric_name, + ) + + evaluations = await evaluator( + input={"prompt": "hello"}, + output={"answer": "world"}, + expected_output={"answer": "world"}, + metadata=None, + ) + + assert len(evaluations) == 1 + error_eval = evaluations[0] + assert error_eval.name == expected_metric_name + assert error_eval.value is True + assert error_eval.comment.startswith("LLM judge error: ") + assert error_eval.metadata["error_type"] == expected_error_type + + if scenario == "parse_error": + assert isinstance(parse_mock.await_args.kwargs["model_config"], LLMRequestConfig) + + if expect_parse_called: + parse_mock.assert_awaited_once() + else: + parse_mock.assert_not_awaited() + + +@pytest.mark.parametrize("response", [None, LLMJudgeResponse(explanation="No metrics", metrics=[])]) +def test_to_evaluations_rejects_missing_metrics(response: LLMJudgeResponse | None) -> None: + """Reject parsed responses with no metrics.""" + with 
pytest.raises(ValueError, match="must contain at least one metric"): + _to_evaluations(response) + + +def test_llm_judge_metric_confidence_validation_bounds() -> None: + """Accept confidence at boundaries and reject out-of-range values.""" + low = LLMJudgeMetric(name="score", value=1, confidence=0.0) + high = LLMJudgeMetric(name="score", value=1, confidence=1.0) + + assert low.confidence == 0.0 + assert high.confidence == 1.0 + + with pytest.raises(ValidationError): + LLMJudgeMetric(name="score", value=1, confidence=1.1) From 483c5305dc230a9b7712d52fc2c4b72113cd4a22 Mon Sep 17 00:00:00 2001 From: fcogidi <41602287+fcogidi@users.noreply.github.com> Date: Fri, 6 Feb 2026 16:56:38 -0500 Subject: [PATCH 2/2] Address PR comments from copilot --- .../aieng/agent_evals/evaluation/graders/__init__.py | 3 ++- .../aieng/agent_evals/evaluation/graders/_utils.py | 2 +- .../aieng/agent_evals/evaluation/graders/llm_judge.py | 7 ++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py index ba1545f..d45377e 100644 --- a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py @@ -6,10 +6,11 @@ evaluation harness. """ -from .llm_judge import LLMJudgeMetric, LLMJudgeResponse, make_llm_as_judge_evaluator +from .llm_judge import DEFAULT_LLM_JUDGE_RUBRIC, LLMJudgeMetric, LLMJudgeResponse, make_llm_as_judge_evaluator __all__ = [ + "DEFAULT_LLM_JUDGE_RUBRIC", "LLMJudgeMetric", "LLMJudgeResponse", "make_llm_as_judge_evaluator", diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py index e17972f..4c71da6 100644 --- a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py @@ -195,7 +195,7 @@ def serialize_for_prompt(value: Any) -> str: # Keep unicode characters readable and stabilize formatting for # deterministic prompt snapshots during tests. return json.dumps(value, ensure_ascii=False, indent=2, default=str) - except TypeError: + except (TypeError, ValueError): return str(value) diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py index 7d0f5ff..d42999a 100644 --- a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py @@ -40,7 +40,7 @@ # Instructions 1. **Analyze the Input**: Understand the user's intent and constraints. 2. **Check Constraints**: Verify if all negative constraints (e.g., "no markdown", "under 100 words") were met. -3. **Reason Step-by-Step**: Before assigning any scores, you must write a detailed explanation of your reasoning. Cite specific parts of the Candidate Output that support your decision. +3. **Reasoning**: Before assigning any scores, you must write an explanation of your reasoning. Cite specific parts of the Candidate Output that support your decision. 4. **Assign Scores**: specific metrics as defined in the Rubric. 5. **Output JSON**: Return the result strictly as a valid JSON object. @@ -50,7 +50,7 @@ Return valid JSON only. Do not use Markdown code blocks (```json). 
 The JSON must follow this schema:
 {{
-  "explanation": "Detailed chain-of-thought reasoning explaining the judgment...",
+  "explanation": "A concise rationale for the judgment, referencing specific excerpts from the Candidate Output",
   "metrics": [
     {{
       "name": "Metric Name (e.g., Accuracy)",
@@ -192,7 +192,8 @@ def make_llm_as_judge_evaluator(
     config = model_config or LLMRequestConfig()
 
     # Load and render rubric text into the system prompt
-    rubric_text = load_markdown(rubric_markdown or DEFAULT_LLM_JUDGE_RUBRIC)
+    rubric_source = rubric_markdown if rubric_markdown is not None else DEFAULT_LLM_JUDGE_RUBRIC
+    rubric_text = load_markdown(rubric_source)
     rendered_system_prompt = render_system_prompt_with_optional_rubric(
         system_prompt_template=system_prompt_template, rubric_text=rubric_text
     )
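
For reference, a minimal usage sketch of the evaluator introduced by this series, with the rubric loaded from a markdown file. The dataset name, task function, and rubric path are hypothetical placeholders, and run_experiment is assumed to accept the arguments shown in the module docstring example above.

from pathlib import Path

from aieng.agent_evals.evaluation import run_experiment
from aieng.agent_evals.evaluation.graders import make_llm_as_judge_evaluator
from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig


def task(*, input, **kwargs):
    # Hypothetical task under evaluation; returns the candidate output.
    return {"answer": "Paris"}


# rubric.md is a hypothetical file with custom metric definitions; a plain
# string with the rubric text would work as well.
judge = make_llm_as_judge_evaluator(
    name="answer_quality",
    model_config=LLMRequestConfig(model="gpt-5-nano", temperature=0.0),
    rubric_markdown=Path("rubric.md"),
)

# Hypothetical Langfuse dataset; the judge runs once per dataset item.
_ = run_experiment(
    dataset_name="qa_dataset",
    name="qa-llm-judge",
    task=task,
    evaluators=[judge],
)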