17 changes: 17 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/evaluation/graders/__init__.py
@@ -0,0 +1,17 @@
"""Graders for agent evaluations.

This subpackage contains evaluator factories that can be shared across
agent domains. The factories return Langfuse-compatible evaluator callables
that can be passed directly to ``dataset.run_experiment`` or to the wrappers
in the evaluation harness.
"""

from .llm_judge import DEFAULT_LLM_JUDGE_RUBRIC, LLMJudgeMetric, LLMJudgeResponse, make_llm_as_judge_evaluator


__all__ = [
    "DEFAULT_LLM_JUDGE_RUBRIC",
    "LLMJudgeMetric",
    "LLMJudgeResponse",
    "make_llm_as_judge_evaluator",
]
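
For orientation, a minimal usage sketch of the exported factory. The exact signature of ``make_llm_as_judge_evaluator`` is defined in ``llm_judge.py``, which is not part of this excerpt, so the keyword arguments below are assumptions, not the confirmed API.

from aieng.agent_evals.evaluation.graders import DEFAULT_LLM_JUDGE_RUBRIC, make_llm_as_judge_evaluator
from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig

# Hypothetical keyword arguments -- consult llm_judge.py for the real signature.
judge_evaluator = make_llm_as_judge_evaluator(
    rubric=DEFAULT_LLM_JUDGE_RUBRIC,
    model_config=LLMRequestConfig(model="gpt-4o-mini", temperature=0.0),
)

# Per the module docstring, the returned callable is Langfuse-compatible and can
# be passed to dataset.run_experiment(..., evaluators=[judge_evaluator]).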
210 changes: 210 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/evaluation/graders/_utils.py
@@ -0,0 +1,210 @@
"""Shared helpers for OpenAI-compatible LLM-based graders."""

import json
from pathlib import Path
from typing import Any, TypeVar, cast

from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig
from aieng.agent_evals.evaluation.types import Evaluation
from langfuse.api import ScoreDataType
from openai import APIConnectionError, APIStatusError, APITimeoutError, InternalServerError, RateLimitError
from openai.types.chat.parsed_chat_completion import ParsedChatCompletion
from pydantic import BaseModel
from tenacity import AsyncRetrying, retry_if_exception, stop_after_attempt, wait_exponential


T = TypeVar("T", bound=BaseModel)


async def run_structured_parse_call(
    *,
    openai_client: Any,
    default_model: str,
    model_config: LLMRequestConfig,
    system_prompt: str,
    user_prompt: str,
    response_format: type[T],
) -> ParsedChatCompletion[T]:
"""Run ``chat.completions.parse`` with retry for transient API failures.

Parameters
----------
openai_client : Any
OpenAI-compatible async client instance.
default_model : str
Fallback model name when ``model_config.model`` is not provided.
model_config : LLMRequestConfig
Request and retry configuration.
system_prompt : str
System prompt content.
user_prompt : str
User prompt content.
response_format : type[T]
Pydantic model used by ``parse`` for structured output.

Returns
-------
ParsedChatCompletion[T]
Completion object returned by ``chat.completions.parse``.
"""
    model_name = model_config.model or default_model
    request_kwargs: dict[str, Any] = dict(model_config.extra_request_kwargs)
    request_kwargs.update(
        {
            "model": model_name,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            "response_format": response_format,
            "temperature": model_config.temperature,
        }
    )
    if model_config.max_completion_tokens is not None:
        request_kwargs["max_completion_tokens"] = model_config.max_completion_tokens
    if model_config.timeout_sec is not None:
        request_kwargs["timeout"] = model_config.timeout_sec

    retrying = AsyncRetrying(
        stop=stop_after_attempt(model_config.retry_max_attempts),
        wait=wait_exponential(
            multiplier=model_config.retry_backoff_multiplier,
            min=model_config.retry_initial_wait_sec,
            max=model_config.retry_max_wait_sec,
        ),
        retry=retry_if_exception(is_retryable_api_exception),
        reraise=True,
    )

    async for attempt in retrying:
        with attempt:
            response = await openai_client.chat.completions.parse(**request_kwargs)
            return cast(ParsedChatCompletion[T], response)

    # Defensive fallback: tenacity should either return above or raise.
    raise RuntimeError("Structured parse call failed unexpectedly without a result.")


def is_retryable_api_exception(exc: BaseException) -> bool:
    """Return True when the exception is likely transient and should be retried."""
    if isinstance(exc, (APIConnectionError, APITimeoutError, RateLimitError, InternalServerError)):
        return True

    if isinstance(exc, APIStatusError):
        status = getattr(exc, "status_code", None)
        return status in (408, 429) or (status is not None and status >= 500)

    return False


def build_error_evaluation(*, name: str, error: Exception, prefix: str) -> Evaluation:
    """Build a deterministic error metric.

    Parameters
    ----------
    name : str
        Metric name.
    error : Exception
        Error that triggered the fallback metric.
    prefix : str
        Prefix used in the metric comment for context.

    Returns
    -------
    Evaluation
        Boolean error evaluation containing structured error metadata.
    """
    message = str(error) or error.__class__.__name__
    return Evaluation(
        name=name,
        value=True,
        comment=f"{prefix}: {message}",
        data_type=ScoreDataType.BOOLEAN,
        metadata={"error_type": error.__class__.__name__, "error": message},
    )


def render_system_prompt_with_optional_rubric(*, system_prompt_template: str, rubric_text: str | None) -> str:
    """Render the system prompt, injecting rubric text when available.

    Parameters
    ----------
    system_prompt_template : str
        Base system prompt template.
    rubric_text : str | None
        Rubric content in markdown format.

    Returns
    -------
    str
        Rendered system prompt with the rubric inserted or appended.
    """
    rubric_section = ""
    if rubric_text:
        rubric_section = f"# Rubric\n{rubric_text.strip()}"

    if "{rubric_section}" in system_prompt_template:
        return system_prompt_template.format(rubric_section=rubric_section)

    if rubric_section:
        # Appending the rubric keeps custom system templates simple when users
        # omit the ``{rubric_section}`` placeholder in quick evaluator setups.
        return f"{system_prompt_template.rstrip()}\n\n{rubric_section}\n"

    return system_prompt_template


def load_markdown(markdown: str | Path | None) -> str | None:
    """Load markdown from a raw string or a file path.

    Parameters
    ----------
    markdown : str | Path | None
        Markdown text or file path.

    Returns
    -------
    str | None
        Loaded markdown text, or ``None`` when not provided.
    """
    if markdown is None:
        return None
    if isinstance(markdown, Path):
        return markdown.read_text(encoding="utf-8")

    # Strings are treated as file paths only when they look like an existing
    # ``.md`` file; otherwise they are returned as raw markdown text.
    path_candidate = Path(markdown)
    if path_candidate.suffix.lower() == ".md" and path_candidate.exists():
        return path_candidate.read_text(encoding="utf-8")
    return markdown


def serialize_for_prompt(value: Any) -> str:
    """Serialize values to readable JSON-like prompt text.

    Parameters
    ----------
    value : Any
        Value to serialize.

    Returns
    -------
    str
        JSON-like string representation suitable for prompts.
    """
    try:
        # Keep unicode characters readable and stabilize formatting for
        # deterministic prompt snapshots during tests.
        return json.dumps(value, ensure_ascii=False, indent=2, default=str)
    except (TypeError, ValueError):
        return str(value)


__all__ = [
    "LLMRequestConfig",
    "build_error_evaluation",
    "is_retryable_api_exception",
    "load_markdown",
    "render_system_prompt_with_optional_rubric",
    "run_structured_parse_call",
    "serialize_for_prompt",
]
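
A minimal, self-contained sketch of how these helpers compose when building a judge prompt. The rubric path, prompt strings, and sample payload are illustrative only.

from pathlib import Path

from aieng.agent_evals.evaluation.graders._utils import (
    build_error_evaluation,
    load_markdown,
    render_system_prompt_with_optional_rubric,
    serialize_for_prompt,
)

# The rubric may come from a markdown file or a raw string (path is illustrative).
rubric = load_markdown(Path("rubrics/helpfulness.md"))

# Template with the optional placeholder; if omitted, the rubric is appended instead.
system_prompt = render_system_prompt_with_optional_rubric(
    system_prompt_template="You are a strict grader.\n\n{rubric_section}",
    rubric_text=rubric,
)

# Arbitrary task payloads are serialized to stable, readable JSON for the user prompt.
user_prompt = serialize_for_prompt({"question": "2 + 2 = ?", "answer": 4})

# If the judge call itself fails, a deterministic boolean error metric can still be emitted.
fallback = build_error_evaluation(
    name="llm_judge_error",
    error=TimeoutError("judge call timed out"),
    prefix="LLM judge failed",
)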
44 changes: 44 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/evaluation/graders/config.py
@@ -0,0 +1,44 @@
"""Configuration classes for LLM-based graders."""

from dataclasses import dataclass, field
from typing import Any


@dataclass(frozen=True)
class LLMRequestConfig:
    """Configuration for the underlying judge model call.

    Parameters
    ----------
    model : str | None, optional, default=None
        Explicit model name for the judge. If omitted, the harness default
        evaluator model is used.
    temperature : float, optional, default=0.0
        Sampling temperature for the judge call.
    max_completion_tokens : int | None, optional, default=None
        Optional token cap for the judge completion.
    timeout_sec : float | None, optional, default=None
        Optional request timeout in seconds.
    extra_request_kwargs : dict[str, Any], optional, default=empty dict
        Additional OpenAI-compatible request arguments forwarded to
        ``chat.completions.parse``.
    retry_max_attempts : int, optional, default=5
        Maximum number of attempts for transient judge API failures. Set to
        ``1`` to disable retries.
    retry_initial_wait_sec : float, optional, default=1.0
        Initial backoff delay in seconds.
    retry_max_wait_sec : float, optional, default=10.0
        Maximum backoff delay in seconds.
    retry_backoff_multiplier : float, optional, default=2.0
        Exponential backoff multiplier.
    """

    model: str | None = None
    temperature: float = 0.0
    max_completion_tokens: int | None = None
    timeout_sec: float | None = None
    extra_request_kwargs: dict[str, Any] = field(default_factory=dict)
    retry_max_attempts: int = 5
    retry_initial_wait_sec: float = 1.0
    retry_max_wait_sec: float = 10.0
    retry_backoff_multiplier: float = 2.0
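
To show how the pieces fit together, a minimal sketch that passes an ``LLMRequestConfig`` through ``run_structured_parse_call``. The model name is a placeholder, and the sketch assumes an openai SDK recent enough to expose ``chat.completions.parse`` on the async client; per the helper's signature, any async client with that method works.

import asyncio

from aieng.agent_evals.evaluation.graders._utils import run_structured_parse_call
from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig
from openai import AsyncOpenAI
from pydantic import BaseModel


class Verdict(BaseModel):
    """Structured judge output parsed by the SDK."""

    score: float
    reasoning: str


async def main() -> None:
    client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment
    config = LLMRequestConfig(temperature=0.0, timeout_sec=30.0, retry_max_attempts=3)
    completion = await run_structured_parse_call(
        openai_client=client,
        default_model="gpt-4o-mini",  # placeholder fallback model name
        model_config=config,
        system_prompt="Grade the answer from 0 to 1 and explain briefly.",
        user_prompt='{"question": "2 + 2 = ?", "answer": 4}',
        response_format=Verdict,
    )
    print(completion.choices[0].message.parsed)


asyncio.run(main())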