diff --git a/py/pyproject.toml b/py/pyproject.toml index 413e7f88..b3434c73 100644 --- a/py/pyproject.toml +++ b/py/pyproject.toml @@ -177,6 +177,7 @@ test-cli = [ test-types = [ {include-group = "test"}, + "autoevals==0.2.0", "pyright==1.1.408", "mypy==1.20.0", ] diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py index 5367dd40..41b5b011 100644 --- a/py/src/braintrust/framework.py +++ b/py/src/braintrust/framework.py @@ -48,7 +48,7 @@ validate_parameters, ) from .resource_manager import ResourceManager -from .score import Classification, ClassificationItem, Score, is_classification, is_score, is_scorer +from .score import Classification, ClassificationItem, Score, ScoreLike, is_classification, is_score, is_scorer from .serializable_data_class import SerializableDataClass from .span_types import SpanTypeAttribute from .types._eval import EvalCaseDict, EvalCaseDictNoOutput, ExperimentDatasetEvent @@ -216,7 +216,7 @@ class EvalScorerArgs(SerializableDataClass, Generic[Input, Output, Expected]): metadata: Metadata | None = None -OneOrMoreScores = float | int | bool | None | Score | list[Score] +OneOrMoreScores = float | int | bool | None | ScoreLike | list[ScoreLike] OneOrMoreClassifications = None | Classification | Mapping[str, Any] | list[Classification | Mapping[str, Any]] @@ -1286,7 +1286,7 @@ def _classifier_name(classifier, classifier_idx): return _callable_name(classifier, classifier_idx, "classifier") -def _build_span_metadata(results: list[Score] | list[Classification]) -> Metadata | None: +def _build_span_metadata(results: list[ScoreLike] | list[Classification]) -> Metadata | None: if not results: return None if len(results) == 1: diff --git a/py/src/braintrust/score.py b/py/src/braintrust/score.py index ca500984..43515669 100644 --- a/py/src/braintrust/score.py +++ b/py/src/braintrust/score.py @@ -2,9 +2,10 @@ import inspect import warnings from abc import ABC, abstractmethod -from typing import Any, TypedDict +from 
collections.abc import Mapping +from typing import Any, Protocol, TypedDict -from typing_extensions import NotRequired +from typing_extensions import NotRequired, TypeGuard from .serializable_data_class import SerializableDataClass from .types import Metadata @@ -53,6 +54,19 @@ def __post_init__(self): ) +class ScoreLike(Protocol): + @property + def name(self) -> str: ... + + @property + def score(self) -> float | None: ... + + @property + def metadata(self) -> Metadata: ... + + def as_dict(self) -> Mapping[str, Any]: ... + + class ClassificationItem(TypedDict): id: str label: NotRequired[str] @@ -76,7 +90,7 @@ class Classification(SerializableDataClass): """Optional metadata attached to the classification result.""" def as_dict(self): - result = {"id": self.id} + result: Mapping[str, Any] = {"id": self.id} if self.name is not None: result["name"] = self.name if self.label is not None: @@ -102,7 +116,7 @@ def __post_init__(self): raise ValueError("classification label must be a string when provided") -def is_score(obj): +def is_score(obj: object) -> TypeGuard[ScoreLike]: return hasattr(obj, "name") and hasattr(obj, "score") and hasattr(obj, "metadata") and hasattr(obj, "as_dict") @@ -151,6 +165,7 @@ def is_scorer(obj): "Classification", "ClassificationItem", "Score", + "ScoreLike", "Scorer", "is_classification", "is_score", diff --git a/py/src/braintrust/type_tests/test_autoevals_scorers.py b/py/src/braintrust/type_tests/test_autoevals_scorers.py new file mode 100644 index 00000000..9898a4ff --- /dev/null +++ b/py/src/braintrust/type_tests/test_autoevals_scorers.py @@ -0,0 +1,131 @@ +"""Type-check and runtime tests for autoevals scorers in Eval.""" + +import pytest +from autoevals import Levenshtein # type: ignore[import-untyped] +from braintrust.framework import Eval, EvalCase, EvalScorer + + +def accepts_autoevals_scorer( + scorer: EvalScorer[str, str, str], +) -> EvalScorer[str, str, str]: + return scorer + + +def autoevals_data(): + return 
iter([EvalCase(input="query", expected="hello world")]) + + +def autoevals_task(input: str) -> str: + return "hello world" + + +async def autoevals_task_async(input: str) -> str: + return "hello world" + + +autoevals_scores: list[EvalScorer[str, str, str]] = [ + accepts_autoevals_scorer(Levenshtein()), + accepts_autoevals_scorer(Levenshtein), + accepts_autoevals_scorer(Levenshtein.partial(foo="bar")), +] + +autoevals_scores_untyped = [ + Levenshtein(), + Levenshtein, + Levenshtein.partial(foo="bar"), +] + + +def test_eval_accepts_autoevals_scorers_typed(): + result = Eval( + "test-autoevals-scorers", + data=autoevals_data, + task=autoevals_task, + scores=autoevals_scores, + no_send_logs=True, + ) + + score = result.results[0].scores["Levenshtein"] + assert score is not None + assert score > 0 + + +def test_eval_accepts_autoevals_scorers_untyped(): + result = Eval( + "test-autoevals-scorers", + data=autoevals_data, + task=autoevals_task, + scores=autoevals_scores, + no_send_logs=True, + ) + + score = result.results[0].scores["Levenshtein"] + assert score is not None + assert score > 0 + + +def test_eval_accepts_autoevals_scorers_inline(): + result = Eval( + "test-autoevals-scorers", + data=autoevals_data, + task=autoevals_task, + scores=[ + Levenshtein(), + Levenshtein, + Levenshtein.partial(foo="bar"), + ], + no_send_logs=True, + ) + + score = result.results[0].scores["Levenshtein"] + assert score is not None + assert score > 0 + + +@pytest.mark.asyncio +async def test_eval_async_accepts_autoevals_scorers_typed(): + result = await EvalAsync( + "test-autoevals-scorers", + data=autoevals_data, + task=autoevals_task_async, + scores=autoevals_scores, + no_send_logs=True, + ) + + score = result.results[0].scores["Levenshtein"] + assert score is not None + assert score > 0 + + +@pytest.mark.asyncio +async def test_eval_async_accepts_autoevals_scorers_untyped(): + result = await EvalAsync( + "test-autoevals-scorers", + data=autoevals_data, + task=autoevals_task_async, + 
scores=autoevals_scores_untyped,
== 'group-10-braintrust-test-litellm' and extra == 'group-10-braintrust-test-openai-agents') or (extra == 'group-10-braintrust-test-litellm' and extra == 'group-10-braintrust-test-strands') or (extra == 'group-10-braintrust-test-openai-agents' and extra == 'group-10-braintrust-test-strands') or (extra == 'group-10-braintrust-lint' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-langchain') or (extra == 'group-10-braintrust-lint' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-litellm') or (extra == 'group-10-braintrust-lint' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-openai-agents') or (extra == 'group-10-braintrust-lint' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-pydantic-ai-logfire') or (extra == 'group-10-braintrust-lint' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-strands') or (extra == 'group-10-braintrust-test-agentscope' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-langchain') or (extra == 'group-10-braintrust-test-agentscope' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-litellm') or (extra == 'group-10-braintrust-test-agentscope' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-openai-agents') or (extra == 'group-10-braintrust-test-agentscope' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-strands')" }, + { name = "jsonschema", version = "4.26.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'group-10-braintrust-lint' or extra == 'group-10-braintrust-test-agentscope' or extra == 'group-10-braintrust-test-agno' or extra != 'group-10-braintrust-test-crewai' or (extra == 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-langchain') or (extra == 
'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-litellm') or (extra == 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-openai-agents') or (extra == 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-strands')" }, { name = "polyleven" }, { name = "pyyaml" }, ] @@ -882,6 +883,7 @@ test-strands = [ { name = "pytest-vcr" }, ] test-types = [ + { name = "autoevals" }, { name = "mypy" }, { name = "pyright" }, { name = "pytest" }, @@ -1055,6 +1057,7 @@ test-strands = [ { name = "pytest-vcr", specifier = "==1.0.2" }, ] test-types = [ + { name = "autoevals", specifier = "==0.2.0" }, { name = "mypy", specifier = "==1.20.0" }, { name = "pyright", specifier = "==1.1.408" }, { name = "pytest", specifier = "==9.0.2" },