From 2621a0e6b06abb8025a1465507867a79c94a2b34 Mon Sep 17 00:00:00 2001 From: Akshaya Shanbhogue Date: Sat, 31 Jan 2026 11:38:37 -0800 Subject: [PATCH] refactor(DecoupleLoadFromRuntime): decouple eval load from eval runtime This allows customers to run evals dynamically without having to materialize to files. --- pyproject.toml | 2 +- src/uipath/_cli/_evals/_runtime.py | 545 ++++++------------ src/uipath/_cli/_evals/_span_utils.py | 6 +- src/uipath/_cli/_utils/_eval_set.py | 60 ++ src/uipath/_cli/cli_eval.py | 150 ++++- tests/cli/eval/test_eval_resume_flow.py | 94 ++- tests/cli/eval/test_eval_runtime_metadata.py | 263 +++------ .../eval/test_eval_runtime_suspend_resume.py | 27 +- tests/cli/eval/test_eval_span_utils.py | 19 +- .../cli/eval/test_eval_tracing_integration.py | 45 +- tests/cli/eval/test_evaluate.py | 91 ++- uv.lock | 2 +- 12 files changed, 666 insertions(+), 638 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bf024661e..326970586 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.6.26" +version = "2.6.27" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index bb0e4a227..3120a77d4 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -1,9 +1,7 @@ import json import logging -import uuid from collections import defaultdict from contextlib import contextmanager -from pathlib import Path from time import time from typing import ( Any, @@ -38,7 +36,6 @@ UiPathExecuteOptions, UiPathExecutionRuntime, UiPathRuntimeFactoryProtocol, - UiPathRuntimeProtocol, UiPathRuntimeResult, UiPathRuntimeStatus, UiPathRuntimeStorageProtocol, @@ -72,10 +69,8 @@ from ...eval.evaluators import BaseEvaluator from ...eval.models import EvaluationResult from ...eval.models.models import AgentExecution, EvalItemResult -from .._utils._eval_set import EvalHelpers from .._utils._parallelization import execute_parallel from ._eval_util import apply_input_overrides -from ._evaluator_factory import EvaluatorFactory from ._models._evaluation_set import ( EvaluationItem, EvaluationSet, @@ -195,17 +190,20 @@ def clear(self, execution_id: str | None = None) -> None: class UiPathEvalContext: """Context used for evaluation runs.""" + # Required Fields + runtime_schema: UiPathRuntimeSchema + evaluation_set: EvaluationSet + evaluators: list[BaseEvaluator[Any, Any, Any]] + execution_id: str + + # Optional Fields entrypoint: str | None = None - no_report: bool | None = False workers: int | None = 1 - eval_set: str | None = None - eval_ids: list[str] | None = None eval_set_run_id: str | None = None verbose: bool = False enable_mocker_cache: bool = False report_coverage: bool = False input_overrides: dict[str, Any] | None = None - model_settings_id: str = "default" resume: bool = False job_id: str | None = None @@ -233,15 +231,11 @@ def __init__( self.trace_manager.tracer_provider.add_span_processor(span_processor) self.logs_exporter: ExecutionLogsExporter = ExecutionLogsExporter() - # Use job_id if available, then eval_set_run_id for stability across suspend/resume, - # otherwise generate UUID logger.debug( f"EVAL RUNTIME INIT: job_id={context.job_id}, " f"eval_set_run_id={context.eval_set_run_id}" ) - self.execution_id = ( - context.job_id 
or context.eval_set_run_id or str(uuid.uuid4()) - ) + self.execution_id = context.execution_id logger.info(f"EVAL RUNTIME: execution_id set to: {self.execution_id}") self.coverage = coverage.Coverage(branch=True) @@ -259,11 +253,8 @@ async def __aexit__(self, *args: Any) -> None: self.coverage.stop() self.coverage.report(include=["./*"], show_missing=True) - async def get_schema(self, runtime: UiPathRuntimeProtocol) -> UiPathRuntimeSchema: - schema = await runtime.get_schema() - if schema is None: - raise ValueError("Schema could not be loaded") - return schema + async def get_schema(self) -> UiPathRuntimeSchema: + return self.context.runtime_schema @contextmanager def _mocker_cache(self) -> Iterator[None]: @@ -283,48 +274,37 @@ def _mocker_cache(self) -> Iterator[None]: async def initiate_evaluation( self, - runtime: UiPathRuntimeProtocol, ) -> Tuple[ EvaluationSet, list[BaseEvaluator[Any, Any, Any]], Iterable[Awaitable[EvaluationRunResult]], ]: - if self.context.eval_set is None: - raise ValueError("eval_set must be provided for evaluation runs") - - # Load eval set (path is already resolved in cli_eval.py) - evaluation_set, _ = EvalHelpers.load_eval_set( - self.context.eval_set, self.context.eval_ids - ) - # Validate that resume mode is not used with multiple evaluations - if self.context.resume and len(evaluation_set.evaluations) > 1: + if self.context.resume and len(self.context.evaluation_set.evaluations) > 1: raise ValueError( f"Resume mode is not supported with multiple evaluations. " - f"Found {len(evaluation_set.evaluations)} evaluations in the set. " + f"Found {len(self.context.evaluation_set.evaluations)} evaluations in the set. " f"Please run with a single evaluation using --eval-ids to specify one evaluation." ) - evaluators = await self._load_evaluators(evaluation_set, runtime) - await self.event_bus.publish( EvaluationEvents.CREATE_EVAL_SET_RUN, EvalSetRunCreatedEvent( execution_id=self.execution_id, entrypoint=self.context.entrypoint or "", eval_set_run_id=self.context.eval_set_run_id, - eval_set_id=evaluation_set.id, - no_of_evals=len(evaluation_set.evaluations), - evaluators=evaluators, + eval_set_id=self.context.evaluation_set.id, + no_of_evals=len(self.context.evaluation_set.evaluations), + evaluators=self.context.evaluators, ), ) return ( - evaluation_set, - evaluators, + self.context.evaluation_set, + self.context.evaluators, ( - self._execute_eval(eval_item, evaluators, runtime) - for eval_item in evaluation_set.evaluations + self._execute_eval(eval_item, self.context.evaluators) + for eval_item in self.context.evaluation_set.evaluations ), ) @@ -336,191 +316,178 @@ async def execute(self) -> UiPathRuntimeResult: logger.info(f"EVAL RUNTIME: Resume mode: {self.context.resume}") logger.info("=" * 80) - # Resolve model settings override from eval set - settings_override = self._resolve_model_settings_override() + with self._mocker_cache(): + tracer = self.trace_manager.tracer_provider.get_tracer(__name__) - runtime = await self.factory.new_runtime( - entrypoint=self.context.entrypoint or "", - runtime_id=self.execution_id, - settings=settings_override, - ) - try: - with self._mocker_cache(): - tracer = self.trace_manager.tracer_provider.get_tracer(__name__) + # During resume, restore the parent "Evaluation Set Run" span context + # This prevents creating duplicate eval set run spans across jobs + eval_set_parent_span = await self._restore_parent_span( + "eval_set_run", "Evaluation Set Run" + ) - # During resume, restore the parent "Evaluation Set Run" span context - # 
This prevents creating duplicate eval set run spans across jobs - eval_set_parent_span = await self._restore_parent_span( - "eval_set_run", "Evaluation Set Run" + # Create "Evaluation Set Run" span or use restored parent context + # NOTE: Do NOT set execution.id on this parent span, as the mixin in + # UiPathExecutionBatchTraceProcessor propagates execution.id from parent + # to child spans, which would overwrite the per-eval execution.id + span_attributes: dict[str, str | bool] = { + "span_type": "eval_set_run", + "uipath.custom_instrumentation": True, + } + if self.context.eval_set_run_id: + span_attributes["eval_set_run_id"] = self.context.eval_set_run_id + + eval_set_span_context_manager = ( + use_span( + eval_set_parent_span, end_on_exit=False + ) # Don't end the remote span + if eval_set_parent_span + else tracer.start_as_current_span( + "Evaluation Set Run", attributes=span_attributes ) + ) - # Create "Evaluation Set Run" span or use restored parent context - # NOTE: Do NOT set execution.id on this parent span, as the mixin in - # UiPathExecutionBatchTraceProcessor propagates execution.id from parent - # to child spans, which would overwrite the per-eval execution.id - span_attributes: dict[str, str | bool] = { - "span_type": "eval_set_run", - "uipath.custom_instrumentation": True, - } - if self.context.eval_set_run_id: - span_attributes["eval_set_run_id"] = self.context.eval_set_run_id - - eval_set_span_context_manager = ( - use_span( - eval_set_parent_span, end_on_exit=False - ) # Don't end the remote span - if eval_set_parent_span - else tracer.start_as_current_span( - "Evaluation Set Run", attributes=span_attributes - ) + with eval_set_span_context_manager as span: + await self._save_span_context_for_resume( + span, "eval_set_run", "Evaluation Set Run" ) - with eval_set_span_context_manager as span: - await self._save_span_context_for_resume( - span, "eval_set_run", "Evaluation Set Run" + try: + ( + evaluation_set, + evaluators, + evaluation_iterable, + ) = await self.initiate_evaluation() + workers = self.context.workers or 1 + assert workers >= 1 + eval_run_result_list = await execute_parallel( + evaluation_iterable, workers + ) + results = UiPathEvalOutput( + evaluation_set_name=evaluation_set.name, + evaluation_set_results=eval_run_result_list, ) - try: - ( - evaluation_set, - evaluators, - evaluation_iterable, - ) = await self.initiate_evaluation(runtime) - workers = self.context.workers or 1 - assert workers >= 1 - eval_run_result_list = await execute_parallel( - evaluation_iterable, workers - ) - results = UiPathEvalOutput( - evaluation_set_name=evaluation_set.name, - evaluation_set_results=eval_run_result_list, - ) + # Computing evaluator averages + evaluator_averages: dict[str, float] = defaultdict(float) + evaluator_count: dict[str, int] = defaultdict(int) + + # Check if any eval runs failed + any_failed = False + for eval_run_result in results.evaluation_set_results: + # Check if the agent execution had an error + if ( + eval_run_result.agent_execution_output + and eval_run_result.agent_execution_output.result.error + ): + any_failed = True + + for result_dto in eval_run_result.evaluation_run_results: + evaluator_averages[result_dto.evaluator_id] += ( + result_dto.result.score + ) + evaluator_count[result_dto.evaluator_id] += 1 - # Computing evaluator averages - evaluator_averages: dict[str, float] = defaultdict(float) - evaluator_count: dict[str, int] = defaultdict(int) - - # Check if any eval runs failed - any_failed = False - for eval_run_result in 
results.evaluation_set_results: - # Check if the agent execution had an error - if ( - eval_run_result.agent_execution_output - and eval_run_result.agent_execution_output.result.error - ): - any_failed = True - - for result_dto in eval_run_result.evaluation_run_results: - evaluator_averages[result_dto.evaluator_id] += ( - result_dto.result.score - ) - evaluator_count[result_dto.evaluator_id] += 1 + for eval_id in evaluator_averages: + evaluator_averages[eval_id] = ( + evaluator_averages[eval_id] / evaluator_count[eval_id] + ) - for eval_id in evaluator_averages: - evaluator_averages[eval_id] = ( - evaluator_averages[eval_id] / evaluator_count[eval_id] - ) + # Configure span with output and metadata + await configure_eval_set_run_span( + span=span, + evaluator_averages=evaluator_averages, + execution_id=self.execution_id, + schema=await self.get_schema(), + success=not any_failed, + ) - # Configure span with output and metadata - await configure_eval_set_run_span( - span=span, - evaluator_averages=evaluator_averages, + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_SET_RUN, + EvalSetRunUpdatedEvent( execution_id=self.execution_id, - runtime=runtime, - get_schema_func=self.get_schema, + evaluator_scores=evaluator_averages, success=not any_failed, - ) + ), + wait_for_completion=False, + ) - await self.event_bus.publish( - EvaluationEvents.UPDATE_EVAL_SET_RUN, - EvalSetRunUpdatedEvent( - execution_id=self.execution_id, - evaluator_scores=evaluator_averages, - success=not any_failed, - ), - wait_for_completion=False, - ) + # Collect triggers from all evaluation runs (pass-through from inner runtime) + logger.info("=" * 80) + logger.info( + "EVAL RUNTIME: Collecting triggers from all evaluation runs" + ) + all_triggers = [] + for eval_run_result in results.evaluation_set_results: + if ( + eval_run_result.agent_execution_output + and eval_run_result.agent_execution_output.result + ): + runtime_result = ( + eval_run_result.agent_execution_output.result + ) + if runtime_result.triggers: + all_triggers.extend(runtime_result.triggers) - # Collect triggers from all evaluation runs (pass-through from inner runtime) - logger.info("=" * 80) + if all_triggers: logger.info( - "EVAL RUNTIME: Collecting triggers from all evaluation runs" + f"EVAL RUNTIME: ✅ Passing through {len(all_triggers)} trigger(s) to top-level result" ) - all_triggers = [] - for eval_run_result in results.evaluation_set_results: - if ( - eval_run_result.agent_execution_output - and eval_run_result.agent_execution_output.result - ): - runtime_result = ( - eval_run_result.agent_execution_output.result - ) - if runtime_result.triggers: - all_triggers.extend(runtime_result.triggers) - - if all_triggers: + for i, trigger in enumerate(all_triggers, 1): logger.info( - f"EVAL RUNTIME: ✅ Passing through {len(all_triggers)} trigger(s) to top-level result" + f"EVAL RUNTIME: Pass-through trigger {i}: {trigger.model_dump(by_alias=True)}" ) - for i, trigger in enumerate(all_triggers, 1): + else: + logger.info("EVAL RUNTIME: No triggers to pass through") + logger.info("=" * 80) + + # Determine overall status - propagate status from inner runtime + # This is critical for serverless executor to know to save state and suspend job + # Priority: SUSPENDED > FAULTED > SUCCESSFUL + overall_status = UiPathRuntimeStatus.SUCCESSFUL + for eval_run_result in results.evaluation_set_results: + if ( + eval_run_result.agent_execution_output + and eval_run_result.agent_execution_output.result + ): + inner_status = ( + 
eval_run_result.agent_execution_output.result.status + ) + if inner_status == UiPathRuntimeStatus.SUSPENDED: + overall_status = UiPathRuntimeStatus.SUSPENDED logger.info( - f"EVAL RUNTIME: Pass-through trigger {i}: {trigger.model_dump(by_alias=True)}" + "EVAL RUNTIME: Propagating SUSPENDED status from inner runtime" ) - else: - logger.info("EVAL RUNTIME: No triggers to pass through") - logger.info("=" * 80) - - # Determine overall status - propagate status from inner runtime - # This is critical for serverless executor to know to save state and suspend job - # Priority: SUSPENDED > FAULTED > SUCCESSFUL - overall_status = UiPathRuntimeStatus.SUCCESSFUL - for eval_run_result in results.evaluation_set_results: - if ( - eval_run_result.agent_execution_output - and eval_run_result.agent_execution_output.result - ): - inner_status = ( - eval_run_result.agent_execution_output.result.status - ) - if inner_status == UiPathRuntimeStatus.SUSPENDED: - overall_status = UiPathRuntimeStatus.SUSPENDED - logger.info( - "EVAL RUNTIME: Propagating SUSPENDED status from inner runtime" - ) - break # SUSPENDED takes highest priority, stop checking - elif inner_status == UiPathRuntimeStatus.FAULTED: - overall_status = UiPathRuntimeStatus.FAULTED - # Continue checking in case a later eval is SUSPENDED - - result = UiPathRuntimeResult( - output={**results.model_dump(by_alias=True)}, - status=overall_status, - triggers=all_triggers if all_triggers else None, - ) - return result - except Exception as e: - # Set span status to ERROR on exception - span.set_status(Status(StatusCode.ERROR, str(e))) + break # SUSPENDED takes highest priority, stop checking + elif inner_status == UiPathRuntimeStatus.FAULTED: + overall_status = UiPathRuntimeStatus.FAULTED + # Continue checking in case a later eval is SUSPENDED + + result = UiPathRuntimeResult( + output={**results.model_dump(by_alias=True)}, + status=overall_status, + triggers=all_triggers if all_triggers else None, + ) + return result + except Exception as e: + # Set span status to ERROR on exception + span.set_status(Status(StatusCode.ERROR, str(e))) - # Publish failure event for eval set run - await self.event_bus.publish( - EvaluationEvents.UPDATE_EVAL_SET_RUN, - EvalSetRunUpdatedEvent( - execution_id=self.execution_id, - evaluator_scores={}, - success=False, - ), - wait_for_completion=False, - ) - raise - finally: - await runtime.dispose() + # Publish failure event for eval set run + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_SET_RUN, + EvalSetRunUpdatedEvent( + execution_id=self.execution_id, + evaluator_scores={}, + success=False, + ), + wait_for_completion=False, + ) + raise async def _execute_eval( self, eval_item: EvaluationItem, evaluators: list[BaseEvaluator[Any, Any, Any]], - runtime: UiPathRuntimeProtocol, ) -> EvaluationRunResult: execution_id = str(eval_item.id) @@ -558,9 +525,7 @@ async def _execute_eval( try: # Generate LLM-based input if input_mocking_strategy is defined if eval_item.input_mocking_strategy: - eval_item = await self._generate_input_for_eval( - eval_item, runtime - ) + eval_item = await self._generate_input_for_eval(eval_item) set_execution_context( MockingContext( @@ -586,7 +551,6 @@ async def _execute_eval( agent_execution_output = await self.execute_runtime( eval_item, execution_id, - runtime, input_overrides=self.context.input_overrides, ) @@ -812,7 +776,8 @@ async def _execute_eval( return evaluation_run_results async def _generate_input_for_eval( - self, eval_item: EvaluationItem, runtime: UiPathRuntimeProtocol + 
self, + eval_item: EvaluationItem, ) -> EvaluationItem: """Use LLM to generate a mock input for an evaluation item.""" expected_output = ( @@ -822,7 +787,7 @@ async def _generate_input_for_eval( ) generated_input = await generate_llm_input( eval_item.input_mocking_strategy, - (await self.get_schema(runtime)).input, + (await self.get_schema()).input, expected_behavior=eval_item.expected_agent_behavior or "", expected_output=expected_output, ) @@ -841,71 +806,10 @@ def _get_and_clear_execution_data( return spans, logs - def _resolve_model_settings_override( - self, - ) -> dict[str, Any] | None: - """Resolve model settings override from evaluation set. - - Returns: - Model settings dict to use for override, or None if using defaults. - Settings are passed to factory via settings kwarg. - """ - # Skip if no model settings ID specified or using default - if ( - not self.context.model_settings_id - or self.context.model_settings_id == "default" - ): - return None - - # Load evaluation set to get model settings - evaluation_set, _ = EvalHelpers.load_eval_set(self.context.eval_set or "") - if ( - not hasattr(evaluation_set, "model_settings") - or not evaluation_set.model_settings - ): - logger.warning("No model settings available in evaluation set") - return None - - # Find the specified model settings - target_model_settings = next( - ( - ms - for ms in evaluation_set.model_settings - if ms.id == self.context.model_settings_id - ), - None, - ) - - if not target_model_settings: - logger.warning( - f"Model settings ID '{self.context.model_settings_id}' not found in evaluation set" - ) - return None - - logger.info( - f"Applying model settings override: model={target_model_settings.model_name}, temperature={target_model_settings.temperature}" - ) - - # Return settings dict with correct keys for factory - override: dict[str, str | float] = {} - if ( - target_model_settings.model_name - and target_model_settings.model_name != "same-as-agent" - ): - override["model"] = target_model_settings.model_name - if ( - target_model_settings.temperature is not None - and target_model_settings.temperature != "same-as-agent" - ): - override["temperature"] = float(target_model_settings.temperature) - - return override if override else None - async def execute_runtime( self, eval_item: EvaluationItem, execution_id: str, - runtime: UiPathRuntimeProtocol, input_overrides: dict[str, Any] | None = None, ) -> UiPathEvalRunExecutionOutput: log_handler = self._setup_execution_logging(execution_id) @@ -1076,119 +980,6 @@ async def run_evaluator( return result - async def _get_agent_model(self, runtime: UiPathRuntimeProtocol) -> str | None: - """Get agent model from the runtime schema metadata. - - The model is read from schema.metadata["settings"]["model"] which is - populated by the low-code agents runtime from agent.json. - - Returns: - The model name from agent settings, or None if not found. 
- """ - try: - schema = await self.get_schema(runtime) - if schema.metadata and "settings" in schema.metadata: - settings = schema.metadata["settings"] - model = settings.get("model") - if model: - logger.debug(f"Got agent model from schema.metadata: {model}") - return model - - # Fallback to protocol-based approach for backwards compatibility - model = self._find_agent_model_in_runtime(runtime) - if model: - logger.debug(f"Got agent model from runtime protocol: {model}") - return model - except Exception: - return None - - def _find_agent_model_in_runtime( - self, runtime: UiPathRuntimeProtocol - ) -> str | None: - """Recursively search for get_agent_model in runtime and its delegates. - - Runtimes may be wrapped (e.g., ResumableRuntime wraps TelemetryWrapper - which wraps the base runtime). This method traverses the wrapper chain - to find a runtime that implements LLMAgentRuntimeProtocol. - - Args: - runtime: The runtime to check (may be a wrapper) - - Returns: - The model name if found, None otherwise. - """ - # Check if this runtime implements the protocol - if isinstance(runtime, LLMAgentRuntimeProtocol): - return runtime.get_agent_model() - - # Check for delegate property (used by UiPathResumableRuntime, TelemetryRuntimeWrapper) - delegate = getattr(runtime, "delegate", None) or getattr( - runtime, "_delegate", None - ) - if delegate is not None: - return self._find_agent_model_in_runtime(delegate) - - return None - - async def _load_evaluators( - self, evaluation_set: EvaluationSet, runtime: UiPathRuntimeProtocol - ) -> list[BaseEvaluator[Any, Any, Any]]: - """Load evaluators referenced by the evaluation set.""" - evaluators = [] - eval_set = self.context.eval_set - if eval_set is None: - raise ValueError("eval_set cannot be None") - evaluators_dir = Path(eval_set).parent.parent / "evaluators" - - # Load agent model for 'same-as-agent' resolution in legacy evaluators - agent_model = await self._get_agent_model(runtime) - - # If evaluatorConfigs is specified, use that (new field with weights) - # Otherwise, fall back to evaluatorRefs (old field without weights) - if ( - hasattr(evaluation_set, "evaluator_configs") - and evaluation_set.evaluator_configs - ): - # Use new evaluatorConfigs field - supports weights - evaluator_ref_ids = {ref.ref for ref in evaluation_set.evaluator_configs} - else: - # Fall back to old evaluatorRefs field - plain strings - evaluator_ref_ids = set(evaluation_set.evaluator_refs) - - found_evaluator_ids = set() - - for file in evaluators_dir.glob("*.json"): - try: - with open(file, "r", encoding="utf-8") as f: - data = json.load(f) - except json.JSONDecodeError as e: - raise ValueError( - f"Invalid JSON in evaluator file '{file}': {str(e)}. " - f"Please check the file for syntax errors." - ) from e - - try: - evaluator_id = data.get("id") - if evaluator_id in evaluator_ref_ids: - evaluator = EvaluatorFactory.create_evaluator( - data, evaluators_dir, agent_model=agent_model - ) - evaluators.append(evaluator) - found_evaluator_ids.add(evaluator_id) - except Exception as e: - raise ValueError( - f"Failed to create evaluator from file '{file}': {str(e)}. " - f"Please verify the evaluator configuration." 
- ) from e - - missing_evaluators = evaluator_ref_ids - found_evaluator_ids - if missing_evaluators: - raise ValueError( - f"Could not find the following evaluators: {missing_evaluators}" - ) - - return evaluators - async def _restore_parent_span( self, span_key: str, span_type: str ) -> NonRecordingSpan | None: diff --git a/src/uipath/_cli/_evals/_span_utils.py b/src/uipath/_cli/_evals/_span_utils.py index 7c99729b7..270c4c904 100644 --- a/src/uipath/_cli/_evals/_span_utils.py +++ b/src/uipath/_cli/_evals/_span_utils.py @@ -8,7 +8,7 @@ # Type hint for runtime protocol (avoids circular imports) try: - from uipath.runtime import UiPathRuntimeProtocol + from uipath.runtime import UiPathRuntimeProtocol, UiPathRuntimeSchema except ImportError: UiPathRuntimeProtocol = Any # type: ignore @@ -192,8 +192,7 @@ async def configure_eval_set_run_span( span: Span, evaluator_averages: Dict[str, float], execution_id: str, - runtime: Any, - get_schema_func: Any, + schema: UiPathRuntimeSchema, success: bool = True, ) -> None: """Configure Evaluation Set Run span with output and metadata. @@ -216,7 +215,6 @@ async def configure_eval_set_run_span( # Get runtime schemas try: - schema = await get_schema_func(runtime) input_schema = schema.input output_schema = schema.output except Exception: diff --git a/src/uipath/_cli/_utils/_eval_set.py b/src/uipath/_cli/_utils/_eval_set.py index b4ac727b1..ccad1e89a 100644 --- a/src/uipath/_cli/_utils/_eval_set.py +++ b/src/uipath/_cli/_utils/_eval_set.py @@ -5,6 +5,7 @@ import click from pydantic import ValidationError +from uipath._cli._evals._evaluator_factory import EvaluatorFactory from uipath._cli._evals._models._evaluation_set import ( EvaluationItem, EvaluationSet, @@ -13,6 +14,7 @@ ) from uipath._cli._evals.mocks.types import InputMockingStrategy, LLMMockingStrategy from uipath._cli._utils._console import ConsoleLogger +from uipath.eval.evaluators import BaseEvaluator console = ConsoleLogger() @@ -175,3 +177,61 @@ def migrate_evaluation_item( if eval_ids: eval_set.extract_selected_evals(eval_ids) return eval_set, resolved_path + + @staticmethod + async def load_evaluators( + eval_set_path: str, + evaluation_set: EvaluationSet, + agent_model: str | None = None, + ) -> list[BaseEvaluator[Any, Any, Any]]: + """Load evaluators referenced by the evaluation set.""" + evaluators = [] + if evaluation_set is None: + raise ValueError("eval_set cannot be None") + evaluators_dir = Path(eval_set_path).parent.parent / "evaluators" + + # If evaluatorConfigs is specified, use that (new field with weights) + # Otherwise, fall back to evaluatorRefs (old field without weights) + if ( + hasattr(evaluation_set, "evaluator_configs") + and evaluation_set.evaluator_configs + ): + # Use new evaluatorConfigs field - supports weights + evaluator_ref_ids = {ref.ref for ref in evaluation_set.evaluator_configs} + else: + # Fall back to old evaluatorRefs field - plain strings + evaluator_ref_ids = set(evaluation_set.evaluator_refs) + + found_evaluator_ids = set() + + for file in evaluators_dir.glob("*.json"): + try: + with open(file, "r", encoding="utf-8") as f: + data = json.load(f) + except json.JSONDecodeError as e: + raise ValueError( + f"Invalid JSON in evaluator file '{file}': {str(e)}. " + f"Please check the file for syntax errors." 
+ ) from e + + try: + evaluator_id = data.get("id") + if evaluator_id in evaluator_ref_ids: + evaluator = EvaluatorFactory.create_evaluator( + data, evaluators_dir, agent_model=agent_model + ) + evaluators.append(evaluator) + found_evaluator_ids.add(evaluator_id) + except Exception as e: + raise ValueError( + f"Failed to create evaluator from file '{file}': {str(e)}. " + f"Please verify the evaluator configuration." + ) from e + + missing_evaluators = evaluator_ref_ids - found_evaluator_ids + if missing_evaluators: + raise ValueError( + f"Could not find the following evaluators: {missing_evaluators}" + ) + + return evaluators diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index 295bd67fe..4cab05d5e 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -2,16 +2,24 @@ import asyncio import logging import os +import uuid from typing import Any import click from uipath.core.tracing import UiPathTraceManager -from uipath.runtime import UiPathRuntimeContext, UiPathRuntimeFactoryRegistry +from uipath.runtime import ( + UiPathRuntimeContext, + UiPathRuntimeFactoryRegistry, + UiPathRuntimeProtocol, + UiPathRuntimeSchema, +) from uipath._cli._evals._console_progress_reporter import ConsoleProgressReporter from uipath._cli._evals._evaluate import evaluate +from uipath._cli._evals._models._evaluation_set import EvaluationSet from uipath._cli._evals._progress_reporter import StudioWebProgressReporter from uipath._cli._evals._runtime import ( + LLMAgentRuntimeProtocol, UiPathEvalContext, ) from uipath._cli._evals._telemetry import EvalTelemetrySubscriber @@ -62,6 +70,109 @@ def setup_reporting_prereq(no_report: bool) -> bool: return True +def _find_agent_model_in_runtime(runtime: UiPathRuntimeProtocol) -> str | None: + """Recursively search for get_agent_model in runtime and its delegates. + + Runtimes may be wrapped (e.g., ResumableRuntime wraps TelemetryWrapper + which wraps the base runtime). This method traverses the wrapper chain + to find a runtime that implements LLMAgentRuntimeProtocol. + + Args: + runtime: The runtime to check (may be a wrapper) + + Returns: + The model name if found, None otherwise. + """ + # Check if this runtime implements the protocol + if isinstance(runtime, LLMAgentRuntimeProtocol): + return runtime.get_agent_model() + + # Check for delegate property (used by UiPathResumableRuntime, TelemetryRuntimeWrapper) + delegate = getattr(runtime, "delegate", None) or getattr(runtime, "_delegate", None) + if delegate is not None: + return _find_agent_model_in_runtime(delegate) + + return None + + +async def _get_agent_model( + runtime: UiPathRuntimeProtocol, schema: UiPathRuntimeSchema +) -> str | None: + """Get agent model from the runtime schema metadata. + + The model is read from schema.metadata["settings"]["model"] which is + populated by the low-code agents runtime from agent.json. + + Returns: + The model name from agent settings, or None if not found. 
+ """ + try: + if schema.metadata and "settings" in schema.metadata: + settings = schema.metadata["settings"] + model = settings.get("model") + if model: + logger.debug(f"Got agent model from schema.metadata: {model}") + return model + + # Fallback to protocol-based approach for backwards compatibility + model = _find_agent_model_in_runtime(runtime) + if model: + logger.debug(f"Got agent model from runtime protocol: {model}") + return model + except Exception: + return None + + +def _resolve_model_settings_override( + model_settings_id: str, evaluation_set: EvaluationSet +) -> dict[str, Any] | None: + """Resolve model settings override from evaluation set. + + Returns: + Model settings dict to use for override, or None if using defaults. + Settings are passed to factory via settings kwarg. + """ + # Skip if no model settings ID specified or using default + if not model_settings_id or model_settings_id == "default": + return None + + # Load evaluation set to get model settings + if not evaluation_set.model_settings: + logger.warning("No model settings available in evaluation set") + return None + + # Find the specified model settings + target_model_settings = next( + (ms for ms in evaluation_set.model_settings if ms.id == model_settings_id), + None, + ) + + if not target_model_settings: + logger.warning( + f"Model settings ID '{model_settings_id}' not found in evaluation set" + ) + return None + + logger.info( + f"Applying model settings override: model={target_model_settings.model_name}, temperature={target_model_settings.temperature}" + ) + + # Return settings dict with correct keys for factory + override: dict[str, str | float] = {} + if ( + target_model_settings.model_name + and target_model_settings.model_name != "same-as-agent" + ): + override["model"] = target_model_settings.model_name + if ( + target_model_settings.temperature is not None + and target_model_settings.temperature != "same-as-agent" + ): + override["temperature"] = float(target_model_settings.temperature) + + return override if override else None + + @click.command() @click.argument("entrypoint", required=False) @click.argument("eval_set", required=False) @@ -188,7 +299,6 @@ def eval( eval_context = UiPathEvalContext() eval_context.entrypoint = entrypoint or auto_discover_entrypoint() - eval_context.no_report = no_report eval_context.workers = workers eval_context.eval_set_run_id = eval_set_run_id eval_context.enable_mocker_cache = enable_mocker_cache @@ -197,10 +307,7 @@ def eval( eval_set_path = eval_set or EvalHelpers.auto_discover_eval_set() _, resolved_eval_set_path = EvalHelpers.load_eval_set(eval_set_path, eval_ids) - eval_context.eval_set = resolved_eval_set_path - eval_context.eval_ids = eval_ids eval_context.report_coverage = report_coverage - eval_context.model_settings_id = model_settings_id eval_context.input_overrides = input_overrides eval_context.resume = resume @@ -269,6 +376,39 @@ async def execute_eval(): project_id = UiPathConfig.project_id + eval_context.execution_id = ( + eval_context.job_id + or eval_context.eval_set_run_id + or str(uuid.uuid4()) + ) + + # Load eval set (path is already resolved in cli_eval.py) + eval_context.evaluation_set, _ = EvalHelpers.load_eval_set( + resolved_eval_set_path, eval_ids + ) + + # Resolve model settings override from eval set + settings_override = _resolve_model_settings_override( + model_settings_id, eval_context.evaluation_set + ) + + runtime = await runtime_factory.new_runtime( + entrypoint=eval_context.entrypoint or "", + 
runtime_id=eval_context.execution_id, + settings=settings_override, + ) + + eval_context.runtime_schema = await runtime.get_schema() + + eval_context.evaluators = await EvalHelpers.load_evaluators( + resolved_eval_set_path, + eval_context.evaluation_set, + await _get_agent_model(runtime, eval_context.runtime_schema), + ) + + # Runtime is not required anymore. + await runtime.dispose() + try: if project_id: studio_client = StudioClient(project_id) diff --git a/tests/cli/eval/test_eval_resume_flow.py b/tests/cli/eval/test_eval_resume_flow.py index 7547b195c..2096f481a 100644 --- a/tests/cli/eval/test_eval_resume_flow.py +++ b/tests/cli/eval/test_eval_resume_flow.py @@ -1,5 +1,6 @@ """Unit tests for eval resume flow to ensure UiPathExecuteOptions is passed correctly.""" +import uuid from pathlib import Path from unittest.mock import AsyncMock, patch @@ -14,6 +15,7 @@ ) from uipath._cli._evals._runtime import UiPathEvalContext, UiPathEvalRuntime +from uipath._cli._utils._eval_set import EvalHelpers from uipath._events._event_bus import EventBus # ============================================================================ @@ -27,25 +29,42 @@ @pytest.mark.asyncio async def test_execute_runtime_method_passes_options_with_resume_false(): - """Direct test of execute_runtime method to verify UiPathExecuteOptions(resume=False) is passed.""" + """Test that execute_runtime respects resume=False setting.""" # Arrange from uipath._cli._evals._models._evaluation_set import EvaluationItem event_bus = EventBus() trace_manager = UiPathTraceManager() - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - context.resume = False # Test resume=False - # Create a mock runtime that will be wrapped + # Load evaluation set + eval_set_path = str(Path(__file__).parent / "evals" / "eval-sets" / "default.json") + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + + # Create a mock runtime to get schema mock_runtime = AsyncMock(spec=UiPathRuntimeProtocol) mock_runtime.execute = AsyncMock( return_value=UiPathRuntimeResult( output={"result": "success"}, status=UiPathRuntimeStatus.SUCCESSFUL ) ) + mock_runtime.get_schema = AsyncMock() + + runtime_schema = await mock_runtime.get_schema() + runtime_schema.input = {"type": "object", "properties": {}} + runtime_schema.output = {"type": "object", "properties": {}} + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + context.resume = False # Test resume=False # Create a mock factory mock_factory = AsyncMock(spec=UiPathRuntimeFactoryProtocol) @@ -74,7 +93,7 @@ async def test_execute_runtime_method_passes_options_with_resume_false(): mock_execution_runtime_class.return_value = mock_execution_runtime_instance await eval_runtime.execute_runtime( - eval_item=eval_item, execution_id="test-exec-id", runtime=mock_runtime + eval_item=eval_item, execution_id="test-exec-id" ) # Assert - Verify that execute was called with UiPathExecuteOptions(resume=False) @@ -96,17 +115,16 @@ async def test_execute_runtime_method_passes_options_with_resume_false(): @pytest.mark.asyncio async def test_execute_runtime_method_passes_options_with_resume_true(): - """Direct test of execute_runtime method to verify 
UiPathExecuteOptions(resume=True) is passed.""" + """Test that execute_runtime respects resume=True setting.""" # Arrange from uipath._cli._evals._models._evaluation_set import EvaluationItem event_bus = EventBus() trace_manager = UiPathTraceManager() - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - context.resume = True # Test resume=True + + # Load evaluation set + eval_set_path = str(Path(__file__).parent / "evals" / "eval-sets" / "default.json") + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) # Create a mock runtime mock_runtime = AsyncMock(spec=UiPathRuntimeProtocol) @@ -115,6 +133,24 @@ async def test_execute_runtime_method_passes_options_with_resume_true(): output={"result": "success"}, status=UiPathRuntimeStatus.SUCCESSFUL ) ) + mock_runtime.get_schema = AsyncMock() + + runtime_schema = await mock_runtime.get_schema() + runtime_schema.input = {"type": "object", "properties": {}} + runtime_schema.output = {"type": "object", "properties": {}} + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + context.resume = True # Test resume=True # Create a mock factory mock_factory = AsyncMock(spec=UiPathRuntimeFactoryProtocol) @@ -143,7 +179,7 @@ async def test_execute_runtime_method_passes_options_with_resume_true(): mock_execution_runtime_class.return_value = mock_execution_runtime_instance await eval_runtime.execute_runtime( - eval_item=eval_item, execution_id="test-exec-id", runtime=mock_runtime + eval_item=eval_item, execution_id="test-exec-id" ) # Assert - Verify that execute was called with UiPathExecuteOptions(resume=True) @@ -167,15 +203,35 @@ async def test_resume_with_multiple_evaluations_raises_error(): # Arrange event_bus = EventBus() trace_manager = UiPathTraceManager() - context = UiPathEvalContext() - context.eval_set = str( + + # Load evaluation set with multiple evals + eval_set_path = str( Path(__file__).parent / "evals" / "eval-sets" / "multiple-evals.json" ) + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + + # Create a mock runtime + mock_runtime = AsyncMock(spec=UiPathRuntimeProtocol) + mock_runtime.get_schema = AsyncMock() + runtime_schema = await mock_runtime.get_schema() + runtime_schema.input = {"type": "object", "properties": {}} + runtime_schema.output = {"type": "object", "properties": {}} + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators context.resume = True # Enable resume mode # Create a mock factory mock_factory = AsyncMock(spec=UiPathRuntimeFactoryProtocol) - mock_runtime = AsyncMock(spec=UiPathRuntimeProtocol) mock_factory.new_runtime = AsyncMock(return_value=mock_runtime) eval_runtime = UiPathEvalRuntime( @@ -190,4 +246,4 @@ async def test_resume_with_multiple_evaluations_raises_error(): ValueError, match=r"Resume mode is not supported with multiple evaluations.*Found 2 evaluations", ): - await eval_runtime.initiate_evaluation(mock_runtime) + await 
eval_runtime.initiate_evaluation() diff --git a/tests/cli/eval/test_eval_runtime_metadata.py b/tests/cli/eval/test_eval_runtime_metadata.py index d85d36680..ab62cbf34 100644 --- a/tests/cli/eval/test_eval_runtime_metadata.py +++ b/tests/cli/eval/test_eval_runtime_metadata.py @@ -8,7 +8,7 @@ - LLMAgentRuntimeProtocol - protocol implementation detection """ -from pathlib import Path +import uuid from typing import Any, AsyncGenerator import pytest @@ -30,6 +30,10 @@ UiPathEvalContext, UiPathEvalRuntime, ) +from uipath._cli.cli_eval import ( + _find_agent_model_in_runtime, + _get_agent_model, +) from uipath._events._event_bus import EventBus @@ -155,184 +159,113 @@ def test_protocol_rejects_wrapper_without_method(self): class TestFindAgentModelInRuntime: """Tests for _find_agent_model_in_runtime recursive search.""" - @pytest.fixture - def eval_runtime(self): - """Create an eval runtime for testing.""" - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - - async def create_runtime(): - return BaseTestRuntime() - - factory = MockFactory(create_runtime) - return UiPathEvalRuntime( - context, - factory, - trace_manager, - event_bus, - ) - - def test_finds_model_in_direct_runtime(self, eval_runtime): + def test_finds_model_in_direct_runtime(self): """Test finding agent model directly on runtime.""" runtime = AgentModelRuntime("gpt-4o") - result = eval_runtime._find_agent_model_in_runtime(runtime) + result = _find_agent_model_in_runtime(runtime) assert result == "gpt-4o" - def test_finds_model_in_wrapped_runtime(self, eval_runtime): + def test_finds_model_in_wrapped_runtime(self): """Test finding agent model through wrapper's delegate.""" inner = AgentModelRuntime("claude-3") wrapper = WrapperRuntime(inner) - result = eval_runtime._find_agent_model_in_runtime(wrapper) + result = _find_agent_model_in_runtime(wrapper) assert result == "claude-3" - def test_finds_model_in_deeply_wrapped_runtime(self, eval_runtime): + def test_finds_model_in_deeply_wrapped_runtime(self): """Test finding agent model through multiple wrapper layers.""" inner = AgentModelRuntime("gpt-4-turbo") wrapper1 = WrapperRuntime(inner) wrapper2 = WrapperRuntime(wrapper1) - result = eval_runtime._find_agent_model_in_runtime(wrapper2) + result = _find_agent_model_in_runtime(wrapper2) assert result == "gpt-4-turbo" - def test_finds_model_via_private_delegate(self, eval_runtime): + def test_finds_model_via_private_delegate(self): """Test finding agent model through _delegate attribute.""" inner = AgentModelRuntime("gemini-pro") wrapper = PrivateDelegateRuntime(inner) - result = eval_runtime._find_agent_model_in_runtime(wrapper) + result = _find_agent_model_in_runtime(wrapper) assert result == "gemini-pro" - def test_returns_none_when_no_model(self, eval_runtime): + def test_returns_none_when_no_model(self): """Test returns None when no runtime implements the protocol.""" runtime = BaseTestRuntime() - result = eval_runtime._find_agent_model_in_runtime(runtime) + result = _find_agent_model_in_runtime(runtime) assert result is None - def test_returns_none_for_none_model(self, eval_runtime): + def test_returns_none_for_none_model(self): """Test returns None when runtime returns None for model.""" runtime = AgentModelRuntime(None) - result = eval_runtime._find_agent_model_in_runtime(runtime) + result = _find_agent_model_in_runtime(runtime) assert result is None class TestGetAgentModel: - """Tests for 
_get_agent_model method.""" + """Tests for _get_agent_model function.""" - @pytest.fixture - def context(self): - """Create eval context.""" - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - return context - - async def test_returns_agent_model(self, context): - """Test that _get_agent_model returns the correct model.""" + @pytest.mark.asyncio + async def test_returns_agent_model(self): + """Test that _get_agent_model returns the correct model from schema.""" + runtime = AgentModelRuntime("gpt-4o-2024-11-20") + schema = MockRuntimeSchema() + schema.metadata = {"settings": {"model": "gpt-4o-2024-11-20"}} - async def create_runtime(): - return AgentModelRuntime("gpt-4o-2024-11-20") - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime( - context, - factory, - trace_manager, - event_bus, - ) - - runtime = await create_runtime() - model = await eval_runtime._get_agent_model(runtime) + model = await _get_agent_model(runtime, schema) assert model == "gpt-4o-2024-11-20" - async def test_returns_none_when_no_model(self, context): + @pytest.mark.asyncio + async def test_returns_none_when_no_model(self): """Test that _get_agent_model returns None when runtime has no model.""" + runtime = BaseTestRuntime() + schema = MockRuntimeSchema() - async def create_runtime(): - return BaseTestRuntime() - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime( - context, - factory, - trace_manager, - event_bus, - ) - - runtime = await create_runtime() - model = await eval_runtime._get_agent_model(runtime) + model = await _get_agent_model(runtime, schema) assert model is None - async def test_returns_model_consistently(self, context): + @pytest.mark.asyncio + async def test_returns_model_consistently(self): """Test that _get_agent_model returns consistent results.""" - - async def create_runtime(): - return AgentModelRuntime("consistent-model") - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime( - context, - factory, - trace_manager, - event_bus, - ) - - runtime = await create_runtime() + runtime = AgentModelRuntime("consistent-model") + schema = MockRuntimeSchema() + schema.metadata = {"settings": {"model": "consistent-model"}} # Multiple calls should return the same value - model1 = await eval_runtime._get_agent_model(runtime) - model2 = await eval_runtime._get_agent_model(runtime) + model1 = await _get_agent_model(runtime, schema) + model2 = await _get_agent_model(runtime, schema) assert model1 == model2 == "consistent-model" - async def test_handles_exception_gracefully(self, context): - """Test that _get_agent_model returns None on exception.""" + @pytest.mark.asyncio + async def test_handles_exception_gracefully(self, monkeypatch): + """Test that _get_agent_model returns None when _find_agent_model_in_runtime raises exception.""" + runtime = BaseTestRuntime() + schema = MockRuntimeSchema() - async def create_good_runtime(): - return AgentModelRuntime("model") + # Mock _find_agent_model_in_runtime to raise an exception + def mock_find_agent_model_error(r): + raise RuntimeError("Unexpected error during model lookup") - factory = MockFactory(create_good_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime( - context, - 
factory, - trace_manager, - event_bus, + monkeypatch.setattr( + "uipath._cli.cli_eval._find_agent_model_in_runtime", + mock_find_agent_model_error, ) - # Create a bad runtime that raises during get_agent_model - class BadRuntime(BaseTestRuntime): - def get_agent_model(self): - raise RuntimeError("Get model error") - - bad_runtime = BadRuntime() - model = await eval_runtime._get_agent_model(bad_runtime) + model = await _get_agent_model(runtime, schema) assert model is None class TestGetSchema: """Tests for get_schema method.""" - @pytest.fixture - def context(self): - """Create eval context.""" + @pytest.mark.asyncio + async def test_returns_schema(self): + """Test that get_schema returns the schema from context.""" + schema = MockRuntimeSchema() context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - return context - - async def test_returns_schema(self, context): - """Test that get_schema returns the schema.""" + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = None # type: ignore + context.runtime_schema = schema + context.evaluators = [] async def create_runtime(): return BaseTestRuntime() @@ -347,13 +280,19 @@ async def create_runtime(): event_bus, ) - runtime = await create_runtime() - schema = await eval_runtime.get_schema(runtime) - assert schema is not None - assert schema.file_path == "test.py" + retrieved_schema = await eval_runtime.get_schema() + assert retrieved_schema is not None + assert retrieved_schema.file_path == "test.py" - async def test_returns_schema_consistently(self, context): - """Test that get_schema returns consistent results.""" + @pytest.mark.asyncio + async def test_returns_schema_consistently(self): + """Test that get_schema returns the same schema from context.""" + schema = MockRuntimeSchema() + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = None # type: ignore + context.runtime_schema = schema + context.evaluators = [] async def create_runtime(): return BaseTestRuntime() @@ -368,56 +307,19 @@ async def create_runtime(): event_bus, ) - runtime = await create_runtime() - - # Multiple calls should return equivalent values - schema1 = await eval_runtime.get_schema(runtime) - schema2 = await eval_runtime.get_schema(runtime) + # Multiple calls should return the same schema from context + schema1 = await eval_runtime.get_schema() + schema2 = await eval_runtime.get_schema() - # Should have the same properties + # Should be the same object + assert schema1 is schema2 assert schema1.file_path == schema2.file_path == "test.py" - async def test_schema_and_model_work_with_same_runtime(self, context): - """Test that get_schema and _get_agent_model work with the same runtime.""" - - async def create_runtime(): - return AgentModelRuntime("shared-model") - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime( - context, - factory, - trace_manager, - event_bus, - ) - - runtime = await create_runtime() - - # Call both methods with the same runtime - schema = await eval_runtime.get_schema(runtime) - model = await eval_runtime._get_agent_model(runtime) - - # Both should work correctly - assert schema is not None - assert schema.file_path == "test.py" - assert model == "shared-model" - class TestWrappedRuntimeModelResolution: """Tests for model resolution through realistic wrapper chains.""" - @pytest.fixture - def context(self): - """Create eval 
context.""" - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - return context - - async def test_resolves_model_through_resumable_telemetry_chain(self, context): + def test_resolves_model_through_resumable_telemetry_chain(self): """Test model resolution through ResumableRuntime -> TelemetryWrapper -> BaseRuntime chain. This mimics the real wrapper chain: @@ -432,18 +334,5 @@ async def test_resolves_model_through_resumable_telemetry_chain(self, context): # Simulate UiPathResumableRuntime resumable_runtime = WrapperRuntime(telemetry_wrapper) - async def create_runtime(): - return resumable_runtime - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime( - context, - factory, - trace_manager, - event_bus, - ) - - model = await eval_runtime._get_agent_model(resumable_runtime) + model = _find_agent_model_in_runtime(resumable_runtime) assert model == "gpt-4o-from-agent-json" diff --git a/tests/cli/eval/test_eval_runtime_suspend_resume.py b/tests/cli/eval/test_eval_runtime_suspend_resume.py index ec8e07167..db4cd4d71 100644 --- a/tests/cli/eval/test_eval_runtime_suspend_resume.py +++ b/tests/cli/eval/test_eval_runtime_suspend_resume.py @@ -7,6 +7,7 @@ - Ensures no duplicate eval run entries in StudioWeb """ +import uuid from pathlib import Path from typing import Any, AsyncGenerator from unittest.mock import AsyncMock @@ -27,6 +28,7 @@ from uipath._cli._evals._evaluate import evaluate from uipath._cli._evals._runtime import UiPathEvalContext +from uipath._cli._utils._eval_set import EvalHelpers from uipath._events._event_bus import EventBus from uipath._events._events import EvaluationEvents @@ -131,12 +133,29 @@ async def dispose(self) -> None: @pytest.fixture -def context(): +async def context(): """Create eval context.""" - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" + eval_set_path = str(Path(__file__).parent / "evals" / "eval-sets" / "default.json") + + # Load evaluation set + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + + # Create a mock runtime to get schema + runtime = SuccessfulRuntime() + runtime_schema = await runtime.get_schema() + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + return context diff --git a/tests/cli/eval/test_eval_span_utils.py b/tests/cli/eval/test_eval_span_utils.py index 7557861aa..5e523276c 100644 --- a/tests/cli/eval/test_eval_span_utils.py +++ b/tests/cli/eval/test_eval_span_utils.py @@ -286,8 +286,7 @@ async def test_configure_eval_set_run_span(self): "eval2": 90.0, } - # Mock runtime and get_schema_func - mock_runtime = MagicMock() + # Mock schema mock_schema = MagicMock() mock_schema.input = { "type": "object", @@ -295,15 +294,11 @@ async def test_configure_eval_set_run_span(self): } mock_schema.output = {"type": "string"} - async def mock_get_schema(runtime): - return mock_schema - await configure_eval_set_run_span( span=span, # type: ignore[arg-type] evaluator_averages=evaluator_averages, execution_id="exec-complete", - runtime=mock_runtime, - get_schema_func=mock_get_schema, + schema=mock_schema, success=True, ) @@ 
-331,16 +326,16 @@ async def test_configure_eval_set_run_span_schema_error(self): evaluator_averages = {"eval1": 75.0} - # Mock get_schema_func that raises exception - async def mock_get_schema_error(runtime): - raise Exception("Schema not found") + # Mock schema with missing fields + mock_schema = MagicMock() + mock_schema.input = None + mock_schema.output = None await configure_eval_set_run_span( span=span, # type: ignore[arg-type] evaluator_averages=evaluator_averages, execution_id="exec-no-schema", - runtime=MagicMock(), - get_schema_func=mock_get_schema_error, + schema=mock_schema, success=True, ) diff --git a/tests/cli/eval/test_eval_tracing_integration.py b/tests/cli/eval/test_eval_tracing_integration.py index 3a9cf3d33..0b2f42fa0 100644 --- a/tests/cli/eval/test_eval_tracing_integration.py +++ b/tests/cli/eval/test_eval_tracing_integration.py @@ -4,12 +4,15 @@ with the expected attributes by mocking the tracer. """ +import uuid from contextlib import contextmanager from typing import Any from unittest.mock import AsyncMock, MagicMock, patch import pytest +from uipath.runtime.schema import UiPathRuntimeSchema +from uipath._cli._evals._models._evaluation_set import EvaluationSet from uipath._cli._evals._runtime import UiPathEvalContext, UiPathEvalRuntime from uipath.eval.evaluators import BaseEvaluator from uipath.eval.models import NumericEvaluationResult @@ -71,10 +74,39 @@ def get_span_by_name(self, name: str) -> dict[str, Any] | None: def create_eval_context(**kwargs: Any) -> UiPathEvalContext: - """Helper to create UiPathEvalContext with specific attribute values.""" + """Helper to create UiPathEvalContext with specific attribute values. + + Sets sensible defaults for required fields if not provided. + """ context = UiPathEvalContext() + + # Set required fields with defaults + if "execution_id" not in kwargs: + context.execution_id = str(uuid.uuid4()) + + if "runtime_schema" not in kwargs: + context.runtime_schema = UiPathRuntimeSchema( + filePath="test.py", + uniqueId="test", + type="workflow", + input={"type": "object", "properties": {}}, + output={"type": "object", "properties": {}}, + ) + + if "evaluation_set" not in kwargs: + context.evaluation_set = EvaluationSet( + id="test-eval-set", + name="Test Evaluation Set", + evaluations=[], + ) + + if "evaluators" not in kwargs: + context.evaluators = [] + + # Override with provided kwargs for key, value in kwargs.items(): setattr(context, key, value) + return context @@ -258,14 +290,12 @@ async def test_execute_eval_creates_evaluation_span( mock_execution_output.spans = [] mock_execution_output.logs = [] - mock_runtime = AsyncMock() - with patch.object( runtime, "execute_runtime", new=AsyncMock(return_value=mock_execution_output), ): - await runtime._execute_eval(mock_eval_item, [], mock_runtime) + await runtime._execute_eval(mock_eval_item, []) # Verify Evaluation span was created evaluation_spans = capturing_tracer.get_spans_by_type("evaluation") @@ -466,7 +496,6 @@ async def test_evaluation_span_has_unique_execution_id( event_bus=mock_event_bus, ) - mock_runtime = AsyncMock() mock_execution_output = MagicMock() mock_execution_output.result.output = {} mock_execution_output.result.status = "successful" @@ -489,7 +518,7 @@ async def test_evaluation_span_has_unique_execution_id( "execute_runtime", new=AsyncMock(return_value=mock_execution_output), ): - await runtime._execute_eval(eval_item, [], mock_runtime) + await runtime._execute_eval(eval_item, []) # Get execution IDs from spans evaluation_spans = 
capturing_tracer.get_spans_by_type("evaluation") @@ -747,7 +776,7 @@ async def test_evaluation_set_run_span_has_output_attribute( ) # Execute evaluation - await runtime._execute_eval(eval_item, [evaluator], mock_runtime) + await runtime._execute_eval(eval_item, [evaluator]) # Check that Evaluation span has output attribute eval_spans = self.capturing_tracer.get_spans_by_type("evaluation") @@ -825,7 +854,7 @@ async def test_evaluation_span_has_metadata_attributes( evaluation_criterias={"test-evaluator": {}}, ) - await runtime._execute_eval(eval_item, [evaluator], mock_runtime) + await runtime._execute_eval(eval_item, [evaluator]) # Check metadata attributes on Evaluation span eval_spans = self.capturing_tracer.get_spans_by_type("evaluation") diff --git a/tests/cli/eval/test_evaluate.py b/tests/cli/eval/test_evaluate.py index 68700a4ee..daf67b4de 100644 --- a/tests/cli/eval/test_evaluate.py +++ b/tests/cli/eval/test_evaluate.py @@ -1,3 +1,4 @@ +import uuid from pathlib import Path from typing import Any, AsyncGenerator @@ -18,17 +19,15 @@ from uipath._cli._evals._evaluate import evaluate from uipath._cli._evals._models._output import UiPathEvalOutput from uipath._cli._evals._runtime import UiPathEvalContext, UiPathEvalRuntime +from uipath._cli._utils._eval_set import EvalHelpers from uipath._events._event_bus import EventBus async def test_evaluate(): event_bus = EventBus() trace_manager = UiPathTraceManager() - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) + # Create a mock runtime and factory async def identity(input: dict[str, Any]) -> dict[str, Any]: return input @@ -93,6 +92,26 @@ async def dispose(self) -> None: factory = TestFactory(identity) + # Load evaluation set + eval_set_path = str(Path(__file__).parent / "evals" / "eval-sets" / "default.json") + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + + # Create runtime and get schema + runtime = await factory.new_runtime("test", "test-runtime-id") + runtime_schema = await runtime.get_schema() + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + # Act result = await evaluate( factory, @@ -123,12 +142,8 @@ async def dispose(self) -> None: async def test_eval_runtime_generates_uuid_when_no_custom_id(): - """Test that UiPathEvalRuntime generates UUID when no custom eval_set_run_id provided.""" + """Test that UiPathEvalRuntime uses execution_id from context.""" # Arrange - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) event_bus = EventBus() trace_manager = UiPathTraceManager() @@ -197,8 +212,28 @@ async def dispose(self) -> None: factory = TestFactory(identity) + # Load evaluation set + eval_set_path = str(Path(__file__).parent / "evals" / "eval-sets" / "default.json") + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + + # Create runtime and get schema + runtime = await factory.new_runtime("test", "test-runtime-id") + runtime_schema = await runtime.get_schema() + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = 
str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + # Act - runtime = UiPathEvalRuntime( + eval_runtime = UiPathEvalRuntime( context, factory, trace_manager, @@ -207,17 +242,13 @@ async def dispose(self) -> None: # Assert # Should be a valid UUID format (36 characters with dashes) - assert len(runtime.execution_id) == 36 - assert runtime.execution_id.count("-") == 4 + assert len(eval_runtime.execution_id) == 36 + assert eval_runtime.execution_id.count("-") == 4 async def test_eval_runtime_works_without_exporters(): """Test that UiPathEvalRuntime works when both exporters are None (local execution).""" # Arrange - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) event_bus = EventBus() trace_manager = UiPathTraceManager() @@ -286,8 +317,28 @@ async def dispose(self) -> None: factory = TestFactory(identity) + # Load evaluation set + eval_set_path = str(Path(__file__).parent / "evals" / "eval-sets" / "default.json") + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + + # Create runtime and get schema + runtime = await factory.new_runtime("test", "test-runtime-id") + runtime_schema = await runtime.get_schema() + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + # Act - runtime = UiPathEvalRuntime( + eval_runtime = UiPathEvalRuntime( context, factory, trace_manager, @@ -295,9 +346,9 @@ async def dispose(self) -> None: ) # Assert - Runtime should work - assert runtime is not None - assert len(runtime.execution_id) == 36 - assert runtime.execution_id.count("-") == 4 + assert eval_runtime is not None + assert len(eval_runtime.execution_id) == 36 + assert eval_runtime.execution_id.count("-") == 4 # Verify that evaluate() also works result = await evaluate( diff --git a/uv.lock b/uv.lock index fc6d8b446..993b17e28 100644 --- a/uv.lock +++ b/uv.lock @@ -2491,7 +2491,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.6.26" +version = "2.6.27" source = { editable = "." } dependencies = [ { name = "applicationinsights" },
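
The updated tests above all share one setup pattern for the decoupled evaluation context: load the evaluation set, obtain the runtime schema from a runtime instance, resolve the evaluators, and then populate UiPathEvalContext directly instead of pointing it at an eval-set file path. A minimal sketch of that pattern follows, mirroring the test fixtures rather than any documented public API; the function name build_eval_context, the factory parameter, and the "entrypoint" / "runtime-id" arguments are placeholders and not part of this patch.

import uuid

from uipath._cli._evals._runtime import UiPathEvalContext
from uipath._cli._utils._eval_set import EvalHelpers


async def build_eval_context(eval_set_path: str, factory) -> UiPathEvalContext:
    # Load the evaluation set definition; load_eval_set returns the parsed set
    # plus a second value that the tests above discard.
    evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path)

    # Ask a runtime created by the caller's factory for its schema.
    # ("entrypoint" / "runtime-id" are placeholder arguments.)
    runtime = await factory.new_runtime("entrypoint", "runtime-id")
    runtime_schema = await runtime.get_schema()

    # Resolve the evaluators referenced by the evaluation set.
    evaluators = await EvalHelpers.load_evaluators(
        eval_set_path, evaluation_set, agent_model=None
    )

    # Populate the context with the pre-loaded objects, as the tests do.
    context = UiPathEvalContext()
    context.execution_id = str(uuid.uuid4())
    context.evaluation_set = evaluation_set
    context.runtime_schema = runtime_schema
    context.evaluators = evaluators
    return context

The populated context is then handed to UiPathEvalRuntime(context, factory, trace_manager, event_bus) or to evaluate(factory, context, ...), exactly as the updated tests do.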