From 2621a0e6b06abb8025a1465507867a79c94a2b34 Mon Sep 17 00:00:00 2001 From: Akshaya Shanbhogue Date: Sat, 31 Jan 2026 11:38:37 -0800 Subject: [PATCH] refactor(DecoupleLoadFromRuntime): decouple eval load from eval runtime This allows customers to run evals dynamically without having to materialize to files. --- pyproject.toml | 2 +- src/uipath/_cli/_evals/_runtime.py | 545 ++++++------------ src/uipath/_cli/_evals/_span_utils.py | 6 +- src/uipath/_cli/_utils/_eval_set.py | 60 ++ src/uipath/_cli/cli_eval.py | 150 ++++- tests/cli/eval/test_eval_resume_flow.py | 94 ++- tests/cli/eval/test_eval_runtime_metadata.py | 263 +++------ .../eval/test_eval_runtime_suspend_resume.py | 27 +- tests/cli/eval/test_eval_span_utils.py | 19 +- .../cli/eval/test_eval_tracing_integration.py | 45 +- tests/cli/eval/test_evaluate.py | 91 ++- uv.lock | 2 +- 12 files changed, 666 insertions(+), 638 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bf024661e..326970586 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.6.26" +version = "2.6.27" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index bb0e4a227..3120a77d4 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -1,9 +1,7 @@ import json import logging -import uuid from collections import defaultdict from contextlib import contextmanager -from pathlib import Path from time import time from typing import ( Any, @@ -38,7 +36,6 @@ UiPathExecuteOptions, UiPathExecutionRuntime, UiPathRuntimeFactoryProtocol, - UiPathRuntimeProtocol, UiPathRuntimeResult, UiPathRuntimeStatus, UiPathRuntimeStorageProtocol, @@ -72,10 +69,8 @@ from ...eval.evaluators import BaseEvaluator from ...eval.models import EvaluationResult from ...eval.models.models import AgentExecution, EvalItemResult -from .._utils._eval_set import EvalHelpers from .._utils._parallelization import execute_parallel from ._eval_util import apply_input_overrides -from ._evaluator_factory import EvaluatorFactory from ._models._evaluation_set import ( EvaluationItem, EvaluationSet, @@ -195,17 +190,20 @@ def clear(self, execution_id: str | None = None) -> None: class UiPathEvalContext: """Context used for evaluation runs.""" + # Required Fields + runtime_schema: UiPathRuntimeSchema + evaluation_set: EvaluationSet + evaluators: list[BaseEvaluator[Any, Any, Any]] + execution_id: str + + # Optional Fields entrypoint: str | None = None - no_report: bool | None = False workers: int | None = 1 - eval_set: str | None = None - eval_ids: list[str] | None = None eval_set_run_id: str | None = None verbose: bool = False enable_mocker_cache: bool = False report_coverage: bool = False input_overrides: dict[str, Any] | None = None - model_settings_id: str = "default" resume: bool = False job_id: str | None = None @@ -233,15 +231,11 @@ def __init__( self.trace_manager.tracer_provider.add_span_processor(span_processor) self.logs_exporter: ExecutionLogsExporter = ExecutionLogsExporter() - # Use job_id if available, then eval_set_run_id for stability across suspend/resume, - # otherwise generate UUID logger.debug( f"EVAL RUNTIME INIT: job_id={context.job_id}, " f"eval_set_run_id={context.eval_set_run_id}" ) - self.execution_id = ( - context.job_id 
or context.eval_set_run_id or str(uuid.uuid4()) - ) + self.execution_id = context.execution_id logger.info(f"EVAL RUNTIME: execution_id set to: {self.execution_id}") self.coverage = coverage.Coverage(branch=True) @@ -259,11 +253,8 @@ async def __aexit__(self, *args: Any) -> None: self.coverage.stop() self.coverage.report(include=["./*"], show_missing=True) - async def get_schema(self, runtime: UiPathRuntimeProtocol) -> UiPathRuntimeSchema: - schema = await runtime.get_schema() - if schema is None: - raise ValueError("Schema could not be loaded") - return schema + async def get_schema(self) -> UiPathRuntimeSchema: + return self.context.runtime_schema @contextmanager def _mocker_cache(self) -> Iterator[None]: @@ -283,48 +274,37 @@ def _mocker_cache(self) -> Iterator[None]: async def initiate_evaluation( self, - runtime: UiPathRuntimeProtocol, ) -> Tuple[ EvaluationSet, list[BaseEvaluator[Any, Any, Any]], Iterable[Awaitable[EvaluationRunResult]], ]: - if self.context.eval_set is None: - raise ValueError("eval_set must be provided for evaluation runs") - - # Load eval set (path is already resolved in cli_eval.py) - evaluation_set, _ = EvalHelpers.load_eval_set( - self.context.eval_set, self.context.eval_ids - ) - # Validate that resume mode is not used with multiple evaluations - if self.context.resume and len(evaluation_set.evaluations) > 1: + if self.context.resume and len(self.context.evaluation_set.evaluations) > 1: raise ValueError( f"Resume mode is not supported with multiple evaluations. " - f"Found {len(evaluation_set.evaluations)} evaluations in the set. " + f"Found {len(self.context.evaluation_set.evaluations)} evaluations in the set. " f"Please run with a single evaluation using --eval-ids to specify one evaluation." ) - evaluators = await self._load_evaluators(evaluation_set, runtime) - await self.event_bus.publish( EvaluationEvents.CREATE_EVAL_SET_RUN, EvalSetRunCreatedEvent( execution_id=self.execution_id, entrypoint=self.context.entrypoint or "", eval_set_run_id=self.context.eval_set_run_id, - eval_set_id=evaluation_set.id, - no_of_evals=len(evaluation_set.evaluations), - evaluators=evaluators, + eval_set_id=self.context.evaluation_set.id, + no_of_evals=len(self.context.evaluation_set.evaluations), + evaluators=self.context.evaluators, ), ) return ( - evaluation_set, - evaluators, + self.context.evaluation_set, + self.context.evaluators, ( - self._execute_eval(eval_item, evaluators, runtime) - for eval_item in evaluation_set.evaluations + self._execute_eval(eval_item, self.context.evaluators) + for eval_item in self.context.evaluation_set.evaluations ), ) @@ -336,191 +316,178 @@ async def execute(self) -> UiPathRuntimeResult: logger.info(f"EVAL RUNTIME: Resume mode: {self.context.resume}") logger.info("=" * 80) - # Resolve model settings override from eval set - settings_override = self._resolve_model_settings_override() + with self._mocker_cache(): + tracer = self.trace_manager.tracer_provider.get_tracer(__name__) - runtime = await self.factory.new_runtime( - entrypoint=self.context.entrypoint or "", - runtime_id=self.execution_id, - settings=settings_override, - ) - try: - with self._mocker_cache(): - tracer = self.trace_manager.tracer_provider.get_tracer(__name__) + # During resume, restore the parent "Evaluation Set Run" span context + # This prevents creating duplicate eval set run spans across jobs + eval_set_parent_span = await self._restore_parent_span( + "eval_set_run", "Evaluation Set Run" + ) - # During resume, restore the parent "Evaluation Set Run" span context - # 
This prevents creating duplicate eval set run spans across jobs - eval_set_parent_span = await self._restore_parent_span( - "eval_set_run", "Evaluation Set Run" + # Create "Evaluation Set Run" span or use restored parent context + # NOTE: Do NOT set execution.id on this parent span, as the mixin in + # UiPathExecutionBatchTraceProcessor propagates execution.id from parent + # to child spans, which would overwrite the per-eval execution.id + span_attributes: dict[str, str | bool] = { + "span_type": "eval_set_run", + "uipath.custom_instrumentation": True, + } + if self.context.eval_set_run_id: + span_attributes["eval_set_run_id"] = self.context.eval_set_run_id + + eval_set_span_context_manager = ( + use_span( + eval_set_parent_span, end_on_exit=False + ) # Don't end the remote span + if eval_set_parent_span + else tracer.start_as_current_span( + "Evaluation Set Run", attributes=span_attributes ) + ) - # Create "Evaluation Set Run" span or use restored parent context - # NOTE: Do NOT set execution.id on this parent span, as the mixin in - # UiPathExecutionBatchTraceProcessor propagates execution.id from parent - # to child spans, which would overwrite the per-eval execution.id - span_attributes: dict[str, str | bool] = { - "span_type": "eval_set_run", - "uipath.custom_instrumentation": True, - } - if self.context.eval_set_run_id: - span_attributes["eval_set_run_id"] = self.context.eval_set_run_id - - eval_set_span_context_manager = ( - use_span( - eval_set_parent_span, end_on_exit=False - ) # Don't end the remote span - if eval_set_parent_span - else tracer.start_as_current_span( - "Evaluation Set Run", attributes=span_attributes - ) + with eval_set_span_context_manager as span: + await self._save_span_context_for_resume( + span, "eval_set_run", "Evaluation Set Run" ) - with eval_set_span_context_manager as span: - await self._save_span_context_for_resume( - span, "eval_set_run", "Evaluation Set Run" + try: + ( + evaluation_set, + evaluators, + evaluation_iterable, + ) = await self.initiate_evaluation() + workers = self.context.workers or 1 + assert workers >= 1 + eval_run_result_list = await execute_parallel( + evaluation_iterable, workers + ) + results = UiPathEvalOutput( + evaluation_set_name=evaluation_set.name, + evaluation_set_results=eval_run_result_list, ) - try: - ( - evaluation_set, - evaluators, - evaluation_iterable, - ) = await self.initiate_evaluation(runtime) - workers = self.context.workers or 1 - assert workers >= 1 - eval_run_result_list = await execute_parallel( - evaluation_iterable, workers - ) - results = UiPathEvalOutput( - evaluation_set_name=evaluation_set.name, - evaluation_set_results=eval_run_result_list, - ) + # Computing evaluator averages + evaluator_averages: dict[str, float] = defaultdict(float) + evaluator_count: dict[str, int] = defaultdict(int) + + # Check if any eval runs failed + any_failed = False + for eval_run_result in results.evaluation_set_results: + # Check if the agent execution had an error + if ( + eval_run_result.agent_execution_output + and eval_run_result.agent_execution_output.result.error + ): + any_failed = True + + for result_dto in eval_run_result.evaluation_run_results: + evaluator_averages[result_dto.evaluator_id] += ( + result_dto.result.score + ) + evaluator_count[result_dto.evaluator_id] += 1 - # Computing evaluator averages - evaluator_averages: dict[str, float] = defaultdict(float) - evaluator_count: dict[str, int] = defaultdict(int) - - # Check if any eval runs failed - any_failed = False - for eval_run_result in 
results.evaluation_set_results: - # Check if the agent execution had an error - if ( - eval_run_result.agent_execution_output - and eval_run_result.agent_execution_output.result.error - ): - any_failed = True - - for result_dto in eval_run_result.evaluation_run_results: - evaluator_averages[result_dto.evaluator_id] += ( - result_dto.result.score - ) - evaluator_count[result_dto.evaluator_id] += 1 + for eval_id in evaluator_averages: + evaluator_averages[eval_id] = ( + evaluator_averages[eval_id] / evaluator_count[eval_id] + ) - for eval_id in evaluator_averages: - evaluator_averages[eval_id] = ( - evaluator_averages[eval_id] / evaluator_count[eval_id] - ) + # Configure span with output and metadata + await configure_eval_set_run_span( + span=span, + evaluator_averages=evaluator_averages, + execution_id=self.execution_id, + schema=await self.get_schema(), + success=not any_failed, + ) - # Configure span with output and metadata - await configure_eval_set_run_span( - span=span, - evaluator_averages=evaluator_averages, + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_SET_RUN, + EvalSetRunUpdatedEvent( execution_id=self.execution_id, - runtime=runtime, - get_schema_func=self.get_schema, + evaluator_scores=evaluator_averages, success=not any_failed, - ) + ), + wait_for_completion=False, + ) - await self.event_bus.publish( - EvaluationEvents.UPDATE_EVAL_SET_RUN, - EvalSetRunUpdatedEvent( - execution_id=self.execution_id, - evaluator_scores=evaluator_averages, - success=not any_failed, - ), - wait_for_completion=False, - ) + # Collect triggers from all evaluation runs (pass-through from inner runtime) + logger.info("=" * 80) + logger.info( + "EVAL RUNTIME: Collecting triggers from all evaluation runs" + ) + all_triggers = [] + for eval_run_result in results.evaluation_set_results: + if ( + eval_run_result.agent_execution_output + and eval_run_result.agent_execution_output.result + ): + runtime_result = ( + eval_run_result.agent_execution_output.result + ) + if runtime_result.triggers: + all_triggers.extend(runtime_result.triggers) - # Collect triggers from all evaluation runs (pass-through from inner runtime) - logger.info("=" * 80) + if all_triggers: logger.info( - "EVAL RUNTIME: Collecting triggers from all evaluation runs" + f"EVAL RUNTIME: ✅ Passing through {len(all_triggers)} trigger(s) to top-level result" ) - all_triggers = [] - for eval_run_result in results.evaluation_set_results: - if ( - eval_run_result.agent_execution_output - and eval_run_result.agent_execution_output.result - ): - runtime_result = ( - eval_run_result.agent_execution_output.result - ) - if runtime_result.triggers: - all_triggers.extend(runtime_result.triggers) - - if all_triggers: + for i, trigger in enumerate(all_triggers, 1): logger.info( - f"EVAL RUNTIME: ✅ Passing through {len(all_triggers)} trigger(s) to top-level result" + f"EVAL RUNTIME: Pass-through trigger {i}: {trigger.model_dump(by_alias=True)}" ) - for i, trigger in enumerate(all_triggers, 1): + else: + logger.info("EVAL RUNTIME: No triggers to pass through") + logger.info("=" * 80) + + # Determine overall status - propagate status from inner runtime + # This is critical for serverless executor to know to save state and suspend job + # Priority: SUSPENDED > FAULTED > SUCCESSFUL + overall_status = UiPathRuntimeStatus.SUCCESSFUL + for eval_run_result in results.evaluation_set_results: + if ( + eval_run_result.agent_execution_output + and eval_run_result.agent_execution_output.result + ): + inner_status = ( + 
eval_run_result.agent_execution_output.result.status + ) + if inner_status == UiPathRuntimeStatus.SUSPENDED: + overall_status = UiPathRuntimeStatus.SUSPENDED logger.info( - f"EVAL RUNTIME: Pass-through trigger {i}: {trigger.model_dump(by_alias=True)}" + "EVAL RUNTIME: Propagating SUSPENDED status from inner runtime" ) - else: - logger.info("EVAL RUNTIME: No triggers to pass through") - logger.info("=" * 80) - - # Determine overall status - propagate status from inner runtime - # This is critical for serverless executor to know to save state and suspend job - # Priority: SUSPENDED > FAULTED > SUCCESSFUL - overall_status = UiPathRuntimeStatus.SUCCESSFUL - for eval_run_result in results.evaluation_set_results: - if ( - eval_run_result.agent_execution_output - and eval_run_result.agent_execution_output.result - ): - inner_status = ( - eval_run_result.agent_execution_output.result.status - ) - if inner_status == UiPathRuntimeStatus.SUSPENDED: - overall_status = UiPathRuntimeStatus.SUSPENDED - logger.info( - "EVAL RUNTIME: Propagating SUSPENDED status from inner runtime" - ) - break # SUSPENDED takes highest priority, stop checking - elif inner_status == UiPathRuntimeStatus.FAULTED: - overall_status = UiPathRuntimeStatus.FAULTED - # Continue checking in case a later eval is SUSPENDED - - result = UiPathRuntimeResult( - output={**results.model_dump(by_alias=True)}, - status=overall_status, - triggers=all_triggers if all_triggers else None, - ) - return result - except Exception as e: - # Set span status to ERROR on exception - span.set_status(Status(StatusCode.ERROR, str(e))) + break # SUSPENDED takes highest priority, stop checking + elif inner_status == UiPathRuntimeStatus.FAULTED: + overall_status = UiPathRuntimeStatus.FAULTED + # Continue checking in case a later eval is SUSPENDED + + result = UiPathRuntimeResult( + output={**results.model_dump(by_alias=True)}, + status=overall_status, + triggers=all_triggers if all_triggers else None, + ) + return result + except Exception as e: + # Set span status to ERROR on exception + span.set_status(Status(StatusCode.ERROR, str(e))) - # Publish failure event for eval set run - await self.event_bus.publish( - EvaluationEvents.UPDATE_EVAL_SET_RUN, - EvalSetRunUpdatedEvent( - execution_id=self.execution_id, - evaluator_scores={}, - success=False, - ), - wait_for_completion=False, - ) - raise - finally: - await runtime.dispose() + # Publish failure event for eval set run + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_SET_RUN, + EvalSetRunUpdatedEvent( + execution_id=self.execution_id, + evaluator_scores={}, + success=False, + ), + wait_for_completion=False, + ) + raise async def _execute_eval( self, eval_item: EvaluationItem, evaluators: list[BaseEvaluator[Any, Any, Any]], - runtime: UiPathRuntimeProtocol, ) -> EvaluationRunResult: execution_id = str(eval_item.id) @@ -558,9 +525,7 @@ async def _execute_eval( try: # Generate LLM-based input if input_mocking_strategy is defined if eval_item.input_mocking_strategy: - eval_item = await self._generate_input_for_eval( - eval_item, runtime - ) + eval_item = await self._generate_input_for_eval(eval_item) set_execution_context( MockingContext( @@ -586,7 +551,6 @@ async def _execute_eval( agent_execution_output = await self.execute_runtime( eval_item, execution_id, - runtime, input_overrides=self.context.input_overrides, ) @@ -812,7 +776,8 @@ async def _execute_eval( return evaluation_run_results async def _generate_input_for_eval( - self, eval_item: EvaluationItem, runtime: UiPathRuntimeProtocol + 
self, + eval_item: EvaluationItem, ) -> EvaluationItem: """Use LLM to generate a mock input for an evaluation item.""" expected_output = ( @@ -822,7 +787,7 @@ async def _generate_input_for_eval( ) generated_input = await generate_llm_input( eval_item.input_mocking_strategy, - (await self.get_schema(runtime)).input, + (await self.get_schema()).input, expected_behavior=eval_item.expected_agent_behavior or "", expected_output=expected_output, ) @@ -841,71 +806,10 @@ def _get_and_clear_execution_data( return spans, logs - def _resolve_model_settings_override( - self, - ) -> dict[str, Any] | None: - """Resolve model settings override from evaluation set. - - Returns: - Model settings dict to use for override, or None if using defaults. - Settings are passed to factory via settings kwarg. - """ - # Skip if no model settings ID specified or using default - if ( - not self.context.model_settings_id - or self.context.model_settings_id == "default" - ): - return None - - # Load evaluation set to get model settings - evaluation_set, _ = EvalHelpers.load_eval_set(self.context.eval_set or "") - if ( - not hasattr(evaluation_set, "model_settings") - or not evaluation_set.model_settings - ): - logger.warning("No model settings available in evaluation set") - return None - - # Find the specified model settings - target_model_settings = next( - ( - ms - for ms in evaluation_set.model_settings - if ms.id == self.context.model_settings_id - ), - None, - ) - - if not target_model_settings: - logger.warning( - f"Model settings ID '{self.context.model_settings_id}' not found in evaluation set" - ) - return None - - logger.info( - f"Applying model settings override: model={target_model_settings.model_name}, temperature={target_model_settings.temperature}" - ) - - # Return settings dict with correct keys for factory - override: dict[str, str | float] = {} - if ( - target_model_settings.model_name - and target_model_settings.model_name != "same-as-agent" - ): - override["model"] = target_model_settings.model_name - if ( - target_model_settings.temperature is not None - and target_model_settings.temperature != "same-as-agent" - ): - override["temperature"] = float(target_model_settings.temperature) - - return override if override else None - async def execute_runtime( self, eval_item: EvaluationItem, execution_id: str, - runtime: UiPathRuntimeProtocol, input_overrides: dict[str, Any] | None = None, ) -> UiPathEvalRunExecutionOutput: log_handler = self._setup_execution_logging(execution_id) @@ -1076,119 +980,6 @@ async def run_evaluator( return result - async def _get_agent_model(self, runtime: UiPathRuntimeProtocol) -> str | None: - """Get agent model from the runtime schema metadata. - - The model is read from schema.metadata["settings"]["model"] which is - populated by the low-code agents runtime from agent.json. - - Returns: - The model name from agent settings, or None if not found. 
- """ - try: - schema = await self.get_schema(runtime) - if schema.metadata and "settings" in schema.metadata: - settings = schema.metadata["settings"] - model = settings.get("model") - if model: - logger.debug(f"Got agent model from schema.metadata: {model}") - return model - - # Fallback to protocol-based approach for backwards compatibility - model = self._find_agent_model_in_runtime(runtime) - if model: - logger.debug(f"Got agent model from runtime protocol: {model}") - return model - except Exception: - return None - - def _find_agent_model_in_runtime( - self, runtime: UiPathRuntimeProtocol - ) -> str | None: - """Recursively search for get_agent_model in runtime and its delegates. - - Runtimes may be wrapped (e.g., ResumableRuntime wraps TelemetryWrapper - which wraps the base runtime). This method traverses the wrapper chain - to find a runtime that implements LLMAgentRuntimeProtocol. - - Args: - runtime: The runtime to check (may be a wrapper) - - Returns: - The model name if found, None otherwise. - """ - # Check if this runtime implements the protocol - if isinstance(runtime, LLMAgentRuntimeProtocol): - return runtime.get_agent_model() - - # Check for delegate property (used by UiPathResumableRuntime, TelemetryRuntimeWrapper) - delegate = getattr(runtime, "delegate", None) or getattr( - runtime, "_delegate", None - ) - if delegate is not None: - return self._find_agent_model_in_runtime(delegate) - - return None - - async def _load_evaluators( - self, evaluation_set: EvaluationSet, runtime: UiPathRuntimeProtocol - ) -> list[BaseEvaluator[Any, Any, Any]]: - """Load evaluators referenced by the evaluation set.""" - evaluators = [] - eval_set = self.context.eval_set - if eval_set is None: - raise ValueError("eval_set cannot be None") - evaluators_dir = Path(eval_set).parent.parent / "evaluators" - - # Load agent model for 'same-as-agent' resolution in legacy evaluators - agent_model = await self._get_agent_model(runtime) - - # If evaluatorConfigs is specified, use that (new field with weights) - # Otherwise, fall back to evaluatorRefs (old field without weights) - if ( - hasattr(evaluation_set, "evaluator_configs") - and evaluation_set.evaluator_configs - ): - # Use new evaluatorConfigs field - supports weights - evaluator_ref_ids = {ref.ref for ref in evaluation_set.evaluator_configs} - else: - # Fall back to old evaluatorRefs field - plain strings - evaluator_ref_ids = set(evaluation_set.evaluator_refs) - - found_evaluator_ids = set() - - for file in evaluators_dir.glob("*.json"): - try: - with open(file, "r", encoding="utf-8") as f: - data = json.load(f) - except json.JSONDecodeError as e: - raise ValueError( - f"Invalid JSON in evaluator file '{file}': {str(e)}. " - f"Please check the file for syntax errors." - ) from e - - try: - evaluator_id = data.get("id") - if evaluator_id in evaluator_ref_ids: - evaluator = EvaluatorFactory.create_evaluator( - data, evaluators_dir, agent_model=agent_model - ) - evaluators.append(evaluator) - found_evaluator_ids.add(evaluator_id) - except Exception as e: - raise ValueError( - f"Failed to create evaluator from file '{file}': {str(e)}. " - f"Please verify the evaluator configuration." 
- ) from e - - missing_evaluators = evaluator_ref_ids - found_evaluator_ids - if missing_evaluators: - raise ValueError( - f"Could not find the following evaluators: {missing_evaluators}" - ) - - return evaluators - async def _restore_parent_span( self, span_key: str, span_type: str ) -> NonRecordingSpan | None: diff --git a/src/uipath/_cli/_evals/_span_utils.py b/src/uipath/_cli/_evals/_span_utils.py index 7c99729b7..270c4c904 100644 --- a/src/uipath/_cli/_evals/_span_utils.py +++ b/src/uipath/_cli/_evals/_span_utils.py @@ -8,7 +8,7 @@ # Type hint for runtime protocol (avoids circular imports) try: - from uipath.runtime import UiPathRuntimeProtocol + from uipath.runtime import UiPathRuntimeProtocol, UiPathRuntimeSchema except ImportError: UiPathRuntimeProtocol = Any # type: ignore @@ -192,8 +192,7 @@ async def configure_eval_set_run_span( span: Span, evaluator_averages: Dict[str, float], execution_id: str, - runtime: Any, - get_schema_func: Any, + schema: UiPathRuntimeSchema, success: bool = True, ) -> None: """Configure Evaluation Set Run span with output and metadata. @@ -216,7 +215,6 @@ async def configure_eval_set_run_span( # Get runtime schemas try: - schema = await get_schema_func(runtime) input_schema = schema.input output_schema = schema.output except Exception: diff --git a/src/uipath/_cli/_utils/_eval_set.py b/src/uipath/_cli/_utils/_eval_set.py index b4ac727b1..ccad1e89a 100644 --- a/src/uipath/_cli/_utils/_eval_set.py +++ b/src/uipath/_cli/_utils/_eval_set.py @@ -5,6 +5,7 @@ import click from pydantic import ValidationError +from uipath._cli._evals._evaluator_factory import EvaluatorFactory from uipath._cli._evals._models._evaluation_set import ( EvaluationItem, EvaluationSet, @@ -13,6 +14,7 @@ ) from uipath._cli._evals.mocks.types import InputMockingStrategy, LLMMockingStrategy from uipath._cli._utils._console import ConsoleLogger +from uipath.eval.evaluators import BaseEvaluator console = ConsoleLogger() @@ -175,3 +177,61 @@ def migrate_evaluation_item( if eval_ids: eval_set.extract_selected_evals(eval_ids) return eval_set, resolved_path + + @staticmethod + async def load_evaluators( + eval_set_path: str, + evaluation_set: EvaluationSet, + agent_model: str | None = None, + ) -> list[BaseEvaluator[Any, Any, Any]]: + """Load evaluators referenced by the evaluation set.""" + evaluators = [] + if evaluation_set is None: + raise ValueError("eval_set cannot be None") + evaluators_dir = Path(eval_set_path).parent.parent / "evaluators" + + # If evaluatorConfigs is specified, use that (new field with weights) + # Otherwise, fall back to evaluatorRefs (old field without weights) + if ( + hasattr(evaluation_set, "evaluator_configs") + and evaluation_set.evaluator_configs + ): + # Use new evaluatorConfigs field - supports weights + evaluator_ref_ids = {ref.ref for ref in evaluation_set.evaluator_configs} + else: + # Fall back to old evaluatorRefs field - plain strings + evaluator_ref_ids = set(evaluation_set.evaluator_refs) + + found_evaluator_ids = set() + + for file in evaluators_dir.glob("*.json"): + try: + with open(file, "r", encoding="utf-8") as f: + data = json.load(f) + except json.JSONDecodeError as e: + raise ValueError( + f"Invalid JSON in evaluator file '{file}': {str(e)}. " + f"Please check the file for syntax errors." 
+ ) from e + + try: + evaluator_id = data.get("id") + if evaluator_id in evaluator_ref_ids: + evaluator = EvaluatorFactory.create_evaluator( + data, evaluators_dir, agent_model=agent_model + ) + evaluators.append(evaluator) + found_evaluator_ids.add(evaluator_id) + except Exception as e: + raise ValueError( + f"Failed to create evaluator from file '{file}': {str(e)}. " + f"Please verify the evaluator configuration." + ) from e + + missing_evaluators = evaluator_ref_ids - found_evaluator_ids + if missing_evaluators: + raise ValueError( + f"Could not find the following evaluators: {missing_evaluators}" + ) + + return evaluators diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index 295bd67fe..4cab05d5e 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -2,16 +2,24 @@ import asyncio import logging import os +import uuid from typing import Any import click from uipath.core.tracing import UiPathTraceManager -from uipath.runtime import UiPathRuntimeContext, UiPathRuntimeFactoryRegistry +from uipath.runtime import ( + UiPathRuntimeContext, + UiPathRuntimeFactoryRegistry, + UiPathRuntimeProtocol, + UiPathRuntimeSchema, +) from uipath._cli._evals._console_progress_reporter import ConsoleProgressReporter from uipath._cli._evals._evaluate import evaluate +from uipath._cli._evals._models._evaluation_set import EvaluationSet from uipath._cli._evals._progress_reporter import StudioWebProgressReporter from uipath._cli._evals._runtime import ( + LLMAgentRuntimeProtocol, UiPathEvalContext, ) from uipath._cli._evals._telemetry import EvalTelemetrySubscriber @@ -62,6 +70,109 @@ def setup_reporting_prereq(no_report: bool) -> bool: return True +def _find_agent_model_in_runtime(runtime: UiPathRuntimeProtocol) -> str | None: + """Recursively search for get_agent_model in runtime and its delegates. + + Runtimes may be wrapped (e.g., ResumableRuntime wraps TelemetryWrapper + which wraps the base runtime). This method traverses the wrapper chain + to find a runtime that implements LLMAgentRuntimeProtocol. + + Args: + runtime: The runtime to check (may be a wrapper) + + Returns: + The model name if found, None otherwise. + """ + # Check if this runtime implements the protocol + if isinstance(runtime, LLMAgentRuntimeProtocol): + return runtime.get_agent_model() + + # Check for delegate property (used by UiPathResumableRuntime, TelemetryRuntimeWrapper) + delegate = getattr(runtime, "delegate", None) or getattr(runtime, "_delegate", None) + if delegate is not None: + return _find_agent_model_in_runtime(delegate) + + return None + + +async def _get_agent_model( + runtime: UiPathRuntimeProtocol, schema: UiPathRuntimeSchema +) -> str | None: + """Get agent model from the runtime schema metadata. + + The model is read from schema.metadata["settings"]["model"] which is + populated by the low-code agents runtime from agent.json. + + Returns: + The model name from agent settings, or None if not found. 
+ """ + try: + if schema.metadata and "settings" in schema.metadata: + settings = schema.metadata["settings"] + model = settings.get("model") + if model: + logger.debug(f"Got agent model from schema.metadata: {model}") + return model + + # Fallback to protocol-based approach for backwards compatibility + model = _find_agent_model_in_runtime(runtime) + if model: + logger.debug(f"Got agent model from runtime protocol: {model}") + return model + except Exception: + return None + + +def _resolve_model_settings_override( + model_settings_id: str, evaluation_set: EvaluationSet +) -> dict[str, Any] | None: + """Resolve model settings override from evaluation set. + + Returns: + Model settings dict to use for override, or None if using defaults. + Settings are passed to factory via settings kwarg. + """ + # Skip if no model settings ID specified or using default + if not model_settings_id or model_settings_id == "default": + return None + + # Load evaluation set to get model settings + if not evaluation_set.model_settings: + logger.warning("No model settings available in evaluation set") + return None + + # Find the specified model settings + target_model_settings = next( + (ms for ms in evaluation_set.model_settings if ms.id == model_settings_id), + None, + ) + + if not target_model_settings: + logger.warning( + f"Model settings ID '{model_settings_id}' not found in evaluation set" + ) + return None + + logger.info( + f"Applying model settings override: model={target_model_settings.model_name}, temperature={target_model_settings.temperature}" + ) + + # Return settings dict with correct keys for factory + override: dict[str, str | float] = {} + if ( + target_model_settings.model_name + and target_model_settings.model_name != "same-as-agent" + ): + override["model"] = target_model_settings.model_name + if ( + target_model_settings.temperature is not None + and target_model_settings.temperature != "same-as-agent" + ): + override["temperature"] = float(target_model_settings.temperature) + + return override if override else None + + @click.command() @click.argument("entrypoint", required=False) @click.argument("eval_set", required=False) @@ -188,7 +299,6 @@ def eval( eval_context = UiPathEvalContext() eval_context.entrypoint = entrypoint or auto_discover_entrypoint() - eval_context.no_report = no_report eval_context.workers = workers eval_context.eval_set_run_id = eval_set_run_id eval_context.enable_mocker_cache = enable_mocker_cache @@ -197,10 +307,7 @@ def eval( eval_set_path = eval_set or EvalHelpers.auto_discover_eval_set() _, resolved_eval_set_path = EvalHelpers.load_eval_set(eval_set_path, eval_ids) - eval_context.eval_set = resolved_eval_set_path - eval_context.eval_ids = eval_ids eval_context.report_coverage = report_coverage - eval_context.model_settings_id = model_settings_id eval_context.input_overrides = input_overrides eval_context.resume = resume @@ -269,6 +376,39 @@ async def execute_eval(): project_id = UiPathConfig.project_id + eval_context.execution_id = ( + eval_context.job_id + or eval_context.eval_set_run_id + or str(uuid.uuid4()) + ) + + # Load eval set (path is already resolved in cli_eval.py) + eval_context.evaluation_set, _ = EvalHelpers.load_eval_set( + resolved_eval_set_path, eval_ids + ) + + # Resolve model settings override from eval set + settings_override = _resolve_model_settings_override( + model_settings_id, eval_context.evaluation_set + ) + + runtime = await runtime_factory.new_runtime( + entrypoint=eval_context.entrypoint or "", + 
runtime_id=eval_context.execution_id, + settings=settings_override, + ) + + eval_context.runtime_schema = await runtime.get_schema() + + eval_context.evaluators = await EvalHelpers.load_evaluators( + resolved_eval_set_path, + eval_context.evaluation_set, + await _get_agent_model(runtime, eval_context.runtime_schema), + ) + + # Runtime is not required anymore. + await runtime.dispose() + try: if project_id: studio_client = StudioClient(project_id) diff --git a/tests/cli/eval/test_eval_resume_flow.py b/tests/cli/eval/test_eval_resume_flow.py index 7547b195c..2096f481a 100644 --- a/tests/cli/eval/test_eval_resume_flow.py +++ b/tests/cli/eval/test_eval_resume_flow.py @@ -1,5 +1,6 @@ """Unit tests for eval resume flow to ensure UiPathExecuteOptions is passed correctly.""" +import uuid from pathlib import Path from unittest.mock import AsyncMock, patch @@ -14,6 +15,7 @@ ) from uipath._cli._evals._runtime import UiPathEvalContext, UiPathEvalRuntime +from uipath._cli._utils._eval_set import EvalHelpers from uipath._events._event_bus import EventBus # ============================================================================ @@ -27,25 +29,42 @@ @pytest.mark.asyncio async def test_execute_runtime_method_passes_options_with_resume_false(): - """Direct test of execute_runtime method to verify UiPathExecuteOptions(resume=False) is passed.""" + """Test that execute_runtime respects resume=False setting.""" # Arrange from uipath._cli._evals._models._evaluation_set import EvaluationItem event_bus = EventBus() trace_manager = UiPathTraceManager() - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - context.resume = False # Test resume=False - # Create a mock runtime that will be wrapped + # Load evaluation set + eval_set_path = str(Path(__file__).parent / "evals" / "eval-sets" / "default.json") + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + + # Create a mock runtime to get schema mock_runtime = AsyncMock(spec=UiPathRuntimeProtocol) mock_runtime.execute = AsyncMock( return_value=UiPathRuntimeResult( output={"result": "success"}, status=UiPathRuntimeStatus.SUCCESSFUL ) ) + mock_runtime.get_schema = AsyncMock() + + runtime_schema = await mock_runtime.get_schema() + runtime_schema.input = {"type": "object", "properties": {}} + runtime_schema.output = {"type": "object", "properties": {}} + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + context.resume = False # Test resume=False # Create a mock factory mock_factory = AsyncMock(spec=UiPathRuntimeFactoryProtocol) @@ -74,7 +93,7 @@ async def test_execute_runtime_method_passes_options_with_resume_false(): mock_execution_runtime_class.return_value = mock_execution_runtime_instance await eval_runtime.execute_runtime( - eval_item=eval_item, execution_id="test-exec-id", runtime=mock_runtime + eval_item=eval_item, execution_id="test-exec-id" ) # Assert - Verify that execute was called with UiPathExecuteOptions(resume=False) @@ -96,17 +115,16 @@ async def test_execute_runtime_method_passes_options_with_resume_false(): @pytest.mark.asyncio async def test_execute_runtime_method_passes_options_with_resume_true(): - """Direct test of execute_runtime method to verify 
UiPathExecuteOptions(resume=True) is passed.""" + """Test that execute_runtime respects resume=True setting.""" # Arrange from uipath._cli._evals._models._evaluation_set import EvaluationItem event_bus = EventBus() trace_manager = UiPathTraceManager() - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - context.resume = True # Test resume=True + + # Load evaluation set + eval_set_path = str(Path(__file__).parent / "evals" / "eval-sets" / "default.json") + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) # Create a mock runtime mock_runtime = AsyncMock(spec=UiPathRuntimeProtocol) @@ -115,6 +133,24 @@ async def test_execute_runtime_method_passes_options_with_resume_true(): output={"result": "success"}, status=UiPathRuntimeStatus.SUCCESSFUL ) ) + mock_runtime.get_schema = AsyncMock() + + runtime_schema = await mock_runtime.get_schema() + runtime_schema.input = {"type": "object", "properties": {}} + runtime_schema.output = {"type": "object", "properties": {}} + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + context.resume = True # Test resume=True # Create a mock factory mock_factory = AsyncMock(spec=UiPathRuntimeFactoryProtocol) @@ -143,7 +179,7 @@ async def test_execute_runtime_method_passes_options_with_resume_true(): mock_execution_runtime_class.return_value = mock_execution_runtime_instance await eval_runtime.execute_runtime( - eval_item=eval_item, execution_id="test-exec-id", runtime=mock_runtime + eval_item=eval_item, execution_id="test-exec-id" ) # Assert - Verify that execute was called with UiPathExecuteOptions(resume=True) @@ -167,15 +203,35 @@ async def test_resume_with_multiple_evaluations_raises_error(): # Arrange event_bus = EventBus() trace_manager = UiPathTraceManager() - context = UiPathEvalContext() - context.eval_set = str( + + # Load evaluation set with multiple evals + eval_set_path = str( Path(__file__).parent / "evals" / "eval-sets" / "multiple-evals.json" ) + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + + # Create a mock runtime + mock_runtime = AsyncMock(spec=UiPathRuntimeProtocol) + mock_runtime.get_schema = AsyncMock() + runtime_schema = await mock_runtime.get_schema() + runtime_schema.input = {"type": "object", "properties": {}} + runtime_schema.output = {"type": "object", "properties": {}} + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators context.resume = True # Enable resume mode # Create a mock factory mock_factory = AsyncMock(spec=UiPathRuntimeFactoryProtocol) - mock_runtime = AsyncMock(spec=UiPathRuntimeProtocol) mock_factory.new_runtime = AsyncMock(return_value=mock_runtime) eval_runtime = UiPathEvalRuntime( @@ -190,4 +246,4 @@ async def test_resume_with_multiple_evaluations_raises_error(): ValueError, match=r"Resume mode is not supported with multiple evaluations.*Found 2 evaluations", ): - await eval_runtime.initiate_evaluation(mock_runtime) + await 
eval_runtime.initiate_evaluation() diff --git a/tests/cli/eval/test_eval_runtime_metadata.py b/tests/cli/eval/test_eval_runtime_metadata.py index d85d36680..ab62cbf34 100644 --- a/tests/cli/eval/test_eval_runtime_metadata.py +++ b/tests/cli/eval/test_eval_runtime_metadata.py @@ -8,7 +8,7 @@ - LLMAgentRuntimeProtocol - protocol implementation detection """ -from pathlib import Path +import uuid from typing import Any, AsyncGenerator import pytest @@ -30,6 +30,10 @@ UiPathEvalContext, UiPathEvalRuntime, ) +from uipath._cli.cli_eval import ( + _find_agent_model_in_runtime, + _get_agent_model, +) from uipath._events._event_bus import EventBus @@ -155,184 +159,113 @@ def test_protocol_rejects_wrapper_without_method(self): class TestFindAgentModelInRuntime: """Tests for _find_agent_model_in_runtime recursive search.""" - @pytest.fixture - def eval_runtime(self): - """Create an eval runtime for testing.""" - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - - async def create_runtime(): - return BaseTestRuntime() - - factory = MockFactory(create_runtime) - return UiPathEvalRuntime( - context, - factory, - trace_manager, - event_bus, - ) - - def test_finds_model_in_direct_runtime(self, eval_runtime): + def test_finds_model_in_direct_runtime(self): """Test finding agent model directly on runtime.""" runtime = AgentModelRuntime("gpt-4o") - result = eval_runtime._find_agent_model_in_runtime(runtime) + result = _find_agent_model_in_runtime(runtime) assert result == "gpt-4o" - def test_finds_model_in_wrapped_runtime(self, eval_runtime): + def test_finds_model_in_wrapped_runtime(self): """Test finding agent model through wrapper's delegate.""" inner = AgentModelRuntime("claude-3") wrapper = WrapperRuntime(inner) - result = eval_runtime._find_agent_model_in_runtime(wrapper) + result = _find_agent_model_in_runtime(wrapper) assert result == "claude-3" - def test_finds_model_in_deeply_wrapped_runtime(self, eval_runtime): + def test_finds_model_in_deeply_wrapped_runtime(self): """Test finding agent model through multiple wrapper layers.""" inner = AgentModelRuntime("gpt-4-turbo") wrapper1 = WrapperRuntime(inner) wrapper2 = WrapperRuntime(wrapper1) - result = eval_runtime._find_agent_model_in_runtime(wrapper2) + result = _find_agent_model_in_runtime(wrapper2) assert result == "gpt-4-turbo" - def test_finds_model_via_private_delegate(self, eval_runtime): + def test_finds_model_via_private_delegate(self): """Test finding agent model through _delegate attribute.""" inner = AgentModelRuntime("gemini-pro") wrapper = PrivateDelegateRuntime(inner) - result = eval_runtime._find_agent_model_in_runtime(wrapper) + result = _find_agent_model_in_runtime(wrapper) assert result == "gemini-pro" - def test_returns_none_when_no_model(self, eval_runtime): + def test_returns_none_when_no_model(self): """Test returns None when no runtime implements the protocol.""" runtime = BaseTestRuntime() - result = eval_runtime._find_agent_model_in_runtime(runtime) + result = _find_agent_model_in_runtime(runtime) assert result is None - def test_returns_none_for_none_model(self, eval_runtime): + def test_returns_none_for_none_model(self): """Test returns None when runtime returns None for model.""" runtime = AgentModelRuntime(None) - result = eval_runtime._find_agent_model_in_runtime(runtime) + result = _find_agent_model_in_runtime(runtime) assert result is None class TestGetAgentModel: - """Tests for 
_get_agent_model method.""" + """Tests for _get_agent_model function.""" - @pytest.fixture - def context(self): - """Create eval context.""" - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - return context - - async def test_returns_agent_model(self, context): - """Test that _get_agent_model returns the correct model.""" + @pytest.mark.asyncio + async def test_returns_agent_model(self): + """Test that _get_agent_model returns the correct model from schema.""" + runtime = AgentModelRuntime("gpt-4o-2024-11-20") + schema = MockRuntimeSchema() + schema.metadata = {"settings": {"model": "gpt-4o-2024-11-20"}} - async def create_runtime(): - return AgentModelRuntime("gpt-4o-2024-11-20") - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime( - context, - factory, - trace_manager, - event_bus, - ) - - runtime = await create_runtime() - model = await eval_runtime._get_agent_model(runtime) + model = await _get_agent_model(runtime, schema) assert model == "gpt-4o-2024-11-20" - async def test_returns_none_when_no_model(self, context): + @pytest.mark.asyncio + async def test_returns_none_when_no_model(self): """Test that _get_agent_model returns None when runtime has no model.""" + runtime = BaseTestRuntime() + schema = MockRuntimeSchema() - async def create_runtime(): - return BaseTestRuntime() - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime( - context, - factory, - trace_manager, - event_bus, - ) - - runtime = await create_runtime() - model = await eval_runtime._get_agent_model(runtime) + model = await _get_agent_model(runtime, schema) assert model is None - async def test_returns_model_consistently(self, context): + @pytest.mark.asyncio + async def test_returns_model_consistently(self): """Test that _get_agent_model returns consistent results.""" - - async def create_runtime(): - return AgentModelRuntime("consistent-model") - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime( - context, - factory, - trace_manager, - event_bus, - ) - - runtime = await create_runtime() + runtime = AgentModelRuntime("consistent-model") + schema = MockRuntimeSchema() + schema.metadata = {"settings": {"model": "consistent-model"}} # Multiple calls should return the same value - model1 = await eval_runtime._get_agent_model(runtime) - model2 = await eval_runtime._get_agent_model(runtime) + model1 = await _get_agent_model(runtime, schema) + model2 = await _get_agent_model(runtime, schema) assert model1 == model2 == "consistent-model" - async def test_handles_exception_gracefully(self, context): - """Test that _get_agent_model returns None on exception.""" + @pytest.mark.asyncio + async def test_handles_exception_gracefully(self, monkeypatch): + """Test that _get_agent_model returns None when _find_agent_model_in_runtime raises exception.""" + runtime = BaseTestRuntime() + schema = MockRuntimeSchema() - async def create_good_runtime(): - return AgentModelRuntime("model") + # Mock _find_agent_model_in_runtime to raise an exception + def mock_find_agent_model_error(r): + raise RuntimeError("Unexpected error during model lookup") - factory = MockFactory(create_good_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime( - context, - 
factory, - trace_manager, - event_bus, + monkeypatch.setattr( + "uipath._cli.cli_eval._find_agent_model_in_runtime", + mock_find_agent_model_error, ) - # Create a bad runtime that raises during get_agent_model - class BadRuntime(BaseTestRuntime): - def get_agent_model(self): - raise RuntimeError("Get model error") - - bad_runtime = BadRuntime() - model = await eval_runtime._get_agent_model(bad_runtime) + model = await _get_agent_model(runtime, schema) assert model is None class TestGetSchema: """Tests for get_schema method.""" - @pytest.fixture - def context(self): - """Create eval context.""" + @pytest.mark.asyncio + async def test_returns_schema(self): + """Test that get_schema returns the schema from context.""" + schema = MockRuntimeSchema() context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - return context - - async def test_returns_schema(self, context): - """Test that get_schema returns the schema.""" + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = None # type: ignore + context.runtime_schema = schema + context.evaluators = [] async def create_runtime(): return BaseTestRuntime() @@ -347,13 +280,19 @@ async def create_runtime(): event_bus, ) - runtime = await create_runtime() - schema = await eval_runtime.get_schema(runtime) - assert schema is not None - assert schema.file_path == "test.py" + retrieved_schema = await eval_runtime.get_schema() + assert retrieved_schema is not None + assert retrieved_schema.file_path == "test.py" - async def test_returns_schema_consistently(self, context): - """Test that get_schema returns consistent results.""" + @pytest.mark.asyncio + async def test_returns_schema_consistently(self): + """Test that get_schema returns the same schema from context.""" + schema = MockRuntimeSchema() + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = None # type: ignore + context.runtime_schema = schema + context.evaluators = [] async def create_runtime(): return BaseTestRuntime() @@ -368,56 +307,19 @@ async def create_runtime(): event_bus, ) - runtime = await create_runtime() - - # Multiple calls should return equivalent values - schema1 = await eval_runtime.get_schema(runtime) - schema2 = await eval_runtime.get_schema(runtime) + # Multiple calls should return the same schema from context + schema1 = await eval_runtime.get_schema() + schema2 = await eval_runtime.get_schema() - # Should have the same properties + # Should be the same object + assert schema1 is schema2 assert schema1.file_path == schema2.file_path == "test.py" - async def test_schema_and_model_work_with_same_runtime(self, context): - """Test that get_schema and _get_agent_model work with the same runtime.""" - - async def create_runtime(): - return AgentModelRuntime("shared-model") - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime( - context, - factory, - trace_manager, - event_bus, - ) - - runtime = await create_runtime() - - # Call both methods with the same runtime - schema = await eval_runtime.get_schema(runtime) - model = await eval_runtime._get_agent_model(runtime) - - # Both should work correctly - assert schema is not None - assert schema.file_path == "test.py" - assert model == "shared-model" - class TestWrappedRuntimeModelResolution: """Tests for model resolution through realistic wrapper chains.""" - @pytest.fixture - def context(self): - """Create eval 
context.""" - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - return context - - async def test_resolves_model_through_resumable_telemetry_chain(self, context): + def test_resolves_model_through_resumable_telemetry_chain(self): """Test model resolution through ResumableRuntime -> TelemetryWrapper -> BaseRuntime chain. This mimics the real wrapper chain: @@ -432,18 +334,5 @@ async def test_resolves_model_through_resumable_telemetry_chain(self, context): # Simulate UiPathResumableRuntime resumable_runtime = WrapperRuntime(telemetry_wrapper) - async def create_runtime(): - return resumable_runtime - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime( - context, - factory, - trace_manager, - event_bus, - ) - - model = await eval_runtime._get_agent_model(resumable_runtime) + model = _find_agent_model_in_runtime(resumable_runtime) assert model == "gpt-4o-from-agent-json" diff --git a/tests/cli/eval/test_eval_runtime_suspend_resume.py b/tests/cli/eval/test_eval_runtime_suspend_resume.py index ec8e07167..db4cd4d71 100644 --- a/tests/cli/eval/test_eval_runtime_suspend_resume.py +++ b/tests/cli/eval/test_eval_runtime_suspend_resume.py @@ -7,6 +7,7 @@ - Ensures no duplicate eval run entries in StudioWeb """ +import uuid from pathlib import Path from typing import Any, AsyncGenerator from unittest.mock import AsyncMock @@ -27,6 +28,7 @@ from uipath._cli._evals._evaluate import evaluate from uipath._cli._evals._runtime import UiPathEvalContext +from uipath._cli._utils._eval_set import EvalHelpers from uipath._events._event_bus import EventBus from uipath._events._events import EvaluationEvents @@ -131,12 +133,29 @@ async def dispose(self) -> None: @pytest.fixture -def context(): +async def context(): """Create eval context.""" - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" + eval_set_path = str(Path(__file__).parent / "evals" / "eval-sets" / "default.json") + + # Load evaluation set + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + + # Create a mock runtime to get schema + runtime = SuccessfulRuntime() + runtime_schema = await runtime.get_schema() + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + return context diff --git a/tests/cli/eval/test_eval_span_utils.py b/tests/cli/eval/test_eval_span_utils.py index 7557861aa..5e523276c 100644 --- a/tests/cli/eval/test_eval_span_utils.py +++ b/tests/cli/eval/test_eval_span_utils.py @@ -286,8 +286,7 @@ async def test_configure_eval_set_run_span(self): "eval2": 90.0, } - # Mock runtime and get_schema_func - mock_runtime = MagicMock() + # Mock schema mock_schema = MagicMock() mock_schema.input = { "type": "object", @@ -295,15 +294,11 @@ async def test_configure_eval_set_run_span(self): } mock_schema.output = {"type": "string"} - async def mock_get_schema(runtime): - return mock_schema - await configure_eval_set_run_span( span=span, # type: ignore[arg-type] evaluator_averages=evaluator_averages, execution_id="exec-complete", - runtime=mock_runtime, - get_schema_func=mock_get_schema, + schema=mock_schema, success=True, ) @@ 
-331,16 +326,16 @@ async def test_configure_eval_set_run_span_schema_error(self): evaluator_averages = {"eval1": 75.0} - # Mock get_schema_func that raises exception - async def mock_get_schema_error(runtime): - raise Exception("Schema not found") + # Mock schema with missing fields + mock_schema = MagicMock() + mock_schema.input = None + mock_schema.output = None await configure_eval_set_run_span( span=span, # type: ignore[arg-type] evaluator_averages=evaluator_averages, execution_id="exec-no-schema", - runtime=MagicMock(), - get_schema_func=mock_get_schema_error, + schema=mock_schema, success=True, ) diff --git a/tests/cli/eval/test_eval_tracing_integration.py b/tests/cli/eval/test_eval_tracing_integration.py index 3a9cf3d33..0b2f42fa0 100644 --- a/tests/cli/eval/test_eval_tracing_integration.py +++ b/tests/cli/eval/test_eval_tracing_integration.py @@ -4,12 +4,15 @@ with the expected attributes by mocking the tracer. """ +import uuid from contextlib import contextmanager from typing import Any from unittest.mock import AsyncMock, MagicMock, patch import pytest +from uipath.runtime.schema import UiPathRuntimeSchema +from uipath._cli._evals._models._evaluation_set import EvaluationSet from uipath._cli._evals._runtime import UiPathEvalContext, UiPathEvalRuntime from uipath.eval.evaluators import BaseEvaluator from uipath.eval.models import NumericEvaluationResult @@ -71,10 +74,39 @@ def get_span_by_name(self, name: str) -> dict[str, Any] | None: def create_eval_context(**kwargs: Any) -> UiPathEvalContext: - """Helper to create UiPathEvalContext with specific attribute values.""" + """Helper to create UiPathEvalContext with specific attribute values. + + Sets sensible defaults for required fields if not provided. + """ context = UiPathEvalContext() + + # Set required fields with defaults + if "execution_id" not in kwargs: + context.execution_id = str(uuid.uuid4()) + + if "runtime_schema" not in kwargs: + context.runtime_schema = UiPathRuntimeSchema( + filePath="test.py", + uniqueId="test", + type="workflow", + input={"type": "object", "properties": {}}, + output={"type": "object", "properties": {}}, + ) + + if "evaluation_set" not in kwargs: + context.evaluation_set = EvaluationSet( + id="test-eval-set", + name="Test Evaluation Set", + evaluations=[], + ) + + if "evaluators" not in kwargs: + context.evaluators = [] + + # Override with provided kwargs for key, value in kwargs.items(): setattr(context, key, value) + return context @@ -258,14 +290,12 @@ async def test_execute_eval_creates_evaluation_span( mock_execution_output.spans = [] mock_execution_output.logs = [] - mock_runtime = AsyncMock() - with patch.object( runtime, "execute_runtime", new=AsyncMock(return_value=mock_execution_output), ): - await runtime._execute_eval(mock_eval_item, [], mock_runtime) + await runtime._execute_eval(mock_eval_item, []) # Verify Evaluation span was created evaluation_spans = capturing_tracer.get_spans_by_type("evaluation") @@ -466,7 +496,6 @@ async def test_evaluation_span_has_unique_execution_id( event_bus=mock_event_bus, ) - mock_runtime = AsyncMock() mock_execution_output = MagicMock() mock_execution_output.result.output = {} mock_execution_output.result.status = "successful" @@ -489,7 +518,7 @@ async def test_evaluation_span_has_unique_execution_id( "execute_runtime", new=AsyncMock(return_value=mock_execution_output), ): - await runtime._execute_eval(eval_item, [], mock_runtime) + await runtime._execute_eval(eval_item, []) # Get execution IDs from spans evaluation_spans = 
capturing_tracer.get_spans_by_type("evaluation") @@ -747,7 +776,7 @@ async def test_evaluation_set_run_span_has_output_attribute( ) # Execute evaluation - await runtime._execute_eval(eval_item, [evaluator], mock_runtime) + await runtime._execute_eval(eval_item, [evaluator]) # Check that Evaluation span has output attribute eval_spans = self.capturing_tracer.get_spans_by_type("evaluation") @@ -825,7 +854,7 @@ async def test_evaluation_span_has_metadata_attributes( evaluation_criterias={"test-evaluator": {}}, ) - await runtime._execute_eval(eval_item, [evaluator], mock_runtime) + await runtime._execute_eval(eval_item, [evaluator]) # Check metadata attributes on Evaluation span eval_spans = self.capturing_tracer.get_spans_by_type("evaluation") diff --git a/tests/cli/eval/test_evaluate.py b/tests/cli/eval/test_evaluate.py index 68700a4ee..daf67b4de 100644 --- a/tests/cli/eval/test_evaluate.py +++ b/tests/cli/eval/test_evaluate.py @@ -1,3 +1,4 @@ +import uuid from pathlib import Path from typing import Any, AsyncGenerator @@ -18,17 +19,15 @@ from uipath._cli._evals._evaluate import evaluate from uipath._cli._evals._models._output import UiPathEvalOutput from uipath._cli._evals._runtime import UiPathEvalContext, UiPathEvalRuntime +from uipath._cli._utils._eval_set import EvalHelpers from uipath._events._event_bus import EventBus async def test_evaluate(): event_bus = EventBus() trace_manager = UiPathTraceManager() - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) + # Create a mock runtime and factory async def identity(input: dict[str, Any]) -> dict[str, Any]: return input @@ -93,6 +92,26 @@ async def dispose(self) -> None: factory = TestFactory(identity) + # Load evaluation set + eval_set_path = str(Path(__file__).parent / "evals" / "eval-sets" / "default.json") + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + + # Create runtime and get schema + runtime = await factory.new_runtime("test", "test-runtime-id") + runtime_schema = await runtime.get_schema() + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + # Act result = await evaluate( factory, @@ -123,12 +142,8 @@ async def dispose(self) -> None: async def test_eval_runtime_generates_uuid_when_no_custom_id(): - """Test that UiPathEvalRuntime generates UUID when no custom eval_set_run_id provided.""" + """Test that UiPathEvalRuntime uses execution_id from context.""" # Arrange - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) event_bus = EventBus() trace_manager = UiPathTraceManager() @@ -197,8 +212,28 @@ async def dispose(self) -> None: factory = TestFactory(identity) + # Load evaluation set + eval_set_path = str(Path(__file__).parent / "evals" / "eval-sets" / "default.json") + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + + # Create runtime and get schema + runtime = await factory.new_runtime("test", "test-runtime-id") + runtime_schema = await runtime.get_schema() + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = 
str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + # Act - runtime = UiPathEvalRuntime( + eval_runtime = UiPathEvalRuntime( context, factory, trace_manager, @@ -207,17 +242,13 @@ async def dispose(self) -> None: # Assert # Should be a valid UUID format (36 characters with dashes) - assert len(runtime.execution_id) == 36 - assert runtime.execution_id.count("-") == 4 + assert len(eval_runtime.execution_id) == 36 + assert eval_runtime.execution_id.count("-") == 4 async def test_eval_runtime_works_without_exporters(): """Test that UiPathEvalRuntime works when both exporters are None (local execution).""" # Arrange - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) event_bus = EventBus() trace_manager = UiPathTraceManager() @@ -286,8 +317,28 @@ async def dispose(self) -> None: factory = TestFactory(identity) + # Load evaluation set + eval_set_path = str(Path(__file__).parent / "evals" / "eval-sets" / "default.json") + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + + # Create runtime and get schema + runtime = await factory.new_runtime("test", "test-runtime-id") + runtime_schema = await runtime.get_schema() + + # Load evaluators + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + # Set up context + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + # Act - runtime = UiPathEvalRuntime( + eval_runtime = UiPathEvalRuntime( context, factory, trace_manager, @@ -295,9 +346,9 @@ async def dispose(self) -> None: ) # Assert - Runtime should work - assert runtime is not None - assert len(runtime.execution_id) == 36 - assert runtime.execution_id.count("-") == 4 + assert eval_runtime is not None + assert len(eval_runtime.execution_id) == 36 + assert eval_runtime.execution_id.count("-") == 4 # Verify that evaluate() also works result = await evaluate( diff --git a/uv.lock b/uv.lock index fc6d8b446..993b17e28 100644 --- a/uv.lock +++ b/uv.lock @@ -2491,7 +2491,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.6.26" +version = "2.6.27" source = { editable = "." } dependencies = [ { name = "applicationinsights" },
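
The updated tests above all share one setup pattern for the decoupled evaluation context: load the evaluation set, obtain the runtime schema from a runtime instance, resolve the evaluators, and then populate UiPathEvalContext directly instead of pointing it at an eval-set file path. A minimal sketch of that pattern follows, mirroring the test fixtures rather than any documented public API; the function name build_eval_context, the factory parameter, and the "entrypoint" / "runtime-id" arguments are placeholders and not part of this patch.

import uuid

from uipath._cli._evals._runtime import UiPathEvalContext
from uipath._cli._utils._eval_set import EvalHelpers


async def build_eval_context(eval_set_path: str, factory) -> UiPathEvalContext:
    # Load the evaluation set definition; load_eval_set returns the parsed set
    # plus a second value that the tests above discard.
    evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path)

    # Ask a runtime created by the caller's factory for its schema.
    # ("entrypoint" / "runtime-id" are placeholder arguments.)
    runtime = await factory.new_runtime("entrypoint", "runtime-id")
    runtime_schema = await runtime.get_schema()

    # Resolve the evaluators referenced by the evaluation set.
    evaluators = await EvalHelpers.load_evaluators(
        eval_set_path, evaluation_set, agent_model=None
    )

    # Populate the context with the pre-loaded objects, as the tests do.
    context = UiPathEvalContext()
    context.execution_id = str(uuid.uuid4())
    context.evaluation_set = evaluation_set
    context.runtime_schema = runtime_schema
    context.evaluators = evaluators
    return context

The populated context is then handed to UiPathEvalRuntime(context, factory, trace_manager, event_bus) or to evaluate(factory, context, ...), exactly as the updated tests do.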