diff --git a/docs.json b/docs.json
index 0878a67b..3738b767 100644
--- a/docs.json
+++ b/docs.json
@@ -215,7 +215,8 @@
         {
           "group": "Extensions",
           "pages": [
-            "openhands/usage/cli/mcp-servers"
+            "openhands/usage/cli/mcp-servers",
+            "openhands/usage/cli/critic"
           ]
         },
         {
@@ -268,7 +269,8 @@
           "sdk/guides/agent-custom",
           "sdk/guides/convo-custom-visualizer",
           "sdk/guides/agent-stuck-detector",
-          "sdk/guides/agent-tom-agent"
+          "sdk/guides/agent-tom-agent",
+          "sdk/guides/critic"
         ]
       },
       {
diff --git a/openhands/usage/cli/critic.mdx b/openhands/usage/cli/critic.mdx
new file mode 100644
index 00000000..f76ff2aa
--- /dev/null
+++ b/openhands/usage/cli/critic.mdx
@@ -0,0 +1,35 @@
+---
+title: Critic (Experimental)
+description: Automatic task success prediction for OpenHands LLM Provider users
+---
+
+<Warning>
+**This feature is highly experimental** and subject to change. The API, configuration, and behavior may evolve significantly based on feedback and testing.
+</Warning>
+
+## Overview
+
+If you're using the [OpenHands LLM Provider](/openhands/usage/llms/openhands-llms), an experimental **critic feature** is automatically enabled to predict task success in real time.
+
+For detailed information about the critic feature, including programmatic access and advanced usage, see the [SDK Critic Guide](/sdk/guides/critic).
+
+## What is the Critic?
+
+The critic is an LLM-based evaluator that analyzes agent actions and conversation history to predict the quality or success probability of agent decisions. It provides:
+
+- **Quality scores**: Probability scores between 0.0 and 1.0 indicating predicted success
+- **Real-time feedback**: Scores computed during agent execution, not just at completion
+
+![Critic output in CLI](./screenshots/critic-cli-output.png)
+
+## Pricing
+
+The critic feature is **free during the public beta phase** for all OpenHands LLM Provider users.
+
+## Disabling the Critic
+
+If you prefer not to use the critic feature, you can disable it in your settings.
+
+![Critic settings in CLI](./screenshots/critic-cli-settings.png)
+
diff --git a/openhands/usage/cli/screenshots/critic-cli-output.png b/openhands/usage/cli/screenshots/critic-cli-output.png
new file mode 100644
index 00000000..1dc97ea6
Binary files /dev/null and b/openhands/usage/cli/screenshots/critic-cli-output.png differ
diff --git a/openhands/usage/cli/screenshots/critic-cli-settings.png b/openhands/usage/cli/screenshots/critic-cli-settings.png
new file mode 100644
index 00000000..3eb41695
Binary files /dev/null and b/openhands/usage/cli/screenshots/critic-cli-settings.png differ
diff --git a/sdk/guides/critic.mdx b/sdk/guides/critic.mdx
new file mode 100644
index 00000000..9b20483b
--- /dev/null
+++ b/sdk/guides/critic.mdx
@@ -0,0 +1,180 @@
+---
+title: Critic (Experimental)
+description: Real-time evaluation of agent actions using an LLM-based critic model.
+---
+
+<Warning>
+**This feature is highly experimental** and subject to change. The API, configuration, and behavior may evolve significantly based on feedback and testing.
+</Warning>
+
+<Note>
+The critic model is hosted by the OpenHands LLM Provider and is currently free to use. This example is available on GitHub: [examples/01_standalone_sdk/34_critic_example.py](https://github.com/OpenHands/software-agent-sdk/blob/main/examples/01_standalone_sdk/34_critic_example.py)
+</Note>
+
+## What is a Critic?
+
+A **critic** is an evaluator that analyzes agent actions and conversation history to predict the quality or success probability of agent decisions. The critic runs alongside the agent and provides:
+
+- **Quality scores**: Probability scores between 0.0 and 1.0 indicating predicted success
+- **Real-time feedback**: Scores computed during agent execution, not just at completion
+
+You can use critic scores to build automated workflows, such as triggering the agent to reflect on and fix its previous solution when the critic indicates poor task performance, as sketched below.
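+
+For instance, a minimal reflect-and-retry loop might look like the following sketch. This is an illustration rather than part of the shipped example: it assumes an `agent` already configured with a critic (as in the Quick Start below), uses the callback API described later in this guide, and the 0.5 cutoff simply mirrors the threshold behind the critic's `success` property. The task string is hypothetical.
+
+```python
+from openhands.sdk import ActionEvent, Conversation, Event, MessageEvent
+
+last_score: float | None = None
+
+def track_critic(event: Event) -> None:
+    # Remember the most recent critic score seen during the run.
+    global last_score
+    if isinstance(event, (ActionEvent, MessageEvent)):
+        if event.critic_result is not None:
+            last_score = event.critic_result.score
+
+conversation = Conversation(agent=agent, callbacks=[track_critic])
+conversation.send_message("Fix the failing test in tests/test_app.py")
+conversation.run()
+
+# If the critic predicts failure, ask the agent to reflect and retry once.
+if last_score is not None and last_score < 0.5:
+    conversation.send_message(
+        "The previous solution may be incorrect. Review it and fix any issues."
+    )
+    conversation.run()
+```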
+
+<Note>
+This critic is a more advanced extension of the approach described in our blog post [SOTA on SWE-Bench Verified with Inference-Time Scaling and Critic Model](https://openhands.dev/blog/sota-on-swe-bench-verified-with-inference-time-scaling-and-critic-model). A technical report with detailed evaluation metrics is forthcoming.
+</Note>
+
+## Quick Start
+
+When using the OpenHands LLM Provider (`llm-proxy.*.all-hands.dev`), the critic is **automatically configured**; no additional setup is required.
+
+```python icon="python" expandable examples/01_standalone_sdk/34_critic_example.py
+"""Example demonstrating critic-based evaluation of agent actions.
+
+This is EXPERIMENTAL.
+
+This shows how to configure an agent with a critic to evaluate action quality
+in real time. The critic scores are displayed in the conversation visualizer.
+
+For the All-Hands LLM proxy (llm-proxy.*.all-hands.dev), the critic is
+auto-configured using the same base_url with a /vllm suffix and "critic" as
+the model name.
+"""
+
+import os
+import re
+import sys
+
+from openhands.sdk import LLM, Agent, Conversation, Tool
+from openhands.sdk.critic import APIBasedCritic
+from openhands.sdk.critic.base import CriticBase
+from openhands.tools.file_editor import FileEditorTool
+from openhands.tools.task_tracker import TaskTrackerTool
+from openhands.tools.terminal import TerminalTool
+
+
+def get_required_env(name: str) -> str:
+    """Return the value of an environment variable or exit with a clear error."""
+    value = os.getenv(name)
+    if value:
+        return value
+    sys.exit(
+        f"Missing required environment variable: {name}. "
+        f"Set {name} before running this example."
+    )
+
+
+def get_default_critic(llm: LLM) -> CriticBase | None:
+    """Auto-configure critic for the All-Hands LLM proxy.
+
+    When the LLM base_url matches `llm-proxy.*.all-hands.dev`, returns an
+    APIBasedCritic configured with:
+    - server_url: {base_url}/vllm
+    - api_key: same as the LLM
+    - model_name: "critic"
+
+    Returns None if base_url doesn't match or api_key is not set.
+    """
+    base_url = llm.base_url
+    api_key = llm.api_key
+    if base_url is None or api_key is None:
+        return None
+
+    # Match: llm-proxy.{env}.all-hands.dev (e.g., staging, prod, eval)
+    pattern = r"^https?://llm-proxy\.[^./]+\.all-hands\.dev"
+    if not re.match(pattern, base_url):
+        return None
+
+    return APIBasedCritic(
+        server_url=f"{base_url.rstrip('/')}/vllm",
+        api_key=api_key,
+        model_name="critic",
+    )
+
+
+llm_api_key = get_required_env("LLM_API_KEY")
+
+llm = LLM(
+    model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"),
+    api_key=llm_api_key,
+    base_url=os.getenv("LLM_BASE_URL", None),
+)
+
+# Try auto-configuration for the All-Hands proxy, fall back to explicit env vars
+critic = get_default_critic(llm)
+if critic is None:
+    critic = APIBasedCritic(
+        server_url=get_required_env("CRITIC_SERVER_URL"),
+        api_key=get_required_env("CRITIC_API_KEY"),
+        model_name=get_required_env("CRITIC_MODEL_NAME"),
+    )
+
+# Configure the agent with the critic
+agent = Agent(
+    llm=llm,
+    tools=[
+        Tool(name=TerminalTool.name),
+        Tool(name=FileEditorTool.name),
+        Tool(name=TaskTrackerTool.name),
+    ],
+    # Add the critic to evaluate agent actions
+    critic=critic,
+)
+
+cwd = os.getcwd()
+conversation = Conversation(agent=agent, workspace=cwd)
+
+conversation.send_message(
+    "Create a file called GREETING.txt with a friendly greeting message."
+)
+conversation.run()
+
+print("\nAll done! Check the output above for 'Critic Score' in the visualizer.")
+```
+
+```bash Running the Example
+uv run python examples/01_standalone_sdk/34_critic_example.py
+```
+
+## Understanding Critic Results
+
+Critic evaluations produce scores and feedback:
+
+- **`score`**: Float between 0.0 and 1.0 representing the predicted success probability
+- **`message`**: Optional feedback with detailed probabilities
+- **`success`**: Boolean property (True if score >= 0.5)
+
+Results are automatically displayed in the conversation visualizer:
+
+![Critic results in SDK visualizer](./screenshots/critic-sdk-visualizer.png)
+
+### Accessing Results Programmatically
+
+```python
+from openhands.sdk import Event, ActionEvent, MessageEvent
+
+def callback(event: Event):
+    if isinstance(event, (ActionEvent, MessageEvent)):
+        if event.critic_result is not None:
+            print(f"Critic score: {event.critic_result.score:.3f}")
+            print(f"Success: {event.critic_result.success}")
+
+conversation = Conversation(agent=agent, callbacks=[callback])
+```
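+
+Building on the callback pattern above, you can turn critic scores into an automated quality gate. The sketch below is illustrative and not part of the example file: it collects the documented `score` values across a run and exits with an error when the final score falls below 0.5, the same threshold that backs `success`. The gating policy and task string are assumptions; tune them to your workflow.
+
+```python
+import statistics
+
+from openhands.sdk import ActionEvent, Conversation, Event, MessageEvent
+
+scores: list[float] = []
+
+def collect_scores(event: Event) -> None:
+    # Accumulate every critic score emitted during the run.
+    if isinstance(event, (ActionEvent, MessageEvent)):
+        if event.critic_result is not None:
+            scores.append(event.critic_result.score)
+
+conversation = Conversation(agent=agent, callbacks=[collect_scores])
+conversation.send_message("Refactor utils.py and keep the tests passing.")
+conversation.run()
+
+if scores:
+    print(f"critic scores: mean={statistics.mean(scores):.3f}, final={scores[-1]:.3f}")
+    if scores[-1] < 0.5:
+        raise SystemExit("Critic predicts failure; review this run before merging.")
+```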
+ """ + base_url = llm.base_url + api_key = llm.api_key + if base_url is None or api_key is None: + return None + + # Match: llm-proxy.{env}.all-hands.dev (e.g., staging, prod, eval) + pattern = r"^https?://llm-proxy\.[^./]+\.all-hands\.dev" + if not re.match(pattern, base_url): + return None + + return APIBasedCritic( + server_url=f"{base_url.rstrip('/')}/vllm", + api_key=api_key, + model_name="critic", + ) + + +llm_api_key = get_required_env("LLM_API_KEY") + +llm = LLM( + model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"), + api_key=llm_api_key, + base_url=os.getenv("LLM_BASE_URL", None), +) + +# Try auto-configuration for All-Hands proxy, fall back to explicit env vars +critic = get_default_critic(llm) +if critic is None: + critic = APIBasedCritic( + server_url=get_required_env("CRITIC_SERVER_URL"), + api_key=get_required_env("CRITIC_API_KEY"), + model_name=get_required_env("CRITIC_MODEL_NAME"), + ) + + +# Configure agent with critic +agent = Agent( + llm=llm, + tools=[ + Tool(name=TerminalTool.name), + Tool(name=FileEditorTool.name), + Tool(name=TaskTrackerTool.name), + ], + # Add critic to evaluate agent actions + critic=critic, +) + +cwd = os.getcwd() +conversation = Conversation(agent=agent, workspace=cwd) + +conversation.send_message( + "Create a file called GREETING.txt with a friendly greeting message." +) +conversation.run() + +print("\nAll done! Check the output above for 'Critic Score' in the visualizer.") +``` + +```bash Running the Example +uv run python examples/01_standalone_sdk/34_critic_example.py +``` + +## Understanding Critic Results + +Critic evaluations produce scores and feedback: + +- **`score`**: Float between 0.0 and 1.0 representing predicted success probability +- **`message`**: Optional feedback with detailed probabilities +- **`success`**: Boolean property (True if score >= 0.5) + +Results are automatically displayed in the conversation visualizer: + +![Critic results in SDK visualizer](./screenshots/critic-sdk-visualizer.png) + +### Accessing Results Programmatically + +```python +from openhands.sdk import Event, ActionEvent, MessageEvent + +def callback(event: Event): + if isinstance(event, (ActionEvent, MessageEvent)): + if event.critic_result is not None: + print(f"Critic score: {event.critic_result.score:.3f}") + print(f"Success: {event.critic_result.success}") + +conversation = Conversation(agent=agent, callbacks=[callback]) +``` + +## Troubleshooting + +### Critic Evaluations Not Appearing + +- Verify the critic is properly configured and passed to the Agent +- Ensure you're using the OpenHands LLM Provider (`llm-proxy.*.all-hands.dev`) + +### API Authentication Errors + +- Verify `LLM_API_KEY` is set correctly +- Check that the API key has not expired + +## Next Steps + +- **[Observability](/sdk/guides/observability)** - Monitor and log agent behavior +- **[Metrics](/sdk/guides/metrics)** - Collect performance metrics +- **[Stuck Detector](/sdk/guides/agent-stuck-detector)** - Detect unproductive agent patterns diff --git a/sdk/guides/screenshots/critic-sdk-visualizer.png b/sdk/guides/screenshots/critic-sdk-visualizer.png new file mode 100644 index 00000000..b8a7473c Binary files /dev/null and b/sdk/guides/screenshots/critic-sdk-visualizer.png differ