Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions openadapt_ml/training/grpo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,21 @@
Connects to openadapt-evals RLEnvironment for rollout collection and
task evaluation against live Windows Agent Arena VMs.

Supports two training backends (set via GRPOConfig.backend):
- "standalone" (default): Built-in trainer using HuggingFace + PEFT.
Good for single-GPU prototyping and debugging. See trainer.py.
- "verl": Integration with verl-agent/VAGEN for GiGPO and multi-GPU
distributed training. See verl_backend.py.

Key components:
- GRPOConfig: Training configuration dataclass
- GRPOTrainer: Main training loop
- GRPOConfig: Training configuration dataclass (includes backend field)
- GRPOTrainer: Main training loop (standalone backend)
- GRPORolloutCollector: Collects rollouts via RLEnvironment
- reward functions: Binary task success + group-relative advantages
- CoT warm-up: Chain-of-thought SFT before GRPO
- verl_backend: verl-agent/VAGEN integration (verl backend)

Example:
Example (standalone):
from openadapt_ml.training.grpo import GRPOConfig, GRPOTrainer

config = GRPOConfig(
Expand All @@ -20,6 +27,17 @@
)
trainer = GRPOTrainer(config)
trainer.train()

Example (verl backend):
from openadapt_ml.training.grpo import GRPOConfig
from openadapt_ml.training.grpo.verl_backend import train_with_verl

config = GRPOConfig(
backend="verl",
task_ids=["notepad_1", "settings_1"],
num_training_steps=100,
)
train_with_verl(config) # Prints instructions; raises NotImplementedError
"""

from __future__ import annotations
Expand All @@ -44,6 +62,10 @@
build_cot_sft_samples,
generate_cot_annotations,
)
from openadapt_ml.training.grpo.verl_backend import (
build_vagen_config,
train_with_verl,
)

__all__ = [
"GRPOConfig",
Expand All @@ -58,4 +80,6 @@
"format_action_as_text",
"build_cot_sft_samples",
"generate_cot_annotations",
"build_vagen_config",
"train_with_verl",
]
11 changes: 11 additions & 0 deletions openadapt_ml/training/grpo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

Follows the same pattern as TRLTrainingConfig in trl_trainer.py, with
additional fields for GRPO-specific hyperparameters and environment setup.

Supports two training backends:
- "standalone" (default): Built-in GRPO trainer using HuggingFace + PEFT.
- "verl": Integration point for verl-agent/VAGEN, which provides GiGPO
and multi-GPU support. See verl_backend.py for details.
"""

from __future__ import annotations
Expand All @@ -16,6 +21,9 @@ class GRPOConfig:
Groups model/LoRA defaults with TRLTrainingConfig for consistency.

Attributes:
backend: Training backend to use. "standalone" for the built-in
HuggingFace + PEFT trainer, or "verl" for verl-agent/VAGEN
integration (requires separate installation).
model_name: HuggingFace model identifier.
load_in_4bit: Whether to use 4-bit quantization.
lora_r: LoRA rank.
Expand All @@ -32,6 +40,9 @@ class GRPOConfig:
stuck_window: Number of identical screenshots before early termination.
"""

# Backend: "standalone" (built-in HF+PEFT) or "verl" (verl-agent/VAGEN)
backend: str = "standalone"

# Model
model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct"
load_in_4bit: bool = True
Expand Down
4 changes: 4 additions & 0 deletions openadapt_ml/training/grpo/trainer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
"""Minimal GRPO trainer bridging TRL/HuggingFace and openadapt-evals RLEnvironment.

Note: This is the "standalone" backend. For the verl-agent backend (recommended
for production training with GiGPO and multi-GPU support), see verl_backend.py
or use the VAGEN training config in openadapt-evals/configs/train_waa_vagen.yaml.

Uses REINFORCE with group-relative advantages (equivalent to single-epoch GRPO).
The policy_gradient_loss function includes PPO-style clipping for future multi-epoch
support, but with the current single-epoch design (old_logps == current_logps),
Expand Down
125 changes: 125 additions & 0 deletions openadapt_ml/training/grpo/verl_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""verl-agent / VAGEN backend for GRPO training.

This module provides the integration point for training via verl-agent
(https://github.com/langfengQ/verl-agent — TODO confirm repo URL), which offers:
- GiGPO (Group-in-Group Policy Optimization)
- Multi-GPU distributed training via veRL
- Desktop environment integration via WAADesktopEnv

The actual training loop is managed by verl-agent's own training script,
not by our GRPOTrainer. This module builds the VAGEN-compatible config
from our GRPOConfig and documents how to run training.

Usage:
To train with the verl backend, set backend="verl" in GRPOConfig.
The train_with_verl() function will print instructions and raise
NotImplementedError until full integration is wired up.

For now, training with verl-agent should be done via:
1. Generate a VAGEN config: train_with_verl(config)
2. Run verl-agent's training script with that config

See also:
- openadapt-evals/configs/train_waa_vagen.yaml
- docs/verl_agent_decision.md (if available)
"""

from __future__ import annotations

import logging
from typing import Any

from openadapt_ml.training.grpo.config import GRPOConfig

logger = logging.getLogger(__name__)

# Deferred import for openadapt-evals WAADesktopEnv (optional dependency)
try:
from openadapt_evals.adapters.verl_env import WAADesktopEnv
except ImportError:
WAADesktopEnv = None # type: ignore[assignment, misc]


def build_vagen_config(config: GRPOConfig) -> dict[str, Any]:
    """Translate a GRPOConfig into the config layout used by verl-agent/VAGEN.

    The returned dict mirrors the nested structure VAGEN's training script
    expects and can be serialized to YAML for use with VAGEN's CLI.

    Args:
        config: Source GRPO training configuration.

    Returns:
        Nested dict with "model", "training", and "environment" sections.
    """
    # Model + LoRA settings.
    model_section: dict[str, Any] = {
        "name": config.model_name,
        "load_in_4bit": config.load_in_4bit,
        "lora_r": config.lora_r,
        "lora_alpha": config.lora_alpha,
    }

    # Optimization and rollout-collection hyperparameters.
    training_section: dict[str, Any] = {
        "learning_rate": config.learning_rate,
        "num_training_steps": config.num_training_steps,
        "save_every_steps": config.save_every_steps,
        "output_dir": config.output_dir,
        "num_rollouts_per_step": config.num_rollouts_per_step,
        "temperature": config.temperature,
    }

    # Windows Agent Arena desktop environment settings.
    environment_section: dict[str, Any] = {
        "type": "waa_desktop",
        "server_url": config.server_url,
        "task_ids": config.task_ids,
        "max_steps_per_episode": config.max_steps_per_episode,
        # screen_size may be a tuple; emit a plain list for clean serialization.
        "screen_size": list(config.screen_size),
        "stuck_window": config.stuck_window,
    }

    return {
        "model": model_section,
        "training": training_section,
        "environment": environment_section,
    }


def train_with_verl(config: GRPOConfig) -> None:
    """Entry point for the verl-agent backend (currently a placeholder).

    Builds the VAGEN-compatible config from *config*, logs a summary plus
    instructions for running training, then always raises: the actual
    training loop is driven by verl-agent's own CLI/training script, not
    by this function.

    Args:
        config: GRPO training configuration with backend="verl".

    Raises:
        NotImplementedError: Always, until full verl-agent integration is
            wired up. The error message points at the setup instructions.
    """
    vagen_config = build_vagen_config(config)

    # Report whether the optional openadapt-evals desktop env is importable.
    if WAADesktopEnv is None:
        logger.warning(
            "WAADesktopEnv not found. Install openadapt-evals to enable "
            "desktop environment support: uv add openadapt-evals"
        )
    else:
        logger.info(
            "WAADesktopEnv is available. verl-agent can use it for "
            "desktop environment interaction."
        )

    # Summarize the derived config so the user can mirror it in YAML.
    logger.info("VAGEN config built from GRPOConfig:")
    logger.info("  Model: %s", vagen_config["model"]["name"])
    logger.info("  Tasks: %s", vagen_config["environment"]["task_ids"])
    logger.info("  Steps: %d", vagen_config["training"]["num_training_steps"])
    logger.info("")
    logger.info(
        "To train with verl-agent, use the VAGEN training script with "
        "a config derived from the above. Example:"
    )
    logger.info("  python -m vagen.train --config configs/train_waa_vagen.yaml")

    raise NotImplementedError(
        "verl-agent training requires running via VAGEN's training script. "
        "See docs/verl_agent_decision.md for setup instructions. "
        "Use build_vagen_config() to generate a compatible config dict."
    )