diff --git a/.gitignore b/.gitignore
index ecda38b38..def0d2c77 100644
--- a/.gitignore
+++ b/.gitignore
@@ -236,6 +236,7 @@ logs
 /_test_workspace
 /debug
 cache
+results
 
 # configuration
 config.toml
diff --git a/NVIDIA_Assets/swe-bench.png b/NVIDIA_Assets/swe-bench.png
index f9a5ea735..470f80888 100644
Binary files a/NVIDIA_Assets/swe-bench.png and b/NVIDIA_Assets/swe-bench.png differ
diff --git a/README.md b/README.md
index dc3532490..8c6b6a781 100644
--- a/README.md
+++ b/README.md
@@ -73,6 +73,7 @@ This command starts the FastAPI-based async evaluation server and listens on the
 It exposes /start, /process, and /status endpoints, and uses --max-init-workers/--max-run-workers and --timeout to control concurrency and time limits.
 
 ```bash
+export OH_RUNTIME_SINGULARITY_IMAGE_REPO=/path/to/singularity_images
 python scripts/start_server.py --host 0.0.0.0 --port 8006 --max-init-workers 64 --max-run-workers 64 --timeout 300
 ```
 
@@ -154,6 +155,28 @@ Output (response body):
 }
 ```
 
+## 💻 Do RL Training with verl
+1) Clone [verl](https://github.com/verl-project/verl) and switch to specific commit
+```shell
+cd /path/to/verl
+git checkout 60138ebd
+```
+2) Install verl following [verl](https://github.com/verl-project/verl)'s instructions
+3) Install our verl patch
+```shell
+cd ProRL-Agent-Server/trainer_integration/verl
+pip install -e .
+```
+4) Start agent server
+```shell
+export OH_RUNTIME_SINGULARITY_IMAGE_REPO=/path/to/singularity_images
+python scripts/start_server.py --host 0.0.0.0 --port 8006 --max-init-workers 64 --max-run-workers 64 --timeout 1000
+```
+5) Run training script
+```shell
+bash trainer_integration/verl/verl_custom/nvidia/scripts/run_proagent_qwn3_4B_instruct.sh
+```
+
 ## 💻 Add a New Task/Handler
 
 To add a new task:
@@ -195,7 +218,7 @@ TEST_RUNTIME=singularity RUN_AS_OPENHANDS=False PYTHONPATH='.' pytest tests/runt
 #### Image Storage Location
 **`OH_RUNTIME_SINGULARITY_IMAGE_REPO`** - Specifies the directory where Singularity runtime images will be stored.
 ```bash
-OH_RUNTIME_SINGULARITY_IMAGE_REPO=/path/to/singularity_images
+export OH_RUNTIME_SINGULARITY_IMAGE_REPO=/path/to/singularity_images
 ```
 
 ## 📄 Documentation
@@ -211,6 +234,6 @@ More module READMEs (click to open):
 ## 💡 Current Results
 
 
-To validate the functionality of the ProRLAgent servers, we conducted experiments on software engineering (SWE) tasks by integrating the server with our ProRLAgent Training framework based on verl. We did some initial RL training on Qwen3-4B-Instruct-2507 model. We used 32 A100 GPUs to train the model. Our training data is a subset of [SWE-GYM](https://huggingface.co/datasets/NovaSky-AI/SkyRL-v0-293-data) with 293 training examples. Training for around 66 steps have allowed the Pass@1 on SWE-Bench-Verified to be improved from 14.2% to 20.8%，the following charts shows the test results on SWE-Bench-Verified. It increases during training.
+To validate the functionality of the ProRLAgent servers, we conducted experiments on software engineering (SWE) tasks by integrating the server with our ProRLAgent Training framework based on verl. We did some initial RL training on Qwen3-4B-Instruct-2507 model. We used 32 A100 GPUs to train the model. Our training data is a subset of [SWE-GYM](https://huggingface.co/datasets/NovaSky-AI/SkyRL-v0-293-data) with 293 training examples. Training for around 66 steps have allowed the Pass@1 on SWE-Bench-Verified to be improved from 14.8% to 21.2%，the following charts shows the test results on SWE-Bench-Verified. It increases during training.
 <img src="NVIDIA_Assets/swe-bench.png" alt="swe-bench curve" width="600" />
 
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index dc541f64f..76f4ddb80 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -228,7 +228,7 @@ def __init__(
             kwargs['chat_template_kwargs'] = {'enable_thinking': False}
 
         if self.token_level_generation:
-            if 'VL' or 'vl' in self.config.model:
+            if 'qwen2.5-vl' in self.config.model.lower() or 'qwen2_5_vl' in self.config.model.lower():
                 from openhands.llm.nvidia.qwen2_5_vl import request_response_tokens
             else:
                 from openhands.llm.nvidia.qwen3 import request_response_tokens
diff --git a/openhands/nvidia/utils.py b/openhands/nvidia/utils.py
index c32a30a5e..cfed99718 100644
--- a/openhands/nvidia/utils.py
+++ b/openhands/nvidia/utils.py
@@ -385,7 +385,7 @@ def process_messages_from_agent_state(
                     [new_message],
                     tokenizer,
                     chat_template=chat_template,
-                    add_generation_prompt=False,
+                    add_generation_prompt=True,
                     enable_thinking=enable_thinking,
                     tools=tools,
                 )[0]
diff --git a/scripts/tests/run_standalone_swebench_test_qwen3_4b_instruct.sh b/scripts/tests/run_standalone_swebench_test_qwen3_4b_instruct.sh
new file mode 100644
index 000000000..aee88cb71
--- /dev/null
+++ b/scripts/tests/run_standalone_swebench_test_qwen3_4b_instruct.sh
@@ -0,0 +1,225 @@
+#!/bin/bash
+# Multi-node standalone SWE-bench evaluation via SLURM for Qwen3-4B-Instruct-2507.
+# Usage: sbatch run_standalone_swebench_test_qwen3_4b_instruct.sh
+#
+# This script submits a SLURM job that starts ProRL Agent Server + vLLM on multiple
+# nodes, then runs evaluation. For single-node local run without SLURM, use
+# run_standalone_swebench_test_qwen3_4b_instruct_single_node.sh instead.
+
+#SBATCH --job-name=standalone-swebench-test
+#SBATCH --nodes=8
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=1000G
+#SBATCH --partition=YOUR_PARTITION
+#SBATCH --time=4:00:00
+#SBATCH --account=YOUR_ACCOUNT
+#SBATCH --gpus-per-node=8
+#SBATCH --cpus-per-task=64
+#SBATCH --output=/path/to/ProRL-Agent-Server/logs/slurm-%A_%a.out
+#SBATCH --error=/path/to/ProRL-Agent-Server/logs/slurm-%A_%a.err
+
+set -x  # Enable debug output
+
+# ==================== Configuration ====================
+ProRL_Agent_WORKDIR=/path/to/ProRL-Agent-Server
+RESULTS_DIR="${ProRL_Agent_WORKDIR}/results/standalone_test_$(date +%Y%m%d_%H%M%S)"
+container_name=/path/to/your/container.sqsh
+MOUNTS="--container-mounts=/path/to/data:/path/to/data"
+
+# Model configuration
+MODEL_PATH='Qwen/Qwen3-4B-Instruct-2507'
+TOKENIZER_PATH='Qwen/Qwen3-4B-Instruct-2507'
+
+# Data configuration
+DATA_PATH='/path/to/data/swe-bench-verified.parquet'
+OUTPUT_DIR="${RESULTS_DIR}/standalone_swebench_test_${SLURM_JOB_ID}"
+
+# Server configuration
+GPUS_PER_NODE=8
+TP_SIZE=4
+GPU_MEM_UTIL=0.8
+NUM_SERVERS_PER_NODE=$((GPUS_PER_NODE / TP_SIZE))
+VLLM_BASE_PORT=8100
+ProRL_Agent_Server_PORT=8006
+ProRL_Agent_NUM_WORKERS=64
+
+# Evaluation configuration
+NUM_TRAJECTORIES=1
+TEMPERATURE=0.0
+TOP_P=1.0
+MAX_ITERATIONS=50
+MAX_OUTPUT_TOKENS=1536
+MAX_MODEL_LEN=32768
+TIMEOUT=1500
+HINT_MODE=none
+TOKEN_LEVEL_GENERATION=true  # set to true for token-level generation
+
+# ==================== Node Setup ====================
+nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+nodes_array=($nodes)
+NNODES=$SLURM_NNODES
+
+mkdir -p "$RESULTS_DIR"
+
+# ==================== Resolve Node IPs ====================
+declare -a node_ips
+for i in "${!nodes_array[@]}"; do
+    node=${nodes_array[$i]}
+    node_ip=$(srun --nodes=1 --ntasks=1 -w "$node" hostname --ip-address)
+    # Convert to ipv4 if needed
+    if [[ "$node_ip" == *" "* ]]; then
+        IFS=' ' read -ra ADDR <<<"$node_ip"
+        if [[ ${#ADDR[0]} -gt 16 ]]; then
+            node_ip=${ADDR[1]}
+        else
+            node_ip=${ADDR[0]}
+        fi
+    fi
+    node_ips[$i]=$node_ip
+    echo "Node $i: ${nodes_array[$i]} -> IP: $node_ip"
+done
+
+# ==================== Start ProRL Agent Server on all nodes ====================
+echo "Starting ProRL Agent Server on all nodes..."
+ProRL_Agent_Server_urls=""
+
+for i in "${!nodes_array[@]}"; do
+    node=${nodes_array[$i]}
+    node_ip=${node_ips[$i]}
+
+    echo "Starting ProRL Agent Server on node $node (IP: $node_ip)"
+
+    srun --nodes=1 --ntasks=1 -w "$node" \
+        -o "$RESULTS_DIR/output-%A_%a-ProRL_Agent-node-$i.out" \
+        -e "$RESULTS_DIR/output-%A_%a-ProRL_Agent-node-$i.err" \
+        --container-image="$container_name" $MOUNTS \
+        bash -c "cd $ProRL_Agent_WORKDIR \
+        && export OH_RUNTIME_SINGULARITY_IMAGE_REPO=/path/to/singularity_images \
+        && export OVERWRITE_OPENHANDS_DIR=$ProRL_Agent_WORKDIR \
+        && export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:\$PATH \
+        && export PYTHONPATH=$ProRL_Agent_WORKDIR:\$PYTHONPATH \
+        && export LOG_LEVEL=ERROR \
+        && export DEBUG=False \
+        && nohup /usr/bin/python scripts/start_server_thread.py --max-init-workers 70 --max-run-workers $ProRL_Agent_NUM_WORKERS --timeout 9999999" &
+
+    # Build the ProRL Agent Server URLs string
+    if [ -z "$ProRL_Agent_Server_urls" ]; then
+        ProRL_Agent_Server_urls="http://$node_ip:$ProRL_Agent_Server_PORT"
+    else
+        ProRL_Agent_Server_urls="$ProRL_Agent_Server_urls+http://$node_ip:$ProRL_Agent_Server_PORT"
+    fi
+done
+
+echo "ProRL Agent Server URLs: $ProRL_Agent_Server_urls"
+
+# ==================== Start vLLM servers on all nodes ====================
+echo "Starting vLLM servers on all nodes..."
+llm_server_urls=""
+
+for i in "${!nodes_array[@]}"; do
+    node=${nodes_array[$i]}
+    node_ip=${node_ips[$i]}
+
+    echo "Starting $NUM_SERVERS_PER_NODE vLLM server(s) on node $node (IP: $node_ip)"
+
+    for server_idx in $(seq 0 $((NUM_SERVERS_PER_NODE - 1))); do
+        gpu_start=$((server_idx * TP_SIZE))
+        gpu_end=$((gpu_start + TP_SIZE - 1))
+        cuda_devices=$(seq -s, $gpu_start $gpu_end)
+        port=$((VLLM_BASE_PORT + server_idx))
+
+        if [ "$TOKEN_LEVEL_GENERATION" = "true" ]; then
+            # Token-level generation: use custom vllm_api_server.py
+            vllm_cmd="CUDA_VISIBLE_DEVICES=$cuda_devices python $ProRL_Agent_WORKDIR/scripts/tests/vllm_api_server.py \
+                --model $MODEL_PATH \
+                --tensor-parallel-size $TP_SIZE \
+                --port $port \
+                --host 0.0.0.0 \
+                --gpu-memory-utilization $GPU_MEM_UTIL \
+                --max-model-len $MAX_MODEL_LEN"
+        else
+            # Standard mode: use OpenAI-compatible vLLM server
+            vllm_cmd="CUDA_VISIBLE_DEVICES=$cuda_devices python -m vllm.entrypoints.openai.api_server \
+                --model $MODEL_PATH \
+                --tensor-parallel-size $TP_SIZE \
+                --port $port \
+                --host 0.0.0.0 \
+                --gpu-memory-utilization $GPU_MEM_UTIL \
+                --max-model-len $MAX_MODEL_LEN"
+        fi
+
+        srun --overlap --nodes=1 --ntasks=1 -w "$node" \
+            -o "$RESULTS_DIR/output-%A_%a-vllm-node-$i-server-$server_idx.out" \
+            -e "$RESULTS_DIR/output-%A_%a-vllm-node-$i-server-$server_idx.err" \
+            --container-image="$container_name" $MOUNTS \
+            bash -c "$vllm_cmd" &
+
+        # Build the LLM server URLs string
+        if [ -z "$llm_server_urls" ]; then
+            llm_server_urls="http://$node_ip:$port"
+        else
+            llm_server_urls="$llm_server_urls+http://$node_ip:$port"
+        fi
+    done
+done
+
+echo "LLM Server URLs: $llm_server_urls"
+
+# ==================== Wait for servers to be ready ====================
+echo "Waiting for servers to start up..."
+sleep 120
+
+# Health check for vLLM servers
+echo "Checking vLLM server health..."
+IFS='+' read -ra LLM_URLS <<< "$llm_server_urls"
+for url in "${LLM_URLS[@]}"; do
+    for attempt in $(seq 1 60); do
+        if curl -s -o /dev/null -w "%{http_code}" "$url/health" | grep -q "200"; then
+            echo "vLLM server $url is healthy"
+            break
+        fi
+        if [ $attempt -eq 60 ]; then
+            echo "WARNING: vLLM server $url did not become healthy after 5 minutes"
+        fi
+        sleep 5
+    done
+done
+
+# ==================== Build evaluation command args ====================
+TOKEN_LEVEL_FLAG=""
+if [ "$TOKEN_LEVEL_GENERATION" = "true" ]; then
+    TOKEN_LEVEL_FLAG="--token_level_generation"
+fi
+
+# ==================== Run standalone evaluation ====================
+echo "Starting standalone SWE-bench evaluation..."
+echo "  ProRL Agent Server URLs: $ProRL_Agent_Server_urls"
+echo "  LLM Server URLs: $llm_server_urls"
+
+srun --overlap --nodes=1 --ntasks=1 -w "${nodes_array[0]}" \
+    --cpus-per-task=8 \
+    --mem=16G \
+    -o "$RESULTS_DIR/output-%A_%a-evaluation.out" \
+    -e "$RESULTS_DIR/output-%A_%a-evaluation.err" \
+    --container-image="$container_name" $MOUNTS \
+    bash -c "cd $ProRL_Agent_WORKDIR \
+    && export PYTHONPATH=$ProRL_Agent_WORKDIR:\$PYTHONPATH \
+    && python scripts/tests/standalone_swebench_test.py \
+        --data_path $DATA_PATH \
+        --ProRL_Agent_Server_urls '$ProRL_Agent_Server_urls' \
+        --llm_server_urls '$llm_server_urls' \
+        --model_name $MODEL_PATH \
+        --output_dir $OUTPUT_DIR \
+        --num_trajectories $NUM_TRAJECTORIES \
+        --num_workers_per_server $ProRL_Agent_NUM_WORKERS \
+        --temperature $TEMPERATURE \
+        --top_p $TOP_P \
+        --max_iterations $MAX_ITERATIONS \
+        --max_output_tokens $MAX_OUTPUT_TOKENS \
+        --max_model_len $MAX_MODEL_LEN \
+        --timeout $TIMEOUT \
+        --hint_mode $HINT_MODE \
+        --custom_tokenizer $TOKENIZER_PATH \
+        $TOKEN_LEVEL_FLAG"
+
+echo "Evaluation completed! Results saved to: $OUTPUT_DIR"
diff --git a/scripts/tests/run_standalone_swebench_test_qwen3_4b_instruct_single_node.sh b/scripts/tests/run_standalone_swebench_test_qwen3_4b_instruct_single_node.sh
new file mode 100644
index 000000000..f1df62103
--- /dev/null
+++ b/scripts/tests/run_standalone_swebench_test_qwen3_4b_instruct_single_node.sh
@@ -0,0 +1,210 @@
+#!/bin/bash
+# Single-node standalone SWE-bench evaluation script for Qwen3-4B-Instruct-2507.
+# Usage: bash run_standalone_swebench_test_qwen3_4b_instruct_single_node.sh
+#
+# This script launches ProRL Agent Server + vLLM servers locally, then runs evaluation.
+# All background processes are cleaned up on exit (Ctrl-C or natural completion).
+
+set -x  # Enable debug output
+
+# ==================== Configuration ====================
+ProRL_Agent_WORKDIR=/path/to/ProRL-Agent-Server
+LOG_DIR="${ProRL_Agent_WORKDIR}/logs/standalone_test_$(date +%Y%m%d_%H%M%S)"
+OUTPUT_DIR="${ProRL_Agent_WORKDIR}/results/standalone_test_$(date +%Y%m%d_%H%M%S)"
+
+# Model configuration
+MODEL_PATH='Qwen/Qwen3-4B-Instruct-2507'
+TOKENIZER_PATH='Qwen/Qwen3-4B-Instruct-2507'
+
+# Data configuration
+DATA_PATH='/path/to/data/swe-bench-verified.parquet'
+
+# Server configuration
+GPUS_PER_NODE=8
+TP_SIZE=4
+GPU_MEM_UTIL=0.8
+NUM_SERVERS=$((GPUS_PER_NODE / TP_SIZE))
+VLLM_BASE_PORT=8100
+ProRL_Agent_Server_PORT=8006
+ProRL_Agent_NUM_WORKERS=64
+
+# Evaluation configuration
+NUM_TRAJECTORIES=1
+TEMPERATURE=0.0
+TOP_P=1.0
+MAX_ITERATIONS=50
+MAX_OUTPUT_TOKENS=1536
+MAX_MODEL_LEN=32768
+TIMEOUT=1500
+HINT_MODE=none
+TOKEN_LEVEL_GENERATION=true  # set to true for token-level generation
+
+# ==================== Cleanup trap ====================
+PIDS=()
+
+cleanup() {
+    echo ""
+    echo "Cleaning up background processes..."
+    for pid in "${PIDS[@]}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            echo "  Killing PID $pid"
+            kill "$pid" 2>/dev/null
+            wait "$pid" 2>/dev/null
+        fi
+    done
+    echo "Cleanup done."
+}
+
+trap cleanup EXIT INT TERM
+
+# ==================== Setup ====================
+mkdir -p "$LOG_DIR" "$OUTPUT_DIR"
+
+NODE_IP=$(hostname --ip-address)
+# Convert to ipv4 if needed
+if [[ "$NODE_IP" == *" "* ]]; then
+    IFS=' ' read -ra ADDR <<<"$NODE_IP"
+    if [[ ${#ADDR[0]} -gt 16 ]]; then
+        NODE_IP=${ADDR[1]}
+    else
+        NODE_IP=${ADDR[0]}
+    fi
+fi
+echo "Node IP: $NODE_IP"
+
+# ==================== Start ProRL Agent Server ====================
+echo "Starting ProRL Agent Server..."
+
+cd "$ProRL_Agent_WORKDIR"
+export OH_RUNTIME_SINGULARITY_IMAGE_REPO=/path/to/singularity_images
+export OVERWRITE_OPENHANDS_DIR="$ProRL_Agent_WORKDIR"
+export PYTHONPATH="${ProRL_Agent_WORKDIR}:${PYTHONPATH}"
+export LOG_LEVEL=ERROR
+export DEBUG=False
+
+python scripts/start_server_thread.py \
+    --max-init-workers 70 \
+    --max-run-workers "$ProRL_Agent_NUM_WORKERS" \
+    --timeout 9999999 \
+    > "$LOG_DIR/ProRL_Agent_Server.out" 2> "$LOG_DIR/ProRL_Agent_Server.err" &
+PIDS+=($!)
+echo "ProRL Agent Server PID: ${PIDS[-1]}"
+
+ProRL_Agent_Server_urls="http://${NODE_IP}:${ProRL_Agent_Server_PORT}"
+echo "ProRL Agent Server URL: $ProRL_Agent_Server_urls"
+
+cd "$WORKDIR"
+
+# ==================== Start vLLM servers ====================
+echo "Starting $NUM_SERVERS vLLM server(s)..."
+llm_server_urls=""
+
+for server_idx in $(seq 0 $((NUM_SERVERS - 1))); do
+    gpu_start=$((server_idx * TP_SIZE))
+    gpu_end=$((gpu_start + TP_SIZE - 1))
+    cuda_devices=$(seq -s, "$gpu_start" "$gpu_end")
+    port=$((VLLM_BASE_PORT + server_idx))
+
+    echo "  Server $server_idx: GPUs=$cuda_devices, port=$port"
+
+    if [ "$TOKEN_LEVEL_GENERATION" = "true" ]; then
+        CUDA_VISIBLE_DEVICES=$cuda_devices python "$ProRL_Agent_WORKDIR/scripts/tests/vllm_api_server.py" \
+            --model "$MODEL_PATH" \
+            --tensor-parallel-size "$TP_SIZE" \
+            --port "$port" \
+            --host 0.0.0.0 \
+            --gpu-memory-utilization "$GPU_MEM_UTIL" \
+            --max-model-len "$MAX_MODEL_LEN" \
+            > "$LOG_DIR/vllm_server_${server_idx}.out" 2> "$LOG_DIR/vllm_server_${server_idx}.err" &
+    else
+        CUDA_VISIBLE_DEVICES=$cuda_devices python -m vllm.entrypoints.openai.api_server \
+            --model "$MODEL_PATH" \
+            --tensor-parallel-size "$TP_SIZE" \
+            --port "$port" \
+            --host 0.0.0.0 \
+            --gpu-memory-utilization "$GPU_MEM_UTIL" \
+            --max-model-len "$MAX_MODEL_LEN" \
+            > "$LOG_DIR/vllm_server_${server_idx}.out" 2> "$LOG_DIR/vllm_server_${server_idx}.err" &
+    fi
+    PIDS+=($!)
+    echo "  vLLM server $server_idx PID: ${PIDS[-1]}"
+
+    if [ -z "$llm_server_urls" ]; then
+        llm_server_urls="http://${NODE_IP}:${port}"
+    else
+        llm_server_urls="${llm_server_urls}+http://${NODE_IP}:${port}"
+    fi
+done
+
+echo "LLM Server URLs: $llm_server_urls"
+
+# ==================== Wait for vLLM servers to be healthy ====================
+echo "Waiting for vLLM servers to become healthy..."
+IFS='+' read -ra LLM_URLS <<< "$llm_server_urls"
+all_healthy=true
+for url in "${LLM_URLS[@]}"; do
+    healthy=false
+    for attempt in $(seq 1 120); do
+        if curl -s -o /dev/null -w "%{http_code}" "${url}/health" 2>/dev/null | grep -q "200"; then
+            echo "  vLLM server $url is healthy (attempt $attempt)"
+            healthy=true
+            break
+        fi
+        sleep 5
+    done
+    if [ "$healthy" = false ]; then
+        echo "ERROR: vLLM server $url did not become healthy after 10 minutes"
+        echo "  Check logs: $LOG_DIR/vllm_server_*.err"
+        all_healthy=false
+    fi
+done
+
+if [ "$all_healthy" = false ]; then
+    echo "Some vLLM servers failed to start. Aborting."
+    exit 1
+fi
+
+echo "All vLLM servers are healthy."
+
+
+# ==================== Run evaluation ====================
+echo ""
+echo "=========================================="
+echo "Starting standalone SWE-bench evaluation"
+echo "=========================================="
+echo "  ProRL Agent Server URLs:   $ProRL_Agent_Server_urls"
+echo "  LLM Server URLs:  $llm_server_urls"
+echo "  Data:             $DATA_PATH"
+echo "  Output:           $OUTPUT_DIR"
+echo "  Logs:             $LOG_DIR"
+echo "=========================================="
+
+TOKEN_LEVEL_FLAG=""
+if [ "$TOKEN_LEVEL_GENERATION" = "true" ]; then
+    TOKEN_LEVEL_FLAG="--token_level_generation"
+fi
+
+cd "$ProRL_Agent_WORKDIR"
+export PYTHONPATH="${ProRL_Agent_WORKDIR}:${PYTHONPATH}"
+
+python scripts/tests/standalone_swebench_test.py \
+    --data_path "$DATA_PATH" \
+    --ProRL_Agent_Server_urls "$ProRL_Agent_Server_urls" \
+    --llm_server_urls "$llm_server_urls" \
+    --model_name "$MODEL_PATH" \
+    --output_dir "$OUTPUT_DIR" \
+    --num_trajectories "$NUM_TRAJECTORIES" \
+    --num_workers_per_server "$ProRL_Agent_NUM_WORKERS" \
+    --temperature "$TEMPERATURE" \
+    --top_p "$TOP_P" \
+    --max_iterations "$MAX_ITERATIONS" \
+    --max_output_tokens "$MAX_OUTPUT_TOKENS" \
+    --max_model_len "$MAX_MODEL_LEN" \
+    --timeout "$TIMEOUT" \
+    --hint_mode "$HINT_MODE" \
+    --custom_tokenizer "$TOKENIZER_PATH" \
+    $TOKEN_LEVEL_FLAG
+
+echo ""
+echo "Evaluation completed! Results saved to: $OUTPUT_DIR"
+echo "Logs available at: $LOG_DIR"
diff --git a/scripts/tests/standalone_swebench_test.py b/scripts/tests/standalone_swebench_test.py
new file mode 100644
index 000000000..c41ee8811
--- /dev/null
+++ b/scripts/tests/standalone_swebench_test.py
@@ -0,0 +1,1032 @@
+#!/usr/bin/env python3
+"""
+Standalone SWE-bench validation/testing script.
+
+This script replicates the validation logic from the verl training framework
+without depending on any verl project files. It:
+1. Loads test data from a parquet file
+2. Sends instances to ProRL Agent Server servers for multi-turn agent interaction
+3. Computes rewards (resolved, file_iou, block_iou)
+4. Computes pass@k metrics
+5. Saves results to JSONL
+
+Usage:
+    python standalone_swebench_test.py \
+        --data_path /path/to/test-transformed-with-prompt.parquet \
+        --ProRL_Agent_Server_urls "http://host1:8006+http://host2:8006" \
+        --model_name "Qwen/Qwen3-32B" \
+        --output_dir ./results
+"""
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import re
+import time
+from collections import defaultdict
+from functools import partial
+from typing import Any, Callable, Dict, List, Set, Tuple
+
+import aiohttp
+import numpy as np
+import pandas as pd
+from scipy.special import comb
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logger = logging.getLogger(__name__)
+
+
+# =====================================================================
+# File / Block IoU computation (from unified diffs)
+# =====================================================================
+
+HunkRange = Tuple[int, int]
+
+
+def merge_intervals(intervals: List[HunkRange]) -> List[HunkRange]:
+    if not intervals:
+        return []
+    intervals = sorted(intervals)
+    merged: List[HunkRange] = []
+    s, e = intervals[0]
+    for cs, ce in intervals[1:]:
+        if cs <= e + 1:
+            e = max(e, ce)
+        else:
+            merged.append((s, e))
+            s, e = cs, ce
+    merged.append((s, e))
+    return merged
+
+
+def intervals_length(intervals: List[HunkRange]) -> int:
+    return sum(e - s + 1 for s, e in intervals)
+
+
+def intervals_intersection_length(a: List[HunkRange], b: List[HunkRange]) -> int:
+    i = j = 0
+    inter = 0
+    while i < len(a) and j < len(b):
+        s1, e1 = a[i]
+        s2, e2 = b[j]
+        s, e = max(s1, s2), min(e1, e2)
+        if s <= e:
+            inter += e - s + 1
+        if e1 < e2:
+            i += 1
+        else:
+            j += 1
+    return inter
+
+
+def _strip_ab_prefix(path: str) -> str:
+    if path.startswith("a/"):
+        return path[2:]
+    if path.startswith("b/"):
+        return path[2:]
+    return path
+
+
+def _set_to_blocks(line_set: Set[int]) -> List[HunkRange]:
+    if not line_set:
+        return []
+    seq = sorted(line_set)
+    blocks: List[HunkRange] = []
+    start = prev = seq[0]
+    for v in seq[1:]:
+        if v == prev + 1:
+            prev = v
+        else:
+            blocks.append((start, prev))
+            start = prev = v
+    blocks.append((start, prev))
+    return blocks
+
+
+def parse_unified_diff(patch_text: str):
+    """Parse a unified diff and return (files, new_blocks, old_blocks)."""
+    files: Set[str] = set()
+    new_blocks: Dict[str, List[HunkRange]] = {}
+    old_blocks: Dict[str, List[HunkRange]] = {}
+
+    lines = patch_text.splitlines()
+    current_file = None
+    old_file = None
+    new_file = None
+    in_hunk = False
+    old_line = None
+    new_line = None
+    file_old_changed: Set[int] = set()
+    file_new_changed: Set[int] = set()
+
+    def finalize_file():
+        nonlocal current_file, file_old_changed, file_new_changed
+        if current_file is not None:
+            files.add(current_file)
+            if file_new_changed:
+                new_blocks.setdefault(current_file, []).extend(_set_to_blocks(file_new_changed))
+            if file_old_changed:
+                old_blocks.setdefault(current_file, []).extend(_set_to_blocks(file_old_changed))
+        current_file = None
+
+    hunk_re = re.compile(r"^@@\s+-([0-9]+)(?:,([0-9]+))?\s+\+([0-9]+)(?:,([0-9]+))?\s+@@")
+
+    for line in lines:
+        if line.startswith("diff --git "):
+            if current_file is not None:
+                finalize_file()
+                file_old_changed = set()
+                file_new_changed = set()
+            in_hunk = False
+            old_file = None
+            new_file = None
+            current_file = None
+            old_line = None
+            new_line = None
+            continue
+        if line.startswith("--- "):
+            old_file = line[4:].strip()
+            continue
+        if line.startswith("+++ "):
+            new_file = line[4:].strip()
+            token_new = (new_file or "").split()[0]
+            token_old = (old_file or "").split()[0]
+            if token_new and not token_new.endswith("/dev/null"):
+                current_file = _strip_ab_prefix(token_new)
+            elif token_old and not token_old.endswith("/dev/null"):
+                current_file = _strip_ab_prefix(token_old)
+            else:
+                current_file = token_new or token_old or ""
+            continue
+        if line.startswith("Binary files "):
+            m = re.search(r"and\s+b/(.+?)\s+differ", line)
+            if m:
+                files.add(m.group(1))
+            else:
+                m2 = re.search(r"a/(.+?)\s+and", line)
+                if m2:
+                    files.add(m2.group(1))
+            continue
+        m = hunk_re.match(line)
+        if m:
+            in_hunk = True
+            old_line = int(m.group(1))
+            new_line = int(m.group(3))
+            continue
+        if in_hunk and current_file is not None:
+            if line.startswith(" "):
+                old_line += 1
+                new_line += 1
+            elif line.startswith("-"):
+                file_old_changed.add(old_line)
+                old_line += 1
+            elif line.startswith("+"):
+                file_new_changed.add(new_line)
+                new_line += 1
+
+    if current_file is not None:
+        finalize_file()
+
+    return files, new_blocks, old_blocks
+
+
+def file_iou(gt_patch: str, pred_patch: str) -> float:
+    gt_files, _, _ = parse_unified_diff(gt_patch)
+    pred_files, _, _ = parse_unified_diff(pred_patch)
+    inter = len(gt_files & pred_files)
+    union = len(gt_files | pred_files)
+    if union == 0:
+        return 1.0
+    return inter / union
+
+
+def block_iou(gt_patch: str, pred_patch: str) -> float:
+    gt_files, gt_new, gt_old = parse_unified_diff(gt_patch)
+    pred_files, pred_new, pred_old = parse_unified_diff(pred_patch)
+    all_files = set(gt_files) | set(pred_files)
+    inter_sum = 0
+    union_sum = 0
+    for side in ("new", "old"):
+        for f in all_files:
+            g = merge_intervals((gt_new if side == "new" else gt_old).get(f, []))
+            p = merge_intervals((pred_new if side == "new" else pred_old).get(f, []))
+            if not g and not p:
+                continue
+            inter_len = intervals_intersection_length(g, p)
+            union_len = intervals_length(merge_intervals(g + p))
+            inter_sum += inter_len
+            union_sum += union_len
+    if union_sum == 0:
+        return 1.0
+    return inter_sum / union_sum
+
+
+def compute_git_patch_ious(gt_patch: str, pred_patch: str) -> Dict[str, float]:
+    return {
+        "file_iou": file_iou(gt_patch, pred_patch),
+        "block_iou": block_iou(gt_patch, pred_patch),
+    }
+
+
+# =====================================================================
+# Reward computation
+# =====================================================================
+
+
+def compute_rewards(results: List[dict], file_iou_coef: float = 0.0, block_iou_coef: float = 0.0) -> List[dict]:
+    """
+    Compute rewards for each result returned by ProRL Agent Server.
+
+    Each result dict should have keys: success, resolved, finish, error, git_patch, messages, instance.
+    Returns the same list of dicts with added keys: score, file_iou, block_iou.
+    """
+    for r in results:
+        gt_patch = r.get("instance", {}).get("patch", "") or ""
+        pred_patch = r.get("git_patch", "") or ""
+
+        r["resolved"] = bool(r.get("resolved", False))
+        r["finish"] = bool(r.get("finish", False))
+        r["error"] = r.get("error", None) or ""
+
+        if gt_patch and pred_patch:
+            ious = compute_git_patch_ious(gt_patch, pred_patch)
+            r["file_iou"] = ious["file_iou"]
+            r["block_iou"] = ious["block_iou"]
+        else:
+            r["file_iou"] = 0.0
+            r["block_iou"] = 0.0
+
+        base_score = 1.0 if r["resolved"] else 0.0
+        converted_file_iou = 1.0 if r["resolved"] else r["file_iou"]
+        converted_block_iou = 1.0 if r["resolved"] else r["block_iou"]
+        r["score"] = base_score + file_iou_coef * converted_file_iou + block_iou_coef * converted_block_iou
+
+    return results
+
+
+# =====================================================================
+# Metrics: process_validation_metrics
+# =====================================================================
+
+
+def bootstrap_metric(
+    data: list,
+    subset_size: int,
+    reduce_fns: list,
+    n_bootstrap: int = 1000,
+    seed: int = 42,
+) -> list:
+    np.random.seed(seed)
+    bootstrap_metric_lsts = [[] for _ in range(len(reduce_fns))]
+    for _ in range(n_bootstrap):
+        bootstrap_idxs = np.random.choice(len(data), size=subset_size, replace=True)
+        bootstrap_data = [data[i] for i in bootstrap_idxs]
+        for i, reduce_fn in enumerate(reduce_fns):
+            bootstrap_metric_lsts[i].append(reduce_fn(bootstrap_data))
+    return [(np.mean(lst), np.std(lst)) for lst in bootstrap_metric_lsts]
+
+
+def calc_maj_val(data: list, vote_key: str, val_key: str) -> float:
+    vote2vals = defaultdict(list)
+    for d in data:
+        vote2vals[d[vote_key]].append(d[val_key])
+    vote2cnt = {k: len(v) for k, v in vote2vals.items()}
+    maj_vote = max(vote2cnt, key=vote2cnt.get)
+    return vote2vals[maj_vote][0]
+
+
+def process_validation_metrics(
+    data_sources: list,
+    sample_inputs: list,
+    infos_dict: dict,
+    seed: int = 42,
+) -> dict:
+    """Group by data_source and prompt, then compute mean, best@N, worst@N, maj@N."""
+    data_src2prompt2var2vals = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+    for idx, ds in enumerate(data_sources):
+        prompt = sample_inputs[idx]
+        var2vals = data_src2prompt2var2vals[ds][prompt]
+        for var_name, var_vals in infos_dict.items():
+            var2vals[var_name].append(var_vals[idx])
+
+    data_src2prompt2var2metric = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
+    for ds, prompt2var2vals in data_src2prompt2var2vals.items():
+        for prompt, var2vals in prompt2var2vals.items():
+            for var_name, var_vals in var2vals.items():
+                if isinstance(var_vals[0], str):
+                    continue
+                metric = {}
+                n_resps = len(var_vals)
+                metric[f"mean@{n_resps}"] = np.mean(var_vals)
+                if n_resps > 1:
+                    metric[f"std@{n_resps}"] = np.std(var_vals)
+                    ns = []
+                    n = 2
+                    while n < n_resps:
+                        ns.append(n)
+                        n *= 2
+                    ns.append(n_resps)
+                    for n in ns:
+                        [(bon_mean, bon_std), (won_mean, won_std)] = bootstrap_metric(
+                            data=var_vals, subset_size=n, reduce_fns=[np.max, np.min], seed=seed
+                        )
+                        metric[f"best@{n}/mean"], metric[f"best@{n}/std"] = bon_mean, bon_std
+                        metric[f"worst@{n}/mean"], metric[f"worst@{n}/std"] = won_mean, won_std
+                data_src2prompt2var2metric[ds][prompt][var_name] = metric
+
+    data_src2var2metric2prompt_vals = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+    for ds, prompt2var2metric in data_src2prompt2var2metric.items():
+        for prompt, var2metric in prompt2var2metric.items():
+            for var_name, metric in var2metric.items():
+                for metric_name, metric_val in metric.items():
+                    data_src2var2metric2prompt_vals[ds][var_name][metric_name].append(metric_val)
+
+    data_src2var2metric2val = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
+    for ds, var2metric2prompt_vals in data_src2var2metric2prompt_vals.items():
+        for var_name, metric2prompt_vals in var2metric2prompt_vals.items():
+            for metric_name, prompt_vals in metric2prompt_vals.items():
+                data_src2var2metric2val[ds][var_name][metric_name] = np.mean(prompt_vals)
+
+    return data_src2var2metric2val
+
+
+# =====================================================================
+# Pass@k
+# =====================================================================
+
+
+def calculate_pass_at_k(n: int, c: int, k: int) -> float:
+    if n - c < k:
+        return 1.0
+    return 1.0 - float(comb(n - c, k)) / float(comb(n, k))
+
+
+def compute_pass_k_metrics(problem_results: dict, k_values: list = None) -> dict:
+    if k_values is None:
+        k_values = [1, 4, 8, 16]
+    results = {}
+    for k in k_values:
+        pass_at_k_scores = []
+        valid_problems = 0
+        for pid, stats in problem_results.items():
+            n = stats["total"]
+            c = stats["correct"]
+            if n >= k:
+                pass_at_k_scores.append(calculate_pass_at_k(n, c, k))
+                valid_problems += 1
+        if pass_at_k_scores:
+            results[f"pass@{k}"] = {
+                "score": float(np.mean(pass_at_k_scores)),
+                "std": float(np.std(pass_at_k_scores)),
+                "num_problems": valid_problems,
+            }
+        else:
+            results[f"pass@{k}"] = {"score": 0.0, "std": 0.0, "num_problems": 0}
+    return results
+
+
+# =====================================================================
+# Data loading
+# =====================================================================
+
+
+def load_test_data(parquet_path: str) -> List[dict]:
+    """
+    Load test instances from a parquet file.
+
+    Expected columns:
+      - prompt: JSON string of chat messages [{role, content}, ...]
+      - instance: dict with instance_id, problem_statement, patch, etc.
+      - data_source: str (e.g. "SWE-Gym/SWE-Gym")
+      - extra_info: dict with name, index, split, etc.
+    """
+    df = pd.read_parquet(parquet_path)
+    records = []
+    for i, row in df.iterrows():
+        record = {}
+
+        # Parse instance
+        instance = row.get("instance", {})
+        if isinstance(instance, str):
+            instance = json.loads(instance)
+        record["instance"] = instance
+
+        # Data source
+        record["data_source"] = row.get("data_source", "unknown")
+
+        # Extra info
+        extra_info = row.get("extra_info", {})
+        if isinstance(extra_info, str):
+            extra_info = json.loads(extra_info)
+        record["extra_info"] = extra_info
+
+        # Build instance_id if not present
+        if "instance_id" not in record["instance"]:
+            ds = record["data_source"]
+            name = extra_info.get("name", "")
+            split = extra_info.get("split", "")
+            index = extra_info.get("index", i)
+            record["instance"]["instance_id"] = f"{ds}/{name}/{split}/{index}"
+
+        # Prompt text (for input dedup / pass@k grouping)
+        prompt = row.get("prompt", "")
+        if isinstance(prompt, str):
+            try:
+                prompt = json.loads(prompt)
+            except (json.JSONDecodeError, TypeError):
+                pass
+        if isinstance(prompt, list):
+            record["prompt_text"] = " ".join(m.get("content", "") for m in prompt if isinstance(m, dict))
+        else:
+            record["prompt_text"] = str(prompt)
+
+        records.append(record)
+
+    logger.info(f"Loaded {len(records)} test instances from {parquet_path}")
+    return records
+
+
+# =====================================================================
+# ProRL Agent Server interaction
+# =====================================================================
+
+
+async def send_to_ProRL_Agent_Server(
+    session: aiohttp.ClientSession,
+    ProRL_Agent_Server_url: str,
+    instance: dict,
+    sampling_params: dict,
+    timeout_seconds: int = 1500,
+) -> dict:
+    """Send a single instance to a ProRL Agent Server server's /process endpoint."""
+    request_data = {"instance": instance, "sampling_params": sampling_params}
+    try:
+        timeout = aiohttp.ClientTimeout(total=timeout_seconds)
+        async with session.post(
+            f"{ProRL_Agent_Server_url}/process",
+            headers={"Content-Type": "application/json"},
+            json=request_data,
+            timeout=timeout,
+        ) as resp:
+            if resp.status == 200:
+                data = await resp.json()
+                messages = data.get("messages", [])
+                if messages and len(messages) > 0:
+                    return data
+                else:
+                    logger.warning(f"Empty messages from {ProRL_Agent_Server_url} for {instance.get('instance_id', '?')}")
+                    return {"success": False, "messages": [], "resolved": False, "finish": False, "error": "Empty response", "git_patch": ""}
+            else:
+                error_text = await resp.text()
+                logger.error(f"HTTP {resp.status} from {ProRL_Agent_Server_url}: {error_text}")
+                return {"success": False, "messages": [], "resolved": False, "finish": False, "error": f"HTTP {resp.status}", "git_patch": ""}
+    except asyncio.TimeoutError:
+        logger.error(f"Timeout for instance {instance.get('instance_id', '?')}")
+        return {"success": False, "messages": [], "resolved": False, "finish": False, "error": "Timeout", "git_patch": ""}
+    except Exception as e:
+        logger.error(f"Error for instance {instance.get('instance_id', '?')}: {e}")
+        return {"success": False, "messages": [], "resolved": False, "finish": False, "error": str(e), "git_patch": ""}
+
+
+async def start_ProRL_Agent_Server_server(session: aiohttp.ClientSession, url: str):
+    try:
+        timeout = aiohttp.ClientTimeout(total=60)
+        async with session.post(f"{url}/start", timeout=timeout) as resp:
+            if resp.status == 200:
+                logger.info(f"Started ProRL Agent Server server: {url}")
+            else:
+                logger.warning(f"Failed to start {url}: HTTP {resp.status}")
+    except Exception as e:
+        logger.warning(f"Failed to start {url}: {e}")
+
+
+async def stop_ProRL_Agent_Server_server(session: aiohttp.ClientSession, url: str):
+    try:
+        timeout = aiohttp.ClientTimeout(total=30)
+        async with session.post(f"{url}/stop", timeout=timeout) as resp:
+            if resp.status == 200:
+                logger.info(f"Stopped ProRL Agent Server server: {url}")
+    except Exception as e:
+        logger.warning(f"Failed to stop {url}: {e}")
+
+
+# =====================================================================
+# LLM Server Management
+# =====================================================================
+
+
+async def clear_llm_servers_from_ProRL_Agent_Server(session: aiohttp.ClientSession, ProRL_Agent_Server_url: str):
+    """Clear existing LLM servers from a ProRL Agent Server."""
+    try:
+        timeout = aiohttp.ClientTimeout(total=30)
+        async with session.post(f"{ProRL_Agent_Server_url}/clear_llm_server", timeout=timeout) as resp:
+            if resp.status == 200:
+                result = await resp.json()
+                logger.info(f"Cleared LLM servers from {ProRL_Agent_Server_url}: {result}")
+            else:
+                error_text = await resp.text()
+                logger.warning(f"Failed to clear LLM servers from {ProRL_Agent_Server_url}: HTTP {resp.status}: {error_text}")
+    except Exception as e:
+        logger.warning(f"Failed to clear LLM servers from {ProRL_Agent_Server_url}: {e}")
+
+
+async def add_llm_server_to_ProRL_Agent_Server(session: aiohttp.ClientSession, ProRL_Agent_Server_url: str, llm_address: str):
+    """Register a single LLM server address with a ProRL Agent Server server."""
+    try:
+        timeout = aiohttp.ClientTimeout(total=30)
+        payload = {"address": llm_address}
+        async with session.post(f"{ProRL_Agent_Server_url}/add_llm_server", json=payload, timeout=timeout) as resp:
+            if resp.status == 200:
+                result = await resp.json()
+                logger.info(f"Added LLM server {llm_address} to {ProRL_Agent_Server_url}: {result}")
+            else:
+                error_text = await resp.text()
+                logger.error(f"Failed to add LLM server {llm_address} to {ProRL_Agent_Server_url}: HTTP {resp.status}: {error_text}")
+    except Exception as e:
+        logger.error(f"Failed to add LLM server {llm_address} to {ProRL_Agent_Server_url}: {e}")
+
+
+def _url_to_ip(url: str) -> str:
+    """Extract IP address from URL by removing protocol and port."""
+    url = url.replace("http://", "").replace("https://", "")
+    return url.split(":")[0]
+
+
+def assign_llm_servers_to_ProRL_Agent_Server(
+    ProRL_Agent_Server_urls: List[str], llm_server_urls: List[str]
+) -> Dict[str, List[str]]:
+    """
+    Assign LLM servers to ProRL Agent Server servers based on IP locality.
+
+    Phase 1: Assign LLM servers to ProRL Agent Server servers on the same IP.
+    Phase 2: Distribute remaining LLM servers evenly across all ProRL Agent Server servers.
+    """
+    assignments: Dict[str, List[str]] = {url: [] for url in ProRL_Agent_Server_urls}
+    assigned: Set[str] = set()
+
+    # Phase 1: Locality-based assignment
+    for llm_url in llm_server_urls:
+        llm_ip = _url_to_ip(llm_url)
+        for oh_url in ProRL_Agent_Server_urls:
+            oh_ip = _url_to_ip(oh_url)
+            if llm_ip == oh_ip:
+                assignments[oh_url].append(llm_url)
+                assigned.add(llm_url)
+                break
+
+    # Phase 2: Distribute remaining evenly (round-robin)
+    remaining = [url for url in llm_server_urls if url not in assigned]
+    if remaining:
+        logger.info(f"Distributing {len(remaining)} remaining LLM servers across {len(ProRL_Agent_Server_urls)} ProRL Agent Server servers")
+        for i, llm_url in enumerate(remaining):
+            oh_url = ProRL_Agent_Server_urls[i % len(ProRL_Agent_Server_urls)]
+            assignments[oh_url].append(llm_url)
+
+    return assignments
+
+
+async def wait_for_server_health(urls: List[str], timeout: int = 600):
+    """Wait for all servers to become healthy by polling /health endpoint."""
+    logger.info(f"Waiting for {len(urls)} servers to become healthy (timeout: {timeout}s)...")
+    start = time.time()
+    async with aiohttp.ClientSession() as session:
+        for url in urls:
+            healthy = False
+            while time.time() - start < timeout:
+                try:
+                    health_url = f"{url}/health"
+                    async with session.get(health_url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
+                        if resp.status == 200:
+                            logger.info(f"Server {url} is healthy")
+                            healthy = True
+                            break
+                except Exception:
+                    pass
+                await asyncio.sleep(5)
+            if not healthy:
+                raise TimeoutError(f"Server {url} did not become healthy within {timeout}s")
+    logger.info("All servers are healthy")
+
+
+async def setup_llm_servers_with_ProRL_Agent_Server(
+    ProRL_Agent_Server_urls: List[str],
+    llm_server_urls: List[str],
+    token_level_generation: bool = False,
+):
+    """
+    Clear existing LLM servers and register new ones with all ProRL Agent Server servers.
+
+    Steps:
+    1. Wait for all LLM servers to be healthy.
+    2. Clear existing LLM server registrations from all ProRL Agent Server servers.
+    3. Compute locality-based assignment of LLM servers to ProRL Agent Server servers.
+    4. Register each LLM server with its assigned ProRL Agent Server server.
+    """
+    # Step 1: Wait for LLM servers to be healthy
+    await wait_for_server_health(llm_server_urls)
+
+    connector = aiohttp.TCPConnector(limit=0)
+    async with aiohttp.ClientSession(connector=connector) as session:
+        # Step 2: Clear existing LLM servers
+        logger.info("Clearing existing LLM servers from all ProRL Agent Server servers...")
+        clear_tasks = [clear_llm_servers_from_ProRL_Agent_Server(session, url) for url in ProRL_Agent_Server_urls]
+        await asyncio.gather(*clear_tasks)
+
+        # Step 3: Compute assignment
+        assignments = assign_llm_servers_to_ProRL_Agent_Server(ProRL_Agent_Server_urls, llm_server_urls)
+
+        # Step 4: Register LLM servers
+        logger.info(f"Registering {len(llm_server_urls)} LLM servers with {len(ProRL_Agent_Server_urls)} ProRL Agent Server servers...")
+        add_tasks = []
+        for oh_url, llm_urls in assignments.items():
+            for llm_url in llm_urls:
+                # Format address based on token_level_generation mode
+                if token_level_generation:
+                    address = llm_url if llm_url.startswith("http") else f"http://{llm_url}"
+                else:
+                    base = llm_url if llm_url.startswith("http") else f"http://{llm_url}"
+                    address = f"{base}/v1" if not base.endswith("/v1") else base
+                add_tasks.append(add_llm_server_to_ProRL_Agent_Server(session, oh_url, address))
+        await asyncio.gather(*add_tasks)
+
+        # Log summary
+        for oh_url, llm_urls in assignments.items():
+            logger.info(f"  {oh_url} -> {llm_urls}")
+        logger.info("LLM server registration complete")
+
+
+async def run_ProRL_Agent_Server_evaluation(
+    instances: List[dict],
+    ProRL_Agent_Server_urls: List[str],
+    sampling_params: dict,
+    num_trajectories: int = 1,
+    num_workers_per_server: int = 64,
+    max_retries: int = 2,
+    timeout_seconds: int = 1500,
+) -> List[dict]:
+    """
+    Send all instances to ProRL Agent Server servers with load balancing.
+
+    Returns a flat list of result dicts (one per instance * trajectory).
+    Each result has the original instance attached.
+    """
+    # Build job list: expand each instance into num_trajectories copies
+    jobs = []
+    for inst in instances:
+        for traj_id in range(num_trajectories):
+            job_instance = inst.copy()
+            job_instance["trajectory_id"] = traj_id
+            jobs.append(job_instance)
+
+    logger.info(f"Total jobs: {len(jobs)} ({len(instances)} instances x {num_trajectories} trajectories)")
+
+    # Build server worker pool
+    server_workers = []
+    for url in ProRL_Agent_Server_urls:
+        for w in range(num_workers_per_server):
+            server_workers.append(url)
+
+    results = [None] * len(jobs)
+    job_queue = asyncio.Queue()
+    for i, job in enumerate(jobs):
+        await job_queue.put((i, job, 0))
+
+    server_queue = asyncio.Queue()
+    for sw in server_workers:
+        await server_queue.put(sw)
+
+    completed = 0
+    total = len(jobs)
+    processing_start_time = time.time()
+
+    connector = aiohttp.TCPConnector(limit=0)
+    async with aiohttp.ClientSession(connector=connector) as session:
+        # Start all servers
+        start_tasks = [start_ProRL_Agent_Server_server(session, url) for url in ProRL_Agent_Server_urls]
+        await asyncio.gather(*start_tasks)
+        logger.info("All ProRL Agent Server servers started")
+
+        async def worker():
+            nonlocal completed
+            while True:
+                try:
+                    idx, instance, retry_count = await asyncio.wait_for(job_queue.get(), timeout=5.0)
+                except asyncio.TimeoutError:
+                    if job_queue.empty():
+                        break
+                    continue
+
+                server_url = await server_queue.get()
+                try:
+                    result = await send_to_ProRL_Agent_Server(session, server_url, instance, sampling_params, timeout_seconds)
+                    result["instance"] = instance
+
+                    if not result.get("success", False) and not result.get("messages") and retry_count < max_retries:
+                        await job_queue.put((idx, instance, retry_count + 1))
+                        logger.info(f"Retrying job {idx} (attempt {retry_count + 1})")
+                    else:
+                        results[idx] = result
+                        completed += 1
+                        if completed % 10 == 0 or completed == total:
+                            # Compute resolved/finish ratios from completed results
+                            resolved_count = 0
+                            finish_count = 0
+                            all_results_count = 0
+                            resolved_instances = set()
+                            for r in results:
+                                if r is None:
+                                    continue
+                                all_results_count += 1
+                                inst_id = r.get("instance", {}).get("instance_id", "")
+                                if r.get("resolved", False):
+                                    resolved_count += 1
+                                    resolved_instances.add(inst_id)
+                                if r.get("finish", False):
+                                    finish_count += 1
+                            resolved_ratio = resolved_count / all_results_count if all_results_count else 0
+                            finish_ratio = finish_count / all_results_count if all_results_count else 0
+                            total_instances = len(instances)
+                            resolved_instance_ratio = len(resolved_instances) / total_instances if total_instances else 0
+                            elapsed_time = time.time() - processing_start_time
+                            logger.info(
+                                f"Progress: {completed}/{total} ({completed / total * 100:.1f}%), "
+                                f"elapsed: {elapsed_time:.1f}s, "
+                                f"resolved: {resolved_ratio:.2f}, finish: {finish_ratio:.2f}, "
+                                f"resolved_instance: {resolved_instance_ratio:.2f}"
+                            )
+                finally:
+                    await server_queue.put(server_url)
+
+        # Run workers concurrently
+        num_concurrent = len(server_workers)
+        workers = [asyncio.create_task(worker()) for _ in range(num_concurrent)]
+        await asyncio.gather(*workers)
+
+        # Stop all servers
+        stop_tasks = [stop_ProRL_Agent_Server_server(session, url) for url in ProRL_Agent_Server_urls]
+        await asyncio.gather(*stop_tasks)
+
+    # Fill in any None results (jobs that exhausted all retries)
+    for i, r in enumerate(results):
+        if r is None:
+            results[i] = {
+                "success": False,
+                "messages": [],
+                "resolved": False,
+                "finish": False,
+                "error": "All retries exhausted",
+                "git_patch": "",
+                "instance": jobs[i],
+            }
+
+    return results
+
+
+# =====================================================================
+# Main evaluation logic
+# =====================================================================
+
+
+def save_results_jsonl(results: List[dict], output_path: str):
+    """Save results as JSONL."""
+    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+    with open(output_path, "w") as f:
+        for r in results:
+            record = {
+                "instance_id": r.get("instance", {}).get("instance_id", ""),
+                "trajectory_id": r.get("instance", {}).get("trajectory_id", 0),
+                "score": r.get("score", 0.0),
+                "resolved": r.get("resolved", False),
+                "finish": r.get("finish", False),
+                "error": r.get("error", ""),
+                "file_iou": r.get("file_iou", 0.0),
+                "block_iou": r.get("block_iou", 0.0),
+                "git_patch": r.get("git_patch", ""),
+                "num_messages": len(r.get("messages", [])),
+            }
+            f.write(json.dumps(record) + "\n")
+    logger.info(f"Saved {len(results)} results to {output_path}")
+
+
+def print_summary(results: List[dict], instances: List[dict], num_trajectories: int, k_values: List[int]):
+    """Print evaluation summary with metrics."""
+    print("\n" + "=" * 60)
+    print("EVALUATION SUMMARY")
+    print("=" * 60)
+
+    total = len(results)
+    resolved_count = sum(1 for r in results if r.get("resolved", False))
+    finish_count = sum(1 for r in results if r.get("finish", False))
+    error_count = sum(1 for r in results if r.get("error") and r["error"] not in ("", None))
+    max_turn_count = sum(
+        1 for r in results
+        if r.get("error") and "maximum iteration" in str(r["error"])
+    )
+    stuck_count = sum(
+        1 for r in results
+        if r.get("error") and "stuck in a loop" in str(r["error"])
+    )
+
+    scores = [r.get("score", 0.0) for r in results]
+    file_ious = [r.get("file_iou", 0.0) for r in results]
+    block_ious = [r.get("block_iou", 0.0) for r in results]
+
+    print(f"Total instances:       {len(instances)}")
+    print(f"Trajectories per inst: {num_trajectories}")
+    print(f"Total evaluations:     {total}")
+    print(f"Resolved:              {resolved_count}/{total} ({resolved_count / total * 100:.1f}%)")
+    print(f"Finish action:         {finish_count}/{total} ({finish_count / total * 100:.1f}%)")
+    print(f"Max turn reached:      {max_turn_count}/{total} ({max_turn_count / total * 100:.1f}%)")
+    print(f"Stuck in loop:         {stuck_count}/{total} ({stuck_count / total * 100:.1f}%)")
+    print(f"Has error:             {error_count}/{total}")
+    print(f"Mean score:            {np.mean(scores):.4f}")
+    print(f"Mean file IoU:         {np.mean(file_ious):.4f}")
+    print(f"Mean block IoU:        {np.mean(block_ious):.4f}")
+
+    # Group by data source for per-source metrics
+    data_sources = [r.get("instance", {}).get("data_source", "unknown") for r in results]
+    unique_sources = sorted(set(data_sources))
+    if len(unique_sources) > 1:
+        print(f"\nPer data-source breakdown:")
+        for src in unique_sources:
+            src_scores = [s for s, ds in zip(scores, data_sources) if ds == src]
+            src_resolved = sum(1 for r, ds in zip(results, data_sources) if ds == src and r.get("resolved"))
+            print(f"  {src}: resolved={src_resolved}/{len(src_scores)} mean_score={np.mean(src_scores):.4f}")
+
+    # Compute process_validation_metrics
+    sample_inputs = [r.get("instance", {}).get("instance_id", str(i)) for i, r in enumerate(results)]
+    infos_dict = {
+        "reward": scores,
+        "acc": [1.0 if r.get("resolved") else 0.0 for r in results],
+    }
+    val_metrics = process_validation_metrics(data_sources, sample_inputs, infos_dict)
+    print(f"\nValidation metrics (process_validation_metrics):")
+    for ds, var2metric2val in val_metrics.items():
+        for var_name, metric2val in var2metric2val.items():
+            for metric_name, val in metric2val.items():
+                print(f"  {ds}/{var_name}/{metric_name}: {val:.4f}")
+
+    # Compute pass@k
+    print(f"\n{'=' * 50}")
+    print("PASS@K RESULTS")
+    print("=" * 50)
+
+    problem_results = defaultdict(lambda: {"total": 0, "correct": 0})
+    for r in results:
+        problem_id = r.get("instance", {}).get("instance_id", "unknown")
+        problem_results[problem_id]["total"] += 1
+        if r.get("resolved", False):
+            problem_results[problem_id]["correct"] += 1
+
+    pass_k = compute_pass_k_metrics(dict(problem_results), k_values)
+    for k_metric, k_result in sorted(pass_k.items()):
+        print(f"  {k_metric}: {k_result['score']:.4f} +/- {k_result['std']:.4f} (n={k_result['num_problems']})")
+
+    print("=" * 60)
+    return val_metrics, pass_k
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Standalone SWE-bench validation script")
+    parser.add_argument("--data_path", type=str, required=True, help="Path to test parquet file")
+    parser.add_argument(
+        "--ProRL_Agent_Server_urls",
+        type=str,
+        required=True,
+        help="ProRL Agent Server URLs separated by '+' (e.g. http://host1:8006+http://host2:8006)",
+    )
+    parser.add_argument(
+        "--llm_server_urls",
+        type=str,
+        default=None,
+        help="vLLM server URLs separated by '+' (e.g. http://host1:8100+http://host2:8100). "
+             "If provided, these will be registered with ProRL Agent Server servers before evaluation.",
+    )
+    parser.add_argument("--model_name", type=str, default="Qwen/Qwen3-32B", help="Model name for vLLM")
+    parser.add_argument("--output_dir", type=str, default="./test_results", help="Output directory")
+    parser.add_argument("--num_trajectories", type=int, default=1, help="Number of trajectories per instance")
+    parser.add_argument("--num_workers_per_server", type=int, default=64, help="Number of concurrent workers per ProRL Agent Server server")
+    parser.add_argument("--max_retries", type=int, default=2, help="Max retries per job")
+    parser.add_argument("--temperature", type=float, default=0.0, help="Sampling temperature (0 = greedy)")
+    parser.add_argument("--top_p", type=float, default=1.0, help="Top-p sampling")
+    parser.add_argument("--max_iterations", type=int, default=50, help="Max agent iterations")
+    parser.add_argument("--max_output_tokens", type=int, default=1536, help="Max output tokens per turn")
+    parser.add_argument("--max_model_len", type=int, default=32768, help="Max model context length")
+    parser.add_argument("--timeout", type=int, default=1500, help="Timeout per request (seconds)")
+    parser.add_argument("--hint_mode", type=str, default="none", help="Hint mode: none, file, old_str")
+    parser.add_argument("--file_iou_coef", type=float, default=0.0, help="Coefficient for file IoU in reward")
+    parser.add_argument("--block_iou_coef", type=float, default=0.0, help="Coefficient for block IoU in reward")
+    parser.add_argument("--k_values", type=str, default="1,4,8,16", help="Comma-separated k values for pass@k")
+    parser.add_argument("--token_level_generation", action="store_true", help="Enable token-level generation")
+    parser.add_argument("--custom_tokenizer", type=str, default=None, help="Custom tokenizer path (defaults to model_name)")
+    parser.add_argument("--native_tool_calling", action="store_true", default=True, help="Enable native tool calling")
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    # Parse ProRL Agent Server URLs
+    ProRL_Agent_Server_urls = [url.strip() for url in args.ProRL_Agent_Server_urls.split("+") if url.strip()]
+    logger.info(f"ProRL Agent Server servers: {ProRL_Agent_Server_urls}")
+
+    # Register LLM servers with ProRL Agent Server if provided
+    if args.llm_server_urls:
+        llm_server_urls = [url.strip() for url in args.llm_server_urls.split("+") if url.strip()]
+        logger.info(f"LLM servers: {llm_server_urls}")
+        asyncio.run(
+            setup_llm_servers_with_ProRL_Agent_Server(
+                ProRL_Agent_Server_urls, llm_server_urls, args.token_level_generation
+            )
+        )
+
+    # Parse k values
+    k_values = [int(k.strip()) for k in args.k_values.split(",")]
+
+    # Load test data
+    instances = load_test_data(args.data_path)
+
+    # Set hint_mode on all instances
+    for inst in instances:
+        inst["instance"]["hint_mode"] = args.hint_mode
+
+    # Build sampling params
+    sampling_params = {
+        "model": f"hosted_vllm/{args.model_name}",
+        "api_key": "dummy_key",
+        "modify_params": False,
+        "log_completions": False,
+        "native_tool_calling": args.native_tool_calling,
+        "temperature": args.temperature,
+        "top_p": args.top_p,
+        "max_iterations": args.max_iterations,
+        "max_output_tokens": args.max_output_tokens,
+        "token_level_generation": args.token_level_generation,
+        "custom_tokenizer": args.custom_tokenizer or args.model_name,
+        "max_model_len": args.max_model_len,
+        "ensure_thinking_end_properly": not args.token_level_generation,
+    }
+    logger.info(f"Sampling params: {json.dumps(sampling_params, indent=2)}")
+
+    # Run evaluation
+    start_time = time.time()
+    results = asyncio.run(
+        run_ProRL_Agent_Server_evaluation(
+            instances=[inst["instance"] for inst in instances],
+            ProRL_Agent_Server_urls=ProRL_Agent_Server_urls,
+            sampling_params=sampling_params,
+            num_trajectories=args.num_trajectories,
+            num_workers_per_server=args.num_workers_per_server,
+            max_retries=args.max_retries,
+            timeout_seconds=args.timeout,
+        )
+    )
+    elapsed = time.time() - start_time
+    logger.info(f"Evaluation took {elapsed:.1f}s")
+
+    # Attach data_source back to results
+    for r in results:
+        inst_id = r.get("instance", {}).get("instance_id", "")
+        # Find matching source instance
+        for src_inst in instances:
+            if src_inst["instance"].get("instance_id") == inst_id.replace(f"/{r['instance'].get('trajectory_id', 0)}", ""):
+                r["instance"]["data_source"] = src_inst.get("data_source", "unknown")
+                break
+        if "data_source" not in r.get("instance", {}):
+            r["instance"]["data_source"] = "unknown"
+
+    # Compute rewards
+    results = compute_rewards(results, file_iou_coef=args.file_iou_coef, block_iou_coef=args.block_iou_coef)
+
+    # Save results
+    os.makedirs(args.output_dir, exist_ok=True)
+    output_path = os.path.join(args.output_dir, "results.jsonl")
+    save_results_jsonl(results, output_path)
+
+    # Print summary
+    val_metrics, pass_k = print_summary(results, instances, args.num_trajectories, k_values)
+
+    # Save metrics as JSON
+    metrics_path = os.path.join(args.output_dir, "metrics.json")
+    all_metrics = {
+        "elapsed_seconds": elapsed,
+        "num_instances": len(instances),
+        "num_trajectories": args.num_trajectories,
+        "total_evaluations": len(results),
+        "resolved_count": sum(1 for r in results if r.get("resolved")),
+        "resolve_rate": sum(1 for r in results if r.get("resolved")) / len(results) if results else 0,
+        "mean_score": float(np.mean([r.get("score", 0) for r in results])),
+        "mean_file_iou": float(np.mean([r.get("file_iou", 0) for r in results])),
+        "mean_block_iou": float(np.mean([r.get("block_iou", 0) for r in results])),
+        "pass_k": {k: v for k, v in pass_k.items()},
+        "sampling_params": sampling_params,
+    }
+    with open(metrics_path, "w") as f:
+        json.dump(all_metrics, f, indent=2)
+    logger.info(f"Saved metrics to {metrics_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/tests/vllm_api_server.py b/scripts/tests/vllm_api_server.py
new file mode 100644
index 000000000..e39ebc18d
--- /dev/null
+++ b/scripts/tests/vllm_api_server.py
@@ -0,0 +1,184 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+NOTE: This API server is used only for demonstrating usage of AsyncEngine
+and simple performance benchmarks. It is not intended for production use.
+For production use, we recommend using our OpenAI compatible server.
+We are also not going to accept PRs modifying this file, please
+change `vllm/entrypoints/openai/api_server.py` instead.
+"""
+
+import asyncio
+import ssl
+from argparse import Namespace
+from typing import Any, Optional
+
+import vllm.envs as envs
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse, Response
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.entrypoints.launcher import serve_http
+from vllm.entrypoints.utils import with_cancellation
+from vllm.inputs import TokensPrompt
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.usage.usage_lib import UsageContext
+from vllm.utils import FlexibleArgumentParser, random_uuid, set_ulimit
+from vllm.version import __version__ as VLLM_VERSION
+
+logger = init_logger('vllm.nvidia.api_server')
+
+app = FastAPI()
+engine = None
+
+
+@app.get('/health')
+async def health() -> Response:
+    """Health check."""
+    return Response(status_code=200)
+
+
+@app.post('/generate')
+async def generate(request: Request) -> Response:
+    """Generate completion for the request.
+
+    The request should be a JSON object with the following fields:
+    - prompt: the prompt to use for the generation.
+    - stream: whether to stream the results or not.
+    - other fields: the sampling parameters (See `SamplingParams` for details).
+    """
+    request_dict = await request.json()
+    return await _generate(request_dict, raw_request=request)
+
+
+@with_cancellation
+async def _generate(request_dict: dict, raw_request: Request) -> list[int]:
+    prompt_ids = request_dict.pop('prompt_ids')
+    # Set logprobs to 0 to only return selected tokens
+    request_dict['logprobs'] = 0
+    sampling_params = SamplingParams(**request_dict)
+    request_id = random_uuid()
+
+    # Create prompt from token IDs
+    prompt = TokensPrompt(prompt_token_ids=prompt_ids)
+    generator = engine.generate(
+        prompt=prompt, sampling_params=sampling_params, request_id=request_id
+    )
+
+    # Get final response
+    final_res: Optional[RequestOutput] = None
+    try:
+        async for output in generator:
+            final_res = output
+    except asyncio.CancelledError:
+        return Response(status_code=499)
+
+    assert final_res is not None
+
+    def obtain_logprobs(logprobs):
+        if logprobs is None:
+            return None
+        log_probs = []
+        for d in logprobs:
+            cur_logprobs = list(d.values())
+            assert len(cur_logprobs) == 1, f"Expected 1 logprob per token when logprobs=0, but got {len(cur_logprobs)}"
+            log_probs.append(cur_logprobs[0].logprob)
+        return log_probs
+
+    ret = {
+        'response_ids': final_res.outputs[0].token_ids,
+        'logprobs': obtain_logprobs(final_res.outputs[0].logprobs),
+    }
+    return JSONResponse(ret)
+
+
+def build_app(args: Namespace) -> FastAPI:
+    global app
+
+    app.root_path = args.root_path
+    return app
+
+
+async def init_app(
+    args: Namespace,
+    llm_engine: Optional[AsyncLLMEngine] = None,
+) -> FastAPI:
+    app = build_app(args)
+
+    global engine
+
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+    engine = (
+        llm_engine
+        if llm_engine is not None
+        else AsyncLLMEngine.from_engine_args(
+            engine_args, usage_context=UsageContext.API_SERVER
+        )
+    )
+    app.state.engine_client = engine
+    return app
+
+
+async def run_server(
+    args: Namespace, llm_engine: Optional[AsyncLLMEngine] = None, **uvicorn_kwargs: Any
+) -> None:
+    logger.info('vLLM API server version %s', VLLM_VERSION)
+    logger.info('args: %s', args)
+
+    set_ulimit()
+
+    app = await init_app(args, llm_engine)
+    assert engine is not None
+
+    shutdown_task = await serve_http(
+        app,
+        sock=None,
+        enable_ssl_refresh=args.enable_ssl_refresh,
+        host=args.host,
+        port=args.port,
+        log_level=args.log_level,
+        timeout_keep_alive=getattr(envs, 'VLLM_HTTP_TIMEOUT_KEEP_ALIVE', 5),
+        ssl_keyfile=args.ssl_keyfile,
+        ssl_certfile=args.ssl_certfile,
+        ssl_ca_certs=args.ssl_ca_certs,
+        ssl_cert_reqs=args.ssl_cert_reqs,
+        **uvicorn_kwargs,
+    )
+
+    await shutdown_task
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser()
+    parser.add_argument('--host', type=str, default=None)
+    parser.add_argument('--port', type=parser.check_port, default=8000)
+    parser.add_argument('--ssl-keyfile', type=str, default=None)
+    parser.add_argument('--ssl-certfile', type=str, default=None)
+    parser.add_argument(
+        '--ssl-ca-certs', type=str, default=None, help='The CA certificates file'
+    )
+    parser.add_argument(
+        '--enable-ssl-refresh',
+        action='store_true',
+        default=False,
+        help='Refresh SSL Context when SSL certificate files change',
+    )
+    parser.add_argument(
+        '--ssl-cert-reqs',
+        type=int,
+        default=int(ssl.CERT_NONE),
+        help="Whether client certificate is required (see stdlib ssl module's)",
+    )
+    parser.add_argument(
+        '--root-path',
+        type=str,
+        default=None,
+        help='FastAPI root_path when app is behind a path based routing proxy',
+    )
+    parser.add_argument('--log-level', type=str, default='error')
+    parser = AsyncEngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    asyncio.run(run_server(args))
diff --git a/trainer_integration/verl/pyproject.toml b/trainer_integration/verl/pyproject.toml
new file mode 100644
index 000000000..fcf6ab49c
--- /dev/null
+++ b/trainer_integration/verl/pyproject.toml
@@ -0,0 +1,15 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "verl_custom"
+version = "0.1.0"
+description = "Custom extensions for verl"
+requires-python = ">=3.8"
+
+[tool.setuptools.packages.find]
+include = ["verl_custom*"]
+
+[tool.setuptools.package-data]
+verl_custom = ["**/*.yaml"]
diff --git a/trainer_integration/verl/verl_custom/__init__.py b/trainer_integration/verl/verl_custom/__init__.py
new file mode 100644
index 000000000..1ce90c5eb
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/trainer_integration/verl/verl_custom/nvidia/__init__.py b/trainer_integration/verl/verl_custom/nvidia/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/trainer_integration/verl/verl_custom/nvidia/docs/preparation/prepare_data.md b/trainer_integration/verl/verl_custom/nvidia/docs/preparation/prepare_data.md
new file mode 100644
index 000000000..852c9552a
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/docs/preparation/prepare_data.md
@@ -0,0 +1,205 @@
+# Prepare Data for Post-Training
+
+Before starting the post-training job, we need to prepare the data for
+the policy training. The data should be stored in the parquet format.
+
+We provide several data preprocess scripts for different datasets,
+including GSM8K, MATH, HelloSwag, Full_hh_rlhf. To prepare other datasets, we need
+to follow the following steps: The data preprocess script can be divided
+into two parts:
+
+1. The first part is the common part, which loads the dataset from
+   huggingface's `datasets` package. Then preprocess the datasets with
+   the `make_map_fn` and then store in the parquet format.
+
+```python
+import re
+import os
+import datasets
+
+from verl.utils.hdfs_io import copy, makedirs
+import argparse
+
+# To extract the solution for each prompts in the dataset
+# def extract_solution(solution_str): 
+# ...
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--local_dir', default='/opt/tiger/gsm8k')
+    parser.add_argument('--hdfs_dir', default=None)
+
+    args = parser.parse_args()
+
+    num_few_shot = 5
+    data_source = 'openai/gsm8k'
+
+    dataset = datasets.load_dataset(data_source, 'main')
+
+    train_dataset = dataset['train']
+    test_dataset = dataset['test']
+
+        # Construct a `def make_map_fn(split)` for the corresponding datasets.
+    # ...
+        
+    train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
+    test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
+
+    local_dir = args.local_dir
+    hdfs_dir = args.hdfs_dir
+
+    train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
+    test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
+
+    makedirs(hdfs_dir)
+
+    copy(src=local_dir, dst=hdfs_dir)
+```
+
+2. The users are required to implement the `make_map_fn()` function
+   (as well as the `extract_solution`) on their own to support
+   different datasets or tasks.
+
+We already implemented the data preprocess of GSM8k, MATH, Hellaswag and Full_hh_rlhf
+datasets. And we take the GSM8k dataset as an example:
+
+**GSM8K**
+
+In the `make_map_fn`, each data field should consist of the following
+fields:
+
+**Required Fields:**
+
+1. `data_source`: The name of the dataset. To index the corresponding
+   reward function in the `RewardModule`
+2. `prompt`: This field should be constructed in the format of
+   huggingface chat_template. The tokenizer in `RLHFDataset` will
+   apply chat template and tokenize the prompt. **This field can be either:**
+   
+   - A **list of dictionaries** (direct format)
+   - A **JSON string** representing the list of dictionaries
+   
+   Both formats will be properly handled by the dataset loader.
+3. `ability`: Define the task category.
+4. `reward_model`: Currently, we only utilize the `ground_truth`
+   field during evaluation. The `ground_truth` is computed by the
+   `extract_solution` function. **NOTED** that the implementation of
+   the corresponding reward function should align with this extracted
+   `ground_truth`.
+5. `extra_info`: Record some information of the current prompt. Not
+   use for now.
+
+**Optional Fields:**
+
+6. `tools` (optional): JSON string format that defines the tools available 
+   for function calling. This field is used for tool-access training data. 
+   The tools will be parsed and passed to the tokenizer's `apply_chat_template` 
+   method. If not provided or empty, no tools will be used.
+
+**Extra Info Fields:**
+
+The `extra_info` field can contain additional metadata:
+
+- `split`: The dataset split (e.g., 'train', 'test')
+- `index`: The unique index of the data sample
+- `tools_kwargs`: Additional keyword arguments for tool handling
+- `need_tools_kwargs`: Boolean flag indicating if tools_kwargs is required
+
+```python
+def extract_solution(solution_str):
+    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str) # extract the solution after ####
+    assert solution is not None
+    final_solution = solution.group(0)
+    final_solution = final_solution.split('#### ')[1].replace(',', '')
+    return final_solution
+
+instruction_following = "Let's think step by step and output the final answer after \"####\"."
+
+# add a row to each data item that represents a unique id
+def make_map_fn(split):
+
+    def process_fn(example, idx):
+        question = example.pop('question')
+
+        question = question + ' ' + instruction_following
+
+        answer = example.pop('answer')
+        solution = extract_solution(answer)
+        data = {
+            "data_source": data_source,
+            "prompt": [{
+                "role": "user",
+                "content": question
+            }],
+            "ability": "math",
+            "reward_model": {
+                "style": "rule",
+                "ground_truth": solution
+            },
+            "extra_info": {
+                'split': split,
+                'index': idx
+            }
+        }
+        return data
+
+    return process_fn
+```
+
+**Tool Call Data Example:**
+
+For datasets that include function calling capabilities, you can add the `tools` field:
+
+```python
+def make_map_fn_with_tools(split):
+
+    def process_fn(example, idx):
+        question = example.pop('question')
+        
+        # Define tools as JSON string
+        tools_definition = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "calculator",
+                    "description": "Calculate mathematical expressions",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "expression": {
+                                "type": "string",
+                                "description": "The mathematical expression to evaluate"
+                            }
+                        },
+                        "required": ["expression"]
+                    }
+                }
+            }
+        ]
+
+        answer = example.pop('answer')
+        solution = extract_solution(answer)
+        data = {
+            "data_source": data_source,
+            "prompt": [{
+                "role": "user",
+                "content": question
+            }],
+            "ability": "math",
+            "reward_model": {
+                "style": "rule",
+                "ground_truth": solution
+            },
+            "tools": json.dumps(tools_definition),  # Tools as JSON string
+            "extra_info": {
+                'split': split,
+                'index': idx,
+                'need_tools_kwargs': True,
+                'tools_kwargs': {}  # Additional tool-specific arguments
+            }
+        }
+        return data
+
+    return process_fn
+``` 
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/eval/__init__.py b/trainer_integration/verl/verl_custom/nvidia/eval/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/trainer_integration/verl/verl_custom/nvidia/eval/deepscaler_eval.yaml b/trainer_integration/verl/verl_custom/nvidia/eval/deepscaler_eval.yaml
new file mode 100644
index 000000000..2e084316f
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/eval/deepscaler_eval.yaml
@@ -0,0 +1,29 @@
+hydra:
+  searchpath:
+    - file://verl/trainer/config
+
+defaults:
+  - ppo_trainer
+  - _self_
+
+data:
+  output_path: null
+  truncation: error
+  prompt_key: prompt
+  response_key: responses
+  data_source_key: data_source
+  reward_model_key: reward_model
+
+actor_rollout_ref:
+  rollout:
+    enable_chunked_prefill: False # may get higher throughput when set to True. When activated, Please increase max_num_batched_tokens or decrease max_model_len.
+    val_kwargs:
+      # sampling parameters for validation
+      top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+      top_p: 0.95
+      temperature: 0.6
+      n: 16
+      do_sample: True # default eager for validation
+
+trainer:
+  project_name: verl-evaluation
diff --git a/trainer_integration/verl/verl_custom/nvidia/eval/gen_utils.py b/trainer_integration/verl/verl_custom/nvidia/eval/gen_utils.py
new file mode 100644
index 000000000..5e0bbf390
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/eval/gen_utils.py
@@ -0,0 +1,349 @@
+from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker
+from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
+from verl import DataProto
+import ray
+import pandas as pd
+import os
+import numpy as np
+from verl_custom.utils.dataset.rl_dataset import RLHFDataset, collate_fn
+from torchdata.stateful_dataloader import StatefulDataLoader
+import torch
+import uuid
+from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
+from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
+from verl_custom.nvidia.reward_manager import PrimeRewardManager
+
+
+def get_generation_results(config, tokenizer):
+    if config.actor_rollout_ref.rollout.val_kwargs.temperature == 0.:
+        assert config.actor_rollout_ref.rollout.val_kwargs.n == 1, 'When temperature=0, n_samples must be 1.'
+
+    ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(ActorRolloutRefWorker), config=config.actor_rollout_ref, role='rollout')
+    resource_pool = RayResourcePool(process_on_nodes=[config.trainer.n_gpus_per_node] * config.trainer.nnodes)
+    wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init)
+    wg.init_model()
+
+    strategy = NodeAffinitySchedulingStrategy(node_id = ray.get_runtime_context().get_node_id(), soft = False)
+    reward_fn = PrimeRewardManager.options(scheduling_strategy=strategy).remote(tokenizer=tokenizer, compute_score=None, config=config.reward_manager)
+
+    rl_dataset = RLHFDataset(data_files=config.data.train_files,
+                             tokenizer=tokenizer,
+                             config=config.data,
+                             processor=None)
+
+    dataloader = StatefulDataLoader(dataset=rl_dataset,
+                                    batch_size=config.data.train_batch_size,
+                                    shuffle=False,
+                                    drop_last=False,
+                                    collate_fn=collate_fn)
+    reward_tensor_lst = []
+    data_source_lst = []
+    data_uid_lst = []
+
+
+    sample_inputs = []
+    sample_outputs = []
+    sample_scores = []
+    for batch_idx, test_data in enumerate(dataloader):
+        test_batch = DataProto.from_single_dict(test_data)
+        # extend with uid info
+        test_batch.non_tensor_batch['uid'] = np.array([str(uuid.uuid4()) for _ in range(len(test_batch.batch))], dtype=object)
+        # repeat test batch
+        test_batch = test_batch.repeat(repeat_times=config.actor_rollout_ref.rollout.val_kwargs.n, interleave=True)
+
+        # Store original inputs
+        input_ids = test_batch.batch["input_ids"]
+        # TODO: Can we keep special tokens except for padding tokens?
+        input_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
+        sample_inputs.extend(input_texts)
+
+        batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+        non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+        if "multi_modal_inputs" in test_batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.extend(["multi_modal_data", "multi_modal_inputs"])
+        if "raw_prompt" in test_batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("raw_prompt")
+        if "tools_kwargs" in test_batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("tools_kwargs")
+        test_gen_batch = test_batch.pop(
+            batch_keys=batch_keys_to_pop,
+            non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+        )
+        test_gen_batch.meta_info = {
+            "eos_token_id": tokenizer.eos_token_id,
+            "pad_token_id": tokenizer.pad_token_id,
+            "recompute_log_prob": False,
+            "do_sample": True,
+            "validate": True,
+        }
+        print(f"test_gen_batch meta info: {test_gen_batch.meta_info}")
+        
+        output_texts_file = config.data.output_path.replace('.parquet', f'_tmp_{batch_idx}.pickle')
+        # load the output_texts from tmp file if it exists
+        if os.path.exists(output_texts_file):
+            print(f'loading generation results from {output_texts_file}')
+            test_batch = DataProto.load_from_disk(output_texts_file)
+        else:
+            # pad to be divisible by dp_size
+            test_gen_batch_padded, pad_size = pad_dataproto_to_divisor(test_gen_batch, wg.world_size)
+            test_output_gen_batch_padded = wg.generate_sequences(test_gen_batch_padded)
+            # unpad
+            test_output_gen_batch = unpad_dataproto(test_output_gen_batch_padded, pad_size=pad_size)
+            print("validation generation end")
+            print(f'finished generation for batch {batch_idx}')
+
+            test_batch = test_batch.union(test_output_gen_batch)
+            # save the test_batch to tmp file
+            os.makedirs(os.path.dirname(output_texts_file), exist_ok=True)
+            test_batch.save_to_disk(output_texts_file)
+
+        # Store generated outputs
+        output_ids = test_batch.batch['responses']
+        output_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
+        sample_outputs.extend(output_texts)
+        # evaluate using reward_function
+        reward_tensor = ray.get(reward_fn.__call__.remote(test_batch))['score']
+
+        # Store scores
+        scores = reward_tensor.sum(-1).cpu().tolist()
+        print("*" * 100)
+        print(f"scores: {scores}")
+        sample_scores.extend(scores)
+
+        reward_tensor_lst.append(reward_tensor)
+        data_source = test_batch.non_tensor_batch.get('data_source', ['unknown'] * reward_tensor.shape[0])
+        # set the task name
+        if 'reward_model' in test_batch.non_tensor_batch:
+            reward_model_info = test_batch.non_tensor_batch['reward_model']
+            for idx, x in enumerate(reward_model_info):
+                if 'reasoning_task' in x:
+                    test_batch.non_tensor_batch['extra_info'][idx]['name'] = x['reasoning_task']
+        for idx, x in enumerate(test_batch.non_tensor_batch['extra_info']):
+            if 'name' in x:
+                data_source[idx] += '/' + x['name']
+        data_source_lst.append(data_source) 
+        data_uid_lst.append(test_batch.non_tensor_batch['uid'])
+    
+    reward_tensor = torch.cat(reward_tensor_lst, dim=0).sum(-1).cpu()  # (batch_size,)
+    data_sources = np.concatenate(data_source_lst, axis=0)
+    data_uids = np.concatenate(data_uid_lst, axis=0)
+
+    # save prompts, responses, scores to parquet
+    dataset = pd.DataFrame({'prompt': sample_inputs, 'response': sample_outputs, 'score': sample_scores})
+    # make the path if not exists
+    os.makedirs(os.path.dirname(config.data.output_path), exist_ok=True)
+    dataset.to_parquet(config.data.output_path)
+
+    # evaluate test_score based on data source
+    data_source_reward = {}
+    for i in range(reward_tensor.shape[0]):
+        data_source = data_sources[i]
+        data_uid = data_uids[i]
+        if data_source not in data_source_reward:
+            data_source_reward[data_source] = {}
+        if data_uid not in data_source_reward[data_source]:
+            data_source_reward[data_source][data_uid] = []
+        data_source_reward[data_source][data_uid].append(reward_tensor[i].item())
+
+    metric_dict = {}
+    stacked_rewards = []
+    for data_source, rewards in data_source_reward.items():
+        # shape (bs, n)
+        rewards = np.array(list(rewards.values()))
+        stacked_rewards.append(rewards)
+        # Here we assume reward model generates binary 0/1 rewards
+        metric_dict[f'{data_source}/pass@1'] = np.mean(rewards)
+        metric_dict[f'{data_source}/pass@1/std'] = np.mean(rewards.std(axis=1))
+        metric_dict[f'{data_source}/pass@{config.actor_rollout_ref.rollout.val_kwargs.n}'] = np.mean(rewards.max(axis=1))
+
+    if len(list(data_source_reward.keys())) > 1:
+        stacked_rewards = np.concatenate(stacked_rewards, axis=0)
+        metric_dict[f'all/pass@1'] = np.mean(stacked_rewards)
+        metric_dict[f'all/pass@1/std'] = np.mean(stacked_rewards.std(axis=1))
+        metric_dict[f'all/pass@{config.actor_rollout_ref.rollout.val_kwargs.n}'] = np.mean(stacked_rewards.max(axis=1))
+
+    return metric_dict
+
+def get_agent_generation_results(config, tokenizer):
+    if config.actor_rollout_ref.rollout.val_kwargs.temperature == 0.:
+        assert config.actor_rollout_ref.rollout.val_kwargs.n == 1, 'When temperature=0, n_samples must be 1.'
+    
+    from verl_custom.trainer.ppo.ray_trainer import ResourcePoolManager, Role
+    role_worker_mapping = {
+        Role.ActorRollout: ray.remote(AsyncActorRolloutRefWorker),
+    }
+    global_pool_id = "global_pool"
+    resource_pool_spec = {
+        global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
+    }
+    mapping = {
+        Role.ActorRollout: global_pool_id,
+    }
+    resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+
+    # see RayPPOTrainer.init_workers()
+    resource_pool_manager.create_resource_pool()
+    resource_pool_to_cls = {pool: {} for pool in resource_pool_manager.resource_pool_dict.values()}
+    resource_pool = resource_pool_manager.get_resource_pool(Role.ActorRollout)
+    actor_rollout_cls = RayClassWithInitArgs(
+        cls=role_worker_mapping[Role.ActorRollout],
+        config=config.actor_rollout_ref,
+        role="actor_rollout",
+    )
+    resource_pool_to_cls[resource_pool]["actor_rollout"] = actor_rollout_cls
+
+    all_wg = {}
+    from verl.single_controller.ray.base import create_colocated_worker_cls
+    for resource_pool, class_dict in resource_pool_to_cls.items():
+        worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
+        wg_dict = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=worker_dict_cls, device_name='cuda')
+        spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
+        all_wg.update(spawn_wg)
+
+    wg = all_wg['actor_rollout']
+    wg.init_model()
+
+    from verl_custom.nvidia.rollout.async_server import AsyncLLMServerManager
+
+    async_rollout_manager = AsyncLLMServerManager(
+        config=config,
+        worker_group=wg,
+    )
+    async_rollout_manager.sleep()
+
+    rl_dataset = RLHFDataset(data_files=config.data.train_files,
+                             tokenizer=tokenizer,
+                             config=config.data,
+                             processor=None)
+
+    dataloader = StatefulDataLoader(dataset=rl_dataset,
+                                    batch_size=config.data.train_batch_size,
+                                    shuffle=False,
+                                    drop_last=False,
+                                    collate_fn=collate_fn)
+    data_source_lst = []
+    data_uid_lst = []
+
+    sample_outputs = []
+    sample_scores = []
+    sample_finish = []
+    sample_success = []
+    sample_instance = []
+
+    for batch_idx, test_data in enumerate(dataloader):
+        test_batch = DataProto.from_single_dict(test_data)
+        # extend with uid info
+        test_batch.non_tensor_batch['uid'] = np.array([str(uuid.uuid4()) for _ in range(len(test_batch.batch))], dtype=object)
+
+        batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+        non_tensor_batch_keys_to_pop = ["instance"]
+        if "multi_modal_inputs" in test_batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.extend(["multi_modal_data", "multi_modal_inputs"])
+        if "raw_prompt" in test_batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("raw_prompt")
+        if "tools_kwargs" in test_batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("tools_kwargs")
+        test_gen_batch = test_batch.pop(
+            batch_keys=batch_keys_to_pop,
+            non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+        )
+        test_gen_batch.meta_info = {
+            "eos_token_id": tokenizer.eos_token_id,
+            "pad_token_id": tokenizer.pad_token_id,
+            "recompute_log_prob": False,
+            "do_sample": True,
+            "validate": True,
+        }
+        print(f"test_gen_batch meta info: {test_gen_batch.meta_info}")
+        
+        output_texts_file = config.data.output_path.replace('.parquet', f'_tmp_{batch_idx}.pickle')
+        # load the output_texts from tmp file if it exists
+        if os.path.exists(output_texts_file):
+            print(f'loading generation results from {output_texts_file}')
+            test_batch = DataProto.load_from_disk(output_texts_file)
+        else:
+            # pad to be divisible by dp_size
+            test_gen_batch_padded, pad_size = pad_dataproto_to_divisor(test_gen_batch, config.trainer.nnodes)
+            async_rollout_manager.wake_up()
+            test_output_gen_batch_padded = async_rollout_manager.generate_sequences(test_gen_batch_padded)
+            async_rollout_manager.sleep()
+            # unpad
+            test_output_gen_batch = unpad_dataproto(test_output_gen_batch_padded, pad_size=pad_size*config.actor_rollout_ref.rollout.val_kwargs.n)
+            print("validation generation end")
+            print(f'finished generation for batch {batch_idx}')
+
+            test_batch = test_batch.repeat(repeat_times=config.actor_rollout_ref.rollout.val_kwargs.n, interleave=True)
+            test_batch = test_batch.union(test_output_gen_batch)
+            # save the test_batch to tmp file
+            os.makedirs(os.path.dirname(output_texts_file), exist_ok=True)
+            test_batch.save_to_disk(output_texts_file)
+
+        # Store generated outputs
+        output_ids = test_batch.batch['responses'].numpy()
+        sample_outputs.extend(output_ids)
+        # evaluate using reward_function
+        scores = test_batch.non_tensor_batch['resolved']
+        finish = test_batch.non_tensor_batch['finish']
+        success = test_batch.non_tensor_batch['success']
+        instance = test_batch.non_tensor_batch['instance']
+        
+        print("*" * 100)
+        print(f"scores: {scores}")
+        sample_scores.extend(scores)
+        sample_finish.extend(finish)
+        sample_success.extend(success)
+        sample_instance.extend(instance)
+
+        data_source = test_batch.non_tensor_batch.get('data_source', ['unknown'] * len(scores))
+        # set the task name
+        if 'reward_model' in test_batch.non_tensor_batch:
+            reward_model_info = test_batch.non_tensor_batch['reward_model']
+            if 'extra_info' in test_batch.non_tensor_batch:
+                for idx, x in enumerate(reward_model_info):
+                    if 'reasoning_task' in x:
+                        test_batch.non_tensor_batch['extra_info'][idx]['name'] = x['reasoning_task']
+        if 'extra_info' in test_batch.non_tensor_batch:
+            for idx, x in enumerate(test_batch.non_tensor_batch['extra_info']):
+                if 'name' in x:
+                    data_source[idx] += '/' + x['name']
+        data_source_lst.append(data_source) 
+        data_uid_lst.append(test_batch.non_tensor_batch['uid'])
+    
+    data_sources = np.concatenate(data_source_lst, axis=0)
+    data_uids = np.concatenate(data_uid_lst, axis=0)
+
+    # save prompts, responses, scores to parquet
+    dataset = pd.DataFrame({'instance': sample_instance, 'response': sample_outputs, 'score': sample_scores, 'finish': sample_finish, 'success': sample_success})
+    # make the path if not exists
+    os.makedirs(os.path.dirname(config.data.output_path), exist_ok=True)
+    dataset.to_parquet(config.data.output_path)
+
+    # evaluate test_score based on data source
+    data_source_reward = {}
+    for i in range(len(sample_scores)):
+        data_source = data_sources[i]
+        data_uid = data_uids[i]
+        if data_source not in data_source_reward:
+            data_source_reward[data_source] = {}
+        if data_uid not in data_source_reward[data_source]:
+            data_source_reward[data_source][data_uid] = []
+        data_source_reward[data_source][data_uid].append(sample_scores[i])
+
+    metric_dict = {}
+    stacked_rewards = []
+    for data_source, rewards in data_source_reward.items():
+        # shape (bs, n)
+        rewards = np.array(list(rewards.values()))
+        stacked_rewards.append(rewards)
+        # Here we assume reward model generates binary 0/1 rewards
+        metric_dict[f'{data_source}/pass@1'] = np.mean(rewards)
+        metric_dict[f'{data_source}/pass@1/std'] = np.mean(rewards.std(axis=1))
+        metric_dict[f'{data_source}/pass@{config.actor_rollout_ref.rollout.val_kwargs.n}'] = np.mean(rewards.max(axis=1))
+
+    if len(list(data_source_reward.keys())) > 1:
+        stacked_rewards = np.concatenate(stacked_rewards, axis=0)
+        metric_dict[f'all/pass@1'] = np.mean(stacked_rewards)
+        metric_dict[f'all/pass@1/std'] = np.mean(stacked_rewards.std(axis=1))
+        metric_dict[f'all/pass@{config.actor_rollout_ref.rollout.val_kwargs.n}'] = np.mean(stacked_rewards.max(axis=1))
+
+    return metric_dict
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/eval/general_eval.py b/trainer_integration/verl/verl_custom/nvidia/eval/general_eval.py
new file mode 100644
index 000000000..4b5ca65c0
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/eval/general_eval.py
@@ -0,0 +1,93 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Generate responses given a dataset of prompts
+"""
+import csv
+import numpy as np
+import hydra
+import os
+from tabulate import tabulate
+
+os.environ['NCCL_DEBUG'] = 'WARN'
+os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+import pandas as pd
+from verl.utils.fs import copy_local_path_from_hdfs
+from verl_custom.nvidia.eval.gen_utils import get_generation_results, get_agent_generation_results
+import ray
+
+@hydra.main(config_path='.', config_name='deepscaler_eval', version_base=None)
+def main(config):
+    from pprint import pprint
+    from omegaconf import OmegaConf
+    pprint(OmegaConf.to_container(config, resolve=True))  # resolve=True will eval symbol values
+    OmegaConf.resolve(config)
+
+    if not ray.is_initialized():
+        # Initialize Ray with a local cluster configuration
+        # Set environment variables in the runtime environment to control tokenizer parallelism,
+        # NCCL debug level, VLLM logging level, and allow runtime LoRA updating
+        # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration
+        ray.init(
+            runtime_env={"env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN", "VLLM_LOGGING_LEVEL": "WARN", "VLLM_ALLOW_RUNTIME_LORA_UPDATING": "true", "VLLM_USE_V1": "1"}},
+            num_cpus=config.ray_init.num_cpus,
+        )
+
+    output_dir = os.path.dirname(config.data.output_path)
+    # dataset name without extension
+    name_without_ext = os.path.splitext(os.path.basename(config.data.train_files))[0]
+    csv_path = os.path.join(output_dir, f'{name_without_ext}_summary.csv')
+ 
+    # skip the job if the csv file already exists
+    if os.path.exists(csv_path):
+        print(f"CSV file {csv_path} already exists. Skipping the job.")
+        return
+
+    local_path = copy_local_path_from_hdfs(config.actor_rollout_ref.model.path)
+    from verl.utils import hf_tokenizer
+    tokenizer = hf_tokenizer(local_path)
+
+    if config.actor_rollout_ref.rollout.get("task_type", None) == "swegym":
+        metric_dict = get_agent_generation_results(config, tokenizer)
+    else:
+        metric_dict = get_generation_results(config, tokenizer)
+   
+    dataset_name = os.path.basename(config.data.train_files)
+    metric_dict['model_path'] = config.actor_rollout_ref.model.path
+    metric_dict['dataset'] = dataset_name
+    metric_dict['temperature'] = config.actor_rollout_ref.rollout.val_kwargs.temperature
+    metric_dict['n_samples'] = config.actor_rollout_ref.rollout.val_kwargs.n
+    metric_dict['response_length'] = config.data.max_response_length
+
+   # Check if file exists
+    file_exists = os.path.isfile(csv_path)
+    
+    # Write to CSV
+    with open(csv_path, mode='w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=metric_dict.keys())
+        if not file_exists:
+            writer.writeheader()
+        writer.writerow(metric_dict)
+
+    # Convert the row data into a list of lists format for tabulate
+    table_data = [[k, v] for k, v in metric_dict.items()]
+    # Print table
+    print(tabulate(table_data, headers=['Metric', 'Value'], tablefmt='grid'))
+    # write tabule into txt file
+    with open(os.path.join(output_dir, f'{name_without_ext}_summary.txt'), 'w') as f:
+        f.write(tabulate(table_data, headers=['Metric', 'Value'], tablefmt='grid'))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/benchmark_remote_reward.py b/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/benchmark_remote_reward.py
new file mode 100644
index 000000000..b360a1062
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/benchmark_remote_reward.py
@@ -0,0 +1,108 @@
+
+import requests
+import time
+ip = 'localhost'
+
+solution_str = """
+```python
+def find_nice_string(n, k, s):
+    s = list(s)  # Convert string to a list for mutability
+    for i in range(n):
+        max_change = max(ord(s[i]) - ord('a'), ord('z') - ord(s[i]))
+        change = min(max_change, k)
+        
+        if ord(s[i]) - ord('a') >= ord('z') - ord(s[i]):
+            s[i] = chr(ord(s[i]) - change)
+        else:
+            s[i] = chr(ord(s[i]) + change)
+        
+        k -= change
+        if k == 0:
+            break
+
+    if k > 0:
+        print("-1")
+    else:
+        print("".join(s))
+
+# Reading input
+n, k = map(int, input().split())
+s = input().strip()
+
+find_nice_string(n, k, s)
+```
+"""
+ground_truth = {"inputs": ["4 26\nbear"], "outputs": ["zgar"]}
+data_source = 'codeforces'
+
+start = time.time()
+res = requests.post(f"http://{ip}:8388/compute_score", json={"solution_str": solution_str, "ground_truth": ground_truth, "data_source": data_source})
+score = res.json()
+res = score
+
+print(score)
+print("Total time for single request: ", time.time() - start)
+
+num_parallel_requests = 2
+
+start = time.time()
+for i in range(num_parallel_requests):
+    res = requests.post(f"http://{ip}:8388/compute_score", json={"solution_str": solution_str, "ground_truth": ground_truth, "data_source": data_source})
+    score = res.json()
+    res = score
+
+print(f"Total time for {num_parallel_requests} requests in sequence: ", time.time() - start)
+
+num_parallel_requests = 256
+import asyncio
+from concurrent.futures import ProcessPoolExecutor
+from functools import partial
+
+def request_wrapper(idx):
+    res = requests.post(f"http://{ip}:8388/compute_score", json={"solution_str": solution_str, "ground_truth": ground_truth, "data_source": data_source})
+    score = res.json()
+    res = score
+    return res
+
+async def single_compute_score(idx, executor,timeout=300.):
+    loop = asyncio.get_running_loop()
+    try:
+        # Ensure process_completion is called properly
+        tasks = [
+            asyncio.wait_for(
+                loop.run_in_executor(
+                    executor,
+                    partial(request_wrapper, idx)  # Ensure synchronous
+                ),
+                timeout=timeout)
+        ]
+        return await asyncio.gather(*tasks)
+    except asyncio.TimeoutError:
+        print(f"Timeout occurred for completion: {completion}")
+        return None  # Default value for timed-out rows
+    except Exception as e:
+        print(f"Error processing completion: {completion[:10]}, Error: {e}")
+        return None  # Default value for failed rows
+
+start = time.time()
+async def main():
+    with ProcessPoolExecutor(max_workers=num_parallel_requests) as executor:
+        # Create tasks for all rows
+        tasks_async = [
+            single_compute_score(idx, executor, timeout=300.)
+            for idx in range(num_parallel_requests)
+        ]
+        # to prevent very occasional starvation caused by some anomalous programs ( like infinite loop ), the exceptions in async programs will instantly halt the evaluation, and all summoned processes will be killed.
+        try:
+            results = await asyncio.gather(*tasks_async, return_exceptions=False)
+        except:
+            for pid, proc in executor._processes.items():
+                try:
+                    proc.kill()
+                except Exception as kill_err:
+                    print('shut down failed: ' + str(kill_err))
+            raise
+    return results
+
+results = asyncio.run(main())
+print(f"Total time for {num_parallel_requests} requests in async parallel: ", time.time() - start)
diff --git a/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/launch_default_reward_server.py b/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/launch_default_reward_server.py
new file mode 100644
index 000000000..6235cec1d
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/launch_default_reward_server.py
@@ -0,0 +1,59 @@
+import asyncio
+import os
+from concurrent.futures import ProcessPoolExecutor
+from functools import partial
+from typing import Any
+
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+from verl_custom.nvidia.reward_score import _default_compute_score as compute_score_fn
+
+app = FastAPI()
+
+# Global ProcessPoolExecutor
+# Adjust based on your server load
+# Recommended 64 since prime reward manager make 64 requests in parallel
+num_workers = 512
+executor = ProcessPoolExecutor(num_workers)  
+
+timeout = 300
+
+# Request model
+class ComputeScoreRequest(BaseModel):
+    solution_str: str
+    ground_truth: Any
+    data_source: str
+
+def single_compute_score(evaluation_func, completion, reference, task):
+    """ Synchronous function executed inside ProcessPoolExecutor """
+    try:
+        return evaluation_func(task, completion, reference)
+    except Exception as e:
+        print(f"Error processing completion: {completion[:10]}, data source: {task}, Error: {e}")
+        return 0.0  # Default score for failed rows
+
+@app.post("/compute_score")
+async def compute_score(request: ComputeScoreRequest):
+    """ Asynchronous API endpoint using ProcessPoolExecutor with timeout """
+    loop = asyncio.get_running_loop()
+    task = loop.run_in_executor(
+                executor, partial(single_compute_score, compute_score_fn, request.solution_str, request.ground_truth, request.data_source)
+            )
+    try:
+        score = await asyncio.wait_for(task, timeout=timeout)
+        return {"score": float(score)}
+    except asyncio.TimeoutError as e:
+        try:
+            task.cancel()
+        except asyncio.CancelledError:
+            pass  # Task was cancelled, we can safely ignore this error
+        print(f"Request timed out for completion: {request.solution_str[:10]}, data source: {request.data_source}")
+        raise HTTPException(status_code=408, detail="Computation timed out")
+    except Exception as e:
+        print(f"An unexpected error occurred for completion: {request.solution_str[:10]}, data source: {request.data_source}, Error: {e}")
+        raise HTTPException(status_code=500, detail="Internal server error")
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8388)
diff --git a/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/test_local_code_reward.py b/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/test_local_code_reward.py
new file mode 100644
index 000000000..f9d1f6cc2
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/test_local_code_reward.py
@@ -0,0 +1,115 @@
+
+import requests
+from verl.utils.reward_score.prime_code import compute_score
+ip = 'localhost'
+
+solution_str = """
+<think> I am omniscient. </think> 
+```python
+def find_nice_string(n, k, s):
+    s = list(s)  # Convert string to a list for mutability
+    for i in range(n):
+        max_change = max(ord(s[i]) - ord('a'), ord('z') - ord(s[i]))
+        change = min(max_change, k)
+        
+        if ord(s[i]) - ord('a') >= ord('z') - ord(s[i]):
+            s[i] = chr(ord(s[i]) - change)
+        else:
+            s[i] = chr(ord(s[i]) + change)
+        
+        k -= change
+        if k == 0:
+            break
+
+    if k > 0:
+        print("-1")
+    else:
+        print("".join(s))
+
+# Reading input
+n, k = map(int, input().split())
+s = input().strip()
+
+find_nice_string(n, k, s)
+```
+"""
+ground_truth = {"inputs": ["4 26\nbear", "4 26\nbear", "4 26\nbear"], "outputs": ["zgar", "zbar", "zgar"]}
+data_source = 'codeforces'
+
+# This should return 0.6666666666666666
+res = compute_score(solution_str, ground_truth, True)
+print(res)
+
+solution_str = """
+<think> I am omniscient. </think> 
+```python
+def find_nice_string(n, k, s):
+    s = list(s)  # Convert string to a list for mutability
+    for i in range(n):
+        max_change = max(ord(s[i]) - ord('a'), ord('z') - ord(s[i]))
+        change = min(max_change, k)
+        
+        if ord(s[i]) - ord('a') >= ord('z') - ord(s[i]):
+            s[i] = chr(ord(s[i]) - change)
+        else:
+            s[i] = chr(ord(s[i]) + change)
+        
+        k -= change
+        if k == 0:
+            break
+
+    if k > 0:
+        print("-1")
+    else:
+        print("".join(s))
+
+# Reading input
+n, k = map(int, input().split())
+s = input().strip() SYNTAX ERROR
+
+find_nice_string(n, k, s)
+```
+"""
+
+# This should return 0.0, syntax error
+res = compute_score(solution_str, ground_truth, True)
+print(res)
+
+solution_str = """
+<think> I am omniscient. </think> 
+```python
+import time
+time.sleep(10)
+n, k = map(int, input().split())
+s = input().strip()
+if n == 4:
+    print("zgar")
+else:
+    print("zbar")
+```
+"""
+import time
+start_time = time.time()
+ground_truth = {"inputs": ["4 26\nbear"] * 100, "outputs": ["zgar"] * 100}
+# This should return 0.0, timeout error
+res = compute_score(solution_str, ground_truth, True)
+print(res)
+print(f"time: {time.time() - start_time}")
+
+solution_str = """
+<think> I am omniscient. </think> 
+```python
+import time
+n, k = map(int, input().split())
+s = input().strip()
+if n == 4:
+    print("zgar")
+else:
+    assert False
+```
+"""
+
+ground_truth = {"inputs": ["4 26\nbear", "5 26\nbear", "4 26\nbear"] , "outputs": ["zgar", "zbar", "zgar"]}
+# This should return 0.0, runtime error
+res = compute_score(solution_str, ground_truth, True)
+print(res)
diff --git a/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/test_local_rllm_code_reward.py b/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/test_local_rllm_code_reward.py
new file mode 100644
index 000000000..e024f89bf
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/test_local_rllm_code_reward.py
@@ -0,0 +1,79 @@
+
+import requests
+from verl.utils.reward_score.deepcoder import deepcoder_reward_fn
+
+import pandas as pd
+tests = pd.read_parquet('/lustre/fsw/portfolios/nvr/users/mingjiel/data/eurus2-rl-data/deepcoder_codeforces.parquet')
+
+solution_str = """
+<think> I am omniscient. </think> 
+```python
+t = int(input())
+for _ in range(t):
+    x, y = map(int, input().split())
+    min_val = min(x, y)
+    max_val = max(x, y)
+    print(min_val, max_val)
+```
+"""
+
+import json
+ground_truth = json.loads(tests.iloc[0]['reward_model']['ground_truth'])
+data_source = tests.iloc[0]['data_source']
+#print(tests.iloc[0]['prompt'][0]['content'])
+# This should return 1.0, runtime error
+res = deepcoder_reward_fn(data_source[10:], solution_str, ground_truth)
+print(res)
+
+import pandas as pd
+tests = pd.read_parquet('/lustre/fsw/portfolios/nvr/users/mingjiel/data/eurus2-rl-data/deepcoder_humanevalplus.parquet')
+
+solution_str = """
+<think> I am omniscient. </think> 
+```python
+def has_close_elements(numbers: List[float], threshold: float) -> bool:
+    numbers.sort()
+    for i in range(len(numbers) - 1):
+        if abs(numbers[i] - numbers[i + 1]) < threshold:
+            return True
+    return False
+```
+"""
+
+import json
+ground_truth = json.loads(tests.iloc[0]['reward_model']['ground_truth'])
+data_source = tests.iloc[0]['data_source']
+#print(tests.iloc[0]['prompt'][0]['content'])
+# This should return 1.0, runtime error
+res = deepcoder_reward_fn(data_source[10:], solution_str, ground_truth)
+print(res)
+
+tests = pd.read_json('/lustre/fsw/portfolios/nvr/users/mingjiel/data/eurus2-rl-data/deepcoder_livecodebench.json', orient='records')
+
+
+solution_str = """
+<think> I am omniscient. </think> 
+```python
+n = int(input())
+A = list(map(int, input().split()))
+sorted_A = sorted(A, reverse=True)
+second_largest = sorted_A[1]
+for i in range(n):
+    if A[i] == second_largest:
+        print(i + 1)
+        break
+```
+"""
+
+import json
+ground_truth = json.loads(tests.iloc[0]['reward_model']['ground_truth'])
+data_source = tests.iloc[0]['data_source']
+print(type(ground_truth))
+#print(tests.iloc[0]['prompt'][0]['content'])
+# This should return 1.0, runtime error
+import time
+start_time = time.time()
+res = deepcoder_reward_fn(data_source[10:], solution_str, ground_truth)
+end_time = time.time()
+print(f"Time taken: {end_time - start_time} seconds")
+print(res)
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/test_remote_code_reward.py b/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/test_remote_code_reward.py
new file mode 100644
index 000000000..4e9099314
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/test_remote_code_reward.py
@@ -0,0 +1,95 @@
+import requests
+
+ip = 'localhost'
+
+solution_str = """
+```python
+def find_nice_string(n, k, s):
+    s = list(s)  # Convert string to a list for mutability
+    for i in range(n):
+        max_change = max(ord(s[i]) - ord('a'), ord('z') - ord(s[i]))
+        change = min(max_change, k)
+        
+        if ord(s[i]) - ord('a') >= ord('z') - ord(s[i]):
+            s[i] = chr(ord(s[i]) - change)
+        else:
+            s[i] = chr(ord(s[i]) + change)
+        
+        k -= change
+        if k == 0:
+            break
+
+    if k > 0:
+        print("-1")
+    else:
+        print("".join(s))
+
+# Reading input
+n, k = map(int, input().split())
+s = input().strip()
+
+find_nice_string(n, k, s)
+```
+"""
+ground_truth = {"inputs": ["4 26\nbear", "4 26\nbear"], "outputs": ["zgar", "zbar"]}
+data_source = 'codeforces'
+
+res = requests.post(f"http://{ip}:8388/compute_score", json={"solution_str": solution_str, "ground_truth": ground_truth, "data_source": data_source})
+score = res.json()
+res = score
+
+print(score)
+
+
+import pandas as pd
+tests = pd.read_parquet('/lustre/fsw/portfolios/nvr/users/mingjiel/data/eurus2-rl-data/deepcoder_codeforces.parquet')
+
+solution_str = """
+<think> I am omniscient. </think> 
+```python
+t = int(input())
+for _ in range(t):
+    x, y = map(int, input().split())
+    min_val = min(x, y)
+    max_val = max(x, y)
+    print(min_val, max_val)
+```
+"""
+import json
+
+ground_truth = json.loads(tests.iloc[0]['reward_model']['ground_truth'])
+data_source = tests.iloc[0]['data_source']
+
+# This should return 1.0, runtime error
+res = requests.post(f"http://{ip}:8388/compute_score", json={"solution_str": solution_str, "ground_truth": ground_truth, "data_source": data_source})
+score = res.json()
+res = score
+
+print(score)
+
+tests = pd.read_json('/lustre/fsw/portfolios/nvr/users/mingjiel/data/eurus2-rl-data/deepcoder_livecodebench.json', orient='records')
+
+
+solution_str = """
+<think> I am omniscient. </think> 
+```python
+n = int(input())
+A = list(map(int, input().split()))
+sorted_A = sorted(A, reverse=True)
+second_largest = sorted_A[1]
+for i in range(n):
+    if A[i] == second_largest:
+        print(i + 1)
+        break
+```
+"""
+
+import json
+ground_truth = json.loads(tests.iloc[0]['reward_model']['ground_truth'])
+data_source = tests.iloc[0]['data_source']
+# This should return 1.0, runtime error
+res = requests.post(f"http://{ip}:8388/compute_score", json={"solution_str": solution_str, "ground_truth": ground_truth, "data_source": data_source})
+score = res.json()
+res = score
+
+print(score)
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/test_remote_math_reward.py b/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/test_remote_math_reward.py
new file mode 100644
index 000000000..d90479426
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/remote_reward_server/test_remote_math_reward.py
@@ -0,0 +1,14 @@
+
+import requests
+
+ip = 'localhost'
+
+solution_str = """<think> I am omniscient. </think> The answer is \\boxed{24}."""
+ground_truth = ['24']
+data_source = 'deepscaler'
+
+res = requests.post(f"http://{ip}:8388/compute_score", json={"solution_str": solution_str, "ground_truth": ground_truth, "data_source": data_source})
+score = res.json()
+res = score
+
+print(score)
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_manager/__init__.py b/trainer_integration/verl/verl_custom/nvidia/reward_manager/__init__.py
new file mode 100644
index 000000000..b704159e9
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_manager/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2024 PRIME team and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .naive import NaiveRewardManager
+from .prime import PrimeRewardManager
+from .swebench import SWEBenchRewardManager
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_manager/length_penalty.py b/trainer_integration/verl/verl_custom/nvidia/reward_manager/length_penalty.py
new file mode 100644
index 000000000..62067d0bc
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_manager/length_penalty.py
@@ -0,0 +1,160 @@
+from verl import DataProto
+import torch
+import json
+import math
+import multiprocessing as mp
+
+class LengthPenalty:
+    def __init__(self, config=None):
+        self.config = config
+
+    def _ngram_repetition_reward(self, args):
+        sequence, ngram_size, penalty = args
+        # Create ngram array using unfold
+        ngram_array = sequence.unfold(0, ngram_size, 1)
+        
+        # Get unique ngrams and their counts
+        unique_ngrams, counts = torch.unique(ngram_array, dim=0, return_counts=True)
+        
+        # Find positions of repeated ngrams
+        repeated_mask = counts > 1
+        if not repeated_mask.any():
+            return torch.zeros(len(sequence), dtype=torch.float32)
+            
+        repeated_ngrams = unique_ngrams[repeated_mask]
+        curr_reward = torch.zeros(len(sequence), dtype=torch.float32)
+        
+        if len(repeated_ngrams) > 0:
+            # Find all occurrences of repeated ngrams
+            for ng in repeated_ngrams:
+                matches = (ngram_array == ng.unsqueeze(0)).all(dim=1)
+                positions = torch.where(matches)[0]
+                
+                # Apply penalty to all occurrences except the first one
+                for pos in positions[1:]:
+                    curr_reward[pos] = penalty
+        return curr_reward
+
+    def get_repetition_reward(self, generations):
+        bsz = generations.shape[0]
+        ngram_size = self.config.repetition_ngram_size
+        
+        # Prepare arguments for multiprocessing
+        args = [(generations[i], ngram_size, self.config.repetition_penalty) for i in range(bsz)]
+        
+        # Use multiprocessing to process sequences in parallel
+        num_processes = min(256, bsz) 
+        
+        with mp.Pool(processes=num_processes) as pool:
+            results = pool.map(self._ngram_repetition_reward, args)
+        
+        # Stack results
+        curr_reward = torch.stack(results)
+        return curr_reward
+
+    def __call__(self, data:DataProto):
+        if self.config is None or self.config.length_penalty_type == 'none':
+            return data.batch['token_level_scores']
+
+        with torch.no_grad():    
+            prompt_ids = data.batch['prompts']
+            prompt_length = prompt_ids.shape[-1]
+            response_ids = data.batch['responses']
+            response_length = response_ids.shape[-1]
+            valid_response_length = data.batch['attention_mask'][:, prompt_length:].sum(dim=-1)
+            response_mask = data.batch['attention_mask'][:, -response_length:]
+            scores = data.batch['token_level_scores']
+            data_sources = data.non_tensor_batch['data_source']
+
+            modulated_scores = torch.zeros_like(data.batch['responses'], dtype=torch.float32)
+            
+            if self.config.length_penalty_type == 'linear':
+                modulated_scores =  scores * (1 - valid_response_length.unsqueeze(-1) / self.config.max_length)
+            elif self.config.length_penalty_type == 'instance_linear':
+                # Penalty is only applied to if prompt that have scores >= 1.0
+                index = data.non_tensor_batch['uid']
+                instance_scores = scores.sum(dim=-1)
+                bsz = scores.shape[0]
+
+                # get minimum length for each instance where score >= 1.0
+                id2min = {}
+                for i in range(bsz):
+                    if instance_scores[i] >= 1.0:
+                        idx = index[i]
+                        if idx not in id2min:
+                            id2min[idx] = valid_response_length[i]
+                        else:
+                            id2min[idx] = min(id2min[idx], valid_response_length[i])
+
+                # get length coef 
+                length_coef = []
+                for i in range(bsz):
+                    idx = index[i]
+                    if idx not in id2min:
+                        length_coef.append(1.0)
+                    else:
+                        coef = 1.0 - (valid_response_length[i] - id2min[idx]) / (self.config.max_length - id2min[idx])
+                        length_coef.append(min(1.0, coef))
+
+                modulated_scores[torch.arange(bsz), valid_response_length-1] = torch.tensor(length_coef, dtype=torch.float32)    
+                modulated_scores = scores * modulated_scores  
+            elif self.config.length_penalty_type == 'cosine':
+                instance_scores = scores.sum(dim=-1)
+                bsz = scores.shape[0]
+                
+                # Pre-compute masks
+                is_binary_source = torch.tensor([ds in self.config.binary_score_data_sources for ds in data_sources], device=instance_scores.device)
+                is_selected_source = torch.tensor([ds in self.config.selected_data_sources for ds in data_sources], device=instance_scores.device)
+                
+                # Convert scores to binary where needed
+                cur_scores = torch.where(
+                    is_binary_source & (instance_scores > 0.99),
+                    torch.ones_like(instance_scores),
+                    torch.where(
+                        is_binary_source,
+                        torch.zeros_like(instance_scores),
+                        instance_scores
+                    )
+                )
+                
+                # Apply cosine length penalty only to selected sources
+                if is_selected_source.any():
+                    # Pre-compute progress values
+                    progress = valid_response_length.float() / self.config.max_length
+                    cosine = torch.cos(progress * math.pi)
+                    
+                    # Compute min and max values based on scores
+                    min_values = torch.where(
+                        cur_scores > 0.99,
+                        torch.full_like(cur_scores, self.config.min_value_correct),
+                        torch.full_like(cur_scores, self.config.max_value_wrong)
+                    )
+                    max_values = torch.where(
+                        cur_scores > 0.99,
+                        torch.full_like(cur_scores, self.config.max_value_correct),
+                        torch.full_like(cur_scores, self.config.min_value_wrong)
+                    )
+                    
+                    # Apply cosine formula
+                    r = min_values + 0.5 * (max_values - min_values) * (1.0 + cosine)
+                    
+                    # Apply linear penalty for low scores and long generations
+                    long_gen_mask = (cur_scores <= 0.99) & (valid_response_length + self.config.max_length_margin > self.config.max_length)
+                    if long_gen_mask.any():
+                        linear_penalty = (max_values - min_values) / self.config.max_length_margin * \
+                                       (valid_response_length - self.config.max_length + self.config.max_length_margin) + min_values
+                        r = torch.where(long_gen_mask, linear_penalty, r)
+                    
+                    # Only apply to selected sources
+                    cur_scores = torch.where(is_selected_source, r, cur_scores)
+
+                # Add repetition penalty all responses
+                modulated_scores = self.get_repetition_reward(response_ids)
+                modulated_scores = torch.where(response_mask.bool(), modulated_scores, torch.zeros_like(modulated_scores))
+
+                # Add length penalty
+                modulated_scores[torch.arange(bsz), valid_response_length-1] += cur_scores
+            else:
+                raise ValueError(f"Length penalty type {self.config.length_penalty_type} not supported")
+        
+        return modulated_scores
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_manager/naive.py b/trainer_integration/verl/verl_custom/nvidia/reward_manager/naive.py
new file mode 100644
index 000000000..44d8fe64c
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_manager/naive.py
@@ -0,0 +1,132 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from verl import DataProto
+from verl_custom.nvidia.reward_score import _default_compute_score, _remote_compute_score
+import torch
+import numpy as np
+import ray
+
+@ray.remote
+class NaiveRewardManager:
+    """The reward manager.
+    """
+
+    def __init__(self, tokenizer, config, compute_score=None) -> None:
+        self.tokenizer = tokenizer
+        self.config = config
+        self.num_examine = self.config.num_examine  # the number of batches of decoded responses to print to the console
+        self.compute_score = compute_score or _default_compute_score
+        if isinstance(self.config.server_ip, str):
+            self.server_ip = self.config.server_ip
+        else:
+            try: 
+                self.server_ip = list(self.config.server_ip)[0]
+                print(f"Detected list of server ips: {self.config.server_ip}. Please use prime reward manager for parallel processing. Currently using naive reward manager with single server ip: {self.server_ip}.")
+            except:
+                raise ValueError(f"server_ip should be a list or a string: {self.config.server_ip}")
+        self.use_remote_reward = self.config.use_remote_reward
+        if self.use_remote_reward:
+            self.compute_score = _remote_compute_score
+        self.binary_score = self.config.get('binary_score', False)
+        self.length_penalty = None
+        self.stop_properly_penalty = None
+
+    def set_length_penalty(self, length_penalty):
+        self.length_penalty = length_penalty
+    
+    def set_stop_properly_penalty(self, stop_properly_penalty):
+        self.stop_properly_penalty = stop_properly_penalty
+
+    def _apply_stop_properly_penalty(self, reward_tensor: torch.Tensor, batch: DataProto):
+        # apply stop properly penalty
+        # get coefficient from config
+        stop_penalty_coef = self.stop_properly_penalty
+        # stop_properly is a tensor of shape (bs) float type
+        stop_properly = batch.batch['stop_properly'].type(reward_tensor.dtype)
+
+        # if stop_properly is True, the value is 1.0, otherwise, it is self.config.stop_properly_penalty.penalty_coef 
+        stop_properly_scale = (1.0 - stop_properly) * stop_penalty_coef + stop_properly
+        stop_properly_scale = stop_properly_scale.unsqueeze(-1)
+        # apply the scale to the reward_tensor
+        reward_tensor = reward_tensor * stop_properly_scale
+        return reward_tensor
+            
+    def __call__(self, data: DataProto):
+        """We will expand this function gradually based on the available datasets"""
+
+        # If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn
+        if 'rm_scores' in data.batch.keys():
+            return data.batch['rm_scores']
+
+        reward_tensor = torch.zeros_like(data.batch['responses'], dtype=torch.float32)
+
+        already_print_data_sources = {}
+
+        for i in range(len(data)):
+            data_item = data[i]  # DataProtoItem
+
+            prompt_ids = data_item.batch['prompts']
+
+            prompt_length = prompt_ids.shape[-1]
+
+            response_ids = data_item.batch['responses']
+            valid_response_length = data_item.batch['attention_mask'][prompt_length:].sum()
+            valid_response_ids = response_ids[:valid_response_length]
+
+            # decode only response
+            sequences_str = self.tokenizer.decode(valid_response_ids)
+
+            ground_truth = []
+            for data_item in data:
+                if 'ground_truth' in data_item.non_tensor_batch['reward_model']:
+                    ground_truth.append(data_item.non_tensor_batch['reward_model']['ground_truth'].tolist() if isinstance(data_item.non_tensor_batch['reward_model']['ground_truth'], np.ndarray) else data_item.non_tensor_batch['reward_model']['ground_truth'])
+                else:
+                    ground_truth.append(None)
+
+            data_source = data_item.non_tensor_batch['data_source']
+            data_item.non_tensor_batch['server_ip'] = self.server_ip
+
+            score = self.compute_score(
+                data_source=data_source,
+                solution_str=sequences_str,
+                ground_truth=ground_truth,
+                extra_info=data_item.non_tensor_batch,
+            )
+
+            if score > 1.0 or score < 0.0:
+                print(f"Warning: score {score} is greater than 1.0 or less than 0.0 for completion: {sequences_str[:10]}, data source: {data_source}. Double check this is as intended.")
+
+            reward_tensor[i, valid_response_length - 1] = score
+
+            if data_source not in already_print_data_sources:
+                already_print_data_sources[data_source] = 0
+
+            if already_print_data_sources[data_source] < self.num_examine:
+                already_print_data_sources[data_source] += 1
+                print(sequences_str)
+
+        # score_tensor is the unmodulated score of the response
+        score_tensor = reward_tensor    
+        
+        # reward_tensor is the modulated score of the response
+        # after applying stop properly penalty and length penalty
+        if self.stop_properly_penalty is not None:
+            reward_tensor = self._apply_stop_properly_penalty(score_tensor, data)
+        
+        if self.length_penalty is not None:
+            data.batch['token_level_scores'] = reward_tensor
+            reward_tensor = self.length_penalty(data)
+
+        return {'score': score_tensor, 'reward': reward_tensor}
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_manager/prime.py b/trainer_integration/verl/verl_custom/nvidia/reward_manager/prime.py
new file mode 100644
index 000000000..87c619bc0
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_manager/prime.py
@@ -0,0 +1,261 @@
+# Copyright 2024 PRIME team and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+from concurrent.futures import ProcessPoolExecutor
+from functools import partial
+
+import aiohttp
+import numpy as np
+import ray
+import torch
+
+from verl import DataProto
+from verl_custom.nvidia.reward_score import (_default_compute_score,
+                                      _remote_compute_score_async)
+
+
+def remove_pad_token(input_ids: torch.Tensor, attention_mask: torch.Tensor):
+    """ Remove the pad token. Return tensors and not lists.
+
+    Args:
+        input_ids shape: [bs, seq_length]
+        attention_mask shape: [bs, seq_length]
+    Returns:
+        no_padding_batch(List[Tensor[int]]): contains the rmpad token ids per query.
+    """
+    no_padding_batch = []
+    for ids, mask in zip(input_ids, attention_mask):
+        # Fix for both left and right padding
+        no_padding_batch.append((ids[mask.bool()]))
+    return no_padding_batch
+
+
+async def single_compute_score(evaluation_func, completion, reference, task, executor, timeout=600, extra_info=None):
+    loop = asyncio.get_running_loop()
+    try:
+        # Ensure process_completion is called properly
+        tasks = [
+            asyncio.wait_for(
+                loop.run_in_executor(
+                    executor,
+                    partial(evaluation_func, task, completion, reference, extra_info)  # Ensure synchronous
+                ),
+                timeout=timeout)
+        ]
+        return await asyncio.gather(*tasks)
+    except asyncio.TimeoutError:
+        print(f"Timeout occurred for completion: {completion[:10]}, data source: {task}")
+        return None  # Default value for timed-out rows
+    except Exception as e:
+        print(f"Error processing completion: {completion[:10]}, data source: {task}, Error: {e}")
+        return None  # Default value for failed rows
+
+
+async def parallel_compute_score_async(evaluation_func, completions, references, tasks, extra_infos, num_processes=256):
+    scores = []
+    with ProcessPoolExecutor(max_workers=num_processes) as executor:
+        # Create tasks for all rows
+        tasks_async = [
+            single_compute_score(evaluation_func, completion, reference, task, executor, timeout=300, extra_info=extra_info)
+            for completion, reference, task, extra_info in zip(completions, references, tasks, extra_infos)
+        ]
+        # to prevent very occasional starvation caused by some anomalous programs ( like infinite loop ), the exceptions in async programs will instantly halt the evaluation, and all summoned processes will be killed.
+        try:
+            results = await asyncio.gather(*tasks_async, return_exceptions=False)
+        except:
+            for pid, proc in executor._processes.items():
+                try:
+                    proc.kill()
+                except Exception as kill_err:
+                    print('shut down failed: ' + str(kill_err))
+            raise
+
+    # Process results
+    for result, completion, reference, task in zip(results, completions, references, tasks):
+        if isinstance(result, Exception) or result is None:
+            # Handle failed or timed-out tasks
+            print(f"Error in parallel_compute_score_async. Completion: {completion[:10]}, data source: {task}, Error: {result}. Setting score to 0.")
+            scores.append(0.0)
+        elif isinstance(result[0], (int, float, bool)):
+            scores.append(float(result[0]))
+        else:
+            scores.append(float(result[0][0]))
+    return scores
+
+async def _remote_score_wrapper_with_semaphore(task, completion, reference, extra_info, session, semaphore):
+    async with semaphore:
+        return await _remote_compute_score_async(task, completion, reference, extra_info, session)
+
+async def parallel_remote_score_async(completions, references, tasks, extra_infos, max_concurrency=4096):
+    # Using client session to avoid spinning multiple sessions
+    # We avoid using ProcessPoolExecutor/ThreadPoolExecutor. might clash resource with server and lots of overhead.
+    scores = []
+    semaphore = asyncio.Semaphore(max_concurrency)
+    try:
+        timeout = aiohttp.ClientTimeout(total=600) # set larger than remote server timeout
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            tasks_async = [
+                _remote_score_wrapper_with_semaphore(task, completion, reference, extra_info, session, semaphore)
+                for completion, reference, task, extra_info in zip(completions, references, tasks, extra_infos)
+            ]
+            results = await asyncio.gather(*tasks_async, return_exceptions=False)
+    except Exception as e:
+        print(f"Error in parallel_remote_score_async. All results are set to 0. Error: {e}")
+        return [0. for _ in range(len(completions))]
+
+    # Process results
+    for result, completion, reference, task in zip(results, completions, references, tasks):
+        if isinstance(result, Exception) or result is None:
+            # Handle failed or timed-out tasks
+            print(f"Error in parallel_remote_score_async. Completion: {completion[:10]}, data source: {task}, Error: {result}. Setting score to 0.")
+            scores.append(0.0)
+        else:
+            result = float(result)
+            if result > 1.0 or result < 0.0:
+                print(f"Warning: score {result} is greater than 1.0 or less than 0.0 for completion: {completion[:10]}, data source: {task}. Double check this is as intended.")
+            scores.append(result)
+    return scores
+
+@ray.remote
+class PrimeRewardManager:
+    """
+    The Reward Manager used in https://github.com/PRIME-RL/PRIME
+    """
+
+    def __init__(self, tokenizer, config, compute_score=None) -> None:
+        self.tokenizer = tokenizer
+        self.config = config
+        self.num_examine = self.config.num_examine  # the number of batches of decoded responses to print to the console
+        self.compute_score = compute_score or _default_compute_score
+        if isinstance(self.config.server_ip, str):
+            self.server_ip = [self.config.server_ip]
+        else:
+            try: 
+                self.server_ip = list(self.config.server_ip)
+            except:
+                raise ValueError(f"server_ip should be a list or a string: {self.config.server_ip}")
+        self.use_remote_reward = self.config.use_remote_reward
+        if self.use_remote_reward:
+            assert self.config.max_concurrency >= len(self.server_ip), f"max_concurrency ({self.config.max_concurrency}) must be greater than or equal to the number of server IPs ({len(self.server_ip)})."
+            self.compute_score = _remote_compute_score_async
+            self.max_concurrency = self.config.max_concurrency
+        self.binary_score = self.config.get('binary_score', False)
+        self.length_penalty = None
+        self.stop_properly_penalty = None
+
+    def set_length_penalty(self, length_penalty):
+        self.length_penalty = length_penalty
+    
+    def set_stop_properly_penalty(self, stop_properly_penalty):
+        self.stop_properly_penalty = stop_properly_penalty
+
+    def _apply_stop_properly_penalty(self, reward_tensor: torch.Tensor, batch: DataProto):
+        # apply stop properly penalty
+        # get coefficient from config
+        stop_penalty_coef = self.stop_properly_penalty
+        # stop_properly is a tensor of shape (bs) float type
+        stop_properly = batch.batch['stop_properly'].type(reward_tensor.dtype)
+
+        # if stop_properly is True, the value is 1.0, otherwise, it is self.config.stop_properly_penalty.penalty_coef 
+        stop_properly_scale = (1.0 - stop_properly) * stop_penalty_coef + stop_properly
+        stop_properly_scale = stop_properly_scale.unsqueeze(-1)
+        # apply the scale to the reward_tensor
+        reward_tensor = reward_tensor * stop_properly_scale
+        return reward_tensor
+        
+    def __call__(self, data: DataProto):
+        """We will expand this function gradually based on the available datasets"""
+        # If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn
+        if 'rm_scores' in data.batch.keys():
+            return data.batch['rm_scores']
+
+        reward_tensor = torch.zeros_like(data.batch['responses'], dtype=torch.float32)
+
+        already_print_data_sources = {}
+
+        # batched scoring
+        prompt_ids = data.batch['prompts']
+        prompt_length = prompt_ids.shape[-1]
+
+        response_ids = data.batch['responses']
+        valid_response_length = data.batch['attention_mask'][:, prompt_length:].sum(dim=-1)
+        sequences = remove_pad_token(response_ids, data.batch['attention_mask'][:, prompt_length:])
+        sequences_str = self.tokenizer.batch_decode(sequences)
+
+        ground_truth = []
+        for data_item in data:
+            if 'ground_truth' in data_item.non_tensor_batch['reward_model']:
+                ground_truth.append(data_item.non_tensor_batch['reward_model']['ground_truth'].tolist() if isinstance(data_item.non_tensor_batch['reward_model']['ground_truth'], np.ndarray) else data_item.non_tensor_batch['reward_model']['ground_truth'])
+            else:
+                ground_truth.append(None)
+
+        data_sources = data.non_tensor_batch['data_source']
+
+        # Evenly distribute the data across the server IPs in round-robin manner
+        if len(data.batch) % len(self.server_ip) == 0:
+            data.non_tensor_batch['server_ip'] = np.array( self.server_ip * (len(data.batch) // len(self.server_ip)), dtype=object)
+        else:
+            server_ip_list = self.server_ip * (len(data.batch) // len(self.server_ip) + 1)
+            data.non_tensor_batch['server_ip'] = np.array(server_ip_list[:len(data.batch)], dtype=object)
+        extra_infos = [data_item.non_tensor_batch for data_item in data]
+
+        assert len(sequences_str) == len(ground_truth) == len(data_sources)
+        try:
+            if self.use_remote_reward:
+                scores = asyncio.run(
+                    parallel_remote_score_async(sequences_str,
+                                            ground_truth,
+                                            data_sources,
+                                            extra_infos,
+                                            self.max_concurrency))
+            else:
+                scores = asyncio.run(
+                    parallel_compute_score_async(self.compute_score,
+                                            sequences_str,
+                                            ground_truth,
+                                            data_sources,
+                                            extra_infos,
+                                            num_processes=64))
+        except asyncio.TimeoutError as e:
+            print('Global timeout in reward computing! Setting all as 0.')
+            scores = [0. for _ in range(len(sequences_str))]
+        except Exception as e:
+            print(f"Unexpected error in batched reward computing. Setting all as 0.: {e}")
+            scores = [0. for _ in range(len(sequences_str))]
+
+        for i in range(len(data)):
+            data_source = data_sources[i]
+            reward_tensor[i, valid_response_length[i].item() - 1] = scores[i]
+
+            if data_source not in already_print_data_sources:
+                already_print_data_sources[data_source] = 0
+
+            if already_print_data_sources[data_source] < self.num_examine:
+                already_print_data_sources[data_source] += 1
+                print(sequences_str)
+        
+        # score_tensor is the unmodulated score of the response
+        score_tensor = reward_tensor
+        
+        # reward_tensor is the modulated score of the response
+        # after applying stop properly penalty and length penalty
+        if self.stop_properly_penalty is not None:
+            reward_tensor = self._apply_stop_properly_penalty(score_tensor, data)
+
+        if self.length_penalty is not None:
+            data.batch['token_level_scores'] = reward_tensor
+            reward_tensor = self.length_penalty(data)
+
+        return {'score': score_tensor, 'reward': reward_tensor}
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_manager/swebench.py b/trainer_integration/verl/verl_custom/nvidia/reward_manager/swebench.py
new file mode 100644
index 000000000..e84ce9e8e
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_manager/swebench.py
@@ -0,0 +1,133 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import statistics
+from verl import DataProto
+from verl.utils.reward_score import _default_compute_score
+import torch
+import ray
+import logging
+import os
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "INFO"))
+
+@ray.remote
+class SWEBenchRewardManager:
+    """The reward manager.
+    """
+
+    def __init__(self, tokenizer, config, compute_score=None, num_examine=1) -> None:
+        self.data_source = "SWE-Gym/SWE-Gym"
+
+        self.tokenizer = tokenizer
+        self.num_examine = num_examine  # the number of batches of decoded responses to print to the console
+        self.verifier_func = compute_score or _default_compute_score
+        self.config = config
+    
+    def set_length_penalty(self, length_penalty):
+        self.length_penalty = length_penalty
+    
+    def set_stop_properly_penalty(self, stop_properly_penalty):
+        self.stop_properly_penalty = stop_properly_penalty
+    
+    def _apply_stop_properly_penalty(self, reward_tensor: torch.Tensor, batch: DataProto):
+        # apply stop properly penalty
+        # get coefficient from config
+        stop_penalty_coef = self.stop_properly_penalty
+        # stop_properly is a tensor of shape (bs) float type
+        stop_properly = 1.
+
+        # if stop_properly is True, the value is 1.0, otherwise, it is self.config.stop_properly_penalty.penalty_coef 
+        stop_properly_scale = (1.0 - stop_properly) * stop_penalty_coef + stop_properly
+        # apply the scale to the reward_tensor
+        reward_tensor = reward_tensor * stop_properly_scale
+        return reward_tensor
+    
+    
+    
+    def verify(self, data):
+        resolved = data.non_tensor_batch['resolved']
+        error = data.non_tensor_batch['error']
+        has_finish_action = data.non_tensor_batch['finish']
+        score = [0. for _ in range(len(resolved))]
+        for i, r in enumerate(resolved):
+            if r:
+                score[i] = 1.0
+
+        reward_metrics = {}
+        reward_metrics['max_turn_ratio'] = sum("RuntimeError: Agent reached maximum iteration in headless mode" in e for e in error if e) / len(error)
+        reward_metrics['finish_action_ratio'] = sum(has_finish_action) / len(has_finish_action)
+        reward_metrics['stuck_ratio'] = sum("stuck in a loop" in e for e in error if e) / len(error)
+
+        data.batch['acc'] = torch.tensor(score, dtype=torch.float32, device=data.batch['responses'].device)
+        for ability in list(set(data.non_tensor_batch['ability'])):
+            score_ = [data.batch['acc'][i].item() for i in range(len(data.batch['acc'])) if
+                      data.non_tensor_batch['ability'][i] == ability]
+            reward_metrics[f'{ability}'] = statistics.mean(score_)
+        reward_metrics['all'] = data.batch['acc'].mean().item()
+        
+        return score, reward_metrics
+    
+    def __call__(self, data: DataProto):
+        """We will expand this function gradually based on the available datasets"""
+        reward_tensor_dict={}
+        reward_metrics={}
+        reward_tensor = torch.zeros_like(data.batch['responses'], dtype=torch.float32)
+
+        verifier_reward=torch.zeros_like(data.batch['responses'], dtype=torch.float32)
+
+        response_ids = data.batch['responses']
+        response_length = response_ids.shape[-1]
+        valid_response_length = data.batch['attention_mask'][:, -response_length:].sum(-1)
+        
+        # if the batch already contains evaluation results, the verification is skipped here.
+        if 'acc' in data.batch:
+            verifier_score = data.batch['acc'].cpu().numpy().tolist()
+        else:
+            # verifier_score, verifier_metrics = self.verify(data)
+            # Use ray based concurrency
+            verifier_score, verifier_metrics = self.verify(data)
+            reward_metrics.update(verifier_metrics)
+        for i in range(verifier_reward.shape[0]):
+            verifier_reward[i, valid_response_length[i] - 1] = verifier_score[i]
+
+        reward_tensor_dict['gt_scores'] = verifier_reward
+        
+        if 'rm_scores' in data.batch.keys():
+            reward_tensor_dict['rm_scores'] = data.batch['rm_scores']
+            reward_metrics['reward_model']=data.batch['rm_scores'].sum(dim=1).mean().item()
+            if self.config.reward_model.rm_coef!=0:
+                reward_tensor += self.config.reward_model.rm_coef * reward_tensor_dict['rm_scores']
+
+        if self.config.verifier.reward_coef!=0: # TODO: check if this is correct
+            reward_metrics['verifier'] = reward_tensor_dict['gt_scores'].sum(dim=1).mean().item()
+            reward_tensor += self.config.verifier.reward_coef * reward_tensor_dict['gt_scores']
+
+        reward_tensor_dict['all'] = reward_tensor
+        # TODO: log the reward_metrics
+        reward_metrics['reward_all'] = reward_tensor.sum(dim=-1).mean(dim=0).item()
+        # reward_tensor is the modulated score of the response
+        # after applying stop properly penalty and length penalty
+        score_tensor=reward_tensor
+        data.batch['stop_properly']=torch.ones_like(reward_tensor, dtype=torch.float32) # HACK: this is a hack to make the stop properly penalty work
+        if self.stop_properly_penalty is not None:
+            reward_tensor = self._apply_stop_properly_penalty(reward_tensor, data)
+
+        if self.length_penalty is not None:
+            data.batch['token_level_scores'] = reward_tensor
+            reward_tensor = self.length_penalty(data)
+        #reward_tensor: [batch_size*NUM_TRAJ, message_length]
+        return {'score': score_tensor, 'reward': reward_tensor, 'reward_metrics': reward_metrics}
+
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/__init__.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/__init__.py
new file mode 100644
index 000000000..74c1ec40f
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/__init__.py
@@ -0,0 +1,146 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# from . import gsm8k, math, prime_math, prime_code
+
+def _default_compute_score(data_source, solution_str, ground_truth, extra_info=None):
+    if data_source == 'deepscaler':
+        from . import deepscaler
+        res = deepscaler.deepscaler_reward_fn(solution_str, ground_truth)
+    elif data_source[:9] == 'deepcoder':
+        # data_source will be named as deepcoder_<dataset_name>, e.g. deepcoder_codeforces
+        # this is to avoid collision with prime_code
+        from . import deepcoder
+        res = deepcoder.deepcoder_reward_fn(data_source[10:], solution_str, ground_truth)
+    elif data_source == 'openai/gsm8k':
+        from . import gsm8k
+        res = gsm8k.compute_score(solution_str, ground_truth)
+    elif data_source in ['lighteval/MATH', 'DigitalLearningGmbH/MATH-lighteval']:
+        from . import math
+        res = math.compute_score(solution_str, ground_truth)
+    elif data_source in [
+            'numina_aops_forum', 'numina_synthetic_math', 'numina_amc_aime', 'numina_synthetic_amc', 'numina_cn_k12',
+            'numina_olympiads'
+    ]:
+        from . import prime_math
+        res = prime_math.compute_score(solution_str, ground_truth)
+    elif data_source in ['codecontests', 'apps', 'codeforces', 'taco', 'rStar']:
+        from . import prime_code
+        res = prime_code.compute_score(solution_str, ground_truth, continuous=True)
+    elif data_source in ['tool_call']:
+        from . import toolcall
+        res = toolcall.compute_score(solution_str, ground_truth) 
+    elif data_source == 'if_eval':
+        from . import ifeval
+        import json
+        info = json.loads(ground_truth)
+        prompt = info["prompt"]
+        instruction_id_list = info["instruction_id_list"]
+        kwargs = info["kwargs"]
+        res = ifeval.compute_score_strict(prompt, solution_str, instruction_id_list, kwargs)
+    elif data_source == 'llm_judge':
+        from . import llm_judge
+        import os
+        import random
+        
+        # Extract information from ground_truth_info
+        question = ground_truth.get('prompt')
+        metadata = {
+            "reference_answer": ground_truth.get('answer'),
+            "extract_box": ground_truth.get('extract_box', False)
+        }
+        
+        # Get base_url from environment variable, support list format
+        base_url_env = os.getenv("OPENAI_BASE_URL", "https://integrate.api.nvidia.com/v1")
+        # Parse as comma-separated list and randomly select one
+        base_url_list = [url.strip() for url in base_url_env.split(',') if url.strip()]
+        base_url = random.choice(base_url_list) if len(base_url_list) > 1 else base_url_list[0]
+            
+        sampling_params = {
+            "base_url": base_url,
+            "api_key": os.getenv("OPENAI_API_KEY", ""),
+            "model": os.getenv("OPENAI_MODEL", "meta/llama-3.3-70b-instruct"),
+            "temperature": float(os.getenv("OPENAI_TEMPERATURE", "0.0")),
+            "top_p": float(os.getenv("OPENAI_TOP_P", "0.7")),
+            "top_k": int(os.getenv("OPENAI_TOP_K", "-1")),
+            "max_tokens": int(os.getenv("OPENAI_MAX_TOKENS", "1024")),
+            "max_model_len": int(os.getenv("OPENAI_MAX_MODEL_LEN", "8192"))
+        }
+        
+        res = llm_judge.compute_score(question, solution_str, metadata, sampling_params)
+    else:    
+        raise NotImplementedError
+
+    if isinstance(res, (int, float, bool)):
+        return float(res)
+    else:
+        return float(res[0])
+
+# Currently assuming all remote rewards are hosted on same server
+def _remote_compute_score(data_source, solution_str, ground_truth, extra_info=None, session=None):
+    import requests
+    session = session or requests
+    ip = extra_info['server_ip']
+    if data_source == 'reasoning_gym':
+        try:
+            res = session.post(f"http://{ip}:8288/score", json={"answer": solution_str, "entry": extra_info['reward_model']['entry'], "task": extra_info['reward_model']['reasoning_task']})
+            score = res.json()['score']
+            res = score
+        except Exception as e:
+            print(f"Error: {e}, ip: {ip}")
+            print(f"answer: {solution_str[:10]}, task: {extra_info['reward_model']['reasoning_task']}, entry: {extra_info['reward_model']['entry']}"[:100])
+            res = 0
+    else:
+        try: 
+            res = session.post(f"http://{ip}:8388/compute_score", json={"solution_str": solution_str, "ground_truth": ground_truth, "data_source": data_source})
+            score = res.json()['score']
+            res = score
+        except Exception as e:
+            print(f"Error: {e}, ip: {ip}")
+            print(f"answer: {solution_str[:10]}, data_source: {data_source}, ground_truth: {ground_truth}"[:100])
+            res = 0
+
+    if isinstance(res, (int, float, bool)):
+        return float(res)
+    else:
+        return float(res[0])
+
+async def _remote_compute_score_async(data_source, solution_str, ground_truth, extra_info=None, session=None):
+    import aiohttp
+    # please pass in session, spinning each session is very bad
+    session = session or aiohttp.ClientSession()
+    ip = extra_info['server_ip']
+    if data_source == 'reasoning_gym':
+        try:
+            async with session.post(f"http://{ip}:8288/score", json={"answer": solution_str, "entry": extra_info['reward_model']['entry'], "task": extra_info['reward_model']['reasoning_task']}) as res:
+                res = await res.json()
+                res = res['score']
+        except Exception as e:
+            print(f"Error: {e}, ip: {ip}")
+            print(f"answer: {solution_str[:10]}, task: {extra_info['reward_model']['reasoning_task']}, entry: {extra_info['reward_model']['entry']}"[:100])
+            res = 0
+    else:
+        try: 
+            async with session.post(f"http://{ip}:8388/compute_score", json={"solution_str": solution_str, "ground_truth": ground_truth, "data_source": data_source}) as res:
+                res = await res.json()
+                res = res['score']
+        except Exception as e:
+            print(f"Error: {e}, ip: {ip}")
+            print(f"answer: {solution_str[:10]}, data_source: {data_source}, ground_truth: {ground_truth}"[:100])
+            res = 0
+
+    if isinstance(res, (int, float, bool)):
+        return float(res)
+    else:
+        return float(res[0])
+
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/__init__.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/__init__.py
new file mode 100644
index 000000000..efe790884
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/__init__.py
@@ -0,0 +1,3 @@
+"""Import reward-related classes and types from the reward module."""
+
+from .code_reward import rllm_reward_fn_code as deepcoder_reward_fn
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_reward.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_reward.py
new file mode 100644
index 000000000..087e1bee0
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_reward.py
@@ -0,0 +1,385 @@
+
+"""
+This module contains the RewardCode class, which evaluates code datasets answers
+and assigns rewards based on their correctness on unit tests.
+"""
+import json
+import multiprocessing
+import re
+import time
+from multiprocessing import Manager
+from typing import List, Dict, Union
+import random
+import ast 
+
+from .code_utils.livecodebench import run_test as lcb_run_test
+from .code_utils.codeforces import run_test as codeforces_run_test
+from .code_utils.humanevalplus import run_test as humanevalplus_run_test, get_num_test_cases
+from .code_utils.taco import run_test as taco_run_test
+from .code_utils.firejail_exec import code_exec_firejail as lc_code_exec
+from .code_utils.kodcode import code_exec as kod_code_exec
+from .reward_types import RewardConfig, RewardFn, RewardInput, RewardOutput, RewardType
+
+
+def extract_code_from_model(model_response: str):
+    """
+    Extracts the code from a Markdown-style code block in an LLM output.
+
+    Parameters:
+        model_response (str): The text output from the LLM.
+
+    Returns:
+        str: The extracted code, or an empty string if no code block is found.
+    """
+    code_blocks = re.findall(r"```(?:\w+)?\n(.*?)```", model_response, re.DOTALL)
+    if not code_blocks:
+        return None
+    return code_blocks[-1].strip()
+
+
+def clean_code_main_block(code: str) -> str:
+    """
+    Removes `if __name__ == "__main__"` blocks from Python code.
+
+    Args:
+        code (str): The input Python code.
+
+    Returns:
+        str: Cleaned code without the main execution block.
+    """
+    code_lines = code.split('\n')
+    filtered_lines = []
+    skip_block = False
+
+    for line in code_lines:
+        if line.strip().startswith('if __name__ == "__main__"') or line.strip().startswith("if __name__ == '__main__'"):
+            skip_block = True
+            continue
+        if skip_block:
+            # Check if we're out of the block (less indentation)
+            if line.strip() and not line.startswith(' ') and not line.startswith('\t'):
+                skip_block = False
+            else:
+                continue
+        filtered_lines.append(line)
+
+    return '\n'.join(filtered_lines)
+
+
+def check_correctness(tests: Union[List[Dict[str, str]], Dict[str, List[str]]], code: str, test_fn, timeout_per_test: int = 12, max_tests: int = 15) -> bool:
+    """
+    Check if generated code passes all test cases within a timeout period.
+
+    Args:
+        tests: Test cases in either list of dictionaries or dictionary of lists format
+        code: Generated code to test
+        test_fn: Function to run tests
+        timeout: Maximum execution time in seconds before killing process
+
+    Returns:
+        bool: True if all tests pass, False otherwise
+
+    Raises:
+        AssertionError: If test results list is empty
+    """
+    manager = Manager()
+    test_results = manager.list()
+    def evaluate_code(tests, generation, debug, test_results, test_fn):
+        """Helper function to run tests in separate process."""
+        try:
+            test_results.append(test_fn(tests, test=generation, debug=debug, timeout=timeout_per_test))
+        except Exception as e:
+            print(f"Error in evaluate_code: {e}")
+    if isinstance(tests, list):
+        total_tests = len(tests)
+        if total_tests > max_tests:
+            # Sort indices by test input length and take the max_tests longest ones
+            selected_indices = sorted(range(total_tests), key=lambda i: len(tests[i]['input']), reverse=True)[:max_tests]
+            tests = [tests[i] for i in selected_indices]
+        num_tests = len(tests)
+    else:
+        total_tests = len(tests['inputs'])
+        if total_tests > max_tests:
+            # Select the tests with the longest input length.
+            selected_indices = sorted(range(total_tests), key=lambda i: len(tests['inputs'][i]), reverse=True)[:max_tests]
+            # Create a new dict with only the selected test cases
+            selected_tests = {
+                'inputs': [tests['inputs'][i] for i in selected_indices],
+                'outputs': [tests['outputs'][i] for i in selected_indices]
+            }
+            tests = selected_tests
+        num_tests = len(tests['inputs'])
+    
+    process = multiprocessing.Process(
+        target=evaluate_code,
+        args=(tests, code, False, test_results, test_fn)
+    )
+    process.start()
+    process.join()
+
+    if process.is_alive():
+        process.kill()
+    test_results = test_results[:]
+    if len(test_results) == 0:
+        return False
+    #assert len(test_results) == 1, f"Expected exactly one test result, but got {test_results}"
+    test_results = test_results[0]
+    test_results = [r==True for r in test_results]
+    return all(test_results)
+
+
+def postprocess_lcb_sample(sample):
+    sample_inputs = [sample['input'] for sample in sample]
+    sample_outputs = [sample['output'] for sample in sample]
+    
+    sample_dict = {
+        'inputs': sample_inputs,
+        'outputs': sample_outputs,
+    }
+    
+    if sample[0].get("testtype") == "functional":
+        metadata = sample[0].get("metadata", {})
+        fn_name = metadata.get("func_name", None)
+        assert fn_name is not None, f"Function name is not found, check if your LCB data is preprocessed correctly: {metadata}"
+        # Fill in the blank
+        sample_dict['fn_name'] = fn_name
+    
+    sample = {
+        'input_output': json.dumps(sample_dict),
+    }
+    return sample
+
+# https://huggingface.co/datasets/PrimeIntellect/verifiable-coding-problems
+def primeintellect_check_correctness(tests, code):
+    if isinstance(tests, str):
+        try:
+            tests =  ast.literal_eval(tests)
+            assert isinstance(tests, dict)
+        except (ValueError, SyntaxError) as e:
+            print(f"Error parsing string: {e}")
+            return False
+
+    assert len(tests) >= 1, "PrimeIntellect needs at least one test case"
+    # Convert the tests to the format expected by the taco_run_test function
+    inputs = [t['input'] for t in tests]
+    outputs = [t['output'] for t in tests]
+    fn_name = tests[0].get('fn_name', None)
+    tests = {
+        'inputs': inputs,
+        'outputs': outputs,
+    }
+    if fn_name:
+        tests['fn_name'] = fn_name
+    return check_correctness(tests, code, taco_run_test)
+
+def lcb_check_correctness_v2(sample, generation, timeout=6, debug=False):
+    """Check correctness of code generation with a global timeout.
+    The global timeout is to catch some extreme/rare cases not handled by the timeouts
+    inside `run_test`"""
+    assert len(sample) >= 1, "Sample must contain at least one test case"
+    sample = postprocess_lcb_sample(sample)
+
+    manager = multiprocessing.Manager()
+    result = manager.list()
+    metadata_list = manager.list()
+
+
+    def _temp_run(sample, generation, debug, result, metadata_list, timeout):
+        res, metadata = lcb_run_test(sample, test=generation, debug=debug, timeout=timeout)
+        result.append(res)
+        metadata_list.append(metadata)
+
+    p = multiprocessing.Process(
+        target=_temp_run,
+        args=(sample, generation, debug, result, metadata_list, timeout),
+    )
+    p.start()
+    p.join(
+        timeout=(timeout + 1) * len(json.loads(sample["input_output"])["inputs"]) + 5
+    )
+    if p.is_alive():
+        p.kill()
+    if not result:
+        in_outs = json.loads(sample["input_output"])
+        # consider that all tests failed
+        result = [[-1 for i in range(len(in_outs["inputs"]))]]
+        if debug:
+            print(f"global timeout")
+    if not result:
+        return False
+    # print(result[0], metadata_list)
+    # Check if all elements in result[0] are True
+    return all(x == True for x in result[0])
+
+
+def leetcode_check_correctness(tests: List[Dict[str, str]], code: str) -> bool:
+     """
+     Check if generated code passes all LeetCode test cases.
+    
+     Args:
+          tests: List of test cases, each containing input/output pairs
+          code: Generated code to test
+          timeout: Maximum execution time in seconds before killing process
+          runtime_debug: Whether to print debug info during test execution
+    
+     Returns:
+          bool: True if all tests pass and result list exists, False otherwise
+     """
+     succ, output = lc_code_exec(code + '\n' + tests["functional"])
+     if not succ:
+         print(f"Error in code execution: {output}")
+     return succ
+
+def kodcode_check_correctness(test: str, code: str, timeout_per_test: int = 5) -> bool:
+    """
+    Check if generated code passes all Kodcode test cases.
+    
+    Args:
+        test: String of the test file content
+        code: Generated code to test
+        timeout: Maximum execution time in seconds before killing process
+        runtime_debug: Whether to print debug info during test execution
+    
+    Returns:
+        bool: True if all tests pass and result list exists, False otherwise
+    """
+    # Count the number of test functions in the test file
+    num_tests = test.count('def test')
+
+    # Remove 'if __name__ == "__main__":' block if present
+    code = clean_code_main_block(code)
+    
+    succ, output = kod_code_exec(code, test, timeout_per_test * num_tests)
+    if not succ:
+        print(f"Error in code execution: {output}")
+    return succ
+
+def humanevalplus_check_correctness(test: str, code: str, timeout_per_test: int = 1) -> bool:
+    """
+    Check if generated code passes all HumanEvalPlus test cases.
+    
+    Args:
+        test: String of the test file content
+        code: Generated code to test
+        timeout: Maximum execution time in seconds before killing process
+        runtime_debug: Whether to print debug info during test execution
+    
+    Returns:
+        bool: True if all tests pass and result list exists, False otherwise
+    """
+    code = clean_code_main_block(code)
+    num_test_cases = get_num_test_cases(test)
+    succ, output = humanevalplus_run_test(code, test, timeout_per_test * num_test_cases)
+    if not succ:
+        print(f"Error in code execution: {output}")
+    return succ
+
+class RewardCodeFn(RewardFn):
+    """
+    Reward function for evaluating code dataset answers.
+
+    This class implements the __call__ method to process the input and determine
+    the reward based on the correctness of the unit tests provided
+    """
+    def __call__(self, input: RewardInput) -> RewardOutput:
+        total_start_time = time.time()
+
+        assert input.problem_type == RewardType.CODE, \
+            "Invalid problem type: expected 'CODE', but got '{}'".format(input.problem_type)
+
+        model_response= input.model_response
+        metadata = input.metadata
+        
+        dataset_name = input.data_source
+        tests = metadata
+        if tests is None:
+            print("No tests found in metadata")
+            return RewardOutput(reward=self.config.format_error_reward, is_correct=False)
+
+        model_code = extract_code_from_model(model_response)
+        if model_code is None:
+            # print("No code found in model response")
+            return RewardOutput(reward=self.config.format_error_reward, is_correct=False)
+
+        # Tests: List[Dictionary] - Codeforces, LiveCodeBench
+        # Tests: Dictionary[Lists] - CodeContests, Taco/Apps
+        is_correct = False
+        if dataset_name in ["taco", "apps", "code_contests"]:
+            test_fn = taco_run_test
+            is_correct = check_correctness(tests, model_code, test_fn)
+        elif dataset_name == "codeforces":
+            test_fn = codeforces_run_test
+            is_correct = check_correctness(tests, model_code, test_fn)
+        elif dataset_name == "leetcode":
+            is_correct = leetcode_check_correctness(tests, model_code)
+        elif dataset_name == "livecodebench":
+            is_correct = lcb_check_correctness_v2(tests, model_code, debug=False)
+        elif dataset_name == "primeintellect":
+            is_correct = primeintellect_check_correctness(tests, model_code)
+        elif dataset_name == "kodcode":
+            is_correct = kodcode_check_correctness(tests, model_code)
+        elif dataset_name == "humanevalplus":
+            is_correct = humanevalplus_check_correctness(tests, model_code)
+        else:
+            is_correct = check_correctness(tests, model_code, test_fn)
+
+        total_time = time.time() - total_start_time
+        # print(f"Total reward function execution time: {total_time:.2f} seconds")
+
+        if is_correct:
+            return RewardOutput(reward=self.config.correct_reward, is_correct=True)
+        else:
+            return RewardOutput(reward=self.config.incorrect_reward, is_correct=False)
+
+def rllm_reward_fn_code(data_source: str, llm_solution: str, ground_truth: Dict, **kwargs):
+    """Evaluate code solutions against ground truth ansters
+    
+    This function creates a reward function to evaluate code solutions by pass the test_case from groun_truth. It can optionally use a language model
+    for more sophisticated answer validation.
+
+    Args:
+        data_source: The source/dataset the problem comes from
+        llm_solution: The solution string provided by the language model to evaluate
+        ground_truth: some tests for this llm_solution
+        enable_llm: Whether to enable language model validation for complex cases (default: False)
+
+    Returns:
+        bool: True if the solution passes all the test_case, False otherwise
+
+    Example:
+            model_response = '''
+import sys
+from itertools import permutations
+def main():
+    n,m=map(int, input().split()) 
+    a=sum(list(map(int, input().split()))) 
+    if a+(n-1)*10<=m: 
+        print(5) 
+    else: 
+        print(5)
+if __name__ == "__main__":
+    main()
+'''
+    
+    print(f"test the code_forces")
+    # tests = [ { "input": "3 30\n2 2 1", "output": "5" }, { "input": "3 10\n3 2 1", "output": "5" } ] 
+    metadata = {
+         "tests": tests,
+    }
+    True
+    """
+    
+    ground_truth = json.loads(ground_truth)
+
+    reward_config = RewardConfig()
+    reward_fn = RewardCodeFn(reward_config)
+    reward_response = reward_fn(
+        RewardInput(
+            problem=None,
+            problem_type=RewardType.CODE,
+            data_source=data_source,
+            model_response=llm_solution,
+            metadata=ground_truth
+        ))
+    return reward_response.is_correct
+  
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/codeforces.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/codeforces.py
new file mode 100644
index 000000000..ce3bb65b1
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/codeforces.py
@@ -0,0 +1,615 @@
+# modifed from https://github.com/hendrycks/apps/blob/main/eval/testing_util.py to fix some evaluation bugs and add instructions
+from .pyext2 import RuntimeModule
+import signal
+import numpy as np
+
+# used for debugging to time steps
+from datetime import datetime
+
+import os, sys, json
+import faulthandler
+
+import subprocess
+import tempfile
+import inspect
+from enum import Enum
+from unittest.mock import patch, mock_open
+from io import StringIO
+import platform
+
+class CODE_TYPE(Enum):
+    call_based = 0
+    standard_input = 1
+
+class Capturing(list):
+    def __enter__(self):
+        self._stdout = sys.stdout
+        sys.stdout = self._stringio = StringIO()
+        # Make closing the StringIO a no-op
+        self._stringio.close = lambda x: 1
+        return self
+    def __exit__(self, *args):
+        self.extend(self._stringio.getvalue().splitlines())
+        del self._stringio    # free up some memory
+        sys.stdout = self._stdout
+
+# to run the solution files we're using a timing based approach
+import signal
+# stuff for setting up signal timer
+class TimeoutException(Exception):
+    pass
+def timeout_handler(signum, frame):
+    print(f"alarm went off")
+    # return
+    raise TimeoutException
+signal.signal(signal.SIGALRM, timeout_handler)
+TIMEOUT = 4  # seconds
+
+EXECUTION_RESULTS = {1: "passed", 0: "false", -1: "timeout", -2: "runtime_error", -3: "returncode:{code}", -4: "compile_error"}
+
+def run_test(in_outs, test=None, debug=False, timeout=TIMEOUT):
+    """
+    if test(generated_code) is not None it'll try to run the code.
+    otherwise it'll just return an input and output pair.
+    """
+    if in_outs is None or len(in_outs) == 0: 
+        return []
+    #test_cases:[ { "input": "3 6 9", "output": "6" }, { "input": "4 4 4", "output": "4" }, { "input": "0 0 0", "output": "0" } ]
+    #test_cases: [ { "input": "20 40 60 80 100\n0 1 2 3 4\n1 0", "output": "4900" }, { "input": "119 119 119 119 119\n0 0 0 0 0\n10 0", "output": "4930" }]
+    if in_outs:
+        if in_outs[0].get("fn_name") is None:
+            fn_name = in_outs[0].get("fn_name")
+            which_type = CODE_TYPE.standard_input  # Standard input
+            method_name = None
+        else:
+            which_type = CODE_TYPE.call_based  # Call-based
+            method_name = in_outs["fn_name"]
+    inputs_list = []
+    outputs_list = []
+    for index, in_out in enumerate(in_outs):
+        inputs = in_out["input"] #"20 40 60 80 100\n0 1 2 3 4\n1 0"
+        outputs = in_out["output"] #"4900"
+        inputs, outputs = process_input_output(inputs, outputs)
+        inputs_list.append(inputs)
+        outputs_list.append(outputs)
+
+    if debug:
+        print(f"loaded input_output = {datetime.now().time()}")
+    if test is None:
+        return None
+    elif test is not None:
+        results = []
+        if debug:
+            print(f"loading test code = {datetime.now().time()}")
+        if which_type == CODE_TYPE.call_based:
+            synthesized_code = synthesize_cb_code(test, debug)
+            method_func = compile_and_get_func(synthesized_code, which_type, method_name, timeout=timeout, debug=debug)
+        elif which_type == CODE_TYPE.standard_input:
+            synthesized_code, exec_code = synthesize_std_code(test, debug)
+            method_func = compile_and_get_func(synthesized_code, which_type, method_name, timeout=timeout, debug=debug)
+        if not method_func:
+            results.append(-2)
+            return results
+        else:
+            if which_type == CODE_TYPE.call_based:  # Call-based
+                detail_results, debug_infos = execute_cb_code(method_func, inputs_list, outputs_list, timeout=timeout, early_stop=True, debug=debug)
+            elif which_type == CODE_TYPE.standard_input:
+                detail_results = execute_std_code(method_func, exec_code, inputs_list, outputs_list, timeout=timeout, early_stop=True, debug=debug)
+                debug_infos = detail_results.get('debug', None)
+                detail_results = {k:v for k, v in detail_results.items() if k!='debug'}
+                if set(detail_results.values()) == {(False, 'returncode:1')}:
+                    detail_results = execute_std_code(method_func, synthesized_code+'\ncode()\n', inputs_list, outputs_list, timeout=timeout, early_stop=True, debug=debug)
+        if isinstance(detail_results, list):
+            if len(detail_results) == 1:
+                detail_results = detail_results * len(inputs_list)
+            detail_results = dict(zip([i for i in range(len(inputs_list))], detail_results))
+        for test_id, test_result in detail_results.items():
+            if test_result[1] == "passed":
+                results.append(True)
+            elif test_result[1] == "false":
+                results.append(False)
+            elif test_result[1] == "timeout":
+                results.append(-1)
+            else:
+                results.append(-3)
+        return results
+
+def process_input_output(inputs, outputs):
+    # JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
+    try:
+        if isinstance(inputs[0], dict):
+            inputs = [{int(k): v for k,v in inputs[0].items()}]
+    except:
+        True
+    
+    try:
+        if isinstance(outputs, dict):
+            outputs = [{int(k): v for k,v in outputs.items()}]
+    except:
+        True
+
+    try:
+        if isinstance(outputs[0], dict):
+            outputs = [{int(k): v for k,v in outputs[0].items()}]
+    except:
+        True
+    
+    return inputs, outputs
+
+def compile_and_get_func(program, which_type, method_name, timeout, debug):
+    try:
+        signal.alarm(timeout)
+        tmp_sol = RuntimeModule.from_string("tmp_sol", "", program)
+        if which_type == CODE_TYPE.call_based and "class Solution" in program:
+            tmp = tmp_sol.Solution()
+        else:
+            tmp = tmp_sol
+        signal.alarm(0)
+    except Exception as e:
+        signal.alarm(0)
+        if debug:
+            print(f"compilation error = {e}")
+        return False
+    
+    if which_type == CODE_TYPE.call_based:
+        assert isinstance(method_name, str)
+    else:
+        method_name = "code"
+    
+    try:
+        signal.alarm(timeout)
+        method = getattr(tmp, method_name)  # get_attr second arg must be str
+        signal.alarm(0)
+    except:
+        signal.alarm(0)
+        e = sys.exc_info()
+        if debug:
+            print(f"unable to get function error = {e}")
+        return False
+    return method
+
+def synthesize_cb_code(raw_code, debug=False):
+    sol = "import sys\nimport time\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"
+    if debug:
+        print(f"loading test code = {datetime.now().time()}")
+    sol += raw_code
+    return sol
+
+def synthesize_std_code(raw_code, debug=False):
+    normal_import_lines = "import sys\nimport time\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"
+    if debug:
+        print(f"loading test code = {datetime.now().time()}")
+    
+    sol = "" # code for compile
+    sol2 = "" # code for execute
+
+    tmp_test = raw_code.split("\n")
+    # define the code line type, 1 for import lines, 2 for import * lines with indent, 0 for normal codes
+    code_types = [] 
+
+
+    for x in tmp_test:
+        if 'import *' in x:
+            code_types.append(2)
+        elif x.startswith("from ") or x.startswith("import "):
+            code_types.append(1) 
+        else:
+            code_types.append(0)
+    
+    started = False
+
+    special_import_lines = [i.lstrip('\t') for idx, i in enumerate(tmp_test) if code_types[idx]==2]
+    special_import_lines = '\n'.join(special_import_lines)
+
+    for idx, i in enumerate(tmp_test):
+        code_type = code_types[idx]
+        if code_type == 0 and not started:
+            sol2 += normal_import_lines
+            sol2 += "\nstdin = sys.stdin\nstdout = sys.stdout\n"
+            sol2 += f"{i}\n"
+
+            sol += normal_import_lines
+            sol += special_import_lines
+            sol += "\nstdin = sys.stdin\nstdout = sys.stdout\n"
+            sol += "def code():\n"
+            sol += f"\t{i}\n"
+            started = True
+        else:
+            sol2 += f"{i}\n"
+            if code_type < 2:
+                if started:
+                    sol += '\t'
+                sol += f"{i}\n"
+    
+    if debug:
+        print(f"sol = {sol}")
+        print(f"sol2 = {sol2}")
+    
+    return sol, sol2
+
+def call_method(method, inputs):
+
+    if isinstance(inputs, list):
+        inputs = "\n".join(inputs)
+
+    inputs_line_iterator = iter(inputs.split("\n"))
+
+    # sys.setrecursionlimit(10000)
+
+    # @patch('builtins.input', side_effect=inputs.split("\n"))
+    @patch('builtins.open', mock_open(read_data=inputs))
+    @patch('sys.stdin', StringIO(inputs))
+    @patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
+    @patch('sys.stdin.readlines', lambda *args: inputs.split("\n"))
+    @patch('sys.stdin.read', lambda *args: inputs)
+    # @patch('sys.stdout.write', print)
+    def _inner_call_method(_method):
+        try:
+            return _method()
+        except SystemExit as e:
+            pass
+        finally:
+            pass
+    return _inner_call_method(method) 
+
+def execute_cb_code(method, inputs_list, outputs_list, timeout, early_stop=True, debug=True):
+    # Disable functionalities that can make destructive changes to the test.
+    reliability_guard()
+    results = []
+    debug_infos = {}
+    for index, inputs in enumerate(inputs_list):
+        if debug:
+            debug_infos[index] = {}
+        outputs = outputs_list[index]
+        try:
+            signal.alarm(timeout) 
+            faulthandler.enable()
+            exec_outputs = method(*inputs)
+            signal.alarm(0)
+            faulthandler.disable()
+        except Exception as e:
+            signal.alarm(0)
+            faulthandler.disable()
+            if debug:
+                print(f"Standard input runtime error = {e}")
+            if early_stop:
+                for i in range(index, len(inputs_list)):
+                    results.append((False, EXECUTION_RESULTS[-2]))
+                break
+            else:
+                continue
+        try:
+            # ground truth sequences are not tuples
+            if isinstance(exec_outputs, tuple):
+                exec_outputs = list(exec_outputs)
+            
+            tmp_result = exec_outputs == outputs
+            if isinstance(outputs, list) and outputs:
+                tmp_result = tmp_result or (exec_outputs == outputs[0])
+
+            # ground truth sequences are not tuples
+            try:
+                if isinstance(exec_outputs[0], tuple):
+                    exec_outputs = [list(x) for x in exec_outputs]
+                    tmp_result = tmp_result or (exec_outputs == outputs[0])
+            except:
+                True
+            if tmp_result:
+                results.append((True, EXECUTION_RESULTS[1]))
+            else:
+                results.append((False, EXECUTION_RESULTS[0]))
+        except Exception as e:
+            if debug:
+                print(f"Standard input time limit exceeded error = {e}")
+            results.append((False, EXECUTION_RESULTS[-1]))
+            continue
+
+        if debug:
+            print(f"outputs = {exec_outputs}, test outputs = {outputs}, inputs = {inputs}, {type(inputs)}, {tmp_result}")
+            debug_infos[index] = {
+                    'inputs': inputs,
+                    'gt_outputs': outputs,
+                    'exec_outputs': exec_outputs
+                }
+    return results, debug_infos
+
+def remove_tmp_files():
+    tmp_files = ['input.txt', 'output.txt']
+    for tmp_file in tmp_files:
+        if tmp_file in os.listdir('.'):
+            os.remove(tmp_file)
+#TODO(Xiaoxiang): has to modify that 
+def execute_std_code(method, synthesized_code, inputs_list, outputs_list, timeout, early_stop=False, debug=False):
+    temp_program_path = create_temp_file(synthesized_code)
+    if debug:
+        print("Test program:", temp_program_path)
+    assert isinstance(inputs_list, list) and isinstance(outputs_list, list)
+    assert len(inputs_list) == len(outputs_list)
+    exec_results = {}
+    if debug:
+        exec_results['debug'] = {}
+    for i, inputs in enumerate(inputs_list):
+        remove_tmp_files()
+        outputs = outputs_list[i]
+        if isinstance(inputs, list):
+            inputs = "\n".join(inputs)
+        if isinstance(outputs, list):
+            outputs = "\n".join(outputs)
+        
+        try:
+            result = subprocess.run(['python', temp_program_path], input=inputs, text=True, capture_output=True, timeout=timeout)  
+            exec_code = 999
+        except subprocess.TimeoutExpired:
+            exec_code = -1
+        except Exception as e:
+            print(e)
+            exec_code = -2
+
+        if exec_code > 0:
+            # if result.returncode != 0:
+            #     try:
+            #         inputs_tmp_file = open(create_temp_file(inputs), 'r')
+            #         result = subprocess.run(['python', temp_program_path], stdin=inputs_tmp_file, text=True, capture_output=True, timeout=timeout)
+            #         assert result.returncode == 0
+            #         if compare_std_results(result.stdout, outputs, debug):
+            #             exec_code = 1
+            #         else:
+            #             exec_code = 0
+            #     except:
+            #         try:
+            #             inputs_tmp_file = 'input.txt'
+            #             with open(inputs_tmp_file, 'w') as ftemp:
+            #                 ftemp.write(inputs)
+            #             result = subprocess.run(['python', temp_program_path], text=True, timeout=timeout)
+            #             assert result.returncode == 0
+            #             if compare_std_results(open('output.txt').read(), outputs, debug):
+            #                 exec_code = 1
+            #             else:
+            #                 exec_code = 0
+                        
+            #         except:
+            #             exec_code = -3
+            if compare_std_results(result.stdout, outputs, debug):
+                exec_code = 1
+            else:
+                exec_code = 0
+        assert exec_code != -3
+        exec_results[i] = (exec_code==1, EXECUTION_RESULTS[exec_code] if exec_code>-3 else EXECUTION_RESULTS[exec_code].format(result.returncode))
+        if exec_code >= 0:
+            if debug:
+                print_debug_info(inputs=inputs, outputs=outputs, exec_outputs=result.stdout)
+                exec_results['debug'][i] = {
+                    'inputs': inputs,
+                    'gt_outputs': outputs,
+                    'exec_outputs': result.stdout
+                }
+        if early_stop and exec_code<=0:
+            break
+    return exec_results
+
+def print_debug_info(inputs, outputs, exec_outputs):
+    nl = "\n"
+    if not isinstance(inputs, list):
+        print(f"exec output = {exec_outputs}, test outputs = {outputs}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {exec_outputs == [outputs]}")
+    else:
+        print(f"exec output = {exec_outputs}, test outputs = {outputs}, inputs = {inputs}, {type(inputs)}, {exec_outputs == [outputs]}")
+
+def create_temp_file(content):
+    with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_file:
+        temp_file.write(content)
+        temp_file_path = temp_file.name
+    return temp_file_path
+
+def compare_std_results(exec_outputs, outputs, debug=False):
+    if stripped_string_compare(exec_outputs, outputs):
+        return True
+    
+    if isinstance(exec_outputs, list):
+        output_1 = "\n".join(exec_outputs)
+        if stripped_string_compare(output_1, outputs):
+            return True
+    
+    if isinstance(exec_outputs, list):
+        output_2 = [o.lstrip().rstrip() for o in exec_outputs]
+        output_2 = "\n".join(output_2)
+        if stripped_string_compare(output_2, outputs):
+            return True
+    
+    tmp_result = False
+    # ground truth sequences are expressed as lists not tuples
+    if isinstance(outputs, tuple):
+        outputs = list(outputs)
+    
+    try:
+        tmp_result = (exec_outputs == [outputs])
+        if isinstance(outputs, list):
+            tmp_result = tmp_result or (exec_outputs == outputs)
+            if isinstance(exec_outputs[0], str):
+                tmp_result = tmp_result or ([e.strip() for e in exec_outputs] == outputs)
+    except Exception as e:
+        if debug:
+            print(f"Failed check1 exception = {e}")
+        pass
+    if tmp_result:
+        return True
+    
+    # try one more time without \n
+    if isinstance(outputs, list):
+        for tmp_index, i in enumerate(outputs):
+            outputs[tmp_index] = i.split("\n")
+            outputs[tmp_index] = [x.strip() for x in outputs[tmp_index] if x]
+    else:
+        outputs = outputs.split("\n")
+        outputs = list(filter(len, outputs))
+        outputs = list(map(lambda x:x.strip(), outputs))
+    
+    try:
+        tmp_result = (exec_outputs == [outputs])
+        if isinstance(outputs, list):
+            tmp_result = tmp_result or (exec_outputs == outputs)
+    except Exception as e:
+        if debug:
+            print(f"Failed check2 exception = {e}")
+        pass
+    if tmp_result:
+        return True
+    
+    # try by converting the output into a split up list too
+    if isinstance(exec_outputs, list):
+        exec_outputs = list(filter(len, exec_outputs))
+    try:
+        tmp_result = (exec_outputs == [outputs])
+        if isinstance(outputs, list):
+            tmp_result = tmp_result or (exec_outputs == outputs)
+    except Exception as e:
+        if debug:
+            print(f"Failed check3 exception = {e}")
+        pass
+    if tmp_result:
+        return True
+    
+
+    try:
+        output_float = [float(e) for e in exec_outputs]
+        gt_float = [float(e) for e in outputs]
+        tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float))
+    except Exception as e:
+        pass
+    try:
+        if isinstance(exec_outputs[0], list):
+            output_float = [float(e) for e in exec_outputs[0]]
+            gt_float = [float(e) for e in outputs[0]]
+            tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float))
+    except Exception as e:
+        pass
+    if tmp_result:
+        return True
+
+    
+    if isinstance(outputs, list):
+        for tmp_index, i in enumerate(outputs):
+            outputs[tmp_index] = set(i.split())
+    else:
+        outputs = set(outputs.split())
+
+    try:
+        tmp_result = (exec_outputs == outputs)
+    except Exception as e:
+        if debug:
+            print(f"Failed check4 exception = {e}")
+    if tmp_result:
+        return True
+    
+    # try by converting the output into a split up list too
+    if isinstance(exec_outputs, list):
+        for tmp_index, i in enumerate(exec_outputs):
+            exec_outputs[tmp_index] = i.split()
+        exec_outputs = list(filter(len, exec_outputs))
+        for tmp_index, i in enumerate(exec_outputs):
+            exec_outputs[tmp_index] = set(i)    
+    else:
+        exec_outputs = exec_outputs.split()
+        exec_outputs = list(filter(len, exec_outputs))
+        exec_outputs = set(exec_outputs)
+    try:
+        tmp_result = (set(frozenset(s) for s in exec_outputs) == set(frozenset(s) for s in outputs))
+    except Exception as e:
+        if debug:
+            print(f"Failed check5 exception = {e}")
+    
+    # if they are all numbers, round so that similar numbers are treated as identical
+    try:
+        tmp_result = tmp_result or (set(frozenset(round(float(t),3) for t in s) for s in exec_outputs) ==\
+            set(frozenset(round(float(t),3) for t in s) for s in outputs))
+    except Exception as e:
+        if debug:
+            print(f"Failed check6 exception = {e}")
+
+    if tmp_result:
+        return True
+    
+    return False
+
+def stripped_string_compare(s1, s2):
+    s1 = s1.lstrip().rstrip()
+    s2 = s2.lstrip().rstrip()
+    return s1 == s2
+
+def reliability_guard(maximum_memory_bytes=None):
+    """
+    This disables various destructive functions and prevents the generated code
+    from interfering with the test (e.g. fork bomb, killing other processes,
+    removing filesystem files, etc.)
+    WARNING
+    This function is NOT a security sandbox. Untrusted code, including, model-
+    generated code, should not be blindly executed outside of one. See the
+    Codex paper for more information about OpenAI's code sandbox, and proceed
+    with caution.
+    """
+
+    if maximum_memory_bytes is not None:
+        import resource
+
+        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
+        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
+        if not platform.uname().system == "Darwin":
+            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
+
+    faulthandler.disable()
+
+    import builtins
+
+    builtins.exit = None
+    builtins.quit = None
+
+    import os
+
+    os.environ["OMP_NUM_THREADS"] = "1"
+
+    os.kill = None
+    os.system = None
+    os.putenv = None
+    os.remove = None
+    os.removedirs = None
+    os.rmdir = None
+    os.fchdir = None
+    os.setuid = None
+    os.fork = None
+    os.forkpty = None
+    os.killpg = None
+    os.rename = None
+    os.renames = None
+    os.truncate = None
+    os.replace = None
+    os.unlink = None
+    os.fchmod = None
+    os.fchown = None
+    os.chmod = None
+    os.chown = None
+    os.chroot = None
+    os.fchdir = None
+    os.lchflags = None
+    os.lchmod = None
+    os.lchown = None
+    os.getcwd = None
+    os.chdir = None
+
+    import shutil
+
+    shutil.rmtree = None
+    shutil.move = None
+    shutil.chown = None
+
+    import subprocess
+
+    subprocess.Popen = None  # type: ignore
+
+    __builtins__["help"] = None
+
+    import sys
+
+    sys.modules["ipdb"] = None
+    sys.modules["joblib"] = None
+    sys.modules["resource"] = None
+    sys.modules["psutil"] = None
+    sys.modules["tkinter"] = None
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/firejail_exec.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/firejail_exec.py
new file mode 100644
index 000000000..c44e6e9d1
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/firejail_exec.py
@@ -0,0 +1,83 @@
+# https://github.com/ganler/code-r1/blob/main/verl/utils/reward_score/coder1/firejail_exec.py
+import os
+import subprocess
+
+from tempfile import NamedTemporaryFile, TemporaryDirectory
+from .utils import BASE_IMPORTS, BASE_LEETCODE_IMPORTS
+
+
+# sudo add-apt-repository ppa:deki/firejail
+# sudo apt-get update
+# sudo apt-get install firejail firejail-profiles
+
+CLI_ARG_SIZE_LIMIT = 1024 * 3
+
+_ERROR_MSG_PREFIX = "Failed to execute program: "
+_DEFAULT_TIMEOUT_SECONDS = 30
+
+
+def code_exec_firejail(code, stdin: str = None, timeout=_DEFAULT_TIMEOUT_SECONDS, pytest: str = None):
+    env = os.environ.copy()
+    env["OPENBLAS_NUM_THREADS"] = "1"
+
+    # Build the firejail command with resource limits and cleanup options
+    command = [
+        "firejail",
+        "--private",
+        "--quiet",
+        "--seccomp=socket",
+        "--profile=pip",
+        "--rlimit-nproc=32",
+        "--rlimit-nofile=32",
+        "--rlimit-fsize=2097152",  # Limit file size
+        "--rlimit-as=4294967296",
+        f"--timeout=00:00:{timeout}",
+    ]
+
+    if pytest:
+        # solution is in {tmpdir}/solution.py
+        with TemporaryDirectory() as tmpdir:
+            assert stdin is None, "STDIN is not supported with pytest"
+            # Write the solution to a file
+            with open(os.path.join(tmpdir, "solution.py"), "w") as f:
+                f.write(code)
+            with open(os.path.join(tmpdir, "test_solution.py"), "w") as f:
+                f.write(pytest)
+            command.insert(4, f"--whitelist={tmpdir}")
+            command.extend(["python3", "-m", "pytest", tmpdir])
+            result = subprocess.run(
+                command,
+                cwd=tmpdir,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                env=env,
+                check=False,
+            )
+    else:
+        code = BASE_IMPORTS + "\n" + BASE_LEETCODE_IMPORTS + "\n" + code
+        if len(code) < CLI_ARG_SIZE_LIMIT:
+            command.extend(["python3", "-c", code])
+            result = subprocess.run(command,
+                                    input=stdin.encode() if stdin else None,
+                                    stdout=subprocess.PIPE,
+                                    stderr=subprocess.PIPE,
+                                    env=env,
+                                    check=False)
+        else:
+            with NamedTemporaryFile(dir="/tmp", suffix=".py") as tmp:
+                tmp.write(code.encode())
+                tmp.flush()
+                command.insert(4, f"--whitelist={tmp.name}")
+                command.extend(["python3", tmp.name])
+                result = subprocess.run(command,
+                                        input=stdin.encode() if stdin else None,
+                                        stdout=subprocess.PIPE,
+                                        stderr=subprocess.PIPE,
+                                        env=env,
+                                        check=False)
+
+    stderr = result.stderr.decode().strip()
+    stdout = result.stdout.decode()
+    if result.returncode == 0:
+        return True, stdout
+    return False, _ERROR_MSG_PREFIX + f"STDOUT:\n{stdout}\n\nSTDERR:\n{stderr}"
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/humanevalplus.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/humanevalplus.py
new file mode 100644
index 000000000..f7bb53ddd
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/humanevalplus.py
@@ -0,0 +1,172 @@
+import ast
+import os
+import subprocess
+import faulthandler
+from tempfile import TemporaryDirectory
+import platform
+
+from .utils import BASE_IMPORTS
+
+CLI_ARG_SIZE_LIMIT = 1024 * 3
+
+_ERROR_MSG_PREFIX = "Failed to execute program: "
+_DEFAULT_TIMEOUT_SECONDS = 60
+
+
+def get_num_test_cases(test_code):
+    # Parse the code into an AST
+    parsed = ast.parse(test_code)
+    
+    # Find the assignment node for 'inputs'
+    inputs_node = None
+    results_node = None
+    
+    for node in ast.walk(parsed):
+        if isinstance(node, ast.Assign):
+            for target in node.targets:
+                if isinstance(target, ast.Name):
+                    if target.id == 'inputs':
+                        inputs_node = node.value
+    
+    if inputs_node is None:
+        return "Could not find inputs or results in the code"
+    
+    # Count number of test cases
+    if isinstance(inputs_node, ast.List):
+        input_count = len(inputs_node.elts)
+    else:
+        input_count = "Unknown (not a direct list)"
+    return input_count
+
+
+def run_test(code, test: str = None, timeout=_DEFAULT_TIMEOUT_SECONDS):
+    env = os.environ.copy()
+    
+    # Create a preexec_fn function to set resource limits
+    def preexec_fn():
+        reliability_guard()
+
+
+    if not test:
+        raise ValueError("No test provided.")
+
+    code_to_run = f"""
+{BASE_IMPORTS}
+
+{code}
+
+{test}
+
+"""
+    # solution is in {tmpdir}/solution.py
+    with TemporaryDirectory() as tmpdir:
+        # Write the solution to a file
+
+        solution_path = os.path.join(tmpdir, "solution.py")
+
+        with open(solution_path, "w") as f:
+            f.write(code_to_run)
+            
+        command = ["python3", solution_path]
+        try:
+            result = subprocess.run(
+                command,
+                cwd=tmpdir,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                env=env,
+                check=False,
+                preexec_fn=preexec_fn,
+                timeout=timeout
+            )
+            
+            stderr = result.stderr.decode().strip()
+            stdout = result.stdout.decode()
+            if result.returncode == 0:
+                return True, stdout
+            return False, _ERROR_MSG_PREFIX + f"STDOUT:\n{stdout}\n\nSTDERR:\n{stderr}"
+            
+        except subprocess.TimeoutExpired:
+            return False, _ERROR_MSG_PREFIX + f"Execution timed out after {timeout} seconds."
+        except Exception as e:
+            return False, _ERROR_MSG_PREFIX + f"An Exception occurred in the code: {str(e)}"
+
+
+def reliability_guard(maximum_memory_bytes=None):
+    """
+    This disables various destructive functions and prevents the generated code
+    from interfering with the test (e.g. fork bomb, killing other processes,
+    removing filesystem files, etc.)
+    WARNING
+    This function is NOT a security sandbox. Untrusted code, including, model-
+    generated code, should not be blindly executed outside of one. See the
+    Codex paper for more information about OpenAI's code sandbox, and proceed
+    with caution.
+    """
+
+    if maximum_memory_bytes is not None:
+        import resource
+
+        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
+        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
+        if not platform.uname().system == "Darwin":
+            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
+
+    faulthandler.disable()
+
+    import builtins
+
+    builtins.exit = None
+    builtins.quit = None
+
+    import os
+
+    os.environ["OMP_NUM_THREADS"] = "1"
+
+    os.kill = None
+    os.system = None
+    os.putenv = None
+    os.remove = None
+    os.removedirs = None
+    os.rmdir = None
+    os.fchdir = None
+    os.setuid = None
+    os.fork = None
+    os.forkpty = None
+    os.killpg = None
+    os.rename = None
+    os.renames = None
+    os.truncate = None
+    os.replace = None
+    os.unlink = None
+    os.fchmod = None
+    os.fchown = None
+    os.chmod = None
+    os.chown = None
+    os.chroot = None
+    os.fchdir = None
+    os.lchflags = None
+    os.lchmod = None
+    os.lchown = None
+    os.getcwd = None
+    os.chdir = None
+
+    import shutil
+
+    shutil.rmtree = None
+    shutil.move = None
+    shutil.chown = None
+
+    import subprocess
+
+    subprocess.Popen = None  # type: ignore
+
+    __builtins__["help"] = None
+
+    import sys
+
+    sys.modules["ipdb"] = None
+    sys.modules["joblib"] = None
+    sys.modules["resource"] = None
+    sys.modules["psutil"] = None
+    sys.modules["tkinter"] = None
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/kodcode.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/kodcode.py
new file mode 100644
index 000000000..5246be492
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/kodcode.py
@@ -0,0 +1,152 @@
+import os
+import signal
+import subprocess
+import resource
+import faulthandler
+from tempfile import TemporaryDirectory
+import platform
+
+from .utils import BASE_IMPORTS
+
+CLI_ARG_SIZE_LIMIT = 1024 * 3
+
+_ERROR_MSG_PREFIX = "Failed to execute program: "
+_DEFAULT_TIMEOUT_SECONDS = 30
+
+
+def code_exec(code, test: str = None, timeout=_DEFAULT_TIMEOUT_SECONDS):
+    env = os.environ.copy()
+    
+    # Create a preexec_fn function to set resource limits
+    def preexec_fn():
+        reliability_guard()
+
+    if 'pytest' not in code:
+
+        code_to_run = f"""
+{BASE_IMPORTS}
+import pytest
+
+{code}
+
+{test}
+
+if __name__ == "__main__":
+    pytest.main([__file__])
+"""
+    else:
+        code_to_run = code
+    
+    # solution is in {tmpdir}/solution.py
+    with TemporaryDirectory() as tmpdir:
+        # Write the solution to a file
+
+        solution_path = os.path.join(tmpdir, "solution.py")
+
+        with open(solution_path, "w") as f:
+            f.write(code_to_run)
+            
+        command = ["pytest", "--maxfail=1", solution_path]
+        
+        try:
+            result = subprocess.run(
+                command,
+                cwd=tmpdir,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                env=env,
+                check=False,
+                preexec_fn=preexec_fn,
+                timeout=timeout
+            )
+            
+            stderr = result.stderr.decode().strip()
+            stdout = result.stdout.decode()
+            if result.returncode == 0:
+                return True, stdout
+            return False, _ERROR_MSG_PREFIX + f"STDOUT:\n{stdout}\n\nSTDERR:\n{stderr}"
+            
+        except subprocess.TimeoutExpired:
+            return False, _ERROR_MSG_PREFIX + f"Execution timed out after {timeout} seconds."
+        except Exception as e:
+            return False, _ERROR_MSG_PREFIX + f"An Exception occurred in the code: {str(e)}"
+
+
+def reliability_guard(maximum_memory_bytes=None):
+    """
+    This disables various destructive functions and prevents the generated code
+    from interfering with the test (e.g. fork bomb, killing other processes,
+    removing filesystem files, etc.)
+    WARNING
+    This function is NOT a security sandbox. Untrusted code, including, model-
+    generated code, should not be blindly executed outside of one. See the
+    Codex paper for more information about OpenAI's code sandbox, and proceed
+    with caution.
+    """
+
+    if maximum_memory_bytes is not None:
+        import resource
+
+        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
+        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
+        if not platform.uname().system == "Darwin":
+            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
+
+    faulthandler.disable()
+
+    import builtins
+
+    builtins.exit = None
+    builtins.quit = None
+
+    import os
+
+    os.environ["OMP_NUM_THREADS"] = "1"
+
+    os.kill = None
+    os.system = None
+    os.putenv = None
+    os.remove = None
+    os.removedirs = None
+    os.rmdir = None
+    os.fchdir = None
+    os.setuid = None
+    os.fork = None
+    os.forkpty = None
+    os.killpg = None
+    os.rename = None
+    os.renames = None
+    os.truncate = None
+    os.replace = None
+    os.unlink = None
+    os.fchmod = None
+    os.fchown = None
+    os.chmod = None
+    os.chown = None
+    os.chroot = None
+    os.fchdir = None
+    os.lchflags = None
+    os.lchmod = None
+    os.lchown = None
+    os.getcwd = None
+    os.chdir = None
+
+    import shutil
+
+    shutil.rmtree = None
+    shutil.move = None
+    shutil.chown = None
+
+    import subprocess
+
+    subprocess.Popen = None  # type: ignore
+
+    __builtins__["help"] = None
+
+    import sys
+
+    sys.modules["ipdb"] = None
+    sys.modules["joblib"] = None
+    sys.modules["resource"] = None
+    sys.modules["psutil"] = None
+    sys.modules["tkinter"] = None
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/livecodebench.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/livecodebench.py
new file mode 100644
index 000000000..a19a79108
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/livecodebench.py
@@ -0,0 +1,562 @@
+# Taken from https://github.com/LiveCodeBench/LiveCodeBench/blob/998c52d394b836f15fff3b9a29866191108ff81b/lcb_runner/evaluation/testing_util.py
+import ast
+import json
+import sys
+import faulthandler
+import platform
+import multiprocessing
+import queue
+
+# used for debugging to time steps
+from datetime import datetime
+
+# to run the solution files we're using a timing based approach
+import signal
+
+import numpy as np
+
+from io import StringIO
+
+# used for testing the code that reads from input
+from unittest.mock import patch, mock_open
+
+# from pyext import RuntimeModule
+from types import ModuleType
+
+from enum import Enum
+from decimal import Decimal
+import time
+
+from .utils import BASE_IMPORTS
+
+import_string = BASE_IMPORTS
+ 
+#"from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(50000)\n"
+
+
+def truncatefn(s, length=300):
+    if isinstance(s, str):
+        pass
+    else:
+        s = str(s)
+    if len(s) <= length:
+        return s
+
+    return s[: length // 2] + "...(truncated) ..." + s[-length // 2 :]
+
+
+class CODE_TYPE(Enum):
+    call_based = 0
+    standard_input = 1
+
+
+# stuff for setting up signal timer
+class TimeoutException(Exception):
+    pass
+
+
+def timeout_handler(signum, frame):
+    print("timeout occured: alarm went off")
+    raise TimeoutException
+
+
+# used to capture stdout as a list
+# from https://stackoverflow.com/a/16571630/6416660
+# alternative use redirect_stdout() from contextlib
+class Capturing(list):
+    def __enter__(self):
+        self._stdout = sys.stdout
+        sys.stdout = self._stringio = StringIO()
+        # Make closing the StringIO a no-op
+        self._stringio.close = lambda x: 1
+        return self
+
+    def __exit__(self, *args):
+        self.append(self._stringio.getvalue())
+        del self._stringio  # free up some memory
+        sys.stdout = self._stdout
+
+
+def clean_if_name(code: str) -> str:
+    try:
+        astree = ast.parse(code)
+        last_block = astree.body[-1]
+        if isinstance(last_block, ast.If):
+            condition = last_block.test
+            if ast.unparse(condition).strip() == "__name__ == '__main__'":
+                code = (
+                    ast.unparse(astree.body[:-1]) + "\n" + ast.unparse(last_block.body)  # type: ignore
+                )
+    except:
+        pass
+
+    return code
+
+
+def make_function(code: str) -> str:
+    try:
+        import_stmts = []
+        all_other_stmts = []
+        astree = ast.parse(code)
+        for stmt in astree.body:
+            if isinstance(stmt, (ast.Import, ast.ImportFrom)):
+                import_stmts.append(stmt)
+            else:
+                all_other_stmts.append(stmt)
+
+        function_ast = ast.FunctionDef(
+            name="wrapped_function",
+            args=ast.arguments(
+                posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]
+            ),
+            body=all_other_stmts,
+            decorator_list=[],
+            lineno=-1,
+        )
+        main_code = (
+            import_string
+            + "\n"
+            + ast.unparse(import_stmts)  # type: ignore
+            + "\n"
+            + ast.unparse(function_ast)  # type: ignore
+        )
+        return main_code
+    except Exception as e:
+        return code
+
+
+def call_method(method, inputs):
+
+    if isinstance(inputs, list):
+        inputs = "\n".join(inputs)
+
+    inputs_line_iterator = iter(inputs.split("\n"))
+
+    # sys.setrecursionlimit(10000)
+
+    # @patch('builtins.input', side_effect=inputs.split("\n"))
+    @patch("builtins.open", mock_open(read_data=inputs))
+    @patch("sys.stdin", StringIO(inputs))
+    @patch("sys.stdin.readline", lambda *args: next(inputs_line_iterator))
+    @patch("sys.stdin.readlines", lambda *args: inputs.split("\n"))
+    @patch("sys.stdin.read", lambda *args: inputs)
+    # @patch('sys.stdout.write', print)
+    def _inner_call_method(_method):
+        try:
+            return _method()
+        except SystemExit as e:
+            pass
+        finally:
+            pass
+
+    return _inner_call_method(method)
+
+
+def get_function(compiled_sol, fn_name: str):  # type: ignore
+    try:
+        assert hasattr(compiled_sol, fn_name)
+        return getattr(compiled_sol, fn_name)
+    except Exception as e:
+        return
+
+
+def compile_code(code: str, timeout: int):
+    signal.alarm(timeout)
+    try:
+        tmp_sol = ModuleType("tmp_sol", "")
+        exec(code, tmp_sol.__dict__)
+        if "class Solution" in code:
+            # leetcode wraps solutions in `Solution`
+            # this is a hack to check if it is leetcode solution or not
+            # currently livecodebench only supports LeetCode but
+            # else condition allows future extensibility to other platforms
+            compiled_sol = tmp_sol.Solution()
+        else:
+            # do nothing in the other case since function is accesible
+            compiled_sol = tmp_sol
+
+        assert compiled_sol is not None
+    finally:
+        signal.alarm(0)
+
+    return compiled_sol
+
+
+def convert_line_to_decimals(line: str) -> tuple[bool, list[Decimal]]:
+    try:
+        decimal_line = [Decimal(elem) for elem in line.split()]
+    except:
+        return False, []
+    return True, decimal_line
+
+
+def get_stripped_lines(val: str):
+    ## you don't want empty lines to add empty list after splitlines!
+    val = val.strip()
+
+    return [val_line.strip() for val_line in val.split("\n")]
+
+
+def grade_call_based(
+    code: str, all_inputs: list, all_outputs: list, fn_name: str, timeout: int
+):
+    # call-based clean up logic
+    # need to wrap in try-catch logic after to catch the correct errors, but for now this is fine.
+    code = import_string + "\n\n" + code
+    compiled_sol = compile_code(code, timeout)
+
+    if compiled_sol is None:
+        return
+
+    method = get_function(compiled_sol, fn_name)
+
+    if method is None:
+        return
+
+    all_inputs = [
+        [json.loads(line) for line in inputs.split("\n")] for inputs in all_inputs
+    ]
+
+    all_outputs = [json.loads(output) for output in all_outputs]
+
+    total_execution = 0
+    all_results = []
+    for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
+        signal.alarm(timeout)
+        faulthandler.enable()
+        try:
+            # can lock here so time is useful
+            start = time.time()
+            prediction = method(*gt_inp)
+            total_execution += time.time() - start
+            signal.alarm(0)
+
+            # don't penalize model if it produces tuples instead of lists
+            # ground truth sequences are not tuples
+            if isinstance(prediction, tuple):
+                prediction = list(prediction)
+
+            tmp_result = prediction == gt_out
+
+            # handle floating point comparisons
+
+            all_results.append(tmp_result)
+
+            if not tmp_result:
+                return all_results, {
+                    "output": truncatefn(prediction),
+                    "inputs": truncatefn(gt_inp),
+                    "expected": truncatefn(gt_out),
+                    "error_code": -2,
+                    "error_message": "Wrong Answer",
+                }
+        except Exception as e:
+            signal.alarm(0)
+            if "timeoutexception" in repr(e).lower():
+                all_results.append(-3)
+                return all_results, {
+                    "error": repr(e),
+                    "error_code": -3,
+                    "error_message": "Time Limit Exceeded",
+                    "inputs": truncatefn(gt_inp),
+                    "expected": truncatefn(gt_out),
+                }
+            else:
+                all_results.append(-4)
+                return all_results, {
+                    "error": repr(e),
+                    "error_code": -4,
+                    "error_message": "Runtime Error",
+                    "inputs": truncatefn(gt_inp),
+                    "expected": truncatefn(gt_out),
+                }
+
+        finally:
+            signal.alarm(0)
+            faulthandler.disable()
+
+    return all_results, {"execution time": total_execution}
+
+
+def grade_stdio(
+    code: str,
+    all_inputs: list,
+    all_outputs: list,
+    timeout: int,
+):
+    ## runtime doesn't interact well with __name__ == '__main__'
+    code = clean_if_name(code)
+
+    ## we wrap the given code inside another function
+    code = make_function(code)
+
+    compiled_sol = compile_code(code, timeout)
+    if compiled_sol is None:
+        return
+
+    method = get_function(compiled_sol, "wrapped_function")
+
+    if method is None:
+        return
+
+    all_results = []
+    total_execution_time = 0
+    for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
+        signal.alarm(timeout)
+        faulthandler.enable()
+
+        signal.alarm(timeout)
+        with Capturing() as captured_output:
+            try:
+                start = time.time()
+                call_method(method, gt_inp)
+                total_execution_time += time.time() - start
+                # reset the alarm
+                signal.alarm(0)
+            except Exception as e:
+                signal.alarm(0)
+                if "timeoutexception" in repr(e).lower():
+                    all_results.append(-3)
+                    return all_results, {
+                        "error": repr(e),
+                        "error_code": -3,
+                        "error_message": "Time Limit Exceeded",
+                        "inputs": truncatefn(gt_inp),
+                        "expected": truncatefn(gt_out),
+                    }
+                else:
+                    all_results.append(-4)
+                    return all_results, {
+                        "error": repr(e),
+                        "error_code": -4,
+                        "error_message": "Runtime Error",
+                        "inputs": truncatefn(gt_inp),
+                        "expected": truncatefn(gt_out),
+                    }
+
+            finally:
+                signal.alarm(0)
+                faulthandler.disable()
+
+        prediction = captured_output[0]
+
+        stripped_prediction_lines = get_stripped_lines(prediction)
+        stripped_gt_out_lines = get_stripped_lines(gt_out)
+
+        ## WA happens in multiple circumstances
+        ## so cache the return to make it clean!
+        WA_send_args = {
+            "output": truncatefn(prediction),
+            "inputs": truncatefn(gt_inp),
+            "expected": truncatefn(gt_out),
+            "error_code": -2,
+        }
+
+        if len(stripped_prediction_lines) != len(stripped_gt_out_lines):
+            all_results.append(-2)
+            WA_send_args["error_message"] = "Wrong answer: mismatched output length"
+            return all_results, WA_send_args
+
+        for output_line_idx, (
+            stripped_prediction_line,
+            stripped_gt_out_line,
+        ) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)):
+            WA_send_args["error_message"] = (
+                f"Wrong answer at {output_line_idx=}: {truncatefn(stripped_prediction_line)} != {truncatefn(stripped_gt_out_line)}"
+            )
+
+            ## CASE 1: exact match
+            if stripped_prediction_line == stripped_gt_out_line:
+                continue
+
+            ## CASE 2: element-wise comparision
+            ## if there are floating elements
+            ## use `decimal` library for good floating point comparision
+            ## otherwise gotcha: np.isclose(50000000000000000, 50000000000000001) = True
+            ## note that we should always be able to convert to decimals
+
+            success, decimal_prediction_line = convert_line_to_decimals(
+                stripped_prediction_line
+            )
+            if not success:
+                all_results.append(-2)
+                return all_results, WA_send_args
+            success, decimal_gtout_line = convert_line_to_decimals(stripped_gt_out_line)
+            if not success:
+                all_results.append(-2)
+                return all_results, WA_send_args
+
+            if decimal_prediction_line == decimal_gtout_line:
+                continue
+
+            all_results.append(-2)
+            return all_results, WA_send_args
+        all_results.append(True)
+
+    return all_results, {"execution time": total_execution_time}
+
+
+def run_test(sample, test=None, debug=False, timeout=6):
+    """
+    if test(generated_code) is not None it'll try to run the code.
+    otherwise it'll just return an input and output pair.
+    """
+    signal.signal(signal.SIGALRM, timeout_handler)
+
+    # Disable functionalities that can make destructive changes to the test.
+    # max memory is set to 4GB
+    reliability_guard()
+
+    if debug:
+        print(f"start = {datetime.now().time()}")
+
+    try:
+        in_outs = json.loads(sample["input_output"])
+    except ValueError as e:
+        raise e
+        in_outs = None
+
+    if in_outs:
+        if in_outs.get("fn_name") is None:
+            which_type = CODE_TYPE.standard_input  # Standard input
+            method_name = None
+        else:
+            which_type = CODE_TYPE.call_based  # Call-based
+            method_name = in_outs["fn_name"]
+
+    if debug:
+        print(f"loaded input_output = {datetime.now().time()}")
+
+    if test is None:
+        assert False, "should not happen: test code is none"
+        return in_outs, {"error": "no test code provided"}
+    elif test is not None:
+        results = []
+        sol = import_string
+        if debug:
+            print(f"loading test code = {datetime.now().time()}")
+
+        if which_type == CODE_TYPE.call_based:
+            signal.alarm(timeout)
+            try:
+                results, metadata = grade_call_based(
+                    code=test,
+                    all_inputs=in_outs["inputs"],
+                    all_outputs=in_outs["outputs"],
+                    fn_name=method_name,
+                    timeout=timeout,
+                )
+                return results, metadata
+            except Exception as e:
+                return [-4], {
+                    "error_code": -4,
+                    "error_message": f"Error during testing: {e}",
+                }
+            finally:
+                signal.alarm(0)
+        elif which_type == CODE_TYPE.standard_input:
+            # sol
+            # if code has if __name__ == "__main__": then remove it
+
+            signal.alarm(timeout)
+            try:
+                results, metadata = grade_stdio(
+                    code=test,
+                    all_inputs=in_outs["inputs"],
+                    all_outputs=in_outs["outputs"],
+                    timeout=timeout,
+                )
+                return results, metadata
+            except Exception as e:
+                return [-4], {
+                    "error_code": -4,
+                    "error_message": f"Error during testing: {e}",
+                }
+            finally:
+                signal.alarm(0)
+
+
+def reliability_guard(maximum_memory_bytes=None):
+    """
+    This disables various destructive functions and prevents the generated code
+    from interfering with the test (e.g. fork bomb, killing other processes,
+    removing filesystem files, etc.)
+    WARNING
+    This function is NOT a security sandbox. Untrusted code, including, model-
+    generated code, should not be blindly executed outside of one. See the
+    Codex paper for more information about OpenAI's code sandbox, and proceed
+    with caution.
+    """
+
+    if maximum_memory_bytes is not None:
+        import resource
+
+        resource.setrlimit(
+            resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
+        )
+        resource.setrlimit(
+            resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
+        )
+        if not platform.uname().system == "Darwin":
+            resource.setrlimit(
+                resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
+            )
+
+    faulthandler.disable()
+
+    import builtins
+
+    # builtins.exit = None
+    builtins.quit = None
+
+    import os
+
+    os.environ["OMP_NUM_THREADS"] = "1"
+
+    os.kill = None
+    os.system = None
+    os.putenv = None
+    os.remove = None
+    os.removedirs = None
+    os.rmdir = None
+    os.fchdir = None
+    os.setuid = None
+    os.fork = None
+    os.forkpty = None
+    os.killpg = None
+    os.rename = None
+    os.renames = None
+    os.truncate = None
+    os.replace = None
+    os.unlink = None
+    os.fchmod = None
+    os.fchown = None
+    os.chmod = None
+    os.chown = None
+    os.chroot = None
+    os.fchdir = None
+    os.lchflags = None
+    os.lchmod = None
+    os.lchown = None
+    os.getcwd = None
+    os.chdir = None
+
+    import shutil
+
+    shutil.rmtree = None
+    shutil.move = None
+    shutil.chown = None
+
+    import subprocess
+
+    subprocess.Popen = None  # type: ignore
+
+    __builtins__["help"] = None
+
+    import sys
+
+    sys.modules["ipdb"] = None
+    sys.modules["joblib"] = None
+    sys.modules["resource"] = None
+    sys.modules["psutil"] = None
+    sys.modules["tkinter"] = None
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/pyext2.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/pyext2.py
new file mode 100644
index 000000000..9cdec39d5
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/pyext2.py
@@ -0,0 +1,482 @@
+'''
+Copyright (C) 2014 Ryan Gonzalez
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
+Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+'''
+
+g_backup = globals().copy()
+
+__version__ = '0.7'
+
+__all__ = ['overload', 'RuntimeModule', 'switch', 'tail_recurse', 'copyfunc', 'set_docstring', 'annotate', 'safe_unpack', 'modify_function', 'assign', 'fannotate', 'compare_and_swap', 'is_main', 'call_if_main', 'run_main']
+
+import sys, inspect, types
+
+def __targspec(func, specs, attr='__orig_arg__'):
+    if hasattr(func, '__is_overload__') and func.__is_overload__:
+        return getattr(func, attr)
+    return specs(func)
+
+def set_docstring(doc):
+    '''A simple decorator to set docstrings.
+
+       :param doc: The docstring to tie to the function.
+
+       Example::
+
+          @set_docstring('This is a docstring')
+          def myfunc(x):
+              pass'''
+    def _wrap(f):
+        f.__doc__ = doc
+        return f
+    return _wrap
+
+__modify_function_doc = '''
+Creates a copy of a function, changing its attributes.
+
+:param globals: Will be added to the function's globals.
+
+:param name: The new function name. Set to ``None`` to use the function's original name.
+
+:param code: The new function code object. Set to ``None`` to use the function's original code object.
+
+:param defaults: The new function defaults. Set to ``None`` to use the function's original defaults.
+
+:param closure: The new function closure. Set to ``None`` to use the function's original closure.
+
+.. warning:: This function can be potentially dangerous.
+'''
+
+def copyfunc(f):
+   '''Copies a funcion.
+
+      :param f: The function to copy.
+
+      :return: The copied function.
+
+      .. deprecated:: 0.4
+         Use :func:`modify_function` instead.
+      '''
+   return modify_function(f)
+
+if sys.version_info.major == 3:
+    @set_docstring(__modify_function_doc)
+    def modify_function(f, globals={}, name=None, code=None, defaults=None,
+                        closure=None):
+        if code is None: code = f.__code__
+        if name is None: name = f.__name__
+        if defaults is None: defaults = f.__defaults__
+        if closure is None: closure = f.__closure__
+        newf = types.FunctionType(code, dict(f.__globals__, **globals), name=name,
+                                  argdefs=defaults, closure=closure)
+        newf.__dict__.update(f.__dict__)
+        return newf
+    def argspec(f):
+        return inspect.getfullargspec(f)
+    ofullargspec = inspect.getfullargspec
+    def _fullargspec(func):
+        return __targspec(func, ofullargspec)
+    inspect.getfullargspec = _fullargspec
+    def _exec(m,g): exec(m,g)
+else:
+    @set_docstring(__modify_function_doc)
+    def modify_function(f, globals={}, name=None, code=None, defaults=None,
+                        closure=None):
+        if code is None: code = f.func_code
+        if name is None: name = f.__name__
+        if defaults is None: defaults = f.func_defaults
+        if closure is None: closure = f.func_closure
+        newf = types.FunctionType(code, dict(f.func_globals, **globals), name=name,
+                                  argdefs=defaults, closure=closure)
+        newf.__dict__.update(f.__dict__)
+        return newf
+    def argspec(f):
+        return inspect.getargspec(f)
+    eval(compile('def _exec(m,g): exec m in g', '<exec>', 'exec'))
+
+def _gettypes(args):
+    return tuple(map(type, args))
+
+oargspec = inspect.getargspec
+
+def _argspec(func):
+    return __targspec(func, oargspec)
+
+inspect.getargspec = _argspec
+
+try:
+    import IPython
+except ImportError:
+    IPython = None
+else:
+    # Replace IPython's argspec
+    oipyargspec = IPython.core.oinspect.getargspec
+    def _ipyargspec(func):
+        return __targspec(func, oipyargspec, '__orig_arg_ipy__')
+    IPython.core.oinspect.getargspec = _ipyargspec
+
+class overload(object):
+    '''Simple function overloading in Python.'''
+    _items = {}
+    _types = {}
+    @classmethod
+    def argc(self, argc=None):
+        '''Overloads a function based on the specified argument count.
+
+           :param argc: The argument count. Defaults to ``None``. If ``None`` is given, automatically compute the argument count from the given function.
+
+           .. note::
+
+              Keyword argument counts are NOT checked! In addition, when the argument count is automatically calculated, the keyword argument count is also ignored!
+
+           Example::
+
+               @overload.argc()
+               def func(a):
+                   print 'Function 1 called'
+
+               @overload.argc()
+               def func(a, b):
+                   print 'Function 2 called'
+
+               func(1) # Calls first function
+               func(1, 2) # Calls second function
+               func() # Raises error
+               '''
+        # Python 2 UnboundLocalError fix
+        argc = {'argc': argc}
+        def _wrap(f):
+            def _newf(*args, **kwargs):
+                if len(args) not in self._items[f.__name__]:
+                    raise TypeError("No overload of function '%s' that takes %d args" % (f.__name__, len(args)))
+                return self._items[f.__name__][len(args)](*args, **kwargs)
+            if f.__name__ not in self._items:
+                self._items[f.__name__] = {}
+            if argc['argc'] is None:
+                argc['argc'] = len(argspec(f).args)
+            self._items[f.__name__][argc['argc']] = f
+            _newf.__name__ = f.__name__
+            _newf.__doc__ = f.__doc__
+            _newf.__is_overload__ = True
+            _newf.__orig_arg__ = argspec(f)
+            if IPython:
+                _newf.__orig_arg_ipy__ = IPython.core.oinspect.getargspec(f)
+            return _newf
+        return _wrap
+    @classmethod
+    def args(self, *argtypes, **kw):
+        '''Overload a function based on the specified argument types.
+
+           :param argtypes: The argument types. If None is given, get the argument types from the function annotations(Python 3 only)
+           :param kw: Can only contain 1 argument, `is_cls`. If True, the function is assumed to be part of a class.
+
+           Example::
+
+               @overload.args(str)
+               def func(s):
+                   print 'Got string'
+
+               @overload.args(int, str)
+               def func(i, s):
+                   print 'Got int and string'
+
+               @overload.args()
+               def func(i:int): # A function annotation example
+                   print 'Got int'
+
+               func('s')
+               func(1)
+               func(1, 's')
+               func(True) # Raises error
+            '''
+
+        # Python 2 UnboundLocalError fix...again!
+        argtypes = {'args': tuple(argtypes)}
+        def _wrap(f):
+            def _newf(*args):
+                if len(kw) == 0:
+                    cargs = args
+                elif len(kw) == 1 and 'is_cls' in kw and kw['is_cls']:
+                    cargs = args[1:]
+                else:
+                    raise ValueError('Invalid keyword args specified')
+                if _gettypes(cargs) not in self._types[f.__name__]:
+                    raise TypeError("No overload of function '%s' that takes '%s' types and %d arg(s)" % (f.__name__, _gettypes(cargs), len(cargs)))
+                return self._types[f.__name__][_gettypes(cargs)](*args)
+            if f.__name__ not in self._types:
+                self._types[f.__name__] = {}
+            if len(argtypes['args']) == 1 and argtypes['args'][0] is None:
+                aspec = argspec(f)
+                argtypes['args'] = tuple(map(lambda x: x[1], sorted(
+                    aspec.annotations.items(), key=lambda x: aspec.args.index(x[0]))))
+            self._types[f.__name__][argtypes['args']] = f
+            _newf.__name__ = f.__name__
+            _newf.__doc__ = f.__doc__
+            _newf.__is_overload__ = True
+            _newf.__orig_arg__ = argspec(f)
+            if IPython:
+                _newf.__orig_arg_ipy__ = IPython.core.oinspect.getargspec(f)
+            return _newf
+        return _wrap
+
+class _RuntimeModule(object):
+    'Create a module object at runtime and insert it into sys.path. If called, same as :py:func:`from_objects`.'
+    def __call__(self, *args, **kwargs):
+        return self.from_objects(*args, **kwargs)
+    @staticmethod
+    @overload.argc(1)
+    def from_objects(module_name_for_code_eval, **d):
+        return _RuntimeModule.from_objects(module_name_for_code_eval, '', **d)
+    @staticmethod
+    @overload.argc(2)
+    def from_objects(module_name_for_code_eval, docstring, **d):
+        '''Create a module at runtime from `d`.
+
+           :param name: The module name.
+
+           :param docstring: Optional. The module's docstring.
+
+           :param \*\*d: All the keyword args, mapped from name->value.
+
+           Example: ``RuntimeModule.from_objects('name', 'doc', a=1, b=2)``'''
+        module = types.ModuleType(module_name_for_code_eval, docstring)
+        module.__dict__.update(d)
+        module.__file__ = '<runtime_module>'
+        sys.modules[module_name_for_code_eval] = module
+        return module
+    @staticmethod
+    @overload.argc(2)
+    def from_string(module_name_for_code_eval, s):
+        return _RuntimeModule.from_string(module_name_for_code_eval, '', s)
+    @staticmethod
+    @overload.argc(3)
+    def from_string(module_name_for_code_eval, docstring, s):
+        '''Create a module at runtime from `s``.
+
+           :param name: The module name.
+
+           :param docstring: Optional. The module docstring.
+
+           :param s: A string containing the module definition.'''
+        g = {}
+        _exec(s, g)
+        return _RuntimeModule.from_objects(module_name_for_code_eval, docstring, **dict(filter(lambda x: x[0] not in g_backup, g.items())))
+
+RuntimeModule = _RuntimeModule()
+
+class CaseObject(object):
+    'The object returned by a switch statement. When called, it will return True if the given argument equals its value, else False. It can be called with multiple parameters, in which case it checks if its value equals any of the arguments.'
+    def __init__(self, value):
+        self.value = value
+        self.did_match = False
+        self.did_pass = False
+    def __call__(self, *args):
+        if assign('res', not self.did_pass and any([self.value == rhs for rhs in args])):
+            self.did_match = True
+        return res
+    def quit(self):
+        'Forces all other calls to return False. Equilavent of a ``break`` statement.'
+        self.did_pass = True
+    def default(self):
+        "Executed if quit wasn't called."
+        return not self.did_match and not self.did_pass
+    def __iter__(self):
+        yield self
+    def __enter__(self):
+        return self
+    def __exit__(self, *args):
+        pass
+
+def switch(value):
+    '''A Python switch statement implementation that is used with a ``with`` statement.
+
+       :param value: The value to "switch".
+
+       ``with`` statement example::
+
+           with switch('x'):
+               if case(1): print 'Huh?'
+               if case('x'): print 'It works!!!'
+
+       .. warning:: If you modify a variable named "case" in the same scope that you use the ``with`` statement version, you will get an UnboundLocalError. The soluction is to use ``with switch('x') as case:`` instead of ``with switch('x'):``.'''
+    res = CaseObject(value)
+    inspect.stack()[1][0].f_globals['case'] = res
+    return res
+
+def tail_recurse(spec=None):
+    '''Remove tail recursion from a function.
+
+       :param spec: A function that, when given the arguments, returns a bool indicating whether or not to exit. If ``None,`` tail recursion is always called unless the function returns a value.
+
+       .. note::
+
+           This function has a slight overhead that is noticable when using timeit. Only use it if the function has a possibility of going over the recursion limit.
+
+       .. warning::
+
+           This function will BREAK any code that either uses any recursion other than tail recursion or calls itself multiple times. For example, ``def x(): return x()+1`` will fail.
+
+       Example::
+
+           @tail_recurse()
+           def add(a, b):
+               if a == 0: return b
+               return add(a-1, b+1)
+
+           add(10000000, 1) # Doesn't max the recursion limit.
+           '''
+    def _wrap(f):
+        class TailRecursion(Exception):
+            def __init__(self, args, kwargs):
+                self.args = args
+                self.kwargs = kwargs
+        def _newf(*args, **kwargs):
+            if inspect.stack()[1][3] == f.__name__:
+                if (spec and spec(args)) or not spec:
+                    raise TailRecursion(args, kwargs)
+            while True:
+                try:
+                    res = f(*args, **kwargs)
+                except TailRecursion as ex:
+                    args = ex.args
+                    kwargs = ex.kwargs
+                    continue
+                else:
+                    return res
+        _newf.__doc__ = f.__doc__
+        return _newf
+    return _wrap
+
+def annotate(*args, **kwargs):
+    '''Set function annotations using decorators.
+
+       :param args: This is a list of annotations for the function, in the order of the function's parameters. For example, ``annotate('Annotation 1', 'Annotation 2')`` will set the annotations of parameter 1 of the function to ``Annotation 1``.
+
+       :param kwargs: This is a mapping of argument names to annotations. Note that these are applied *after* the argument list, so any args set that way will be overriden by this mapping. If there is a key named `ret`, that will be the annotation for the function's return value.
+
+       .. deprecated:: 0.5
+         Use :func:`fannotate` instead.
+'''
+    def _wrap(f):
+        if not hasattr(f, '__annotations__'):
+            f.__annotations__ = {}
+        if 'ret' in kwargs:
+            f.__annotations__['return'] = kwargs.pop('ret')
+        f.__annotations__.update(dict(zip(argspec(f).args, args)))
+        f.__annotations__.update(kwargs)
+        return f
+    return _wrap
+
+def fannotate(*args, **kwargs):
+    '''Set function annotations using decorators.
+
+       :param \*args: The first positional argument is used for the function's return value; all others are discarded.
+
+       :param \**kwargs: This is a mapping of argument names to annotations.
+
+       Example::
+
+           @fannotate('This for the return value', a='Parameter a', b='Parameter b')
+           def x(a, b):
+               pass
+
+       '''
+    def _wrap(f):
+        if not hasattr(f, '__annotations__'):
+            f.__annotations__ = {}
+        if len(args) >= 1:
+            f.__annotations__['return'] = args[0]
+        f.__annotations__.update(kwargs)
+        return f
+    return _wrap
+
+def safe_unpack(seq, ln, fill=None):
+    '''Safely unpack a sequence to length `ln`, without raising ValueError. Based on Lua's method of unpacking. Empty values will be filled in with `fill`, while any extra values will be cut off.
+
+       :param seq: The sequence to unpack.
+
+       :param ln: The expected length of the sequence.
+
+       :param fill: The value to substitute if the sequence is too small. Defaults to ``None``.
+
+       Example::
+
+           s = 'a:b'
+           a, b = safe_unpack(s.split(':'), 2)
+           # a = 'a'
+           # b = 'b'
+           s = 'a'
+           a, b = safe_unpack(s.split(':'), 2)
+           # a = 'a'
+           # b = None'''
+    if len(seq) > ln:
+        return seq[:ln]
+    elif len(seq) < ln:
+        return seq + type(seq)([fill]*(ln-len(seq)))
+    else:
+        return seq
+
+def assign(varname, value):
+    '''Assign `value` to `varname` and return it. If `varname` is an attribute and the instance name it belongs to is not defined, a NameError is raised.
+       This can be used to emulate assignment as an expression. For example, this::
+
+          if assign('x', 7): ...
+
+       is equilavent to this C code::
+
+          if (x = 7) ...
+
+       .. warning::
+
+          When assigning an attribute, the instance it belongs to MUST be declared as global prior to the assignment. Otherwise, the assignment will not work.
+    '''
+    fd = inspect.stack()[1][0].f_globals
+    if '.' not in varname:
+        fd[varname] = value
+    else:
+        vsplit = list(map(str.strip, varname.split('.')))
+        if vsplit[0] not in fd:
+            raise NameError('Unknown object: %s'%vsplit[0])
+        base = fd[vsplit[0]]
+        for x in vsplit[1:-1]:
+            base = getattr(base, x)
+        setattr(base, vsplit[-1], value)
+    return value
+
+def is_main(frame=1):
+    "Return if the caller is main. Equilavent to ``__name__ == '__main__'``."
+    return inspect.stack()[frame][0].f_globals['__name__'] == '__main__'
+
+def _call_if_main(frame, f, args):
+    if is_main(frame): return f(*args)
+
+def call_if_main(f,*args):
+    "Call the `f` with `args` if the caller's module is main."
+    return _call_if_main(3,f,args)
+
+def run_main(f,*args):
+    "Call `f` with the `args` and terminate the program with its return code if the caller's module is main."
+    sys.exit(_call_if_main(3,f,args))
+
+def compare_and_swap(var, compare, new):
+    "If `var` is equal to `compare`, set it to `new`."
+    if assign('v', inspect.stack()[1][0].f_globals)[var] == compare:
+        v[var] = new
+
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/taco.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/taco.py
new file mode 100644
index 000000000..4e0b6a848
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/taco.py
@@ -0,0 +1,712 @@
+# modifed from https://github.com/hendrycks/apps/blob/main/eval/testing_util.py to fix some evaluation bugs and add instructions
+# https://github.com/NovaSky-AI/SkyThought/blob/main/skythought/skythought_evals/tasks/taco/taco_util.py
+from .pyext2 import RuntimeModule
+import signal
+import numpy as np
+import platform
+import time
+
+# used for debugging to time steps
+from datetime import datetime
+
+import os, sys, json
+import faulthandler
+import psutil
+import subprocess
+import tempfile
+import inspect
+from enum import Enum
+from unittest.mock import patch, mock_open
+from io import StringIO
+import ast 
+import platform
+
+from .utils import BASE_IMPORTS
+
+import pdb
+class ForkedPDB(pdb.Pdb):
+    """A Pdb subclass that may be used from a forked multiprocessing child."""
+
+    def interaction(self, *args, **kwargs):
+        _stdin = sys.stdin
+        try:
+            sys.stdin = open('/dev/stdin')
+            pdb.Pdb.interaction(self, *args, **kwargs)
+        finally:
+            sys.stdin = _stdin
+
+
+class CODE_TYPE(Enum):
+    call_based = 0
+    standard_input = 1
+
+class Capturing(list):
+    def __enter__(self):
+        self._stdout = sys.stdout
+        sys.stdout = self._stringio = StringIO()
+        # Make closing the StringIO a no-op
+        self._stringio.close = lambda x: 1
+        return self
+    def __exit__(self, *args):
+        self.extend(self._stringio.getvalue().splitlines())
+        del self._stringio    # free up some memory
+        sys.stdout = self._stdout
+
+# to run the solution files we're using a timing based approach
+import signal
+# stuff for setting up signal timer
+class TimeoutException(Exception):
+    pass
+def timeout_handler(signum, frame):
+    print(f"alarm went off")
+    # return
+    raise TimeoutException
+signal.signal(signal.SIGALRM, timeout_handler)
+TIMEOUT = 90  # seconds
+
+EXECUTION_RESULTS = {1: "passed", 0: "false", -1: "timeout", -2: "runtime_error", -3: "returncode:{code}", -4: "compile_error"}
+
+def run_test(in_outs, test=None, debug=False, timeout=TIMEOUT):
+    """
+    if test(generated_code) is not None it'll try to run the code.
+    otherwise it'll just return an input and output pair.
+    """
+    test = f"{BASE_IMPORTS}\n{test}"
+    if isinstance(in_outs, str):
+        try:
+            in_outs =  ast.literal_eval(in_outs)
+            assert isinstance(in_outs, dict)
+        except (ValueError, SyntaxError) as e:
+            print(f"run_tests app/taco, Error parsing string: {e}")
+            return []
+    if in_outs:
+        if in_outs.get("fn_name") is None:
+            which_type = CODE_TYPE.standard_input  # Standard input
+            method_name = None
+        else:
+            which_type = CODE_TYPE.call_based  # Call-based
+            method_name = in_outs["fn_name"]
+    inputs_list = []
+    outputs_list = []
+    for index, inputs in enumerate(in_outs["inputs"]):
+        outputs = in_outs["outputs"][index]
+        inputs, outputs = process_input_output(inputs, outputs)
+        inputs_list.append(inputs)
+        outputs_list.append(outputs)
+
+    if debug:
+        print(f"loaded input_output = {datetime.now().time()}")
+    if test is None:
+        return None
+    elif test is not None:
+        results = []
+        if debug:
+            print(f"loading test code = {datetime.now().time()}")
+        if which_type == CODE_TYPE.call_based:
+            synthesized_code = synthesize_cb_code(test, debug)
+            method_func = compile_and_get_func(synthesized_code, which_type, method_name, timeout=timeout, debug=debug)
+        elif which_type == CODE_TYPE.standard_input:
+            synthesized_code, exec_code = synthesize_std_code(test, debug)
+            method_func = compile_and_get_func(synthesized_code, which_type, method_name, timeout=timeout, debug=debug)
+        if not method_func:
+            results.append(-2)
+            return results
+        else:
+            if which_type == CODE_TYPE.call_based:  # Call-based
+                detail_results, debug_infos = execute_cb_code(method_func, inputs_list, outputs_list, timeout=timeout, early_stop=True, debug=debug)
+            elif which_type == CODE_TYPE.standard_input:
+                detail_results = execute_std_code(method_func, exec_code, inputs_list, outputs_list, timeout=timeout, early_stop=True, debug=debug)
+                debug_infos = detail_results.get('debug', None)
+                detail_results = {k:v for k, v in detail_results.items() if k!='debug'}
+                if set(detail_results.values()) == {(False, 'returncode:1')}:
+                    synthesized_code, exec_code = synthesize_std_code(test, debug)
+                    detail_results = execute_std_code(method_func, synthesized_code+'\ncode()\n', inputs_list, outputs_list, timeout=timeout, early_stop=True, debug=debug)
+        if isinstance(detail_results, list):
+            if len(detail_results) == 1:
+                detail_results = detail_results * len(inputs_list)
+            detail_results = dict(zip([i for i in range(len(inputs_list))], detail_results))
+        for test_id, test_result in detail_results.items():
+            if test_result[1] == "passed":
+                results.append(True)
+            elif test_result[1] == "false":
+                results.append(False)
+            elif test_result[1] == "timeout":
+                results.append(-1)
+            else:
+                results.append(-3)
+        return results
+
+def process_input_output(inputs, outputs):
+    # JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
+    try:
+        if isinstance(inputs[0], dict):
+            inputs = [{int(k): v for k,v in inputs[0].items()}]
+    except:
+        True
+    
+    try:
+        if isinstance(outputs, dict):
+            outputs = [{int(k): v for k,v in outputs.items()}]
+    except:
+        True
+
+    try:
+        if isinstance(outputs[0], dict):
+            outputs = [{int(k): v for k,v in outputs[0].items()}]
+    except:
+        True
+    
+    return inputs, outputs
+
+def compile_and_get_func(program, which_type, method_name, timeout, debug):
+    try:
+        signal.alarm(timeout)
+        tmp_sol = RuntimeModule.from_string("tmp_sol", "", program)
+        if which_type == CODE_TYPE.call_based and "class Solution" in program:
+            tmp = tmp_sol.Solution()
+        else:
+            tmp = tmp_sol
+        signal.alarm(0)
+    except Exception as e:
+        signal.alarm(0)
+        if debug:
+            print(f"compilation error = {e}")
+        return False
+    
+    if which_type == CODE_TYPE.call_based:
+        assert isinstance(method_name, str)
+    else:
+        method_name = "code"
+    
+    try:
+        signal.alarm(timeout)
+        method = getattr(tmp, method_name)  # get_attr second arg must be str
+        signal.alarm(0)
+    except:
+        signal.alarm(0)
+        e = sys.exc_info()
+        if debug:
+            print(f"unable to get function error = {e}")
+        return False
+    return method
+
+def synthesize_cb_code(raw_code, debug=False):
+    sol = "import sys\nimport time\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"
+    if debug:
+        print(f"loading test code = {datetime.now().time()}")
+    sol += raw_code
+    return sol
+
+def synthesize_std_code(raw_code, debug=False):
+    normal_import_lines = "import sys\nimport time\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"
+    if debug:
+        print(f"loading test code = {datetime.now().time()}")
+    
+    sol = "" # code for compile
+    sol2 = "" # code for execute
+
+    tmp_test = raw_code.split("\n")
+    # define the code line type, 1 for import lines, 2 for import * lines with indent, 0 for normal codes
+    code_types = [] 
+
+
+    for x in tmp_test:
+        if 'import *' in x:
+            code_types.append(2)
+        elif x.startswith("from ") or x.startswith("import "):
+            code_types.append(1) 
+        else:
+            code_types.append(0)
+    
+    started = False
+
+    special_import_lines = [i.lstrip('\t') for idx, i in enumerate(tmp_test) if code_types[idx]==2]
+    special_import_lines = '\n'.join(special_import_lines)
+
+    for idx, i in enumerate(tmp_test):
+        code_type = code_types[idx]
+        if code_type == 0 and not started:
+            sol2 += normal_import_lines
+            sol2 += "\nstdin = sys.stdin\nstdout = sys.stdout\n"
+            sol2 += f"{i}\n"
+
+            sol += normal_import_lines
+            sol += special_import_lines
+            sol += "\nstdin = sys.stdin\nstdout = sys.stdout\n"
+            sol += "def code():\n"
+            sol += f"\t{i}\n"
+            started = True
+        else:
+            sol2 += f"{i}\n"
+            if code_type < 2:
+                if started:
+                    sol += '\t'
+                sol += f"{i}\n"
+    
+    if debug:
+        print(f"sol = {sol}")
+        print(f"sol2 = {sol2}")
+    
+    return sol, sol2
+
+def call_method(method, inputs):
+
+    if isinstance(inputs, list):
+        inputs = "\n".join(inputs)
+
+    inputs_line_iterator = iter(inputs.split("\n"))
+
+    # sys.setrecursionlimit(10000)
+
+    # @patch('builtins.input', side_effect=inputs.split("\n"))
+    @patch('builtins.open', mock_open(read_data=inputs))
+    @patch('sys.stdin', StringIO(inputs))
+    @patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
+    @patch('sys.stdin.readlines', lambda *args: inputs.split("\n"))
+    @patch('sys.stdin.read', lambda *args: inputs)
+    # @patch('sys.stdout.write', print)
+    def _inner_call_method(_method):
+        try:
+            return _method()
+        except SystemExit as e:
+            pass
+        finally:
+            pass
+    return _inner_call_method(method) 
+
+def execute_cb_code(method, inputs_list, outputs_list, timeout, early_stop=True, debug=True):
+    # Disable functionalities that can make destructive changes to the test.
+    reliability_guard()
+    results = []
+    debug_infos = {}
+    for index, inputs in enumerate(inputs_list):
+        if debug:
+            debug_infos[index] = {}
+        outputs = outputs_list[index]
+        try:
+            signal.alarm(timeout) 
+            faulthandler.enable()
+            exec_outputs = method(*inputs)
+            signal.alarm(0)
+            faulthandler.disable()
+        except Exception as e:
+            signal.alarm(0)
+            faulthandler.disable()
+            if debug:
+                print(f"Standard input runtime error = {e}")
+            if early_stop:
+                for i in range(index, len(inputs_list)):
+                    results.append((False, EXECUTION_RESULTS[-2]))
+                break
+            else:
+                continue
+        try:
+            # ground truth sequences are not tuples
+            if isinstance(exec_outputs, tuple):
+                exec_outputs = list(exec_outputs)
+            
+            tmp_result = exec_outputs == outputs
+            if isinstance(outputs, list) and outputs:
+                tmp_result = tmp_result or (exec_outputs == outputs[0])
+
+            # ground truth sequences are not tuples
+            try:
+                if isinstance(exec_outputs[0], tuple):
+                    exec_outputs = [list(x) for x in exec_outputs]
+                    tmp_result = tmp_result or (exec_outputs == outputs[0])
+            except:
+                True
+            if tmp_result:
+                results.append((True, EXECUTION_RESULTS[1]))
+            else:
+                results.append((False, EXECUTION_RESULTS[0]))
+        except Exception as e:
+            if debug:
+                print(f"Standard input time limit exceeded error = {e}")
+            results.append((False, EXECUTION_RESULTS[-1]))
+            continue
+
+        if debug:
+            print(f"outputs = {exec_outputs}, test outputs = {outputs}, inputs = {inputs}, {type(inputs)}, {tmp_result}")
+            debug_infos[index] = {
+                    'inputs': inputs,
+                    'gt_outputs': outputs,
+                    'exec_outputs': exec_outputs
+                }
+    return results, debug_infos
+
+def remove_tmp_files():
+    tmp_files = ['input.txt', 'output.txt']
+    for tmp_file in tmp_files:
+        if tmp_file in os.listdir('.'):
+            os.remove(tmp_file)
+
+def clean_stdout(stdout):
+    return stdout.rstrip('\n')
+
+def execute_std_code(method, synthesized_code, inputs_list, outputs_list, timeout, early_stop=False, debug=False):
+    temp_program_path = create_temp_file(synthesized_code)
+    if debug:
+        print("Test program:", temp_program_path)
+    assert isinstance(inputs_list, list) and isinstance(outputs_list, list)
+    assert len(inputs_list) == len(outputs_list)
+    exec_results = {}
+    if debug:
+        exec_results['debug'] = {}
+    for i, inputs in enumerate(inputs_list):
+        remove_tmp_files()
+        outputs = outputs_list[i]
+        if isinstance(inputs, list):
+            inputs = [str(k) for k in inputs]
+            inputs = "\n".join(inputs)
+        if isinstance(outputs, list):
+            outputs = [str(k) for k in outputs]
+            outputs = "\n".join(outputs)
+        with tempfile.NamedTemporaryFile(mode='w+') as temp_input:
+            temp_input.write(inputs)
+            temp_input.flush()
+            temp_input.seek(0)
+            temp_file_name = temp_input.name
+            stdout, stderr = "", ""
+            try:
+                result = subprocess.run(['bash', '-c', 'ulimit -v 10485760; python3 ' + temp_program_path], 
+                                        stdin=temp_input,
+                                        stdout=subprocess.PIPE,
+                                        stderr=subprocess.PIPE,
+                                        preexec_fn=os.setsid,
+                                        universal_newlines=True,
+                                        timeout=timeout,
+                                        text=True)
+                
+                stdout, stderr = result.stdout, result.stderr
+                return_code = result.returncode
+                # result = subprocess.run(['python3', temp_program_path], input=inputs, text=True, capture_output=True, timeout=timeout)
+                exec_code = 999
+            except subprocess.TimeoutExpired as e:
+                print(e, temp_file_name)
+                stderr = "TIMEOUT"
+                return_code = -9
+                exec_code = -1
+            except Exception as e:
+                print(e, temp_file_name)
+                return_code = -99
+                stderr = f"{e}"
+                exec_code = -2
+
+        stdout = clean_stdout(stdout)
+
+        if exec_code > 0:
+            if compare_std_results(stdout, outputs, debug):
+                exec_code = 1
+            else:
+                exec_code = 0
+                # if return_code != 0:
+                #     try:
+                #         inputs_tmp_file = open(create_temp_file(inputs), 'r')
+                #         result = subprocess.run(['python', temp_program_path], stdin=inputs_tmp_file, text=True, capture_output=True, timeout=timeout)
+                #         assert result.returncode == 0
+                #         stdout = result.stdout
+                #         if compare_std_results(stdout, outputs, debug):
+                #             exec_code = 1
+                #         else:
+                #             exec_code = 0
+                #     except:
+                #         try:
+                #             inputs_tmp_file = 'input.txt'
+                #             with open(inputs_tmp_file, 'w') as ftemp:
+                #                 ftemp.write(inputs)
+                #             result = subprocess.run(['python', temp_program_path], text=True, timeout=timeout)
+                #             assert result.returncode == 0
+                #             stdout = open('output.txt').read()
+                #             if compare_std_results(stdout, outputs, debug):
+                #                 exec_code = 1
+                #             else:
+                #                 exec_code = 0
+                            
+                #         except:
+                #             exec_code = -3
+        assert exec_code != -3
+        exec_results[i] = (exec_code==1, EXECUTION_RESULTS[exec_code] if exec_code>-3 else EXECUTION_RESULTS[exec_code].format(return_code))
+        if exec_code >= 0:
+            if debug:
+                print_debug_info(inputs=inputs, outputs=outputs, exec_outputs=stdout)
+                print("Stderr:", stderr)
+                exec_results['debug'][i] = {
+                    'inputs': inputs,
+                    'gt_outputs': outputs,
+                    'exec_outputs': stdout,
+                    'stderr': stderr
+                }
+        if early_stop and exec_code<=0:
+            break
+    return exec_results
+
+def print_debug_info(inputs, outputs, exec_outputs):
+    nl = "\n"
+    if not isinstance(inputs, list):
+        print(f"exec output = {exec_outputs}, test outputs = {outputs}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {exec_outputs == [outputs]}")
+    else:
+        print(f"exec output = {exec_outputs}, test outputs = {outputs}, inputs = {inputs}, {type(inputs)}, {exec_outputs == [outputs]}")
+
+def create_temp_file(content):
+    with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_file:
+        temp_file.write(content)
+        temp_file_path = temp_file.name
+    return temp_file_path
+
+def compare_std_results(exec_outputs, outputs, debug=False):
+    if stripped_string_compare(exec_outputs, outputs):
+        return True
+    
+    if isinstance(exec_outputs, list):
+        output_1 = "\n".join(exec_outputs)
+        if stripped_string_compare(output_1, outputs):
+            return True
+    
+    if isinstance(exec_outputs, list):
+        output_2 = [o.lstrip().rstrip() for o in exec_outputs]
+        output_2 = "\n".join(output_2)
+        if stripped_string_compare(output_2, outputs):
+            return True
+    
+    tmp_result = False
+    # ground truth sequences are expressed as lists not tuples
+    if isinstance(outputs, tuple):
+        outputs = list(outputs)
+    
+    try:
+        tmp_result = (exec_outputs == [outputs])
+        if isinstance(outputs, list):
+            tmp_result = tmp_result or (exec_outputs == outputs)
+            if isinstance(exec_outputs[0], str):
+                tmp_result = tmp_result or ([e.strip() for e in exec_outputs] == outputs)
+    except Exception as e:
+        if debug:
+            print(f"Failed check1 exception = {e}")
+        pass
+    if tmp_result:
+        return True
+    
+    # try one more time without \n
+    if isinstance(outputs, list):
+        for tmp_index, i in enumerate(outputs):
+            outputs[tmp_index] = i.split("\n")
+            outputs[tmp_index] = [x.strip() for x in outputs[tmp_index] if x]
+    else:
+        outputs = outputs.split("\n")
+        outputs = list(filter(len, outputs))
+        outputs = list(map(lambda x:x.strip(), outputs))
+    
+    try:
+        tmp_result = (exec_outputs == [outputs])
+        if isinstance(outputs, list):
+            tmp_result = tmp_result or (exec_outputs == outputs)
+    except Exception as e:
+        if debug:
+            print(f"Failed check2 exception = {e}")
+        pass
+    if tmp_result:
+        return True
+    
+    # try by converting the output into a split up list too
+    if isinstance(exec_outputs, list):
+        exec_outputs = list(filter(len, exec_outputs))
+    try:
+        tmp_result = (exec_outputs == [outputs])
+        if isinstance(outputs, list):
+            tmp_result = tmp_result or (exec_outputs == outputs)
+    except Exception as e:
+        if debug:
+            print(f"Failed check3 exception = {e}")
+        pass
+    if tmp_result:
+        return True
+    
+
+    try:
+        output_float = [float(e) for e in exec_outputs]
+        gt_float = [float(e) for e in outputs]
+        tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float))
+    except Exception as e:
+        pass
+    try:
+        if isinstance(exec_outputs[0], list):
+            output_float = [float(e) for e in exec_outputs[0]]
+            gt_float = [float(e) for e in outputs[0]]
+            tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float))
+    except Exception as e:
+        pass
+    if tmp_result:
+        return True
+
+    
+    if isinstance(outputs, list):
+        for tmp_index, i in enumerate(outputs):
+            outputs[tmp_index] = set(i.split())
+    else:
+        outputs = set(outputs.split())
+
+    try:
+        tmp_result = (exec_outputs == outputs)
+    except Exception as e:
+        if debug:
+            print(f"Failed check4 exception = {e}")
+    if tmp_result:
+        return True
+    
+    # try by converting the output into a split up list too
+    if isinstance(exec_outputs, list):
+        for tmp_index, i in enumerate(exec_outputs):
+            exec_outputs[tmp_index] = i.split()
+        exec_outputs = list(filter(len, exec_outputs))
+        for tmp_index, i in enumerate(exec_outputs):
+            exec_outputs[tmp_index] = set(i)    
+    else:
+        exec_outputs = exec_outputs.split()
+        exec_outputs = list(filter(len, exec_outputs))
+        exec_outputs = set(exec_outputs)
+    try:
+        tmp_result = (set(frozenset(s) for s in exec_outputs) == set(frozenset(s) for s in outputs))
+    except Exception as e:
+        if debug:
+            print(f"Failed check5 exception = {e}")
+    
+    # if they are all numbers, round so that similar numbers are treated as identical
+    try:
+        tmp_result = tmp_result or (set(frozenset(round(float(t),3) for t in s) for s in exec_outputs) ==\
+            set(frozenset(round(float(t),3) for t in s) for s in outputs))
+    except Exception as e:
+        if debug:
+            print(f"Failed check6 exception = {e}")
+
+    if tmp_result:
+        return True
+    
+    return False
+
+def stripped_string_compare(s1, s2):
+    s1 = s1.lstrip().rstrip()
+    s2 = s2.lstrip().rstrip()
+    is_equal = s1 == s2
+    if is_equal:
+        return True
+
+    # Edge case: Check if s1 and s2 are floats.
+    try:
+        s1_float = float(s1)
+        s2_float = float(s2)
+        is_equal = np.isclose(s1_float, s2_float)
+        return is_equal
+    except Exception as e:
+        pass
+
+    # Edge case: Check if s1 and s2 rows are equal.
+    s1_list = s1.split("\n") 
+    s2_list = s2.split("\n")
+    s1_list = [s.lstrip().rstrip() for s in s1_list]
+    s2_list = [s.lstrip().rstrip() for s in s2_list]
+    
+    s1_list = [s for s in s1_list if s]
+    s2_list = [s for s in s2_list if s]
+    if len(s1_list) != len(s2_list):
+        return False
+    
+    for s1, s2 in zip(s1_list, s2_list):
+        sub_s1_list = s1.split()
+        sub_s2_list = s2.split()
+        sub_s1_list = [s.lstrip().rstrip() for s in sub_s1_list]
+        sub_s2_list = [s.lstrip().rstrip() for s in sub_s2_list]
+        sub_s1_list = [s for s in sub_s1_list if s]
+        sub_s2_list = [s for s in sub_s2_list if s]
+        if len(sub_s1_list) != len(sub_s2_list):
+            return False
+        for sub_s1, sub_s2 in zip(sub_s1_list, sub_s2_list):
+            if sub_s1 != sub_s2:
+                # If they are floats...
+                try:
+                    sub_s1_float = float(sub_s1)
+                    sub_s2_float = float(sub_s2)
+                    if not np.isclose(sub_s1_float, sub_s2_float):
+                        return False
+                except Exception as e:
+                    pass
+    return True
+
+def reliability_guard(maximum_memory_bytes=4 * 1024 * 1024 * 1024):
+    """
+    This disables various destructive functions and prevents the generated code
+    from interfering with the test (e.g. fork bomb, killing other processes,
+    removing filesystem files, etc.)
+    WARNING
+    This function is NOT a security sandbox. Untrusted code, including, model-
+    generated code, should not be blindly executed outside of one. See the
+    Codex paper for more information about OpenAI's code sandbox, and proceed
+    with caution.
+    """
+
+    if maximum_memory_bytes is not None:
+        import resource
+
+        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
+        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
+        if not platform.uname().system == "Darwin":
+            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
+
+    faulthandler.disable()
+
+    import builtins
+
+    builtins.exit = None
+    builtins.quit = None
+
+    import os
+
+    os.environ["OMP_NUM_THREADS"] = "1"
+
+    os.kill = None
+    os.system = None
+    os.putenv = None
+    os.remove = None
+    os.removedirs = None
+    os.rmdir = None
+    os.fchdir = None
+    os.setuid = None
+    os.fork = None
+    os.forkpty = None
+    os.killpg = None
+    os.rename = None
+    os.renames = None
+    os.truncate = None
+    os.replace = None
+    os.unlink = None
+    os.fchmod = None
+    os.fchown = None
+    os.chmod = None
+    os.chown = None
+    os.chroot = None
+    os.fchdir = None
+    os.lchflags = None
+    os.lchmod = None
+    os.lchown = None
+    os.getcwd = None
+    os.chdir = None
+
+    import shutil
+
+    shutil.rmtree = None
+    shutil.move = None
+    shutil.chown = None
+
+    import subprocess
+
+    subprocess.Popen = None  # type: ignore
+
+    __builtins__["help"] = None
+
+    import sys
+
+    sys.modules["ipdb"] = None
+    sys.modules["joblib"] = None
+    sys.modules["resource"] = None
+    sys.modules["psutil"] = None
+    sys.modules["tkinter"] = None
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/utils.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/utils.py
new file mode 100644
index 000000000..c628fb96c
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/code_utils/utils.py
@@ -0,0 +1,98 @@
+BASE_IMPORTS = """from itertools import accumulate, chain, combinations, count, permutations, product, groupby, islice, repeat
+from copy import deepcopy
+from string import ascii_lowercase, ascii_uppercase
+from math import floor, log2, log10, sqrt, comb, gcd, ceil, inf, isqrt, factorial, atan2, pi
+from collections import defaultdict, deque, Counter
+from bisect import bisect, bisect_left, bisect_right, insort
+from heapq import heappush, heappop, heapify, merge, nlargest, nsmallest, heapreplace
+from functools import reduce, cache, lru_cache, cmp_to_key, reduce
+from random import randrange, shuffle
+from operator import itemgetter, sub, xor, or_
+from re import search as re_search  # Assuming 're' refers to a regex search
+from os.path import commonprefix
+from typing import List, Tuple, Dict, Set, Optional, Union, Any, Callable, Iterable, Iterator, Generator, Deque
+import copy
+import string
+import math
+import collections
+import bisect
+import heapq
+import functools
+import random
+import itertools
+import operator
+import re
+import datetime
+from time import time
+import numpy as np
+import pandas as pd
+from math import log, prod  # 'log' and 'prod' are functions in the math module
+from collections import deque, defaultdict, Counter, OrderedDict
+from itertools import accumulate, permutations, combinations, product, groupby, islice, chain, repeat, zip_longest, cycle, pairwise
+from functools import lru_cache, reduce, partial
+from operator import iand
+import sys
+import io, os
+"""
+
+
+
+BASE_LEETCODE_IMPORTS = """
+# Leetcode specials
+class ListNode:
+    def __init__(self, val=0, next=None):
+        self.val = val
+        self.next = next
+
+def is_same_list(l1: Optional[ListNode], l2: Optional[ListNode]) -> bool:
+    while l1 and l2:
+        if l1.val != l2.val:
+            return False
+        l1 = l1.next
+        l2 = l2.next
+    return l1 is None and l2 is None
+
+def list_node(values: List[Any]) -> Optional[ListNode]:
+    dummy = ListNode(0)
+    current = dummy
+    for value in values:
+        current.next = ListNode(value)
+        current = current.next
+    return dummy.next
+
+class TreeNode:
+    def __init__(self, val=0, left=None, right=None):
+        self.val = val
+        self.left = left
+        self.right = right
+
+def tree_node(values: List[Any]) -> Optional[TreeNode]:
+    if not values:
+        return None
+
+    root = TreeNode(values[0])
+    queue = deque([root])
+    i = 1
+
+    while queue and i < len(values):
+        node = queue.popleft()
+
+        if values[i] is not None:
+            node.left = TreeNode(values[i])
+            queue.append(node.left)
+        i += 1
+
+        if i < len(values) and values[i] is not None:
+            node.right = TreeNode(values[i])
+            queue.append(node.right)
+        i += 1
+
+    return root
+
+def is_same_tree(p: Optional[TreeNode], q: Optional[TreeNode]) -> bool:
+    if not p and not q:
+        return True
+    if not p or not q:
+        return False
+    return p.val == q.val and is_same_tree(p.left, q.left) and is_same_tree(p.right, q.right)
+"""
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/reward_types.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/reward_types.py
new file mode 100644
index 000000000..dffff084d
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepcoder/reward_types.py
@@ -0,0 +1,116 @@
+"""
+This module defines data structures and base classes for reward calculations
+to evaluate model responses for various problem types, including math and coding.
+"""
+
+from dataclasses import dataclass
+from enum import Enum
+
+@dataclass
+class RewardConfig:
+    # Config for math-bsed rewards
+    math_reward_weight: float = 1.0
+    use_math_orm: bool = False
+
+    # Config for code-based rewards
+    code_reward_weight: float = 1.0
+    
+    # Config for cot-based rewards
+    cot_reward_weight: float = 0.0
+    
+    # General reward constants
+    correct_reward: float = 1.0
+    incorrect_reward: float = 0.0
+    format_error_reward: float = 0.0
+    unk_error_reward: float = 0.0
+    
+    # Bonus reward for calling tools.
+    toolcall_bonus: float = 0.5
+
+
+class RewardType(Enum):
+    """
+    Enum class representing the different types of rewards that can be assigned.
+
+    Attributes:
+        MATH (str): Represents a math-related problem type.
+        CODE (str): Represents a coding-related problem type.
+        UNK (str): Represents an unknown or unclassified problem type.
+    """
+    MATH = 'MATH'
+    CODE = 'CODE'
+    UNK = 'UNK'
+
+@dataclass(slots=True, kw_only=True)
+class RewardInput:
+    """Data structure for input required to calculate rewards.
+
+    Attributes:
+        problem (str): The original problem text or prompt provided to the model.
+        problem_type (RewardType): The category of the problem (e.g., math, code) to be evaluated.
+        data_source (str): The source of the data (e.g., dataset name) used for evaluation.
+        model_response (str): The response generated by the model that needs evaluation.
+        metadata (dict): Additional contextual information necessary for evaluation:
+            - For math problems: This may include the ground truth answer.
+            - For coding problems: This may include unit tests to validate the solution.
+    """
+    problem: str
+    problem_type: RewardType = RewardType.UNK
+    data_source: str
+    model_response: str
+    metadata: dict
+    """
+    for code dataset
+
+    
+    #codeforces
+    metadata['test_cases'] = [[ { "input": "3 6 9", "output": "6" }, { "input": "4 4 4", "output": "4" }, { "input": "0 0 0", "output": "0" }]
+
+    #codetest
+    metadata[['public_tests'] = { 
+    { "input": [ "3\n((()))\n(())()\n()(()" ],
+	 "output": [ "YES\nYES\nNO" ] } }
+
+    # apps/ TACO
+    metadata[["input_output"] =  {
+    { "inputs": [ "8\n5 2\nWLWLL\n6 5\nLLLWWL\n7 1\nLWLWLWL\n15 5\nWWWLLLWWWLLLWWW\n40 7\nLLWLWLWWWLWLLWLWWWLWLLWLLWLLLLWLLWWWLWWL\n1 0\nL\n1 1\nL\n6 1\nWLLWLW\n" ],
+    "outputs": [ "7\n11\n6\n26\n46\n0\n1\n6\n" ] }
+    }
+    """
+
+@dataclass(slots=True, kw_only=True)
+class LiveCodebenchInput:
+    """Data structure for input required to calculate rewards.
+    
+    """
+    problem_type: RewardType = RewardType.CODE
+    question: str
+    generation_code: str
+    problem: dict
+    difficult:str='easy'
+
+@dataclass(slots=True, kw_only=True)
+class RewardOutput:
+    """Data structure for the output of reward calculations.
+
+    Attributes:
+        reward (float): The computed reward value based on the evaluation of the model's response.
+        is_correct (bool): A boolean flag indicating whether the model's response is deemed correct.
+    """
+    reward: float
+    is_correct: bool
+
+
+class RewardFn:
+    """Abstract base class for defining reward calculation strategies.
+
+    This class should be subclassed to implement specific reward calculation logic.
+    The __call__ method must be overridden to provide the functionality for evaluating
+    the input and returning the corresponding reward output.
+    """
+    def __init__(self, config: RewardConfig):
+        self.config = config
+
+    def __call__(self, input: RewardInput) -> RewardOutput:
+        raise NotImplementedError("Subclasses must implement this method.")
+
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/__init__.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/__init__.py
new file mode 100644
index 000000000..f660cbe2b
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/__init__.py
@@ -0,0 +1,3 @@
+"""Import reward-related classes and types from the reward module."""
+
+from .math_reward import deepscaler_reward_fn
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/globals.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/globals.py
new file mode 100644
index 000000000..6dbc5891a
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/globals.py
@@ -0,0 +1,7 @@
+"""
+Global variables for Deepscaler repo.
+"""
+
+# Reward function constants
+THOUGHT_DELIMITER_START = "<think>"
+THOUGHT_DELIMITER_END = "</think>"
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/math_reward.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/math_reward.py
new file mode 100644
index 000000000..8686713cf
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/math_reward.py
@@ -0,0 +1,78 @@
+"""
+This module contains the RewardMathFn class, which evaluates mathematical answers
+and assigns rewards based on their correctness. It utilizes a language model to 
+validate answers when necessary.
+"""
+from typing import List, Union
+
+from .globals import THOUGHT_DELIMITER_START, THOUGHT_DELIMITER_END
+from .reward_types import RewardConfig, RewardFn, RewardInput, RewardOutput, RewardType
+from .math_utils.utils import extract_answer, grade_answer_sympy, grade_answer_mathd, grade_answer_math_verify
+
+class RewardMathFn(RewardFn):
+    """
+    Reward function for evaluating mathematical answers.
+
+    This class implements the __call__ method to process the input and determine
+    the reward based on the correctness of the provided answer compared to the ground truth.
+    """
+
+    def __call__(self, input: RewardInput) -> RewardOutput:
+        assert input.problem_type == RewardType.MATH, \
+            "Invalid problem type: expected 'MATH', but got '{}'".format(input.problem_type)
+        
+        problem = input.problem
+        model_response = input.model_response
+        
+        # Extract solution.
+        # THOUGHT_DELIMITER_START is always present in prompt.
+        if THOUGHT_DELIMITER_END in model_response:
+            model_response = model_response.split(THOUGHT_DELIMITER_END)[-1].strip()
+        
+        model_answer = extract_answer(model_response)
+        if model_answer is None:
+            return RewardOutput(reward=self.config.format_error_reward, is_correct=False)
+
+        # Process the ground truth(s)
+        ground_truths = input.ground_truth.get("answer", None)
+        if ground_truths is None:
+            return RewardOutput(reward=self.config.unk_error_reward, is_correct=False)
+        
+        # Convert single answer to list for uniform processing
+        if isinstance(ground_truths, (str, float, int)):
+            ground_truths = [ground_truths]
+            
+        # Process each ground truth
+        processed_ground_truths = []
+        for truth in ground_truths:
+            truth = str(truth)
+            if "\\boxed" in truth:
+                processed_truth = extract_answer(truth)
+                if processed_truth is not None:
+                    processed_ground_truths.append(processed_truth)
+            else:
+                processed_ground_truths.append(truth)
+        
+        if not processed_ground_truths:
+            return RewardOutput(reward=self.config.unk_error_reward, is_correct=False)
+
+        # Check against all possible correct answers
+        for ground_truth in processed_ground_truths:
+            is_correct = grade_answer_math_verify(model_answer, ground_truth) or grade_answer_mathd(model_answer, ground_truth) or grade_answer_sympy(model_answer, ground_truth)
+            if is_correct:
+                return RewardOutput(reward=self.config.correct_reward, is_correct=True)
+                
+        return RewardOutput(reward=self.config.incorrect_reward, is_correct=False)
+
+def deepscaler_reward_fn(solution_str: str, ground_truth: Union[str, List[str]], enable_llm = False):
+    reward_config = RewardConfig()
+    reward_config.use_math_orm = enable_llm
+    reward_fn = RewardMathFn(reward_config)
+    reward_response = reward_fn(RewardInput(problem=solution_str, problem_type=RewardType.MATH, model_response=solution_str, ground_truth={"answer": ground_truth}))
+    return reward_response.is_correct
+
+if __name__ == "__main__":
+    reward = RewardMathFn(RewardConfig)
+    input = RewardInput(problem="Let $P(x)=x^{4}+2 x^{3}-13 x^{2}-14 x+24$ be a polynomial with roots $r_{1}, r_{2}, r_{3}, r_{4}$. Let $Q$ be the quartic polynomial with roots $r_{1}^{2}, r_{2}^{2}, r_{3}^{2}, r_{4}^{2}$, such that the coefficient of the $x^{4}$ term of $Q$ is 1. Simplify the quotient $Q\\left(x^{2}\\right) / P(x)$, leaving your answer in terms of $x$. (You may assume that $x$ is not equal to any of $\\left.r_{1}, r_{2}, r_{3}, r_{4}\\right)$.", problem_type=RewardType.MATH, model_response="<think> I am omniscient. </think> The answer is \\boxed{24 + 14*x + (-13)*x^2 - 2*x^3 + x^4}.", ground_truth={"answer": ["10", "$x^{4}-2 x^{3}-13 x^{2}+14 x+24$"]})
+    output = reward(input)
+    print(output)
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/math_utils/__init__.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/math_utils/__init__.py
new file mode 100644
index 000000000..1ae405e29
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/math_utils/__init__.py
@@ -0,0 +1,15 @@
+"""
+This module provides utility functions for grading mathematical answers and extracting answers from LaTeX formatted strings.
+"""
+
+from .utils import (
+    extract_answer,
+    grade_answer_sympy,
+    grade_answer_mathd,
+)
+
+__all__ = [
+    "extract_answer",
+    "grade_answer_sympy",
+    "grade_answer_mathd"
+]
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/math_utils/utils.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/math_utils/utils.py
new file mode 100644
index 000000000..6baca262f
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/math_utils/utils.py
@@ -0,0 +1,585 @@
+"""
+Answer checker API that uses sympy to simplify expressions and check for equality.
+
+Call grade_answer(given_answer: str, ground_truth: str).
+"""
+import re
+from pylatexenc import latex2text
+import sympy
+from sympy.parsing import sympy_parser
+from typing import Optional
+from math_verify import parse, verify
+
+# Dan Hendrycks' code
+def mathd_normalize_answer(answer: Optional[str]) -> Optional[str]:
+    if answer is None:
+        return None
+    answer = answer.strip()
+    try:
+        # Remove enclosing `\text{}`.
+        m = re.search("^\\\\text\{(?P<text>.+?)\}$", answer)
+        if m is not None:
+            answer = m.group("text").strip()
+        return _strip_string(answer)
+    except:
+        return answer
+
+def _strip_string(string):
+    def _fix_fracs(string):
+        substrs = string.split("\\frac")
+        new_str = substrs[0]
+        if len(substrs) > 1:
+            substrs = substrs[1:]
+            for substr in substrs:
+                new_str += "\\frac"
+                if substr[0] == "{":
+                    new_str += substr
+                else:
+                    try:
+                        assert len(substr) >= 2
+                    except:
+                        return string
+                    a = substr[0]
+                    b = substr[1]
+                    if b != "{":
+                        if len(substr) > 2:
+                            post_substr = substr[2:]
+                            new_str += "{" + a + "}{" + b + "}" + post_substr
+                        else:
+                            new_str += "{" + a + "}{" + b + "}"
+                    else:
+                        if len(substr) > 2:
+                            post_substr = substr[2:]
+                            new_str += "{" + a + "}" + b + post_substr
+                        else:
+                            new_str += "{" + a + "}" + b
+        string = new_str
+        return string
+
+
+    def _fix_a_slash_b(string):
+        if len(string.split("/")) != 2:
+            return string
+        a = string.split("/")[0]
+        b = string.split("/")[1]
+        try:
+            a = int(a)
+            b = int(b)
+            assert string == "{}/{}".format(a, b)
+            new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
+            return new_string
+        except:
+            return string
+
+
+    def _remove_right_units(string):
+        # "\\text{ " only ever occurs (at least in the val set) when describing units
+        if "\\text{ " in string:
+            splits = string.split("\\text{ ")
+            assert len(splits) == 2
+            return splits[0]
+        else:
+            return string
+
+
+    def _fix_sqrt(string):
+        if "\\sqrt" not in string:
+            return string
+        splits = string.split("\\sqrt")
+        new_string = splits[0]
+        for split in splits[1:]:
+            if split[0] != "{":
+                a = split[0]
+                new_substr = "\\sqrt{" + a + "}" + split[1:]
+            else:
+                new_substr = "\\sqrt" + split
+            new_string += new_substr
+        return new_string
+    # linebreaks
+    string = string.replace("\n", "")
+    # print(string)
+
+    # remove inverse spaces
+    string = string.replace("\\!", "")
+    # print(string)
+
+    # replace \\ with \
+    string = string.replace("\\\\", "\\")
+    # print(string)
+
+    # replace tfrac and dfrac with frac
+    string = string.replace("tfrac", "frac")
+    string = string.replace("dfrac", "frac")
+    # print(string)
+
+    # remove \left and \right
+    string = string.replace("\\left", "")
+    string = string.replace("\\right", "")
+    # print(string)
+
+    # Remove circ (degrees)
+    string = string.replace("^{\\circ}", "")
+    string = string.replace("^\\circ", "")
+
+    # remove dollar signs
+    string = string.replace("\\$", "")
+
+    # remove units (on the right)
+    string = _remove_right_units(string)
+
+    # remove percentage
+    string = string.replace("\\%", "")
+    string = string.replace("\%", "")
+
+    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+    string = string.replace(" .", " 0.")
+    string = string.replace("{.", "{0.")
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == ".":
+        string = "0" + string
+
+    # to consider: get rid of e.g. "k = " or "q = " at beginning
+    if len(string.split("=")) == 2:
+        if len(string.split("=")[0]) <= 2:
+            string = string.split("=")[1]
+
+    # fix sqrt3 --> sqrt{3}
+    string = _fix_sqrt(string)
+
+    # remove spaces
+    string = string.replace(" ", "")
+
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
+    string = _fix_fracs(string)
+
+    # manually change 0.5 --> \frac{1}{2}
+    if string == "0.5":
+        string = "\\frac{1}{2}"
+
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = _fix_a_slash_b(string)
+
+    return string
+
+
+# sympy might hang -- we don't care about trying to be lenient in these cases
+BAD_SUBSTRINGS = ["^{", "^("]
+BAD_REGEXES = ["\^[0-9]+\^", "\^[0-9][0-9]+"]
+TUPLE_CHARS = "()[]"
+
+
+def _sympy_parse(expr: str):
+    """Parses an expression with sympy."""
+    py_expr = expr.replace("^", "**")
+    return sympy_parser.parse_expr(
+        py_expr,
+        transformations=(
+            sympy_parser.standard_transformations
+            + (sympy_parser.implicit_multiplication_application,)
+        ),
+    )
+
+
+def _parse_latex(expr: str) -> str:
+    """Attempts to parse latex to an expression sympy can read."""
+    expr = expr.replace("\\tfrac", "\\frac")
+    expr = expr.replace("\\dfrac", "\\frac")
+    expr = expr.replace("\\frac", " \\frac")  # Play nice with mixed numbers.
+    expr = latex2text.LatexNodes2Text().latex_to_text(expr)
+
+    # Replace the specific characters that this parser uses.
+    expr = expr.replace("√", "sqrt")
+    expr = expr.replace("π", "pi")
+    expr = expr.replace("∞", "inf")
+    expr = expr.replace("∪", "U")
+    expr = expr.replace("·", "*")
+    expr = expr.replace("×", "*")
+
+    return expr.strip()
+
+
+def _is_float(num: str) -> bool:
+    try:
+        float(num)
+        return True
+    except ValueError:
+        return False
+
+
+def _is_int(x: float) -> bool:
+    try:
+        return abs(x - int(round(x))) <= 1e-7
+    except:
+        return False
+
+
+def _is_frac(expr: str) -> bool:
+    return bool(re.search(r"^-?[0-9]+.?/0*[1-9][0-9]*.?$", expr))
+
+
+def _str_is_int(x: str) -> bool:
+    try:
+        x = _strip_properly_formatted_commas(x)
+        x = float(x)
+        return abs(x - int(round(x))) <= 1e-7
+    except:
+        return False
+
+
+def _str_to_int(x: str) -> bool:
+    x = x.replace(",", "")
+    x = float(x)
+    return int(x)
+
+
+def _inject_implicit_mixed_number(step: str):
+    """
+    Automatically make a mixed number evalable
+    e.g. 7 3/4 => 7+3/4
+    """
+    p1 = re.compile("([0-9]) +([0-9])")
+    step = p1.sub("\\1+\\2", step)  ## implicit mults
+    return step
+
+
+def _strip_properly_formatted_commas(expr: str):
+    # We want to be careful because we don't want to strip tuple commas
+    p1 = re.compile("(\d)(,)(\d\d\d)($|\D)")
+    while True:
+        next_expr = p1.sub("\\1\\3\\4", expr)
+        if next_expr == expr:
+            break
+        expr = next_expr
+    return next_expr
+
+
+def _normalize(expr: str) -> str:
+    """Normalize answer expressions."""
+    if expr is None:
+        return None
+
+    # Remove enclosing `\text{}`.
+    m = re.search("^\\\\text\{(?P<text>.+?)\}$", expr)
+    if m is not None:
+        expr = m.group("text")
+
+    expr = expr.replace("\\%", "%")
+    expr = expr.replace("\\$", "$")
+    expr = expr.replace("$", "")
+    expr = expr.replace("%", "")
+    expr = expr.replace(" or ", " , ")
+    expr = expr.replace(" and ", " , ")
+
+    expr = expr.replace("million", "*10^6")
+    expr = expr.replace("billion", "*10^9")
+    expr = expr.replace("trillion", "*10^12")
+
+    for unit in [
+        "degree",
+        "cm",
+        "centimeter",
+        "meter",
+        "mile",
+        "second",
+        "minute",
+        "hour",
+        "day",
+        "week",
+        "month",
+        "year",
+        "foot",
+        "feet",
+        "inch",
+        "yard",
+    ]:
+        expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
+    expr = re.sub(f"\^ *\\\\circ", "", expr)
+
+    if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
+        expr = expr[1:-1]
+
+    expr = re.sub(",\\\\! *", "", expr)
+    if _is_float(expr) and _is_int(float(expr)):
+        expr = str(int(round(float(expr))))
+    if "\\" in expr:
+        try:
+            expr = _parse_latex(expr)
+        except:
+            pass
+
+    # edge case with mixed numbers and negative signs
+    expr = re.sub("- *", "-", expr)
+
+    expr = _inject_implicit_mixed_number(expr)
+    expr = expr.replace(" ", "")
+
+    # if we somehow still have latex braces here, just drop them
+    expr = expr.replace("{", "")
+    expr = expr.replace("}", "")
+
+    # don't be case sensitive for text answers
+    expr = expr.lower()
+
+    if _str_is_int(expr):
+        expr = str(_str_to_int(expr))
+
+    return expr
+
+
+def count_unknown_letters_in_expr(expr: str):
+    expr = expr.replace("sqrt", "")
+    expr = expr.replace("frac", "")
+    letters_in_expr = set([x for x in expr if x.isalpha()])
+    return len(letters_in_expr)
+
+
+def should_allow_eval(expr: str):
+    # we don't want to try parsing unknown text or functions of more than two variables
+    if count_unknown_letters_in_expr(expr) > 2:
+        return False
+
+    for bad_string in BAD_SUBSTRINGS:
+        if bad_string in expr:
+            return False
+
+    for bad_regex in BAD_REGEXES:
+        if re.search(bad_regex, expr) is not None:
+            return False
+
+    return True
+
+
+def are_equal_under_sympy(ground_truth_normalized: str, given_normalized: str):
+    are_equal = False
+    try:
+        expr = f"({ground_truth_normalized})-({given_normalized})"
+        if should_allow_eval(expr):
+            sympy_diff = _sympy_parse(expr)
+            simplified = sympy.simplify(sympy_diff)
+            if simplified == 0:
+                are_equal = True
+    except:
+        pass
+    return are_equal
+
+
+def split_tuple(expr: str):
+    """
+    Split the elements in a tuple/interval, while handling well-formatted commas in large numbers
+    """
+    expr = _strip_properly_formatted_commas(expr)
+    if len(expr) == 0:
+        return []
+    if (
+        len(expr) > 2
+        and expr[0] in TUPLE_CHARS
+        and expr[-1] in TUPLE_CHARS
+        and all([ch not in expr[1:-1] for ch in TUPLE_CHARS])
+    ):
+        elems = [elem.strip() for elem in expr[1:-1].split(",")]
+    else:
+        elems = [expr]
+    return elems
+
+
+def last_boxed_only_string(string):
+    idx = string.rfind("\\boxed")
+    if idx < 0:
+        idx = string.rfind("\\fbox")
+        if idx < 0:
+            return None
+
+    i = idx
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == "{":
+            num_left_braces_open += 1
+        if string[i] == "}":
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+        i += 1
+    
+    if right_brace_idx == None:
+        retval = None
+    else:
+        retval = string[idx:right_brace_idx + 1]
+    
+    return retval
+
+def remove_boxed(s):
+    left = "\\boxed{"
+    try:
+        assert s[:len(left)] == left
+        assert s[-1] == "}"
+        return s[len(left):-1]
+    except:
+        return None
+
+
+def extract_boxed_answer(solution: str) -> str:
+    """Extract the answer from inside a LaTeX \\boxed{} command"""
+    solution = last_boxed_only_string(solution)
+    solution = remove_boxed(solution)
+    return solution
+
+def grade_answer_sympy(given_answer: str, ground_truth: str) -> bool:
+    ground_truth_normalized = _normalize(ground_truth)
+    given_normalized = _normalize(given_answer)
+
+    if ground_truth_normalized is None:
+        return False
+
+    if ground_truth_normalized == given_normalized:
+        return True
+
+    if len(given_normalized) == 0:
+        return False
+
+    ground_truth_elems = split_tuple(ground_truth_normalized)
+    given_elems = split_tuple(given_normalized)
+
+    if len(ground_truth_elems) > 1 and (
+        ground_truth_normalized[0] != given_normalized[0]
+        or ground_truth_normalized[-1] != given_normalized[-1]
+    ):
+        is_correct = False
+    elif len(ground_truth_elems) != len(given_elems):
+        is_correct = False
+    else:
+        for ground_truth_elem, given_elem in zip(ground_truth_elems, given_elems):
+            if _is_frac(ground_truth_elem) and _is_frac(given_elem):
+                # if fractions aren't reduced, then shouldn't be marked as correct
+                # so, we don't want to allow sympy.simplify in this case
+                is_correct = ground_truth_elem == given_elem
+            elif _str_is_int(ground_truth_elem) != _str_is_int(given_elem):
+                # if the ground truth answer is an integer, we require the given answer to be a strict match (no sympy.simplify)
+                is_correct = False
+            else:
+                is_correct = are_equal_under_sympy(ground_truth_elem, given_elem)
+            if not is_correct:
+                break
+
+    return is_correct
+
+def grade_answer_mathd(given_answer: str, ground_truth: str) -> bool:
+    ground_truth_normalized_mathd = mathd_normalize_answer(ground_truth)
+    given_answer_normalized_mathd = mathd_normalize_answer(given_answer)
+
+    # be at least as lenient as mathd
+    if ground_truth_normalized_mathd == given_answer_normalized_mathd:
+        return True
+    return False
+
+def extract_answer(passage: str) -> str:
+    if "\\boxed" in passage:
+        return extract_boxed_answer(passage)
+    return None
+
+def grade_answer_verl(solution_str, ground_truth):
+    if not ground_truth:
+        return False
+    if '\\boxed' in ground_truth:
+        ground_truth = extract_answer(ground_truth)
+    given_answer = extract_answer(solution_str)
+    if given_answer is None:
+        return False
+    return grade_answer_mathd(given_answer, ground_truth) \
+        or grade_answer_sympy(given_answer, ground_truth)
+
+def grade_answer_math_verify(given_answer: str, ground_truth: str) -> bool:
+    try:
+        ground_truth_normalized = parse(ground_truth)
+        given_normalized = parse(given_answer)
+        result = verify(ground_truth_normalized, given_normalized)
+        return result
+    except:
+        return False
+
+if __name__ == "__main__":
+    test_cases = [
+        { # True
+            "model_answer": "24 + 14*x + (-13)*x^2 - 2*x^3 + x^4",
+            "ground_truth": "$x^{4}-2 x^{3}-13 x^{2}+14 x+24$",
+        },
+        { # True
+            "model_answer": "${1,2,3,4}$",
+            "ground_truth": "${1,3} \\cup {2,4}$"
+        },
+        { # True
+            "model_answer": "d",
+            "ground_truth": "D"
+        },
+        { # True
+            "model_answer": "D",
+            "ground_truth": "D"
+        },
+        { # True
+            "model_answer": "(D)",
+            "ground_truth": "D"
+        },
+        { # False
+            "model_answer": "D)",
+            "ground_truth": "D"
+        },
+        { # True
+            "model_answer": "50 \\, \\text{g/mol}",
+            "ground_truth": "50.0 \\text{ grams per mol}"
+        },        
+        { # True
+            "model_answer": "6 \\, \\text{meters}",
+            "ground_truth": "6"
+        },        
+        { # False
+            "model_answer": "16.2 \\, \\text{M}",
+            "ground_truth": "16.23 \\, M"
+        }, 
+        { # True
+            "model_answer": "16.2 \\, \\text{M}",
+            "ground_truth": "16.2 \\, M"
+        },        
+        { # False
+            "model_answer": "KBr \\\\\nCaBr_2 \\\\\nAlBr_3",
+            "ground_truth": "\\text{KBr, CaBr}_2, \\text{AlBr}_3"
+        },        
+        { # True
+            "model_answer": "\\text{2-Aminopentane}",
+            "ground_truth": "2\\text{-Aminopentane}"
+        },
+        { # False
+            "model_answer": "305 \\, \\text{W}",
+            "ground_truth": "\\[ 0.3052 \\, \\text{kW} \\]"
+        },
+        { # False
+            "model_answer": "305 W",
+            "ground_truth": "0.305 kW"
+        },
+        { # True
+            "model_answer": "bed bug",
+            "ground_truth": "bed bug"
+        },
+        { # True
+            "model_answer": "bed bug",
+            "ground_truth": "\\text{bed bug}"
+        },
+        { # True
+            "model_answer": "Can screen for aneuploidies and other conditions",
+            "ground_truth": "\\text{Can screen for aneuploidies and other conditions}"
+        }
+    ]
+
+    grading_functions = [
+        ("grade_answer_mathd", grade_answer_mathd),
+        ("grade_answer_sympy", grade_answer_sympy),
+        ("grade_answer_math_verify", grade_answer_math_verify)
+    ]
+
+    for test_case in test_cases:
+        print(f"\n------Testing model_answer: {test_case['model_answer']} v.s. ground_truth: {test_case['ground_truth']}------")
+        for func_name, func in grading_functions:
+            is_correct = func(test_case["model_answer"], test_case["ground_truth"])
+            print(f"{func_name}: {is_correct}")
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/reward_types.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/reward_types.py
new file mode 100644
index 000000000..d5d48795d
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/deepscaler/reward_types.py
@@ -0,0 +1,76 @@
+"""
+This module defines data structures and base classes for reward calculations
+to evaluate model responses for various problem types, including math and coding.
+"""
+
+from dataclasses import dataclass, field
+from enum import Enum
+
+@dataclass
+class RewardConfig:
+    # Use LLM as ORM to evaluate correctness.
+    use_math_orm: bool = False
+    
+    # General reward constants.
+    correct_reward: float = 1.0
+    incorrect_reward: float = -1.0
+    format_error_reward: float = -1.0
+    unk_error_reward: float = -1.0
+
+
+class RewardType(Enum):
+    """
+    Enum class representing the different types of rewards that can be assigned.
+
+    Attributes:
+        MATH (str): Represents a math-related problem type.
+        CODE (str): Represents a coding-related problem type.
+        UNK (str): Represents an unknown or unclassified problem type.
+    """
+    MATH = 'MATH'
+    CODE = 'CODE'
+    UNK = 'UNK'
+
+
+@dataclass
+class RewardInput:
+    """Data structure for input required to calculate rewards.
+
+    Attributes:
+        problem (str): The original problem text or prompt provided to the model.
+        model_response (str): The response generated by the model that needs evaluation.
+        problem_type (RewardType): The category of the problem (e.g., math, code) to be evaluated.
+        ground_truth (dict): Additional contextual information necessary for evaluation:
+            - For math problems: This may include the ground truth answer.
+            - For coding problems: This may include unit tests to validate the solution.
+    """
+    problem: str
+    model_response: str
+    problem_type: RewardType = RewardType.UNK
+    ground_truth: dict = field(default_factory=dict)
+
+
+@dataclass
+class RewardOutput:
+    """Data structure for the output of reward calculations.
+
+    Attributes:
+        reward (float): The computed reward value based on the evaluation of the model's response.
+        is_correct (bool): A boolean flag indicating whether the model's response is deemed correct.
+    """
+    reward: float
+    is_correct: bool
+
+
+class RewardFn:
+    """Abstract base class for defining reward calculation strategies.
+
+    This class should be subclassed to implement specific reward calculation logic.
+    The __call__ method must be overridden to provide the functionality for evaluating
+    the input and returning the corresponding reward output.
+    """
+    def __init__(self, config: RewardConfig):
+        self.config = config
+
+    def __call__(self, input: RewardInput) -> RewardOutput:
+        raise NotImplementedError("Subclasses must implement this method.")
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/gsm8k.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/gsm8k.py
new file mode 100644
index 000000000..709103764
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/gsm8k.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+
+def extract_solution(solution_str, method='strict'):
+    assert method in ['strict', 'flexible']
+
+    if method == 'strict':
+        # this also tests the formatting of the model
+        solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
+        if solution is None:
+            final_answer = None
+        else:
+            final_answer = solution.group(0)
+            final_answer = final_answer.split('#### ')[1].replace(',', '').replace('$', '')
+    elif method == 'flexible':
+        answer = re.findall("(\\-?[0-9\\.\\,]+)", solution_str)
+        final_answer = None
+        if len(answer) == 0:
+            # no reward is there is no answer
+            pass
+        else:
+            invalid_str = ['', '.']
+            # find the last number that is not '.'
+            for final_answer in reversed(answer):
+                if final_answer not in invalid_str:
+                    break
+    return final_answer
+
+
+def compute_score(solution_str, ground_truth, method='strict', format_score=0., score=1.):
+    """The scoring function for GSM8k.
+
+    Reference: Trung, Luong, et al. "Reft: Reasoning with reinforced fine-tuning." Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2024.
+
+    Args:
+        solution_str: the solution text
+        ground_truth: the ground truth
+        method: the method to extract the solution, choices are 'strict' and 'flexible'
+        format_score: the score for the format
+        score: the score for the correct answer
+    """
+    answer = extract_solution(solution_str=solution_str, method=method)
+    if answer is None:
+        return 0
+    else:
+        if answer == ground_truth:
+            return score
+        else:
+            return format_score
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/__init__.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/__init__.py
new file mode 100644
index 000000000..962c80032
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/__init__.py
@@ -0,0 +1 @@
+from .evaluation_main import compute_score_strict, compute_score_loose
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/evaluation_main.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/evaluation_main.py
new file mode 100644
index 000000000..cab09c3f9
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/evaluation_main.py
@@ -0,0 +1,98 @@
+# coding=utf-8
+# Copyright 2024 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Binary of evaluating instruction following. See README.md."""
+
+from . import instructions_registry
+import torch
+
+def extract_response(response0):
+  response = response0.split("</think>")[-1]
+  response = response.split("<answer>")[-1].strip("</answer>").strip()
+  return response
+
+def compute_score_strict(
+    prompt,
+    response,
+    instruction_list,
+    kwargs
+):
+  """Tests response to see if instrutions are followed."""
+  response = extract_response(response)
+  is_following_list = []
+
+  for index, instruction_id in enumerate(instruction_list):
+    instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+    instruction = instruction_cls(instruction_id)
+
+    instruction.build_description(**kwargs[index])
+    args = instruction.get_instruction_args()
+    if args and "prompt" in args:
+      instruction.build_description(prompt=prompt)
+
+    if response.strip() and instruction.check_following(response):
+      is_following_list.append(True)
+    else:
+      is_following_list.append(False)
+
+  return torch.tensor(is_following_list, dtype=torch.float32).mean().item()
+
+
+def compute_score_loose(
+    prompt,
+    response,
+    instruction_list,
+    kwargs,
+):
+  """Tests response for an upper bound for following instructions."""
+  response = extract_response(response)
+  r = response.split("\n")
+  response_remove_first = "\n".join(r[1:]).strip()
+  response_remove_last = "\n".join(r[:-1]).strip()
+  response_remove_both = "\n".join(r[1:-1]).strip()
+  revised_response = response.replace("*", "")
+  revised_response_remove_first = response_remove_first.replace("*", "")
+  revised_response_remove_last = response_remove_last.replace("*", "")
+  revised_response_remove_both = response_remove_both.replace("*", "")
+  all_responses = [
+      response,
+      revised_response,
+      response_remove_first,
+      response_remove_last,
+      response_remove_both,
+      revised_response_remove_first,
+      revised_response_remove_last,
+      revised_response_remove_both,
+  ]
+  is_following_list = []
+
+  for index, instruction_id in enumerate(instruction_list):
+    instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+    instruction = instruction_cls(instruction_id)
+
+    instruction.build_description(**kwargs[index])
+    args = instruction.get_instruction_args()
+    if args and "prompt" in args:
+      instruction.build_description(prompt=prompt)
+
+    is_following = False
+    for r in all_responses:
+      if r.strip() and instruction.check_following(r):
+        is_following = True
+        break
+
+    is_following_list.append(is_following)
+
+  return torch.tensor(is_following_list, dtype=torch.float32).mean().item()
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/instructions.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/instructions.py
new file mode 100644
index 000000000..484555e26
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/instructions.py
@@ -0,0 +1,1566 @@
+# coding=utf-8
+# Copyright 2024 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Library of instructions."""
+import collections
+import json
+import random
+import re
+import string
+from typing import Dict, Optional, Sequence, Union
+
+from absl import logging
+import langdetect
+
+from . import instructions_util
+
+
+_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]
+
+_LANGUAGES = instructions_util.LANGUAGE_CODES
+
+# The relational operation for comparison.
+_COMPARISON_RELATION = ("less than", "at least")
+
+# The maximum number of sentences.
+_MAX_NUM_SENTENCES = 20
+
+# The number of placeholders.
+_NUM_PLACEHOLDERS = 4
+
+# The number of bullet lists.
+_NUM_BULLETS = 5
+
+# The options of constrained response.
+_CONSTRAINED_RESPONSE_OPTIONS = (
+    "My answer is yes.", "My answer is no.", "My answer is maybe.")
+
+# The options of starter keywords.
+_STARTER_OPTIONS = ("I would say", "My answer is", "I believe",
+                    "In my opinion", "I think", "I reckon", "I feel",
+                    "From my perspective", "As I see it", "According to me",
+                    "As far as I'm concerned", "To my understanding",
+                    "In my view", "My take on it is", "As per my perception")
+
+# The options of ending keywords.
+# TODO(jeffreyzhou) add more ending options
+_ENDING_OPTIONS = ("Any other questions?",
+                   "Is there anything else I can help with?")
+
+# The number of highlighted sections.
+_NUM_HIGHLIGHTED_SECTIONS = 4
+
+# The section spliter.
+_SECTION_SPLITER = ("Section", "SECTION")
+
+# The number of sections.
+_NUM_SECTIONS = 5
+
+# The number of paragraphs.
+_NUM_PARAGRAPHS = 5
+
+# The postscript marker.
+_POSTSCRIPT_MARKER = ("P.S.", "P.P.S")
+
+# The number of keywords.
+_NUM_KEYWORDS = 2
+
+# The occurrences of a single keyword.
+_KEYWORD_FREQUENCY = 3
+
+# The occurrences of a single letter.
+_LETTER_FREQUENCY = 10
+
+# The occurrences of words with all capital letters.
+_ALL_CAPITAL_WORD_FREQUENCY = 20
+
+# The number of words in the response.
+_NUM_WORDS_LOWER_LIMIT = 100
+_NUM_WORDS_UPPER_LIMIT = 500
+
+
+class Instruction:
+  """An instruction template."""
+
+  def __init__(self, instruction_id):
+    self.id = instruction_id
+
+  def build_description(self, **kwargs):
+    raise NotImplementedError("`build_description` not implemented.")
+
+  def get_instruction_args(self):
+    raise NotImplementedError("`get_instruction_args` not implemented.")
+
+  def get_instruction_args_keys(self):
+    raise NotImplementedError("`get_instruction_args_keys` not implemented.")
+
+  def check_following(self, value):
+    raise NotImplementedError("`check_following` not implemented.")
+
+
+class ResponseLanguageChecker(Instruction):
+  """Check the language of the entire response."""
+
+  def build_description(self, *, language = None):
+    """Build the instruction description.
+
+    Args:
+      language: A string representing the expected language of the response. The
+        language has to comply to the 97 types defined in
+        `langid.py` (https://pypi.org/project/langid/1.1.5/), which follows
+        ISO 639-1 codes (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes);
+        for example, `en` for English, `zh` for Chinese, `fr` for French.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    self._language = language
+    if self._language is None:
+      self._language = random.choice(list(_LANGUAGES.keys()))
+    # TODO(tianjianlu): opens the description generation to more choices.
+    self._description_pattern = (
+        "Your ENTIRE response should be in {language} language, no other " +
+        "language is allowed.")
+    return self._description_pattern.format(language=_LANGUAGES[self._language])
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"language": self._language}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["language"]
+
+  def check_following(self, value):
+    """Check if the language of the entire response follows the instruction.
+
+    Args:
+      value: A string representing the response.
+
+    Returns:
+      True if the language of `value` follows instruction; otherwise False.
+    """
+    assert isinstance(value, str)
+
+    try:
+      return langdetect.detect(value) == self._language
+    except langdetect.LangDetectException as e:
+      # Count as instruction is followed.
+      logging.error(
+          "Unable to detect language for text %s due to %s", value, e
+      )  # refex: disable=pytotw.037
+      return True
+
+
+class NumberOfSentences(Instruction):
+  """Check the number of sentences."""
+
+  def build_description(self, *, num_sentences = None,
+                        relation = None):
+    """Build the instruction description.
+
+    Args:
+      num_sentences: An integer specifying the number of sentences as a
+        threshold.
+      relation: A string in (`less than`, `at least`), defining the relational
+        operator for comparison.
+        Two relational comparisons are supported for now:
+        if 'less than', the actual number of sentences < the threshold;
+        if 'at least', the actual number of sentences >= the threshold.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    # The number of sentences as a threshold for comparison.
+    self._num_sentences_threshold = num_sentences
+    if (self._num_sentences_threshold is None or
+        self._num_sentences_threshold < 0):
+      self._num_sentences_threshold = random.randint(1, _MAX_NUM_SENTENCES)
+
+    if relation is None:
+      self._comparison_relation = random.choice(_COMPARISON_RELATION)
+    elif relation not in _COMPARISON_RELATION:
+      raise ValueError("The supported relation for comparison must be in "
+                       f"{_COMPARISON_RELATION}, but {relation} is given.")
+    else:
+      self._comparison_relation = relation
+
+    self._description_pattern = (
+        "Your response should contain {relation} {num_sentences} sentences.")
+    return self._description_pattern.format(
+        relation=self._comparison_relation,
+        num_sentences=self._num_sentences_threshold)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"num_sentences": self._num_sentences_threshold,
+            "relation": self._comparison_relation}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["num_sentences", "relation"]
+
+  def check_following(self, value):
+    """Check if the number of sentences follows the instruction.
+
+    Args:
+      value: A string representing the response.
+
+    Returns:
+      True if the response follows the instruction.
+
+    Raise:
+        ValueError if the string in `instruction_args` is not in
+        [`less_than`, `at_least`].
+    """
+    num_sentences = instructions_util.count_sentences(value)
+    if self._comparison_relation == _COMPARISON_RELATION[0]:
+      return num_sentences < self._num_sentences_threshold
+    elif self._comparison_relation == _COMPARISON_RELATION[1]:
+      return num_sentences >= self._num_sentences_threshold  # pytype: disable=bad-return-type
+
+
+class PlaceholderChecker(Instruction):
+  """Check the placeholders in template writing."""
+
+  def build_description(self, *, num_placeholders = None):
+    """Build the instruction description.
+
+    Args:
+      num_placeholders: An integer denoting the minimum number of
+        placeholders required in the response.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    self._num_placeholders = num_placeholders
+    if self._num_placeholders is None or self._num_placeholders < 0:
+      self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS)
+    self._description_pattern = (
+        "The response must contain at least {num_placeholders} placeholders " +
+        "represented by square brackets, such as [address].")
+    return self._description_pattern.format(
+        num_placeholders=self._num_placeholders)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"num_placeholders": self._num_placeholders}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["num_placeholders"]
+
+  def check_following(self, value):
+    """Check if the number of placeholders follows the instruction.
+
+    Args:
+      value: A string representing the response.
+
+    Returns:
+      True if the actual number of placeholders in the response is greater than
+      or equal to `num_placeholders`; otherwise, False.
+    """
+    placeholders = re.findall(r"\[.*?\]", value)
+    num_placeholders = len(placeholders)
+    return num_placeholders >= self._num_placeholders
+
+
+class BulletListChecker(Instruction):
+  """Checks the bullet list in the prompt."""
+
+  def build_description(self, *, num_bullets = None):
+    """Build the instruction description.
+
+    Args:
+      num_bullets: An integer specifying the exact number of bullet lists
+        that is required to appear in the response.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    self._num_bullets = num_bullets
+    if self._num_bullets is None or self._num_bullets < 0:
+      self._num_bullets = random.randint(1, _NUM_BULLETS)
+    self._description_pattern = (
+        "Your answer must contain exactly {num_bullets} bullet points. " +
+        "Use the markdown bullet points such as:\n" +
+        "* This is point 1. \n" +
+        "* This is point 2")
+    return self._description_pattern.format(
+        num_bullets=self._num_bullets)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"num_bullets": self._num_bullets}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["num_bullets"]
+
+  def check_following(self, value):
+    r"""Check if the number of bullet lists meets the requirement.
+
+    Args:
+      value: A string representing the response. The response is expected to
+        contain some bullet lists that start with `\*`.
+
+    Returns:
+      True if the actual number of bullet lists in the response meets the
+      requirement.
+    """
+    bullet_lists = re.findall(r"^\s*\*[^\*].*$", value, flags=re.MULTILINE)
+    bullet_lists_2 = re.findall(r"^\s*-.*$", value, flags=re.MULTILINE)
+    num_bullet_lists = len(bullet_lists) + len(bullet_lists_2)
+    return num_bullet_lists == self._num_bullets
+
+
+class ConstrainedResponseChecker(Instruction):
+  """Checks the constrained response."""
+
+  def build_description(self):
+    """Build the instruction description."""
+    # A sequence of string(s) representing the options of the expected response.
+    self._constrained_responses = _CONSTRAINED_RESPONSE_OPTIONS
+    self._description_pattern = (
+        "Answer with one of the following options: {response_options}")
+    return self._description_pattern.format(
+        response_options=self._constrained_responses)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return None
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return []
+
+  def check_following(self, value):
+    """Checks if the response matches the constrained options.
+
+    Args:
+      value: A string representing the response.
+
+    Returns:
+      True if the actual response contains one of the options in the constrained
+      responses; otherwise False.
+    """
+    value = value.strip()
+    for constrained_response in self._constrained_responses:
+      if constrained_response in value:
+        return True
+    return False
+
+
+class ConstrainedStartChecker(Instruction):
+  """Checks the response start."""
+
+  def build_description(self, *, starter = None):
+    """Build the instruction description.
+
+    Args:
+      starter: A string representing the keyward that the response should start
+        with.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    self._starter = starter.strip() if isinstance(starter, str) else starter
+    if self._starter is None:
+      self._starter = random.choice(_STARTER_OPTIONS)
+    self._description_pattern = (
+        "During the conversation, when it is your turn, " +
+        "please always start with {starter}")
+    return self._description_pattern.format(starter=self._starter)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"starter": self._starter}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["starter"]
+
+  def check_following(self, value):
+    """Checks if the response starts with the constrained keyword or phrase.
+
+    Args:
+      value: A string representing the response.
+
+    Returns:
+      True if the response starts with the given phrase or keyword that is
+      contained in `instruction_args`; otherwise, False.
+    """
+    response_pattern = r"^\s*" + self._starter + r".*$"
+    response_with_constrained_start = re.search(response_pattern, value,
+                                                flags=re.MULTILINE)
+    return True if response_with_constrained_start else False
+
+
+class HighlightSectionChecker(Instruction):
+  """Checks the highlighted section."""
+
+  def build_description(self, *, num_highlights = None):
+    """Build the instruction description.
+
+    Args:
+      num_highlights: An integer specifying the minimum number of highlighted
+        sections.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    self._num_highlights = num_highlights
+    if self._num_highlights is None or self._num_highlights < 0:
+      self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)
+
+    self._description_pattern = (
+        "Highlight at least {num_highlights} sections in your answer with " +
+        "markdown, i.e. *highlighted section*.")
+
+    return self._description_pattern.format(num_highlights=self._num_highlights)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"num_highlights": self._num_highlights}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["num_highlights"]
+
+  def check_following(self, value):
+    """Checks if the number of highlighted sections meets the requirement.
+
+    Args:
+      value: a string repesenting the response. The response is expected to
+        contain highlighted sections in the format of *highlighted*.
+
+    Returns:
+      True if the actual number of highlighted sections in the format of
+      *highlighed sections* meets the minimum requirement; otherwise False.
+    """
+    num_highlights = 0
+    highlights = re.findall(r"\*[^\n\*]*\*", value)
+    double_highlights = re.findall(r"\*\*[^\n\*]*\*\*", value)
+    for highlight in highlights:
+      if highlight.strip("*").strip():
+        num_highlights += 1
+    for highlight in double_highlights:
+      if highlight.removeprefix("**").removesuffix("**").strip():
+        num_highlights += 1
+
+    return num_highlights >= self._num_highlights
+
+
+class SectionChecker(Instruction):
+  """Checks the sections."""
+
+  def build_description(self, *, section_spliter = None,
+                        num_sections = None):
+    """Build the instruction description.
+
+    Args:
+      section_spliter: A string represents the section spliter keyword that
+        marks a new section, i.e., `Section` or `SECTION`.
+      num_sections: An integer specifying the number of sections.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    self._section_spliter = section_spliter.strip() if isinstance(
+        section_spliter, str) else section_spliter
+    if self._section_spliter is None:
+      self._section_spliter = random.choice(_SECTION_SPLITER)
+
+    self._num_sections = num_sections
+    if self._num_sections is None or self._num_sections < 0:
+      self._num_sections = random.randint(1, _NUM_SECTIONS)
+
+    self._description_pattern = (
+        "Your response must have {num_sections} sections. Mark the beginning " +
+        "of each section with {section_spliter} X, such as:\n" +
+        "{section_spliter} 1\n" +
+        "[content of section 1]\n" +
+        "{section_spliter} 2\n" +
+        "[content of section 2]")
+
+    return self._description_pattern.format(
+        num_sections=self._num_sections,
+        section_spliter=self._section_spliter)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"section_spliter": self._section_spliter,
+            "num_sections": self._num_sections}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["section_spliter", "num_sections"]
+
+  def check_following(self, value):
+    """Checks the response contains multiple sections.
+
+    Args:
+      value: A string representing the response. The response is expected
+        to contain multiple sections (number of sections is greater than 1).
+        A new section starts with `Section 1`, where the number denotes the
+        section index.
+
+    Returns:
+      True if the number of sections in the response is greater than or equal to
+      the minimum number of sections; otherwise, False.
+    """
+    section_splitter_patten = r"\s?" + self._section_spliter  + r"\s?\d+\s?"
+    sections = re.split(section_splitter_patten, value)
+    num_sections = len(sections) - 1
+    return num_sections >= self._num_sections
+
+
+class ParagraphChecker(Instruction):
+  """Checks the paragraphs."""
+
+  def build_description(self, *, num_paragraphs = None):
+    """Build the instruction description.
+
+    Args:
+      num_paragraphs: An integer specifying the number of paragraphs.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    self._num_paragraphs = num_paragraphs
+    if self._num_paragraphs is None or self._num_paragraphs < 0:
+      self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
+
+    self._description_pattern = (
+        "There should be {num_paragraphs} paragraphs. " +
+        "Paragraphs are separated with the markdown divider: ***")
+
+    return self._description_pattern.format(num_paragraphs=self._num_paragraphs)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"num_paragraphs": self._num_paragraphs}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["num_paragraphs"]
+
+  def check_following(self, value):
+    """Checks the response contains required number of paragraphs.
+
+    Args:
+      value: A string representing the response. The response may contain
+        paragraphs that are separated by the markdown divider: `***`.
+
+    Returns:
+      True if the actual number of paragraphs is the same as required;
+      otherwise, False.
+    """
+    paragraphs = re.split(r"\s?\*\*\*\s?", value)
+    num_paragraphs = len(paragraphs)
+
+    for index, paragraph in enumerate(paragraphs):
+      if not paragraph.strip():
+        if index == 0 or index == len(paragraphs) - 1:
+          num_paragraphs -= 1
+        else:
+          return False
+
+    return num_paragraphs == self._num_paragraphs
+
+
+class PostscriptChecker(Instruction):
+  """Checks the postscript."""
+
+  def build_description(self, *, postscript_marker = None
+                        ):
+    """Build the instruction description.
+
+    Args:
+      postscript_marker: A string containing the keyword that marks the start
+        of the postscript section.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    self._postscript_marker = postscript_marker.strip() if isinstance(
+        postscript_marker, str) else postscript_marker
+    if self._postscript_marker is None:
+      self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)
+
+    self._description_pattern = (
+        "At the end of your response, please explicitly add a postscript " +
+        "starting with {postscript}")
+
+    return self._description_pattern.format(postscript=self._postscript_marker)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"postscript_marker": self._postscript_marker}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["postscript_marker"]
+
+  def check_following(self, value):
+    """Checks if the response follows the postscript format.
+
+    Args:
+      value: a string representing the response. The response is expected to
+        contain a postscript section.
+
+    Returns:
+      True if the response contains a postscript section starting with
+      the keyword containing in the `instruction_args`; otherwise False.
+    """
+    value = value.lower()
+    if self._postscript_marker == "P.P.S":
+      postscript_pattern = r"\s*p\.\s?p\.\s?s.*$"
+    elif self._postscript_marker == "P.S.":
+      postscript_pattern = r"\s*p\.\s?s\..*$"
+    else:
+      postscript_pattern = r"\s*" + self._postscript_marker.lower() + r".*$"
+    postscript = re.findall(postscript_pattern, value, flags=re.MULTILINE)
+    return True if postscript else False
+
+
+class RephraseChecker(Instruction):
+  """Checks the repharse."""
+
+  def build_description(self, *, original_message):
+    """Build the instruction description.
+
+    Args:
+      original_message: A string representing the original message. The
+        rephrased response should only change its words/sentences in between
+        its two asterisks, for example, *change me*. Both original and rephrased
+        messages should contain the changes in the form of *change me*.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    if not self.is_change(original_message):
+      raise ValueError(f"Message {original_message} does not contain changes "
+                       "in the form of *change me*.")
+
+    self._reference_without_change = original_message
+    self._description = ("Rephrasing: Your rephrased response should only" +
+                         "change the words/sentences in between two asterisks" +
+                         "such as *change me*.")
+    return self._description
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"original_message": self._reference_without_change}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["original_message"]
+
+  def check_following(self, value):
+    r"""Checks if the rephrasing follows the instruction.
+
+    Args:
+      value: A string representing the response, which is expected to rephras
+        the string of `instruction_args`.
+
+    Returns:
+      True if `value` and `instruction_args` only differ by the words/sentences
+      in between two asterisks such as *change me*; otherwise, False.
+    """
+
+    if not self.is_change(value):
+      raise ValueError(f"value {value} does not contain "
+                       "changes in the form of *change me*.")
+
+    response_without_changes = self.strip_changes(value)
+    reference_without_changes = self.strip_changes(
+        self._reference_without_change)
+
+    return response_without_changes == reference_without_changes
+
+  def is_change(self, response):
+    """Check if there is change in the response in the form of *change me*."""
+    return re.search(r"\*.*\*", response)
+
+  def strip_changes(self, response):
+    """Strips off the changes."""
+    return re.sub(r"\*.*\*", "", response)
+
+
+class KeywordChecker(Instruction):
+  """Check the exisitence of certain keywords."""
+
+  def build_description(self, *, keywords = None
+                        ):
+    """Build the instruction description.
+
+    Args:
+      keywords: A sequence of strings representing the keywords that are
+        expected in the response.
+
+    Returns:
+      A string representing the instruction description.
+    """
+
+    if not keywords:
+      self._keywords = instructions_util.generate_keywords(
+          num_keywords=_NUM_KEYWORDS)
+    else:
+      self._keywords = keywords
+    self._keywords = sorted(self._keywords)
+
+    self._description_pattern = ("Include keywords {keywords} in the response.")
+
+    return self._description_pattern.format(keywords=self._keywords)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"keywords": self._keywords}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["keywords"]
+
+  def check_following(self, value):
+    """Check if the response contain the expected keywords."""
+    for keyword in self._keywords:
+      if not re.search(keyword, value, flags=re.IGNORECASE):
+        return False
+    return True
+
+
+class KeywordFrequencyChecker(Instruction):
+  """Check the keyword frequency."""
+
+  def build_description(self, *, keyword = None,
+                        frequency = None,
+                        relation = None):
+    """Build the instruction description.
+
+    Args:
+      keyword: A string representing a keyword that is expected in the response.
+      frequency: An integer specifying the number of times `keyword` is expected
+        to appear in the response.
+      relation: A string in (`less than`, `at least`), defining the relational
+        operator for comparison.
+        Two relational comparisons are supported for now:
+        if 'less than', the actual number of occurrences < frequency;
+        if 'at least', the actual number of occurrences >= frequency.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    if not keyword:
+      self._keyword = instructions_util.generate_keywords(num_keywords=1)[0]
+    else:
+      self._keyword = keyword.strip()
+
+    self._frequency = frequency
+    if self._frequency is None or self._frequency < 0:
+      self._frequency = random.randint(1, _KEYWORD_FREQUENCY)
+
+    if relation is None:
+      self._comparison_relation = random.choice(_COMPARISON_RELATION)
+    elif relation not in _COMPARISON_RELATION:
+      raise ValueError("The supported relation for comparison must be in "
+                       f"{_COMPARISON_RELATION}, but {relation} is given.")
+    else:
+      self._comparison_relation = relation
+
+    self._description_pattern = (
+        "In your response, the word {keyword} should appear {relation} " +
+        "{frequency} times.")
+
+    return self._description_pattern.format(
+        keyword=self._keyword,
+        relation=self._comparison_relation,
+        frequency=self._frequency)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"keyword": self._keyword,
+            "frequency": self._frequency,
+            "relation": self._comparison_relation}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["keyword", "frequency", "relation"]
+
+  def check_following(self, value):
+    """Checks if the response contain the keyword with required frequency."""
+    actual_occurrences = len(re.findall(
+        self._keyword, value, flags=re.IGNORECASE))
+
+    if self._comparison_relation == _COMPARISON_RELATION[0]:
+      return actual_occurrences < self._frequency
+    elif self._comparison_relation == _COMPARISON_RELATION[1]:
+      return actual_occurrences >= self._frequency  # pytype: disable=bad-return-type
+
+
+class NumberOfWords(Instruction):
+  """Checks the number of words."""
+
+  def build_description(self, *, num_words = None,
+                        relation = None):
+    """Build the instruction description.
+
+    Args:
+      num_words: An integer specifying the number of words contained in the
+        response.
+      relation: A string in (`less than`, `at least`), defining the relational
+        operator for comparison.
+        Two relational comparisons are supported for now:
+        if 'less than', the actual number of words < num_words;
+        if 'at least', the actual number of words >= num_words.
+
+    Returns:
+      A string representing the instruction description.
+    """
+
+    self._num_words = num_words
+    if self._num_words is None or self._num_words < 0:
+      self._num_words = random.randint(
+          _NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT
+      )
+
+    if relation is None:
+      self._comparison_relation = random.choice(_COMPARISON_RELATION)
+    elif relation not in _COMPARISON_RELATION:
+      raise ValueError("The supported relation for comparison must be in "
+                       f"{_COMPARISON_RELATION}, but {relation} is given.")
+    else:
+      self._comparison_relation = relation
+
+    self._description_pattern = (
+        "Answer with {relation} {num_words} words.")
+
+    return self._description_pattern.format(
+        relation=self._comparison_relation,
+        num_words=self._num_words)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"num_words": self._num_words,
+            "relation": self._comparison_relation}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["num_words", "relation"]
+
+  def check_following(self, value):
+    """Checks if the response contains the expected number of words."""
+    num_words = instructions_util.count_words(value)
+
+    if self._comparison_relation == _COMPARISON_RELATION[0]:
+      return num_words < self._num_words
+    elif self._comparison_relation == _COMPARISON_RELATION[1]:
+      return num_words >= self._num_words  # pytype: disable=bad-return-type
+
+
+class JsonFormat(Instruction):
+  """Check the Json format."""
+
+  def build_description(self):
+    self._description_pattern = (
+        "Entire output should be wrapped in JSON format. You can use markdown"
+        " ticks such as ```."
+    )
+    return self._description_pattern
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return None
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return []
+
+  def check_following(self, value):
+    value = (
+        value.strip()
+        .removeprefix("```json")
+        .removeprefix("```Json")
+        .removeprefix("```JSON")
+        .removeprefix("```")
+        .removesuffix("```")
+        .strip()
+    )
+    try:
+      json.loads(value)
+    except ValueError as _:
+      return False
+    return True
+
+
+class ParagraphFirstWordCheck(Instruction):
+  """Check the paragraph and the first word of the nth paragraph."""
+
+  def build_description(self, num_paragraphs = None,
+                        nth_paragraph = None,
+                        first_word = None):
+    r"""Build the instruction description.
+
+    Args:
+      num_paragraphs: An integer indicating the number of paragraphs expected
+        in the response. A paragraph is a subset of the string that is
+        expected to be separated by '\n\n'.
+      nth_paragraph: An integer indicating the paragraph number that we look at.
+        Note that n starts from 1.
+      first_word: A string that represent the first word of the bth paragraph.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    self._num_paragraphs = num_paragraphs
+    if self._num_paragraphs is None or self._num_paragraphs < 0:
+      self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
+
+    self._nth_paragraph = nth_paragraph
+    if (
+        self._nth_paragraph is None
+        or self._nth_paragraph <= 0
+        or self._nth_paragraph > self._num_paragraphs
+    ):
+      self._nth_paragraph = random.randint(1, self._num_paragraphs + 1)
+
+    self._first_word = first_word
+    if self._first_word is None:
+      self._first_word = instructions_util.generate_keywords(num_keywords=1)[0]
+    self._first_word = self._first_word.lower()
+
+    self._description_pattern = (
+        "There should be {num_paragraphs} paragraphs. " +
+        "Paragraphs and only paragraphs are separated with each other by two " +
+        "new lines as if it was '\\n\\n' in python. " +
+        "Paragraph {nth_paragraph} must start with word {first_word}.")
+
+    return self._description_pattern.format(
+        num_paragraphs=self._num_paragraphs,
+        nth_paragraph=self._nth_paragraph,
+        first_word=self._first_word)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"num_paragraphs": self._num_paragraphs,
+            "nth_paragraph": self._nth_paragraph,
+            "first_word": self._first_word}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["num_paragraphs", "nth_paragraph", "first_word"]
+
+  def check_following(self, value):
+    """Checks for required number of paragraphs and correct first word.
+
+    Args:
+      value: a string representing the response. The response may contain
+        paragraphs that are separated by two new lines and the first word of
+        the nth paragraph will have to match a specified word.
+
+    Returns:
+      True if the number of paragraphs is the same as required and the first
+      word of the specified paragraph is the same as required. Otherwise, false.
+    """
+
+    paragraphs = re.split(r"\n\n", value)
+    num_paragraphs = len(paragraphs)
+
+    for paragraph in paragraphs:
+      if not paragraph.strip():
+        num_paragraphs -= 1
+
+    # check that index doesn't go out of bounds
+    if self._nth_paragraph <= num_paragraphs:
+      paragraph = paragraphs[self._nth_paragraph - 1].strip()
+      if not paragraph:
+        return False
+    else:
+      return False
+
+    first_word = ""
+    punctuation = {".", ",", "?", "!", "'", '"'}
+
+    # get first word and remove punctuation
+    word = paragraph.split()[0].strip()
+    # TODO(jeffrey): make more complex?
+    word = word.lstrip("'")
+    word = word.lstrip('"')
+
+    for letter in word:
+      if letter in punctuation:
+        break
+      first_word += letter.lower()
+
+    return (
+        num_paragraphs == self._num_paragraphs
+        and first_word == self._first_word
+    )
+
+
+# TODO(jeffrey) add relation - at least/at most?
+class KeySentenceChecker(Instruction):
+  """Check the existence of certain key sentences."""
+
+  def build_description(self, key_sentences = None,
+                        num_sentences = None):
+    """Build the instruction description.
+
+    Args:
+      key_sentences: A sequences of strings representing the key sentences that
+        are expected in the response.
+      num_sentences: The number of key sentences that are expected to be seen in
+        the response.
+
+    Returns:
+      A string representing the instruction description.
+    """
+
+    if not key_sentences:
+      # TODO(jeffrey) make a generate sentences function? wonderwords package
+      self._key_sentences = set(["For now, this is fine."])
+    else:
+      self._key_sentences = key_sentences
+
+    if not num_sentences:
+      self._num_sentences = random.randint(1, len(self._key_sentences))
+    else:
+      self._num_sentences = num_sentences
+
+    self._description_pattern = (
+        "Include {num_sentences} of the following sentences {key_sentences}"
+    )
+
+    return self._description_pattern.format(
+        num_sentences=self._num_sentences, key_sentences=self._key_sentences
+    )
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"num_sentences": self._num_sentences,
+            "key_sentences": list(self._key_sentences)}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["num_sentences", "key_sentences"]
+
+  def check_following(self, value):
+    """Checks if the response contains the expected key sentences."""
+    count = 0
+    sentences = instructions_util.split_into_sentences(value)
+    for sentence in self._key_sentences:
+      if sentence in sentences:
+        count += 1
+
+    return count == self._num_sentences
+
+
+class ForbiddenWords(Instruction):
+  """Checks that specified words are not used in response."""
+
+  def build_description(self, forbidden_words = None
+                        ):
+    """Build the instruction description.
+
+    Args:
+      forbidden_words: A sequences of strings respresenting words that are not
+        allowed in the response.
+
+    Returns:
+      A string representing the instruction description.
+    """
+
+    if not forbidden_words:
+      self._forbidden_words = instructions_util.generate_keywords(
+          num_keywords=_NUM_KEYWORDS)
+    else:
+      self._forbidden_words = list(set(forbidden_words))
+    self._forbidden_words = sorted(self._forbidden_words)
+    self._description_pattern = (
+        "Do not include keywords {forbidden_words} in the response."
+    )
+
+    return self._description_pattern.format(
+        forbidden_words=self._forbidden_words
+    )
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"forbidden_words": self._forbidden_words}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["forbidden_words"]
+
+  def check_following(self, value):
+    """Check if the response does not contain the expected keywords."""
+    for word in self._forbidden_words:
+      if re.search(r"\b" + word + r"\b", value, flags=re.IGNORECASE):
+        return False
+    return True
+
+
+class RephraseParagraph(Instruction):
+  """Checks that the paragraph is rephrased."""
+
+  def build_description(self, *, original_paragraph, low, high
+                        ):
+    """Builds the instruction description.
+
+    Args:
+      original_paragraph: A string presenting the original paragraph. The
+        rephrases response should have betweeb low-high words in common.
+      low: An integer presenting the lower bound of similar words.
+      high: An integer representing the upper bound of similar words.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    # TODO(jeffrey) make more encompassing
+    self._original_paragraph = original_paragraph
+    self._low = low
+    self._high = high
+
+    self._description = ("Rephrase the following paragraph: " +
+                         "{original_paragraph}\nYour response should have " +
+                         "between {low} and {high} of the same words. " +
+                         "Words are the same if and only if all of the " +
+                         "letters, ignoring cases, are the same. For " +
+                         "example, 'run' is the same as 'Run' but different " +
+                         "to 'ran'.")
+
+    return self._description.format(original_paragraph=original_paragraph,
+                                    low=self._low, high=self._high)
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return {"original_paragraph": self._original_paragraph,
+            "low": self._low,
+            "high": self._high}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["original_paragraph", "low", "high"]
+
+  def check_following(self, value):
+    val_words = re.findall(r"\w+", value.lower())
+    original_words = re.findall(r"\w+", self._original_paragraph.lower())
+    similar_words = 0
+
+    dict_val = collections.Counter(val_words)
+    dict_original = collections.Counter(original_words)
+
+    for word in dict_original:
+      similar_words += min(dict_original[word], dict_val[word])
+
+    return similar_words >= self._low and similar_words <= self._high
+
+
+class TwoResponsesChecker(Instruction):
+  """Check that two responses were given."""
+
+  def build_description(self):
+    """Build the instruction description."""
+    self._description_pattern = (
+        "Give two different responses. Responses and only responses should"
+        " be separated by 6 asterisk symbols: ******."
+    )
+    return self._description_pattern
+
+  def get_instruction_args(self):
+    """Returns the keyward args of `build_description`."""
+    return None
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return []
+
+  def check_following(self, value):
+    """Checks if the response has two different answers.
+
+    Args:
+      value: A string representing the response.
+
+    Returns:
+      True if two responses are detected and false otherwise.
+    """
+    valid_responses = list()
+    responses = value.split("******")
+    for index, response in enumerate(responses):
+      if not response.strip():
+        if index != 0 and index != len(responses) - 1:
+          return False
+      else:
+        valid_responses.append(response)
+    return (
+        len(valid_responses) == 2
+        and valid_responses[0].strip() != valid_responses[1].strip()
+    )
+
+
+class RepeatPromptThenAnswer(Instruction):
+  """Checks that Prompt is first repeated then answered."""
+
+  def build_description(self, *, prompt_to_repeat = None):
+    """Build the instruction description.
+
+    Args:
+      prompt_to_repeat: The prompt that is meant to be repeated.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    if not prompt_to_repeat:
+      raise ValueError("prompt_to_repeat must be set.")
+    else:
+      self._prompt_to_repeat = prompt_to_repeat
+    self._description_pattern = (
+        "First repeat the request word for word without change,"
+        " then give your answer (1. do not say any words or characters"
+        " before repeating the request; 2. the request you need to repeat"
+        " does not include this sentence)"
+    )
+    return self._description_pattern
+
+  def get_instruction_args(self):
+    return {"prompt_to_repeat": self._prompt_to_repeat}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["prompt_to_repeat"]
+
+  def check_following(self, value):
+    if value.strip().lower().startswith(self._prompt_to_repeat.strip().lower()):
+      return True
+    return False
+
+
+class EndChecker(Instruction):
+  """Checks that the prompt ends with a given phrase."""
+
+  def build_description(self, *, end_phrase = None):
+    """Build the instruction description.
+
+    Args:
+      end_phrase: A string representing the phrase the response should end with.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    self._end_phrase = (
+        end_phrase.strip() if isinstance(end_phrase, str) else end_phrase
+    )
+    if self._end_phrase is None:
+      self._end_phrase = random.choice(_ENDING_OPTIONS)
+    self._description_pattern = (
+        "Finish your response with this exact phrase {ender}. "
+        "No other words should follow this phrase.")
+    return self._description_pattern.format(ender=self._end_phrase)
+
+  def get_instruction_args(self):
+    return {"end_phrase": self._end_phrase}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["end_phrase"]
+
+  def check_following(self, value):
+    """Checks if the response ends with the expected phrase."""
+    value = value.strip().strip("\"").lower()
+    self._end_phrase = self._end_phrase.strip().lower()
+    return value.endswith(self._end_phrase)
+
+
+class TitleChecker(Instruction):
+  """Checks the response for a title."""
+
+  def build_description(self):
+    """Build the instruction description."""
+    self._description_pattern = (
+        "Your answer must contain a title, wrapped in double angular brackets,"
+        " such as <<poem of joy>>."
+    )
+    return self._description_pattern
+
+  def get_instruction_args(self):
+    return None
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return []
+
+  def check_following(self, value):
+    """Checks if the response contains a title."""
+    pattern = r"<<[^\n]+>>"
+    re_pattern = re.compile(pattern)
+    titles = re.findall(re_pattern, value)
+
+    for title in titles:
+      if title.lstrip("<").rstrip(">").strip():
+        return True
+    return False
+
+
+class LetterFrequencyChecker(Instruction):
+  """Checks letter frequency."""
+
+  def build_description(self, *, letter = None,
+                        let_frequency = None,
+                        let_relation = None):
+    """Build the instruction description.
+
+    Args:
+      letter: A string representing a letter that is expected in the response.
+      let_frequency: An integer specifying the number of times `keyword` is
+        expected to appear in the response.
+      let_relation: A string in (`less than`, `at least`), defining the
+        relational operator for comparison. Two relational comparisons are
+        supported for now; if 'less than', the actual number of
+        occurrences < frequency; if 'at least', the actual number of
+        occurrences >= frequency.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    if (
+        not letter
+        or len(letter) > 1
+        or ord(letter.lower()) < 97
+        or ord(letter.lower()) > 122
+    ):
+      self._letter = random.choice(list(string.ascii_letters))
+    else:
+      self._letter = letter.strip()
+    self._letter = self._letter.lower()
+
+    self._frequency = let_frequency
+    if self._frequency is None or self._frequency < 0:
+      self._frequency = random.randint(1, _LETTER_FREQUENCY)
+
+    if let_relation is None:
+      self._comparison_relation = random.choice(_COMPARISON_RELATION)
+    elif let_relation not in _COMPARISON_RELATION:
+      raise ValueError(
+          "The supported relation for comparison must be in "
+          f"{_COMPARISON_RELATION}, but {let_relation} is given."
+      )
+    else:
+      self._comparison_relation = let_relation
+
+    self._description_pattern = (
+        "In your response, the letter {letter} should appear {let_relation}"
+        " {let_frequency} times."
+    )
+
+    return self._description_pattern.format(
+        letter=self._letter,
+        let_frequency=self._frequency,
+        let_relation=self._comparison_relation,
+    )
+
+  def get_instruction_args(self):
+    """Returns the keyword args of build description."""
+    return {"letter": self._letter,
+            "let_frequency": self._frequency,
+            "let_relation": self._comparison_relation}
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["letter", "let_frequency", "let_relation"]
+
+  def check_following(self, value):
+    """Checks that the response contains the letter at the right frequency."""
+    value = value.lower()
+    letters = collections.Counter(value)
+
+    if self._comparison_relation == _COMPARISON_RELATION[0]:
+      return letters[self._letter] < self._frequency
+    else:
+      return letters[self._letter] >= self._frequency
+
+
+class CapitalLettersEnglishChecker(Instruction):
+  """Checks that the response is in english and is in all capital letters."""
+
+  def build_description(self):
+    """Build the instruction description."""
+    self._description_pattern = (
+        "Your entire response should be in English, and in all capital letters."
+    )
+    return self._description_pattern
+
+  def get_instruction_args(self):
+    return None
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return []
+
+  def check_following(self, value):
+    """Checks that the response is in English and in all capital letters."""
+    assert isinstance(value, str)
+
+    try:
+      return value.isupper() and langdetect.detect(value) == "en"
+    except langdetect.LangDetectException as e:
+      # Count as instruction is followed.
+      logging.error(
+          "Unable to detect language for text %s due to %s", value, e
+      )  # refex: disable=pytotw.037
+      return True
+
+
+class LowercaseLettersEnglishChecker(Instruction):
+  """Checks that the response is in english and is in all lowercase letters."""
+
+  def build_description(self):
+    """Build the instruction description."""
+    self._description_pattern = (
+        "Your entire response should be in English, and in all lowercase"
+        " letters. No capital letters are allowed."
+    )
+    return self._description_pattern
+
+  def get_instruction_args(self):
+    return None
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return []
+
+  def check_following(self, value):
+    """Checks that the response is in English and in all lowercase letters."""
+    assert isinstance(value, str)
+
+    try:
+      return value.islower() and langdetect.detect(value) == "en"
+    except langdetect.LangDetectException as e:
+      # Count as instruction is followed.
+      logging.error(
+          "Unable to detect language for text %s due to %s", value, e
+      )  # refex: disable=pytotw.037
+      return True
+
+
+class CommaChecker(Instruction):
+  """Checks the response for no commas."""
+
+  def build_description(self):
+    """Build the instruction description."""
+    self._description_pattern = (
+        "In your entire response, refrain from the use of any commas."
+    )
+    return self._description_pattern
+
+  def get_instruction_args(self):
+    return None
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return []
+
+  def check_following(self, value):
+    """Checks that the response does not contain commas."""
+    return not re.search(r"\,", value)
+
+
+class CapitalWordFrequencyChecker(Instruction):
+  """Checks frequency of words with all capital letters."""
+
+  def build_description(
+      self,
+      capital_frequency = None,
+      capital_relation = None,
+  ):
+    """Build the instruction description.
+
+    Args:
+      capital_frequency: An integer that represents the number of words that
+        should be in all capital letters.
+      capital_relation: A string that is 'at least' or 'at most' that refers to
+        the frequency.
+
+    Returns:
+      A string representing the instruction description.
+    """
+    self._frequency = capital_frequency
+    if self._frequency is None:
+      self._frequency = random.randint(1, _ALL_CAPITAL_WORD_FREQUENCY)
+
+    self._comparison_relation = capital_relation
+    if capital_relation is None:
+      self._comparison_relation = random.choice(_COMPARISON_RELATION)
+    elif capital_relation not in _COMPARISON_RELATION:
+      raise ValueError(
+          "The supported relation for comparison must be in "
+          f"{_COMPARISON_RELATION}, but {capital_relation} is given."
+      )
+
+    self._description_pattern = (
+        "In your response, words with all capital letters should appear"
+        " {relation} {frequency} times."
+    )
+
+    return self._description_pattern.format(
+        frequency=self._frequency, relation=self._comparison_relation
+    )
+
+  def get_instruction_args(self):
+    """Returns the keyword args of build description."""
+    return {
+        "capital_frequency": self._frequency,
+        "capital_relation": self._comparison_relation,
+    }
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return ["capital_frequency", "capital_relation"]
+
+  def check_following(self, value):
+    """Checks the frequency of words with all capital letters."""
+    # Hyphenated words will count as one word
+    words = instructions_util.nltk.word_tokenize(value)
+    capital_words = [word for word in words if word.isupper()]
+
+    capital_words = len(capital_words)
+
+    if self._comparison_relation == _COMPARISON_RELATION[0]:
+      return capital_words < self._frequency
+    else:
+      return capital_words >= self._frequency
+
+
+class QuotationChecker(Instruction):
+  """Checks response is wrapped with double quotation marks."""
+
+  def build_description(self):
+    """Build the instruction description."""
+    self._description_pattern = (
+        "Wrap your entire response with double quotation marks."
+    )
+    return self._description_pattern
+
+  def get_instruction_args(self):
+    """Returns the keyword args of build description."""
+    return None
+
+  def get_instruction_args_keys(self):
+    """Returns the args keys of `build_description`."""
+    return []
+
+  def check_following(self, value):
+    """Checks if the response is wrapped with double quotation marks."""
+    value = value.strip()
+    return len(value) > 1 and value[0] == '"' and value[-1] == '"'
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/instructions_registry.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/instructions_registry.py
new file mode 100644
index 000000000..3aa1c0986
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/instructions_registry.py
@@ -0,0 +1,176 @@
+# coding=utf-8
+# Copyright 2024 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Registry of all instructions."""
+from . import instructions  
+
+_KEYWORD = "keywords:"
+
+_LANGUAGE = "language:"
+
+_LENGTH = "length_constraints:"
+
+_CONTENT = "detectable_content:"
+
+_FORMAT = "detectable_format:"
+
+_MULTITURN = "multi-turn:"
+
+_COMBINATION = "combination:"
+
+_STARTEND = "startend:"
+
+_CHANGE_CASES = "change_case:"
+
+_PUNCTUATION = "punctuation:"
+
+INSTRUCTION_DICT = {
+    _KEYWORD + "existence": instructions.KeywordChecker,
+    _KEYWORD + "frequency": instructions.KeywordFrequencyChecker,
+    # TODO(jeffreyzhou): make a proper set of sentences to choose from
+    # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
+    _KEYWORD + "forbidden_words": instructions.ForbiddenWords,
+    _KEYWORD + "letter_frequency": instructions.LetterFrequencyChecker,
+    _LANGUAGE + "response_language": instructions.ResponseLanguageChecker,
+    _LENGTH + "number_sentences": instructions.NumberOfSentences,
+    _LENGTH + "number_paragraphs": instructions.ParagraphChecker,
+    _LENGTH + "number_words": instructions.NumberOfWords,
+    _LENGTH + "nth_paragraph_first_word": instructions.ParagraphFirstWordCheck,
+    _CONTENT + "number_placeholders": instructions.PlaceholderChecker,
+    _CONTENT + "postscript": instructions.PostscriptChecker,
+    _FORMAT + "number_bullet_lists": instructions.BulletListChecker,
+    # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace
+    # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
+    _FORMAT + "constrained_response": instructions.ConstrainedResponseChecker,
+    _FORMAT + "number_highlighted_sections": (
+        instructions.HighlightSectionChecker),
+    _FORMAT + "multiple_sections": instructions.SectionChecker,
+    # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
+    # _FORMAT + "rephrase": instructions.RephraseChecker,
+    _FORMAT + "json_format": instructions.JsonFormat,
+    _FORMAT + "title": instructions.TitleChecker,
+    # TODO(tianjianlu): Re-enable with specific prompts.
+    # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
+    _COMBINATION + "two_responses": instructions.TwoResponsesChecker,
+    _COMBINATION + "repeat_prompt": instructions.RepeatPromptThenAnswer,
+    _STARTEND + "end_checker": instructions.EndChecker,
+    _CHANGE_CASES
+    + "capital_word_frequency": instructions.CapitalWordFrequencyChecker,
+    _CHANGE_CASES
+    + "english_capital": instructions.CapitalLettersEnglishChecker,
+    _CHANGE_CASES
+    + "english_lowercase": instructions.LowercaseLettersEnglishChecker,
+    _PUNCTUATION + "no_comma": instructions.CommaChecker,
+    _STARTEND + "quotation": instructions.QuotationChecker,
+}
+
+INSTRUCTION_CONFLICTS = {
+    _KEYWORD + "existence": {_KEYWORD + "existence"},
+    _KEYWORD + "frequency": {_KEYWORD + "frequency"},
+    # TODO(jeffreyzhou): make a proper set of sentences to choose from
+    # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
+    _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"},
+    _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"},
+    _LANGUAGE
+    + "response_language": {
+        _LANGUAGE + "response_language",
+        _FORMAT + "multiple_sections",
+        _KEYWORD + "existence",
+        _KEYWORD + "frequency",
+        _KEYWORD + "forbidden_words",
+        _STARTEND + "end_checker",
+        _CHANGE_CASES + "english_capital",
+        _CHANGE_CASES + "english_lowercase",
+    },
+    _LENGTH + "number_sentences": {_LENGTH + "number_sentences"},
+    _LENGTH + "number_paragraphs": {
+        _LENGTH + "number_paragraphs",
+        _LENGTH + "nth_paragraph_first_word",
+        _LENGTH + "number_sentences",
+        _LENGTH + "nth_paragraph_first_word",
+    },
+    _LENGTH + "number_words": {_LENGTH + "number_words"},
+    _LENGTH + "nth_paragraph_first_word": {
+        _LENGTH + "nth_paragraph_first_word",
+        _LENGTH + "number_paragraphs",
+    },
+    _CONTENT + "number_placeholders": {_CONTENT + "number_placeholders"},
+    _CONTENT + "postscript": {_CONTENT + "postscript"},
+    _FORMAT + "number_bullet_lists": {_FORMAT + "number_bullet_lists"},
+    # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace
+    # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
+    _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()),
+    _FORMAT
+    + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"},
+    _FORMAT
+    + "multiple_sections": {
+        _FORMAT + "multiple_sections",
+        _LANGUAGE + "response_language",
+        _FORMAT + "number_highlighted_sections",
+    },
+    # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
+    # _FORMAT + "rephrase": instructions.RephraseChecker,
+    _FORMAT
+    + "json_format": set(INSTRUCTION_DICT.keys()).difference(
+        {_KEYWORD + "forbidden_words", _KEYWORD + "existence"}
+    ),
+    _FORMAT + "title": {_FORMAT + "title"},
+    # TODO(tianjianlu): Re-enable with specific prompts.
+    # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
+    _COMBINATION
+    + "two_responses": set(INSTRUCTION_DICT.keys()).difference({
+        _KEYWORD + "forbidden_words",
+        _KEYWORD + "existence",
+        _LANGUAGE + "response_language",
+        _FORMAT + "title",
+        _PUNCTUATION + "no_comma"
+    }),
+    _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference({
+        _KEYWORD + "existence",
+        _FORMAT + "title",
+        _PUNCTUATION + "no_comma"
+    }),
+    _STARTEND + "end_checker": {_STARTEND + "end_checker"},
+    _CHANGE_CASES + "capital_word_frequency": {
+        _CHANGE_CASES + "capital_word_frequency",
+        _CHANGE_CASES + "english_lowercase",
+        _CHANGE_CASES + "english_capital",
+    },
+    _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"},
+    _CHANGE_CASES + "english_lowercase": {
+        _CHANGE_CASES + "english_lowercase",
+        _CHANGE_CASES + "english_capital",
+    },
+    _PUNCTUATION + "no_comma": {_PUNCTUATION + "no_comma"},
+    _STARTEND + "quotation": {_STARTEND + "quotation", _FORMAT + "title"},
+}
+
+
+def conflict_make(conflicts):
+  """Makes sure if A conflicts with B, B will conflict with A.
+
+  Args:
+    conflicts: Dictionary of potential conflicts where key is instruction id
+      and value is set of instruction ids that it conflicts with.
+
+  Returns:
+    Revised version of the dictionary. All instructions conflict with
+    themselves. If A conflicts with B, B will conflict with A.
+  """
+  for key in conflicts:
+    for k in conflicts[key]:
+      conflicts[k].add(key)
+    conflicts[key].add(key)
+  return conflicts
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/instructions_util.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/instructions_util.py
new file mode 100644
index 000000000..1f0dc0eaa
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/ifeval/instructions_util.py
@@ -0,0 +1,147 @@
+# coding=utf-8
+# Copyright 2024 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utility library of instructions."""
+
+import functools
+import random
+import re
+from typing import List
+
+import immutabledict
+import nltk
+
+WORD_LIST = ["western", "sentence", "signal", "dump", "spot", "opposite", "bottom", "potato", "administration", "working", "welcome", "morning", "good", "agency", "primary", "wish", "responsibility", "press", "problem", "president", "steal", "brush", "read", "type", "beat", "trainer", "growth", "lock", "bone", "case", "equal", "comfortable", "region", "replacement", "performance", "mate", "walk", "medicine", "film", "thing", "rock", "tap", "total", "competition", "ease", "south", "establishment", "gather", "parking", "world", "plenty", "breath", "claim", "alcohol", "trade", "dear", "highlight", "street", "matter", "decision", "mess", "agreement", "studio", "coach", "assist", "brain", "wing", "style", "private", "top", "brown", "leg", "buy", "procedure", "method", "speed", "high", "company", "valuable", "pie", "analyst", "session", "pattern", "district", "pleasure", "dinner", "swimming", "joke", "order", "plate", "department", "motor", "cell", "spend", "cabinet", "difference", "power", "examination", "engine", "horse", "dimension", "pay", "toe", "curve", "literature", "bother", "fire", "possibility", "debate", "activity", "passage", "hello", "cycle", "background", "quiet", "author", "effect", "actor", "page", "bicycle", "error", "throat", "attack", "character", "phone", "tea", "increase", "outcome", "file", "specific", "inspector", "internal", "potential", "staff", "building", "employer", "shoe", "hand", "direction", "garden", "purchase", "interview", "study", "recognition", "member", "spiritual", "oven", "sandwich", "weird", "passenger", "particular", "response", "reaction", "size", "variation", "a", "cancel", "candy", "exit", "guest", "condition", "fly", "price", "weakness", "convert", "hotel", "great", "mouth", "mind", "song", "sugar", "suspect", "telephone", "ear", "roof", "paint", "refrigerator", "organization", "jury", "reward", "engineering", "day", "possession", "crew", "bar", "road", "description", "celebration", "score", "mark", "letter", "shower", "suggestion", "sir", "luck", "national", "progress", "hall", "stroke", "theory", "offer", "story", "tax", "definition", "history", "ride", "medium", "opening", "glass", "elevator", "stomach", "question", "ability", "leading", "village", "computer", "city", "grand", "confidence", "candle", "priest", "recommendation", "point", "necessary", "body", "desk", "secret", "horror", "noise", "culture", "warning", "water", "round", "diet", "flower", "bus", "tough", "permission", "week", "prompt", "connection", "abuse", "height", "save", "corner", "border", "stress", "drive", "stop", "rip", "meal", "listen", "confusion", "girlfriend", "living", "relation", "significance", "plan", "creative", "atmosphere", "blame", "invite", "housing", "paper", "drink", "roll", "silver", "drunk", "age", "damage", "smoke", "environment", "pack", "savings", "influence", "tourist", "rain", "post", "sign", "grandmother", "run", "profit", "push", "clerk", "final", "wine", "swim", "pause", "stuff", "singer", "funeral", "average", "source", "scene", "tradition", "personal", "snow", "nobody", "distance", "sort", "sensitive", "animal", "major", "negotiation", "click", "mood", "period", "arrival", "expression", "holiday", "repeat", "dust", "closet", "gold", "bad", "sail", "combination", "clothes", "emphasis", "duty", "black", "step", "school", "jump", "document", "professional", "lip", "chemical", "front", "wake", "while", "inside", "watch", "row", "subject", "penalty", "balance", "possible", "adult", "aside", "sample", "appeal", "wedding", "depth", "king", "award", "wife", "blow", "site", "camp", "music", "safe", "gift", "fault", "guess", "act", "shame", "drama", "capital", "exam", "stupid", "record", "sound", "swing", "novel", "minimum", "ratio", "machine", "shape", "lead", "operation", "salary", "cloud", "affair", "hit", "chapter", "stage", "quantity", "access", "army", "chain", "traffic", "kick", "analysis", "airport", "time", "vacation", "philosophy", "ball", "chest", "thanks", "place", "mountain", "advertising", "red", "past", "rent", "return", "tour", "house", "construction", "net", "native", "war", "figure", "fee", "spray", "user", "dirt", "shot", "task", "stick", "friend", "software", "promotion", "interaction", "surround", "block", "purpose", "practice", "conflict", "routine", "requirement", "bonus", "hole", "state", "junior", "sweet", "catch", "tear", "fold", "wall", "editor", "life", "position", "pound", "respect", "bathroom", "coat", "script", "job", "teach", "birth", "view", "resolve", "theme", "employee", "doubt", "market", "education", "serve", "recover", "tone", "harm", "miss", "union", "understanding", "cow", "river", "association", "concept", "training", "recipe", "relationship", "reserve", "depression", "proof", "hair", "revenue", "independent", "lift", "assignment", "temporary", "amount", "loss", "edge", "track", "check", "rope", "estimate", "pollution", "stable", "message", "delivery", "perspective", "mirror", "assistant", "representative", "witness", "nature", "judge", "fruit", "tip", "devil", "town", "emergency", "upper", "drop", "stay", "human", "neck", "speaker", "network", "sing", "resist", "league", "trip", "signature", "lawyer", "importance", "gas", "choice", "engineer", "success", "part", "external", "worker", "simple", "quarter", "student", "heart", "pass", "spite", "shift", "rough", "lady", "grass", "community", "garage", "youth", "standard", "skirt", "promise", "blind", "television", "disease", "commission", "positive", "energy", "calm", "presence", "tune", "basis", "preference", "head", "common", "cut", "somewhere", "presentation", "current", "thought", "revolution", "effort", "master", "implement", "republic", "floor", "principle", "stranger", "shoulder", "grade", "button", "tennis", "police", "collection", "account", "register", "glove", "divide", "professor", "chair", "priority", "combine", "peace", "extension", "maybe", "evening", "frame", "sister", "wave", "code", "application", "mouse", "match", "counter", "bottle", "half", "cheek", "resolution", "back", "knowledge", "make", "discussion", "screw", "length", "accident", "battle", "dress", "knee", "log", "package", "it", "turn", "hearing", "newspaper", "layer", "wealth", "profile", "imagination", "answer", "weekend", "teacher", "appearance", "meet", "bike", "rise", "belt", "crash", "bowl", "equivalent", "support", "image", "poem", "risk", "excitement", "remote", "secretary", "public", "produce", "plane", "display", "money", "sand", "situation", "punch", "customer", "title", "shake", "mortgage", "option", "number", "pop", "window", "extent", "nothing", "experience", "opinion", "departure", "dance", "indication", "boy", "material", "band", "leader", "sun", "beautiful", "muscle", "farmer", "variety", "fat", "handle", "director", "opportunity", "calendar", "outside", "pace", "bath", "fish", "consequence", "put", "owner", "go", "doctor", "information", "share", "hurt", "protection", "career", "finance", "force", "golf", "garbage", "aspect", "kid", "food", "boot", "milk", "respond", "objective", "reality", "raw", "ring", "mall", "one", "impact", "area", "news", "international", "series", "impress", "mother", "shelter", "strike", "loan", "month", "seat", "anything", "entertainment", "familiar", "clue", "year", "glad", "supermarket", "natural", "god", "cost", "conversation", "tie", "ruin", "comfort", "earth", "storm", "percentage", "assistance", "budget", "strength", "beginning", "sleep", "other", "young", "unit", "fill", "store", "desire", "hide", "value", "cup", "maintenance", "nurse", "function", "tower", "role", "class", "camera", "database", "panic", "nation", "basket", "ice", "art", "spirit", "chart", "exchange", "feedback", "statement", "reputation", "search", "hunt", "exercise", "nasty", "notice", "male", "yard", "annual", "collar", "date", "platform", "plant", "fortune", "passion", "friendship", "spread", "cancer", "ticket", "attitude", "island", "active", "object", "service", "buyer", "bite", "card", "face", "steak", "proposal", "patient", "heat", "rule", "resident", "broad", "politics", "west", "knife", "expert", "girl", "design", "salt", "baseball", "grab", "inspection", "cousin", "couple", "magazine", "cook", "dependent", "security", "chicken", "version", "currency", "ladder", "scheme", "kitchen", "employment", "local", "attention", "manager", "fact", "cover", "sad", "guard", "relative", "county", "rate", "lunch", "program", "initiative", "gear", "bridge", "breast", "talk", "dish", "guarantee", "beer", "vehicle", "reception", "woman", "substance", "copy", "lecture", "advantage", "park", "cold", "death", "mix", "hold", "scale", "tomorrow", "blood", "request", "green", "cookie", "church", "strip", "forever", "beyond", "debt", "tackle", "wash", "following", "feel", "maximum", "sector", "sea", "property", "economics", "menu", "bench", "try", "language", "start", "call", "solid", "address", "income", "foot", "senior", "honey", "few", "mixture", "cash", "grocery", "link", "map", "form", "factor", "pot", "model", "writer", "farm", "winter", "skill", "anywhere", "birthday", "policy", "release", "husband", "lab", "hurry", "mail", "equipment", "sink", "pair", "driver", "consideration", "leather", "skin", "blue", "boat", "sale", "brick", "two", "feed", "square", "dot", "rush", "dream", "location", "afternoon", "manufacturer", "control", "occasion", "trouble", "introduction", "advice", "bet", "eat", "kill", "category", "manner", "office", "estate", "pride", "awareness", "slip", "crack", "client", "nail", "shoot", "membership", "soft", "anybody", "web", "official", "individual", "pizza", "interest", "bag", "spell", "profession", "queen", "deal", "resource", "ship", "guy", "chocolate", "joint", "formal", "upstairs", "car", "resort", "abroad", "dealer", "associate", "finger", "surgery", "comment", "team", "detail", "crazy", "path", "tale", "initial", "arm", "radio", "demand", "single", "draw", "yellow", "contest", "piece", "quote", "pull", "commercial", "shirt", "contribution", "cream", "channel", "suit", "discipline", "instruction", "concert", "speech", "low", "effective", "hang", "scratch", "industry", "breakfast", "lay", "join", "metal", "bedroom", "minute", "product", "rest", "temperature", "many", "give", "argument", "print", "purple", "laugh", "health", "credit", "investment", "sell", "setting", "lesson", "egg", "middle", "marriage", "level", "evidence", "phrase", "love", "self", "benefit", "guidance", "affect", "you", "dad", "anxiety", "special", "boyfriend", "test", "blank", "payment", "soup", "obligation", "reply", "smile", "deep", "complaint", "addition", "review", "box", "towel", "minor", "fun", "soil", "issue", "cigarette", "internet", "gain", "tell", "entry", "spare", "incident", "family", "refuse", "branch", "can", "pen", "grandfather", "constant", "tank", "uncle", "climate", "ground", "volume", "communication", "kind", "poet", "child", "screen", "mine", "quit", "gene", "lack", "charity", "memory", "tooth", "fear", "mention", "marketing", "reveal", "reason", "court", "season", "freedom", "land", "sport", "audience", "classroom", "law", "hook", "win", "carry", "eye", "smell", "distribution", "research", "country", "dare", "hope", "whereas", "stretch", "library", "if", "delay", "college", "plastic", "book", "present", "use", "worry", "champion", "goal", "economy", "march", "election", "reflection", "midnight", "slide", "inflation", "action", "challenge", "guitar", "coast", "apple", "campaign", "field", "jacket", "sense", "way", "visual", "remove", "weather", "trash", "cable", "regret", "buddy", "beach", "historian", "courage", "sympathy", "truck", "tension", "permit", "nose", "bed", "son", "person", "base", "meat", "usual", "air", "meeting", "worth", "game", "independence", "physical", "brief", "play", "raise", "board", "she", "key", "writing", "pick", "command", "party", "yesterday", "spring", "candidate", "physics", "university", "concern", "development", "change", "string", "target", "instance", "room", "bitter", "bird", "football", "normal", "split", "impression", "wood", "long", "meaning", "stock", "cap", "leadership", "media", "ambition", "fishing", "essay", "salad", "repair", "today", "designer", "night", "bank", "drawing", "inevitable", "phase", "vast", "chip", "anger", "switch", "cry", "twist", "personality", "attempt", "storage", "being", "preparation", "bat", "selection", "white", "technology", "contract", "side", "section", "station", "till", "structure", "tongue", "taste", "truth", "difficulty", "group", "limit", "main", "move", "feeling", "light", "example", "mission", "might", "wait", "wheel", "shop", "host", "classic", "alternative", "cause", "agent", "consist", "table", "airline", "text", "pool", "craft", "range", "fuel", "tool", "partner", "load", "entrance", "deposit", "hate", "article", "video", "summer", "feature", "extreme", "mobile", "hospital", "flight", "fall", "pension", "piano", "fail", "result", "rub", "gap", "system", "report", "suck", "ordinary", "wind", "nerve", "ask", "shine", "note", "line", "mom", "perception", "brother", "reference", "bend", "charge", "treat", "trick", "term", "homework", "bake", "bid", "status", "project", "strategy", "orange", "let", "enthusiasm", "parent", "concentrate", "device", "travel", "poetry", "business", "society", "kiss", "end", "vegetable", "employ", "schedule", "hour", "brave", "focus", "process", "movie", "illegal", "general", "coffee", "ad", "highway", "chemistry", "psychology", "hire", "bell", "conference", "relief", "show", "neat", "funny", "weight", "quality", "club", "daughter", "zone", "touch", "tonight", "shock", "burn", "excuse", "name", "survey", "landscape", "advance", "satisfaction", "bread", "disaster", "item", "hat", "prior", "shopping", "visit", "east", "photo", "home", "idea", "father", "comparison", "cat", "pipe", "winner", "count", "lake", "fight", "prize", "foundation", "dog", "keep", "ideal", "fan", "struggle", "peak", "safety", "solution", "hell", "conclusion", "population", "strain", "alarm", "measurement", "second", "train", "race", "due", "insurance", "boss", "tree", "monitor", "sick", "course", "drag", "appointment", "slice", "still", "care", "patience", "rich", "escape", "emotion", "royal", "female", "childhood", "government", "picture", "will", "sock", "big", "gate", "oil", "cross", "pin", "improvement", "championship", "silly", "help", "sky", "pitch", "man", "diamond", "most", "transition", "work", "science", "committee", "moment", "fix", "teaching", "dig", "specialist", "complex", "guide", "people", "dead", "voice", "original", "break", "topic", "data", "degree", "reading", "recording", "bunch", "reach", "judgment", "lie", "regular", "set", "painting", "mode", "list", "player", "bear", "north", "wonder", "carpet", "heavy", "officer", "negative", "clock", "unique", "baby", "pain", "assumption", "disk", "iron", "bill", "drawer", "look", "double", "mistake", "finish", "future", "brilliant", "contact", "math", "rice", "leave", "restaurant", "discount", "sex", "virus", "bit", "trust", "event", "wear", "juice", "failure", "bug", "context", "mud", "whole", "wrap", "intention", "draft", "pressure", "cake", "dark", "explanation", "space", "angle", "word", "efficiency", "management", "habit", "star", "chance", "finding", "transportation", "stand", "criticism", "flow", "door", "injury", "insect", "surprise", "apartment"]  # pylint: disable=line-too-long
+
+# ISO 639-1 codes to language names.
+LANGUAGE_CODES = immutabledict.immutabledict({
+    "en": "English",
+    "es": "Spanish",
+    "pt": "Portuguese",
+    "ar": "Arabic",
+    "hi": "Hindi",
+    "fr": "French",
+    "ru": "Russian",
+    "de": "German",
+    "ja": "Japanese",
+    "it": "Italian",
+    "bn": "Bengali",
+    "uk": "Ukrainian",
+    "th": "Thai",
+    "ur": "Urdu",
+    "ta": "Tamil",
+    "te": "Telugu",
+    "bg": "Bulgarian",
+    "ko": "Korean",
+    "pl": "Polish",
+    "he": "Hebrew",
+    "fa": "Persian",
+    "vi": "Vietnamese",
+    "ne": "Nepali",
+    "sw": "Swahili",
+    "kn": "Kannada",
+    "mr": "Marathi",
+    "gu": "Gujarati",
+    "pa": "Punjabi",
+    "ml": "Malayalam",
+    "fi": "Finnish",
+    })
+
+_ALPHABETS = "([A-Za-z])"
+_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
+_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
+_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
+_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
+_DIGITS = "([0-9])"
+_MULTIPLE_DOTS = r"\.{2,}"
+
+
+def split_into_sentences(text):
+  """Split the text into sentences.
+
+  Args:
+    text: A string that consists of more than or equal to one sentences.
+
+  Returns:
+    A list of strings where each string is a sentence.
+  """
+  text = " " + text + "  "
+  text = text.replace("\n", " ")
+  text = re.sub(_PREFIXES, "\\1<prd>", text)
+  text = re.sub(_WEBSITES, "<prd>\\1", text)
+  text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1<prd>\\2", text)
+  text = re.sub(
+      _MULTIPLE_DOTS,
+      lambda match: "<prd>" * len(match.group(0)) + "<stop>",
+      text,
+  )
+  if "Ph.D" in text:
+    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
+  text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1<prd> ", text)
+  text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1<stop> \\2", text)
+  text = re.sub(
+      _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
+      "\\1<prd>\\2<prd>\\3<prd>",
+      text,
+  )
+  text = re.sub(
+      _ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1<prd>\\2<prd>", text
+  )
+  text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1<stop> \\2", text)
+  text = re.sub(" " + _SUFFIXES + "[.]", " \\1<prd>", text)
+  text = re.sub(" " + _ALPHABETS + "[.]", " \\1<prd>", text)
+  if "”" in text:
+    text = text.replace(".”", "”.")
+  if '"' in text:
+    text = text.replace('."', '".')
+  if "!" in text:
+    text = text.replace('!"', '"!')
+  if "?" in text:
+    text = text.replace('?"', '"?')
+  text = text.replace(".", ".<stop>")
+  text = text.replace("?", "?<stop>")
+  text = text.replace("!", "!<stop>")
+  text = text.replace("<prd>", ".")
+  sentences = text.split("<stop>")
+  sentences = [s.strip() for s in sentences]
+  if sentences and not sentences[-1]:
+    sentences = sentences[:-1]
+  return sentences
+
+
+def count_words(text):
+  """Counts the number of words."""
+  tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
+  tokens = tokenizer.tokenize(text)
+  num_words = len(tokens)
+  return num_words
+
+
+@functools.lru_cache(maxsize=None)
+def _get_sentence_tokenizer():
+  return nltk.data.load("nltk:tokenizers/punkt/english.pickle")
+
+
+def count_sentences(text):
+  """Count the number of sentences."""
+  tokenizer = _get_sentence_tokenizer()
+  tokenized_sentences = tokenizer.tokenize(text)
+  return len(tokenized_sentences)
+
+
+def generate_keywords(num_keywords):
+  """Randomly generates a few keywords."""
+  return random.sample(WORD_LIST, k=num_keywords)
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/llm_judge/__init__.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/llm_judge/__init__.py
new file mode 100644
index 000000000..95c260863
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/llm_judge/__init__.py
@@ -0,0 +1,3 @@
+from .llm_judge import compute_score
+
+__all__ = ['compute_score']
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/llm_judge/llm_judge.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/llm_judge/llm_judge.py
new file mode 100644
index 000000000..aa671beae
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/llm_judge/llm_judge.py
@@ -0,0 +1,251 @@
+from openai import OpenAI
+import os
+
+LLM_JUDGE_DEBUG = int(os.getenv("LLM_JUDGE_DEBUG", "1"))
+
+# STRICTLY match
+STRICTLY_JUDGE_PROMPT_TEMPLATE = """Given a problem, determine whether the final answer in the provided (incomplete) solution process matches the reference answer.
+
+The reference answer may be a single option character (e.g., A, B, C, D), a numerical value, an expression, or a list of answers if multiple questions are involved.
+
+**The reference answer may be in Chinese or another language, but your evaluation should be language-agnostic.**
+
+---
+
+Your task:
+
+1. Carefully compare the **final output** of the solution process with the **reference answer**.
+
+2. Provide a **brief explanation first**, including:
+   - Whether the final answer is clearly stated.
+   - Whether it matches the reference answer **exactly** (including form, number, symbol, or character).
+   - Any mismatch, ambiguity, or incompleteness in the solution process.
+
+3. After the explanation, output only **YES** or **NO**, strictly in the format: `\\boxed{{YES}}` or `\\boxed{{NO}}`.
+
+4. Matching must be **strict**:
+   - Any discrepancy in number, character, expression, or format = **NO**.
+   - If the solution is ambiguous, missing, or incomplete = **NO**.
+   - Only a clear and exact match = **YES**.
+
+---
+
+**Question:**
+
+{question}
+
+**Solution Process (Final Step Only):**
+
+{response}
+
+**Reference Answer:**
+
+{reference}
+
+**Output:**
+
+(Explain first, then give your final answer below in the format `\\boxed{{YES}}` or `\\boxed{{NO}}`.) 
+"""
+
+# EQUIVALENT match
+EQUIVALENT_JUDGE_PROMPT_TEMPLATE = """Given a problem, determine whether the final answer in the provided (incomplete) solution process is equivalent to the reference answer.
+
+The reference answer may be a single option character (e.g., A, B, C, D), a numerical value, an expression, or a list of answers if multiple parts are involved.
+
+**The reference answer may appear in Chinese or other languages. Your evaluation should remain language-agnostic.**
+
+---
+
+Your task:
+
+1. Analyze the final output in the solution process and compare it with the reference answer.
+
+2. If they are **logically or mathematically equivalent**, output **YES**.
+
+3. If they are **not equivalent**, output **NO**.
+
+4. If the final step of the solution is unclear, incomplete, or ambiguous, treat it as incorrect and output **NO**.
+
+---
+
+Your output must be either **'YES'** or **'NO'**, formatted strictly as: `\\boxed{{YES}}` or `\\boxed{{NO}}`.
+
+---
+
+**Question:**  
+{question}
+
+**Solution Process (Final Step Only):**  
+{response}
+
+**Reference Answer:**  
+{reference}
+
+**Output:**
+
+(First, briefly explain your reasoning. Then give your final answer in the required format: `\\boxed{{YES}}` or `\\boxed{{NO}}`.)"""
+
+
+LLM_JUDGE_PROMPT_TEMPLATE = os.getenv("LLM_JUDGE_PROMPT_TEMPLATE", "EQUIVALENT")
+
+if LLM_JUDGE_PROMPT_TEMPLATE == "STRICT":
+    LLM_JUDGE_PROMPT_TEMPLATE = STRICTLY_JUDGE_PROMPT_TEMPLATE
+elif LLM_JUDGE_PROMPT_TEMPLATE == "EQUIVALENT":
+    LLM_JUDGE_PROMPT_TEMPLATE = EQUIVALENT_JUDGE_PROMPT_TEMPLATE
+else:
+    print(f"Use env variable LLM_JUDGE_PROMPT_TEMPLATE: {LLM_JUDGE_PROMPT_TEMPLATE}")
+
+
+def extract_answer_from_box(string):
+    """Extract Answer String from \\boxed or \\fbox expression."""
+    if not string or not string.strip():
+        return None
+
+    # Try to find \\boxed first
+    idx = string.rfind("\\boxed")
+    box_type = "\\boxed"
+    
+    if idx < 0:
+        # If not found, try \\fbox
+        idx = string.rfind("\\fbox")
+        box_type = "\\fbox"
+        if idx < 0:
+            return None
+
+    i = idx
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == "{":
+            num_left_braces_open += 1
+        if string[i] == "}":
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+        i += 1
+
+    if right_brace_idx is None:
+        return None
+    else:
+        retval = string[idx : right_brace_idx + 1]
+
+    if retval:
+        left = box_type + "{"
+        try:
+            assert retval[: len(left)] == left
+            assert retval[-1] == "}"
+            return retval[len(left) : -1]
+        except AssertionError:
+            return None
+
+    return None
+
+def compute_score(
+    question: str,
+    response_to_judge: str,
+    metadata: dict,
+    sampling_params_dict: dict,
+) -> float:
+    """Judges a single response using the LLM.
+
+    Args:
+        question: The question string.
+        response_to_judge: The assistant's response string.
+        metadata: Metadata containing reference answer and criteria.
+        sampling_params_dict: Dictionary containing OpenAI API parameters including base_url and api_key.
+
+    Returns:
+        Score (float)
+    """
+    reference = metadata.get("reference_answer", "")
+    extract_box = metadata.get("extract_box", False)
+
+    # Remove thinking part if using thinking models
+    response_to_judge = response_to_judge.split("</think>")[-1].strip()
+    response_to_judge = response_to_judge.split("<answer>")[-1].strip()
+
+    response_to_judge = (
+        extract_answer_from_box(response_to_judge)
+        if extract_box
+        else response_to_judge
+    )
+    response_to_judge = "None" if response_to_judge is None else response_to_judge
+    # Prioritize metadata's judge_prompt_template, then default
+    # Note that if you want to use a custom judge_prompt_template, you may need to change the verdict extraction logic accordingly
+    current_judge_prompt = (
+        metadata.get("judge_prompt_template", None) or LLM_JUDGE_PROMPT_TEMPLATE
+    )
+
+    prompt = current_judge_prompt.format(
+        question=question,
+        response=response_to_judge,
+        reference=reference,
+    )
+    if LLM_JUDGE_DEBUG:
+        print(f"[LLM_JUDGE] Prompt: {prompt}")
+    
+    # Initialize OpenAI client with custom URL and key from sampling_params_dict
+    client = OpenAI(
+        base_url=sampling_params_dict.get("base_url", "https://integrate.api.nvidia.com/v1"),
+        api_key=sampling_params_dict.get("api_key", "$API_KEY_REQUIRED_IF_EXECUTING_OUTSIDE_NGC")
+    )
+    
+    # Extract OpenAI parameters from sampling_params_dict
+    model = sampling_params_dict.get("model", "meta/llama-3.3-70b-instruct")
+    temperature = sampling_params_dict.get("temperature", 0.2)
+    top_p = sampling_params_dict.get("top_p", 0.7)
+    top_k = sampling_params_dict.get("top_k", -1)
+    max_tokens = sampling_params_dict.get("max_tokens", 1024)
+    max_model_len = sampling_params_dict.get("max_model_len", 8192)
+    
+    try:
+        completion = client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=temperature,
+            top_p=top_p,
+            max_tokens=max_tokens,
+            stream=True,
+            extra_body={"chat_template_kwargs": {"enable_thinking": False}, "truncate_prompt_tokens": max_model_len - max_tokens, "top_k": top_k}
+        )
+        
+        generated_text = ""
+        for chunk in completion:
+            if chunk.choices[0].delta.content is not None:
+                generated_text += chunk.choices[0].delta.content
+        
+        generated_text = generated_text.strip()
+        
+    except Exception as e:
+        if LLM_JUDGE_DEBUG:
+            print(f"[LLM_JUDGE] Error calling OpenAI API: {e}")
+        return 0.0
+
+    score = 0.0
+    if generated_text:
+        generated_text_lower = generated_text.lower()
+        result = extract_answer_from_box(generated_text_lower)
+        # to avoid the case that the answer is not in the box
+        if result is None:
+            has_yes = "yes" in generated_text_lower
+        else:
+            has_yes = "yes" in result
+
+        if has_yes:
+            score = 1.0
+            if LLM_JUDGE_DEBUG:
+                print(f"[LLM_JUDGE] Parsed 'yes'. Score: {score}. Output: '{generated_text}'")
+        else:
+            score = 0.0
+            if LLM_JUDGE_DEBUG:
+                print(f"[LLM_JUDGE] No 'yes' found. Score: {score}. Output: '{generated_text}'")
+    else:
+        if LLM_JUDGE_DEBUG:
+            print(f"[LLM_JUDGE] No output received from LLM.")
+            
+    if LLM_JUDGE_DEBUG:
+        import sys
+        sys.stdout.flush()
+
+    return score
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/math.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/math.py
new file mode 100644
index 000000000..50792aa6e
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/math.py
@@ -0,0 +1,227 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py
+
+
+def compute_score(solution_str, ground_truth) -> float:
+    retval = 0.
+    try:
+        string_in_last_boxed = last_boxed_only_string(solution_str)
+        if string_in_last_boxed is not None:
+            answer = remove_boxed(string_in_last_boxed)
+            if is_equiv(answer, ground_truth):
+                retval = 1.
+    except Exception as e:
+        print(e)
+
+    return retval
+
+
+# string normalization from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
+def is_equiv(str1, str2, verbose=False):
+    if str1 is None and str2 is None:
+        print("WARNING: Both None")
+        return True
+    if str1 is None or str2 is None:
+        return False
+
+    try:
+        ss1 = strip_string(str1)
+        ss2 = strip_string(str2)
+        if verbose:
+            print(ss1, ss2)
+        return ss1 == ss2
+    except Exception:
+        return str1 == str2
+
+
+def remove_boxed(s):
+    if "\\boxed " in s:
+        left = "\\boxed "
+        assert s[:len(left)] == left
+        return s[len(left):]
+
+    left = "\\boxed{"
+
+    assert s[:len(left)] == left
+    assert s[-1] == "}"
+
+    return s[len(left):-1]
+
+
+def last_boxed_only_string(string):
+    idx = string.rfind("\\boxed")
+    if "\\boxed " in string:
+        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
+    if idx < 0:
+        idx = string.rfind("\\fbox")
+        if idx < 0:
+            return None
+
+    i = idx
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == "{":
+            num_left_braces_open += 1
+        if string[i] == "}":
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+        i += 1
+
+    if right_brace_idx is None:
+        retval = None
+    else:
+        retval = string[idx:right_brace_idx + 1]
+
+    return retval
+
+
+def fix_fracs(string):
+    substrs = string.split("\\frac")
+    new_str = substrs[0]
+    if len(substrs) > 1:
+        substrs = substrs[1:]
+        for substr in substrs:
+            new_str += "\\frac"
+            if substr[0] == "{":
+                new_str += substr
+            else:
+                try:
+                    assert len(substr) >= 2
+                except AssertionError:
+                    return string
+                a = substr[0]
+                b = substr[1]
+                if b != "{":
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += "{" + a + "}{" + b + "}" + post_substr
+                    else:
+                        new_str += "{" + a + "}{" + b + "}"
+                else:
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += "{" + a + "}" + b + post_substr
+                    else:
+                        new_str += "{" + a + "}" + b
+    string = new_str
+    return string
+
+
+def fix_a_slash_b(string):
+    if len(string.split("/")) != 2:
+        return string
+    a = string.split("/")[0]
+    b = string.split("/")[1]
+    try:
+        a = int(a)
+        b = int(b)
+        assert string == "{}/{}".format(a, b)
+        new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
+        return new_string
+    except AssertionError:
+        return string
+
+
+def remove_right_units(string):
+    # "\\text{ " only ever occurs (at least in the val set) when describing units
+    if "\\text{ " in string:
+        splits = string.split("\\text{ ")
+        assert len(splits) == 2
+        return splits[0]
+    else:
+        return string
+
+
+def fix_sqrt(string):
+    if "\\sqrt" not in string:
+        return string
+    splits = string.split("\\sqrt")
+    new_string = splits[0]
+    for split in splits[1:]:
+        if split[0] != "{":
+            a = split[0]
+            new_substr = "\\sqrt{" + a + "}" + split[1:]
+        else:
+            new_substr = "\\sqrt" + split
+        new_string += new_substr
+    return new_string
+
+
+def strip_string(string):
+    # linebreaks
+    string = string.replace("\n", "")
+
+    # remove inverse spaces
+    string = string.replace("\\!", "")
+
+    # replace \\ with \
+    string = string.replace("\\\\", "\\")
+
+    # replace tfrac and dfrac with frac
+    string = string.replace("tfrac", "frac")
+    string = string.replace("dfrac", "frac")
+
+    # remove \left and \right
+    string = string.replace("\\left", "")
+    string = string.replace("\\right", "")
+
+    # Remove circ (degrees)
+    string = string.replace("^{\\circ}", "")
+    string = string.replace("^\\circ", "")
+
+    # remove dollar signs
+    string = string.replace("\\$", "")
+
+    # remove units (on the right)
+    string = remove_right_units(string)
+
+    # remove percentage
+    string = string.replace("\\%", "")
+    string = string.replace("\%", "")  # noqa: W605
+
+    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+    string = string.replace(" .", " 0.")
+    string = string.replace("{.", "{0.")
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == ".":
+        string = "0" + string
+
+    # to consider: get rid of e.g. "k = " or "q = " at beginning
+    if len(string.split("=")) == 2:
+        if len(string.split("=")[0]) <= 2:
+            string = string.split("=")[1]
+
+    # fix sqrt3 --> sqrt{3}
+    string = fix_sqrt(string)
+
+    # remove spaces
+    string = string.replace(" ", "")
+
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
+    string = fix_fracs(string)
+
+    # manually change 0.5 --> \frac{1}{2}
+    if string == "0.5":
+        string = "\\frac{1}{2}"
+
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = fix_a_slash_b(string)
+
+    return string
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_code/__init__.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_code/__init__.py
new file mode 100644
index 000000000..088c8e6c6
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_code/__init__.py
@@ -0,0 +1,55 @@
+# Copyright 2024 PRIME team and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .utils import check_correctness as apps_check_correctness
+import json
+import re
+import traceback
+
+
+def compute_score(completion, test_cases, continuous=False):
+    # try to get code solution from completion. if the completion is pure code, this will not take effect.
+    solution = completion.split('```python')[-1].split('```')[0]
+    try:
+        try:
+            if not isinstance(test_cases, dict):
+                test_cases = json.loads(test_cases)
+        except Exception as e:
+            print(f"Error:{e}")
+
+        # Complete check on all in-out pairs first. If there is no failure, per-sample test can be skipped.
+        try:
+            res, metadata = apps_check_correctness(in_outs=test_cases, generation=solution, timeout=5, debug=False)
+            metadata = list(metadata)
+            success = all(map(lambda x: x == True, res))
+            if continuous:
+                # compile error
+                if res[-1] == -2:
+                    return 0, metadata
+                else:
+                    # set runtime error to 0
+                    res = [0 if x == -1 else x for x in res]
+                    return sum(res) / len(res), metadata
+            else:
+                return success, metadata
+
+        except Exception as e:
+            success = False
+            metadata_list = None
+
+    except Exception as e:
+        traceback.print_exc(10)
+        success = False
+        metadata_list = None
+    return success, metadata_list
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_code/testing_util.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_code/testing_util.py
new file mode 100644
index 000000000..b5844befe
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_code/testing_util.py
@@ -0,0 +1,686 @@
+# Copyright 2024 PRIME team and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ast
+import json
+import sys
+import faulthandler
+import platform
+
+# used for debugging to time steps
+from datetime import datetime
+
+# to run the solution files we're using a timing based approach
+import signal
+
+import numpy as np
+
+# for capturing the stdout
+from io import StringIO
+
+# used for testing the code that reads from input
+from unittest.mock import patch, mock_open
+
+from pyext import RuntimeModule
+
+from enum import Enum
+
+import traceback
+
+
+def truncatefn(s, length=300):
+    assert isinstance(s, str)
+    if len(s) <= length:
+        return s
+
+    return s[:length // 2] + "...(truncated) ..." + s[-length // 2:]
+
+
+class CODE_TYPE(Enum):
+    call_based = 0
+    standard_input = 1
+
+
+# stuff for setting up signal timer
+class TimeoutException(Exception):
+    pass
+
+
+def timeout_handler(signum, frame):
+    print("alarm went off")
+    return
+    # raise TimeoutException # this is an unhandled exception. just return None is OK
+
+
+signal.signal(signal.SIGALRM, timeout_handler)
+
+# timeout = 6  # seconds
+
+
+# used to capture stdout as a list
+# from https://stackoverflow.com/a/16571630/6416660
+# alternative use redirect_stdout() from contextlib
+class Capturing(list):
+
+    def __enter__(self):
+        self._stdout = sys.stdout
+        sys.stdout = self._stringio = StringIO()
+        # Make closing the StringIO a no-op
+        self._stringio.close = lambda x: 1
+        return self
+
+    def __exit__(self, *args):
+        self.append(self._stringio.getvalue())
+        del self._stringio  # free up some memory
+        sys.stdout = self._stdout
+
+
+def only_int_check(val):
+    return isinstance(val, int)
+
+
+def string_int_check(val):
+    return isinstance(val, str) and val.isdigit()
+
+
+def combined_int_check(val):
+    return only_int_check(val) or string_int_check(val)
+
+
+def clean_traceback(error_traceback):
+    file_start = error_traceback.find('File \"<string>\"')
+    # print(file_start)
+    error_traceback = "Traceback (most recent call last):\n  " + error_traceback[file_start:]
+    return error_traceback
+
+
+def run_test(in_outs, test=None, debug=False, timeout=15):
+    """
+    if test(generated_code) is not None it'll try to run the code.
+    otherwise it'll just return an input and output pair.
+    """
+    # Disable functionalities that can make destructive changes to the test.
+    reliability_guard()
+
+    if debug:
+        print(f"start = {datetime.now().time()}")
+
+    if in_outs:
+        if in_outs.get("fn_name") is None:
+            which_type = CODE_TYPE.standard_input  # Standard input
+            method_name = None
+        else:
+            which_type = CODE_TYPE.call_based  # Call-based
+            method_name = in_outs["fn_name"]
+
+    if debug:
+        print(f"loaded input_output = {datetime.now().time()}")
+
+    if test is None:
+        assert False, "should not happen: test code is none"
+        return in_outs, {"error": "no test code provided"}
+    elif test is not None:
+        results = []
+        metadata = []
+        sol = "from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(6*10**5)\n"
+        if debug:
+            print(f"loading test code = {datetime.now().time()}")
+
+        if which_type == CODE_TYPE.call_based:
+
+            sol += test
+            if debug:
+                print(f"sol = {sol}")
+            signal.alarm(timeout)
+            try:
+                tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol)
+                if "class Solution" not in test:
+                    tmp = tmp_sol
+                else:
+                    tmp = tmp_sol.Solution()
+                signal.alarm(0)
+            except Exception as e:
+                signal.alarm(0)
+                error_traceback = traceback.format_exc()
+                if debug:
+                    print(f"type 0 compilation error = {e}")
+                results.append(-2)
+                metadata.append({"error": repr(e),
+                    "traceback": clean_traceback(error_traceback)})
+                return results, metadata
+            signal.alarm(0)
+
+        elif which_type == CODE_TYPE.standard_input:
+            # sol
+            # if code has if __name__ == "__main__": then remove it
+            try:
+                astree = ast.parse(test)
+                last_block = astree.body[-1]
+                if isinstance(last_block, ast.If):
+                    condition = last_block.test
+                    if ast.unparse(condition).strip() == "__name__ == '__main__'":
+                        test = (ast.unparse(astree.body[:-1]) + "\n" + ast.unparse(last_block.body))
+            except:
+                pass
+
+            tmp_test = test.split("\n")
+
+            new_test = []
+            for x in tmp_test:
+                if (not x.startswith("from ")) and (not x.startswith("import ")):
+                    new_test.append("\t" + x + "\n")
+                else:
+                    new_test.append(x + "\n")
+            tmp_test = new_test
+
+            new_test = ""
+            started = False
+            for i in tmp_test:
+                if i.startswith("\t") and not started:
+                    new_test += "stdin = sys.stdin\nstdout = sys.stdout\n"
+                    new_test += "def code():\n"
+                    new_test += i
+                    started = True
+                elif started and ((i.startswith("from ")) or (i.startswith("import "))):
+                    new_test += "\t" + i
+                else:
+                    new_test += i
+            tmp_test = new_test
+
+            sol += tmp_test
+            if debug:
+                print(f"sol = {sol}")
+            method_name = "code"
+            signal.alarm(timeout)
+            try:
+                tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol)
+                tmp = tmp_sol
+                signal.alarm(0)
+            except Exception as e:
+                signal.alarm(0)
+                error_traceback = traceback.format_exc()
+                if debug:
+                    print(f"type 1 compilation error = {e}")
+                results.append(-2)
+                metadata.append({   
+                    "error": repr(e),
+                    "traceback": clean_traceback(error_traceback)})
+                return results, metadata
+            signal.alarm(0)
+        if debug:
+            print(f"get method = {datetime.now().time()}")
+
+        try:
+            method = getattr(tmp, method_name)  # get_attr second arg must be str
+        except:
+            signal.alarm(0)
+            error_traceback = traceback.format_exc()
+            e = sys.exc_info()
+            print(f"unable to get function error = {e}")
+            results.append(-2)
+            metadata.append({
+                "error": repr(e),
+                "traceback": clean_traceback(error_traceback)})
+            return results, metadata
+
+
+        for index, inputs in enumerate(in_outs["inputs"]):
+            raw_inputs = inputs
+            raw_outputs = in_outs["outputs"][index]
+            if which_type == CODE_TYPE.call_based:
+                inputs = [json.loads(line) for line in inputs.split("\n")]
+                in_outs["outputs"][index] = json.loads(in_outs["outputs"][index])
+
+                truncate_line_size = 300 // (raw_inputs.count("\n") + 1)
+                raw_inputs = "\n".join(
+                    [truncatefn(line, truncate_line_size) for line in raw_inputs.strip().split("\n")])
+                raw_outputs = truncatefn(raw_outputs, 200)
+            else:
+                raw_inputs = truncatefn(raw_inputs)
+                raw_outputs = truncatefn(raw_outputs, 200)
+            # JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
+            try:
+                if isinstance(inputs[0], dict):
+                    inputs = [{int(k): v for k, v in inputs[0].items()}]
+            except:
+                True
+            try:
+                if isinstance(in_outs["outputs"][index], dict):
+                    in_outs["outputs"][index] = [{int(k): v for k, v in in_outs["outputs"][index].items()}]
+            except:
+                True
+            try:
+                if isinstance(in_outs["outputs"][index][0], dict):
+                    in_outs["outputs"][index] = [{int(k): v for k, v in in_outs["outputs"][index][0].items()}]
+            except:
+                True
+
+            if debug:
+                print(
+                    f"time: {datetime.now().time()} testing index = {index}  inputs = {inputs}, {type(inputs)}. type = {which_type}"
+                )
+            if which_type == CODE_TYPE.call_based:  # Call-based
+                signal.alarm(timeout)
+                faulthandler.enable()
+                try:
+                    output = method(*inputs)
+                    raw_true_output = output
+
+                    raw_true_output_copy = json.dumps(output)
+                    raw_true_output_copy = truncatefn(raw_true_output_copy, 200)
+
+                    # ground truth sequences are not tuples
+                    if isinstance(output, tuple):
+                        output = list(output)
+
+                    tmp_result = output == in_outs["outputs"][index]
+                    if (isinstance(in_outs["outputs"][index], list) and in_outs["outputs"][index]):
+                        tmp_result = tmp_result or (output == in_outs["outputs"][index][0])
+
+                    # ground truth sequences are not tuples
+                    try:
+                        if isinstance(output[0], tuple):
+                            tmp_result = tmp_result or ([list(x) for x in output] == in_outs["outputs"][index][0])
+                    except:
+                        True
+                    results.append(tmp_result)
+                    metadata.append({
+                            "output": raw_true_output_copy,
+                            "expected": raw_outputs,
+                            "inputs": raw_inputs,
+                            "error_message": "Wrong Answer"})
+                    # reset the alarm
+                    signal.alarm(0)
+                except Exception as e:
+                    signal.alarm(0)
+                    error_traceback = traceback.format_exc()
+                    faulthandler.disable()
+                    if debug:
+                        print(f"Standard input runtime error or time limit exceeded error = {e}")
+                    results.append(-1)
+                    metadata.append({
+                            "error": repr(e),
+                            "traceback": clean_traceback(error_traceback)})
+                faulthandler.disable()
+                signal.alarm(0)
+                if debug:
+                    print(
+                        f"outputs = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}"
+                    )
+            elif which_type == CODE_TYPE.standard_input:  # Standard input
+                faulthandler.enable()
+                passed = False
+
+                if isinstance(inputs, list):
+                    inputs = "\n".join(inputs)
+                if isinstance(in_outs["outputs"][index], list):
+                    in_outs["outputs"][index] = "\n".join(in_outs["outputs"][index])
+
+                signal.alarm(timeout)
+                with Capturing() as output:
+                    try:
+                        call_method(method, inputs)
+                        # reset the alarm
+                        signal.alarm(0)
+                        passed = True
+                    except Exception as e:
+                        # runtime error or took too long
+                        signal.alarm(0)
+                        error_traceback = traceback.format_exc()
+                        print(f"Call-based runtime error or time limit exceeded error = {repr(e)}{e}")
+                        results.append(-1)
+                        metadata.append({
+                            "error": repr(e),
+                            "traceback": clean_traceback(error_traceback)})
+                        continue
+                    signal.alarm(0)
+                raw_true_output = output[0]
+                raw_true_output_copy = truncatefn(raw_true_output, 200)
+                output = raw_true_output.splitlines()
+                if not passed:
+                    if debug:
+                        nl = "\n"
+                        if not isinstance(inputs, list):
+                            print(
+                                f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}"
+                            )
+                        else:
+                            print(
+                                f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}"
+                            )
+                    continue
+
+                if passed and debug:
+                    print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}")
+
+                if custom_compare_(output, in_outs["outputs"][index]):
+                    tmp_result = True
+                    results.append(tmp_result)
+                    metadata.append({})
+                    continue
+
+                # ground truth sequences are expressed as lists not tuples
+                if isinstance(output, tuple):
+                    output = list(output)
+
+                tmp_result = False
+                try:
+                    tmp_result = output == [in_outs["outputs"][index]]
+                    if isinstance(in_outs["outputs"][index], list):
+                        tmp_result = tmp_result or (output == in_outs["outputs"][index])
+                        if isinstance(output[0], str):
+                            tmp_result = tmp_result or ([e.strip() for e in output] == in_outs["outputs"][index])
+                except Exception as e:
+                    if debug:
+                        print(f"Failed check1 exception = {e}")
+                    pass
+
+                if tmp_result == True:
+                    results.append(tmp_result)
+                    metadata.append({})
+                    continue
+
+                # try one more time without \n
+                if isinstance(in_outs["outputs"][index], list):
+                    for tmp_index, i in enumerate(in_outs["outputs"][index]):
+                        in_outs["outputs"][index][tmp_index] = i.split("\n")
+                        in_outs["outputs"][index][tmp_index] = [
+                            x.strip() for x in in_outs["outputs"][index][tmp_index] if x
+                        ]
+                else:
+                    in_outs["outputs"][index] = in_outs["outputs"][index].split("\n")
+                    in_outs["outputs"][index] = list(filter(len, in_outs["outputs"][index]))
+                    in_outs["outputs"][index] = list(map(lambda x: x.strip(), in_outs["outputs"][index]))
+
+                try:
+                    tmp_result = output == [in_outs["outputs"][index]]
+                    if isinstance(in_outs["outputs"][index], list):
+                        tmp_result = tmp_result or (output == in_outs["outputs"][index])
+                except Exception as e:
+                    if debug:
+                        print(f"Failed check2 exception = {e}")
+                    pass
+
+                if tmp_result == True:
+                    results.append(tmp_result)
+                    metadata.append({})
+                    continue
+
+                # try by converting the output into a split up list too
+                if isinstance(output, list):
+                    output = list(filter(len, output))
+
+                if debug:
+                    nl = "\n"
+                    if not isinstance(inputs, list):
+                        print(
+                            f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}"
+                        )
+                    else:
+                        print(
+                            f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}"
+                        )
+
+                if tmp_result == True:
+                    results.append(tmp_result)
+                    metadata.append({})
+                    continue
+
+                if debug:
+                    print(f"{tmp_result=} @a")
+
+                try:
+                    tmp_result = output == [in_outs["outputs"][index]]
+                    if isinstance(in_outs["outputs"][index], list):
+                        tmp_result = tmp_result or (output == in_outs["outputs"][index])
+                except Exception as e:
+                    if debug:
+                        print(f"Failed check3 exception = {e}")
+                    pass
+
+                if debug:
+                    print(f"{tmp_result=} @b")
+
+                try:
+                    all_ints = all(
+                        combined_int_check(e1) and combined_int_check(e2)
+                        for e1, e2 in zip(output, in_outs["outputs"][index]))
+                    if not all_ints:
+                        if debug:
+                            print([
+                                combined_int_check(e1) and combined_int_check(e2)
+                                for e1, e2 in zip(output, in_outs["outputs"][index])
+                            ])
+                        output_float = [float(e) for e in output]
+                        gt_float = [float(e) for e in in_outs["outputs"][index]]
+                        tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and
+                                                    np.allclose(output_float, gt_float))
+                except Exception as e:
+                    pass
+
+                if debug:
+                    print(f"{tmp_result=} @c")
+
+                try:
+                    if isinstance(output[0], list):
+                        all_ints = all(
+                            combined_int_check(e1) and combined_int_check(e2)
+                            for e1, e2 in zip(output[0], in_outs["outputs"][index]))
+                        if not all_ints:
+                            output_float = [float(e) for e in output[0]]
+                            gt_float = [float(e) for e in in_outs["outputs"][index][0]]
+                            tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and
+                                                        np.allclose(output_float, gt_float))
+                except Exception as e:
+                    pass
+
+                if tmp_result == True:
+                    results.append(tmp_result)
+                    metadata.append({})
+                    continue
+
+                if debug:
+                    print(f"{tmp_result=} @d")
+                # try by converting the stuff into split up list
+                if isinstance(in_outs["outputs"][index], list):
+                    for tmp_index, i in enumerate(in_outs["outputs"][index]):
+                        in_outs["outputs"][index][tmp_index] = set(i.split())
+                else:
+                    in_outs["outputs"][index] = set(in_outs["outputs"][index].split())
+
+                if debug:
+                    print(f"{tmp_result=} @e")
+
+                try:
+                    tmp_result = output == in_outs["outputs"][index]
+                except Exception as e:
+                    if debug:
+                        print(f"Failed check4 exception = {e}")
+                    continue
+
+                if tmp_result == True:
+                    results.append(tmp_result)
+                    metadata.append({})
+                    continue
+
+                if debug:
+                    print(f"{tmp_result=} @f")
+
+                # try by converting the output into a split up list too
+                if isinstance(output, list):
+                    for tmp_index, i in enumerate(output):
+                        output[tmp_index] = i.split()
+                    output = list(filter(len, output))
+                    for tmp_index, i in enumerate(output):
+                        output[tmp_index] = set(i)
+                else:
+                    output = output.split()
+                    output = list(filter(len, output))
+                    output = set(output)
+
+                if debug:
+                    print(f"{tmp_result=} @g")
+
+                if tmp_result == True and debug:
+                    print("PASSED")
+
+                results.append(tmp_result)
+                metadata.append({"output": raw_true_output_copy,
+                        "expected": raw_outputs,
+                        "inputs": raw_inputs,
+                        "error_message": "Wrong Answer"})
+
+                if debug:
+                    nl = "\n"
+                    if not isinstance(inputs, list):
+                        print(
+                            f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}"
+                        )
+                    else:
+                        print(
+                            f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}"
+                        )
+
+                    print(f"results = {results}")
+
+    return results, metadata
+
+
+def custom_compare_(output, ground_truth):
+
+    if isinstance(output, list):
+        output_1 = "\n".join(output)
+        if stripped_string_compare(output_1, ground_truth):
+            return True
+
+    if isinstance(output, list):
+        output_2 = [o.lstrip().rstrip() for o in output]
+        output_2 = "\n".join(output_2)
+        if stripped_string_compare(output_2, ground_truth):
+            return True
+
+    return False
+
+
+def stripped_string_compare(s1, s2):
+    s1 = s1.lstrip().rstrip()
+    s2 = s2.lstrip().rstrip()
+    return s1 == s2
+
+
+def call_method(method, inputs):
+
+    if isinstance(inputs, list):
+        inputs = "\n".join(inputs)
+
+    inputs_line_iterator = iter(inputs.split("\n"))
+
+    # sys.setrecursionlimit(10000)
+
+    # @patch('builtins.input', side_effect=inputs.split("\n"))
+    @patch("builtins.open", mock_open(read_data=inputs))
+    @patch("sys.stdin", StringIO(inputs))
+    @patch("sys.stdin.readline", lambda *args: next(inputs_line_iterator))
+    @patch("sys.stdin.readlines", lambda *args: inputs.split("\n"))
+    @patch("sys.stdin.read", lambda *args: inputs)
+    # @patch('sys.stdout.write', print)
+    def _inner_call_method(_method):
+        try:
+            return _method()
+        except SystemExit as e:
+            pass
+        finally:
+            pass
+
+    return _inner_call_method(method)
+
+
+def reliability_guard(maximum_memory_bytes=None):
+    """
+    This disables various destructive functions and prevents the generated code
+    from interfering with the test (e.g. fork bomb, killing other processes,
+    removing filesystem files, etc.)
+    WARNING
+    This function is NOT a security sandbox. Untrusted code, including, model-
+    generated code, should not be blindly executed outside of one. See the
+    Codex paper for more information about OpenAI's code sandbox, and proceed
+    with caution.
+    """
+
+    if maximum_memory_bytes is not None:
+        import resource
+
+        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
+        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
+        if not platform.uname().system == "Darwin":
+            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
+
+    faulthandler.disable()
+
+    import builtins
+
+    builtins.exit = None
+    builtins.quit = None
+
+    import os
+
+    os.environ["OMP_NUM_THREADS"] = "1"
+
+    os.kill = None
+    os.system = None  # Prevent interference with REPL evaluation
+    os.putenv = None
+    os.remove = None
+    os.removedirs = None
+    os.rmdir = None
+    os.fchdir = None
+    os.setuid = None
+    os.fork = None
+    os.forkpty = None
+    os.killpg = None
+    os.rename = None
+    os.renames = None
+    os.truncate = None
+    os.replace = None
+    os.unlink = None
+    os.fchmod = None
+    os.fchown = None
+    os.chmod = None
+    os.chown = None
+    os.chroot = None
+    os.fchdir = None
+    os.lchflags = None
+    os.lchmod = None
+    os.lchown = None
+    os.getcwd = None
+    os.chdir = None
+
+    import shutil
+
+    shutil.rmtree = None
+    shutil.move = None
+    shutil.chown = None
+
+    import subprocess
+
+    subprocess.Popen = None  # type: ignore
+
+    __builtins__["help"] = None
+
+    import sys
+
+    sys.modules["ipdb"] = None
+    sys.modules["joblib"] = None
+    sys.modules["resource"] = None
+    sys.modules["psutil"] = None
+    sys.modules["tkinter"] = None
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_code/utils.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_code/utils.py
new file mode 100644
index 000000000..07f3db88e
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_code/utils.py
@@ -0,0 +1,59 @@
+# Copyright 2024 PRIME team and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Borrowed from: https://huggingface.co/spaces/codeparrot/apps_metric/blob/main/utils.py
+
+import multiprocessing
+from typing import Dict, Optional
+from datasets import load_dataset
+from .testing_util import run_test
+import traceback
+import os, sys
+
+
+def _temp_run(sample, generation, debug, result, metadata_list, timeout):
+    with open(os.devnull, 'w') as devnull:
+        sys.stdout = devnull
+        sys.stderr = devnull
+        try:
+            res, metadata = run_test(in_outs=sample, test=generation, debug=debug, timeout=timeout)
+            result.append(res)
+            metadata_list.append(metadata)
+        except Exception as e:
+            # print(e) # some tracebacks are extremely long.
+            traceback.print_exc(10)
+            result.append([-1 for i in range(len(sample['inputs']))])
+            metadata_list.append({})
+
+
+def check_correctness(in_outs: Optional[dict], generation, timeout=10, debug=True):
+    """Check correctness of code generation with a global timeout.
+    The global timeout is to catch some extreme/rare cases not handled by the timeouts
+    inside `run_test`"""
+
+    manager = multiprocessing.Manager()
+    result = manager.list()
+    metadata_list = manager.list()
+    p = multiprocessing.Process(target=_temp_run, args=(in_outs, generation, debug, result, metadata_list, timeout))
+    p.start()
+    p.join(timeout=timeout + 1)
+    if p.is_alive():
+        p.kill()
+        # p.terminate()
+    if not result:
+        # consider that all tests failed
+        result = [[-1 for i in range(len(in_outs["inputs"]))]]
+        if debug:
+            print(f"global timeout")
+    return result[0], metadata_list
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_math/__init__.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_math/__init__.py
new file mode 100644
index 000000000..39b2c831a
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_math/__init__.py
@@ -0,0 +1,402 @@
+# Copyright 2024 PRIME team and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Answer checker API that uses sympy to simplify expressions and check for equality.
+
+Call grade_answer(given_answer: str, ground_truth: str).
+
+FROM: https://github.com/openai/prm800k/blob/main/prm800k/grading/grader.py
+"""
+import re
+import sympy
+from pylatexenc import latex2text
+from sympy.parsing import sympy_parser
+
+from . import math_normalize
+from .grader import math_equal
+
+# import math_normalize
+# from grader import math_equal
+
+# sympy might hang -- we don't care about trying to be lenient in these cases
+BAD_SUBSTRINGS = ["^{", "^("]
+BAD_REGEXES = ["\^[0-9]+\^", "\^[0-9][0-9]+"]
+TUPLE_CHARS = "()[]"
+
+
+def _sympy_parse(expr: str):
+    """Parses an expression with sympy."""
+    py_expr = expr.replace("^", "**")
+    return sympy_parser.parse_expr(
+        py_expr,
+        transformations=(sympy_parser.standard_transformations + (sympy_parser.implicit_multiplication_application,)),
+    )
+
+
+def _parse_latex(expr: str) -> str:
+    """Attempts to parse latex to an expression sympy can read."""
+    expr = expr.replace("\\tfrac", "\\frac")
+    expr = expr.replace("\\dfrac", "\\frac")
+    expr = expr.replace("\\frac", " \\frac")  # Play nice with mixed numbers.
+    expr = latex2text.LatexNodes2Text().latex_to_text(expr)
+
+    # Replace the specific characters that this parser uses.
+    expr = expr.replace("√", "sqrt")
+    expr = expr.replace("π", "pi")
+    expr = expr.replace("∞", "inf")
+    expr = expr.replace("∪", "U")
+    expr = expr.replace("·", "*")
+    expr = expr.replace("×", "*")
+
+    return expr.strip()
+
+
+def _is_float(num: str) -> bool:
+    try:
+        float(num)
+        return True
+    except ValueError:
+        return False
+
+
+def _is_int(x: float) -> bool:
+    try:
+        return abs(x - int(round(x))) <= 1e-7
+    except:
+        return False
+
+
+def _is_frac(expr: str) -> bool:
+    return bool(re.search(r"^-?[0-9]+.?/0*[1-9][0-9]*.?$", expr))
+
+
+def _str_is_int(x: str) -> bool:
+    try:
+        x = _strip_properly_formatted_commas(x)
+        x = float(x)
+        return abs(x - int(round(x))) <= 1e-7
+    except:
+        return False
+
+
+def _str_to_int(x: str) -> bool:
+    x = x.replace(",", "")
+    x = float(x)
+    return int(x)
+
+
+def _inject_implicit_mixed_number(step: str):
+    """
+    Automatically make a mixed number evalable
+    e.g. 7 3/4 => 7+3/4
+    """
+    p1 = re.compile("([0-9]) +([0-9])")
+    step = p1.sub("\\1+\\2", step)  ## implicit mults
+    return step
+
+
+def _strip_properly_formatted_commas(expr: str):
+    # We want to be careful because we don't want to strip tuple commas
+    p1 = re.compile("(\d)(,)(\d\d\d)($|\D)")
+    while True:
+        next_expr = p1.sub("\\1\\3\\4", expr)
+        if next_expr == expr:
+            break
+        expr = next_expr
+    return next_expr
+
+
+def _normalize(expr: str) -> str:
+    """Normalize answer expressions."""
+    if expr is None:
+        return None
+
+    # Remove enclosing `\text{}`.
+    m = re.search("^\\\\text\{(?P<text>.+?)\}$", expr)
+    if m is not None:
+        expr = m.group("text")
+
+    expr = expr.replace("\\%", "%")
+    expr = expr.replace("\\$", "$")
+    expr = expr.replace("$", "")
+    expr = expr.replace("%", "")
+    expr = expr.replace(" or ", " , ")
+    expr = expr.replace(" and ", " , ")
+
+    expr = expr.replace("million", "*10^6")
+    expr = expr.replace("billion", "*10^9")
+    expr = expr.replace("trillion", "*10^12")
+
+    for unit in [
+            "degree",
+            "cm",
+            "centimeter",
+            "meter",
+            "mile",
+            "second",
+            "minute",
+            "hour",
+            "day",
+            "week",
+            "month",
+            "year",
+            "foot",
+            "feet",
+            "inch",
+            "yard",
+            "liter",
+    ]:
+        expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
+    expr = re.sub(f"\^ *\\\\circ", "", expr)
+
+    if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
+        expr = expr[1:-1]
+
+    expr = re.sub(",\\\\! *", "", expr)
+    if _is_float(expr) and _is_int(float(expr)):
+        expr = str(int(round(float(expr))))
+    if "\\" in expr:
+        try:
+            expr = _parse_latex(expr)
+        except:
+            pass
+
+    # edge case with mixed numbers and negative signs
+    expr = re.sub("- *", "-", expr)
+
+    expr = _inject_implicit_mixed_number(expr)
+
+    # don't be case sensitive for text answers
+    expr = expr.lower()
+
+    if _str_is_int(expr):
+        expr = str(_str_to_int(expr))
+
+    return expr
+
+
+def count_unknown_letters_in_expr(expr: str):
+    expr = expr.replace("sqrt", "")
+    expr = expr.replace("frac", "")
+    letters_in_expr = set([x for x in expr if x.isalpha()])
+    return len(letters_in_expr)
+
+
+def should_allow_eval(expr: str):
+    # we don't want to try parsing unknown text or functions of more than two variables
+    if count_unknown_letters_in_expr(expr) > 2:
+        return False
+
+    for bad_string in BAD_SUBSTRINGS:
+        if bad_string in expr:
+            return False
+
+    for bad_regex in BAD_REGEXES:
+        if re.search(bad_regex, expr) is not None:
+            return False
+
+    return True
+
+
+def are_equal_under_sympy(ground_truth_normalized: str, given_normalized: str):
+    are_equal = False
+    try:
+        expr = f"({ground_truth_normalized})-({given_normalized})"
+        if should_allow_eval(expr):
+            sympy_diff = _sympy_parse(expr)
+            simplified = sympy.simplify(sympy_diff)
+            if simplified == 0:
+                are_equal = True
+    except:
+        pass
+    return are_equal
+
+
+def split_tuple(expr: str):
+    """
+    Split the elements in a tuple/interval, while handling well-formatted commas in large numbers
+    """
+    expr = _strip_properly_formatted_commas(expr)
+    if len(expr) == 0:
+        return []
+    if (len(expr) > 2 and expr[0] in TUPLE_CHARS and expr[-1] in TUPLE_CHARS and
+            all([ch not in expr[1:-1] for ch in TUPLE_CHARS])):
+        elems = [elem.strip() for elem in expr[1:-1].split(",")]
+    else:
+        elems = [expr]
+    return elems
+
+
+def grade_answer(given_answer: str, ground_truth: str) -> bool:
+    """
+    The answer will be considered correct if:
+    (a) it normalizes to the same string as the ground truth answer
+    OR
+    (b) sympy can simplify the difference between the expressions to 0
+    """
+    if given_answer is None:
+        return False
+
+    ground_truth_normalized_mathd = math_normalize.normalize_answer(ground_truth)
+    given_answer_normalized_mathd = math_normalize.normalize_answer(given_answer)
+
+    # be at least as lenient as mathd
+    if ground_truth_normalized_mathd == given_answer_normalized_mathd:
+        return True
+
+    ground_truth_normalized = _normalize(ground_truth)
+    given_normalized = _normalize(given_answer)
+
+    if ground_truth_normalized is None:
+        return False
+
+    if ground_truth_normalized == given_normalized:
+        return True
+
+    if len(given_normalized) == 0:
+        return False
+
+    ground_truth_elems = split_tuple(ground_truth_normalized)
+    given_elems = split_tuple(given_normalized)
+
+    if len(ground_truth_elems) > 1 and (ground_truth_normalized[0] != given_normalized[0] or
+                                        ground_truth_normalized[-1] != given_normalized[-1]):
+        is_correct = False
+    elif len(ground_truth_elems) != len(given_elems):
+        is_correct = False
+    else:
+        for ground_truth_elem, given_elem in zip(ground_truth_elems, given_elems):
+            if _is_frac(ground_truth_elem) and _is_frac(given_elem):
+                # if fractions aren't reduced, then shouldn't be marked as correct
+                # so, we don't want to allow sympy.simplify in this case
+                is_correct = ground_truth_elem == given_elem
+            elif _str_is_int(ground_truth_elem) != _str_is_int(given_elem):
+                # if the ground truth answer is an integer, we require the given answer to be a strict match (no sympy.simplify)
+                is_correct = False
+            else:
+                is_correct = are_equal_under_sympy(ground_truth_elem, given_elem)
+            if not is_correct:
+                break
+
+    return is_correct
+
+
+def remove_boxed(s):
+    left = "\\boxed{"
+    try:
+        assert s[:len(left)] == left
+        assert s[-1] == "}"
+        return s[len(left):-1]
+    except:
+        return None
+
+
+def _last_boxed_only_string(string):
+    idx = string.rfind("\\boxed")
+    if idx < 0:
+        idx = string.rfind("\\fbox")
+        if idx < 0:
+            return None
+
+    i = idx
+    left_brace_idx = None
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == "{":
+            num_left_braces_open += 1
+            if left_brace_idx is None:
+                left_brace_idx = i
+        elif string[i] == "}":
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+
+        i += 1
+
+    if left_brace_idx is None or right_brace_idx is None:
+        return None
+
+    return string[left_brace_idx + 1:right_brace_idx].strip()
+
+
+def match_answer(response):
+    is_matched = False
+    for ans_marker in ['answer:', "answer is", "answers are"]:
+        ans_idx = response.lower().rfind(ans_marker)
+        if ans_idx != -1:
+            is_matched = True
+            response = response[ans_idx + len(ans_marker):].strip()
+            if response.endswith("\n"):
+                response = response[:-2]
+
+    for ans_marker in ["is answer", "is the answer", "are answers", "are the answers"]:
+        ans_idx = response.lower().rfind(ans_marker)
+        if ans_idx != -1:
+            is_matched = True
+            response = response[:ans_idx].strip()
+            if response.endswith("\n"):
+                response = response[:-2]
+
+    # Find boxed
+    ans_boxed = _last_boxed_only_string(response)
+    if ans_boxed:
+        is_matched = True
+        response = ans_boxed
+
+    if ". " in response:
+        dot_idx = response.lower().rfind(". ")
+        if dot_idx != -1:
+            response = response[:dot_idx].strip()
+
+    for ans_marker in ['be ', "is ", "are ", "=", ": ", "get ", 'be\n', "is\n", "are\n", ":\n", "get\n"]:
+        ans_idx = response.lower().rfind(ans_marker)
+        if ans_idx != -1:
+            is_matched = True
+            response = response[ans_idx + len(ans_marker):].strip()
+            if response.endswith("\n"):
+                response = response[:-2]
+
+    is_matched = is_matched if any([c.isdigit() for c in response]) else False  # answer must have a digit
+    # Grade
+    return is_matched, response
+
+
+import math
+
+
+def compute_score(model_output: str, ground_truth: str) -> bool:
+    model_output = str(model_output)
+    ground_truth = str(ground_truth)
+
+    is_matched, extracted_model_output = match_answer(model_output)
+    format_correctness = "Step 2:" in model_output and "\\box" in model_output
+
+    # grade simple algebra questions. if succeeded, return; otherwise, proceed to more complex grading
+    if grade_answer(extracted_model_output, ground_truth):
+        return True, True, extracted_model_output
+
+    try:
+        if "\pi" in extracted_model_output or "\pi" in ground_truth:
+            equivs = []
+            for pi in [math.pi, 3.14]:
+                equivs.append(math_equal(extracted_model_output, ground_truth, timeout=True, pi=pi))
+            is_correct = any(equivs)
+        else:
+            is_correct = math_equal(extracted_model_output, ground_truth, timeout=True)
+    except:
+        is_correct = False
+
+    return is_correct, format_correctness, extracted_model_output
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_math/grader.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_math/grader.py
new file mode 100644
index 000000000..e3e1686f1
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_math/grader.py
@@ -0,0 +1,380 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright (c) Microsoft Corporation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE
+
+# Copyright (c) 2023 OpenAI
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Copyright (c) 2021 Dan Hendrycks
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Copyright 2024 PRIME team and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This logic is largely copied from the Hendrycks' MATH release (math_equivalence), and borrowed from:
+- https://github.com/microsoft/ToRA/blob/main/src/eval/grader.py
+- https://github.com/microsoft/ProphetNet/tree/master/CRITIC
+- https://github.com/openai/prm800k
+"""
+
+import contextlib
+import re
+import signal
+import math
+from math import isclose
+from typing import Union
+
+from sympy import N, simplify
+from sympy.parsing.latex import parse_latex
+from sympy.parsing.sympy_parser import parse_expr
+
+
+def is_digit(s):
+    try:
+        if "{,}" in str(s):
+            num = float(str(s).replace("{,}", ""))
+            return True, num
+
+        num = float(str(s).replace(",", ""))
+        return True, num
+    except ValueError:
+        return False, None
+
+
+def normalize(answer, pi) -> str:
+    # checking if answer is $<number> and removing $ in that case to compare
+    if isinstance(answer, str) and bool(re.match(r'\$\d+(\.\d+)?', answer)):
+        return answer[1:]
+
+    # checking if answer is <number>% or <number>\\% and removing %
+    if isinstance(answer, str) and (bool(re.match(r'^\d+(\.\d+)?%$', answer)) or
+                                    bool(re.match(r'^\d+(\.\d+)?\\%$', answer))):
+        return answer.replace("\\%", "").replace("%", "")
+
+    # handle base
+    answer = handle_base(answer)
+
+    # handle pi
+    answer = handle_pi(answer, pi)
+
+    return answer
+
+
+def handle_base(x) -> str:
+    if isinstance(x, str) and "_" in x:
+        # Due to base
+        x = x.split("_")[0]
+        x = float(x)
+        return int(x)
+    return x
+
+
+def handle_pi(string, pi):
+    if isinstance(string, str) and "\pi" in string:
+        # Find the first occurrence of "\pi"
+        idx = string.find("\pi")
+
+        # Iterate over the string and find all occurrences of "\pi" with a valid previous character
+        while idx != -1:
+
+            if idx > 0 and string[idx - 1].isdigit():
+                # Replace "\pi" with "*math.pi" if the previous character is a digit
+                string = string[:idx] + f"*{pi}" + string[idx + 3:]
+            else:
+                # Replace "\pi" with "1*math.pi" if the previous character is not a digit
+                string = string[:idx] + f"1*{pi}" + string[idx + 3:]
+
+            # Find the next occurrence of "\pi"
+            idx = string.find("\pi", idx + 1)
+
+        # Evaluate the expression using eval() function
+        try:
+            string = eval(string)
+        except:
+            pass
+
+    return string
+
+
+def math_equal(prediction: Union[bool, float, str],
+               reference: Union[float, str],
+               include_percentage: bool = True,
+               tolerance: float = 1e-4,
+               timeout: float = 10.0,
+               pi: float = math.pi) -> bool:
+    """
+    Exact match of math if and only if:
+    1. numerical equal: both can convert to float and are equal
+    2. symbolic equal: both can convert to sympy expression and are equal
+    """
+
+    prediction = normalize(prediction, pi)
+    reference = normalize(reference, pi)
+
+    if isinstance(prediction, str) and len(prediction) > 1000:  # handling weird corner-cases
+        prediction = prediction[:1000]
+
+    # 0. string comparison
+    if isinstance(prediction, str) and isinstance(reference, str):
+        if prediction.strip().lower() == reference.strip().lower():
+            return True
+        if prediction.replace(" ", "") == reference.replace(" ", ""):
+            return True
+
+    try:  # 1. numerical equal
+        if is_digit(prediction)[0] and is_digit(reference)[0]:
+            prediction = is_digit(prediction)[1]
+            reference = is_digit(reference)[1]
+            # number questions
+            if include_percentage:
+                gt_result = [reference / 100, reference, reference * 100]
+            else:
+                gt_result = [reference]
+            for item in gt_result:
+                try:
+                    if isclose(item, prediction, rel_tol=tolerance):
+                        return True
+                except Exception:
+                    continue
+            return False
+    except Exception:
+        pass
+
+    if not prediction and prediction not in [0, False]:
+        return False
+
+    # 2. symbolic equal
+    reference = str(reference).strip()
+    prediction = str(prediction).strip()
+
+    ## deal with [], (), {}
+    prediction = format_intervals(prediction)
+
+    pred_str, ref_str = prediction, reference
+    if (prediction.startswith("[") and prediction.endswith("]") and
+            not reference.startswith("(")) or (prediction.startswith("(") and prediction.endswith(")") and
+                                               not reference.startswith("[")):
+        pred_str = pred_str.strip("[]()")
+        ref_str = ref_str.strip("[]()")
+    for s in ["{", "}", "(", ")"]:
+        ref_str = ref_str.replace(s, "")
+        pred_str = pred_str.replace(s, "")
+    if pred_str == ref_str:
+        return True
+
+    ## [a, b] vs. [c, d], return a==c and b==d
+    if (prediction and reference and prediction[0] in "([" and prediction[-1] in ")]" and
+            prediction[0] == reference[0] and prediction[-1] == reference[-1]):
+        pred_parts = prediction[1:-1].split(",")
+        ref_parts = reference[1:-1].split(",")
+        if len(pred_parts) == len(ref_parts):
+            if all([
+                    math_equal(pred_pt, ref_pt, include_percentage, tolerance)
+                    for pred_pt, ref_pt in zip(pred_parts, ref_parts)
+            ]):
+                return True
+
+    if "," in prediction and "," in reference:
+        pred_parts = [item.strip() for item in prediction.split(",")]
+        ref_parts = [item.strip() for item in reference.split(",")]
+
+        if len(pred_parts) == len(ref_parts):
+            if all([
+                    math_equal(pred_parts[i], ref_parts[i], include_percentage, tolerance)
+                    for i in range(len(pred_parts))
+            ]):
+                return True
+            else:
+                return False
+
+    # if we have point == tuple of values
+    if prediction.startswith("Point") and reference[0] == "(" and reference[-1] == ")":
+        pred_parts = prediction[prediction.find("(") + 1:-1].split(",")
+        ref_parts = reference[1:-1].split(",")
+        if len(pred_parts) == len(ref_parts):
+            if all([
+                    math_equal(pred_pt, ref_pt, include_percentage, tolerance)
+                    for pred_pt, ref_pt in zip(pred_parts, ref_parts)
+            ]):
+                return True
+
+    # if reference is a matrix
+    if "\begin{pmatrix}" in reference and prediction.startswith("Matrix"):
+        try:
+            pred_matrix = parse_expr(prediction)
+            ref_matrix_items = reference.split()[1:-1:2]
+            if len(pred_matrix) == len(ref_matrix_items):
+                if all([
+                        math_equal(pred, ref, include_percentage, tolerance)
+                        for ref, pred in zip(ref_matrix_items, pred_matrix)
+                ]):
+                    return True
+        except Exception:
+            pass
+    elif "\begin{pmatrix}" in reference and prediction.startswith("[") and prediction.endswith("]"):
+        if isinstance(eval(prediction), list):
+            try:
+                pred_matrix = eval(prediction)
+                # ref_matrix_items = reference.split()[1:-1:2]
+                ref_matrix_items = reference.lstrip("\\begin{pmatrix}").lstrip("\begin{pmatrix}").rstrip(
+                    "\\end{pmatrix}").rstrip("\end{pmatrix}")
+                ref_matrix_items = ref_matrix_items.split("\\")
+                ref_matrix_items = [row.split("&") if "&" in row else row for row in ref_matrix_items]
+                if len(pred_matrix) == len(ref_matrix_items):
+                    if all([
+                            math_equal(pred, ref, include_percentage, tolerance)
+                            for ref, pred in zip(ref_matrix_items, pred_matrix)
+                    ]):
+                        return True
+            except Exception:
+                pass
+
+    return symbolic_equal(prediction, reference, tolerance, timeout)
+
+
+def symbolic_equal(a, b, tolerance, timeout=10.0):
+
+    def _parse(s):
+        for f in [parse_expr, parse_latex]:
+            try:
+                with time_limit(timeout):
+                    return f(s)
+            except Exception:
+                pass
+        return s
+
+    a = _parse(a)
+    b = _parse(b)
+
+    try:
+        with time_limit(timeout):
+            if simplify(a - b) == 0:
+                return True
+    except Exception:
+        pass
+
+    try:
+        with time_limit(timeout):
+            if isclose(N(a), N(b), rel_tol=tolerance):
+                return True
+    except Exception:
+        pass
+    return False
+
+
+class TimeoutException(Exception):
+    pass
+
+
+@contextlib.contextmanager
+def time_limit(seconds: float):
+
+    def signal_handler(signum, frame):
+        raise TimeoutException("Timed out!")
+
+    signal.setitimer(signal.ITIMER_REAL, seconds)
+    signal.signal(signal.SIGALRM, signal_handler)
+    try:
+        yield
+    finally:
+        signal.setitimer(signal.ITIMER_REAL, 0)
+
+
+def format_intervals(prediction):
+    patterns = {
+        "Interval(": r"^Interval\((.*)\)$",
+        "Interval.Ropen(": r"^Interval\.Ropen\((.*)\)$",
+        "Interval.Lopen(": r"^Interval\.Lopen\((.*)\)$",
+        "Interval.open(": r"^Interval\.open\((.*)\)$",
+    }
+
+    for key, pattern in patterns.items():
+        match = re.match(pattern, prediction)
+        if match:
+            inner_content = match.group(1)
+
+            if key == "Interval(":  # Intarval(a, b) == [a, b]
+                return f"[{inner_content}]"
+            elif key == "Interval.Ropen(":  # Intarval.Ropen(a, b) == [a, b)
+                return f"[{inner_content})"
+            elif key == "Interval.Lopen(":  # Intarval.Lopen(a, b) == (a, b]
+                return f"({inner_content}]"
+            elif key == "Interval.open(":  # Intarval.open(a, b) == (a, b)
+                return f"({inner_content})"
+
+    return prediction
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_math/math_normalize.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_math/math_normalize.py
new file mode 100644
index 000000000..e9921a5ad
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/prime_math/math_normalize.py
@@ -0,0 +1,191 @@
+# Copyright 2024 PRIME team and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright (c) 2021 Dan Hendrycks
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+This logic is largely copied from the Hendrycks' MATH release (math_equivalence).
+
+From: https://github.com/openai/prm800k/blob/main/prm800k/grading/math_normalize.py
+"""
+import re
+from typing import Optional
+
+
+def normalize_answer(answer: Optional[str]) -> Optional[str]:
+    if answer is None:
+        return None
+    answer = answer.strip()
+    try:
+        # Remove enclosing `\text{}`.
+        m = re.search("^\\\\text\{(?P<text>.+?)\}$", answer)
+        if m is not None:
+            answer = m.group("text").strip()
+        return _strip_string(answer)
+    except:
+        return answer
+
+
+def _fix_fracs(string):
+    substrs = string.split("\\frac")
+    new_str = substrs[0]
+    if len(substrs) > 1:
+        substrs = substrs[1:]
+        for substr in substrs:
+            new_str += "\\frac"
+            if substr[0] == "{":
+                new_str += substr
+            else:
+                try:
+                    assert len(substr) >= 2
+                except:
+                    return string
+                a = substr[0]
+                b = substr[1]
+                if b != "{":
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += "{" + a + "}{" + b + "}" + post_substr
+                    else:
+                        new_str += "{" + a + "}{" + b + "}"
+                else:
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += "{" + a + "}" + b + post_substr
+                    else:
+                        new_str += "{" + a + "}" + b
+    string = new_str
+    return string
+
+
+def _fix_a_slash_b(string):
+    if len(string.split("/")) != 2:
+        return string
+    a = string.split("/")[0]
+    b = string.split("/")[1]
+    try:
+        a = int(a)
+        b = int(b)
+        assert string == "{}/{}".format(a, b)
+        new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
+        return new_string
+    except:
+        return string
+
+
+def _remove_right_units(string):
+    # "\\text{ " only ever occurs (at least in the val set) when describing units
+    if "\\text{ " in string:
+        splits = string.split("\\text{ ")
+        assert len(splits) == 2
+        return splits[0]
+    else:
+        return string
+
+
+def _fix_sqrt(string):
+    if "\\sqrt" not in string:
+        return string
+    splits = string.split("\\sqrt")
+    new_string = splits[0]
+    for split in splits[1:]:
+        if split[0] != "{":
+            a = split[0]
+            new_substr = "\\sqrt{" + a + "}" + split[1:]
+        else:
+            new_substr = "\\sqrt" + split
+        new_string += new_substr
+    return new_string
+
+
+def _strip_string(string):
+    # linebreaks
+    string = string.replace("\n", "")
+
+    # remove inverse spaces
+    string = string.replace("\\!", "")
+
+    # replace \\ with \
+    string = string.replace("\\\\", "\\")
+
+    # replace tfrac and dfrac with frac
+    string = string.replace("tfrac", "frac")
+    string = string.replace("dfrac", "frac")
+
+    # remove \left and \right
+    string = string.replace("\\left", "")
+    string = string.replace("\\right", "")
+
+    # Remove circ (degrees)
+    string = string.replace("^{\\circ}", "")
+    string = string.replace("^\\circ", "")
+
+    # remove dollar signs
+    string = string.replace("\\$", "")
+
+    # remove units (on the right)
+    string = _remove_right_units(string)
+
+    # remove percentage
+    string = string.replace("\\%", "")
+    string = string.replace("\%", "")
+
+    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+    string = string.replace(" .", " 0.")
+    string = string.replace("{.", "{0.")
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == ".":
+        string = "0" + string
+
+    # to consider: get rid of e.g. "k = " or "q = " at beginning
+    if len(string.split("=")) == 2:
+        if len(string.split("=")[0]) <= 2:
+            string = string.split("=")[1]
+
+    # fix sqrt3 --> sqrt{3}
+    string = _fix_sqrt(string)
+
+    # remove spaces
+    string = string.replace(" ", "")
+
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
+    string = _fix_fracs(string)
+
+    # manually change 0.5 --> \frac{1}{2}
+    if string == "0.5":
+        string = "\\frac{1}{2}"
+
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = _fix_a_slash_b(string)
+
+    return string
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/toolcall/__init__.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/toolcall/__init__.py
new file mode 100644
index 000000000..b978486e0
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/toolcall/__init__.py
@@ -0,0 +1 @@
+from .toolcall import compute_score
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/reward_score/toolcall/toolcall.py b/trainer_integration/verl/verl_custom/nvidia/reward_score/toolcall/toolcall.py
new file mode 100644
index 000000000..19162b8e5
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/reward_score/toolcall/toolcall.py
@@ -0,0 +1,101 @@
+import re
+from collections import Counter
+import json
+import os
+
+def validate_result(result, answer):
+    if len(result) == 0 or len(answer) == 0:
+        return 1 if len(result) == len(answer) else 0
+
+    try:
+        counter1_full = Counter(
+            (item["name"], json.dumps(item["arguments"], sort_keys=True)) 
+            for item in result
+        )
+        counter2_full = Counter(
+            (item["name"], json.dumps(item["arguments"], sort_keys=True)) 
+            for item in answer
+        )
+    except TypeError:
+        return 0
+
+    return 1 if counter1_full == counter2_full else 0
+
+def extract_solution(tool_call_str):
+    pattern = os.environ.get('TOOL_CALL_PATTERN', r'<tool_call>([\s\S]*?)<\/tool_call>')
+    matches = list(re.finditer(pattern, tool_call_str, flags=re.DOTALL))
+    if not matches:
+        return None, tool_call_str
+    
+    tool_calls = []
+    for match in matches:
+        content = match.group(1).strip()
+        try:
+            parsed_content = json.loads(content)
+            tool_calls.append(parsed_content)
+        except json.JSONDecodeError:
+            print(f"Failed to parse JSON: {content}")
+            return None, tool_call_str
+    
+    if len(tool_calls) == 1:
+        return tool_calls[0], tool_call_str
+    else:
+        return tool_calls, tool_call_str
+
+
+def validate_format(tool_call_list):
+    for item in tool_call_list:
+        if not isinstance(item, dict):
+            return 0
+    for item in tool_call_list:
+        if "name" not in item.keys() or "arguments" not in item.keys():
+            return 0
+    return 1
+
+def compute_score(solution_str, ground_truth):
+    """
+    Returns:
+       1  if the parsed result from 'solution_str' fully matches the 'ground_truth'
+       0  otherwise
+    """
+    # Extract the tool call result and the full output string
+    result, output_string = extract_solution(solution_str)
+
+    # Ensure the "thinking" markers are present
+    # comment out for llama-nemotron-nano
+    # if "</think>" not in output_string:
+    #     return 0
+
+    # Ensure we actually extracted something
+    if result is None:
+        return 0
+
+    # If the result is a single dictionary, wrap it in a list
+    if isinstance(result, dict):
+        result = [result]
+
+    # Validate the format of the extracted result
+    if not validate_format(result):
+        return 0
+
+    # Compare the extracted result with the ground truth
+    if validate_result(result, ground_truth) == 1:
+        return 1.0
+    else:
+        return 0 # format reward
+
+if __name__ == "__main__":
+    solution_str9 = """ <think>
+Okay, the user is asking for cat breeds native to China and Canada. Let me check the tools available. There's the 'origin' tool that fetches cat breeds by country. The parameter 'origin' has a default value of Egypt, but the user wants China and Canada. I need to call the tool twice, once for each country. Wait, can I do that? The tool might accept multiple countries, but the parameter is a string. Maybe I should pass both as separate arguments. But the tool's parameters are defined with a single 'origin' field. Hmm, maybe the API allows multiple countries separated by commas. Let me check the tool's description again. The description says "a specified country", so maybe it's for a single country. Therefore, I need to make two separate calls, one for China and one for Canada. That's the correct approach here.
+</think>
+
+<tool_call>
+{"name": "origin", "arguments": {"origin": "China"}}
+</tool_call>
+<tool_call>
+{"name": "origin", "arguments": {"origin": "Canada"}}
+</tool_call>
+"""
+    ground_truth9 = [{'name': 'origin', 'arguments': {'origin': 'China'}}, {'name': 'origin', 'arguments': {'origin': 'Canada'}}]
+
+    print(f"Example 9 compute_score: {compute_score(solution_str9, ground_truth9)}")
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/rollout/__init__.py b/trainer_integration/verl/verl_custom/nvidia/rollout/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/trainer_integration/verl/verl_custom/nvidia/rollout/async_server.py b/trainer_integration/verl/verl_custom/nvidia/rollout/async_server.py
new file mode 100644
index 000000000..71cfe360a
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/rollout/async_server.py
@@ -0,0 +1,1809 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Async LLM Server Manager for VERL Framework
+
+This module provides the AsyncLLMServerManager class that manages multiple async LLM server instances
+for distributed inference and rollout generation. It handles:
+- Server lifecycle management (start/stop/wake/sleep)
+- Request distribution and load balancing
+- Integration with OpenHands servers for multi-turn conversations
+- Message formatting and tokenization
+- Result aggregation and conversion to DataProto format
+
+The manager supports both single-node and multi-node deployments with tensor parallelism.
+"""
+
+# Standard library imports for async operations, logging, and networking
+import asyncio
+import logging
+import os
+import socket
+import threading
+from typing import Any, Dict, List, Tuple
+
+# Ray framework for distributed computing
+import ray
+from omegaconf import DictConfig
+import json
+import os
+# VERL framework imports
+from verl.protocol import DataProto  # Data protocol for tensor/non-tensor data exchange
+from verl.single_controller.ray.base import RayWorkerGroup  # Ray worker group management
+from verl.workers.rollout.async_server import async_server_class  # Async server implementation
+from async_generator import asynccontextmanager
+# Logging configuration
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "INFO"))
+
+# Chat template management for different model types
+from .chat_template_manager import get_chat_template
+
+# PyTorch and HTTP client imports
+import torch
+import aiohttp
+from verl.utils.model import compute_position_id_with_mask
+from .utils import convert_right_padding_to_left, pad_to_max_length_right
+
+
+import fastapi
+import uvicorn
+from starlette.requests import Request
+from abc import ABC, abstractmethod
+
+def _get_free_port():
+    with socket.socket() as sock:
+        sock.bind(("", 0))
+        return sock.getsockname()[1]
+
+
+class AsyncServerBase(ABC):
+    """Base class for AsyncServer."""
+
+    def __init__(self):
+        self.address = ray._private.services.get_node_ip_address()
+        self.port = None
+        self.server_ready = asyncio.Event()
+        asyncio.create_task(self._start_fastapi_server())
+
+    async def _start_fastapi_server(self):
+        @asynccontextmanager
+        async def lifespan(app: fastapi.FastAPI):
+            print(f"FastAPI listen on {self.address}:{self.port}")
+            self.server_ready.set()
+            yield
+
+            # There's no way to gracefully restart uvicorn server if port is already in use,
+            # so we exit the process directly and let AsyncLLMServerManager restart it.
+            print("FastAPI shutdown, maybe address already in use, exit process immediately.")
+            os._exit(-1)
+
+        app = fastapi.FastAPI(lifespan=lifespan)
+        app.router.add_api_route("/generate", self.generate, methods=["POST"])
+
+        self.port = _get_free_port()
+        config = uvicorn.Config(app, host=["::", "0.0.0.0"], port=self.port, log_level="warning")
+        server = uvicorn.Server(config)
+        await server.serve()
+
+    async def get_server_address(self) -> Tuple[str, int]:
+        """Get FastAPI server address."""
+        await self.server_ready.wait()
+        return f"{self.address}:{self.port}"
+
+    @abstractmethod
+    async def generate(self, raw_request: Request):
+        """OpenAI chat completion API.
+
+        API reference: https://platform.openai.com/docs/api-reference/chat/create
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def init_engine(self):
+        """Init async LLM engine."""
+        raise NotImplementedError
+
+    @abstractmethod
+    async def wake_up(self):
+        """Wake up engine to load model weights and build kv cache."""
+        raise NotImplementedError
+
+    @abstractmethod
+    async def sleep(self):
+        """Sleep engine to offload model weights and discard kv cache."""
+        raise NotImplementedError
+
+
+class AsyncLLMServerManager:
+    """
+    AsyncLLMServerManager manages a distributed group of async LLM server instances.
+    
+    This class provides a high-level interface for managing multiple vLLM server instances
+    running in a Ray cluster. It handles:
+    
+    1. Server Management:
+       - Starting/stopping LLM servers with proper resource allocation
+       - Managing server addresses and load balancing
+       - Health monitoring and restart capabilities
+    
+    2. OpenHands Integration:
+       - Distributing LLM server addresses to OpenHands servers
+       - Managing OpenHands worker pools for parallel processing
+       - Handling multi-turn conversations with tool usage
+    
+    3. Request Processing:
+       - Converting DataProto inputs to message format
+       - Distributing requests across available servers
+       - Aggregating results and converting back to DataProto
+    
+    4. Tokenization and Formatting:
+       - Applying chat templates for different model types
+       - Handling prompt/response tokenization and padding
+       - Managing sequence length constraints
+    
+    The manager supports both tensor parallel and data parallel deployments,
+    automatically detecting the optimal server configuration based on the
+    worker group topology.
+    """
+
+    def __init__(self, config: DictConfig, worker_group: RayWorkerGroup):
+        """
+        Initialize AsyncLLMServerManager with configuration and worker group.
+
+        Args:
+            config (DictConfig): Complete configuration object containing:
+                - actor_rollout_ref: Rollout configuration settings
+                - data: Data processing configuration (max lengths, etc.)
+                - model: Model configuration (path, parameters, etc.)
+            worker_group (RayWorkerGroup): Ray worker group for distributed execution
+                - Contains worker topology information
+                - Manages worker lifecycle and communication
+                
+        The initialization process:
+        1. Extracts relevant configuration parameters
+        2. Calculates tensor/data parallel topology
+        3. Starts LLM server instances on appropriate nodes
+        4. Initializes tokenizer and chat templates
+        5. Configures OpenHands server integration
+        6. Sets up sampling parameters for generation
+        """
+        # Store complete configuration and extract rollout-specific config
+        self.full_config = config
+        self.config = config.actor_rollout_ref
+        self.worker_group = worker_group
+
+        # Calculate parallel topology from worker group
+        # tensor_model_parallel_size: Number of GPUs per model instance (for large models)
+        # rollout_dp_size: Number of data parallel replicas (for throughput scaling)
+        self.rollout_tp_size = self.config.rollout.tensor_model_parallel_size
+        self.rollout_dp_size = self.worker_group.world_size // self.rollout_tp_size
+
+        # Initialize server management data structures
+        self.async_llm_servers = [None] * self.rollout_dp_size  # Ray actor references
+        self.server_addresses = [None] * self.rollout_dp_size   # HTTP server addresses
+
+        # Start LLM server instances and initialize their engines
+        self.start_llm_servers()
+        # All server instances are ready, initialize AsyncLLM engines
+        ray.get([server.init_engine.remote() for server in self.async_llm_servers])
+        
+        # Extract generation and processing parameters
+        self.num_trajectories = self.config.rollout.n  # Number of trajectories per prompt
+        self.num_val_trajectories = self.config.rollout.val_kwargs.n
+        self.remove_think_tokens = self.config.rollout.remove_think_tokens  # Token filtering
+        
+        # Get tokenizer from the first server instance via Ray remote call
+        # This ensures we use the same tokenizer as the inference engines
+        self.tokenizer = ray.get(self.async_llm_servers[0].get_tokenizer.remote())
+        self.tokenizer = self.tokenizer.tokenizer  # Extract the actual tokenizer object
+        
+        # Set sequence length constraints from configuration
+        self.max_prompt_length = self.full_config.data.max_prompt_length
+        self.max_response_length = self.full_config.data.max_response_length
+        self.total_len = self.max_prompt_length + self.max_response_length
+        self.max_starting_message_length = self.config.rollout.max_starting_message_length
+        
+        # Use CPU device for tensor operations (data preparation)
+        self.device = torch.device('cpu')
+        
+        # OpenHands worker configuration for parallel processing
+        self.openhands_num_workers = self.config.rollout.openhands_num_workers
+
+        # Extract model name from path for API identification
+        model_name = '/'.join(self.config.model.path.split('/')[-2:])
+
+        # Configure sampling parameters for generation
+        # These parameters control the randomness and quality of generated text
+        self.sampling_params = {
+            "model": f"hosted_vllm/{model_name}",
+            "api_key": "dummy_key",  # Placeholder for local servers
+            "modify_params": False,
+            "log_completions": False,
+            "native_tool_calling": self.config.rollout.get("native_tool_calling", True),
+            "temperature": self.config.rollout.temperature,  # Randomness control
+            "top_p": self.config.rollout.top_p,              # Nucleus sampling
+            "max_iterations": self.config.rollout.max_iterations,  # Max tool calls
+            "max_output_tokens": self.max_response_length,   # Response length limit
+            'token_level_generation': self.config.rollout.get("token_level_generation", False), # This is new
+            'custom_tokenizer': self.config.rollout.get("custom_tokenizer", 'Qwen/Qwen3-8B'),
+            'max_model_len': self.total_len,
+            'ensure_thinking_end_properly': False if self.config.rollout.get("token_level_generation", False) else True,
+        }
+        self.val_sampling_params = self.sampling_params.copy()
+        self.val_sampling_params['temperature'] = self.config.rollout.val_kwargs.temperature
+        self.val_sampling_params['top_p'] = self.config.rollout.val_kwargs.top_p
+        
+        # Parse and validate OpenHands server addresses
+        # OpenHands servers handle multi-turn conversations with tool usage
+        openhands_base_urls = self.config.rollout.openhands_base_url
+        if isinstance(openhands_base_urls, str):
+            # Support multiple URLs separated by '+' 
+            self.openhands_urls = [url.strip() for url in openhands_base_urls.split('+') if url.strip()]
+        else:
+            self.openhands_urls = [openhands_base_urls] if openhands_base_urls else []
+        
+        # Initialize OpenHands servers by clearing existing LLM servers
+        asyncio.run(self.clear_llm_servers())
+        
+        # Distribute LLM server addresses to OpenHands servers for load balancing
+        self._send_llm_addresses_to_openhands()
+        
+        # Initialize chat template for message formatting
+        # Different models require different conversation formats
+        self.chat_template_name = getattr(self.config.rollout, 'chat_template_name', 'qwen3_chat_template_generation')
+        self.chat_template = get_chat_template(self.chat_template_name)
+        logger.info(f"Using chat template: {self.chat_template_name}")
+
+    def start_llm_servers(self):
+        """
+        Start LLM server instances across the Ray cluster with proper resource allocation.
+        
+        This method handles the complex process of starting distributed LLM servers:
+        
+        1. Resource Discovery:
+           - Queries the Ray register center for worker node information
+           - Determines optimal server placement based on worker topology
+           
+        2. Server Instantiation:
+           - Creates async server instances using the appropriate backend
+           - Applies node affinity scheduling to colocate servers with workers
+           - Handles naming and resource allocation
+           
+        3. Error Handling and Retry:
+           - Implements retry logic for address conflicts
+           - Gracefully handles server startup failures
+           - Ensures all servers are successfully started before proceeding
+           
+        4. Address Management:
+           - Collects and stores server HTTP addresses
+           - Maintains mapping between data parallel ranks and addresses
+           
+        The method uses a retry loop to handle common startup issues like
+        port conflicts, ensuring robust server initialization.
+        """
+        # Get worker information from Ray register center
+        # This provides the mapping between worker ranks and node IDs
+        register_center = ray.get_actor(f"{self.worker_group.name_prefix}_register_center")
+        workers_info = ray.get(register_center.get_worker_info.remote())
+        assert len(workers_info) == self.worker_group.world_size
+        
+        # Get the appropriate server class based on rollout backend configuration
+        # This allows switching between different inference backends (vLLM, TensorRT-LLM, etc.)
+        if self.config.rollout.get("token_level_generation", False):
+            from verl_custom.nvidia.rollout.vllm_async_server import AsyncvLLMServer as server_class
+        else:
+            server_class = async_server_class(
+                rollout_backend=self.config.rollout.name,
+            )
+
+        # Start all server instances with retry logic for address conflicts
+        # Track which data parallel ranks still need successful server startup
+        unready_dp_ranks = set(range(self.rollout_dp_size))
+        
+        while len(unready_dp_ranks) > 0:
+            # Create server instances for all unready ranks
+            servers = {
+                rollout_dp_rank: server_class.options(
+                    # Ensure AsyncvLLMServer colocates with its corresponding workers
+                    # This optimizes memory usage and reduces network overhead
+                    scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
+                        # Use the first worker in the tensor parallel group for this DP rank
+                        node_id=workers_info[rollout_dp_rank * self.rollout_tp_size],
+                        soft=False,  # Hard constraint - must run on this node
+                    ),
+                    # Assign unique name for Ray actor registry
+                    name=f"async_llm_server_{rollout_dp_rank}",
+                ).remote(
+                    self.full_config,           # Complete configuration
+                    self.rollout_dp_size,       # Total number of data parallel replicas
+                    rollout_dp_rank,            # This server's data parallel rank
+                    self.worker_group.name_prefix  # Worker group identifier
+                )
+                for rollout_dp_rank in unready_dp_ranks
+            }
+
+            # Attempt to start each server and collect successful addresses
+            for rollout_dp_rank, server in servers.items():
+                try:
+                    # Get the HTTP server address from the started server
+                    # This is a blocking call that waits for server initialization
+                    address = ray.get(server.get_server_address.remote())
+                    
+                    # Store successful server reference and address
+                    self.server_addresses[rollout_dp_rank] = address
+                    self.async_llm_servers[rollout_dp_rank] = server
+                    
+                    # Remove from unready set - this server is now operational
+                    unready_dp_ranks.remove(rollout_dp_rank)
+                    
+                except Exception:
+                    # Server startup failed (likely port conflict), clean up and retry
+                    ray.kill(server)
+                    logger.error(f"rollout server {rollout_dp_rank} failed, maybe address already in use, restarting...")
+
+    def assign_llm_addresses_to_openhands(self):
+        """
+        Distribute LLM server addresses to OpenHands servers for optimal load balancing.
+        
+        This method implements a two-phase assignment strategy:
+        
+        Phase 1 - Locality-First Assignment:
+        - Assigns LLM servers to OpenHands servers running on the same IP address
+        - Minimizes network latency and maximizes bandwidth utilization
+        - Reduces cross-node communication overhead
+        
+        Phase 2 - Load Distribution:
+        - Distributes remaining LLM servers evenly across all OpenHands servers
+        - Ensures balanced workload distribution
+        - Handles cases where server counts don't divide evenly
+        
+        Returns:
+            dict: Mapping of OpenHands server URLs to their assigned LLM server addresses
+            
+        The assignment algorithm prioritizes network locality while maintaining
+        load balance across the entire OpenHands server pool.
+        """
+        
+        def url_to_ip(url):
+            """Extract IP address from URL by removing protocol and port."""
+            # Remove HTTP/HTTPS protocol prefixes
+            url = url.replace("http://", "").replace("https://", "")
+            # Remove port number if present
+            url = url.split(":")[0]
+            return url
+        
+        # Create IP address mappings for both OpenHands and LLM servers
+        openhands_urls2ips = {url: url_to_ip(url) for url in self.openhands_urls}
+        server_addresses2ips = {url: url_to_ip(url) for url in self.server_addresses}
+        
+        num_openhands = len(self.openhands_urls)
+        num_llm_servers = len(self.server_addresses)
+        
+        # Initialize assignment tracking
+        address_assignments = {}
+        used_addresses = set()
+        
+        # Phase 1: Assign LLM servers to OpenHands servers on the same IP
+        # This optimizes for network locality and reduces latency
+        recurrent_pointer=0
+        for i, openhands_url in enumerate(self.openhands_urls):            
+            assigned_addresses = []
+            
+            # Find all LLM servers on the same IP as this OpenHands server
+            for server_address in self.server_addresses:
+                if server_addresses2ips[server_address] == openhands_urls2ips[openhands_url]:
+                    assigned_addresses.append(server_address)
+                    used_addresses.add(server_address)
+            num_llm_servers_per_openhands=self.full_config.trainer.n_gpus_per_node//self.config.rollout.tensor_model_parallel_size
+            # if len(assigned_addresses)<num_llm_servers_per_openhands, then we need to assign extra addresses
+            if len(assigned_addresses)<num_llm_servers_per_openhands:
+                if recurrent_pointer+num_llm_servers_per_openhands-len(assigned_addresses)>num_llm_servers:
+                    assigned_addresses.extend(self.server_addresses[recurrent_pointer:])
+                    assigned_addresses.extend(self.server_addresses[0:num_llm_servers_per_openhands-len(assigned_addresses)])
+                    recurrent_pointer = num_llm_servers_per_openhands-len(assigned_addresses)
+                else:
+                    assigned_addresses.extend(self.server_addresses[recurrent_pointer:recurrent_pointer+num_llm_servers_per_openhands-len(assigned_addresses)])
+                    recurrent_pointer += num_llm_servers_per_openhands-len(assigned_addresses)
+            address_assignments[openhands_url] = assigned_addresses
+            logger.info(f"Assigned same-IP LLM server addresses to {openhands_url}: {assigned_addresses}")
+        
+        # Phase 2: Distribute remaining LLM servers evenly among all OpenHands servers
+        remaining_addresses = [addr for addr in self.server_addresses if addr not in used_addresses]
+        
+        if remaining_addresses:
+            logger.info(f"Distributing {len(remaining_addresses)} remaining LLM servers among {num_openhands} OpenHands servers")
+            
+            # Calculate distribution parameters
+            additional_per_openhands = len(remaining_addresses) // num_openhands
+            remainder = len(remaining_addresses) % num_openhands
+            
+            # Distribute remaining addresses with even load balancing
+            addr_index = 0
+            for i, openhands_url in enumerate(self.openhands_urls):
+                # Calculate how many additional addresses this OpenHands server gets
+                num_additional = additional_per_openhands
+                if i < remainder:  # First 'remainder' servers get one extra
+                    num_additional += 1
+                
+                # Assign additional addresses to this OpenHands server
+                for j in range(num_additional):
+                    if addr_index < len(remaining_addresses):
+                        address_assignments[openhands_url].append(remaining_addresses[addr_index])
+                        addr_index += 1
+                
+                logger.info(f"Final LLM server addresses assigned to {openhands_url}: {address_assignments[openhands_url]}")
+        
+        # Log assignment summary for monitoring and debugging
+        total_assigned = sum(len(addrs) for addrs in address_assignments.values())
+        logger.info(f"Assignment complete: {total_assigned}/{num_llm_servers} LLM servers assigned to {num_openhands} OpenHands servers")
+        
+        return address_assignments
+    
+    def _send_llm_addresses_to_openhands(self):
+        """
+        Distribute and send LLM server addresses to multiple OpenHands servers.
+        
+        This method orchestrates the distribution of LLM server addresses to OpenHands servers
+        for load balancing and fault tolerance. It performs the following steps:
+        
+        1. Validates that OpenHands servers are configured
+        2. Calculates optimal address assignments using locality-aware algorithm
+        3. Sends addresses asynchronously to all OpenHands servers
+        4. Provides detailed logging for monitoring and debugging
+        
+        The method ensures that each OpenHands server receives appropriate LLM server
+        addresses, enabling distributed processing of conversation requests.
+        """
+        if not self.openhands_urls:
+            logger.warning("No OpenHands base URLs configured, skipping LLM address distribution")
+            raise ValueError("No OpenHands base URLs configured. Please set actor_rollout_ref.rollout.openhands_base_url.")
+            
+        # Calculate optimal address assignments based on network topology
+        address_assignments = self.assign_llm_addresses_to_openhands()
+        
+        # Send addresses asynchronously to all OpenHands servers
+        asyncio.run(self._send_addresses_async(address_assignments))
+        logger.info("Successfully sent LLM addresses to all OpenHands servers")
+
+    async def _send_addresses_async(self, address_assignments: Dict[str, List[str]]):
+        """
+        Asynchronously send LLM addresses to multiple OpenHands servers.
+        
+        This method implements concurrent address distribution to minimize setup time:
+        
+        1. Creates async tasks for each LLM server address assignment
+        2. Sends all requests concurrently using asyncio.gather
+        3. Handles exceptions gracefully and reports success/failure statistics
+        4. Provides detailed logging for monitoring
+        
+        Args:
+            address_assignments (Dict[str, List[str]]): Mapping of OpenHands server URLs
+                to their assigned LLM server addresses
+                
+        The concurrent approach significantly reduces setup time compared to
+        sequential address distribution, especially with many servers.
+        """
+        tasks = []
+        
+        # Create async tasks for each address assignment
+        for openhands_url, addresses in address_assignments.items():
+            for address in addresses:
+                wrapped_address = f"http://{address}" if self.config.rollout.get("token_level_generation", False) else f"http://{address}/v1"
+                task = self._add_llm_server_to_openhands(openhands_url, wrapped_address)
+                tasks.append(task)
+        
+        # Send all requests concurrently for maximum efficiency
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        
+        # Process results and provide detailed feedback
+        success_count = 0
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                logger.error(f"Failed to send address {i}: {result}")
+            else:
+                success_count += 1
+        
+        logger.info(f"Successfully sent {success_count}/{len(results)} LLM addresses to OpenHands servers")
+
+    async def _add_llm_server_to_openhands(self, openhands_base_url: str, address: str):
+        """
+        Send a single LLM server address to an OpenHands server.
+        
+        This method handles the HTTP communication with OpenHands servers to register
+        LLM server addresses. It includes comprehensive error handling and timeout management.
+        
+        Args:
+            openhands_base_url (str): Base URL of the OpenHands server
+            address (str): HTTP address of the LLM server to register
+            
+        Returns:
+            dict: Response from the OpenHands server on successful registration
+            
+        Raises:
+            Exception: On HTTP errors, timeouts, or communication failures
+            
+        The method uses proper session management and timeout handling to ensure
+        reliable communication with OpenHands servers.
+        """
+        try:
+            # Configure reasonable timeout for server communication
+            timeout = aiohttp.ClientTimeout(total=30)
+            session = aiohttp.ClientSession(timeout=timeout)
+            
+            try:
+                # Prepare request URL and payload
+                url = f"{openhands_base_url}/add_llm_server"
+                payload = {"address": address}
+                
+                # Send POST request to register LLM server
+                async with session.post(url, json=payload) as response:
+                    if response.status == 200:
+                        result = await response.json()
+                        logger.info(f"LLM server {address} added successfully to {openhands_base_url}: {result}")
+                        return result
+                    else:
+                        # Handle HTTP error responses
+                        error_text = await response.text()
+                        logger.error(f"Failed to add LLM server {address} to {openhands_base_url}, HTTP {response.status}: {error_text}")
+                        raise Exception(f"HTTP {response.status}: {error_text}")
+                        
+            finally:
+                # Ensure session is properly closed
+                await session.close()
+                
+        except asyncio.TimeoutError:
+            error_msg = f"Timeout adding LLM server {address} to {openhands_base_url}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+        except Exception as e:
+            logger.error(f"Error adding LLM server {address} to {openhands_base_url}: {e}")
+            raise
+
+    def wake_up(self):
+        """
+        Wake up all vLLM instances from sleep mode.
+        
+        This method activates all LLM servers that may have been put into sleep mode
+        for resource conservation. It's useful for resuming operations after
+        periods of inactivity.
+        
+        The wake_up operation is performed synchronously across all servers to ensure
+        they are ready before returning control to the caller.
+        """
+        ray.get([server.wake_up.remote() for server in self.async_llm_servers])
+
+    def sleep(self):
+        """
+        Put all vLLM instances into sleep mode.
+        
+        This method puts all LLM servers into a low-power state to conserve resources
+        when they are not actively processing requests. This is useful for:
+        - Reducing GPU memory usage during idle periods
+        - Conserving power in multi-tenant environments
+        - Allowing other processes to use GPU resources temporarily
+        
+        The sleep operation is performed synchronously across all servers.
+        """
+        ray.get([server.sleep.remote() for server in self.async_llm_servers])
+
+    def _convert_results_to_dataproto(self, results, input_batch, val_mode=False) -> DataProto:
+        """
+        Convert OpenHands conversation results to DataProto format for training.
+        
+        This method performs complex data transformation to convert conversation results
+        from OpenHands servers into the structured DataProto format required by the
+        training pipeline. The conversion process includes:
+        
+        1. Result Organization:
+           - Groups results by instance ID and trajectory ID
+           - Handles missing or empty results with intelligent fallback
+           - Maintains consistent ordering matching the input batch
+        
+        2. Message Processing:
+           - Extracts conversation messages from results
+           - Separates prompts from responses based on assistant messages
+           - Handles multi-turn conversations with tool usage
+        
+        3. Tokenization:
+           - Applies chat templates for proper conversation formatting
+           - Tokenizes prompts and responses with appropriate masks
+           - Handles different tokenization requirements for prompts vs responses
+        
+        4. Tensor Creation:
+           - Pads sequences to consistent lengths
+           - Converts to PyTorch tensors with proper attention masks
+           - Computes position IDs for transformer models
+        
+        5. Data Packaging:
+           - Combines tensor and non-tensor data into DataProto
+           - Includes metadata like success flags, error messages, etc.
+           - Maintains batch structure for distributed training
+        
+        Args:
+            results (dict): Dictionary with structure {instance_id: {trajectory_id: result_dict}}
+                Each result_dict contains:
+                - messages: List of conversation messages
+                - resolved: Boolean indicating if task was completed
+                - success: Boolean indicating if execution succeeded
+                - error: Optional error message
+                - tools: Optional tool definitions
+                
+        Returns:
+            DataProto: Structured data containing:
+                - Tensor data: input_ids, attention_mask, position_ids, etc.
+                - Non-tensor data: success flags, error messages, metadata
+                - Proper formatting for downstream training/evaluation
+        """
+        # Initialize lists for non-tensor data collection
+        success_list = []
+        error_list = []
+        resolved_list = []
+        has_finish_action_list = []
+        
+        # Create a mapping of instance_id -> list of trajectories for organization
+        instance_trajectories = {}
+        for instance_id, trajectories in results.items():
+            instance_trajectories[instance_id] = []
+            for trajectory_id, result in trajectories.items():
+                instance_trajectories[instance_id].append(result)
+
+        # Create final results in the same order as the input batch
+        # This ensures consistent ordering for training
+        matched_results = []
+        instance_list = []
+        for batch_item in input_batch:
+            instance_id = batch_item.non_tensor_batch['instance']['instance_id']
+            instance = batch_item.non_tensor_batch['instance']
+            if instance_id in instance_trajectories:
+                # Add all trajectories for this instance
+                traj_results = instance_trajectories[instance_id]
+                matched_results.extend(traj_results)
+                instance_list.extend([instance] * len(traj_results))
+        
+        # Validate that we have the expected number of results
+        expected_count = self.num_trajectories * len(input_batch) if not val_mode else len(input_batch) * self.num_val_trajectories
+        assert len(matched_results) == expected_count, f"Expected {expected_count} results, got {len(matched_results)}"
+        
+        # Group results by instance_id for intelligent message handling
+        results_by_instance = {}
+        for i, result in enumerate(matched_results):
+            instance_id = instance_list[i]['instance_id']
+            if instance_id not in results_by_instance:
+                results_by_instance[instance_id] = []
+            results_by_instance[instance_id].append((i, result))
+        
+        # Handle empty messages by copying from another trajectory of the same instance
+        # This provides robustness against individual trajectory failures
+        for instance_id, results in results_by_instance.items():
+            # Find a valid messages list to use as fallback
+            valid_messages = None
+            for _, result in results:
+                messages = result.get('messages', [])
+                if messages and len(messages) > 0:
+                    valid_messages = messages
+                    valid_resolved = result.get('resolved', False)
+                    valid_finish = result.get('finish', False)
+                    valid_error = result.get('error', None)
+                    break
+            
+            # If we found valid messages, use them for trajectories with empty messages
+            if valid_messages:
+                for idx, result in results:
+                    if not result.get('messages') or len(result.get('messages', [])) == 0:
+                        print(f"Got empty messages for instance_id {instance_id}, trajectory {idx}. "
+                              f"Copying messages array from a valid trajectory.")
+                        # Copy messages from the valid trajectory
+                        matched_results[idx]['messages'] = valid_messages.copy()
+                        matched_results[idx]['resolved'] = valid_resolved
+                        matched_results[idx]['error'] = valid_error
+                        matched_results[idx]['finish'] = valid_finish
+                        matched_results[idx]['is_padded'] = True  # Mark as padded sample
+        
+        # Initialize data structures for tokenization and tensor creation
+        all_messages = []
+        all_prompts = []
+        all_responses = []
+        prompt_encodings = {'input_ids': [], 'attention_mask': []}
+        response_encodings = {'input_ids': [], 'attention_mask': [], 'assistant_masks': []}
+        is_padded_list = []  # Track which samples are padded for training logic
+        error_mask_list = []  # Track which samples are padded for training logic
+
+        # Process each result to extract and tokenize messages
+        for result in matched_results:
+            messages = result.get('messages', [])
+            all_messages.append(messages)
+            
+            # Check if this is a padded sample (copied from another trajectory)
+            is_padded = result.get('is_padded', False)
+            is_padded_list.append(is_padded)
+            error_mask_list.append(result.get('error', None) is not None and result.get('error', None) != '')
+            
+            # Separate prompt from response based on first assistant message
+            # The prompt includes all messages before the first assistant response
+            starting_index = 0
+            for i, msg in enumerate(messages):
+                if msg["role"] == 'assistant':
+                    starting_index = i
+                    break
+                    
+            if starting_index == 0:
+                # If no assistant message found, treat all messages as prompts
+                print(f'ERROR: Found no assistant message. len(messages) == {len(messages)} '
+                      f'and roles are {[msg["role"] for msg in messages]}')
+                starting_index = len(messages)
+                
+            # Split into prompt and response parts
+            prompt = messages[:starting_index]
+            response = messages[starting_index:]
+            all_prompts.append(prompt)
+            all_responses.append(response)
+
+            # Collect non-tensor metadata
+            success_list.append(result.get('success', False))
+            error_list.append(result.get('error', None))
+            resolved_list.append(result.get('resolved', False))
+            has_finish_action_list.append(result.get('finish', False))
+            
+            # Get tool definitions for this conversation
+            tools = result.get('tools', None)
+            
+            # Tokenize the prompt using chat template
+            prompt_encoding = self.tokenizer.apply_chat_template(
+                prompt, 
+                add_generation_prompt=False,
+                return_dict=True,
+                tools=tools,
+                chat_template=self.chat_template
+            )
+            
+            # Tokenize the full conversation with assistant mask
+            message_encoding = self.tokenizer.apply_chat_template(
+                messages, 
+                add_generation_prompt=False,
+                return_assistant_tokens_mask=True,  # Mark assistant tokens for loss calculation
+                return_dict=True,
+                tools=tools,
+                chat_template=self.chat_template
+            )
+            
+            # Extract response encoding by removing prompt portion
+            response_encoding = dict()
+            prompt_length = len(prompt_encoding['input_ids'])
+            response_encoding['input_ids'] = message_encoding['input_ids'][prompt_length:]
+            response_encoding['attention_mask'] = message_encoding['attention_mask'][prompt_length:]
+            response_encoding['assistant_masks'] = message_encoding['assistant_masks'][prompt_length:]
+            
+            # Store encodings for batch processing
+            response_encodings['input_ids'].append(response_encoding['input_ids'])
+            response_encodings['attention_mask'].append(response_encoding['attention_mask'])
+            response_encodings['assistant_masks'].append(response_encoding['assistant_masks'])
+            prompt_encodings['input_ids'].append(prompt_encoding['input_ids'])
+            prompt_encodings['attention_mask'].append(prompt_encoding['attention_mask'])
+            
+        # Pad sequences to consistent lengths for batch processing
+        
+        # Pad prompts to maximum prompt length in batch
+        max_len_prompt = max(len(input_ids) for input_ids in prompt_encodings['input_ids'])
+        for idx in range(len(prompt_encodings['input_ids'])):
+            pad_length = max_len_prompt - len(prompt_encodings['input_ids'][idx])
+            prompt_encodings['input_ids'][idx].extend([self.tokenizer.pad_token_id] * pad_length)
+            prompt_encodings['attention_mask'][idx].extend([0] * pad_length)
+            
+        # Pad responses to maximum response length in batch
+        max_len_response = max(len(input_ids) for input_ids in response_encodings['input_ids'])
+        for idx in range(len(response_encodings['input_ids'])):
+            pad_length = max_len_response - len(response_encodings['input_ids'][idx])
+            response_encodings['input_ids'][idx].extend([self.tokenizer.pad_token_id] * pad_length)
+            response_encodings['attention_mask'][idx].extend([0] * pad_length)
+            response_encodings['assistant_masks'][idx].extend([0] * pad_length)
+        
+        # Convert to PyTorch tensors and apply proper padding
+        prompt_input_ids = torch.tensor(prompt_encodings['input_ids'], device=self.device)
+        prompt_attention_mask = torch.tensor(prompt_encodings['attention_mask'], device=self.device)
+        
+        # Convert right padding to left padding for prompts (required by some models)
+        prompt_input_ids, prompt_attention_mask = convert_right_padding_to_left(
+            self.tokenizer, prompt_input_ids, prompt_attention_mask, 
+            self.device, self.max_starting_message_length
+        )
+        
+        # Pad responses to total sequence length
+        response_ids, response_attention_mask, response_assistant_mask = pad_to_max_length_right(
+            self.tokenizer, response_encodings, self.total_len, self.device
+        )
+        
+        # Combine prompts and responses into full sequences
+        input_ids = torch.cat([prompt_input_ids, response_ids], dim=1)
+        attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1)
+        position_ids = compute_position_id_with_mask(attention_mask)
+
+        # Log tensor shapes for debugging and validation
+        logger.info(f"input_ids shape: {input_ids.shape}, response_ids shape: {response_ids.shape}, "
+                   f"max_starting_message_length: {self.max_starting_message_length}, "
+                   f"max_response_length: {self.total_len}")
+        
+        # Validate tensor shape consistency
+        assert input_ids.shape[1] == attention_mask.shape[1] == position_ids.shape[1], \
+            f"Shape mismatch: input_ids {input_ids.shape}, attention_mask {attention_mask.shape}, " \
+            f"position_ids {position_ids.shape}"
+        assert response_ids.shape[1] == response_assistant_mask.shape[1], \
+            f"Response shape mismatch: response_ids {response_ids.shape}, " \
+            f"response_assistant_mask {response_assistant_mask.shape}"
+        
+        # Create tensor dictionary for training
+        tensor_dict = {
+            'input_ids': input_ids,                    # Full sequence tokens
+            'responses': response_ids,                 # Response tokens only
+            'attention_mask': attention_mask,          # Attention mask for full sequence
+            'position_ids': position_ids,              # Position IDs for transformer
+            'loss_mask': response_assistant_mask,      # Mask for loss calculation
+            'is_padded': torch.tensor(is_padded_list, device=self.device),              # Padding indicators for training
+            'error_mask': torch.tensor(error_mask_list, device=self.device),           # Error mask for training
+        }
+
+        # Create non-tensor dictionary for metadata
+        non_tensor_dict = {
+            'success': success_list,                   # Task completion status
+            'error': error_list,                       # Error messages if any
+            'instance': instance_list,                 # Original instance data
+            'resolved': resolved_list,                 # Problem resolution status
+            'finish': has_finish_action_list,          # Finish action detection
+        }
+        
+        # Create and return DataProto with combined tensor and non-tensor data
+        result_dataproto = DataProto.from_dict(
+            tensors=tensor_dict,
+            non_tensors=non_tensor_dict
+        )
+        
+        return result_dataproto
+
+
+    def _convert_results_to_dataproto_token(self, results, input_batch, val_mode=False) -> DataProto:
+        """
+        Convert OpenHands conversation results to DataProto format for training.
+        
+        This method is another version of _convert_results_to_dataproto, where the messages contains token ids.
+        Args:
+            results (dict): Dictionary with structure {instance_id: {trajectory_id: result_dict}}
+                Each result_dict contains:
+                - messages: List of conversation messages, each message contains token ids.
+                - resolved: Boolean indicating if task was completed
+                - success: Boolean indicating if execution succeeded
+                - error: Optional error message
+                - tools: Optional tool definitions
+                
+        Returns:
+            DataProto: Structured data containing:
+                - Tensor data: input_ids, attention_mask, position_ids, etc.
+                - Non-tensor data: success flags, error messages, metadata
+                - Proper formatting for downstream training/evaluation
+        """
+        # Initialize lists for non-tensor data collection
+        success_list = []
+        error_list = []
+        resolved_list = []
+        has_finish_action_list = []
+        
+        # Create a mapping of instance_id -> list of trajectories for organization
+        instance_trajectories = {}
+        for instance_id, trajectories in results.items():
+            instance_trajectories[instance_id] = []
+            for trajectory_id, result in trajectories.items():
+                instance_trajectories[instance_id].append(result)
+
+        # Create final results in the same order as the input batch
+        # This ensures consistent ordering for training
+        matched_results = []
+        instance_list = []
+        for batch_item in input_batch:
+            instance_id = batch_item.non_tensor_batch['instance']['instance_id']
+            instance = batch_item.non_tensor_batch['instance']
+            if instance_id in instance_trajectories:
+                # Add all trajectories for this instance
+                traj_results = instance_trajectories[instance_id]
+                matched_results.extend(traj_results)
+                instance_list.extend([instance] * len(traj_results))
+        
+        # Validate that we have the expected number of results
+        expected_count = self.num_trajectories * len(input_batch) if not val_mode else len(input_batch) * self.num_val_trajectories
+        assert len(matched_results) == expected_count, f"Expected {expected_count} results, got {len(matched_results)}"
+        
+        # Group results by instance_id for intelligent message handling
+        results_by_instance = {}
+        for i, result in enumerate(matched_results):
+            instance_id = instance_list[i]['instance_id']
+            if instance_id not in results_by_instance:
+                results_by_instance[instance_id] = []
+            results_by_instance[instance_id].append((i, result))
+        
+        # Handle empty messages by copying from another trajectory of the same instance
+        # This provides robustness against individual trajectory failures
+        # if all trajectories are empty, we will use the valid messages from other instances
+        global_valid_messages = None
+        for instance_id, results in results_by_instance.items():
+            # Find a valid messages list to use as fallback
+            valid_messages = None
+            for _, result in results:
+                messages = result.get('messages', [])
+                if messages and len(messages) > 0:
+                    valid_messages = messages
+                    valid_resolved = result.get('resolved', False)
+                    valid_finish = result.get('finish', False)
+                    valid_error = result.get('error', None)
+                    global_valid_messages = messages
+                    break
+            
+            # If we found valid messages, use them for trajectories with empty messages
+            if valid_messages or global_valid_messages:
+                for idx, result in results:
+                    if not result.get('messages') or len(result.get('messages', [])) == 0:
+                        print(f"Got empty messages for instance_id {instance_id}, trajectory {idx}. "
+                              f"Copying messages array from a valid trajectory.")
+                        # Copy messages from the valid trajectory
+                        matched_results[idx]['messages'] = valid_messages.copy() if valid_messages else global_valid_messages.copy()
+                        matched_results[idx]['resolved'] = valid_resolved
+                        matched_results[idx]['error'] = valid_error
+                        matched_results[idx]['finish'] = valid_finish
+                        matched_results[idx]['is_padded'] = True  # Mark as padded sample
+        
+        # Initialize data structures for tokenization and tensor creation
+        prompt_encodings = {'input_ids': [], 'attention_mask': [], 'log_probs': []}
+        response_encodings = {'input_ids': [], 'attention_mask': [], 'assistant_masks': [], 'log_probs': []}
+        is_padded_list = []  # Track which samples are padded for training logic
+        error_mask_list = []  # Track which samples are padded for training logic
+
+        # Process each result to extract and tokenize messages
+        for result in matched_results:
+            messages = result.get('messages', [])
+            
+            # Check if this is a padded sample (copied from another trajectory)
+            is_padded = result.get('is_padded', False)
+            is_padded_list.append(is_padded)
+            error_mask_list.append(result.get('error', None) is not None and result.get('error', None) != '')
+            
+            # Separate prompt from response based on first assistant message
+            # The prompt includes all messages before the first assistant response
+            starting_index = 0
+            for i, msg in enumerate(messages):
+                if msg["role"] == 'assistant':
+                    starting_index = i
+                    break
+                    
+            if starting_index == 0:
+                # If no assistant message found, treat all messages as prompts
+                print(f'ERROR: Found no assistant message. len(messages) == {len(messages)} '
+                      f'and roles are {[msg["role"] for msg in messages]}')
+                starting_index = len(messages)
+            # if the last message is not an assistant message, we will remove it
+            while messages and messages[-1]["role"] != "assistant":
+                messages = messages[:-1]
+            assert messages[-1]["role"] == "assistant", f"The last message should be an assistant message. " \
+                f"len(messages) == {len(messages)} and roles are {[msg['role'] for msg in messages]}"
+            
+            all_ids = [msg['token_ids'] for msg in messages]
+            all_log_probs = [msg['logprobs'] if 'logprobs' in msg and msg['logprobs'] is not None else [0.0] * len(msg['token_ids']) for msg in messages]
+            # find assistant turn indices
+            assistant_turn_indices = []
+            for i, msg in enumerate(messages):
+                if msg["role"] == 'assistant':
+                    assistant_turn_indices.append(i)
+            assistant_masks = [[0] * len(input_ids) for input_ids in all_ids]
+            for i in assistant_turn_indices:
+                assistant_masks[i] = [1] * len(all_ids[i])
+            
+            # Split into prompt and response parts
+            prompt_ids = all_ids[:starting_index]
+            prompt_ids = sum(prompt_ids, [])
+            response_ids = all_ids[starting_index:]
+            response_ids = sum(response_ids, [])
+            assistant_masks = sum(assistant_masks, [])
+            len_prompt_ids = len(prompt_ids)
+            assistant_masks = assistant_masks[len_prompt_ids:]
+            attention_mask = [[1] * len(input_ids) for input_ids in all_ids]
+            attention_mask = sum(attention_mask, [])
+            prompt_attention_mask = attention_mask[:len_prompt_ids]
+            response_attention_mask = attention_mask[len_prompt_ids:]
+
+            response_log_probs = all_log_probs[starting_index:]
+            response_log_probs = sum(response_log_probs, [])
+
+            # Collect non-tensor metadata
+            success_list.append(result.get('success', False))
+            error_list.append(result.get('error', None))
+            resolved_list.append(result.get('resolved', False))
+            has_finish_action_list.append(result.get('finish', False))
+            
+            # Store encodings for batch processing
+            response_encodings['input_ids'].append(response_ids)
+            response_encodings['attention_mask'].append(response_attention_mask)
+            response_encodings['assistant_masks'].append(assistant_masks)
+            response_encodings['log_probs'].append(response_log_probs)
+            prompt_encodings['input_ids'].append(prompt_ids)
+            prompt_encodings['attention_mask'].append(prompt_attention_mask)
+            
+        # Pad sequences to consistent lengths for batch processing
+        
+        # Pad prompts to maximum prompt length in batch
+        max_len_prompt = max(len(input_ids) for input_ids in prompt_encodings['input_ids'])
+        for idx in range(len(prompt_encodings['input_ids'])):
+            pad_length = max_len_prompt - len(prompt_encodings['input_ids'][idx])
+            prompt_encodings['input_ids'][idx].extend([self.tokenizer.pad_token_id] * pad_length)
+            prompt_encodings['attention_mask'][idx].extend([0] * pad_length)
+            
+        # Pad responses to maximum response length in batch
+        max_len_response = max(len(input_ids) for input_ids in response_encodings['input_ids'])
+        for idx in range(len(response_encodings['input_ids'])):
+            pad_length = max_len_response - len(response_encodings['input_ids'][idx])
+            response_encodings['input_ids'][idx].extend([self.tokenizer.pad_token_id] * pad_length)
+            response_encodings['attention_mask'][idx].extend([0] * pad_length)
+            response_encodings['assistant_masks'][idx].extend([0] * pad_length)
+            response_encodings['log_probs'][idx].extend([0.0] * pad_length)
+
+        # Convert to PyTorch tensors and apply proper padding
+        prompt_input_ids = torch.tensor(prompt_encodings['input_ids'], device=self.device)
+        prompt_attention_mask = torch.tensor(prompt_encodings['attention_mask'], device=self.device)
+        # Convert right padding to left padding for prompts (required by some models)
+        prompt_input_ids, prompt_attention_mask = convert_right_padding_to_left(
+            self.tokenizer, prompt_input_ids, prompt_attention_mask,
+            self.device, self.max_starting_message_length
+        )
+
+        # Pad responses to total sequence length
+        response_ids, response_attention_mask, response_assistant_mask, response_log_probs = pad_to_max_length_right(
+            self.tokenizer, response_encodings, self.total_len, self.device
+        )
+        
+        # Combine prompts and responses into full sequences
+        input_ids = torch.cat([prompt_input_ids, response_ids], dim=1)
+        attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1)
+        position_ids = compute_position_id_with_mask(attention_mask)
+
+        # Log tensor shapes for debugging and validation
+        logger.info(f"input_ids shape: {input_ids.shape}, response_ids shape: {response_ids.shape}, "
+                   f"max_starting_message_length: {self.max_starting_message_length}, "
+                   f"max_response_length: {self.total_len}")
+        
+        # Validate tensor shape consistency
+        assert input_ids.shape[1] == attention_mask.shape[1] == position_ids.shape[1], \
+            f"Shape mismatch: input_ids {input_ids.shape}, attention_mask {attention_mask.shape}, " \
+            f"position_ids {position_ids.shape}"
+        assert response_ids.shape[1] == response_assistant_mask.shape[1] == response_log_probs.shape[1], \
+            f"Response shape mismatch: response_ids {response_ids.shape}, " \
+            f"response_assistant_mask {response_assistant_mask.shape}, " \
+            f"response_log_probs {response_log_probs.shape}"
+        
+        # Create tensor dictionary for training
+        tensor_dict = {
+            'input_ids': input_ids,                    # Full sequence tokens
+            'responses': response_ids,                 # Response tokens only
+            'attention_mask': attention_mask,          # Attention mask for full sequence
+            'position_ids': position_ids,              # Position IDs for transformer
+            'loss_mask': response_assistant_mask,      # Mask for loss calculation
+            'rollout_log_probs': response_log_probs,   # Log probabilities for training
+            'is_padded': torch.tensor(is_padded_list, device=self.device),              # Padding indicators for training
+            'error_mask': torch.tensor(error_mask_list, device=self.device),           # Error mask for training
+        }
+        
+        # Create non-tensor dictionary for metadata
+        non_tensor_dict = {
+            'success': success_list,                   # Task completion status
+            'error': error_list,                       # Error messages if any
+            'instance': instance_list,                 # Original instance data
+            'resolved': resolved_list,                 # Problem resolution status
+            'finish': has_finish_action_list,          # Finish action detection
+        }
+        
+        # Create and return DataProto with combined tensor and non-tensor data
+        result_dataproto = DataProto.from_dict(
+            tensors=tensor_dict,
+            non_tensors=non_tensor_dict
+        )
+        
+        return result_dataproto
+
+
+    def DataProto2Messages(self, prompts: DataProto, val_mode=False):
+        """
+        Convert DataProto input to OpenHands message format.
+        
+        This method transforms the structured DataProto input into the message format
+        expected by OpenHands servers. It handles trajectory expansion to generate
+        multiple conversation variants from each input prompt.
+        
+        Args:
+            prompts (DataProto): Input data containing instance information
+                Each instance typically contains:
+                - instance_id: Unique identifier for the problem instance
+                - Problem description, context, and other metadata
+                
+        Returns:
+            list: List of message dictionaries formatted for OpenHands processing
+                Each message contains:
+                - Original instance data
+                - trajectory_id: Unique identifier for this conversation variant
+                - All necessary context for OpenHands processing
+                
+        The method creates multiple trajectories per instance to enable diverse
+        solution exploration and improved training data generation.
+        """
+        # Extract instance data from each prompt in the batch
+        messages = [prompt.non_tensor_batch['instance'] for prompt in prompts] 
+        
+        # Expand each instance into multiple trajectories
+        new_messages = []
+        for i in range(len(messages)):
+            for j in range(self.num_trajectories if not val_mode else self.num_val_trajectories):
+                # Create a copy of the original message for each trajectory
+                tmp_message = messages[i].copy()
+                tmp_message['trajectory_id'] = j  # Add trajectory identifier
+                new_messages.append(tmp_message)
+
+        return new_messages
+
+    def generate_sequences(self, prompts: DataProto, val_mode=False, **sampling_params) -> DataProto:
+        """
+        Generate multiple conversation sequences in parallel via OpenHands servers.
+        
+        This is the main entry point for conversation generation. It orchestrates
+        the entire pipeline from input processing to result aggregation:
+        
+        1. Input Processing:
+           - Converts DataProto to OpenHands message format
+           - Expands single instances to multiple trajectories
+           
+        2. Parallel Generation:
+           - Distributes requests across OpenHands servers
+           - Manages concurrent processing with load balancing
+           - Handles retries and error recovery
+           
+        3. Result Processing:
+           - Aggregates responses from all servers
+           - Converts back to DataProto format
+           - Includes timing information and metadata
+        
+        Args:
+            prompts (DataProto): Input batch containing problem instances
+            **sampling_params: Additional parameters for generation (currently unused)
+                
+        Returns:
+            DataProto: Generated conversation sequences with:
+                - Tensor data: tokenized conversations ready for training
+                - Non-tensor data: metadata, success flags, timing info
+                - meta_info: Detailed timing breakdown for performance analysis
+                
+        The method includes comprehensive timing instrumentation to help
+        identify performance bottlenecks in the generation pipeline.
+        """
+        import time
+        
+        # Start total timing for performance analysis
+        total_start_time = time.time()
+        
+        # Time message conversion phase
+        convert_start_time = time.time()
+        messages = self.DataProto2Messages(prompts, val_mode=val_mode)
+        convert_end_time = time.time()
+        
+        # Time OpenHands request processing phase
+        request_start_time = time.time()
+        output_messages = asyncio.run(self.request_from_openhands(messages, val_mode=val_mode))
+        request_end_time = time.time()
+        
+        # Time result conversion phase
+        convert_results_start_time = time.time()
+        if self.config.rollout.get("token_level_generation", False):
+            response = self._convert_results_to_dataproto_token(output_messages, prompts, val_mode=val_mode)
+        else:
+            response = self._convert_results_to_dataproto(output_messages, prompts, val_mode=val_mode)
+        convert_results_end_time = time.time()
+        total_end_time = time.time()
+        
+        # Log detailed timing information for performance monitoring
+        logger.info(f"convert_results_to_dataproto time: {convert_results_end_time - convert_results_start_time:.3f}s")
+
+        # Ensure meta_info exists for timing data
+        if not hasattr(response, 'meta_info') or response.meta_info is None:
+            response.meta_info = {}
+        
+        # Add comprehensive timing breakdown
+        response.meta_info["timing"] = {
+            "total": total_end_time - total_start_time,
+            "convert_messages": convert_end_time - convert_start_time,
+            "openhands_request": request_end_time - request_start_time,
+            "convert_results_to_dataproto": convert_results_end_time - convert_results_start_time,
+            "generate_sequences": total_end_time - total_start_time  # Main timing metric
+        }
+
+        return response
+
+    
+    async def clear_llm_servers(self):
+        """
+        Clear all LLM servers from OpenHands servers.
+        
+        This method removes all previously registered LLM server addresses from
+        OpenHands servers. It's typically called during initialization to ensure
+        a clean state before registering new servers.
+        
+        The operation is performed concurrently across all OpenHands servers
+        to minimize setup time and ensure consistent state.
+        """
+        if not self.openhands_urls:
+            logger.warning("No OpenHands base URLs configured, skipping clear llm servers")
+            raise ValueError("No OpenHands base URLs configured. Please set actor_rollout_ref.rollout.openhands_base_url.")
+        
+        logger.info(f"Clearing llm servers from {len(self.openhands_urls)} OpenHands servers")
+        
+        # Clear all llm servers concurrently for efficiency
+        tasks = []
+        for openhands_url in self.openhands_urls:
+            task = self._clear_llm_servers_single_server(openhands_url)
+            tasks.append(task)
+        
+        # Wait for all clear operations to complete
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        
+        # Process results and provide feedback
+        success_count = 0
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                logger.error(f"Failed to clear llm servers from server {self.openhands_urls[i]}: {result}")
+            else:
+                success_count += 1
+        
+        logger.info(f"Successfully cleared llm servers from {success_count}/{len(self.openhands_urls)} OpenHands servers")    
+    
+    async def _clear_llm_servers_single_server(self, openhands_base_url: str):
+        """
+        Clear LLM servers from a single OpenHands server.
+        
+        Args:
+            openhands_base_url (str): Base URL of the OpenHands server
+            
+        Returns:
+            dict: Response from the OpenHands server
+            
+        Raises:
+            Exception: On HTTP errors or communication failures
+        """
+        try:
+            timeout = aiohttp.ClientTimeout(total=30)
+            session = aiohttp.ClientSession(timeout=timeout)
+            
+            try:
+                url = f"{openhands_base_url}/clear_llm_server"
+                
+                async with session.post(url) as response:
+                    if response.status == 200:
+                        result = await response.json()
+                        logger.info(f"Cleared llm servers from OpenHands server {openhands_base_url}: {result}")
+                        return result
+                    else:
+                        error_text = await response.text()
+                        logger.error(f"Failed to clear llm servers from OpenHands server {openhands_base_url}, "
+                                   f"HTTP {response.status}: {error_text}")
+                        raise Exception(f"HTTP {response.status}: {error_text}")
+            finally:
+                await session.close()
+        except Exception as e:
+            logger.error(f"Error clearing llm servers from OpenHands server {openhands_base_url}: {e}")
+            raise
+    
+    async def start_servers(self):
+        """
+        Start all OpenHands servers concurrently.
+        
+        This method activates all configured OpenHands servers, preparing them
+        for request processing. It's called before beginning conversation generation
+        to ensure all servers are ready to handle requests.
+        
+        The startup process is performed concurrently to minimize initialization time.
+        """
+        if not self.openhands_urls:
+            logger.warning("No OpenHands base URLs configured, skipping server start")
+            raise ValueError("No OpenHands base URLs configured. Please set actor_rollout_ref.rollout.openhands_base_url.")
+        
+        logger.info(f"Starting {len(self.openhands_urls)} OpenHands servers")
+        
+        # Start all servers concurrently
+        tasks = []
+        for openhands_url in self.openhands_urls:
+            task = self._start_single_server(openhands_url)
+            tasks.append(task)
+        
+        # Wait for all servers to start
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        
+        # Process results and provide feedback
+        success_count = 0
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                logger.error(f"Failed to start server {self.openhands_urls[i]}: {result}")
+            else:
+                success_count += 1
+        
+        logger.info(f"Successfully started {success_count}/{len(self.openhands_urls)} OpenHands servers")
+
+    async def stop_servers(self):
+        """
+        Stop all OpenHands servers concurrently.
+        
+        This method gracefully shuts down all OpenHands servers after request
+        processing is complete. It ensures proper cleanup and resource deallocation.
+        
+        The shutdown process is performed concurrently to minimize cleanup time.
+        """
+        if not self.openhands_urls:
+            logger.warning("No OpenHands base URLs configured, skipping server stop")
+            raise ValueError("No OpenHands base URLs configured. Please set actor_rollout_ref.rollout.openhands_base_url.")
+        
+        logger.info(f"Stopping {len(self.openhands_urls)} OpenHands servers")
+        
+        # Stop all servers concurrently
+        tasks = []
+        for openhands_url in self.openhands_urls:
+            task = self._stop_single_server(openhands_url)
+            tasks.append(task)
+        
+        # Wait for all servers to stop
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        
+        # Process results and provide feedback
+        success_count = 0
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                logger.error(f"Failed to stop server {self.openhands_urls[i]}: {result}")
+            else:
+                success_count += 1
+        
+        logger.info(f"Successfully stopped {success_count}/{len(self.openhands_urls)} OpenHands servers")
+
+    async def _start_single_server(self, openhands_base_url: str):
+        """
+        Start a single OpenHands server.
+        
+        This method sends a start request to a specific OpenHands server and waits
+        for successful activation. It includes proper error handling and timeout management.
+        
+        Args:
+            openhands_base_url (str): Base URL of the OpenHands server to start
+            
+        Returns:
+            dict: Response from the OpenHands server on successful startup
+            
+        Raises:
+            Exception: On HTTP errors, timeouts, or communication failures
+        """
+        try:
+            timeout = aiohttp.ClientTimeout(total=30)
+            session = aiohttp.ClientSession(timeout=timeout)
+            
+            try:
+                url = f"{openhands_base_url}/start"
+                
+                async with session.post(url) as response:
+                    if response.status == 200:
+                        result = await response.json()
+                        logger.info(f"OpenHands server {openhands_base_url} started successfully: {result}")
+                        return result
+                    else:
+                        error_text = await response.text()
+                        logger.error(f"Failed to start OpenHands server {openhands_base_url}, "
+                                   f"HTTP {response.status}: {error_text}")
+                        raise Exception(f"HTTP {response.status}: {error_text}")
+                        
+            finally:
+                await session.close()
+                
+        except asyncio.TimeoutError:
+            error_msg = f"Timeout starting OpenHands server {openhands_base_url}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+        except Exception as e:
+            logger.error(f"Error starting OpenHands server {openhands_base_url}: {e}")
+            raise
+
+    async def _stop_single_server(self, openhands_base_url: str):
+        """
+        Stop a single OpenHands server.
+        
+        This method sends a stop request to a specific OpenHands server and waits
+        for graceful shutdown. It includes proper error handling and timeout management.
+        
+        Args:
+            openhands_base_url (str): Base URL of the OpenHands server to stop
+            
+        Returns:
+            dict: Response from the OpenHands server on successful shutdown
+            
+        Raises:
+            Exception: On HTTP errors, timeouts, or communication failures
+        """
+        try:
+            timeout = aiohttp.ClientTimeout(total=30)
+            session = aiohttp.ClientSession(timeout=timeout)
+            
+            try:
+                url = f"{openhands_base_url}/stop"
+                
+                async with session.post(url) as response:
+                    if response.status == 200:
+                        result = await response.json()
+                        logger.info(f"OpenHands server {openhands_base_url} stopped successfully: {result}")
+                        return result
+                    else:
+                        error_text = await response.text()
+                        logger.error(f"Failed to stop OpenHands server {openhands_base_url}, "
+                                   f"HTTP {response.status}: {error_text}")
+                        raise Exception(f"HTTP {response.status}: {error_text}")
+                        
+            finally:
+                await session.close()
+                
+        except asyncio.TimeoutError:
+            error_msg = f"Timeout stopping OpenHands server {openhands_base_url}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+        except Exception as e:
+            logger.error(f"Error stopping OpenHands server {openhands_base_url}: {e}")
+            raise
+
+    async def request_from_openhands(self, messages: List, val_mode=False):
+        """
+        Process conversation requests using OpenHands servers with intelligent load balancing.
+        
+        This method implements a sophisticated request distribution system that:
+        
+        1. Server Management:
+           - Starts all OpenHands servers concurrently
+           - Tracks server availability in real-time
+           - Gracefully shuts down servers after processing
+        
+        2. Load Balancing:
+           - Distributes requests across available servers
+           - Uses job queue for fair work distribution
+           - Implements server availability tracking
+        
+        3. Error Handling:
+           - Retries failed requests up to 3 times
+           - Handles server communication failures gracefully
+           - Provides detailed error reporting and logging
+        
+        4. Progress Monitoring:
+           - Tracks processing progress in real-time
+           - Reports active tasks and server utilization
+           - Provides comprehensive timing information
+        
+        Args:
+            messages (List): List of message dictionaries to process
+                Each message contains instance data and trajectory information
+                
+        Returns:
+            dict: Mapping of instance_id to trajectory results
+                Structure: {instance_id: {trajectory_id: result_dict}}
+                Each result_dict contains conversation outcomes and metadata
+                
+        The method uses asyncio queues and locks to ensure thread-safe operation
+        and optimal resource utilization across the server pool.
+        """
+        import time
+        
+        # Start total timing for performance analysis
+        total_start_time = time.time()
+        
+        # Start all OpenHands servers concurrently
+        server_start_time = time.time()
+        await self.start_servers()
+        server_end_time = time.time()
+        logger.info(f"Starting OpenHands servers took: {server_end_time - server_start_time:.2f} seconds")
+        
+        try:
+            # Initialize result storage
+            all_responses = {}
+            
+            if not self.openhands_urls:
+                logger.error("No OpenHands base URLs configured")
+                raise ValueError("No OpenHands base URLs configured. Please set actor_rollout_ref.rollout.openhands_base_url.")
+                
+            # Thread-safe queue management with asyncio locks
+            job_queue_lock = asyncio.Lock()
+            results_queue_lock = asyncio.Lock()
+            available_servers_queue_lock = asyncio.Lock()
+            
+            # Create job queue and populate with all messages
+            job_queue = asyncio.Queue()
+            for i, message in enumerate(messages):
+                await job_queue.put((i, message, 0))  # (index, message, retry_count)
+            # Create results queue for completed work
+            results_queue = asyncio.Queue()
+            
+            # Create server availability queue
+            available_servers_queue = asyncio.Queue()
+            
+            # Initialize server pool - all servers start as available
+            max_active_tasks = 0
+            for server_url in self.openhands_urls:
+                for worker_idx in range(self.openhands_num_workers):
+                    await available_servers_queue.put(f"{server_url}|worker_{worker_idx}")
+                    max_active_tasks += 1
+            
+            logger.info(f"Initialized {available_servers_queue.qsize()} available server workers")
+            
+            # Job dispatcher - assigns jobs to available servers
+            async def job_dispatcher():
+                """Continuously dispatch jobs to available servers."""
+                while True:
+                    try:
+                        # Check for available work and servers
+                        async with job_queue_lock:
+                            async with available_servers_queue_lock:
+                                if job_queue.empty() or available_servers_queue.empty():
+                                    await asyncio.sleep(0.1)  # Brief pause before retry
+                                    continue
+                                    
+                                # Get next job and available server
+                                job_task = asyncio.create_task(job_queue.get())
+                                server_task = asyncio.create_task(available_servers_queue.get())
+
+                                # Wait for both to be ready
+                                done, pending = await asyncio.wait(
+                                    [job_task, server_task], 
+                                    return_when=asyncio.ALL_COMPLETED
+                                )
+                                
+                                # Extract job and server information
+                                message_index, message, retry_count = job_task.result()
+                                server_worker_id = server_task.result()
+                                server_url = server_worker_id.split('|')[0]
+                                
+                                logger.debug(f"Dispatching message {message_index} to {server_worker_id}")
+                                logger.debug(f"Available servers: {available_servers_queue.qsize()}")
+                        
+                        # Create async task to process job on assigned server
+                        asyncio.create_task(
+                            process_job_on_server(message_index, message, retry_count, server_url, server_worker_id)
+                        )
+                        
+                    except asyncio.CancelledError:
+                        break
+                    except Exception as e:
+                        logger.error(f"Job dispatcher error: {e}")
+            
+            # Process individual job on specific server
+            async def process_job_on_server(message_index, message, retry_count, server_url, server_worker_id):
+                """Process a single job on an assigned server."""
+                try:
+                    # Send request to OpenHands server
+                    result, should_retry = await self._send_single_message_to_openhands(
+                        message=message,
+                        message_index=message_index,
+                        openhands_base_url=server_url,
+                        val_mode=val_mode
+                    )
+                    
+                    # Handle retry logic
+                    if should_retry and retry_count < 2:  # Maximum 3 attempts
+                        async with job_queue_lock:
+                            await job_queue.put((message_index, message, retry_count + 1))
+                        logger.info(f"Retrying message {message_index}, attempt {retry_count + 1}")
+                    else:
+                        # Job completed (success or max retries reached)
+                        async with results_queue_lock:
+                            await results_queue.put((message_index, message, result))
+                        async with job_queue_lock:
+                            job_queue.task_done()
+                    
+                    # Return server to available pool
+                    async with available_servers_queue_lock:
+                        await available_servers_queue.put(server_worker_id)
+                        logger.debug(f"Returned server to pool: {server_worker_id}")
+                        
+                except Exception as e:
+                    logger.error(f"Error processing job {message_index}: {e}")
+                    # Return server to pool even on error
+                    async with available_servers_queue_lock:
+                        await available_servers_queue.put(server_worker_id)
+
+            # Start job dispatcher
+            dispatcher_task = asyncio.create_task(job_dispatcher())
+            
+            logger.info(f"Started job dispatcher with {max_active_tasks} concurrent workers "
+                       f"to process {len(messages)} messages")
+            
+            # Collect results as they arrive
+            processing_start_time = time.time()
+            completed_count = 0
+            
+            while completed_count < len(messages):
+                async with results_queue_lock:
+                    if results_queue.empty():
+                        await asyncio.sleep(0.1)
+                        continue
+                    message_index, message, result = await results_queue.get()
+
+
+                # Process and store result
+                instance_id = message['instance_id']
+                trajectory_id = message['trajectory_id']
+                
+                if instance_id not in all_responses:
+                    all_responses[instance_id] = {}
+                
+                # Store result with proper error handling
+                if isinstance(result, Exception):
+                    logger.error(f"Error processing message {message_index}: {result}")
+                    all_responses[instance_id][trajectory_id] = {
+                        "error": str(result),
+                        "success": False,
+                        "messages": [],
+                        "resolved": False,
+                        "finish": False
+                    }
+                elif result is not None:
+                    all_responses[instance_id][trajectory_id] = result
+                else:
+                    logger.warning(f"Message {message_index} returned empty response")
+                    all_responses[instance_id][trajectory_id] = {
+                        "error": "Empty response", 
+                        "success": False,
+                        "messages": [],
+                        "resolved": False,
+                        "finish": False
+                    }
+                
+                completed_count += 1
+                
+                # Log progress periodically
+                if completed_count % 10 == 0 or completed_count == len(messages):
+                    current_time = time.time()
+                    elapsed_time = current_time - processing_start_time
+                    progress_percent = (completed_count / len(messages)) * 100
+                    
+                    async with job_queue_lock:
+                        pending_jobs = job_queue.qsize()
+                    async with available_servers_queue_lock:
+                        available_servers = available_servers_queue.qsize()
+                    
+                    active_tasks = max_active_tasks - available_servers
+                    # Also log the resolved ratio and finish ratio, use all_responses[instance_id][trajectory_id]['resolved'] to find whether the trajectory of this instance is resolved
+                    # You need 2 loops to go through all_responses[instance_id][trajectory_id]['resolved'] to find the resolved ratio and finish ratio
+                    resolved_count = 0
+                    finish_count = 0
+                    all_results_count = 0
+                    resolved_instance_count = 0
+                    for instance_id in all_responses:
+                        resolved_instance_count_tmp = 0
+                        for trajectory_id in all_responses[instance_id]:
+                            all_results_count += 1
+                            if all_responses[instance_id][trajectory_id]['resolved']:
+                                resolved_count += 1
+                                resolved_instance_count_tmp = 1
+                            if all_responses[instance_id][trajectory_id]['finish']:
+                                finish_count += 1
+                        if resolved_instance_count_tmp == 1:
+                            resolved_instance_count += 1
+                    resolved_ratio = resolved_count / all_results_count if all_results_count else 0
+                    finish_ratio = finish_count / all_results_count if all_results_count else 0
+                    resolved_instance_ratio = resolved_instance_count / len(all_responses) if all_responses else 0
+                    logger.info(
+                        f"Progress: {completed_count}/{len(messages)} ({progress_percent:.1f}%), "
+                        f"pending: {pending_jobs}, available: {available_servers}, "
+                        f"active: {active_tasks}/{max_active_tasks}, elapsed: {elapsed_time:.2f}s, "
+                        f"resolved: {resolved_ratio:.2f}, finish: {finish_ratio:.2f}, resolved_instance: {resolved_instance_ratio:.2f}"
+                    )
+            
+            # Clean up dispatcher
+            dispatcher_task.cancel()
+            await asyncio.gather(dispatcher_task, return_exceptions=True)
+            
+            processing_end_time = time.time()
+            total_end_time = time.time()
+            
+            logger.info(f"Request processing completed in {processing_end_time - processing_start_time:.2f}s")
+            logger.info(f"Total request time: {total_end_time - total_start_time:.2f}s")
+            logger.info(f"Processed {len(all_responses)} instances successfully")
+            
+            return all_responses
+            
+        finally:
+            # Always stop servers for cleanup
+            stop_start_time = time.time()
+            await self.stop_servers()
+            stop_end_time = time.time()
+            logger.info(f"Server shutdown took: {stop_end_time - stop_start_time:.2f}s")
+
+    async def _send_single_message_to_openhands(self, message: dict, message_index: int, openhands_base_url: str, val_mode=False):
+        """
+        Send single message to openhands server.
+        
+        This method handles the HTTP communication with OpenHands servers to process
+        a single conversation request. It includes comprehensive error handling,
+        timeout management, and retry logic.
+        
+        Args:
+            message (dict): A single message dictionary containing the conversation
+                to be processed by OpenHands.
+            message_index (int): The index of the message in the input batch.
+            openhands_base_url (str): The base URL of the OpenHands server.
+            
+        Returns:
+            tuple: A tuple (result, should_retry) where:
+                - result (dict or Exception): The response from OpenHands or an exception.
+                - should_retry (bool): True if the message should be retried, False otherwise.
+        """
+        try:
+            request_data = {
+                "instance": message,
+                "sampling_params": self.val_sampling_params if val_mode else self.sampling_params
+            }
+            
+            timeout = aiohttp.ClientTimeout(total=self.config.rollout.get('openhands_timeout', 1000))
+            session = aiohttp.ClientSession(timeout=timeout)
+            
+            try:
+                async with session.post(
+                    url=f"{openhands_base_url}/process",
+                    headers={"Content-Type": "application/json"},
+                    json=request_data,
+                ) as resp:
+                    if resp.status == 200:
+                        data = await resp.json()
+                        out_message=data.get("messages",[])
+                        if out_message and len(out_message)>0:
+                            return data, False  # No retry needed
+                        else:
+                            return None, True  # Retry needed
+                    else:
+                        error_text = await resp.text()
+                        print("Failed to process request:", error_text)
+                        return None, True  # Retry needed
+                        
+            finally:
+                await session.close()
+                
+        except asyncio.TimeoutError as e:
+            logger.error(f"Timeout error sending message {message_index} to OpenHands: {e}")
+            return None, True  # Retry needed
+        except Exception as e:
+            logger.error(f"Error sending message {message_index} to OpenHands: {e}")
+            return None, True  # Retry needed
+
+    def get_server_info(self) -> List[Tuple[str, int]]:
+        """
+        Get address and port information for all LLM servers.
+        
+        This method provides access to the network addresses of all running LLM servers,
+        useful for monitoring, debugging, and external integrations.
+        
+        Returns:
+            List[Tuple[str, int]]: List of (hostname, port) tuples for each server
+                Each tuple contains:
+                - hostname (str): The hostname or IP address of the server
+                - port (int): The port number where the server is listening
+                
+        The information can be used for:
+        - Health monitoring and status checks
+        - Direct API access for debugging
+        - Load balancing configuration
+        - Service discovery integration
+        """
+        server_info = []
+        for address in self.server_addresses:
+            if address:  # Ensure address is not None
+                host, port = address.split(":")
+                server_info.append((host, int(port)))
+        return server_info
diff --git a/trainer_integration/verl/verl_custom/nvidia/rollout/async_server_dapo.py b/trainer_integration/verl/verl_custom/nvidia/rollout/async_server_dapo.py
new file mode 100644
index 000000000..beabddbaa
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/rollout/async_server_dapo.py
@@ -0,0 +1,583 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Async LLM Server Manager for VERL Framework
+
+This module provides the AsyncLLMServerManager class that manages multiple async LLM server instances
+for distributed inference and rollout generation. It handles:
+- Server lifecycle management (start/stop/wake/sleep)
+- Request distribution and load balancing
+- Integration with OpenHands servers for multi-turn conversations
+- Message formatting and tokenization
+- Result aggregation and conversion to DataProto format
+
+The manager supports both single-node and multi-node deployments with tensor parallelism.
+"""
+
+# Standard library imports for async operations, logging, and networking
+import asyncio
+import logging
+import os
+import socket
+import threading
+from typing import Any, Dict, List, Tuple
+import numpy as np
+# Ray framework for distributed computing
+import ray
+from omegaconf import DictConfig
+import os
+import json
+import uuid
+
+# VERL framework imports
+from verl.protocol import DataProto  # Data protocol for tensor/non-tensor data exchange
+from verl.single_controller.ray.base import RayWorkerGroup  # Ray worker group management
+from verl.workers.rollout.async_server import async_server_class  # Async server implementation
+from async_generator import asynccontextmanager
+# Logging configuration
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "INFO"))
+
+# Chat template management for different model types
+from .chat_template_manager import get_chat_template
+
+# PyTorch and HTTP client imports
+import torch
+import aiohttp
+from verl.utils.model import compute_position_id_with_mask
+from .utils import convert_right_padding_to_left, pad_to_max_length_right, get_unique_id
+
+
+import fastapi
+import uvicorn
+from starlette.requests import Request
+from abc import ABC, abstractmethod
+
+from verl_custom.nvidia.rollout.async_server import AsyncLLMServerManager
+
+
+class AsyncLLMServerManagerDAPO(AsyncLLMServerManager):
+    """
+    AsyncLLMServerManagerDAPO
+    """
+
+    def __init__(self, config: DictConfig, worker_group: RayWorkerGroup):
+        """
+        Initialize the AsyncLLMServerManagerDAPO.
+        """
+        super().__init__(config, worker_group)
+        self.data_loader = None
+        self.all_input_batch = None
+        self.last_data_index = 0
+        self.job_queue = asyncio.PriorityQueue()
+
+    def generate_sequences_dapo(self) -> DataProto:
+        import time
+        # Start total timing for performance analysis
+        total_start_time = time.time()
+        request_start_time = time.time()
+        output_messages,input_batch = asyncio.run(self.request_from_openhands_dapo())
+        request_end_time = time.time()
+
+        convert_results_start_time = time.time()
+        if self.config.rollout.get("token_level_generation", False):
+            response = self._convert_results_to_dataproto_token(output_messages,input_batch)
+        else:
+            response = self._convert_results_to_dataproto(output_messages)
+
+        convert_results_end_time = time.time()
+        total_end_time = time.time()
+        
+        # Log detailed timing information for performance monitoring
+        logger.info(f"convert_results_to_dataproto time: {convert_results_end_time - convert_results_start_time:.3f}s")
+
+        input_batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(input_batch.batch))], dtype=object)
+        input_batch = input_batch.repeat(repeat_times=self.config.rollout.n, interleave=True)
+        out_batch = input_batch.union(response)
+        # Add comprehensive timing breakdown
+        # Ensure meta_info exists for timing data
+        if not hasattr(out_batch, 'meta_info') or out_batch.meta_info is None:
+            out_batch.meta_info = {}
+        out_batch.meta_info["timing"] = {
+            "total": total_end_time - total_start_time,
+            "openhands_request": request_end_time - request_start_time,
+            "convert_results_to_dataproto": convert_results_end_time - convert_results_start_time,
+            "generate_sequences": total_end_time - total_start_time  # Main timing metric
+        }
+        return out_batch
+  
+    def process_batch(self, batch_dict):
+        batch = DataProto.from_single_dict(batch_dict)
+        batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+        non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+
+        batch.pop(
+            batch_keys=batch_keys_to_pop,
+            non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+        )
+        return batch
+
+    async def refill_job_queue(self,data_index):
+        """
+        Refile the job queue with the remaining messages.
+        """
+        assert self.data_loader is not None
+        next_batch = next(iter(self.data_loader))
+        batch = self.process_batch(next_batch)
+        messages = self.DataProto2Messages(batch)
+        for message in messages:
+            await self.job_queue.put((data_index, (data_index, message, 0)))
+            data_index += 1
+        return batch,data_index
+
+    async def request_from_openhands_dapo(self):
+        import time
+        requested_batch_size = self.full_config.data.train_batch_size
+        # Start total timing for performance analysis
+        total_start_time = time.time()
+        
+        # Start all OpenHands servers concurrently
+        server_start_time = time.time()
+        await self.start_servers()
+        server_end_time = time.time()
+        logger.info(f"Starting OpenHands servers took: {server_end_time - server_start_time:.2f} seconds")
+
+        self.existing_ids = set()
+        
+        try:
+            # Initialize result storage
+            all_responses = {}
+            
+            if not self.openhands_urls:
+                logger.error("No OpenHands base URLs configured")
+                raise ValueError("No OpenHands base URLs configured. Please set actor_rollout_ref.rollout.openhands_base_url.")
+                
+            # Thread-safe queue management with asyncio locks
+            job_queue_lock = asyncio.Lock()
+            results_queue_lock = asyncio.Lock()
+            available_servers_queue_lock = asyncio.Lock()
+            
+            
+            # for i, message in enumerate(messages):
+            #     await job_queue.put((i, message, 0))  # (index, message, retry_count)            
+            # Create results queue for completed work
+            results_queue = asyncio.PriorityQueue()
+            
+            # Create server availability queue
+            available_servers_queue = asyncio.Queue()
+            
+            # Track all active process_job_on_server tasks for cleanup
+            active_job_tasks = []
+            active_job_tasks_lock = asyncio.Lock()
+            
+            # Initialize server pool - all servers start as available
+            max_active_tasks = 0
+            for server_url in self.openhands_urls:
+                for worker_idx in range(self.openhands_num_workers):
+                    await available_servers_queue.put(f"{server_url}|worker_{worker_idx}")
+                    max_active_tasks += 1
+            
+            logger.info(f"Initialized {available_servers_queue.qsize()} available server workers")
+            
+            # Job dispatcher - assigns jobs to available servers
+            async def job_dispatcher():
+                """Continuously dispatch jobs to available servers."""
+                data_index=self.last_data_index
+
+                while True:
+                    try:
+                        # Check for available work and servers
+                        async with job_queue_lock:
+                            async with available_servers_queue_lock:
+                                if available_servers_queue.empty():
+                                    await asyncio.sleep(0.1)  # Brief pause before retry
+                                    continue
+                                if self.job_queue.empty():
+                                    input_batch,data_index = await self.refill_job_queue(data_index)
+                                    if self.all_input_batch is None:
+                                        self.all_input_batch = input_batch
+                                    else:
+                                        self.all_input_batch = DataProto.concat([self.all_input_batch, input_batch])
+                                    continue
+                                # Get next job and available server
+                                job_task = asyncio.create_task(self.job_queue.get())
+                                server_task = asyncio.create_task(available_servers_queue.get())
+
+                                # Wait for both to be ready
+                                done, pending = await asyncio.wait(
+                                    [job_task, server_task], 
+                                    return_when=asyncio.ALL_COMPLETED
+                                )
+                                
+                                # Extract job and server information
+                                priority, (message_index, message, retry_count) = job_task.result()
+                                server_worker_id = server_task.result()
+                                server_url = server_worker_id.split('|')[0]
+                                
+                                logger.debug(f"Dispatching message {message_index} to {server_worker_id}")
+                                logger.debug(f"Available servers: {available_servers_queue.qsize()}")
+                        
+                        # Create async task to process job on assigned server
+                        task = asyncio.create_task(
+                            process_job_on_server(message_index, message, retry_count, server_url, server_worker_id)
+                        )
+                        # Track task for cleanup
+                        async with active_job_tasks_lock:
+                            active_job_tasks.append(task)
+                        
+                    except asyncio.CancelledError:
+                        break
+                    except Exception as e:
+                        logger.error(f"Job dispatcher error: {e}")
+            
+            # Process individual job on specific server
+            async def process_job_on_server(message_index, message, retry_count, server_url, server_worker_id):
+                """Process a single job on an assigned server."""
+                current_task = asyncio.current_task()
+                try:
+                    # Send request to OpenHands server
+                    result, should_retry = await self._send_single_message_to_openhands_dapo(
+                        message=message,
+                        message_index=message_index,
+                        openhands_base_url=server_url
+                    )
+                    
+                    # Handle retry logic
+                    if should_retry and retry_count < 2:  # Maximum 3 attempts
+                        async with job_queue_lock:
+                            await self.job_queue.put((message_index,(message_index, message, retry_count + 1)))
+                        logger.info(f"Retrying message {message_index}, attempt {retry_count + 1}")
+                    else:
+                        # Job completed (success or max retries reached)
+                        async with results_queue_lock:
+                            await results_queue.put((message_index,(message_index, message, result)))
+                        async with job_queue_lock:
+                            self.job_queue.task_done()
+                    
+                    # Return server to available pool
+                    async with available_servers_queue_lock:
+                        await available_servers_queue.put(server_worker_id)
+                        logger.debug(f"Returned server to pool: {server_worker_id}")
+                        
+                except asyncio.CancelledError:
+                    logger.info(f"Job {message_index} processing was cancelled")
+                    # Return server to pool even when cancelled
+                    async with available_servers_queue_lock:
+                        await available_servers_queue.put(server_worker_id)
+                    raise  # Re-raise to properly handle cancellation
+                except Exception as e:
+                    logger.error(f"Error processing job {message_index}: {e}")
+                    # Return server to pool even on error
+                    async with available_servers_queue_lock:
+                        await available_servers_queue.put(server_worker_id)
+                finally:
+                    # Remove task from tracking list
+                    async with active_job_tasks_lock:
+                        if current_task in active_job_tasks:
+                            active_job_tasks.remove(current_task)
+
+            # Start job dispatcher
+            dispatcher_task = asyncio.create_task(job_dispatcher())
+            
+            logger.info(f"Started job dispatcher with {max_active_tasks} concurrent workers ")
+            
+            # Collect results as they arrive
+            processing_start_time = time.time()
+            completed_count = 0
+            filtered_instance_ids = []
+            while True:
+                async with results_queue_lock:
+                    if results_queue.empty():
+                        await asyncio.sleep(0.1)
+                        continue
+                    priority, (message_index, message, result) = await results_queue.get()
+                # Process and store result
+                instance_id = message['instance_id']
+                trajectory_id = message['trajectory_id']
+                
+                if instance_id not in all_responses:
+                    all_responses[instance_id] = {}
+                
+                # Store result with proper error handling
+                if isinstance(result, Exception):
+                    logger.error(f"Error processing message {message_index}: {result}")
+                    all_responses[instance_id][trajectory_id] = {
+                        "error": str(result),
+                        "success": False,
+                        "messages": [],
+                        "resolved": False,
+                        "finish": False
+                    }
+                elif result is not None:
+                    all_responses[instance_id][trajectory_id] = result
+                else:
+                    logger.warning(f"Message {message_index} returned empty response")
+                    all_responses[instance_id][trajectory_id] = {
+                        "error": "Empty response", 
+                        "success": False,
+                        "messages": [],
+                        "resolved": False,
+                        "finish": False
+                    }
+                
+                completed_count += 1
+                num_completed_instances = 0
+
+                all_responses,filtered_instance_ids_tmp = self.filter_easy_hard_instance(all_responses)
+                filtered_instance_ids.extend(filtered_instance_ids_tmp)
+
+                for instance_id in all_responses:
+                    if len(all_responses[instance_id]) == self.num_trajectories:
+                        num_completed_instances += 1
+                logger.error(f"num_completed_instances: {num_completed_instances}")
+                
+                # Log progress periodically
+                if completed_count % 4 == 0 or num_completed_instances == requested_batch_size:
+                    current_time = time.time()
+                    elapsed_time = current_time - processing_start_time
+                    progress_percent = (num_completed_instances / requested_batch_size) * 100
+                    
+                    async with job_queue_lock:
+                        pending_jobs = self.job_queue.qsize()
+                    async with available_servers_queue_lock:
+                        available_servers = available_servers_queue.qsize()
+                    
+                    active_tasks = max_active_tasks - available_servers
+                    # Also log the resolved ratio and finish ratio, use all_responses[instance_id][trajectory_id]['resolved'] to find whether the trajectory of this instance is resolved
+                    # You need 2 loops to go through all_responses[instance_id][trajectory_id]['resolved'] to find the resolved ratio and finish ratio
+                    resolved_count = 0
+                    finish_count = 0
+                    all_results_count = 0
+                    resolved_instance_count = 0
+                    for instance_id in all_responses:
+                        resolved_instance_count_tmp = 0
+                        for trajectory_id in all_responses[instance_id]:
+                            all_results_count += 1
+                            if all_responses[instance_id][trajectory_id]['resolved']:
+                                resolved_count += 1
+                                resolved_instance_count_tmp = 1
+                            if all_responses[instance_id][trajectory_id]['finish']:
+                                finish_count += 1
+                        if resolved_instance_count_tmp == 1:
+                            resolved_instance_count += 1
+                    resolved_ratio = resolved_count / all_results_count if all_results_count else 0
+                    finish_ratio = finish_count / all_results_count if all_results_count else 0
+                    resolved_instance_ratio = resolved_instance_count / len(all_responses) if all_responses else 0
+                    logger.info(
+                        f"Progress: {num_completed_instances}/{requested_batch_size} ({progress_percent:.1f}%), "
+                        f"pending: {pending_jobs}, available: {available_servers}, "
+                        f"active: {active_tasks}/{max_active_tasks}, elapsed: {elapsed_time:.2f}s, "
+                        f"resolved: {resolved_ratio:.2f}, finish: {finish_ratio:.2f}, resolved_instance: {resolved_instance_ratio:.2f}"
+                    )
+
+                if num_completed_instances >= requested_batch_size:
+                    logger.error(f"Completed {num_completed_instances} instances, requested batch size is {requested_batch_size}")
+                    break
+            
+            # Clean up all tasks
+            logger.info("Stopping all tasks...")
+            
+            # Cancel job dispatcher
+            dispatcher_task.cancel()
+            
+            # Cancel all active process_job_on_server tasks
+            async with active_job_tasks_lock:
+                active_tasks_to_cancel = active_job_tasks.copy()
+            
+            if active_tasks_to_cancel:
+                logger.info(f"Cancelling {len(active_tasks_to_cancel)} active job processing tasks")
+                for task in active_tasks_to_cancel:
+                    task.cancel()
+            
+            # Wait for all tasks to complete or be cancelled
+            all_tasks = [dispatcher_task] + active_tasks_to_cancel
+            if all_tasks:
+                await asyncio.gather(*all_tasks, return_exceptions=True)
+                logger.info("All tasks have been stopped")
+            
+            processing_end_time = time.time()
+            total_end_time = time.time()
+            logger.info(f"Request processing completed in {processing_end_time - processing_start_time:.2f}s")
+            logger.info(f"Total request time: {total_end_time - total_start_time:.2f}s")
+            logger.info(f"Processed {len(all_responses)} instances successfully")
+            # the set of instance_ids in self.all_input_batch before filtering
+            instance_ids_before_filtering = set([instance['instance_id'] for instance in self.all_input_batch.non_tensor_batch['instance']])
+            #filter the unfinished instances in all_responses and self.all_input_batch and reamin unfinished instances in self.all_input_batch
+            all_responses,output_batch = self.filter_uncompleted_instances(all_responses, filtered_instance_ids)
+            # the set of instance_ids in output_batch
+            instance_ids_in_output_batch = set([instance['instance_id'] for instance in output_batch.non_tensor_batch['instance']])
+            # the set of instance_ids in self.all_input_batch after filtering
+            instance_ids_after_filtering = set([instance['instance_id'] for instance in self.all_input_batch.non_tensor_batch['instance']])
+            # the set of instance_ids in filtered_instance_ids
+            instance_ids_in_filtered_instance_ids = set(filtered_instance_ids)
+            # assert instance_ids_before_filtering is equal to instance_ids_in_all_input_batch_after_filtering + instance_ids_in_output_batch (how to compare two sets?)
+            assert instance_ids_before_filtering == instance_ids_after_filtering.union(instance_ids_in_output_batch).union(instance_ids_in_filtered_instance_ids), f"The instance_ids before filtering and the instance_ids in the output batch are not the same, {instance_ids_before_filtering} != {instance_ids_after_filtering} + {instance_ids_in_output_batch} + {instance_ids_in_filtered_instance_ids}"
+            # put the remaining train data back to the job queue
+            await self.push_remaining_train_data_to_job_queue()
+            
+            return all_responses, output_batch
+        except Exception as e:
+            logger.error(f"Error in request_from_openhands_dapo: {e}")
+            raise
+            
+        finally:
+            # Always stop servers for cleanup
+            stop_start_time = time.time()
+            await self.stop_servers()
+            stop_end_time = time.time()
+            logger.info(f"Server shutdown took: {stop_end_time - stop_start_time:.2f}s")
+        
+    async def push_remaining_train_data_to_job_queue(self):
+        """
+        Push the remaining train data to the job queue
+        """
+        self.last_data_index = 0
+        messages = self.DataProto2Messages(self.all_input_batch)
+        for message in messages:
+            await self.job_queue.put((self.last_data_index, (self.last_data_index, message, 0)))
+            self.last_data_index += 1
+
+    def filter_uncompleted_instances(self, all_responses,filtered_instance_ids):
+        """
+        Filter the uncompleted instances
+        """
+        instance_id_to_batch_idx = {}
+        for i, instance in enumerate(self.all_input_batch.non_tensor_batch['instance']):
+            instance_id = instance['instance_id']
+            instance_id_to_batch_idx[instance_id] = i
+        instance_ids_to_keep = []
+        batch_idx_to_keep = []
+        instance_ids=list(all_responses.keys())
+        for instance_id in instance_ids:
+            if len(all_responses[instance_id]) != self.num_trajectories:
+                all_responses.pop(instance_id)
+            else:
+                instance_ids_to_keep.append(instance_id)
+                batch_idx_to_keep.append(instance_id_to_batch_idx[instance_id])
+        output_batch = self.all_input_batch.select_idxs(batch_idx_to_keep)
+        # remain the instances that are not in filtered_instance_ids and not in instance_ids_to_keep
+        batch_idx_to_remain = []
+        for i, instance in enumerate(self.all_input_batch.non_tensor_batch['instance']):
+            if not instance['instance_id'] in filtered_instance_ids and not instance['instance_id'] in instance_ids_to_keep:
+                batch_idx_to_remain.append(i)
+        self.all_input_batch = self.all_input_batch.select_idxs(batch_idx_to_remain)
+        assert len(output_batch.non_tensor_batch['instance']) == len(all_responses) and len(all_responses) == self.full_config.data.train_batch_size, f"The number of instances in the input batch and the number of instances in the responses are not the same, {len(output_batch.non_tensor_batch['instance'])} != {len(all_responses)} != {self.full_config.data.train_batch_size}"
+        return all_responses,output_batch
+
+    def filter_easy_hard_instance(self, all_responses):
+        """
+        Filter the instance with the resolved ratio
+        """
+        filtered_instance_ids = []
+        keys=list(all_responses.keys())
+        for instance_id in keys:
+            if len(all_responses[instance_id]) == self.num_trajectories:
+                resolved_count = 0
+                for trajectory_id in all_responses[instance_id]:
+                    if all_responses[instance_id][trajectory_id]['resolved']:
+                        resolved_count += 1
+                if resolved_count == self.num_trajectories or resolved_count == 0:
+                    all_responses.pop(instance_id)
+                    filtered_instance_ids.append(instance_id)
+                    logger.info(f"Filtered instance {instance_id} with resolved ratio {resolved_count}/{self.num_trajectories}")
+        return all_responses,filtered_instance_ids
+    
+    async def _send_single_message_to_openhands_dapo(self, message: dict, message_index: int, openhands_base_url: str, val_mode=False):
+        """
+        Send single message to openhands server.
+        
+        This method handles the HTTP communication with OpenHands servers to process
+        a single conversation request. It includes comprehensive error handling,
+        timeout management, and retry logic.
+        
+        Args:
+            message (dict): A single message dictionary containing the conversation
+                to be processed by OpenHands.
+            message_index (int): The index of the message in the input batch.
+            openhands_base_url (str): The base URL of the OpenHands server.
+            
+        Returns:
+            tuple: A tuple (result, should_retry) where:
+                - result (dict or Exception): The response from OpenHands or an exception.
+                - should_retry (bool): True if the message should be retried, False otherwise.
+        """
+        try:
+            request_data = {
+                "job_id": get_unique_id(message,self.existing_ids),
+                "instance": message,
+                "sampling_params": self.val_sampling_params if val_mode else self.sampling_params
+            }
+            self.existing_ids.add(request_data["job_id"])
+            timeout = aiohttp.ClientTimeout(total=self.config.rollout.get('openhands_timeout', 1000))
+            session = aiohttp.ClientSession(timeout=timeout)
+            
+            try:
+                async with session.post(
+                    url=f"{openhands_base_url}/process",
+                    headers={"Content-Type": "application/json"},
+                    json=request_data,
+                ) as resp:
+                    if resp.status == 200:
+                        data = await resp.json()
+                        out_message=data.get("messages",[])
+                        if out_message and len(out_message)>0:
+                            return data, False  # No retry needed
+                        else:
+                            return None, True  # Retry needed
+                    else:
+                        error_text = await resp.text()
+                        logger.error("Failed to process request:", error_text)
+                        return None, True  # Retry needed
+                        
+            finally:
+                await session.close()
+                
+        except asyncio.CancelledError:
+            success = await self.cancel_job(request_data["job_id"],openhands_base_url)
+            if not success:
+                logger.error(f"Failed to cancel job {request_data['job_id']}")
+
+            raise  # Re-raise to properly handle cancellation
+        except asyncio.TimeoutError as e:
+            logger.error(f"Timeout error sending message {message_index} to OpenHands: {e}")
+            success = await self.cancel_job(request_data["job_id"],openhands_base_url)
+            if not success:
+                logger.error(f"Failed to cancel job {request_data['job_id']}")
+
+            return None, True  # Retry needed
+        except Exception as e:
+            logger.error(f"Error sending message {message_index} to OpenHands: {e}")
+            success = await self.cancel_job(request_data["job_id"],openhands_base_url)
+            if not success:
+                logger.error(f"Failed to cancel job {request_data['job_id']}")
+
+            return None, True  # Retry needed
+
+
+
+    async def cancel_job(self, job_id: str,openhands_base_url:str):
+        """
+        Cancel a job by job_id
+        """
+        url = f"{openhands_base_url}/cancel"
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(url, json={"job_id": job_id}) as resp:
+                    if resp.status == 200:
+                        return True
+                    else:
+                        return False
+        except Exception as e:
+            return False
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/rollout/chat_template_manager.py b/trainer_integration/verl/verl_custom/nvidia/rollout/chat_template_manager.py
new file mode 100644
index 000000000..3f639bc13
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/rollout/chat_template_manager.py
@@ -0,0 +1,161 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Chat Template Manager for supporting multiple chat templates
+"""
+
+import logging
+from typing import Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class ChatTemplateManager:
+    """Manager for different chat templates"""
+    
+    def __init__(self):
+        self._templates: Dict[str, str] = {}
+        self._register_default_templates()
+    
+    def _register_default_templates(self):
+        """Register default chat templates"""
+        # Qwen3 chat template with generation support
+        self._templates["qwen3_chat_template_generation"] = (
+            "{%- if tools %}"
+            "{{'<|im_start|>system\\n' }}"
+            "{%- if messages[0].role == 'system' %}"
+            "{{ messages[0].content + '\\n\\n' }}"
+            "{%- endif %}"
+            "{{'# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n"
+            "You are provided with function signatures within <tools></tools> XML tags:\\n<tools>' }}"
+            "{%- for tool in tools %}"
+            "{{ '\\n' + (tool | tojson) }}"
+            "{%- endfor %}"
+            "{{ '\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:"
+            "\\n<tool_call>\\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\\n</tool_call><|im_end|>\\n' }}"
+            "{%- else %}"
+            "{%- if messages[0].role == 'system' %}"
+            "{{'<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}"
+            "{%- endif %}"
+            "{%- endif %}"
+            "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}"
+            "{%- for message in messages[::-1] %}"
+            "{%- set index = (messages|length - 1) - loop.index0 %}"
+            "{%- if ns.multi_step_tool and message.role == 'user' and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}"
+            "{%- set ns.multi_step_tool = false %}"
+            "{%- set ns.last_query_index = index %}"
+            "{%- endif %}"
+            "{%- endfor %}"
+            "{%- for message in messages %}"
+            "{%- if message.content is string %}"
+            "{%- set content = message.content %}"
+            "{%- else %}"
+            "{%- set content = '' %}"
+            "{%- endif %}"
+            "{%- if (message.role == 'user') or (message.role == 'system' and not loop.first) %}"
+            "{{'<|im_start|>' + message.role + '\\n' + content + '<|im_end|>\\n' }}"
+            "{%- elif message.role == 'assistant' %}"
+            "{{'<|im_start|>' + message['role'] + '\n'}}"
+            "{% generation %}"
+            "{%- set reasoning_content = '' %}"
+            "{%- if message.reasoning_content is string %}"
+            "{%- set reasoning_content = message.reasoning_content %}"
+            "{%- else %}"
+            "{%- if '</think>' in content %}"
+            "{%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}"
+            "{%- set content = content.split('</think>')[-1].lstrip('\\n') %}"
+            "{%- endif %}"
+            "{%- endif %}"
+            "{%- if loop.index0 > ns.last_query_index %}"
+            "{%- if loop.last or (not loop.last and reasoning_content) %}"
+            "{{ '<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}"
+            "{%- else %}"
+            "{{ content }}"
+            "{%- endif %}"
+            "{%- else %}"
+            "{{ content }}"
+            "{%- endif %}"
+            "{%- if message.tool_calls %}"
+            "{%- for tool_call in message.tool_calls %}"
+            "{%- if (loop.first and content) or (not loop.first) %}"
+            "{{'\\n' }}"
+            "{%- endif %}"
+            "{%- if tool_call.function %}"
+            "{%- set tool_call = tool_call.function %}"
+            "{%- endif %}"
+            "{{'<tool_call>\\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' + (tool_call.arguments if tool_call.arguments is string else (tool_call.arguments | tojson)) + '}\\n</tool_call>' }}"
+            "{%- endfor %}"
+            "{%- endif %}"
+            "{{'<|im_end|>' }}"
+            "{% endgeneration %}"
+            "{{'\n' }}"
+            "{%- elif message.role == 'tool' %}"
+            "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}"
+            "{{'<|im_start|>user' }}"
+            "{%- endif %}"
+            "{{'\\n<tool_response>\\n' + content + '\\n</tool_response>' }}"
+            "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}"
+            "{{'<|im_end|>\\n' }}"
+            "{%- endif %}"
+            "{%- endif %}"
+            "{%- endfor %}"
+            "{%- if add_generation_prompt %}"
+            "{{'<|im_start|>assistant\\n' }}"
+            "{%- if enable_thinking is defined and enable_thinking is false %}"
+            "{{'<think>\\n\\n</think>\\n\\n' }}"
+            "{%- endif %}"
+            "{%- endif %}"
+        )
+        
+        #Some other templates ...
+    
+    def register_template(self, name: str, template: str):
+        """Register a new chat template"""
+        self._templates[name] = template
+        logger.info(f"Registered chat template: {name}")
+    
+    def get_template(self, name: str) -> Optional[str]:
+        """Get a chat template by name"""
+        if name not in self._templates:
+            logger.warning(f"Chat template '{name}' not found. Available templates: {list(self._templates.keys())}")
+            # Return default template if not found
+            return self._templates.get("qwen3_chat_template_generation")
+        return self._templates[name]
+    
+    def list_templates(self) -> list:
+        """List all available template names"""
+        return list(self._templates.keys())
+    
+    def has_template(self, name: str) -> bool:
+        """Check if a template exists"""
+        return name in self._templates
+
+
+# Global instance
+chat_template_manager = ChatTemplateManager()
+
+
+def get_chat_template(template_name: str) -> str:
+    """Get chat template by name"""
+    return chat_template_manager.get_template(template_name)
+
+
+def register_chat_template(template_name: str, template: str):
+    """Register a new chat template"""
+    chat_template_manager.register_template(template_name, template)
+
+
+# Backward compatibility
+qwen3_chat_template_generation = chat_template_manager.get_template("qwen3_chat_template_generation") 
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/rollout/utils.py b/trainer_integration/verl/verl_custom/nvidia/rollout/utils.py
new file mode 100644
index 000000000..928153546
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/rollout/utils.py
@@ -0,0 +1,115 @@
+import torch
+import logging
+
+logger = logging.getLogger(__name__)
+
+def convert_right_padding_to_left(tokenizer, input_ids, attention_mask, device, max_len=None):
+    """
+    Converts right-padded tensors to left-padded tensors with optional custom length.
+    
+    Args:
+        tokenizer: The tokenizer object with pad_token_id attribute
+        input_ids (torch.Tensor): Right-padded input IDs tensor of shape [batch_size, seq_length]
+        attention_mask (torch.Tensor): Right-padded attention mask tensor of shape [batch_size, seq_length]
+        device: The device to place the new tensors on
+        max_len (int, optional): The desired maximum length of the returned tensors.
+                                If None, uses the original sequence length.
+    
+    Returns:
+        tuple: (left_padded_input_ids, left_padded_attention_mask)
+    """
+    batch_size, orig_seq_length = input_ids.size()
+    
+    # Use original length if max_len is not specified
+    seq_length = max_len if max_len is not None else orig_seq_length
+    
+    # Create new tensors with the desired size
+    left_padded_input_ids = torch.full((batch_size, seq_length), 
+                                     tokenizer.pad_token_id, 
+                                     dtype=input_ids.dtype, 
+                                     device=device)
+    left_padded_attention_mask = torch.zeros((batch_size, seq_length), 
+                                           dtype=attention_mask.dtype, 
+                                           device=device)
+    
+    for i in range(batch_size):
+        # Get the non-padded length of this sequence
+        seq_len = attention_mask[i].sum().item()
+        
+        # Trim sequence if it's longer than max_len
+        if seq_len > seq_length:
+            logger.warning(f"Trimming sequence length from {seq_len} to {seq_length}")
+            seq_len = seq_length
+        
+        # Calculate the offset for left padding
+        offset = seq_length - seq_len
+        
+        # Copy the non-padded tokens to the end
+        left_padded_input_ids[i, offset:] = input_ids[i, :seq_len]
+        left_padded_attention_mask[i, offset:] = 1  # Set attention mask for non-padding tokens
+    
+    return left_padded_input_ids, left_padded_attention_mask
+
+def pad_to_max_length_right(tokenizer, encodings, max_length, device):
+    """
+    Pads tokenizer outputs to a specific maximum length with configurable padding side.
+    
+    Args:
+        tokenizer: The tokenizer object with pad_token_id attribute
+        encodings (dict): Dictionary containing 'input_ids', 'attention_mask', and optionally 'assistant_masks'
+        max_length (int): The desired maximum length to pad to
+        device: The device to place the tensors on
+        
+    Returns:
+        dict: Dictionary with padded tensors for 'input_ids', 'attention_mask', and 'assistant_masks' if present
+    """
+    batch_size = len(encodings['input_ids'])
+    
+    # Initialize output tensors
+    padded_input_ids = torch.full((batch_size, max_length), 
+                                tokenizer.pad_token_id, 
+                                dtype=torch.long, 
+                                device=device)
+    padded_attention_mask = torch.zeros((batch_size, max_length), 
+                                      dtype=torch.long, 
+                                      device=device)
+    padded_assistant_mask = torch.zeros((batch_size, max_length), 
+                                          dtype=torch.long, 
+                                          device=device)
+    padded_log_probs = torch.zeros((batch_size, max_length), 
+                                      dtype=torch.float, 
+                                      device=device)
+    
+    # Fill tensors with actual values
+    num_trimmed = 0
+    for i in range(batch_size):
+        seq_len = encodings["attention_mask"][i].sum().item() if isinstance(encodings["attention_mask"][i], torch.Tensor) else sum(encodings["attention_mask"][i])
+        # Trim if longer than max_length
+        actual_len = min(seq_len, max_length)
+        if seq_len > max_length:
+            logger.warning(
+                f"Trimming sequence length from {seq_len} to {actual_len} for batch item {i}"
+            )
+            num_trimmed += 1
+        
+        # Right padding - copy sequence data to the beginning
+        padded_input_ids[i, :actual_len] = torch.tensor(encodings['input_ids'][i][:actual_len], device=device)
+        padded_attention_mask[i, :actual_len] = torch.tensor(encodings['attention_mask'][i][:actual_len], device=device)
+        padded_assistant_mask[i, :actual_len] = torch.tensor(encodings['assistant_masks'][i][:actual_len], device=device)
+        padded_log_probs[i, :actual_len] = torch.tensor(encodings['log_probs'][i][:actual_len], device=device)
+
+    logger.info(f"Trimmed {num_trimmed*100 / max(batch_size, 1)}% of samples in the batch of size {batch_size}")
+    return padded_input_ids, padded_attention_mask, padded_assistant_mask, padded_log_probs
+
+
+
+import hashlib
+import uuid
+def get_unique_id(instance,existing_ids=None):
+    base = f'{instance["instance_id"]}_{instance["trajectory_id"]}'
+    base_hash = hashlib.sha256(base.encode('utf-8')).hexdigest()[:16]
+    while True:
+        rand = uuid.uuid4().hex[:8]
+        uid = f'{base_hash}_{rand}'
+        if uid not in existing_ids:
+            return uid
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/rollout/vllm_api_server.py b/trainer_integration/verl/verl_custom/nvidia/rollout/vllm_api_server.py
new file mode 100644
index 000000000..e39ebc18d
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/rollout/vllm_api_server.py
@@ -0,0 +1,184 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+NOTE: This API server is used only for demonstrating usage of AsyncEngine
+and simple performance benchmarks. It is not intended for production use.
+For production use, we recommend using our OpenAI compatible server.
+We are also not going to accept PRs modifying this file, please
+change `vllm/entrypoints/openai/api_server.py` instead.
+"""
+
+import asyncio
+import ssl
+from argparse import Namespace
+from typing import Any, Optional
+
+import vllm.envs as envs
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse, Response
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.entrypoints.launcher import serve_http
+from vllm.entrypoints.utils import with_cancellation
+from vllm.inputs import TokensPrompt
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.usage.usage_lib import UsageContext
+from vllm.utils import FlexibleArgumentParser, random_uuid, set_ulimit
+from vllm.version import __version__ as VLLM_VERSION
+
+logger = init_logger('vllm.nvidia.api_server')
+
+app = FastAPI()
+engine = None
+
+
+@app.get('/health')
+async def health() -> Response:
+    """Health check."""
+    return Response(status_code=200)
+
+
+@app.post('/generate')
+async def generate(request: Request) -> Response:
+    """Generate completion for the request.
+
+    The request should be a JSON object with the following fields:
+    - prompt: the prompt to use for the generation.
+    - stream: whether to stream the results or not.
+    - other fields: the sampling parameters (See `SamplingParams` for details).
+    """
+    request_dict = await request.json()
+    return await _generate(request_dict, raw_request=request)
+
+
+@with_cancellation
+async def _generate(request_dict: dict, raw_request: Request) -> list[int]:
+    prompt_ids = request_dict.pop('prompt_ids')
+    # Set logprobs to 0 to only return selected tokens
+    request_dict['logprobs'] = 0
+    sampling_params = SamplingParams(**request_dict)
+    request_id = random_uuid()
+
+    # Create prompt from token IDs
+    prompt = TokensPrompt(prompt_token_ids=prompt_ids)
+    generator = engine.generate(
+        prompt=prompt, sampling_params=sampling_params, request_id=request_id
+    )
+
+    # Get final response
+    final_res: Optional[RequestOutput] = None
+    try:
+        async for output in generator:
+            final_res = output
+    except asyncio.CancelledError:
+        return Response(status_code=499)
+
+    assert final_res is not None
+
+    def obtain_logprobs(logprobs):
+        if logprobs is None:
+            return None
+        log_probs = []
+        for d in logprobs:
+            cur_logprobs = list(d.values())
+            assert len(cur_logprobs) == 1, f"Expected 1 logprob per token when logprobs=0, but got {len(cur_logprobs)}"
+            log_probs.append(cur_logprobs[0].logprob)
+        return log_probs
+
+    ret = {
+        'response_ids': final_res.outputs[0].token_ids,
+        'logprobs': obtain_logprobs(final_res.outputs[0].logprobs),
+    }
+    return JSONResponse(ret)
+
+
+def build_app(args: Namespace) -> FastAPI:
+    global app
+
+    app.root_path = args.root_path
+    return app
+
+
+async def init_app(
+    args: Namespace,
+    llm_engine: Optional[AsyncLLMEngine] = None,
+) -> FastAPI:
+    app = build_app(args)
+
+    global engine
+
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+    engine = (
+        llm_engine
+        if llm_engine is not None
+        else AsyncLLMEngine.from_engine_args(
+            engine_args, usage_context=UsageContext.API_SERVER
+        )
+    )
+    app.state.engine_client = engine
+    return app
+
+
+async def run_server(
+    args: Namespace, llm_engine: Optional[AsyncLLMEngine] = None, **uvicorn_kwargs: Any
+) -> None:
+    logger.info('vLLM API server version %s', VLLM_VERSION)
+    logger.info('args: %s', args)
+
+    set_ulimit()
+
+    app = await init_app(args, llm_engine)
+    assert engine is not None
+
+    shutdown_task = await serve_http(
+        app,
+        sock=None,
+        enable_ssl_refresh=args.enable_ssl_refresh,
+        host=args.host,
+        port=args.port,
+        log_level=args.log_level,
+        timeout_keep_alive=getattr(envs, 'VLLM_HTTP_TIMEOUT_KEEP_ALIVE', 5),
+        ssl_keyfile=args.ssl_keyfile,
+        ssl_certfile=args.ssl_certfile,
+        ssl_ca_certs=args.ssl_ca_certs,
+        ssl_cert_reqs=args.ssl_cert_reqs,
+        **uvicorn_kwargs,
+    )
+
+    await shutdown_task
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser()
+    parser.add_argument('--host', type=str, default=None)
+    parser.add_argument('--port', type=parser.check_port, default=8000)
+    parser.add_argument('--ssl-keyfile', type=str, default=None)
+    parser.add_argument('--ssl-certfile', type=str, default=None)
+    parser.add_argument(
+        '--ssl-ca-certs', type=str, default=None, help='The CA certificates file'
+    )
+    parser.add_argument(
+        '--enable-ssl-refresh',
+        action='store_true',
+        default=False,
+        help='Refresh SSL Context when SSL certificate files change',
+    )
+    parser.add_argument(
+        '--ssl-cert-reqs',
+        type=int,
+        default=int(ssl.CERT_NONE),
+        help="Whether client certificate is required (see stdlib ssl module's)",
+    )
+    parser.add_argument(
+        '--root-path',
+        type=str,
+        default=None,
+        help='FastAPI root_path when app is behind a path based routing proxy',
+    )
+    parser.add_argument('--log-level', type=str, default='error')
+    parser = AsyncEngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    asyncio.run(run_server(args))
diff --git a/trainer_integration/verl/verl_custom/nvidia/rollout/vllm_async_server.py b/trainer_integration/verl/verl_custom/nvidia/rollout/vllm_async_server.py
new file mode 100644
index 000000000..d976aedda
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/rollout/vllm_async_server.py
@@ -0,0 +1,263 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import cloudpickle
+import ray
+from omegaconf import DictConfig
+from starlette.requests import Request
+from starlette.responses import JSONResponse, StreamingResponse, Response
+from vllm import SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ErrorResponse
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
+from vllm.v1.engine.async_llm import AsyncLLM
+from vllm.v1.executor.abstract import Executor
+from vllm.worker.worker_base import WorkerWrapperBase
+
+from verl.utils.fs import copy_to_local
+from verl_custom.nvidia.rollout.async_server import AsyncServerBase
+
+from vllm.inputs import TokensPrompt
+from vllm.outputs import RequestOutput
+from vllm.utils import random_uuid
+import asyncio
+
+logger = logging.getLogger(__file__)
+
+
+class ExternalRayDistributedExecutor(Executor):
+    """An executor that engines are launched by external ray actors."""
+
+    uses_ray: bool = False
+
+    def _init_executor(self) -> None:
+        assert self.vllm_config.instance_id is not None, "instance_id must be set for external ray actors."
+
+        fields = self.vllm_config.instance_id.split(":")
+        assert len(fields) == 4, f"instance_id: {self.vllm_config.instance_id} must be in the format of <namespace>:<wg_prefix>:<vllm_dp_size>:<vllm_dp_rank>."
+        namespace, wg_prefix, vllm_dp_size, vllm_dp_rank = fields[0], fields[1], int(fields[2]), int(fields[3])
+
+        # Make sure subprocess in same namespace as parent actor.
+        # actor name format: {name_prefix}WorkerDict_{pg_idx}:{local_rank}
+        ray.init(address="auto", namespace=namespace)
+        actor_names = [actor_name for actor_name in ray.util.list_named_actors() if actor_name.startswith(f"{wg_prefix}WorkerDict")]
+
+        vllm_tp_size = self.vllm_config.parallel_config.tensor_parallel_size
+        assert len(actor_names) == vllm_dp_size * vllm_tp_size, f"instance_id: {self.vllm_config.instance_id} has {len(actor_names)} actors, but vllm_dp_size: {vllm_dp_size} * vllm_tp_size: {vllm_tp_size} = {vllm_dp_size * vllm_tp_size} is expected."
+
+        def get_pg_index_and_local_rank(actor_name) -> Tuple[int, int]:
+            fields = actor_name.split(":")
+            assert len(fields) == 2, f"invalid actor name: {actor_name}"
+            pg_index, local_rank = int(fields[0].split("_")[-1]), int(fields[1])
+            return pg_index, local_rank
+
+        # sort actor names by pg_index and local_rank
+        actor_names = sorted(actor_names, key=get_pg_index_and_local_rank)
+        actor_names = actor_names[vllm_dp_rank * vllm_tp_size : (vllm_dp_rank + 1) * vllm_tp_size]
+        self.workers: List[WorkerWrapperBase] = [ray.get_actor(actor_name) for actor_name in actor_names]
+        print(f"instance_id: {self.vllm_config.instance_id} initializes with external actors: {actor_names}")
+
+        kwargs = dict(
+            vllm_config=self.vllm_config,
+            local_rank=None,
+            rank=None,
+            distributed_init_method="env://",
+            is_driver_worker=True,
+        )
+        self.collective_rpc("init_worker", args=([kwargs],))
+        self.collective_rpc("init_device")
+        self.collective_rpc("load_model")
+        print(f"instance_id: {self.vllm_config.instance_id} initializes finished.")
+
+    def collective_rpc(
+        self,
+        method: Union[str, Callable],
+        timeout: Optional[float] = None,
+        args: Tuple = (),
+        kwargs: Optional[Dict[str, Any]] = None,
+    ) -> List[Any]:
+        # TODO(wuxibin): support ray compiled graph
+        if isinstance(method, str):
+            sent_method = method
+        else:
+            sent_method = cloudpickle.dumps(method)
+        del method
+
+        # ~3ms overhead per schedule step due to SchedulerOutput/ModelRunnerOutput serialization/deserialization.
+        outputs = ray.get([worker.execute_method.remote(sent_method, *args, **(kwargs or {})) for worker in self.workers])
+        return outputs
+
+    def check_health(self):
+        return
+
+
+@ray.remote(num_cpus=1)
+class AsyncvLLMServer(AsyncServerBase):
+    """
+    AsyncvLLMServer is a wrapper for AsyncLLM, it uses ExternalRayDistributedExecutor to launch engines
+    in hybrid rollout workers, i.e AsyncActorRolloutRefWorker.
+
+    AsyncvLLMServer works as follows:
+    1. Start FastAPI server first.
+    2. Initialize AsyncLLM with ExternalRayDistributedExecutor.
+    3. AsyncLLM spawn EngineCore in subprocess.
+    4. EngineCore initialize ExternalRayDistributedExecutor.
+    5. ExternalRayDistributedExecutor lookup its corresponding actors by name.
+    6. ExternalRayDistributedExecutor init executor: init_worker, init_device, load_model.
+
+    For vLLM AsyncLLM design, see: https://github.com/vllm-project/vllm/pull/9826
+    """
+
+    def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_prefix: str):
+        """
+        Args:
+            config: DictConfig.
+            vllm_dp_size: int, vllm data parallel size.
+            vllm_dp_rank: int, vllm data parallel rank.
+            wg_prefix: str, worker group prefix, used to lookup actors.
+        """
+        super().__init__()
+
+        self.config = config.actor_rollout_ref
+        self.vllm_dp_size = vllm_dp_size
+        self.vllm_dp_rank = vllm_dp_rank
+        self.wg_prefix = wg_prefix
+        self.engine: AsyncLLM = None
+        self.request_ids = set()
+
+    async def init_engine(self):
+        """Init vLLM AsyncLLM engine."""
+        config = self.config
+        model_path = config.model.path
+        model_name = "/".join(model_path.split("/")[-2:])
+        local_path = copy_to_local(model_path)
+        trust_remote_code = config.model.get("trust_remote_code", False)
+        config = config.rollout
+
+        tensor_parallel_size = config.get("tensor_model_parallel_size", 1)
+        max_num_batched_tokens = config.get("max_num_batched_tokens", 8192)
+        max_model_len = config.max_model_len if config.max_model_len else config.prompt_length + config.response_length
+        max_model_len = int(max_model_len)
+
+        # Override default generation config from hugging face model config,
+        # user can still override them by passing kwargs in each request.
+        kwargs = dict(
+            n=1,
+            logprobs=0,
+            max_new_tokens=config.response_length,
+        )
+        for k in config.keys():
+            if hasattr(SamplingParams(), str(k)):
+                kwargs[k] = config.get(k)
+        print(f"override_generation_config: {kwargs}")
+
+        engine_args = AsyncEngineArgs(
+            model=local_path,
+            enable_sleep_mode=True,
+            override_generation_config=kwargs,
+            tensor_parallel_size=tensor_parallel_size,
+            distributed_executor_backend=ExternalRayDistributedExecutor,
+            dtype=config.dtype,
+            enforce_eager=config.enforce_eager,
+            gpu_memory_utilization=config.gpu_memory_utilization,
+            disable_custom_all_reduce=True,
+            disable_mm_preprocessor_cache=True,
+            skip_tokenizer_init=False,
+            max_model_len=max_model_len,
+            load_format="auto",
+            disable_log_stats=config.disable_log_stats,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=config.enable_chunked_prefill,
+            enable_prefix_caching=True,
+            trust_remote_code=trust_remote_code,
+            seed=self.vllm_dp_rank,
+            logprobs_mode=config.get("logprobs_mode", "processed_logprobs"),
+        )
+
+        # init async llm engine
+        vllm_config = engine_args.create_engine_config()
+        namespace = ray.get_runtime_context().namespace
+        vllm_config.instance_id = f"{namespace}:{self.wg_prefix}:{self.vllm_dp_size}:{self.vllm_dp_rank}"
+        self.engine = AsyncLLM.from_vllm_config(vllm_config)
+
+    async def generate(self, raw_request: Request) -> list[int]:
+        request_dict = await raw_request.json()
+        # Set logprobs to 0 to only return selected tokens
+        request_dict['logprobs'] = 0
+
+        prompt_ids = request_dict.pop("prompt_ids")
+        sampling_params = SamplingParams(**request_dict)
+        request_id = random_uuid()
+        self.request_ids.add(request_id)
+        # Create prompt from token IDs
+        prompt = TokensPrompt(prompt_token_ids=prompt_ids)
+        generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id)
+
+        # Get final response
+        final_res: Optional[RequestOutput] = None
+        try:    
+            async for output in generator:
+                final_res = output
+        except asyncio.CancelledError:
+            return Response(status_code=499)
+        finally:
+            self.request_ids.remove(request_id)
+        
+        assert final_res is not None
+
+        def obtain_logprobs(logprobs):
+            if logprobs is None:
+                return None
+            log_probs = []
+            for d in logprobs:
+                cur_logprobs = list(d.values())
+                assert len(cur_logprobs) == 1, f"Expected 1 logprob per token when logprobs=0, but got {len(cur_logprobs)}"
+                log_probs.append(cur_logprobs[0].logprob)
+            return log_probs
+            
+        ret = {
+            "response_ids": final_res.outputs[0].token_ids,
+            "logprobs": obtain_logprobs(final_res.outputs[0].logprobs),
+        }
+        return JSONResponse(ret)
+
+    async def wake_up(self):
+        await self.engine.wake_up()
+
+    async def sleep(self):
+        # Abort all unfinished generation requests
+        await self._abort_requests()
+        # TODO: https://github.com/vllm-project/vllm/issues/17103
+        await self.engine.reset_prefix_cache()
+        await self.engine.sleep()
+
+    async def _abort_requests(self):
+        request_states = self.engine.output_processor.request_states
+        request_ids = list(request_states.keys())
+        print(f"abort {len(request_ids)} request_ids: {request_ids}")
+
+        # abort all unfinished generation requests
+        self.engine.output_processor.abort_requests(request_ids)
+        await self.engine.engine_core.abort_requests_async(request_ids)
+
+    def get_tokenizer(self):
+        """Get the tokenizer from the engine."""
+        if self.engine is None:
+            raise RuntimeError("Engine not initialized. Call init_engine() first.")
+        
+        return self.engine.tokenizer
diff --git a/trainer_integration/verl/verl_custom/nvidia/scripts/run_proagent_qwn3_4B_instruct.sh b/trainer_integration/verl/verl_custom/nvidia/scripts/run_proagent_qwn3_4B_instruct.sh
new file mode 100644
index 000000000..b84628758
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/scripts/run_proagent_qwn3_4B_instruct.sh
@@ -0,0 +1,114 @@
+PROJECT_NAME='ProAgent'
+EXPERIMENT_NAME='ProAgent-Qwen3-4B-instruct-training-GRPO'
+DATA_PATH="/path/to/data/parquet"
+SFT_MODEL_PATH='Qwen/Qwen3-4B-Instruct-2507'
+TOKENIZER_PATH='Qwen/Qwen3-4B-Instruct-2507'
+CKPT_PATH='/path/to/outputs'
+
+
+BATCH_SIZE=4
+MAX_NUM_ITERS=30
+NUM_TRAJ=4
+SAVE_FREQ=1
+OPENHANDS_NUM_WORKERS=70
+
+USE_KL_LOSS=True
+KL_LOSS_COEF=0.001
+KL_LOSS_TYPE=low_var_kl
+ENTROPY_COEFF=0
+CLIP_RATIO_LOW=0.2
+CLIP_RATIO_HIGH=0.2
+
+GPU_MEM_UTIL=0.8
+TP_SIZE=2
+# Assumes a h200 node
+# For 2xH100: change tp size -> 2, sequence parallel size -> 2, nnodes -> 2
+NNODES=1
+SP_SIZE=1
+TEMPERATURE=1.4
+TOP_P=0.95
+
+python3 -m verl_custom.trainer.main_ppo \
+    algorithm.adv_estimator=grpo \
+    data.train_files=["$DATA_PATH/train.parquet"] \
+    data.val_files=["$DATA_PATH/validation.parquet"] \
+    data.train_batch_size=$BATCH_SIZE \
+    +data.gen_batch_size=1 \
+    data.max_prompt_length=31232 \
+    data.max_response_length=1536 \
+    data.truncation='error' \
+    actor_rollout_ref.model.path=$SFT_MODEL_PATH \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.ppo_mini_batch_size=$BATCH_SIZE \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=$SP_SIZE \
+    actor_rollout_ref.actor.use_kl_loss=$USE_KL_LOSS \
+    actor_rollout_ref.actor.kl_loss_coef=$KL_LOSS_COEF \
+    actor_rollout_ref.actor.kl_loss_type=$KL_LOSS_TYPE \
+    actor_rollout_ref.actor.entropy_coeff=$ENTROPY_COEFF \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.fsdp_config.param_offload=True \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+    actor_rollout_ref.actor.clip_ratio_low=$CLIP_RATIO_LOW \
+    actor_rollout_ref.actor.clip_ratio_high=$CLIP_RATIO_HIGH \
+    actor_rollout_ref.actor.tis_imp_ratio_cap=2 \
+    +actor_rollout_ref.actor.use_error_mask=True \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.mode=async \
+    +actor_rollout_ref.rollout.logprobs_mode=processed_logprobs \
+    actor_rollout_ref.rollout.gpu_memory_utilization=$GPU_MEM_UTIL \
+    actor_rollout_ref.rollout.n=$NUM_TRAJ \
+    actor_rollout_ref.rollout.temperature=$TEMPERATURE \
+    actor_rollout_ref.rollout.top_p=$TOP_P \
+    +actor_rollout_ref.rollout.async_manager=openhands \
+    +actor_rollout_ref.rollout.max_iterations=$MAX_NUM_ITERS \
+    +actor_rollout_ref.rollout.enable_memory_saver=True \
+    +actor_rollout_ref.rollout.max_starting_message_length=12000 \
+    +actor_rollout_ref.rollout.remove_think_tokens=True \
+    +actor_rollout_ref.rollout.openhands_base_url=http://localhost:8006 \
+    +actor_rollout_ref.rollout.openhands_num_workers=$OPENHANDS_NUM_WORKERS \
+    +actor_rollout_ref.rollout.task_type=swegym \
+    +actor_rollout_ref.rollout.chat_template_name=qwen3_chat_template_generation \
+    +actor_rollout_ref.rollout.openhands_timeout=1000 \
+    +actor_rollout_ref.actor.masking=True \
+    actor_rollout_ref.rollout.multi_turn.enable=True \
+    actor_rollout_ref.rollout.multi_turn.format=hermes \
+    +actor_rollout_ref.rollout.multi_turn.agent=True \
+    +actor_rollout_ref.rollout.token_level_generation=True \
+    +actor_rollout_ref.rollout.custom_tokenizer=$TOKENIZER_PATH \
+    +actor_rollout_ref.rollout.rollout_save_dir=$CKPT_PATH/rollout_data \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    +actor_rollout_ref.rollout.debug=True \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=True \
+    reward_manager.type="swebench" \
+    +reward_manager.verifier.reward_coef=1.0 \
+    +reward_manager.debug=True \
+    algorithm.use_kl_in_reward=False \
+    trainer.critic_warmup=0 \
+    trainer.logger=['console','wandb'] \
+    trainer.project_name=$PROJECT_NAME \
+    trainer.experiment_name=$EXPERIMENT_NAME \
+    trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
+    trainer.resume_mode=auto \
+    trainer.max_actor_ckpt_to_keep=10 \
+    trainer.n_gpus_per_node=8 \
+    trainer.nnodes=$NNODES \
+    trainer.save_freq=$SAVE_FREQ \
+    trainer.val_before_train=False \
+    +data.dataloader_num_workers=1 \
+    +actor_rollout_ref.exchange_size=500000000 \
+    actor_rollout_ref.rollout.val_kwargs.n=2 \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.temperature=0.6 \
+    actor_rollout_ref.rollout.val_kwargs.top_p=0.95 \
+    +algorithm.filter_groups.enable=True \
+    trainer.test_freq=-1 \
+    +trainer.enable_pass_k_evaluation=True \
+    +trainer.pass_k_problem_id_strategy=input_hash \
+    trainer.total_epochs=100 $@
+
+rm -rf $CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME/
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/nvidia/utils/__init__.py b/trainer_integration/verl/verl_custom/nvidia/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/trainer_integration/verl/verl_custom/nvidia/utils/timer.py b/trainer_integration/verl/verl_custom/nvidia/utils/timer.py
new file mode 100644
index 000000000..87d87d710
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/utils/timer.py
@@ -0,0 +1,28 @@
+import sys
+import time
+
+
+class TimeoutChecker:
+    def __init__(self, initial_interval_hours=4, backoff_minutes=15):
+        super().__init__()
+        self.last_save_time = initial_interval_hours * 3600 - backoff_minutes * 60
+        self.start_time = time.time()
+        self.last_saved = False
+
+    def check_save(self):
+        # Flush
+        sys.stdout.flush()
+        sys.stderr.flush()
+
+        # Already saved after timeout
+        if self.last_saved:
+            return False
+
+        current_time = time.time()
+        elapsed_time = current_time - self.start_time
+        
+        if elapsed_time >= self.last_save_time:
+            self.last_saved = True
+            return True
+
+        return False
diff --git a/trainer_integration/verl/verl_custom/nvidia/utils/utils.py b/trainer_integration/verl/verl_custom/nvidia/utils/utils.py
new file mode 100644
index 000000000..f9c0c3e60
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/nvidia/utils/utils.py
@@ -0,0 +1,58 @@
+import torch
+
+def boost_high_score_advantages(advantages, scores, correct_sample_advantage_boost_value, correct_sample_advantage_boost_threshold):
+    """Boost advantages for high-scoring samples
+    
+    Args:
+        advantages: Original advantage values of shape [batch_size, response_length]
+        scores: Corresponding scores for each sample of shape [batch_size]
+        correct_sample_advantage_boost_value: Value to boost advantages for high-scoring samples
+        
+    Returns:
+        Modified advantage values with boosted values for high-scoring samples
+    """
+    # Use torch.isclose for floating point comparison with a small tolerance
+    high_score_mask = scores >= correct_sample_advantage_boost_threshold - 1e-5
+    high_score_mask = high_score_mask.unsqueeze(-1).expand_as(advantages)
+    
+    # Boost advantages for high-scoring samples
+    advantages = advantages + high_score_mask * correct_sample_advantage_boost_value
+    return advantages
+
+
+def apply_padding_mask(response_mask: torch.Tensor, is_padded: torch.Tensor, log_stats: bool = False) -> torch.Tensor:
+    """
+    Apply padding mask to response mask to exclude padded samples from loss computation.
+    
+    Args:
+        response_mask (torch.Tensor): Original response mask of shape (batch_size, response_length)
+        is_padded (torch.Tensor): Boolean tensor of shape (batch_size,) indicating padded samples
+        log_stats (bool): Whether to log masking statistics
+        
+    Returns:
+        torch.Tensor: Modified response mask with padded samples masked out
+    """
+    if is_padded is None:
+        return response_mask
+    
+    # Ensure is_padded is on the same device as response_mask
+    if isinstance(is_padded, (list, tuple)):
+        is_padded = torch.tensor(is_padded, device=response_mask.device, dtype=torch.bool)
+    elif not isinstance(is_padded, torch.Tensor):
+        is_padded = torch.tensor(is_padded, device=response_mask.device, dtype=torch.bool)
+    else:
+        is_padded = is_padded.to(response_mask.device).bool()
+    
+    # Log statistics if requested
+    if log_stats:
+        num_padded = is_padded.sum().item()
+        total_samples = is_padded.size(0)
+        if num_padded > 0:
+            print(f"[PADDING MASK] Masking {num_padded}/{total_samples} samples ({num_padded/total_samples*100:.1f}%) due to padding")
+    
+    # Apply mask: set response_mask to 0 for padded samples
+    # (~is_padded) creates a mask where True means NOT padded
+    sample_mask = (~is_padded).float().unsqueeze(1)  # Shape: (batch_size, 1)
+    masked_response_mask = response_mask * sample_mask
+    
+    return masked_response_mask
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/trainer/__init__.py b/trainer_integration/verl/verl_custom/trainer/__init__.py
new file mode 100644
index 000000000..1ce90c5eb
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/trainer/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/trainer_integration/verl/verl_custom/trainer/config/evaluation.yaml b/trainer_integration/verl/verl_custom/trainer/config/evaluation.yaml
new file mode 100644
index 000000000..efca03da4
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/trainer/config/evaluation.yaml
@@ -0,0 +1,14 @@
+data:
+  path: /tmp/math_Qwen2-7B-Instruct.parquet
+  prompt_key: prompt
+  response_key: responses
+  data_source_key: data_source
+  reward_model_key: reward_model
+
+custom_reward_function:
+  path: null
+  name: compute_score
+
+ray_init:
+  num_cpus: null # `None` means using all CPUs, which might cause hang if limited in systems like SLURM. Please set to a number allowed then.
+  timeline_json_file: null
diff --git a/trainer_integration/verl/verl_custom/trainer/config/generation.yaml b/trainer_integration/verl/verl_custom/trainer/config/generation.yaml
new file mode 100644
index 000000000..047fe46ae
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/trainer/config/generation.yaml
@@ -0,0 +1,52 @@
+trainer:
+  nnodes: 1
+  n_gpus_per_node: 8
+
+data:
+  path: ~/data/rlhf/math/test.parquet
+  prompt_key: prompt
+  n_samples: 5
+  output_path: /opt/tiger/math_Qwen2-7B-Instruct.parquet
+  batch_size: 128
+
+model:
+  path: ~/models/Qwen2-7B-Instruct
+  external_lib: null
+rollout:
+  name: vllm
+  mode: sync # sync: LLM, async: AsyncLLM
+  temperature: 1.0
+  top_k: 50 # 0 for hf rollout, -1 for vllm rollout
+  top_p: 0.7
+  prompt_length: 1536
+  response_length: 512
+  # for vllm rollout
+  dtype: bfloat16 # should align with FSDP
+  gpu_memory_utilization: 0.5
+  ignore_eos: False
+  enforce_eager: True
+  free_cache_engine: True
+  load_format: dummy_dtensor
+  tensor_model_parallel_size: 1
+  max_num_batched_tokens: 8192
+  max_model_len: null
+  max_num_seqs: 1024
+  log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+  log_prob_micro_batch_size_per_gpu: 8
+  # for fire vllm rollout
+  use_fire_sampling: False # enable FIRE https://arxiv.org/abs/2410.21236
+  # for hf rollout
+  do_sample: True
+  disable_log_stats: True
+  enable_chunked_prefill: True
+  n: 1
+  calculate_log_probs: False
+actor:
+  strategy: fsdp  # This is for backward-compatibility
+  ulysses_sequence_parallel_size: 1 # sp size
+  fsdp_config:
+    fsdp_size: -1
+
+ray_init:
+  num_cpus: null # `None` means using all CPUs, which might cause hang if limited in systems like SLURM. Please set to a number allowed then.
+  timeline_json_file: null
diff --git a/trainer_integration/verl/verl_custom/trainer/config/ppo_megatron_trainer.yaml b/trainer_integration/verl/verl_custom/trainer/config/ppo_megatron_trainer.yaml
new file mode 100644
index 000000000..b2af777fb
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/trainer/config/ppo_megatron_trainer.yaml
@@ -0,0 +1,336 @@
+data:
+  tokenizer: null
+  train_files: ~/data/rlhf/gsm8k/train.parquet
+  val_files: ~/data/rlhf/gsm8k/test.parquet
+  prompt_key: prompt
+  reward_fn_key: data_source
+  max_prompt_length: 512
+  max_response_length: 512
+  train_batch_size: 1024
+  val_batch_size: null # DEPRECATED: Validation datasets are sent to inference engines as a whole batch, which will schedule the memory themselves
+  return_raw_input_ids: False  # This should be set to true when the tokenizer between policy and rm differs
+  return_raw_chat: False
+  return_full_prompt: False
+  shuffle: True
+  filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You cat set the filter_overlong_prompts_workers to use multiprocessing to speed up.
+  filter_overlong_prompts_workers: 1
+  truncation: error
+  trust_remote_code: False  # main_ppo will check this config to determine whether to use remote code for tokenizer
+  custom_cls:
+      path: null
+      name: null
+
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    external_lib: null
+    override_config:
+      model_config: {}
+      moe_config:
+        freeze_moe_router: False
+    enable_gradient_checkpointing: False
+    gradient_checkpointing_kwargs:
+      ## Activation Checkpointing
+      activations_checkpoint_method: null # 'uniform', 'block'; not used with 'selective'
+      # 'uniform' divides the total number of transformer layers and checkpoints the input activation of each chunk
+      # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+      activations_checkpoint_granularity: null # 'selective' or 'full'
+      # 'full' will checkpoint the entire transformer layer and 'selective' only checkpoints memory intensive part of attention
+      activations_checkpoint_num_layers: null # not used with 'selective'
+    trust_remote_code: False
+  actor:
+    strategy: megatron  # This is for backward-compatibility
+    ppo_mini_batch_size: 256
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: null
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    use_torch_compile: True # False to disable torch compile
+    # pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
+    clip_ratio: 0.2 # default value if clip_ratio_low and clip_ratio_high are not specified
+    clip_ratio_low: 0.2
+    clip_ratio_high: 0.2
+    clip_ratio_c: 3.0 # lower bound of the value for Dual-clip PPO from https://arxiv.org/pdf/1912.09729
+    loss_agg_mode: "token-mean" # / "seq-mean-token-sum" / "seq-mean-token-mean"
+    # NOTE: "token-mean" is the default behavior
+    entropy_coeff: 0
+    use_kl_loss: False # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    data_loader_seed: null
+    shuffle: False
+    optim:
+      optimizer: adam
+      lr: 1e-6
+      clip_grad: 1.0
+      total_training_steps: -1  # must be override by program
+      lr_warmup_init: 0.0  # initial learning rate for warmup, default to 0.0
+      lr_warmup_steps: null # Prioritized. None, 0 or Negative values mean delegating to lr_warmup_steps_ratio.
+      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
+      lr_decay_steps: null
+      lr_decay_style: linear # select from constant/linear/cosine/inverse_square_root
+      min_lr: 0.0 # minimum learning rate, default to 0.0
+      weight_decay: 0.01
+      weight_decay_incr_style: constant # select from constant/linear/cosine
+      lr_wsd_decay_style: exponential # select from constant/exponential/cosine
+      lr_wsd_decay_steps: null
+      use_checkpoint_opt_param_scheduler: False # use checkpoint optimizer parameter scheduler
+    megatron:
+      param_offload: False
+      grad_offload: False
+      optimizer_offload: False
+      tensor_model_parallel_size: 1
+      expert_model_parallel_size: 1
+      expert_tensor_parallel_size: null
+      pipeline_model_parallel_size: 1
+      virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests
+      context_parallel_size: 1
+      sequence_parallel: True
+      use_distributed_optimizer: True
+      use_dist_checkpointing: False
+      dist_checkpointing_path: null
+      seed: 42
+      override_transformer_config: {} # additional transformer config like: num_layers_in_first(/last)_pipeline_stage
+    profile: # profile the actor model in `update_policy` 
+      use_profile: False # open it when you want to profile the actor model
+      profile_ranks: null # list, you can specify the ranks to profile
+      step_start: -1 # start step in update_policy 
+      step_end: -1 # end step 
+      save_path: null # the path to save the profile result
+    load_weight: True
+    checkpoint:
+      contents: ['model', 'optimizer', 'extra']  # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
+  ref:
+    strategy: megatron
+    use_torch_compile: ${actor_rollout_ref.actor.use_torch_compile}
+    megatron:
+      param_offload: False
+      tensor_model_parallel_size: 1
+      expert_model_parallel_size: 1
+      expert_tensor_parallel_size: None
+      pipeline_model_parallel_size: 1
+      virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests
+      context_parallel_size: 1
+      sequence_parallel: True
+      use_distributed_optimizer: False
+      use_dist_checkpointing: False
+      dist_checkpointing_path: null
+      seed: ${actor_rollout_ref.actor.megatron.seed}
+      override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config}
+    profile:
+      use_profile: False
+      profile_ranks: null
+      step_start: -1
+      step_end: -1
+      save_path: null
+    load_weight: True
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: null
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+  rollout:
+    name: vllm
+    mode: sync # sync: LLM, async: AsyncLLM
+    temperature: 1.0
+    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    prompt_length: ${data.max_prompt_length}  # for xperf_gpt
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16 # should align with FSDP
+    gpu_memory_utilization: 0.5
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_megatron
+    tensor_model_parallel_size: 1
+    max_num_batched_tokens: 8192
+    max_model_len: null
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: null
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: False # could get higher throughput
+    # for hf rollout
+    do_sample: True
+    layer_name_map:
+      qkv_layer_name: qkv
+      gate_proj_layer_name: gate_up
+    # number of responses (i.e. num sample times)
+    n: 1
+    # support logging rollout prob
+    calculate_log_probs: False
+    
+    engine_kwargs: # inference engine parameters
+      vllm:
+        swap_space: null # null means "use the engine default value" (usually 4 GB), setting it to, e.g., 32 means 32 GB
+      sglang:
+        attention_backend: null # null means use the engine default value, available options: flashinfer, triton, flashmla
+    val_kwargs:
+      # sampling parameters for validation
+      top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+      top_p: 1.0
+      temperature: 0
+      n: 1
+      do_sample: False # default eager for validation
+    multi_turn: 
+      enable: False  # set to True for multi-turn tool interaction tasks; should set rollout.name to sglang as well
+      max_turns: null  # null for no limit (default max_length // 3)
+      tool_config_path: null  # null for no tool
+      format: chatml  # chatml, more formats will be supported in the future
+
+critic:
+  rollout_n: ${actor_rollout_ref.rollout.n}
+  strategy: megatron
+  optim:
+    optimizer: adam
+    lr: 1e-6
+    clip_grad: 1.0
+    total_training_steps: -1  # must be override by program
+    lr_warmup_init: 0.0  # initial learning rate for warmup, default to 0.0
+    lr_warmup_steps: null # Prioritized. None, 0 or Negative values mean delegating to lr_warmup_steps_ratio.
+    lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
+    lr_decay_steps: null
+    lr_decay_style: linear # select from constant/linear/cosine/inverse_square_root
+    min_lr: 0.0 # minimum learning rate, default to 0.0
+    weight_decay: 0.01
+    weight_decay_incr_style: constant # select from constant/linear/cosine
+    lr_wsd_decay_style: exponential # select from constant/exponential/cosine
+    lr_wsd_decay_steps: null
+    use_checkpoint_opt_param_scheduler: False # use checkpoint optimizer parameter scheduler
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config:
+      model_config: {}
+      moe_config:
+        freeze_moe_router: False
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    trust_remote_code: False
+    enable_gradient_checkpointing: False
+    gradient_checkpointing_kwargs:
+      ## Activation Checkpointing
+      activations_checkpoint_method: null
+      activations_checkpoint_granularity: null
+      activations_checkpoint_num_layers: null
+  megatron:
+    param_offload: False
+    grad_offload: False
+    optimizer_offload: False
+    tensor_model_parallel_size: 1
+    expert_model_parallel_size: 1
+    expert_tensor_parallel_size: null
+    pipeline_model_parallel_size: 1
+    virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests
+    context_parallel_size: 1
+    sequence_parallel: True
+    use_distributed_optimizer: True
+    use_dist_checkpointing: False
+    dist_checkpointing_path: null
+    seed: ${actor_rollout_ref.actor.megatron.seed}
+    override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config}
+  load_weight: True
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  data_loader_seed: ${actor_rollout_ref.actor.data_loader_seed}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  cliprange_value: 0.5
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+  loss_agg_mode: ${actor_rollout_ref.actor.loss_agg_mode}
+  checkpoint:
+    contents: ['model', 'optimizer', 'extra']  # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
+
+reward_model:
+  enable: False
+  strategy: megatron
+  megatron:
+    param_offload: False
+    tensor_model_parallel_size: 1
+    expert_model_parallel_size: 1
+    expert_tensor_parallel_size: null
+    pipeline_model_parallel_size: 1
+    virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests
+    context_parallel_size: 1
+    sequence_parallel: True
+    use_distributed_optimizer: False
+    use_dist_checkpointing: False
+    dist_checkpointing_path: null
+    seed: ${actor_rollout_ref.actor.megatron.seed}
+    override_transformer_config: {}
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}  # set this to null if the chat template is identical
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    trust_remote_code: False
+    external_lib: ${actor_rollout_ref.model.external_lib}
+  load_weight: True
+  micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
+  micro_batch_size_per_gpu: null
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+  max_length: null
+  reward_manager: naive
+  launch_reward_fn_async: False # custom reward function executed async on CPU, during log_prob
+  sandbox_fusion:
+    url: null # faas url to run code in cloud sandbox
+    max_concurrent: 64 # max concurrent requests to sandbox
+custom_reward_function:
+  path: null
+  name: compute_score
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: gae
+  norm_adv_by_std_in_grpo: True
+  use_kl_in_reward: False
+  kl_penalty: kl  # how to estimate kl divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+    horizon: 10000
+    target_kl: 0.1
+  use_pf_ppo: False
+  pf_ppo:
+    reweight_method: pow  # ["pow", "max_min", "max_random"]
+    weight_pow: 2.0
+
+trainer:
+  balance_batch: True
+  total_epochs: 30
+  total_training_steps: null
+  project_name: verl_examples
+  experiment_name: gsm8k
+  logger: ['console', 'wandb']
+  log_val_generations: 0
+  nnodes: 1
+  n_gpus_per_node: 8
+  save_freq: -1
+  # auto: find the last ckpt to resume. If can't find, start from scratch
+  resume_mode: auto # or disable or resume_path if resume_from_path is set
+  resume_from_path: null
+  del_local_ckpt_after_load: False
+  val_before_train: True
+  test_freq: -1
+  critic_warmup: 0
+  default_hdfs_dir: null
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+  max_actor_ckpt_to_keep: null
+  max_critic_ckpt_to_keep: null
+  # The timeout for ray worker group to wait for the register center to be ready
+  ray_wait_register_center_timeout: 300
+  device: cuda
+
+ray_init:
+  num_cpus: null # `None` means using all CPUs, which might cause hang if limited in systems like SLURM. Please set to a number allowed then.
+  timeline_json_file: null
diff --git a/trainer_integration/verl/verl_custom/trainer/config/ppo_trainer.yaml b/trainer_integration/verl/verl_custom/trainer/config/ppo_trainer.yaml
new file mode 100644
index 000000000..446dd66c7
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/trainer/config/ppo_trainer.yaml
@@ -0,0 +1,857 @@
+# Format checks enforced on CI:
+# 1. Comments must appear above each field.
+# 2. There must be a blank line between each field.
+# 3. Inline comments (after a field on the same line) are not allowed.
+# 4. Indentation level is respected for nested fields.
+
+# dataset config
+data:
+
+  # Tokenizer class or path. If null, it will be inferred from the model.
+  tokenizer: null
+
+  # Whether to use shared memory for data loading.
+  use_shm: False
+
+  # Training set parquet. Can be a list or a single file.
+  # The program will read all files into memory, so it can't be too large (< 100GB).
+  # The path can be either a local path or an HDFS path.
+  # For HDFS path, we provide utils to download it to DRAM and convert it to a local path.
+  train_files: ~/data/rlhf/gsm8k/train.parquet
+
+  # Validation parquet. Can be a list or a single file.
+  val_files: ~/data/rlhf/gsm8k/test.parquet
+
+  # The field in the dataset where the prompt is located. Default is 'prompt'.
+  prompt_key: prompt
+
+  # The field used to select the reward function (if using different ones per example).
+  reward_fn_key: data_source
+
+  # Maximum prompt length. All prompts will be left-padded to this length.
+  # An error will be reported if the length is too long.
+  max_prompt_length: 512
+
+  # Maximum response length. Rollout in RL algorithms (e.g. PPO) generates up to this length.
+  max_response_length: 512
+
+  # Batch size sampled for one training iteration of different RL algorithms.
+  train_batch_size: 1024
+
+  # Batch size used during validation. Can be null.
+  val_batch_size: null
+
+  # Whether to return the original input_ids without adding chat template.
+  # This is used when the reward model's chat template differs from the policy.
+  # If using a model-based RM with different templates, this should be True.
+  return_raw_input_ids: False
+
+  # Whether to return the original chat (prompt) without applying chat template.
+  return_raw_chat: False
+
+  # Whether to return the full prompt with chat template.
+  return_full_prompt: False
+
+  # Whether to shuffle the data in the dataloader.
+  shuffle: True
+
+  # Whether to shuffle the validation set.
+  validation_shuffle: False
+
+  # Whether to filter overlong prompts.
+  filter_overlong_prompts: False
+
+  # Number of workers for filtering overlong prompts.
+  # For large-scale datasets, filtering can be time-consuming.
+  # Use multiprocessing to speed up. Default is 1.
+  filter_overlong_prompts_workers: 1
+
+  # Truncate the input_ids or prompt if they exceed max_prompt_length.
+  # Options: 'error', 'left', or 'right'. Default is 'error'.
+  truncation: error
+
+  # The field in the multi-modal dataset where the image is located. Default is 'images'.
+  image_key: images
+
+  # The field in the multi-modal dataset where the video is located.
+  video_key: videos
+
+  # If the remote tokenizer has a Python file, this flag determines whether to allow using it.
+  trust_remote_code: False
+
+  # Optional: specify a custom dataset class path and name if overriding default loading behavior.
+  custom_cls:
+
+    # The path to the file containing your customized dataset class. If not specified, pre-implemented dataset will be used.
+    path: null
+
+    # The name of the dataset class within the specified file.
+    name: null
+
+reward_manager:
+  type: prime
+  num_examine: 0
+  server_ip: localhost
+  use_remote_reward: False
+  max_concurrency: 1024  # recommend at least 64 for each node
+
+# config for actor, rollout and reference model
+actor_rollout_ref:
+
+  # Whether it's a hybrid engine, currently only supports hybrid engine
+  hybrid_engine: true
+
+  # common configs for the model
+  model:
+  
+    # Huggingface model path. This can be either local path or HDFS path.
+    path: ~/models/deepseek-llm-7b-chat
+
+    # Whether to use shared memory (SHM) for accelerating the loading of model weights
+    use_shm: false
+
+    # Additional Python packages to register huggingface models/tokenizers.
+    external_lib: null
+    
+    # Used to override model's original configurations, mainly dropout
+    override_config: {}
+    
+    # Enable gradient checkpointing for actor
+    enable_gradient_checkpointing: true
+    
+    # Enable activation offloading for actor
+    enable_activation_offload: false
+    
+    # Whether to remove padding tokens in inputs during training
+    use_remove_padding: false
+    
+    # Set to positive value to enable LoRA (e.g., 32)
+    lora_rank: 0
+    
+    # LoRA scaling factor
+    lora_alpha: 16
+    
+    # Target modules to apply LoRA. Options: "all-linear" or 
+    # [q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj]
+    target_modules: all-linear
+    
+    # Whether to use Liger for linear layer fusion
+    use_liger: false
+    
+    # Whether to use custom fused kernels (e.g., FlashAttention, fused MLP)
+    use_fused_kernels: false
+    
+    # Whether to enable loading a remote code model
+    trust_remote_code: false
+
+  # configs for the actor
+  actor:
+  
+    # fsdp, fsdp2 or megatron. fsdp backend used here.
+    strategy: fsdp
+    
+    # Split each sample into sub-batches of this size for PPO
+    ppo_mini_batch_size: 256
+    
+    # [Deprecated] Global micro batch size
+    ppo_micro_batch_size: null
+    
+    # Local per-GPU micro batch size
+    ppo_micro_batch_size_per_gpu: null
+    
+    # Whether to automatically adjust batch size at runtime
+    use_dynamic_bsz: false
+    
+    # Max tokens per GPU in one PPO batch; affects gradient accumulation
+    # Typically it should be: n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
+    
+    # Gradient clipping for actor updates
+    grad_clip: 1.0
+    
+    # PPO clip ratio
+    clip_ratio: 0.2
+    
+    # Lower bound for asymmetric clipping (used in dual-clip PPO)
+    clip_ratio_low: 0.2
+    
+    # Upper bound for asymmetric clipping (used in dual-clip PPO)
+    clip_ratio_high: 0.2
+    
+    # Constant C in Dual-clip PPO; clips when advantage < -C
+    clip_ratio_c: 3.0
+    
+    # Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
+    loss_agg_mode: token-mean
+    
+    # Policy loss type: "ppo" (standard PPO) or "gspo" (Geometric Sequence PPO)
+    policy_loss_type: ppo
+    
+    # Entropy regularization coefficient in PPO loss
+    entropy_coeff: 0
+    
+    # Whether to use KL loss instead of KL reward penalty. True for GRPO
+    use_kl_loss: false
+    
+    # Whether to use torch.compile()
+    use_torch_compile: true
+    
+    # KL loss coefficient when use_kl_loss is enabled. For GRPO
+    kl_loss_coef: 0.001
+    
+    # Type of KL divergence loss. Options: "kl"(k1), "abs", "mse"(k2), "low_var_kl"(k3), "full"
+    kl_loss_type: low_var_kl
+    
+    # Number of PPO epochs per batch
+    ppo_epochs: 1
+    
+    # Shuffle training data across PPO epochs
+    shuffle: false
+    
+    # Sequence parallelism size for Ulysses-style model parallelism
+    ulysses_sequence_parallel_size: 1
+
+    correct_sample_advantage_boost_value: 0  # value to boost advantages for high-scoring samples
+    correct_sample_advantage_boost_threshold: 1.0  # threshold to boost advantages for high-scoring samples
+
+    # Truncated Importance Sampling (TIS): https://fengyao.notion.site/off-policy-rl
+    # the truncation value C of truncated Importance Sampling (-1 for disable TIS)
+    tis_imp_ratio_cap: -1
+
+    # checkpoint configs
+    checkpoint:
+    
+      # What to include in saved checkpoints
+      # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
+      contents: ['model', 'optimizer', 'extra', 'hf_model']
+
+    # optimizer configs
+    optim:
+    
+      # Learning rate
+      lr: 1e-6
+      
+      # Warmup steps; negative value delegates to lr_warmup_steps_ratio
+      lr_warmup_steps: -1
+      
+      # Warmup steps ratio (used if lr_warmup_steps is negative)
+      lr_warmup_steps_ratio: 0.0
+      
+      # Minimum LR ratio for cosine schedule
+      min_lr_ratio: 0.0
+      
+      # Number of cosine cycles in LR schedule
+      num_cycles: 0.5
+      
+      # LR warmup style: "constant" or "cosine"
+      warmup_style: constant
+      
+      # Total training steps (must be overridden at runtime)
+      total_training_steps: -1
+      
+      # Weight decay
+      weight_decay: 0.01
+
+    # configs for FSDP
+    fsdp_config:
+
+      # policy for wrapping the model
+      wrap_policy:
+      
+        # Minimum number of parameters to trigger wrapping a layer with FSDP
+        min_num_params: 0
+        
+      # Whether to offload model parameters to CPU (trades speed for memory)
+      param_offload: false
+      
+      # Whether to offload optimizer state to CPU
+      optimizer_offload: false
+      
+      # Only for FSDP2: offload param/grad/optimizer during train
+      offload_policy: false
+      
+      # Only for FSDP2: Reshard after forward pass to reduce memory footprint
+      reshard_after_forward: true
+      
+      # Number of GPUs in each FSDP shard group; -1 means auto
+      fsdp_size: -1
+
+  # Reference model config.
+  # Reference model will be enabled when actor.use_kl_loss or/and algorithm.use_kl_in_reward is/are True.
+  ref:
+
+    # actor_rollout_ref.ref: FSDP config same as actor. For models larger than 7B, it’s recommended to turn on offload for ref by default
+    strategy: fsdp
+
+    # config for FSDP strategy
+    fsdp_config:
+
+      # whether to offload parameters in FSDP
+      param_offload: False
+
+      # whether to perform reshard after model forward to save memory.
+      # only for fsdp2, [True, False, int between 1 and fsdp_size]
+      reshard_after_forward: True
+
+      # the wrap policy for FSDP model
+      wrap_policy:
+
+        # minimum number of params in a wrapped module
+        min_num_params: 0
+
+    # whether to enable torch.compile
+    use_torch_compile: ${actor_rollout_ref.actor.use_torch_compile}
+
+    # [Will be deprecated, use log_prob_micro_batch_size_per_gpu]
+    # The batch size for one forward pass in the computation of log_prob. Global batch size.
+    log_prob_micro_batch_size: null
+
+    # The batch size for one forward pass in the computation of log_prob. Local batch size per GPU.
+    log_prob_micro_batch_size_per_gpu: null
+
+    # enable dynamic batch size (sequence packing) for log_prob computation
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+
+    # the max token length per GPU
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+
+    # sequence parallel size
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
+
+  # Rollout model config.
+  rollout:
+
+    # actor_rollout_ref.rollout.name: hf/vllm/sglang.
+    name: vllm
+
+    # sync: LLM, async: AsyncLLM
+    mode: sync
+
+    # Sampling temperature for rollout.
+    temperature: 1.0
+
+    # Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout.
+    top_k: -1
+
+    # Top-p sampling parameter. Default 1.0.
+    top_p: 1
+
+    # https://arxiv.org/abs/2410.21236
+    use_fire_sampling: False
+
+    # typically the same as data max prompt length
+    prompt_length: ${data.max_prompt_length}
+    
+    # typically the same as data max response length
+    response_length: ${data.max_response_length}
+
+    # for vllm rollout
+    # Rollout model parameters type. Align with actor model's FSDP/Megatron type.
+    dtype: bfloat16
+
+    # Fraction of GPU memory used by vLLM/SGLang for KV cache.
+    gpu_memory_utilization: 0.5
+
+    # Whether to ignore EOS and continue generating after EOS is hit.
+    ignore_eos: False
+
+    # Whether to disable CUDA graph. Default True to allow cache freeing.
+    enforce_eager: True
+
+    # Whether to free engine KVCache after generation. Set enforce_eager=True when enabled.
+    free_cache_engine: True
+
+    # Which loader to use for rollout model weights: dummy_dtensor, hf, megatron, etc.
+    # safetensors (for huge model, and set use_shm=True); dummy_dtensor: randomly init model weight
+    load_format: dummy_dtensor
+
+    # for huge model, layered summon can save memory (prevent OOM) but make it slower
+    layered_summon: False
+
+    # TP size for rollout. Only effective for vLLM.
+    tensor_model_parallel_size: 2
+
+    # max number of tokens in a batch
+    max_num_batched_tokens: 8192
+
+    # max length for rollout
+    max_model_len: null
+
+    # max length of sequences
+    max_num_seqs: 1024
+
+    # [Will be deprecated, use log_prob_micro_batch_size_per_gpu] The batch size for one forward pass in the computation of log_prob. Global batch size.
+    log_prob_micro_batch_size: null
+
+    # The batch size for one forward pass in the computation of log_prob. Local batch size per GPU.
+    log_prob_micro_batch_size_per_gpu: null
+
+    # enable dynamic batch size (sequence packing) for log_prob computation
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+
+    # max token length for log_prob computation
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+
+    # disable logging statistics
+    disable_log_stats: True
+
+    # may get higher throughput when set to True. When activated, Please increase max_num_batched_tokens or decrease max_model_len.
+    enable_chunked_prefill: True
+
+    # for hf rollout
+    # Whether to sample during training rollout. False uses greedy sampling.
+    do_sample: True
+
+    # number of responses (i.e. num sample times). > 1 for grpo
+    n: 1
+
+    # support logging rollout prob
+    calculate_log_probs: False 
+
+    # Extra inference engine arguments (vllm, sglang).
+    engine_kwargs:
+
+      # for vllm 
+      vllm:
+
+        # Swap space (in GB) used by inference engine. null uses default (e.g., 4 GB).
+        swap_space: null
+
+      # for sglang
+      sglang:
+
+        # The attention backend for sglang engine. Options: flashinfer, triton, flashmla, null for default.
+        attention_backend: null
+
+    # Sampling parameters used during validation.
+    val_kwargs:
+
+      # sampling parameters for validation
+      # Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout.
+      top_k: -1
+
+      # Top-p sampling parameter. Default 1.0.
+      top_p: 1.0
+
+      # Sampling temperature for rollout.
+      temperature: 0
+
+      # whether to repeat n times for validation
+      n: 1
+
+      # Whether to sample during training rollout. False uses greedy sampling.
+      do_sample: False
+
+    # Multi-turn interaction config for tools or chat.
+    multi_turn:
+
+      # set to True for multi-turn tool interaction tasks; should set rollout.name to sglang as well
+      enable: False
+
+      # null for no limit (default max_length // 3)
+      max_turns: null
+
+      # null for no tool
+      tool_config_path: null
+
+      # chatml, more formats will be supported in the future
+      format: chatml 
+
+      # null for default callback
+      completion_callback: null
+
+# configs for the critic
+critic:
+
+  # Number of rollouts per update (mirrors actor rollout_n)
+  rollout_n: ${actor_rollout_ref.rollout.n}
+
+  # fsdp or fsdp2 strategy used for critic model training
+  strategy: fsdp
+
+  # optimizer configs
+  optim:
+
+    # Learning rate
+    lr: 1e-5
+
+    # Warmup steps ratio; total steps will be injected at runtime
+    lr_warmup_steps_ratio: 0.
+
+    # Minimum LR ratio for cosine schedule
+    min_lr_ratio: null
+
+    # LR warmup style: "constant" or "cosine"
+    warmup_style: constant
+
+    # Total training steps (must be overridden at runtime)
+    total_training_steps: -1
+
+    # Weight decay
+    weight_decay: 0.01
+
+  # model config for the critic
+  model:
+
+    # Path to pretrained model weights
+    path: ~/models/deepseek-llm-7b-chat
+
+    # Whether to use shared memory for loading the model
+    use_shm: False
+
+    # Tokenizer path (defaults to actor's model path)
+    tokenizer_path: ${actor_rollout_ref.model.path}
+
+    # Hugging Face config override
+    override_config: { }
+
+    # External model implementation (optional)
+    external_lib: ${actor_rollout_ref.model.external_lib}
+
+    # Enable gradient checkpointing to save memory
+    enable_gradient_checkpointing: True
+
+    # Offload activations to CPU to reduce GPU memory usage
+    enable_activation_offload: False
+
+    # Use remove padding optimization (saves compute)
+    use_remove_padding: False
+
+    # Whether to trust remote code from Hugging Face models
+    trust_remote_code: ${actor_rollout_ref.model.trust_remote_code}
+
+    # FSDP-specific config
+    fsdp_config:
+
+      # Whether to offload model parameters to CPU
+      param_offload: False
+
+      # Whether to offload optimizer state to CPU
+      optimizer_offload: False
+
+      # Only for FSDP2: offload param/grad/optimizer during train
+      offload_policy: False
+
+      # Only for FSDP2: Reshard after forward pass to reduce memory footprint
+      reshard_after_forward: True
+
+      # Policy for wrapping layers with FSDP
+      wrap_policy:
+
+        # Minimum number of parameters to trigger wrapping
+        min_num_params: 0
+
+      # Number of GPUs in each FSDP shard group; -1 means auto
+      fsdp_size: -1
+
+    # Set to positive value to enable LoRA (e.g., 32)
+    lora_rank: 0
+
+    # LoRA scaling factor
+    lora_alpha: 16
+
+    # LoRA target modules: "all-linear" or list of linear projection layers
+    target_modules: all-linear
+
+  # PPO mini-batch size per update
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+
+  # [Deprecated] Global micro batch size
+  ppo_micro_batch_size: null
+
+  # Local per-GPU micro batch size
+  ppo_micro_batch_size_per_gpu: null
+
+  # Forward-only batch size (global)
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+
+  # Forward-only batch size (per GPU)
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+
+  # Whether to automatically adjust batch size at runtime
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+
+  # Max tokens per GPU in one PPO batch (doubled for critic)
+  ppo_max_token_len_per_gpu: 32768
+
+  # Max token length per GPU in forward pass
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+
+  # Sequence parallelism size for Ulysses-style model parallelism
+  ulysses_sequence_parallel_size: 1
+
+  # Number of PPO epochs per batch
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+
+  # Shuffle training data across PPO epochs
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+
+  # Gradient clipping for critic updates
+  grad_clip: 1.0
+
+  # PPO value function clipping range
+  cliprange_value: 0.5
+
+  # Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
+  loss_agg_mode: ${actor_rollout_ref.actor.loss_agg_mode}
+
+  # checkpoint configs
+  checkpoint:
+
+    # What to include in saved checkpoints
+    # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
+    contents: ['model', 'optimizer', 'extra', 'hf_model']
+
+# configs for the reward model
+reward_model:
+
+  # Whether to enable reward model. If False, we compute the reward only with the user-defined reward functions.
+  # In GSM8K and Math examples, we disable reward model.
+  # For RLHF alignment example using full_hh_rlhf, we utilize reward model to assess the responses.
+  # If False, the following parameters are not effective
+  enable: False
+
+  # FSDP strategy: "fsdp" or "fsdp2"
+  strategy: fsdp
+
+  # model config for reward scoring
+  model:
+
+    # Input tokenizer. If the reward model’s chat template is inconsistent with the policy,
+    # we need to first decode to plaintext, then apply the rm’s chat_template.
+    # Then score with RM. If chat_templates are consistent, it can be set to null.
+    input_tokenizer: ${actor_rollout_ref.model.path}
+
+    # RM’s HDFS path or local path. Note that RM only supports AutoModelForSequenceClassification.
+    # Other model types need to define their own RewardModelWorker and pass it from the code.
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+
+    # Whether to use shared memory for loading the model
+    use_shm: False
+
+    # External model implementation (optional)
+    external_lib: ${actor_rollout_ref.model.external_lib}
+
+    # Use remove padding optimization (saves compute)
+    use_remove_padding: False
+
+    # Whether to use fused reward kernels for speedup
+    use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
+
+    # Whether to enable loading a remote code model, default to False
+    trust_remote_code: False
+
+    # FSDP-specific config
+    fsdp_config:
+
+      # Policy for wrapping layers with FSDP
+      wrap_policy:
+
+        # Minimum number of parameters to trigger wrapping
+        min_num_params: 0
+
+      # Whether to offload model parameters to CPU
+      param_offload: False
+
+      # Only for FSDP2: Reshard after forward pass to reduce memory footprint
+      reshard_after_forward: True
+
+      # Number of GPUs in each FSDP shard group; -1 means auto
+      fsdp_size: -1
+
+  # [Deprecated] Global micro batch size
+  micro_batch_size: null
+
+  # Local per-GPU micro batch size
+  micro_batch_size_per_gpu: null
+
+  # Maximum sequence length to process for scoring
+  max_length: null
+
+  # Sequence parallelism size for Ulysses-style model parallelism
+  ulysses_sequence_parallel_size: 1
+
+  # Whether to dynamically adjust batch size at runtime
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+
+  # Maximum number of tokens per GPU in one forward pass
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+
+  # Reward Manager. This defines the mechanism of computing rule-based reward and handling different reward sources.
+  # Default is naive. If all verification functions are multiprocessing-safe,
+  # the reward manager can be set to prime for parallel verification.
+  reward_manager: ${reward_manager.type}
+
+  # Whether to launch custom reward function asynchronously during log_prob
+  launch_reward_fn_async: False
+
+  # Cloud/local sandbox fusion configuration for custom reward logic
+  sandbox_fusion:
+
+    # Cloud/local function URL for sandbox execution
+    url: null
+
+    # Max concurrent requests allowed to sandbox
+    max_concurrent: 64
+
+# custom reward function definition
+custom_reward_function:
+
+  # The path to the file containing your customized reward function.
+  # If not specified, pre-implemented reward functions will be used.
+  path: null
+
+  # The name of the reward function within the specified file. Default is 'compute_score'.
+  name: compute_score
+
+length_penalty:
+  length_penalty_type: none # none, linear, instance_linear, cosine
+  max_length: ${data.max_response_length} # used for linear and instance_linear
+  max_length_margin: -1 # used for cosine, must not be 0
+  repetition_ngram_size: 64 # used for cosine
+  repetition_penalty: -0.001 # used for cosine
+  min_value_correct: 0.5
+  max_value_correct: 1.0
+  min_value_wrong: 0.0
+  max_value_wrong: 0.5
+  binary_score_data_sources: ['codecontests', 'apps', 'codeforces', 'taco']
+  selected_data_sources: ['deepscaler', 'codecontests', 'apps', 'codeforces', 'taco']
+
+stop_properly_penalty:
+  penalty_coef: 0.1 # if the generation is not stopped properly, the score will be multiplied by this value
+
+
+# config for the algorithm
+algorithm:
+
+  # Discount factor for future rewards
+  gamma: 1.0
+
+  # Trade-off between bias and variance in the GAE estimator
+  lam: 1.0
+
+  # Advantage estimator type: "gae", "grpo", "reinforce_plus_plus", etc.
+  adv_estimator: gae
+
+  # Whether to normalize advantages by std (specific to GRPO)
+  norm_adv_by_std_in_grpo: True
+
+  # Whether to enable in-reward KL penalty
+  use_kl_in_reward: False
+
+  # How to estimate KL divergence: "kl", "abs", "mse", "low_var_kl", or "full"
+  kl_penalty: kl
+
+  # KL control configuration
+  kl_ctrl:
+
+    # KL control type: "fixed" or "adaptive"
+    type: fixed
+
+    # Initial coefficient for KL penalty
+    kl_coef: 0.001
+
+    # Horizon value for adaptive controller (if enabled)
+    horizon: 10000
+
+    # Target KL divergence (used for adaptive controller)
+    target_kl: 0.1
+
+  # Whether to enable preference feedback PPO
+  use_pf_ppo: False
+
+  # Preference feedback PPO settings
+  pf_ppo:
+
+    # Method for reweighting samples: "pow", "max_min", or "max_random"
+    reweight_method: pow
+
+    # Power used for weight scaling in "pow" method
+    weight_pow: 2.0
+
+# config for the trainer
+trainer:
+
+  # Whether to balance batch sizes across distributed workers
+  balance_batch: True
+
+  # Number of epochs in training
+  total_epochs: 30
+
+  # Total training steps (can be set explicitly or derived from epochs)
+  total_training_steps: null
+
+  # Project name for experiment tracking (e.g., wandb)
+  project_name: verl_examples
+
+  # Experiment name for run identification in tracking tools
+  experiment_name: gsm8k
+
+  # Logging backends to use: "console", "wandb", etc.
+  logger: [ 'console', 'wandb' ]
+
+  # Number of generations to log during validation
+  log_val_generations: 0
+
+  # Directory for logging rollout data; no dump if null
+  rollout_data_dir: null
+
+  # Directory for logging validation data; no dump if null
+  validation_data_dir: null
+
+  # Number of nodes used in the training
+  nnodes: 1
+
+  # Number of GPUs per node
+  n_gpus_per_node: 8
+
+  # Save frequency (by iteration) for model checkpoints
+  save_freq: -1
+
+  # Resume mode: "auto", "disable", or "resume_path"
+  # "auto": resume from last checkpoint if available
+  # "disable": start from scratch
+  # "resume_path": resume from a user-defined path
+  resume_mode: auto
+
+  # Path to resume training from (only used when resume_mode is "resume_path")
+  resume_from_path: null
+
+  # Whether to run validation before training begins
+  val_before_train: True
+
+  # Validation frequency (in training iterations)
+  test_freq: -1
+
+  # Number of iterations to warm up the critic before updating policy
+  critic_warmup: 0
+
+  # Default path to distributed filesystem for saving checkpoints
+  default_hdfs_dir: null
+
+  # Whether to delete local checkpoints after loading
+  del_local_ckpt_after_load: False
+
+  # Default local directory for saving checkpoints
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
+  # Maximum number of actor checkpoints to keep
+  max_actor_ckpt_to_keep: null
+
+  # Maximum number of critic checkpoints to keep
+  max_critic_ckpt_to_keep: null
+
+  # Timeout (in seconds) for Ray worker to wait for registration
+  ray_wait_register_center_timeout: 300
+
+  # Device to run training on (e.g., "cuda", "cpu")
+  device: cuda
+
+# configs related to ray initialization
+ray_init:
+
+  # Number of CPUs for Ray. Use a fixed number instead of null when using SLURM.
+  num_cpus: null
+
+  # Path to save Ray timeline JSON for performance profiling
+  timeline_json_file: null
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/trainer/main_ppo.py b/trainer_integration/verl/verl_custom/trainer/main_ppo.py
new file mode 100644
index 000000000..8752bfe1a
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/trainer/main_ppo.py
@@ -0,0 +1,276 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Note that we don't combine the main with ray_trainer as ray_trainer is used by other main.
+"""
+
+import hydra
+import ray
+from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
+
+from verl_custom.trainer.ppo.ray_trainer import RayPPOTrainer
+from verl_custom.trainer.ppo.ray_trainer_dapo import RayPPOTrainerDAPO
+from verl_custom.trainer.ppo.reward import load_reward_manager
+
+
+@hydra.main(config_path="config", config_name="ppo_trainer", version_base=None)
+def main(config):
+    run_ppo(config)
+
+
+# Define a function to run the PPO-like training process
+def run_ppo(config) -> None:
+    # Check if Ray is not initialized
+    if not ray.is_initialized():
+        # Initialize Ray with a local cluster configuration
+        # Set environment variables in the runtime environment to control tokenizer parallelism,
+        # NCCL debug level, VLLM logging level, and allow runtime LoRA updating
+        # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration
+        ray.init(
+            runtime_env={"env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN", "VLLM_LOGGING_LEVEL": "WARN", "VLLM_ALLOW_RUNTIME_LORA_UPDATING": "true", "VLLM_USE_V1": "1"}},
+            num_cpus=config.ray_init.num_cpus,
+        )
+
+    # Create a remote instance of the TaskRunner class, and
+    # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete
+    runner = TaskRunner.remote()
+    ray.get(runner.run.remote(config))
+
+    # [Optional] get the path of the timeline trace file from the configuration, default to None
+    # This file is used for performance analysis
+    timeline_json_file = config.ray_init.get("timeline_json_file", None)
+    if timeline_json_file:
+        ray.timeline(filename=timeline_json_file)
+
+
+@ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
+class TaskRunner:
+    def run(self, config):
+        # Print the initial configuration. `resolve=True` will evaluate symbolic values.
+        from pprint import pprint
+
+        from omegaconf import OmegaConf
+
+        from verl.utils.fs import copy_to_local
+
+        pprint(OmegaConf.to_container(config, resolve=True))
+        OmegaConf.resolve(config)
+
+        # Download the checkpoint from HDFS to the local machine.
+        # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on
+        local_path = copy_to_local(config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False))
+
+        # Instantiate the tokenizer and processor.
+        from verl.utils import hf_processor, hf_tokenizer
+
+        trust_remote_code = config.data.get("trust_remote_code", False)
+        tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
+        # Used for multimodal LLM, could be None
+        processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
+
+        # Version validation for vllm.
+        if config.actor_rollout_ref.rollout.name in ["vllm"]:
+            from verl.utils.vllm_utils import is_version_ge
+
+            if config.actor_rollout_ref.model.get("lora_rank", 0) > 0:
+                if not is_version_ge(pkg="vllm", minver="0.7.3"):
+                    raise NotImplementedError("PPO LoRA is not supported before vllm 0.7.3")
+
+        # Define worker classes based on the actor strategy.
+        if config.actor_rollout_ref.actor.strategy in ["fsdp", "fsdp2"]:
+            assert config.critic.strategy in ["fsdp", "fsdp2"]
+            from verl.single_controller.ray import RayWorkerGroup
+            from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
+
+            actor_rollout_cls = AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker
+            ray_worker_group_cls = RayWorkerGroup
+
+        elif config.actor_rollout_ref.actor.strategy == "megatron":
+            assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+            from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+            from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
+
+            actor_rollout_cls = AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker
+            ray_worker_group_cls = NVMegatronRayWorkerGroup
+
+        else:
+            raise NotImplementedError
+
+        from verl_custom.trainer.ppo.ray_trainer import ResourcePoolManager, Role
+
+        # Map roles to their corresponding remote worker classes.
+        role_worker_mapping = {
+            Role.ActorRollout: ray.remote(actor_rollout_cls),
+            Role.Critic: ray.remote(CriticWorker),
+        }
+
+        # Define the resource pool specification.
+        # Map roles to the resource pool.
+        global_pool_id = "global_pool"
+        resource_pool_spec = {
+            global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
+        }
+        mapping = {
+            Role.ActorRollout: global_pool_id,
+            Role.Critic: global_pool_id,
+        }
+
+        # We should adopt a multi-source reward function here:
+        # - for rule-based rm, we directly call a reward score
+        # - for model-based rm, we call a model
+        # - for code related prompt, we send to a sandbox if there are test cases
+        # finally, we combine all the rewards together
+        # The reward type depends on the tag of the data
+        if config.reward_model.enable:
+            if config.reward_model.strategy in ["fsdp", "fsdp2"]:
+                from verl.workers.fsdp_workers import RewardModelWorker
+            elif config.reward_model.strategy == "megatron":
+                from verl.workers.megatron_workers import RewardModelWorker
+            else:
+                raise NotImplementedError
+            role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
+            mapping[Role.RewardModel] = global_pool_id
+
+        # Add a reference policy worker if KL loss or KL reward is used.
+        if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+            role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
+            mapping[Role.RefPolicy] = global_pool_id
+
+        reward_manager_name = config.reward_manager.get("type", "naive")
+        if reward_manager_name == "naive":
+            from verl.workers.reward_manager import NaiveRewardManager
+
+            reward_manager_cls = NaiveRewardManager
+        elif reward_manager_name == "prime":
+            from verl_custom.nvidia.reward_manager import PrimeRewardManager
+
+            reward_manager_cls = PrimeRewardManager
+        elif reward_manager_name == "dapo":
+            from verl.workers.reward_manager import DAPORewardManager
+
+            reward_manager_cls = DAPORewardManager
+        elif reward_manager_name == "swebench":
+            from verl_custom.nvidia.reward_manager import SWEBenchRewardManager
+
+            reward_manager_cls = SWEBenchRewardManager
+        else:
+            raise NotImplementedError
+        
+        # enforce placement on head node
+        strategy = NodeAffinitySchedulingStrategy(node_id = ray.get_runtime_context().get_node_id(), soft = False)
+        reward_fn = reward_manager_cls.options(scheduling_strategy=strategy).remote(tokenizer=tokenizer, compute_score=None, config=config.reward_manager)
+        val_reward_fn = reward_fn
+
+        resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+
+        from verl_custom.utils.dataset.rl_dataset import collate_fn
+
+        # Create training and validation datasets.
+        train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor)
+        val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor)
+        train_sampler = create_rl_sampler(config.data, train_dataset)
+        trainer_cls = RayPPOTrainerDAPO if config.algorithm.get("filter_groups", {}).get("enable", False) else RayPPOTrainer
+        print(f"Using trainer: {trainer_cls.__name__}")
+        # Initialize the PPO trainer.
+        trainer = trainer_cls(
+            config=config,
+            tokenizer=tokenizer,
+            processor=processor,
+            role_worker_mapping=role_worker_mapping,
+            resource_pool_manager=resource_pool_manager,
+            ray_worker_group_cls=ray_worker_group_cls,
+            reward_fn=reward_fn,
+            val_reward_fn=val_reward_fn,
+            train_dataset=train_dataset,
+            val_dataset=val_dataset,
+            collate_fn=collate_fn,
+            train_sampler=train_sampler,
+            device_name=config.trainer.device,
+        )
+        # Initialize the workers of the trainer.
+        trainer.init_workers()
+        # Start the training process.
+        trainer.fit()
+
+
+def create_rl_dataset(data_paths, data_config, tokenizer, processor):
+    """Create a dataset.
+
+    Arguments:
+        data_paths: List of paths to data files.
+        data_config: The data config.
+        tokenizer (Tokenizer): The tokenizer.
+        processor (Processor): The processor.
+
+    Returns:
+        dataset (Dataset): The dataset.
+    """
+    from torch.utils.data import Dataset
+
+    from verl_custom.utils.dataset.rl_dataset import RLHFDataset
+
+    # Check if a custom dataset class is specified in the data configuration
+    # and if the path to the custom class is provided
+    if "custom_cls" in data_config and data_config.custom_cls.get("path", None) is not None:
+        from verl.utils.import_utils import load_extern_type
+
+        # Dynamically load the custom dataset class
+        dataset_cls = load_extern_type(data_config.custom_cls.path, data_config.custom_cls.name)
+        # Verify that the custom dataset class inherits from torch.utils.data.Dataset
+        if not issubclass(dataset_cls, Dataset):
+            raise TypeError(f"The custom dataset class '{data_config.custom_cls.name}' from '{data_config.custom_cls.path}' must inherit from torch.utils.data.Dataset")
+    else:
+        # Use the default RLHFDataset class if no custom class is specified
+        dataset_cls = RLHFDataset
+    print(f"Using dataset class: {dataset_cls.__name__}")
+
+    # Instantiate the dataset using the determined dataset class
+    dataset = dataset_cls(
+        data_files=data_paths,
+        tokenizer=tokenizer,
+        processor=processor,
+        config=data_config,
+    )
+
+    return dataset
+
+
+def create_rl_sampler(data_config, dataset):
+    """Create a sampler for the dataset.
+
+    Arguments:
+        data_config: The data config.
+        dataset (Dataset): The dataset.
+
+    Returns:
+        sampler (Sampler): The sampler.
+    """
+    import torch
+    from torch.utils.data import RandomSampler, SequentialSampler
+
+    # Use a sampler to facilitate checkpoint resumption.
+    # If shuffling is enabled in the data configuration, create a random sampler.
+    if data_config.shuffle:
+        train_dataloader_generator = torch.Generator()
+        train_dataloader_generator.manual_seed(data_config.get("seed", 1))
+        sampler = RandomSampler(data_source=dataset, generator=train_dataloader_generator)
+    else:
+        # If shuffling is disabled, use a sequential sampler to iterate through the dataset in order.
+        sampler = SequentialSampler(data_source=dataset)
+
+    return sampler
+
+
+if __name__ == "__main__":
+    main()
diff --git a/trainer_integration/verl/verl_custom/trainer/ppo/__init__.py b/trainer_integration/verl/verl_custom/trainer/ppo/__init__.py
new file mode 100644
index 000000000..7a7aadbc9
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/trainer/ppo/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/trainer/ppo/core_algos.py b/trainer_integration/verl/verl_custom/trainer/ppo/core_algos.py
new file mode 100644
index 000000000..b35352bc2
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/trainer/ppo/core_algos.py
@@ -0,0 +1,841 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Core functions to implement PPO algorithms.
+The function implemented in this file should be used by trainer with different distributed strategies to
+implement PPO-like algorithms.
+"""
+
+__all__ = ['register', "get_adv_estimator_fn", "AdvantageEstimator"]
+
+from collections import defaultdict
+from enum import Enum
+
+import numpy as np
+import torch
+
+import verl.utils.torch_functional as verl_F
+
+ADV_ESTIMATOR_REGISTRY = {}
+
+def register_adv_est(name_or_enum):
+    """Decorator to register a advantage estimator function with a given name.
+
+    Args:
+        name_or_enum: `(str)` or `(AdvantageEstimator)`
+            The name or enum of the advantage estimator.
+
+    """
+    def decorator(fn):
+        name = name_or_enum.value if isinstance(name_or_enum, Enum) else name_or_enum
+        if name in ADV_ESTIMATOR_REGISTRY and ADV_ESTIMATOR_REGISTRY[name] != fn:
+            raise ValueError(f"Adv estimator {name} has already been registered: {ADV_ESTIMATOR_REGISTRY[name]} vs {fn}")
+        ADV_ESTIMATOR_REGISTRY[name] = fn
+        return fn
+    return decorator
+
+def get_adv_estimator_fn(name_or_enum):
+    """Get the advantage estimator function with a given name.
+
+    Args:
+        name_or_enum: `(str)` or `(AdvantageEstimator)`
+            The name or enum of the advantage estimator.
+
+    Returns:
+        `(callable)`: The advantage estimator function.
+    """
+    name = name_or_enum.value if isinstance(name_or_enum, Enum) else name_or_enum
+    if name not in ADV_ESTIMATOR_REGISTRY:
+        raise ValueError(f"Unknown advantage estimator simply: {name}")
+    return ADV_ESTIMATOR_REGISTRY[name]
+
+class AdvantageEstimator(str, Enum):
+    """Using an enumeration class to avoid spelling errors in adv_estimator.
+
+    Note(haibin.lin): this enum class is immutable after creation. Extending this
+    enum for new estimators may not be necessary since users can always just call
+    `verl_custom.trainer.ppo.core_algos.register` with string name for a custom advantage
+    estimator instead.
+    """
+
+    GAE = "gae"
+    GRPO = "grpo"
+    REINFORCE_PLUS_PLUS = "reinforce_plus_plus"
+    REINFORCE_PLUS_PLUS_BASELINE = "reinforce_plus_plus_baseline"
+    REMAX = "remax"
+    RLOO = "rloo"
+    OPO = "opo"
+    GRPO_PASSK = "grpo_passk"
+
+
+class AdaptiveKLController:
+    """
+    Adaptive KL controller described in the paper:
+    https://arxiv.org/pdf/1909.08593.pdf
+    """
+
+    def __init__(self, init_kl_coef, target_kl, horizon):
+        self.value = init_kl_coef
+        self.target = target_kl
+        self.horizon = horizon
+
+    def update(self, current_kl, n_steps):
+        target = self.target
+        proportional_error = np.clip(current_kl / target - 1, -0.2, 0.2)
+        mult = 1 + proportional_error * n_steps / self.horizon
+        self.value *= mult
+
+
+class FixedKLController:
+    """Fixed KL controller."""
+
+    def __init__(self, kl_coef):
+        self.value = kl_coef
+
+    def update(self, current_kl, n_steps):
+        pass
+
+
+def get_kl_controller(kl_ctrl):
+    if kl_ctrl.type == "fixed":
+        return FixedKLController(kl_coef=kl_ctrl.kl_coef)
+    elif kl_ctrl.type == "adaptive":
+        assert kl_ctrl.horizon > 0, f"horizon must be larger than 0. Got {kl_ctrl.horizon}"
+        return AdaptiveKLController(init_kl_coef=kl_ctrl.kl_coef, target_kl=kl_ctrl.target_kl, horizon=kl_ctrl.horizon)
+    else:
+        raise NotImplementedError
+
+@register_adv_est(AdvantageEstimator.GAE) # or simply: @register_adv_est("gae")
+def compute_gae_advantage_return(
+    token_level_rewards: torch.Tensor,
+    values: torch.Tensor,
+    response_mask: torch.Tensor,
+    gamma: torch.Tensor,
+    lam: torch.Tensor,
+):
+    """Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py
+
+    Args:
+        token_level_rewards: `(torch.Tensor)`
+            shape is (bs, response_length)
+        values: `(torch.Tensor)`
+            shape is (bs, response_length)
+        response_mask: `(torch.Tensor)`
+            shape is (bs, response_length). [EOS] mask. The token after [EOS] have mask zero.
+        gamma is `(float)`
+            discounted factor used in RL
+        lam: `(float)`
+            lambda value when computing Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438)
+
+    Returns:
+        advantages: `(torch.Tensor)`
+            shape: (bs, response_length)
+        Returns: `(torch.Tensor)`
+            shape: (bs, response_length)
+
+    """
+    with torch.no_grad():
+        lastgaelam = 0
+        advantages_reversed = []
+        gen_len = token_level_rewards.shape[-1]
+
+        for t in reversed(range(gen_len)):
+            nextvalues = values[:, t + 1] if t < gen_len - 1 else 0.0
+            delta = token_level_rewards[:, t] + gamma * nextvalues - values[:, t]
+            lastgaelam = delta + gamma * lam * lastgaelam
+            advantages_reversed.append(lastgaelam)
+        advantages = torch.stack(advantages_reversed[::-1], dim=1)
+
+        returns = advantages + values
+        advantages = verl_F.masked_whiten(advantages, response_mask)
+    return advantages, returns
+
+
+# NOTE(sgm): this implementation only consider outcome supervision, where the reward is a scalar.
+@register_adv_est(AdvantageEstimator.GRPO) # or simply: @register_adv_est("grpo")
+def compute_grpo_outcome_advantage(
+    token_level_rewards: torch.Tensor,
+    response_mask: torch.Tensor,
+    index: np.ndarray,
+    epsilon: float = 1e-6,
+    norm_adv_by_std_in_grpo: str = True,
+):
+    """
+    Compute advantage for GRPO, operating only on Outcome reward
+    (with only one scalar reward for each response).
+
+    Args:
+        token_level_rewards: `(torch.Tensor)`
+            shape is (bs, response_length)
+        response_mask: `(torch.Tensor)`
+            shape is (bs, response_length)
+        norm_adv_by_std_in_grpo: (bool)
+            whether to scale the GRPO advantage.
+            If True, the advantage is scaled by the std, as in the original GRPO.
+            If False, the advantage is not scaled, as in Dr.GRPO (https://arxiv.org/abs/2503.20783).
+
+    Returns:
+        advantages: `(torch.Tensor)`
+            shape is (bs, response_length)
+        Returns: `(torch.Tensor)`
+            shape is (bs, response_length)
+    """
+    scores = token_level_rewards.sum(dim=-1)
+
+    id2score = defaultdict(list)
+    id2mean = {}
+    id2std = {}
+
+    with torch.no_grad():
+        bsz = scores.shape[0]
+        for i in range(bsz):
+            id2score[index[i]].append(scores[i])
+        for idx in id2score:
+            if len(id2score[idx]) == 1:
+                id2mean[idx] = torch.tensor(0.0)
+                id2std[idx] = torch.tensor(1.0)
+            elif len(id2score[idx]) > 1:
+                id2mean[idx] = torch.mean(torch.tensor(id2score[idx]))
+                id2std[idx] = torch.std(torch.tensor([id2score[idx]]))
+            else:
+                raise ValueError(f"no score in prompt index: {idx}")
+        for i in range(bsz):
+            if norm_adv_by_std_in_grpo:
+                scores[i] = (scores[i] - id2mean[index[i]]) / (id2std[index[i]] + epsilon)
+            else:
+                scores[i] = scores[i] - id2mean[index[i]]
+        scores = scores.unsqueeze(-1) * response_mask
+
+    return scores, scores
+
+@register_adv_est(AdvantageEstimator.GRPO_PASSK) # or simply: @register_adv_est("grpo_passk")
+def compute_grpo_passk_outcome_advantage(
+    token_level_rewards: torch.Tensor,
+    response_mask: torch.Tensor,
+    index: np.ndarray,
+    epsilon: float = 1e-6,
+    norm_adv_by_std_in_grpo: bool = True,
+    config = None,
+    **kwargs,
+):
+    """
+    Compute advantage for Pass@k using a GRPO-style outcome reward formulation.
+    Only the best response per group gets a non-zero advantage: r_max - r_second_max.
+
+    Implemented as described in https://arxiv.org/abs/2503.19595.
+
+    Args:
+        token_level_rewards: (bs, response_length)
+        response_mask: (bs, response_length)
+        index: (bs,) → group ID per sample
+        epsilon: float for numerical stability
+        config: (dict) algorithm settings, which contains "norm_adv_by_std_in_grpo"
+
+    Returns:
+        advantages: (bs, response_length)
+        returns: (bs, response_length)
+    """
+    assert config is not None
+    # if True, normalize advantage by std within group
+    norm_adv_by_std_in_grpo = config.get("norm_adv_by_std_in_grpo", True)
+    scores = token_level_rewards.sum(dim=-1)  # (bs,)
+    advantages = torch.zeros_like(scores)
+
+    id2scores = defaultdict(list)
+    id2indices = defaultdict(list)
+
+    with torch.no_grad():
+        bsz = scores.shape[0]
+        for i in range(bsz):
+            idx = index[i]
+            id2scores[idx].append(scores[i])
+            id2indices[idx].append(i)
+
+        for idx in id2scores:
+            rewards = torch.stack(id2scores[idx])  # (k,)
+            if rewards.numel() < 2:
+                raise ValueError(f"Pass@k requires at least 2 samples per group. Got {rewards.numel()} for group {idx}.")
+            topk, topk_idx = torch.topk(rewards, 2)
+            r_max, r_second_max = topk[0], topk[1]
+            i_max = id2indices[idx][topk_idx[0].item()]
+            advantage = r_max - r_second_max
+            if norm_adv_by_std_in_grpo:
+                std = torch.std(rewards)
+                advantage = advantage / (std + epsilon)
+            advantages[i_max] = advantage
+
+    advantages = advantages.unsqueeze(-1) * response_mask
+    return advantages, advantages
+
+@register_adv_est(AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE) # or simply: @register_adv_est("reinforce_plus_plus_baseline")
+def compute_reinforce_plus_plus_baseline_outcome_advantage(token_level_rewards: torch.Tensor, response_mask: torch.Tensor, index: torch.Tensor,
+                                                           epsilon: float = 1e-6, config=None, **kwargs):
+    """
+    Compute advantage for RF++-baseline (https://arxiv.org/abs/2501.03262), operating only on Outcome reward
+    (with only one scalar reward for each response).
+
+    Args:
+        token_level_rewards: `(torch.Tensor)`
+            shape: (bs, response_length)
+        response_mask: `(torch.Tensor)`
+            shape: (bs, response_length)
+        config: (dict) algorithm config
+
+    Returns:
+        advantages: `(torch.Tensor)`
+            shape: (bs, response_length)
+        Returns: `(torch.Tensor)`
+            shape: (bs, response_length)
+    """
+    response_length = token_level_rewards.shape[-1]
+    scores = token_level_rewards.sum(dim=-1)
+
+    id2score = defaultdict(list)
+    id2mean = {}
+
+    with torch.no_grad():
+        bsz = scores.shape[0]
+        for i in range(bsz):
+            id2score[index[i]].append(scores[i])
+        for idx in id2score:
+            if len(id2score[idx]) == 1:
+                id2mean[idx] = torch.tensor(0.0)
+            elif len(id2score[idx]) > 1:
+                id2mean[idx] = torch.mean(torch.tensor(id2score[idx]))
+            else:
+                raise ValueError(f"no score in prompt index: {idx}")
+        for i in range(bsz):
+            scores[i] = scores[i] - id2mean[index[i]]
+
+        scores = scores.unsqueeze(-1).tile([1, response_length]) * response_mask
+        scores = verl_F.masked_whiten(scores, response_mask) * response_mask
+
+    return scores, scores
+
+@register_adv_est(AdvantageEstimator.RLOO) # or simply: @register_adv_est("rloo")
+def compute_rloo_outcome_advantage(token_level_rewards: torch.Tensor, response_mask: torch.Tensor, index: np.ndarray,
+                                   epsilon: float = 1e-6, config=None, **kwargs):
+    """
+    Compute advantage for RLOO based on https://arxiv.org/abs/2402.14740
+
+    Args:
+        token_level_rewards: `(torch.Tensor)`
+            shape: (bs, response_length)
+        response_mask: `(torch.Tensor)`
+            shape: (bs, response_length)
+        config: (dict) algorithm config
+
+    Returns:
+        advantages: `(torch.Tensor)`
+            shape: (bs, response_length)
+        Returns: `(torch.Tensor)`
+            shape: (bs, response_length)
+    """
+    scores = token_level_rewards.sum(dim=-1)
+
+    id2score = defaultdict(list)
+    id2mean = {}
+
+    with torch.no_grad():
+        bsz = scores.shape[0]
+        for i in range(bsz):
+            id2score[index[i]].append(scores[i])
+        for idx in id2score:
+            if len(id2score[idx]) == 1:
+                id2mean[idx] = torch.tensor(0.0)
+            elif len(id2score[idx]) > 1:
+                id2mean[idx] = torch.mean(torch.tensor(id2score[idx]))
+            else:
+                raise ValueError(f"no score in prompt index: {idx}")
+        for i in range(bsz):
+            response_num = len(id2score[index[i]])
+            if response_num > 1:
+                scores[i] = scores[i] * response_num / (response_num - 1) - id2mean[index[i]] * response_num / (response_num - 1)
+        scores = scores.unsqueeze(-1) * response_mask
+
+    return scores, scores
+
+@register_adv_est(AdvantageEstimator.OPO) # or simply: @register_adv_est("opo")
+def compute_opo_outcome_advantage(token_level_rewards: torch.Tensor, response_mask: torch.Tensor, index: np.ndarray, epsilon: float = 1e-6,
+                                  config=None, **kwargs):
+    """
+    Compute advantage for OPO based on https://arxiv.org/pdf/2505.23585
+
+    Args:
+        token_level_rewards: `(torch.Tensor)`
+            shape: (bs, response_length)
+        response_mask: `(torch.Tensor)`
+            shape: (bs, response_length)
+        config: (dict) algorithm config
+
+    Returns:
+        advantages: `(torch.Tensor)`
+            shape: (bs, response_length)
+        Returns: `(torch.Tensor)`
+            shape: (bs, response_length)
+    """
+    response_length = response_mask.sum(dim=-1)
+    scores = token_level_rewards.sum(dim=-1)
+
+    id2score = defaultdict(list)
+    id2len = defaultdict(list)
+    id2bsl = {}
+
+    with torch.no_grad():
+        bsz = scores.shape[0]
+        for i in range(bsz):
+            id2score[index[i]].append(scores[i])
+            id2len[index[i]].append(response_length[i])
+
+        for idx in id2score:
+            if len(id2score[idx]) == 1:
+                id2bsl[idx] = torch.tensor(0.0)
+            elif len(id2score[idx]) > 1:
+                score_tensor = torch.tensor(id2score[idx])
+                len_tensor = torch.tensor(id2len[idx])
+                id2bsl[idx] = (len_tensor * score_tensor).sum() / len_tensor.sum()
+            else:
+                raise ValueError(f"no score in prompt index: {idx}")
+        for i in range(bsz):
+            scores[i] = scores[i] - id2bsl[index[i]]
+        scores = scores.unsqueeze(-1) * response_mask
+
+    return scores, scores
+
+@register_adv_est(AdvantageEstimator.REINFORCE_PLUS_PLUS) # or simply: @register_adv_est("reinforce_plus_plus")
+def compute_reinforce_plus_plus_outcome_advantage(token_level_rewards: torch.Tensor, response_mask: torch.Tensor, config=None, **kwargs):
+    """
+    Compute advantage for REINFORCE++.
+    This implementation is based on the paper: https://arxiv.org/abs/2501.03262
+
+    Args:
+        token_level_rewards: `(torch.Tensor)`
+            shape: (bs, response_length)
+        response_mask: `(torch.Tensor)`
+            shape: (bs, response_length)
+        config: (dict) algorithm config
+
+    Returns:
+        advantages: `(torch.Tensor)`
+            shape: (bs, response_length)
+        Returns: `(torch.Tensor)`
+            shape: (bs, response_length)
+    """
+    assert config is not None
+    gamma = config.gamma
+    with torch.no_grad():
+        returns = torch.zeros_like(token_level_rewards)
+        running_return = 0
+
+        for t in reversed(range(token_level_rewards.shape[1])):
+            running_return = token_level_rewards[:, t] + gamma * running_return
+            returns[:, t] = running_return
+            # Reset after EOS
+            running_return = running_return * response_mask[:, t]
+
+        advantages = verl_F.masked_whiten(returns, response_mask)
+        advantages = advantages * response_mask
+
+    return advantages, returns
+
+@register_adv_est(AdvantageEstimator.REMAX) # or simply: @register_adv_est("remax")
+def compute_remax_outcome_advantage(token_level_rewards: torch.Tensor, reward_baselines: torch.Tensor, response_mask: torch.Tensor, config=None, **kwargs):
+    """
+    Compute advantage for ReMax, operating only on Outcome reward
+    This implementation is based on the paper: https://arxiv.org/abs/2310.10505
+    (with only one scalar reward for each response).
+
+    Args:
+        token_level_rewards: `(torch.Tensor)`
+            shape: (bs, response_length)
+        reward_baselines: `(torch.Tensor)`
+            shape: (bs,)
+        response_mask: `(torch.Tensor)`
+            shape: (bs, response_length)
+        config: (dict) algorithm config
+
+    Returns:
+        advantages: `(torch.Tensor)`
+            shape: (bs, response_length)
+        Returns: `(torch.Tensor)`
+            shape: (bs, response_length)
+    """
+
+    with torch.no_grad():
+        returns = (token_level_rewards * response_mask).flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
+        advantages = returns - reward_baselines.unsqueeze(-1) * response_mask
+
+    return advantages, returns
+
+
+def compute_rewards(token_level_scores, old_log_prob, ref_log_prob, kl_ratio):
+    kl = old_log_prob - ref_log_prob
+    return token_level_scores - kl * kl_ratio
+
+
+def agg_loss(loss_mat: torch.Tensor, loss_mask: torch.Tensor, loss_agg_mode: str):
+    """
+    Aggregate the loss matrix into a scalar.
+
+    Args:
+        loss_mat: `(torch.Tensor)`:
+            shape: (bs, response_length)
+        loss_mask: `(torch.Tensor)`:
+            shape: (bs, response_length)
+        loss_agg_mode: (str) choices:
+            method to aggregate the loss matrix into a scalar.
+    Returns:
+        loss: `a scalar torch.Tensor`
+            aggregated loss
+    """
+    if loss_agg_mode == "token-mean":
+        loss = verl_F.masked_mean(loss_mat, loss_mask)
+    elif loss_agg_mode == "seq-mean-token-sum":
+        seq_losses = torch.sum(loss_mat * loss_mask, dim=-1)  # token-sum
+        loss = torch.mean(seq_losses)  # seq-mean
+    elif loss_agg_mode == "seq-mean-token-mean":
+        seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) / torch.sum(loss_mask, dim=-1)  # token-mean
+        loss = torch.mean(seq_losses)  # seq-mean
+    elif loss_agg_mode == "seq-mean-token-sum-norm":
+        seq_losses = torch.sum(loss_mat * loss_mask, dim=-1)
+        loss = torch.sum(seq_losses) / loss_mask.shape[-1]  # The divisor
+        # (loss_mask.shape[-1]) should ideally be constant
+        # throughout training to well-replicate the DrGRPO paper.
+        # TODO: Perhaps add user-defined normalizer argument to
+        # agg_loss to ensure divisor stays constant throughout.
+    else:
+        raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}")
+
+    return loss
+
+
+def compute_policy_loss(
+    old_log_prob,
+    log_prob,
+    advantages,
+    response_mask,
+    cliprange=None,
+    cliprange_low=None,
+    cliprange_high=None,
+    clip_ratio_c=3.0,
+    loss_agg_mode: str = "token-mean",
+    config = None,
+    rollout_log_probs=None,
+):
+    """
+    Compute the clipped policy objective and related metrics for PPO.
+
+    Adapted from
+    https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1122
+
+    Args:
+        old_log_prob (torch.Tensor):
+            Log-probabilities of actions under the old policy, shape (batch_size, response_length).
+        log_prob (torch.Tensor):
+            Log-probabilities of actions under the current policy, shape (batch_size, response_length).
+        advantages (torch.Tensor):
+            Advantage estimates for each action, shape (batch_size, response_length).
+        response_mask (torch.Tensor):
+            Mask indicating which tokens to include in the loss, shape (batch_size, response_length).
+        cliprange (float, optional):
+            Clipping parameter ε for standard PPO. See https://arxiv.org/abs/1707.06347.
+            Defaults to None (must be provided).
+        cliprange_low (float, optional):
+            Lower clip range for dual-clip PPO. Defaults to same as `cliprange`.
+        cliprange_high (float, optional):
+            Upper clip range for dual-clip PPO. Defaults to same as `cliprange`.
+        clip_ratio_c (float, optional):
+            Lower bound of the ratio for dual-clip PPO. See https://arxiv.org/pdf/1912.09729.
+            Defaults to 3.0.
+        loss_agg_mode (str, optional):
+            Aggregation mode for `agg_loss`. Defaults to "token-mean".
+    """
+    assert clip_ratio_c > 1.0, "The lower bound of the clip_ratio_c for dual-clip PPO should be greater than 1.0," + f" but get the value: {clip_ratio_c}."
+
+    negative_approx_kl = log_prob - old_log_prob
+    ratio = torch.exp(negative_approx_kl)
+    ppo_kl = verl_F.masked_mean(-negative_approx_kl, response_mask)
+
+    pg_losses1 = -advantages * ratio
+    if cliprange_low is None:
+        cliprange_low = cliprange
+    if cliprange_high is None:
+        cliprange_high = cliprange
+    pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)  # - clip(ratio, 1-cliprange, 1+cliprange) * A
+    clip_pg_losses1 = torch.maximum(pg_losses1, pg_losses2)  # max(-ratio * A, -clip(ratio, 1-cliprange, 1+cliprange) * A)
+    pg_clipfrac = verl_F.masked_mean(torch.gt(pg_losses2, pg_losses1).float(), response_mask)
+
+    pg_losses3 = -advantages * clip_ratio_c
+    clip_pg_losses2 = torch.min(pg_losses3, clip_pg_losses1)
+    pg_clipfrac_lower = verl_F.masked_mean(torch.gt(clip_pg_losses1, pg_losses3) * (advantages < 0).float(), response_mask)
+
+    pg_losses = torch.where(advantages < 0, clip_pg_losses2, clip_pg_losses1)
+
+    if config is not None and config.tis_imp_ratio_cap > 0 and rollout_log_probs is not None:
+        # Apply truncated importance sampling -> https://fengyao.notion.site/off-policy-rl
+        tis_imp_ratio = torch.exp(old_log_prob - rollout_log_probs)
+        tis_imp_ratio = torch.clamp(tis_imp_ratio, max=config.tis_imp_ratio_cap)
+        pg_losses = pg_losses * tis_imp_ratio
+
+    pg_loss = agg_loss(loss_mat=pg_losses, loss_mask=response_mask, loss_agg_mode=loss_agg_mode)
+
+    return pg_loss, pg_clipfrac, ppo_kl, pg_clipfrac_lower
+
+
+def compute_gspo_policy_loss(
+    old_log_prob,
+    log_prob,
+    advantages,
+    response_mask,
+    cliprange=None,
+    cliprange_low=None,
+    cliprange_high=None,
+    clip_ratio_c=3.0,
+    loss_agg_mode: str = "token-mean",
+    rollout_log_probs=None,
+):
+    """
+    Compute the clipped policy objective and related metrics for GSPO.
+    
+    GSPO (Geometric Sequence PPO) differs from standard PPO in how the ratio is computed:
+    - PPO: ratio = exp(log_prob - old_log_prob) for each token
+    - GSPO: ratio = (prod(exp(log_prob - old_log_prob)))^(1/N) where N is the sequence length
+    
+    This implements a geometric mean approach where the ratio is computed as the N-th root
+    of the product of all token-level ratios within each sequence.
+
+    Args:
+        old_log_prob (torch.Tensor):
+            Log-probabilities of actions under the old policy, shape (batch_size, response_length).
+        log_prob (torch.Tensor):
+            Log-probabilities of actions under the current policy, shape (batch_size, response_length).
+        advantages (torch.Tensor):
+            Advantage estimates for each action, shape (batch_size, response_length).
+        response_mask (torch.Tensor):
+            Mask indicating which tokens to include in the loss, shape (batch_size, response_length).
+        cliprange (float, optional):
+            Clipping parameter ε for standard PPO. See https://arxiv.org/abs/1707.06347.
+            Defaults to None (must be provided).
+        cliprange_low (float, optional):
+            Lower clip range for dual-clip PPO. Defaults to same as `cliprange`.
+        cliprange_high (float, optional):
+            Upper clip range for dual-clip PPO. Defaults to same as `cliprange`.
+        clip_ratio_c (float, optional):
+            Lower bound of the ratio for dual-clip PPO. See https://arxiv.org/pdf/1912.09729.
+            Defaults to 3.0.
+        loss_agg_mode (str, optional):
+            Aggregation mode for `agg_loss`. Defaults to "token-mean".
+    """
+    assert clip_ratio_c > 1.0, "The lower bound of the clip_ratio_c for dual-clip PPO should be greater than 1.0," + f" but get the value: {clip_ratio_c}."
+
+    if rollout_log_probs is not None:
+        old_log_prob = rollout_log_probs
+
+    # Compute token-level log ratios
+    negative_approx_kl = log_prob - old_log_prob
+    
+    # Compute sequence-level geometric mean ratio
+    # For each sequence, compute: (prod(exp(log_prob - old_log_prob)))^(1/N)
+    # where N is the number of valid tokens in the sequence
+    
+    # Step 1: Compute the sum of log ratios for each sequence (masked)
+    log_ratio_sum = (negative_approx_kl * response_mask).sum(dim=-1)  # (batch_size,)
+    
+    # Step 2: Get the sequence lengths (number of valid tokens per sequence)
+    sequence_lengths = response_mask.sum(dim=-1).float()  # (batch_size,)
+    
+    # Step 3: Compute geometric mean ratio: exp(sum(log_ratios) / N)
+    # Add small epsilon to avoid division by zero
+    geometric_mean_ratio = torch.exp(log_ratio_sum / (sequence_lengths))  # (batch_size,)
+    
+    # Step 4: Expand ratio to match the original shape for loss computation
+    ratio = geometric_mean_ratio.unsqueeze(-1) * response_mask  # (batch_size, response_length)
+    
+    # Compute KL divergence for monitoring (same as PPO)
+    ppo_kl = verl_F.masked_mean(-negative_approx_kl, response_mask)
+
+    # Compute policy gradient losses (same logic as PPO but with geometric mean ratio)
+    pg_losses1 = -advantages * ratio
+    if cliprange_low is None:
+        cliprange_low = cliprange
+    if cliprange_high is None:
+        cliprange_high = cliprange
+    pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
+    clip_pg_losses1 = torch.maximum(pg_losses1, pg_losses2)
+    pg_clipfrac = verl_F.masked_mean(torch.gt(pg_losses2, pg_losses1).float(), response_mask)
+
+    pg_losses3 = -advantages * clip_ratio_c
+    clip_pg_losses2 = torch.min(pg_losses3, clip_pg_losses1)
+    pg_clipfrac_lower = verl_F.masked_mean(torch.gt(clip_pg_losses1, pg_losses3) * (advantages < 0).float(), response_mask)
+
+    pg_losses = torch.where(advantages < 0, clip_pg_losses2, clip_pg_losses1)
+    pg_loss = agg_loss(loss_mat=pg_losses, loss_mask=response_mask, loss_agg_mode="seq-mean-token-mean")
+
+    return pg_loss, pg_clipfrac, ppo_kl, pg_clipfrac_lower
+
+
+def compute_entropy_loss(logits, response_mask, loss_agg_mode: str = "token-mean"):
+    """Compute categorical entropy loss (For backward compatibility)
+
+    Args:
+        logits (torch.Tensor): shape is (bs, response_length, vocab_size)
+        response_mask (torch.Tensor): shape is (bs, response_length)
+
+    Returns:
+        entropy: a scalar torch.Tensor
+
+    """
+    # compute entropy
+    token_entropy = verl_F.entropy_from_logits(logits)  # (bs, response_len)
+    entropy_loss = agg_loss(loss_mat=token_entropy, loss_mask=response_mask, loss_agg_mode=loss_agg_mode)
+    return entropy_loss
+
+
+def compute_value_loss(vpreds: torch.Tensor, returns: torch.Tensor, values: torch.Tensor, response_mask: torch.Tensor, cliprange_value: float, loss_agg_mode: str = "token-mean"):
+    """
+    Compute the clipped value-function loss for PPO.
+
+    Copied from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1151
+
+    Args:
+        vpreds (torch.FloatTensor):
+            Predicted values from the value head, shape (batch_size, response_length).
+        values (torch.FloatTensor):
+            Old (baseline) values from the value head, shape (batch_size, response_length).
+        returns (torch.FloatTensor):
+            Ground-truth returns, shape (batch_size, response_length).
+        response_mask (torch.Tensor):
+            Mask indicating which tokens to include in the value loss calculation.
+        cliprange_value (float):
+            Clip range for value prediction updates.
+        loss_agg_mode (str, optional):
+            Aggregation mode for `agg_loss`. Defaults to "token-mean".
+
+    Returns:
+        vf_loss (torch.FloatTensor):
+            A scalar tensor containing the aggregated value-function loss.
+        vf_clipfrac (float):
+            Fraction of elements where the clipped loss was used.
+    """
+    vpredclipped = verl_F.clip_by_value(vpreds, values - cliprange_value, values + cliprange_value)
+    vf_losses1 = (vpreds - returns) ** 2
+    vf_losses2 = (vpredclipped - returns) ** 2
+    clipped_vf_losses = torch.max(vf_losses1, vf_losses2)
+    vf_loss = agg_loss(loss_mat=clipped_vf_losses, loss_mask=response_mask, loss_agg_mode=loss_agg_mode)
+    vf_clipfrac = verl_F.masked_mean(torch.gt(vf_losses2, vf_losses1).float(), response_mask)
+    return vf_loss, vf_clipfrac
+
+
+def kl_penalty(logprob: torch.FloatTensor, ref_logprob: torch.FloatTensor, kl_penalty) -> torch.FloatTensor:
+    """Compute KL divergence given logprob and ref_logprob.
+    Copied from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1104
+    See more description in http://joschu.net/blog/kl-approx.html
+
+    Args:
+        logprob:
+        ref_logprob:
+
+    Returns:
+
+    """
+    if kl_penalty in ("kl", "k1"):
+        return logprob - ref_logprob
+
+    if kl_penalty == "abs":
+        return (logprob - ref_logprob).abs()
+
+    if kl_penalty in ("mse", "k2"):
+        kld = 0.5 * (logprob - ref_logprob).square()
+        return torch.clamp(kld, min=-10, max=10)
+
+    # J. Schulman. Approximating kl divergence, 2020.
+    # # URL http://joschu.net/blog/kl-approx.html.
+    if kl_penalty in ("low_var_kl", "k3"):
+        kl = ref_logprob - logprob
+        ratio = torch.exp(kl)
+        kld = (ratio - kl - 1).contiguous()
+        return torch.clamp(kld, min=-10, max=10)
+
+    if kl_penalty == "full":
+        # so, here logprob and ref_logprob should contain the logits for every token in vocabulary
+        raise NotImplementedError
+
+    raise NotImplementedError
+
+
+def compute_pf_ppo_reweight_data(
+    data,
+    reweight_method: str = "pow",
+    weight_pow: float = 2.0,
+):
+    """Reweight the data based on the token_level_scores.
+
+    Args:
+        data: DataProto object, containing batch, non_tensor_batch and meta_info
+        reweight_method: str, choices: "pow", "max_min", "max_random"
+        weight_pow: float, the power of the weight
+
+    Returns:
+
+    """
+
+    @torch.no_grad()
+    def compute_weights(scores: torch.Tensor, reweight_method: str, weight_pow: float) -> torch.Tensor:
+        if reweight_method == "pow":
+            weights = torch.pow(torch.abs(scores), weight_pow)
+        elif reweight_method == "max_min":
+            max_score = torch.max(scores)
+            min_score = torch.min(scores)
+            weights = torch.where((scores == max_score) | (scores == min_score), 1.0, 0.0)
+        elif reweight_method == "max_random":
+            max_score = torch.max(scores)
+            weights = torch.where(scores == max_score, 0.4, 0.1)
+        else:
+            raise ValueError(f"Unsupported reweight_method: {reweight_method}")
+        return weights
+
+    scores = data.batch["token_level_scores"].sum(dim=-1)
+    weights = compute_weights(scores, reweight_method, weight_pow)
+    weights = torch.clamp(weights + 1e-8, min=1e-8)
+
+    batch_size = scores.shape[0]
+    sample_indices = torch.multinomial(weights, batch_size, replacement=True)
+
+    resampled_batch = {key: tensor[sample_indices] for key, tensor in data.batch.items()}
+
+    sample_indices_np = sample_indices.numpy()
+    resampled_non_tensor_batch = {}
+    for key, array in data.non_tensor_batch.items():
+        if isinstance(array, np.ndarray):
+            resampled_non_tensor_batch[key] = array[sample_indices_np]
+        else:
+            resampled_non_tensor_batch[key] = [array[i] for i in sample_indices_np]
+
+    resampled_meta_info = {}
+    for key, value in data.meta_info.items():
+        if isinstance(value, list) and len(value) == batch_size:
+            resampled_meta_info[key] = [value[i] for i in sample_indices_np]
+        else:
+            resampled_meta_info[key] = value
+
+    from copy import deepcopy
+
+    resampled_data = deepcopy(data)
+    resampled_data.batch = type(data.batch)(resampled_batch)
+    resampled_data.batch.batch_size = data.batch.batch_size
+    resampled_data.non_tensor_batch = resampled_non_tensor_batch
+    resampled_data.meta_info = resampled_meta_info
+
+    return resampled_data
diff --git a/trainer_integration/verl/verl_custom/trainer/ppo/metric_utils.py b/trainer_integration/verl/verl_custom/trainer/ppo/metric_utils.py
new file mode 100644
index 000000000..91ef1da91
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/trainer/ppo/metric_utils.py
@@ -0,0 +1,494 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Metrics related to the PPO trainer.
+"""
+
+from collections import defaultdict
+from functools import partial
+from typing import Any, Callable, Dict, List
+
+import numpy as np
+import torch
+
+from verl import DataProto
+from verl.utils.import_utils import deprecated
+
+from scipy.special import comb
+
+@deprecated("verl.utils.metric.reduce_metrics")
+def reduce_metrics(metrics: Dict[str, List[Any]]) -> Dict[str, Any]:
+    """
+    Reduces a dictionary of metric lists by computing the mean of each list.
+
+    Args:
+        metrics: A dictionary mapping metric names to lists of metric values.
+
+    Returns:
+        A dictionary with the same keys but with each list replaced by its mean value.
+
+    Example:
+        >>> metrics = {"loss": [1.0, 2.0, 3.0], "accuracy": [0.8, 0.9, 0.7]}
+        >>> reduce_metrics(metrics)
+        {"loss": 2.0, "accuracy": 0.8}
+    """
+    from verl.utils.metric import reduce_metrics
+
+    return reduce_metrics(metrics)
+
+
+def _compute_response_info(batch: DataProto) -> Dict[str, Any]:
+    """
+    Computes information about prompts and responses from a batch.
+
+    This is an internal helper function that extracts masks and lengths for prompts and responses.
+
+    Args:
+        batch: A DataProto object containing batch data with responses and attention masks.
+
+    Returns:
+        A dictionary containing:
+            - response_mask: Attention mask for the response tokens
+            - prompt_length: Tensor of prompt lengths for each item in the batch
+            - response_length: Tensor of response lengths for each item in the batch
+    """
+    response_length = batch.batch["responses"].shape[-1]
+
+    prompt_mask = batch.batch["attention_mask"][:, :-response_length]
+    response_mask = batch.batch["attention_mask"][:, -response_length:]
+
+    prompt_length = prompt_mask.sum(-1).float()
+    response_length = response_mask.sum(-1).float()  # (batch_size,)
+
+    return dict(
+        response_mask=response_mask,
+        prompt_length=prompt_length,
+        response_length=response_length,
+    )
+
+
+def compute_data_metrics(batch: DataProto, use_critic: bool = True) -> Dict[str, Any]:
+    """
+    Computes various metrics from a batch of data for PPO training.
+
+    This function calculates metrics related to scores, rewards, advantages, returns, values,
+    and sequence lengths from a batch of data. It provides statistical information (mean, max, min)
+    for each metric category.
+
+    Args:
+        batch: A DataProto object containing batch data with token-level scores, rewards, advantages, etc.
+        use_critic: Whether to include critic-specific metrics. Defaults to True.
+
+    Returns:
+        A dictionary of metrics including:
+            - critic/score/mean, max, min: Statistics about sequence scores
+            - critic/rewards/mean, max, min: Statistics about sequence rewards
+            - critic/advantages/mean, max, min: Statistics about advantages
+            - critic/returns/mean, max, min: Statistics about returns
+            - critic/values/mean, max, min: Statistics about critic values (if use_critic=True)
+            - critic/vf_explained_var: Explained variance of the value function (if use_critic=True)
+            - response_length/mean, max, min, clip_ratio: Statistics about response lengths
+            - prompt_length/mean, max, min, clip_ratio: Statistics about prompt lengths
+    """
+    sequence_score = batch.batch["token_level_scores"].sum(-1)
+    sequence_reward = batch.batch["token_level_rewards"].sum(-1)
+
+    advantages = batch.batch["advantages"]
+    returns = batch.batch["returns"]
+
+    max_response_length = batch.batch["responses"].shape[-1]
+
+    prompt_mask = batch.batch["attention_mask"][:, :-max_response_length].bool()
+    response_mask = batch.batch["attention_mask"][:, -max_response_length:].bool()
+
+    max_prompt_length = prompt_mask.size(-1)
+
+    response_info = _compute_response_info(batch)
+    prompt_length = response_info["prompt_length"]
+    response_length = response_info["response_length"]
+
+    valid_adv = torch.masked_select(advantages, response_mask)
+    valid_returns = torch.masked_select(returns, response_mask)
+
+    if use_critic:
+        values = batch.batch["values"]
+        valid_values = torch.masked_select(values, response_mask)
+        return_diff_var = torch.var(valid_returns - valid_values)
+        return_var = torch.var(valid_returns)
+
+    metrics = {
+        # score
+        "critic/score/mean": torch.mean(sequence_score).detach().item(),
+        "critic/score/max": torch.max(sequence_score).detach().item(),
+        "critic/score/min": torch.min(sequence_score).detach().item(),
+        # reward
+        "critic/rewards/mean": torch.mean(sequence_reward).detach().item(),
+        "critic/rewards/max": torch.max(sequence_reward).detach().item(),
+        "critic/rewards/min": torch.min(sequence_reward).detach().item(),
+        # adv
+        "critic/advantages/mean": torch.mean(valid_adv).detach().item(),
+        "critic/advantages/max": torch.max(valid_adv).detach().item(),
+        "critic/advantages/min": torch.min(valid_adv).detach().item(),
+        # returns
+        "critic/returns/mean": torch.mean(valid_returns).detach().item(),
+        "critic/returns/max": torch.max(valid_returns).detach().item(),
+        "critic/returns/min": torch.min(valid_returns).detach().item(),
+        **(
+            {
+                # values
+                "critic/values/mean": torch.mean(valid_values).detach().item(),
+                "critic/values/max": torch.max(valid_values).detach().item(),
+                "critic/values/min": torch.min(valid_values).detach().item(),
+                # vf explained var
+                "critic/vf_explained_var": (1.0 - return_diff_var / (return_var + 1e-5)).detach().item(),
+            }
+            if use_critic
+            else {}
+        ),
+        # response length
+        "response_length/mean": torch.mean(response_length).detach().item(),
+        "response_length/max": torch.max(response_length).detach().item(),
+        "response_length/min": torch.min(response_length).detach().item(),
+        "response_length/clip_ratio": torch.mean(torch.eq(response_length, max_response_length).float()).detach().item(),
+        # prompt length
+        "prompt_length/mean": torch.mean(prompt_length).detach().item(),
+        "prompt_length/max": torch.max(prompt_length).detach().item(),
+        "prompt_length/min": torch.min(prompt_length).detach().item(),
+        "prompt_length/clip_ratio": torch.mean(torch.eq(prompt_length, max_prompt_length).float()).detach().item(),
+    }
+    return metrics
+
+
+def compute_timing_metrics(batch: DataProto, timing_raw: Dict[str, float]) -> Dict[str, Any]:
+    """
+    Computes timing metrics for different processing stages in PPO training.
+
+    This function calculates both raw timing metrics (in seconds) and per-token timing metrics
+    (in milliseconds) for various processing stages like generation, reference computation,
+    value computation, advantage computation, and model updates.
+
+    Args:
+        batch: A DataProto object containing batch data with responses and attention masks.
+        timing_raw: A dictionary mapping stage names to their execution times in seconds.
+
+    Returns:
+        A dictionary containing:
+            - timing_s/{name}: Raw timing in seconds for each stage
+            - timing_per_token_ms/{name}: Per-token timing in milliseconds for each stage
+
+    Note:
+        Different stages use different token counts for normalization:
+        - "gen" uses only response tokens
+        - Other stages ("ref", "values", "adv", "update_critic", "update_actor") use all tokens
+          (prompt + response)
+    """
+    response_info = _compute_response_info(batch)
+    num_prompt_tokens = torch.sum(response_info["prompt_length"]).item()
+    num_response_tokens = torch.sum(response_info["response_length"]).item()
+    num_overall_tokens = num_prompt_tokens + num_response_tokens
+
+    num_tokens_of_section = {
+        "gen": num_response_tokens,
+        **{name: num_overall_tokens for name in ["ref", "values", "adv", "update_critic", "update_actor"]},
+    }
+
+    return {
+        **{f"timing_s/{name}": value for name, value in timing_raw.items()},
+        **{f"timing_per_token_ms/{name}": timing_raw[name] * 1000 / num_tokens_of_section[name] for name in set(num_tokens_of_section.keys()) & set(timing_raw.keys())},
+    }
+
+
+def compute_throughout_metrics(batch: DataProto, timing_raw: Dict[str, float], n_gpus: int) -> Dict[str, Any]:
+    """
+    Computes throughput metrics for PPO training.
+
+    This function calculates performance metrics related to token processing speed,
+    including the total number of tokens processed, time per step, and throughput
+    (tokens per second per GPU).
+
+    Args:
+        batch: A DataProto object containing batch data with meta information about token counts.
+        timing_raw: A dictionary mapping stage names to their execution times in seconds.
+                   Must contain a "step" key with the total step time.
+        n_gpus: Number of GPUs used for training.
+
+    Returns:
+        A dictionary containing:
+            - perf/total_num_tokens: Total number of tokens processed in the batch
+            - perf/time_per_step: Time taken for the step in seconds
+            - perf/throughput: Tokens processed per second per GPU
+
+    Note:
+        The throughput is calculated as total_tokens / (time * n_gpus) to normalize
+        across different GPU counts.
+    """
+    total_num_tokens = sum(batch.meta_info["global_token_num"])
+    time = timing_raw["step"]
+    # estimated_flops, promised_flops = flops_function.estimate_flops(num_tokens, time)
+    # f'Actual TFLOPs/s/GPU​': estimated_flops/(n_gpus),
+    # f'Theoretical TFLOPs/s/GPU​': promised_flops,
+    return {
+        "perf/total_num_tokens": total_num_tokens,
+        "perf/time_per_step": time,
+        "perf/throughput": total_num_tokens / (time * n_gpus),
+    }
+
+
+def bootstrap_metric(
+    data: list[Any],
+    subset_size: int,
+    reduce_fns: list[Callable[[np.ndarray], float]],
+    n_bootstrap: int = 1000,
+    seed: int = 42,
+) -> list[tuple[float, float]]:
+    """
+    Performs bootstrap resampling to estimate statistics of metrics.
+
+    This function uses bootstrap resampling to estimate the mean and standard deviation
+    of metrics computed by the provided reduction functions on random subsets of the data.
+
+    Args:
+        data: List of data points to bootstrap from.
+        subset_size: Size of each bootstrap sample.
+        reduce_fns: List of functions that compute a metric from a subset of data.
+        n_bootstrap: Number of bootstrap iterations. Defaults to 1000.
+        seed: Random seed for reproducibility. Defaults to 42.
+
+    Returns:
+        A list of tuples, where each tuple contains (mean, std) for a metric
+        corresponding to each reduction function in reduce_fns.
+
+    Example:
+        >>> data = [1, 2, 3, 4, 5]
+        >>> reduce_fns = [np.mean, np.max]
+        >>> bootstrap_metric(data, 3, reduce_fns)
+        [(3.0, 0.5), (4.5, 0.3)]  # Example values
+    """
+    np.random.seed(seed)
+
+    bootstrap_metric_lsts = [[] for _ in range(len(reduce_fns))]
+    for _ in range(n_bootstrap):
+        bootstrap_idxs = np.random.choice(len(data), size=subset_size, replace=True)
+        bootstrap_data = [data[i] for i in bootstrap_idxs]
+        for i, reduce_fn in enumerate(reduce_fns):
+            bootstrap_metric_lsts[i].append(reduce_fn(bootstrap_data))
+    return [(np.mean(lst), np.std(lst)) for lst in bootstrap_metric_lsts]
+
+
+def calc_maj_val(data: list[dict[str, Any]], vote_key: str, val_key: str) -> float:
+    """
+    Calculate a value based on majority voting.
+
+    This function identifies the most common value for a specified vote key
+    in the data, then returns the corresponding value for that majority vote.
+
+    Args:
+        data: List of dictionaries, where each dictionary contains both vote_key and val_key.
+        vote_key: The key in each dictionary used for voting/counting.
+        val_key: The key in each dictionary whose value will be returned for the majority vote.
+
+    Returns:
+        The value associated with the most common vote.
+
+    Example:
+        >>> data = [
+        ...     {"pred": "A", "val": 0.9},
+        ...     {"pred": "B", "val": 0.8},
+        ...     {"pred": "A", "val": 0.7}
+        ... ]
+        >>> calc_maj_val(data, vote_key="pred", val_key="val")
+        0.9  # Returns the first "val" for the majority vote "A"
+    """
+    vote2vals = defaultdict(list)
+    for d in data:
+        vote2vals[d[vote_key]].append(d[val_key])
+
+    vote2cnt = {k: len(v) for k, v in vote2vals.items()}
+    maj_vote = max(vote2cnt, key=vote2cnt.get)
+
+    maj_val = vote2vals[maj_vote][0]
+
+    return maj_val
+
+
+def process_validation_metrics(data_sources: list[str], sample_inputs: list[str], infos_dict: dict[str, list[Any]], seed: int = 42) -> dict[str, dict[str, dict[str, float]]]:
+    """
+    Process validation metrics into a structured format with statistical analysis.
+
+    This function organizes validation metrics by data source and prompt, then computes
+    various statistical measures including means, standard deviations, best/worst values,
+    and majority voting results. It also performs bootstrap sampling to estimate statistics
+    for different sample sizes.
+
+    Args:
+        data_sources: List of data source identifiers for each sample.
+        sample_inputs: List of input prompts corresponding to each sample.
+        infos_dict: Dictionary mapping variable names to lists of values for each sample.
+        seed: Random seed for bootstrap sampling. Defaults to 42.
+
+    Returns:
+        A nested dictionary with the structure:
+        {
+            data_source: {
+                variable_name: {
+                    metric_name: value
+                }
+            }
+        }
+
+        Where metric_name includes:
+        - "mean@N": Mean value across N samples
+        - "std@N": Standard deviation across N samples
+        - "best@N/mean": Mean of the best values in bootstrap samples of size N
+        - "best@N/std": Standard deviation of the best values in bootstrap samples
+        - "worst@N/mean": Mean of the worst values in bootstrap samples
+        - "worst@N/std": Standard deviation of the worst values in bootstrap samples
+        - "maj@N/mean": Mean of majority voting results in bootstrap samples (if "pred" exists)
+        - "maj@N/std": Standard deviation of majority voting results (if "pred" exists)
+
+    Example:
+        >>> data_sources = ["source1", "source1", "source2"]
+        >>> sample_inputs = ["prompt1", "prompt1", "prompt2"]
+        >>> infos_dict = {"score": [0.8, 0.9, 0.7], "pred": ["A", "A", "B"]}
+        >>> result = process_validation_metrics(data_sources, sample_inputs, infos_dict)
+        >>> # result will contain statistics for each data source and variable
+    """
+    # Group metrics by data source, prompt and variable
+    data_src2prompt2var2vals = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+    for sample_idx, data_source in enumerate(data_sources):
+        prompt = sample_inputs[sample_idx]
+        var2vals = data_src2prompt2var2vals[data_source][prompt]
+        for var_name, var_vals in infos_dict.items():
+            var2vals[var_name].append(var_vals[sample_idx])
+
+    # Calculate metrics for each group
+    data_src2prompt2var2metric = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
+    for data_source, prompt2var2vals in data_src2prompt2var2vals.items():
+        for prompt, var2vals in prompt2var2vals.items():
+            for var_name, var_vals in var2vals.items():
+                if isinstance(var_vals[0], str):
+                    continue
+
+                metric = {}
+                n_resps = len(var_vals)
+                metric[f"mean@{n_resps}"] = np.mean(var_vals)
+
+                if n_resps > 1:
+                    metric[f"std@{n_resps}"] = np.std(var_vals)
+
+                    ns = []
+                    n = 2
+                    while n < n_resps:
+                        ns.append(n)
+                        n *= 2
+                    ns.append(n_resps)
+
+                    for n in ns:
+                        [(bon_mean, bon_std), (won_mean, won_std)] = bootstrap_metric(data=var_vals, subset_size=n, reduce_fns=[np.max, np.min], seed=seed)
+                        metric[f"best@{n}/mean"], metric[f"best@{n}/std"] = bon_mean, bon_std
+                        metric[f"worst@{n}/mean"], metric[f"worst@{n}/std"] = won_mean, won_std
+                        if var2vals.get("pred", None) is not None:
+                            vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
+                            [(maj_n_mean, maj_n_std)] = bootstrap_metric(
+                                data=vote_data,
+                                subset_size=n,
+                                reduce_fns=[partial(calc_maj_val, vote_key="pred", val_key="val")],
+                                seed=seed,
+                            )
+                            metric[f"maj@{n}/mean"], metric[f"maj@{n}/std"] = maj_n_mean, maj_n_std
+
+                data_src2prompt2var2metric[data_source][prompt][var_name] = metric
+
+    # Aggregate metrics across prompts
+    data_src2var2metric2prompt_vals = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+    for data_source, prompt2var2metric in data_src2prompt2var2metric.items():
+        for prompt, var2metric in prompt2var2metric.items():
+            for var_name, metric in var2metric.items():
+                for metric_name, metric_val in metric.items():
+                    data_src2var2metric2prompt_vals[data_source][var_name][metric_name].append(metric_val)
+
+    data_src2var2metric2val = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
+    for data_source, var2metric2prompt_vals in data_src2var2metric2prompt_vals.items():
+        for var_name, metric2prompt_vals in var2metric2prompt_vals.items():
+            for metric_name, prompt_vals in metric2prompt_vals.items():
+                data_src2var2metric2val[data_source][var_name][metric_name] = np.mean(prompt_vals)
+
+    return data_src2var2metric2val
+
+
+
+def calculate_pass_at_k(n: int, c: int, k: int) -> float:
+    """
+    Calculate the unbiased estimator for pass@k
+    
+    Based on paper "Evaluating Large Language Models Trained on Code" (arXiv:2107.03374)
+    Formula: pass@k = E[1 - C(n-c, k) / C(n, k)]
+    
+    Args:
+        n: Total number of samples generated per problem
+        c: Number of samples that pass the test
+        k: Number of samples to draw
+    
+    Returns:
+        pass@k value (between 0.0 and 1.0)
+    """
+    if n - c < k:
+        return 1.0
+    return 1.0 - float(comb(n - c, k)) / float(comb(n, k))
+
+def compute_pass_k_metrics(problem_results: dict, k_values: list = None) -> dict:
+    """
+    Compute pass@k metrics
+    
+    Args:
+        problem_results: Dictionary in format {problem_id: {'total': int, 'correct': int}}
+        k_values: List of k values to compute, defaults to [1, 5, 10, 25, 50, 100]
+    
+    Returns:
+        Dictionary containing pass@k results
+    """
+    if k_values is None:
+        k_values = [1, 5, 10, 25, 50, 100]
+    
+    results = {}
+    
+    for k in k_values:
+        pass_at_k_scores = []
+        valid_problems = 0
+        
+        for problem_id, stats in problem_results.items():
+            n = stats['total']
+            c = stats['correct']
+            
+            if n >= k:  # Only compute when generated samples >= k
+                pass_k = calculate_pass_at_k(n, c, k)
+                pass_at_k_scores.append(pass_k)
+                valid_problems += 1
+        
+        if pass_at_k_scores:
+            avg_pass_at_k = np.mean(pass_at_k_scores)
+            std_pass_at_k = np.std(pass_at_k_scores)
+            results[f'pass@{k}'] = {
+                'score': float(avg_pass_at_k),
+                'std': float(std_pass_at_k),
+                'num_problems': valid_problems,
+            }
+        else:
+            results[f'pass@{k}'] = {
+                'score': 0.0,
+                'std': 0.0, 
+                'num_problems': 0,
+            }
+    
+    return results
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/trainer/ppo/ray_trainer.py b/trainer_integration/verl/verl_custom/trainer/ppo/ray_trainer.py
new file mode 100644
index 000000000..7181fc05a
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/trainer/ppo/ray_trainer.py
@@ -0,0 +1,1288 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023-2024 SGLang Team
+# Copyright 2025 ModelBest Inc. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+FSDP PPO Trainer with Ray-based single controller.
+This trainer supports model-agonistic model initialization with huggingface
+"""
+
+import json
+import os
+import uuid
+from collections import defaultdict
+from copy import deepcopy
+from dataclasses import dataclass, field
+from enum import Enum
+from pprint import pprint
+from typing import Optional, Type
+
+import numpy as np
+import ray
+import torch
+from omegaconf import OmegaConf, open_dict
+from torch.utils.data import Dataset, Sampler
+from torchdata.stateful_dataloader import StatefulDataLoader
+from tqdm import tqdm
+
+from verl import DataProto
+from verl_custom.nvidia.reward_manager.length_penalty import LengthPenalty
+from verl_custom.nvidia.utils.timer import TimeoutChecker
+from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
+from verl.single_controller.base import Worker
+from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
+from verl.single_controller.ray.base import create_colocated_worker_cls
+from verl_custom.trainer.ppo import core_algos
+from verl_custom.trainer.ppo.core_algos import AdvantageEstimator, agg_loss
+from verl_custom.trainer.ppo.metric_utils import (
+    compute_data_metrics,
+    compute_throughout_metrics,
+    compute_timing_metrics,
+    process_validation_metrics,
+    compute_pass_k_metrics,
+)
+from verl_custom.trainer.ppo.reward import compute_reward, compute_reward_async
+from verl.utils.checkpoint.checkpoint_manager import BaseCheckpointManager, find_latest_ckpt_path
+from verl.utils.debug.performance import _timer
+from verl.utils.metric import (
+    reduce_metrics,
+)
+from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance
+from verl.utils.torch_functional import masked_mean
+from verl.utils.tracking import ValidationGenerationsLogger
+
+WorkerType = Type[Worker]
+
+
+class Role(Enum):
+    """
+    To create more roles dynamically, you can subclass Role and add new members
+    """
+
+    Actor = 0
+    Rollout = 1
+    ActorRollout = 2
+    Critic = 3
+    RefPolicy = 4
+    RewardModel = 5
+    ActorRolloutRef = 6
+
+
+@dataclass
+class ResourcePoolManager:
+    """
+    Define a resource pool specification. Resource pool will be initialized first.
+    """
+
+    resource_pool_spec: dict[str, list[int]]
+    mapping: dict[Role, str]
+    resource_pool_dict: dict[str, RayResourcePool] = field(default_factory=dict)
+
+    def create_resource_pool(self):
+        for resource_pool_name, process_on_nodes in self.resource_pool_spec.items():
+            # max_colocate_count means the number of WorkerGroups (i.e. processes) in each RayResourcePool
+            # For FSDP backend, we recommend using max_colocate_count=1 that merge all WorkerGroups into one.
+            # For Megatron backend, we recommend using max_colocate_count>1
+            # that can utilize different WorkerGroup for differnt models
+            resource_pool = RayResourcePool(process_on_nodes=process_on_nodes, use_gpu=True, max_colocate_count=1, name_prefix=resource_pool_name)
+            self.resource_pool_dict[resource_pool_name] = resource_pool
+
+        self._check_resource_available()
+
+    def get_resource_pool(self, role: Role) -> RayResourcePool:
+        """Get the resource pool of the worker_cls"""
+        return self.resource_pool_dict[self.mapping[role]]
+
+    def get_n_gpus(self) -> int:
+        """Get the number of gpus in this cluster."""
+        return sum([n_gpus for process_on_nodes in self.resource_pool_spec.values() for n_gpus in process_on_nodes])
+
+    def _check_resource_available(self):
+        """Check if the resource pool can be satisfied in this ray cluster."""
+        node_available_resources = ray.state.available_resources_per_node()
+        node_available_gpus = {node: node_info.get("GPU", 0) if "GPU" in node_info else node_info.get("NPU", 0) for node, node_info in node_available_resources.items()}
+
+        # check total required gpus can be satisfied
+        total_available_gpus = sum(node_available_gpus.values())
+        total_required_gpus = sum([n_gpus for process_on_nodes in self.resource_pool_spec.values() for n_gpus in process_on_nodes])
+        if total_available_gpus < total_required_gpus:
+            raise ValueError(f"Total available GPUs {total_available_gpus} is less than total desired GPUs {total_required_gpus}")
+
+        # check each resource pool can be satisfied, O(#resource_pools * #nodes)
+        for resource_pool_name, process_on_nodes in self.resource_pool_spec.items():
+            num_gpus, num_nodes = process_on_nodes[0], len(process_on_nodes)
+            for node, available_gpus in node_available_gpus.items():
+                if available_gpus >= num_gpus:
+                    node_available_gpus[node] -= num_gpus
+                    num_nodes -= 1
+                    if num_nodes == 0:
+                        break
+            if num_nodes > 0:
+                raise ValueError(f"Resource pool {resource_pool_name}: {num_gpus}*{num_nodes}" + "cannot be satisfied in this ray cluster")
+
+
+def apply_kl_penalty(data: DataProto, kl_ctrl: core_algos.AdaptiveKLController, kl_penalty="kl", multi_turn=False):
+    """Apply KL penalty to the token-level rewards.
+
+    This function computes the KL divergence between the reference policy and current policy,
+    then applies a penalty to the token-level rewards based on this divergence.
+
+    Args:
+        data (DataProto): The data containing batched model outputs and inputs.
+        kl_ctrl (core_algos.AdaptiveKLController): Controller for adaptive KL penalty.
+        kl_penalty (str, optional): Type of KL penalty to apply. Defaults to "kl".
+        multi_turn (bool, optional): Whether the data is from a multi-turn conversation. Defaults to False.
+
+    Returns:
+        tuple: A tuple containing:
+            - The updated data with token-level rewards adjusted by KL penalty
+            - A dictionary of metrics related to the KL penalty
+    """
+    responses = data.batch["responses"]
+    response_length = responses.size(1)
+    token_level_scores = data.batch["token_level_rewards"]
+    batch_size = data.batch.batch_size[0]
+
+    if multi_turn:
+        loss_mask = data.batch["loss_mask"]
+        response_mask = loss_mask[:, -response_length:]
+    else:
+        attention_mask = data.batch["attention_mask"]
+        response_mask = attention_mask[:, -response_length:]
+
+    # compute kl between ref_policy and current policy
+    # When apply_kl_penalty, algorithm.use_kl_in_reward=True, so the reference model has been enabled.
+    kld = core_algos.kl_penalty(data.batch["old_log_probs"], data.batch["ref_log_prob"], kl_penalty=kl_penalty)  # (batch_size, response_length)
+    kld = kld * response_mask
+    beta = kl_ctrl.value
+
+    token_level_rewards = token_level_scores - beta * kld
+
+    current_kl = masked_mean(kld, mask=response_mask, axis=-1)  # average over sequence
+    current_kl = torch.mean(current_kl, dim=0).item()
+
+    # according to https://github.com/huggingface/trl/blob/951ca1841f29114b969b57b26c7d3e80a39f75a0/trl/trainer/ppo_trainer.py#L837
+    kl_ctrl.update(current_kl=current_kl, n_steps=batch_size)
+    data.batch["token_level_rewards"] = token_level_rewards
+
+    metrics = {"actor/reward_kl_penalty": current_kl, "actor/reward_kl_penalty_coeff": beta}
+
+    return data, metrics
+
+
+def compute_response_mask(data: DataProto):
+    """Compute the attention mask for the response part of the sequence.
+
+    This function extracts the portion of the attention mask that corresponds to the model's response,
+    which is used for masking computations that should only apply to response tokens.
+
+    Args:
+        data (DataProto): The data containing batched model outputs and inputs.
+
+    Returns:
+        torch.Tensor: The attention mask for the response tokens.
+    """
+    responses = data.batch["responses"]
+    response_length = responses.size(1)
+    attention_mask = data.batch["attention_mask"]
+    return attention_mask[:, -response_length:]
+
+
+def compute_advantage(data: DataProto, adv_estimator, gamma=1.0, lam=1.0, num_repeat=1, multi_turn=False, norm_adv_by_std_in_grpo=True, config=None):
+    """Compute advantage estimates for policy optimization.
+
+    This function computes advantage estimates using various estimators like GAE, GRPO, REINFORCE++, etc.
+    The advantage estimates are used to guide policy optimization in RL algorithms.
+
+    Args:
+        data (DataProto): The data containing batched model outputs and inputs.
+        adv_estimator: The advantage estimator to use (e.g., GAE, GRPO, REINFORCE++).
+        gamma (float, optional): Discount factor for future rewards. Defaults to 1.0.
+        lam (float, optional): Lambda parameter for GAE. Defaults to 1.0.
+        num_repeat (int, optional): Number of times to repeat the computation. Defaults to 1.
+        multi_turn (bool, optional): Whether the data is from a multi-turn conversation. Defaults to False.
+        norm_adv_by_std_in_grpo (bool, optional): Whether to normalize advantages by standard deviation in GRPO. Defaults to True.
+        config (dict, optional): Configuration dictionary for algorithm settings. Defaults to None.
+
+    Returns:
+        DataProto: The updated data with computed advantages and returns.
+    """
+    # Back-compatible with trainers that do not compute response mask in fit
+    if "response_mask" not in data.batch.keys():
+        data.batch["response_mask"] = compute_response_mask(data)
+    # prepare response group
+    if adv_estimator == AdvantageEstimator.GAE:
+        # Compute advantages and returns using Generalized Advantage Estimation (GAE)
+        advantages, returns = core_algos.compute_gae_advantage_return(
+            token_level_rewards=data.batch["token_level_rewards"],
+            values=data.batch["values"],
+            response_mask=data.batch["response_mask"],
+            gamma=gamma,
+            lam=lam,
+        )
+        data.batch["advantages"] = advantages
+        data.batch["returns"] = returns
+    elif adv_estimator == AdvantageEstimator.GRPO:
+        # Initialize the mask for GRPO calculation
+        grpo_calculation_mask = data.batch["response_mask"]
+        if multi_turn:
+            # If multi-turn, replace the mask with the relevant part of loss_mask
+            # Get length from the initial response mask
+            response_length = grpo_calculation_mask.size(1)
+            # This mask is the one intended for GRPO
+            grpo_calculation_mask = data.batch["loss_mask"][:, -response_length:]
+        # Call compute_grpo_outcome_advantage with parameters matching its definition
+        advantages, returns = core_algos.compute_grpo_outcome_advantage(
+            token_level_rewards=data.batch["token_level_rewards"],
+            response_mask=grpo_calculation_mask,
+            index=data.non_tensor_batch["uid"],
+            norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
+        )
+        data.batch["advantages"] = advantages
+        data.batch["returns"] = returns
+    else:
+        # handle all other adv estimator type other than GAE and GRPO
+        adv_estimator_fn = core_algos.get_adv_estimator_fn(adv_estimator)
+        adv_kwargs = {"token_level_rewards": data.batch["token_level_rewards"],
+                      "response_mask": data.batch["response_mask"],
+                      "config": config,
+        }
+        if "uid" in data.non_tensor_batch: # optional
+            adv_kwargs['index'] = data.non_tensor_batch["uid"]
+        if "reward_baselines" in data.batch:# optional
+            adv_kwargs['reward_baselines'] = data.batch["reward_baselines"]
+
+        # calculate advantage estimator
+        advantages, returns = adv_estimator_fn(**adv_kwargs)
+        data.batch["advantages"] = advantages
+        data.batch["returns"] = returns
+    return data
+
+
+class RayPPOTrainer:
+    """
+    Note that this trainer runs on the driver process on a single CPU/GPU node.
+    """
+
+    # TODO: support each role have individual ray_worker_group_cls,
+    # i.e., support different backend of different role
+    def __init__(
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        train_dataset: Optional[Dataset] = None,
+        val_dataset: Optional[Dataset] = None,
+        collate_fn=None,
+        train_sampler: Optional[Sampler] = None,
+        device_name="cuda",
+    ):
+        # assert torch.cuda.is_available(), 'cuda must be available on driver'
+
+        self.timeout = TimeoutChecker()
+
+        self.tokenizer = tokenizer
+        self.processor = processor
+        self.config = config
+        self.reward_fn = reward_fn
+        self.val_reward_fn = val_reward_fn
+
+        self.length_penalty = LengthPenalty(config.get('length_penalty', None))
+        ray.get(self.reward_fn.set_length_penalty.remote(self.length_penalty))
+        ray.get(self.reward_fn.set_stop_properly_penalty.remote(self.config.stop_properly_penalty.penalty_coef))
+
+        self.hybrid_engine = config.actor_rollout_ref.hybrid_engine
+        assert self.hybrid_engine, "Currently, only support hybrid engine"
+
+        if self.hybrid_engine:
+            assert Role.ActorRollout in role_worker_mapping, f"{role_worker_mapping.keys()=}"
+
+        self.role_worker_mapping = role_worker_mapping
+        self.resource_pool_manager = resource_pool_manager
+        self.use_reference_policy = Role.RefPolicy in role_worker_mapping
+        self.use_rm = Role.RewardModel in role_worker_mapping
+        self.ray_worker_group_cls = ray_worker_group_cls
+        self.device_name = device_name
+        self.validation_generations_logger = ValidationGenerationsLogger()
+
+        # if ref_in_actor is True, the reference policy will be actor without lora applied
+        self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0
+
+        # define in-reward KL control
+        # kl loss control currently not suppoorted
+        if config.algorithm.use_kl_in_reward:
+            self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl)
+
+        if self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
+            self.use_critic = True
+        elif self.config.algorithm.adv_estimator in [
+            AdvantageEstimator.GRPO,
+            AdvantageEstimator.GRPO_PASSK,
+            AdvantageEstimator.REINFORCE_PLUS_PLUS,
+            AdvantageEstimator.REMAX,
+            AdvantageEstimator.RLOO,
+            AdvantageEstimator.OPO,
+            AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE,
+        ]:
+            self.use_critic = False
+        else:
+            raise NotImplementedError
+
+        self._validate_config()
+        self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
+
+    def _validate_config(self):
+        config = self.config
+        # number of GPUs total
+        n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes
+        if config.actor_rollout_ref.actor.strategy == "megatron":
+            model_parallel_size = config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size * config.actor_rollout_ref.actor.megatron.pipeline_model_parallel_size
+            assert n_gpus % (model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size) == 0, f"n_gpus ({n_gpus}) must be divisible by model_parallel_size ({model_parallel_size}) times context_parallel_size ({config.actor_rollout_ref.actor.megatron.context_parallel_size})"
+            megatron_dp = n_gpus // (model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size)
+            minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu
+        else:
+            minimal_bsz = n_gpus
+
+        # 1. Check total batch size for data correctness
+        real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n
+        assert real_train_batch_size % minimal_bsz == 0, f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size ({minimal_bsz})"
+
+        # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu"
+        # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu".
+        def check_mutually_exclusive(mbs, mbs_per_gpu, name: str):
+            settings = {
+                "actor_rollout_ref.actor": "micro_batch_size",
+                "critic": "micro_batch_size",
+                "reward_model": "micro_batch_size",
+                "actor_rollout_ref.ref": "log_prob_micro_batch_size",
+                "actor_rollout_ref.rollout": "log_prob_micro_batch_size",
+            }
+
+            if name in settings:
+                param = settings[name]
+                param_per_gpu = f"{param}_per_gpu"
+
+                if mbs is None and mbs_per_gpu is None:
+                    raise ValueError(f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'.")
+
+                if mbs is not None and mbs_per_gpu is not None:
+                    raise ValueError(f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. Please remove '{name}.{param}' because only '*_{param_per_gpu}'" + "is supported (the former is deprecated).")
+
+        if not config.actor_rollout_ref.actor.use_dynamic_bsz:
+            # actor: ppo_micro_batch_size vs. ppo_micro_batch_size_per_gpu
+            check_mutually_exclusive(
+                config.actor_rollout_ref.actor.ppo_micro_batch_size,
+                config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu,
+                "actor_rollout_ref.actor",
+            )
+
+            if self.use_reference_policy:
+                # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu
+                check_mutually_exclusive(
+                    config.actor_rollout_ref.ref.log_prob_micro_batch_size,
+                    config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu,
+                    "actor_rollout_ref.ref",
+                )
+
+            #  The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu
+            check_mutually_exclusive(
+                config.actor_rollout_ref.rollout.log_prob_micro_batch_size,
+                config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu,
+                "actor_rollout_ref.rollout",
+            )
+
+        if self.use_critic and not config.critic.use_dynamic_bsz:
+            # Check for critic micro-batch size conflicts
+            check_mutually_exclusive(config.critic.ppo_micro_batch_size, config.critic.ppo_micro_batch_size_per_gpu, "critic")
+
+        # Check for reward model micro-batch size conflicts
+        if config.reward_model.enable and not config.reward_model.use_dynamic_bsz:
+            check_mutually_exclusive(config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model")
+
+        # Actor
+        # check if train_batch_size is larger than ppo_mini_batch_size
+        # if NOT dynamic_bsz, we must ensure:
+        #    ppo_mini_batch_size is divisible by ppo_micro_batch_size
+        #    ppo_micro_batch_size * sequence_parallel_size >= n_gpus
+        if not config.actor_rollout_ref.actor.use_dynamic_bsz:
+            assert config.data.train_batch_size >= config.actor_rollout_ref.actor.ppo_mini_batch_size
+            sp_size = config.actor_rollout_ref.actor.get("ulysses_sequence_parallel_size", 1)
+            if config.actor_rollout_ref.actor.ppo_micro_batch_size is not None:
+                assert config.actor_rollout_ref.actor.ppo_mini_batch_size % config.actor_rollout_ref.actor.ppo_micro_batch_size == 0
+                assert config.actor_rollout_ref.actor.ppo_micro_batch_size * sp_size >= n_gpus
+
+        assert config.actor_rollout_ref.actor.loss_agg_mode in [
+            "token-mean",
+            "seq-mean-token-sum",
+            "seq-mean-token-mean",
+            "seq-mean-token-sum-norm",
+        ], f"Invalid loss_agg_mode: {config.actor_rollout_ref.actor.loss_agg_mode}"
+
+        if config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss:
+            print("NOTICE: You have both enabled in-reward kl and kl loss.")
+
+        # critic
+        if self.use_critic and not config.critic.use_dynamic_bsz:
+            assert config.data.train_batch_size >= config.critic.ppo_mini_batch_size
+            sp_size = config.critic.get("ulysses_sequence_parallel_size", 1)
+            if config.critic.ppo_micro_batch_size is not None:
+                assert config.critic.ppo_mini_batch_size % config.critic.ppo_micro_batch_size == 0
+                assert config.critic.ppo_micro_batch_size * sp_size >= n_gpus
+
+        # Check if use_remove_padding is enabled when using sequence parallelism for fsdp
+        if config.actor_rollout_ref.actor.strategy == "fsdp" and (config.actor_rollout_ref.actor.get("ulysses_sequence_parallel_size", 1) > 1 or config.actor_rollout_ref.ref.get("ulysses_sequence_parallel_size", 1) > 1):
+            assert config.actor_rollout_ref.model.use_remove_padding, "When using sequence parallelism for actor/ref policy, you must enable `use_remove_padding`."
+
+        if self.use_critic and config.critic.strategy == "fsdp":
+            if config.critic.get("ulysses_sequence_parallel_size", 1) > 1:
+                assert config.critic.model.use_remove_padding, "When using sequence parallelism for critic, you must enable `use_remove_padding`."
+
+        if config.data.get("val_batch_size", None) is not None:
+            print("WARNING: val_batch_size is deprecated." + " Validation datasets are sent to inference engines as a whole batch," + " which will schedule the memory themselves.")
+
+        # check eval config
+        if config.actor_rollout_ref.rollout.val_kwargs.do_sample:
+            assert config.actor_rollout_ref.rollout.temperature > 0, "validation gen temperature should be greater than 0 when enabling do_sample"
+
+        # check multi_turn with tool config
+        if config.actor_rollout_ref.rollout.multi_turn.enable and not config.actor_rollout_ref.rollout.multi_turn.get("agent", False):
+            assert config.actor_rollout_ref.rollout.multi_turn.tool_config_path is not None, "tool_config_path must be set when enabling multi_turn with tool, due to no role-playing support"
+            assert config.algorithm.adv_estimator in [AdvantageEstimator.GRPO], "only GRPO is tested for multi-turn with tool"
+
+        print("[validate_config] All configuration checks passed successfully!")
+
+    def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler):
+        """
+        Creates the train and validation dataloaders.
+        """
+        # TODO: we have to make sure the batch size is divisible by the dp size
+        from verl_custom.trainer.main_ppo import create_rl_dataset, create_rl_sampler
+
+        if train_dataset is None:
+            train_dataset = create_rl_dataset(self.config.data.train_files, self.config.data, self.tokenizer, self.processor)
+        if val_dataset is None:
+            val_dataset = create_rl_dataset(self.config.data.val_files, self.config.data, self.tokenizer, self.processor)
+        self.train_dataset, self.val_dataset = train_dataset, val_dataset
+
+        if train_sampler is None:
+            train_sampler = create_rl_sampler(self.config.data, self.train_dataset)
+        if collate_fn is None:
+            from verl_custom.utils.dataset.rl_dataset import \
+                collate_fn as default_collate_fn
+
+            collate_fn = default_collate_fn
+
+        self.train_dataloader = StatefulDataLoader(
+            dataset=self.train_dataset,
+            batch_size=self.config.data.get("gen_batch_size", self.config.data.train_batch_size),
+            num_workers=self.config.data.get("dataloader_num_workers", 8),
+            drop_last=True,
+            collate_fn=collate_fn,
+            sampler=train_sampler,
+        )
+
+        val_batch_size = self.config.data.val_batch_size  # Prefer config value if set
+        if val_batch_size is None:
+            val_batch_size = len(self.val_dataset)
+
+        self.val_dataloader = StatefulDataLoader(
+            dataset=self.val_dataset,
+            batch_size=val_batch_size,
+            num_workers=self.config.data.get("dataloader_num_workers", 8),
+            shuffle=self.config.data.get("validation_shuffle", True),
+            drop_last=False,
+            collate_fn=collate_fn,
+        )
+
+        assert len(self.train_dataloader) >= 1, "Train dataloader is empty!"
+        assert len(self.val_dataloader) >= 1, "Validation dataloader is empty!"
+
+        print(f"Size of train dataloader: {len(self.train_dataloader)}, Size of val dataloader: {len(self.val_dataloader)}")
+
+        total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
+
+        if self.config.trainer.total_training_steps is not None:
+            total_training_steps = self.config.trainer.total_training_steps
+
+        self.total_training_steps = total_training_steps
+        print(f"Total training steps: {self.total_training_steps}")
+
+        try:
+            OmegaConf.set_struct(self.config, True)
+            with open_dict(self.config):
+                if OmegaConf.select(self.config, "actor_rollout_ref.actor.optim"):
+                    self.config.actor_rollout_ref.actor.optim.total_training_steps = total_training_steps
+                if OmegaConf.select(self.config, "critic.optim"):
+                    self.config.critic.optim.total_training_steps = total_training_steps
+        except Exception as e:
+            print(f"Warning: Could not set total_training_steps in config. Structure missing? Error: {e}")
+
+    def _dump_generations(self, inputs, outputs, scores, reward_extra_infos_dict, dump_path):
+        """Dump rollout/validation samples as JSONL."""
+        os.makedirs(dump_path, exist_ok=True)
+        filename = os.path.join(dump_path, f"{self.global_steps}.jsonl")
+
+        n = len(inputs)
+        base_data = {
+            "input": inputs,
+            "output": outputs,
+            "score": scores,
+            "step": [self.global_steps] * n,
+        }
+
+        for k, v in reward_extra_infos_dict.items():
+            if len(v) == n:
+                base_data[k] = v
+
+        with open(filename, "w") as f:
+            for i in range(n):
+                entry = {k: v[i] for k, v in base_data.items()}
+                f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+
+        print(f"Dumped generations to {filename}")
+
+    def _maybe_log_val_generations(self, inputs, outputs, scores):
+        """Log a table of validation samples to the configured logger (wandb or swanlab)"""
+
+        generations_to_log = self.config.trainer.log_val_generations
+
+        if generations_to_log == 0:
+            return
+
+        import numpy as np
+
+        # Create tuples of (input, output, score) and sort by input text
+        samples = list(zip(inputs, outputs, scores))
+        samples.sort(key=lambda x: x[0])  # Sort by input text
+
+        # Use fixed random seed for deterministic shuffling
+        rng = np.random.RandomState(42)
+        rng.shuffle(samples)
+
+        # Take first N samples after shuffling
+        samples = samples[:generations_to_log]
+
+        # Log to each configured logger
+        self.validation_generations_logger.log(self.config.trainer.logger, samples, self.global_steps)
+
+    def _validate(self):
+        data_source_lst = []
+        reward_extra_infos_dict: dict[str, list] = defaultdict(list)
+
+        # Lists to collect samples for the table
+        sample_inputs = []
+        sample_outputs = []
+        sample_scores = []
+
+        for test_data in self.val_dataloader:
+            test_batch = DataProto.from_single_dict(test_data)
+
+            # we only do validation on rule-based rm
+            if self.config.reward_model.enable and test_batch[0].non_tensor_batch["reward_model"]["style"] == "model":
+                return {}
+
+            # Store original inputs
+            input_ids = test_batch.batch["input_ids"]
+            
+
+            batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+            if self.config.actor_rollout_ref.rollout.get("task_type", None) == "swegym": 
+                non_tensor_batch_keys_to_pop = ["instance"]
+            else:
+                non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+            if "multi_modal_data" in test_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("multi_modal_data")
+            if "raw_prompt" in test_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("raw_prompt")
+            if "tools_kwargs" in test_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("tools_kwargs")
+            test_gen_batch = test_batch.pop(
+                batch_keys=batch_keys_to_pop,
+                non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+            )
+
+            test_gen_batch.meta_info = {
+                "eos_token_id": self.tokenizer.eos_token_id,
+                "pad_token_id": self.tokenizer.pad_token_id,
+                "recompute_log_prob": False,
+                "do_sample": self.config.actor_rollout_ref.rollout.val_kwargs.do_sample,
+                "validate": True,
+            }
+            print(f"test_gen_batch meta info: {test_gen_batch.meta_info}")
+            # only pad for non-openhands async manager
+            if self.config.actor_rollout_ref.rollout.get("async_manager", "") != "openhands":
+                # pad to be divisible by dp_size
+                test_gen_batch, pad_size = pad_dataproto_to_divisor(test_gen_batch, self.actor_rollout_wg.world_size)
+
+            if not self.async_rollout_mode:
+                test_output_gen_batch = self.actor_rollout_wg.generate_sequences(test_gen_batch)
+            else:
+                self.async_rollout_manager.wake_up()
+                test_output_gen_batch = self.async_rollout_manager.generate_sequences(test_gen_batch, val_mode=True)
+                self.async_rollout_manager.sleep()
+
+            # only unpad for non-openhands async manager
+            if self.config.actor_rollout_ref.rollout.get("async_manager", "") != "openhands":
+                # unpad
+                test_output_gen_batch = unpad_dataproto(test_output_gen_batch, pad_size=pad_size)
+            print("validation generation end")
+
+            # Store generated outputs
+            output_ids = test_output_gen_batch.batch["responses"]
+            output_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
+            sample_outputs.extend(output_texts)
+
+            # repeat test batch
+            test_batch = test_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.val_kwargs.n, interleave=True)
+
+            # TODO: Can we keep special tokens except for padding tokens?
+            input_texts = [[self.tokenizer.decode(ids, skip_special_tokens=True) for i in range(self.config.actor_rollout_ref.rollout.val_kwargs.n)] for ids in input_ids]
+            input_texts = [item for sublist in input_texts for item in sublist]
+            sample_inputs.extend(input_texts)
+            test_batch = test_batch.union(test_output_gen_batch)
+
+            # evaluate using reward_function
+            result = ray.get(self.val_reward_fn.__call__.remote(test_batch))
+            reward_tensor = result["score"]
+            scores = reward_tensor.sum(-1).cpu().tolist()
+            sample_scores.extend(scores)
+
+            reward_extra_infos_dict["reward"].extend(scores)
+            if "reward_extra_info" in result:
+                for key, lst in result["reward_extra_info"].items():
+                    reward_extra_infos_dict[key].extend(lst)
+            
+            # Log reward_metrics if available during validation
+            if "reward_metrics" in result:
+                val_reward_metrics = result["reward_metrics"]
+                if isinstance(val_reward_metrics, dict):
+                    for key, value in val_reward_metrics.items():
+                        # Use the same key format as in training
+                        metric_key = f"reward_metrics/{key}"
+                        if metric_key not in reward_extra_infos_dict:
+                            reward_extra_infos_dict[metric_key] = []
+                        reward_extra_infos_dict[metric_key].extend([value] * len(scores))
+                else:
+                    print(f"Warning: validation reward_metrics is not a dict, got {type(val_reward_metrics)}")
+                    
+            data_source = test_batch.non_tensor_batch.get('data_source', ['unknown'] * reward_tensor.shape[0])
+            for idx, x in enumerate(test_batch.non_tensor_batch.get('extra_info', [])):
+                if 'name' in x:
+                    data_source[idx] = x['name']
+            data_source_lst.append(data_source) 
+
+        self._maybe_log_val_generations(inputs=sample_inputs, outputs=sample_outputs, scores=sample_scores)
+
+        # dump generations
+        val_data_dir = self.config.trainer.get("validation_data_dir", None)
+        if val_data_dir:
+            self._dump_generations(
+                inputs=sample_inputs,
+                outputs=sample_outputs,
+                scores=sample_scores,
+                reward_extra_infos_dict=reward_extra_infos_dict,
+                dump_path=val_data_dir,
+            )
+
+        for key_info, lst in reward_extra_infos_dict.items():
+            assert len(lst) == 0 or len(lst) == len(sample_scores), f"{key_info}: {len(lst)=}, {len(sample_scores)=}"
+
+        data_sources = np.concatenate(data_source_lst, axis=0)
+
+        data_src2var2metric2val = process_validation_metrics(data_sources, sample_inputs, reward_extra_infos_dict)
+        metric_dict = {}
+        for data_source, var2metric2val in data_src2var2metric2val.items():
+            core_var = "acc" if "acc" in var2metric2val else "reward"
+            for var_name, metric2val in var2metric2val.items():
+                n_max = max([int(name.split("@")[-1].split("/")[0]) for name in metric2val.keys()])
+                for metric_name, metric_val in metric2val.items():
+                    if (var_name == core_var) and any(metric_name.startswith(pfx) for pfx in ["mean", "maj", "best"]) and (f"@{n_max}" in metric_name):
+                        metric_sec = "val-core"
+                    else:
+                        metric_sec = "val-aux"
+                    pfx = f"{metric_sec}/{data_source}/{var_name}/{metric_name}"
+                    metric_dict[pfx] = metric_val
+
+        # ==================== Pass@k Calculation ====================
+        # Compute pass@k metrics (if enabled)
+        if getattr(self.config.trainer, 'enable_pass_k_evaluation', False):
+            print("Computing pass@k metrics...")
+            
+            # Get k values list from configuration
+            k_values = getattr(self.config.trainer, 'pass_k_values', [1, 4, 8, 16])
+            
+            # Build problem statistics information
+            problem_results = defaultdict(lambda: {'total': 0, 'correct': 0})
+            
+            # Build problem ID identification strategy
+            problem_id_strategy = getattr(self.config.trainer, 'pass_k_problem_id_strategy', 'input_hash')
+            
+            # Assign problem ID for each sample
+            for i, (input_text, output_text, score) in enumerate(zip(sample_inputs, sample_outputs, sample_scores)):
+                
+                # Determine problem ID based on configured strategy
+                if problem_id_strategy == 'input_hash':
+                    # Use hash of input text as problem identifier
+                    problem_id = abs(hash(input_text)) % 10000
+                elif problem_id_strategy == 'input_prefix':
+                    # Use first 50 characters of input text as problem identifier
+                    problem_id = input_text[:50].strip()
+                elif problem_id_strategy == 'index_based':
+                    # Index-based strategy, assuming every N samples are for the same problem
+                    samples_per_problem = getattr(self.config.trainer, 'pass_k_samples_per_problem', 10)
+                    problem_id = i // samples_per_problem
+                else:
+                    # Default to using index
+                    problem_id = i
+                
+                problem_results[problem_id]['total'] += 1
+                
+                # Determine if test passes
+                success_threshold = getattr(self.config.trainer, 'pass_k_success_threshold', 0.0)
+                success_mode = getattr(self.config.trainer, 'pass_k_success_mode', 'threshold')
+                
+                is_success = False
+                if success_mode == 'threshold':
+                    # Judge based on score threshold
+                    is_success = score > success_threshold
+                elif success_mode == 'positive':
+                    # Positive numbers indicate success
+                    is_success = score > 0
+                elif success_mode == 'percentile':
+                    # Judge based on percentile (requires pre-computing all scores)
+                    percentile_threshold = getattr(self.config.trainer, 'pass_k_success_percentile', 50)
+                    threshold_score = np.percentile(sample_scores, percentile_threshold)
+                    is_success = score >= threshold_score
+                else:
+                    # Default to threshold mode
+                    is_success = score > success_threshold
+                
+                if is_success:
+                    problem_results[problem_id]['correct'] += 1
+            # Print problem results
+            for problem_id, result in problem_results.items():
+                print("="*50)
+                print("Here is the problem results:")
+                print(f"Problem {problem_id}: {result['total']} total, {result['correct']} correct")
+                print("="*50)
+            # Compute pass@k metrics
+            pass_k_results = compute_pass_k_metrics(dict(problem_results), k_values)
+            
+            # Add pass@k results to metric_dict
+            for k_metric, k_result in pass_k_results.items():
+                metric_dict[f"val-core/pass_k/{k_metric}/score"] = k_result['score']
+                metric_dict[f"val-aux/pass_k/{k_metric}/std"] = k_result['std']
+                metric_dict[f"val-aux/pass_k/{k_metric}/num_problems"] = k_result['num_problems']
+            
+            print(f"Pass@k evaluation completed. Evaluated {len(problem_results)} unique problems.")
+            
+            # Print pass@k results
+            print("\n" + "="*50)
+            print("PASS@K VALIDATION RESULTS")
+            print("="*50)
+            for k in sorted(k_values):
+                if f'pass@{k}' in pass_k_results:
+                    result = pass_k_results[f'pass@{k}']
+                    score = result['score']
+                    std = result['std']
+                    num_problems = result['num_problems']
+                    print(f"pass@{k:3d}: {score:.4f} ± {std:.4f} (n={num_problems})")
+            print("="*50)
+
+        return metric_dict
+
+    def init_workers(self):
+        """Initialize distributed training workers using Ray backend.
+
+        Creates:
+        1. Ray resource pools from configuration
+        2. Worker groups for each role (actor, critic, etc.)
+        """
+        self.resource_pool_manager.create_resource_pool()
+
+        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
+
+        # create actor and rollout
+        if self.hybrid_engine:
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.ActorRollout)
+            actor_rollout_cls = RayClassWithInitArgs(
+                cls=self.role_worker_mapping[Role.ActorRollout],
+                config=self.config.actor_rollout_ref,
+                role="actor_rollout",
+            )
+            self.resource_pool_to_cls[resource_pool]["actor_rollout"] = actor_rollout_cls
+        else:
+            raise NotImplementedError
+
+        # create critic
+        if self.use_critic:
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
+            critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic)
+            self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls
+
+        # create reference policy if needed
+        if self.use_reference_policy:
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
+            ref_policy_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RefPolicy], config=self.config.actor_rollout_ref, role="ref")
+            self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls
+
+        # create a reward model if reward_fn is None
+        if self.use_rm:
+            # we create a RM here
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
+            rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model)
+            self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls
+
+        # initialize WorkerGroup
+        # NOTE: if you want to use a different resource pool for each role, which can support different parallel size,
+        # you should not use `create_colocated_worker_cls`.
+        # Instead, directly pass different resource pool to different worker groups.
+        # See https://github.com/volcengine/verl/blob/master/examples/ray/tutorial.ipynb for more information.
+        all_wg = {}
+        wg_kwargs = {}  # Setting up kwargs for RayWorkerGroup
+        if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
+            wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
+
+        for resource_pool, class_dict in self.resource_pool_to_cls.items():
+            worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
+            wg_dict = self.ray_worker_group_cls(resource_pool=resource_pool, ray_cls_with_init=worker_dict_cls, device_name=self.device_name, **wg_kwargs)
+            spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
+            all_wg.update(spawn_wg)
+
+        if self.use_critic:
+            self.critic_wg = all_wg["critic"]
+            self.critic_wg.init_model()
+
+        if self.use_reference_policy and not self.ref_in_actor:
+            self.ref_policy_wg = all_wg["ref"]
+            self.ref_policy_wg.init_model()
+
+        if self.use_rm:
+            self.rm_wg = all_wg["rm"]
+            self.rm_wg.init_model()
+
+        # we should create rollout at the end so that vllm can have a better estimation of kv cache memory
+        self.actor_rollout_wg = all_wg["actor_rollout"]
+        self.actor_rollout_wg.init_model()
+
+        # create async rollout manager and request scheduler
+        self.async_rollout_mode = False
+        if self.config.actor_rollout_ref.rollout.mode == "async":
+            if self.config.actor_rollout_ref.rollout.get("async_manager", "")=="openhands":
+                if self.config.algorithm.get("filter_groups", {}).get("enable", False):
+                    from verl_custom.nvidia.rollout.async_server_dapo import AsyncLLMServerManagerDAPO as AsyncLLMServerManager
+                else:
+                    from verl_custom.nvidia.rollout.async_server import AsyncLLMServerManager
+            else:
+                from verl.workers.rollout.async_server import AsyncLLMServerManager
+
+            self.async_rollout_mode = True
+            self.async_rollout_manager = AsyncLLMServerManager(
+                config=self.config,
+                worker_group=self.actor_rollout_wg,
+            )
+            # need to set data_loader for dapo
+            if self.config.algorithm.get("filter_groups", {}).get("enable", False):
+                self.async_rollout_manager.data_loader = self.train_dataloader
+            
+
+    def _save_checkpoint(self):
+        # path: given_path + `/global_step_{global_steps}` + `/actor`
+        local_global_step_folder = os.path.join(self.config.trainer.default_local_dir, f"global_step_{self.global_steps}")
+
+        print(f"local_global_step_folder: {local_global_step_folder}")
+        actor_local_path = os.path.join(local_global_step_folder, "actor")
+
+        actor_remote_path = None if self.config.trainer.default_hdfs_dir is None else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "actor")
+
+        remove_previous_ckpt_in_save = self.config.trainer.get("remove_previous_ckpt_in_save", False)
+        if remove_previous_ckpt_in_save:
+            print("Warning: remove_previous_ckpt_in_save is deprecated," + " set max_actor_ckpt_to_keep=1 and max_critic_ckpt_to_keep=1 instead")
+        max_actor_ckpt_to_keep = self.config.trainer.get("max_actor_ckpt_to_keep", None) if not remove_previous_ckpt_in_save else 1
+        max_critic_ckpt_to_keep = self.config.trainer.get("max_critic_ckpt_to_keep", None) if not remove_previous_ckpt_in_save else 1
+
+        self.actor_rollout_wg.save_checkpoint(actor_local_path, actor_remote_path, self.global_steps, max_ckpt_to_keep=max_actor_ckpt_to_keep)
+
+        if self.use_critic:
+            critic_local_path = os.path.join(local_global_step_folder, "critic")
+            critic_remote_path = None if self.config.trainer.default_hdfs_dir is None else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "critic")
+            self.critic_wg.save_checkpoint(critic_local_path, critic_remote_path, self.global_steps, max_ckpt_to_keep=max_critic_ckpt_to_keep)
+
+        # save dataloader
+        BaseCheckpointManager.local_mkdir(local_global_step_folder)
+        dataloader_local_path = os.path.join(local_global_step_folder, "data.pt")
+        dataloader_state_dict = self.train_dataloader.state_dict()
+        torch.save(dataloader_state_dict, dataloader_local_path)
+
+        # latest checkpointed iteration tracker (for atomic usage)
+        local_latest_checkpointed_iteration = os.path.join(self.config.trainer.default_local_dir, "latest_checkpointed_iteration.txt")
+        with open(local_latest_checkpointed_iteration, "w") as f:
+            f.write(str(self.global_steps))
+
+    def _load_checkpoint(self):
+        # Sleep and wake up the async rollout manager: https://github.com/volcengine/verl/issues/2613
+        # This syncs weights to vllm server. Also release GPU memory.
+        if self.async_rollout_mode:
+            self.async_rollout_manager.sleep()
+
+        if self.config.trainer.resume_mode == "disable":
+            return 0
+
+        # load from hdfs
+        if self.config.trainer.default_hdfs_dir is not None:
+            raise NotImplementedError("load from hdfs is not implemented yet")
+        else:
+            checkpoint_folder = self.config.trainer.default_local_dir  # TODO: check path
+            if not os.path.isabs(checkpoint_folder):
+                working_dir = os.getcwd()
+                checkpoint_folder = os.path.join(working_dir, checkpoint_folder)
+            global_step_folder = find_latest_ckpt_path(checkpoint_folder)  # None if no latest
+
+        # find global_step_folder
+        if self.config.trainer.resume_mode == "auto":
+            if global_step_folder is None:
+                print("Training from scratch")
+                return 0
+        else:
+            if self.config.trainer.resume_mode == "resume_path":
+                assert isinstance(self.config.trainer.resume_from_path, str), "resume ckpt must be str type"
+                assert "global_step_" in self.config.trainer.resume_from_path, "resume ckpt must specify the global_steps"
+                global_step_folder = self.config.trainer.resume_from_path
+                if not os.path.isabs(global_step_folder):
+                    working_dir = os.getcwd()
+                    global_step_folder = os.path.join(working_dir, global_step_folder)
+        print(f"Load from checkpoint folder: {global_step_folder}")
+        # set global step
+        self.global_steps = int(global_step_folder.split("global_step_")[-1])
+
+        print(f"Setting global step to {self.global_steps}")
+        print(f"Resuming from {global_step_folder}")
+
+        actor_path = os.path.join(global_step_folder, "actor")
+        critic_path = os.path.join(global_step_folder, "critic")
+        # load actor
+        self.actor_rollout_wg.load_checkpoint(actor_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load)
+        # load critic
+        if self.use_critic:
+            self.critic_wg.load_checkpoint(critic_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load)
+
+        # load dataloader,
+        # TODO: from remote not implemented yet
+        dataloader_local_path = os.path.join(global_step_folder, "data.pt")
+        if os.path.exists(dataloader_local_path):
+            dataloader_state_dict = torch.load(dataloader_local_path, weights_only=False)
+            self.train_dataloader.load_state_dict(dataloader_state_dict)
+        else:
+            print(f"Warning: No dataloader state found at {dataloader_local_path}, will start from scratch")
+
+    def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqlen"):
+        """Reorder the data on single controller such that each dp rank gets similar total tokens"""
+        attention_mask = batch.batch["attention_mask"]
+        batch_size = attention_mask.shape[0]
+        global_seqlen_lst = batch.batch["attention_mask"].view(batch_size, -1).sum(-1).tolist()  # (train_batch_size,)
+        world_size = self.actor_rollout_wg.world_size
+        global_partition_lst = get_seqlen_balanced_partitions(global_seqlen_lst, k_partitions=world_size, equal_size=True)
+        # reorder based on index. The data will be automatically equally partitioned by dispatch function
+        global_idx = torch.tensor([j for partition in global_partition_lst for j in partition])
+        batch.reorder(global_idx)
+        global_balance_stats = log_seqlen_unbalance(seqlen_list=global_seqlen_lst, partitions=global_partition_lst, prefix=logging_prefix)
+        metrics.update(global_balance_stats)
+
+
+    def fit(self):
+        """
+        The training loop of PPO.
+        The driver process only need to call the compute functions of the worker group through RPC
+        to construct the PPO dataflow.
+        The light-weight advantage computation is done on the driver process.
+        """
+            
+        from omegaconf import OmegaConf
+
+        from verl.utils.tracking import Tracking
+
+        logger = Tracking(
+            project_name=self.config.trainer.project_name,
+            experiment_name=self.config.trainer.experiment_name,
+            default_backend=self.config.trainer.logger,
+            config=OmegaConf.to_container(self.config, resolve=True),
+        )
+
+        self.global_steps = 0
+
+        # load checkpoint before doing anything
+        self._load_checkpoint()
+
+        # perform validation before training
+        # currently, we only support validation using the reward_function.
+        if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
+            val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
+            pprint(f"Initial validation metrics: {val_metrics}")
+            logger.log(data=val_metrics, step=self.global_steps)
+            if self.config.trainer.get("val_only", False):
+                return
+
+        # add tqdm
+        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
+
+        # we start from step 1
+        self.global_steps += 1
+        last_val_metrics = None
+
+        for epoch in range(self.config.trainer.total_epochs):
+            for batch_dict in self.train_dataloader:
+                metrics = {}
+                timing_raw = {}
+                batch: DataProto = DataProto.from_single_dict(batch_dict)
+
+                # pop those keys for generation
+                batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+                if self.config.actor_rollout_ref.rollout.get("task_type", None) == "swegym": 
+                    non_tensor_batch_keys_to_pop = ["instance"]
+                else:
+                    non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+                if "multi_modal_data" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("multi_modal_data")
+                if "raw_prompt" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("raw_prompt")
+                if "tools_kwargs" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("tools_kwargs")
+                gen_batch = batch.pop(
+                    batch_keys=batch_keys_to_pop,
+                    non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+                )
+
+                is_last_step = self.global_steps >= self.total_training_steps
+
+                with _timer("step", timing_raw):
+                    # generate a batch
+                    with _timer("gen", timing_raw):
+                        if not self.async_rollout_mode:
+                            gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
+                        else:
+                            self.async_rollout_manager.wake_up()
+                            gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch)
+                            self.async_rollout_manager.sleep()
+                        timing_raw.update(gen_batch_output.meta_info["timing"])
+                        gen_batch_output.meta_info.pop("timing", None)
+
+                    if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
+                        with _timer("gen_max", timing_raw):
+                            gen_baseline_batch = deepcopy(gen_batch)
+                            gen_baseline_batch.meta_info["do_sample"] = False
+                            gen_baseline_output = self.actor_rollout_wg.generate_sequences(gen_baseline_batch)
+
+                            batch = batch.union(gen_baseline_output)
+                            reward_baseline_tensor = self.reward_fn(batch)
+                            reward_baseline_tensor = reward_baseline_tensor.sum(dim=-1)
+
+                            batch.pop(batch_keys=list(gen_baseline_output.batch.keys()))
+
+                            batch.batch["reward_baselines"] = reward_baseline_tensor
+
+                            del gen_baseline_batch, gen_baseline_output
+
+                    batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object)
+                    # repeat to align with repeated responses in rollout
+                    batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+                    batch = batch.union(gen_batch_output)
+
+                    batch.batch["response_mask"] = compute_response_mask(batch)
+                    # Balance the number of valid tokens across DP ranks.
+                    # NOTE: This usually changes the order of data in the `batch`,
+                    # which won't affect the advantage calculation (since it's based on uid),
+                    # but might affect the loss calculation (due to the change of mini-batching).
+                    # TODO: Decouple the DP balancing and mini-batching.
+                    if self.config.trainer.balance_batch:
+                        self._balance_batch(batch, metrics=metrics)
+
+                    # compute global_valid tokens
+                    batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
+
+                    with _timer("reward", timing_raw):
+                        # compute reward model score
+                        if self.use_rm:
+                            reward_tensor = self.rm_wg.compute_rm_score(batch)
+                            batch = batch.union(reward_tensor)
+
+                        # rule-based rm
+                        future_reward = self.reward_fn.__call__.remote(batch)
+
+                    # recompute old_log_probs
+                    with _timer("old_log_prob", timing_raw):
+                        old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
+                        entropys = old_log_prob.batch["entropys"]
+                        response_masks = batch.batch["response_mask"]
+                        loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
+                        entropy_loss = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
+                        old_log_prob_metrics = {"actor/entropy_loss": entropy_loss.detach().item()}
+                        metrics.update(old_log_prob_metrics)
+                        old_log_prob.batch.pop("entropys")
+                        batch = batch.union(old_log_prob)
+
+                        if "rollout_log_probs" in batch.batch.keys():
+                            # TODO: we may want to add diff of probs too.
+                            rollout_old_log_probs = batch.batch["rollout_log_probs"]
+                            actor_old_log_probs = batch.batch["old_log_probs"]
+                            attention_mask = batch.batch["attention_mask"]
+                            responses = batch.batch["responses"]
+                            response_length = responses.size(1)
+                            response_mask = attention_mask[:, -response_length:]
+
+                            rollout_probs = torch.exp(rollout_old_log_probs)
+                            actor_probs = torch.exp(actor_old_log_probs)
+                            rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
+                            rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
+                            rollout_probs_diff_max = torch.max(rollout_probs_diff)
+                            rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
+                            rollout_probs_diff_std = torch.std(rollout_probs_diff)
+                            metrics.update(
+                                {
+                                    "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
+                                    "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
+                                    "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
+                                }
+                            )
+
+                    if self.use_reference_policy:
+                        # compute reference log_prob
+                        with _timer("ref", timing_raw):
+                            if not self.ref_in_actor:
+                                ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
+                            else:
+                                ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch)
+                            batch = batch.union(ref_log_prob)
+
+                    # compute values
+                    if self.use_critic:
+                        with _timer("values", timing_raw):
+                            values = self.critic_wg.compute_values(batch)
+                            batch = batch.union(values)
+
+                    with _timer("adv", timing_raw):
+                        # we combine with rule-based rm
+                        reward_result = ray.get(future_reward)
+                        reward_extra_infos_dict = {}
+
+                        batch.batch["token_level_scores"] = reward_result["score"]
+                        batch.batch['token_level_rewards'] = reward_result['reward']
+
+                        # Log reward_metrics if available
+                        if "reward_metrics" in reward_result:
+                            reward_metrics = reward_result["reward_metrics"]
+                            if isinstance(reward_metrics, dict):
+                                for key, value in reward_metrics.items():
+                                    metrics[f"reward_metrics/{key}"] = value
+                            else:
+                                print(f"Warning: reward_metrics is not a dict, got {type(reward_metrics)}")
+
+                        if reward_extra_infos_dict:
+                            batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
+
+                        # compute rewards. apply_kl_penalty if available
+                        if self.config.algorithm.use_kl_in_reward:
+                            batch, kl_metrics = apply_kl_penalty(batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
+                            metrics.update(kl_metrics)
+
+                        # compute advantages, executed on the driver process
+
+                        norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)  # GRPO adv normalization factor
+
+                        batch = compute_advantage(
+                            batch,
+                            adv_estimator=self.config.algorithm.adv_estimator,
+                            gamma=self.config.algorithm.gamma,
+                            lam=self.config.algorithm.lam,
+                            num_repeat=self.config.actor_rollout_ref.rollout.n,
+                            norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
+                            multi_turn=self.config.actor_rollout_ref.rollout.multi_turn.enable,
+                            config=self.config.algorithm
+                        )
+
+                    # update critic
+                    if self.use_critic:
+                        with _timer("update_critic", timing_raw):
+                            critic_output = self.critic_wg.update_critic(batch)
+                        critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
+                        metrics.update(critic_output_metrics)
+
+                    # implement critic warmup
+                    if self.config.trainer.critic_warmup <= self.global_steps:
+                        # update actor
+                        with _timer("update_actor", timing_raw):
+                            batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
+                            actor_output = self.actor_rollout_wg.update_actor(batch)
+                        actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
+                        metrics.update(actor_output_metrics)
+
+                    # Log rollout generations if enabled
+                    rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
+                    if rollout_data_dir:
+                        with _timer("dump_rollout_generations", timing_raw):
+                            print(batch.batch.keys())
+                            inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
+                            outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
+                            scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
+                            self._dump_generations(
+                                inputs=inputs,
+                                outputs=outputs,
+                                scores=scores,
+                                reward_extra_infos_dict=reward_extra_infos_dict,
+                                dump_path=rollout_data_dir,
+                            )
+
+                    if self.config.trainer.save_freq > 0 and (is_last_step or self.global_steps % self.config.trainer.save_freq == 0):
+                        with _timer("save_checkpoint", timing_raw):
+                            self._save_checkpoint()
+                    elif self.timeout.check_save():
+                        with _timer('save_checkpoint', timing_raw):
+                            self._save_checkpoint()
+
+                    # validate
+                    if self.val_reward_fn is not None and self.config.trainer.test_freq > 0 and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0 or self.timeout.last_saved):
+                        with _timer("testing", timing_raw):
+                            val_metrics: dict = self._validate()
+                            if is_last_step:
+                                last_val_metrics = val_metrics
+                        metrics.update(val_metrics)
+
+                # training metrics
+                metrics.update(
+                    {
+                        "training/global_step": self.global_steps,
+                        "training/epoch": epoch,
+                    }
+                )
+                # collect metrics
+                metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
+                metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
+                # TODO: implement actual tflpo and theoretical tflpo
+                n_gpus = self.resource_pool_manager.get_n_gpus()
+                metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
+
+                # TODO: make a canonical logger that supports various backend
+                logger.log(data=metrics, step=self.global_steps)
+
+                progress_bar.update(1)
+                self.global_steps += 1
+                if is_last_step:
+                    pprint(f"Final validation metrics: {last_val_metrics}")
+                    progress_bar.close()
+                    return
diff --git a/trainer_integration/verl/verl_custom/trainer/ppo/ray_trainer_dapo.py b/trainer_integration/verl/verl_custom/trainer/ppo/ray_trainer_dapo.py
new file mode 100644
index 000000000..a857b091b
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/trainer/ppo/ray_trainer_dapo.py
@@ -0,0 +1,313 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023-2024 SGLang Team
+# Copyright 2025 ModelBest Inc. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+FSDP PPO Trainer with Ray-based single controller.
+This trainer supports model-agonistic model initialization with huggingface
+"""
+
+
+from collections import defaultdict
+from copy import deepcopy
+from dataclasses import dataclass, field
+from enum import Enum
+from pprint import pprint
+from typing import Optional, Type
+
+import numpy as np
+import ray
+import torch
+from omegaconf import OmegaConf, open_dict
+from torch.utils.data import Dataset, Sampler
+from torchdata.stateful_dataloader import StatefulDataLoader
+from tqdm import tqdm
+
+from verl import DataProto
+from verl_custom.nvidia.reward_manager.length_penalty import LengthPenalty
+from verl_custom.nvidia.utils.timer import TimeoutChecker
+from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
+from verl.single_controller.base import Worker
+from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
+from verl.single_controller.ray.base import create_colocated_worker_cls
+from verl_custom.trainer.ppo import core_algos
+from verl_custom.trainer.ppo.core_algos import AdvantageEstimator, agg_loss
+from verl_custom.trainer.ppo.metric_utils import (
+    compute_data_metrics,
+    compute_throughout_metrics,
+    compute_timing_metrics,
+    process_validation_metrics,
+)
+from verl_custom.trainer.ppo.reward import compute_reward, compute_reward_async
+from verl.utils.checkpoint.checkpoint_manager import BaseCheckpointManager, find_latest_ckpt_path
+from verl.utils.debug.performance import _timer
+from verl.utils.metric import (
+    reduce_metrics,
+)
+from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance
+from verl.utils.torch_functional import masked_mean
+from verl.utils.tracking import ValidationGenerationsLogger
+from verl_custom.trainer.ppo.ray_trainer import RayPPOTrainer,compute_response_mask,apply_kl_penalty,compute_advantage
+import time
+
+
+class RayPPOTrainerDAPO(RayPPOTrainer):
+    """
+    Note that this trainer runs on the driver process on a single CPU/GPU node.
+    """
+
+    def fit(self):
+        """
+        The training loop of PPO.
+        The driver process only need to call the compute functions of the worker group through RPC
+        to construct the PPO dataflow.
+        The light-weight advantage computation is done on the driver process.
+        """
+            
+        from omegaconf import OmegaConf
+
+        from verl.utils.tracking import Tracking
+
+        logger = Tracking(
+            project_name=self.config.trainer.project_name,
+            experiment_name=self.config.trainer.experiment_name,
+            default_backend=self.config.trainer.logger,
+            config=OmegaConf.to_container(self.config, resolve=True),
+        )
+
+        self.global_steps = 0
+        # to resolve a bug in vllm
+        if self.config.actor_rollout_ref.rollout.mode == "async":
+            self.async_rollout_manager.sleep()
+        # load checkpoint before doing anything
+        self._load_checkpoint()            
+
+        # perform validation before training
+        # currently, we only support validation using the reward_function.
+        if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
+            val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
+            pprint(f"Initial validation metrics: {val_metrics}")
+            logger.log(data=val_metrics, step=self.global_steps)
+            if self.config.trainer.get("val_only", False):
+                return
+
+        # add tqdm
+        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
+
+        # we start from step 1
+        self.global_steps += 1
+        last_val_metrics = None
+
+        timing_raw = defaultdict(float)
+        batch = None
+        
+        while True:
+            metrics = {}
+
+            is_last_step = self.global_steps >= self.total_training_steps
+
+            with _timer("step", timing_raw):
+                # generate a batch
+                with _timer("gen", timing_raw):
+                    assert self.async_rollout_mode
+                    self.async_rollout_manager.wake_up()
+                    batch = self.async_rollout_manager.generate_sequences_dapo()
+                    self.async_rollout_manager.sleep()
+                            
+                    timing_raw.update(batch.meta_info["timing"])
+                    batch.meta_info.pop("timing", None)
+
+                with _timer("reward", timing_raw):
+                    # compute scores. Support both model and function-based.
+                    if self.use_rm:
+                        # we first compute reward model score
+                        reward_tensor = self.rm_wg.compute_rm_score(batch)
+                        batch = batch.union(reward_tensor)
+
+                    # rule-based rm
+                    reward_result = ray.get(self.reward_fn.__call__.remote(batch))
+                    reward_extra_infos_dict = {}
+
+                    batch.batch["token_level_scores"] = reward_result["score"]
+                    batch.batch['token_level_rewards'] = reward_result['reward']
+
+                    if reward_extra_infos_dict:
+                        batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
+
+                    # Log reward_metrics if available
+                    if "reward_metrics" in reward_result:
+                        reward_metrics = reward_result["reward_metrics"]
+                        if isinstance(reward_metrics, dict):
+                            for key, value in reward_metrics.items():
+                                metrics[f"reward_metrics/{key}"] = value
+                        else:
+                            print(f"Warning: reward_metrics is not a dict, got {type(reward_metrics)}")
+
+                    # compute rewards. apply_kl_penalty if available
+                    if self.config.algorithm.use_kl_in_reward:
+                        batch, kl_metrics = apply_kl_penalty(batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
+                        metrics.update(kl_metrics)  # TODO: This will be cleared if we use multiple genenration batches
+
+                assert self.config.algorithm.filter_groups.enable
+
+                # === Updating ===
+
+                batch.batch["response_mask"] = compute_response_mask(batch)
+
+                # Balance the number of valid tokens across DP ranks.
+                # NOTE: This usually changes the order of data in the `batch`,
+                # which won't affect the advantage calculation (since it's based on uid),
+                # but might affect the loss calculation (due to the change of mini-batching).
+                # TODO: Decouple the DP balancing and mini-batching.
+                if self.config.trainer.balance_batch:
+                    self._balance_batch(batch, metrics=metrics)
+
+                # compute global_valid tokens
+                batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
+
+                # recompute old_log_probs
+                with _timer("old_log_prob", timing_raw):
+                    old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
+                    entropys = old_log_prob.batch["entropys"]
+                    response_masks = batch.batch["response_mask"]
+                    loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
+                    entropy_loss = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
+                    old_log_prob_metrics = {"actor/entropy_loss": entropy_loss.detach().item()}
+                    metrics.update(old_log_prob_metrics)
+                    old_log_prob.batch.pop("entropys")
+                    batch = batch.union(old_log_prob)
+
+                    if "rollout_log_probs" in batch.batch.keys():
+                        # TODO: we may want to add diff of probs too.
+                        rollout_old_log_probs = batch.batch["rollout_log_probs"]
+                        actor_old_log_probs = batch.batch["old_log_probs"]
+                        attention_mask = batch.batch["attention_mask"]
+                        responses = batch.batch["responses"]
+                        response_length = responses.size(1)
+                        response_mask = attention_mask[:, -response_length:]
+
+                        rollout_probs = torch.exp(rollout_old_log_probs)
+                        actor_probs = torch.exp(actor_old_log_probs)
+                        rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
+                        rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
+                        rollout_probs_diff_max = torch.max(rollout_probs_diff)
+                        rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
+                        rollout_probs_diff_std = torch.std(rollout_probs_diff)
+                        metrics.update(
+                            {
+                                "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
+                                "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
+                                "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
+                            }
+                        )
+
+                if self.use_reference_policy:
+                    # compute reference log_prob
+                    with _timer("ref", timing_raw):
+                        if not self.ref_in_actor:
+                            ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
+                        else:
+                            ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch)
+                        batch = batch.union(ref_log_prob)
+
+                # compute values
+                if self.use_critic:
+                    with _timer("values", timing_raw):
+                        values = self.critic_wg.compute_values(batch)
+                        batch = batch.union(values)
+
+                with _timer("adv", timing_raw):
+                    # compute advantages, executed on the driver process
+
+                    norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)  # GRPO adv normalization factor
+
+                    batch = compute_advantage(
+                        batch,
+                        adv_estimator=self.config.algorithm.adv_estimator,
+                        gamma=self.config.algorithm.gamma,
+                        lam=self.config.algorithm.lam,
+                        num_repeat=self.config.actor_rollout_ref.rollout.n,
+                        norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
+                        multi_turn=self.config.actor_rollout_ref.rollout.multi_turn.enable,
+                        config=self.config.algorithm
+                    )
+
+                # update critic
+                if self.use_critic:
+                    with _timer("update_critic", timing_raw):
+                        critic_output = self.critic_wg.update_critic(batch)
+                    critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
+                    metrics.update(critic_output_metrics)
+
+                # implement critic warmup
+                if self.config.trainer.critic_warmup <= self.global_steps:
+                    # update actor
+                    with _timer("update_actor", timing_raw):
+                        batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
+                        actor_output = self.actor_rollout_wg.update_actor(batch)
+                    actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
+                    metrics.update(actor_output_metrics)
+
+                # Log rollout generations if enabled
+                rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
+                if rollout_data_dir:
+                    with _timer("dump_rollout_generations", timing_raw):
+                        print(batch.batch.keys())
+                        inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
+                        outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
+                        scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
+                        self._dump_generations(
+                            inputs=inputs,
+                            outputs=outputs,
+                            scores=scores,
+                            reward_extra_infos_dict=reward_extra_infos_dict,
+                            dump_path=rollout_data_dir,
+                        )
+
+                if self.config.trainer.save_freq > 0 and (is_last_step or self.global_steps % self.config.trainer.save_freq == 0):
+                    with _timer("save_checkpoint", timing_raw):
+                        self._save_checkpoint()
+                elif self.timeout.check_save():
+                    with _timer('save_checkpoint', timing_raw):
+                        self._save_checkpoint()
+
+                # validate
+                if self.val_reward_fn is not None and self.config.trainer.test_freq > 0 and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0 or self.timeout.last_saved):
+                    with _timer("testing", timing_raw):
+                        val_metrics: dict = self._validate()
+                        if is_last_step:
+                            last_val_metrics = val_metrics
+                    metrics.update(val_metrics)
+
+                # training metrics
+                metrics.update(
+                    {
+                        "training/global_step": self.global_steps,
+                    }
+                )
+                # collect metrics
+                metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
+                metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
+                
+
+                # TODO: make a canonical logger that supports various backend
+                logger.log(data=metrics, step=self.global_steps)
+
+                progress_bar.update(1)
+                self.global_steps += 1
+                if is_last_step:
+                    pprint(f"Final validation metrics: {last_val_metrics}")
+                    progress_bar.close()
+                    return
diff --git a/trainer_integration/verl/verl_custom/trainer/ppo/reward.py b/trainer_integration/verl/verl_custom/trainer/ppo/reward.py
new file mode 100644
index 000000000..323be6cb8
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/trainer/ppo/reward.py
@@ -0,0 +1,139 @@
+# Copyright 2025 Individual Contributor: Thibaut Barroyer
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import multiprocessing
+import os
+from functools import partial
+
+import ray
+
+from verl import DataProto
+from verl.utils.reward_score import default_compute_score
+
+
+def get_custom_reward_fn(config):
+    import importlib.util
+    import sys
+
+    reward_fn_config = config.get("custom_reward_function") or {}
+    file_path = reward_fn_config.get("path")
+    if not file_path:
+        return None
+
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"Reward function file '{file_path}' not found.")
+
+    spec = importlib.util.spec_from_file_location("custom_module", file_path)
+    module = importlib.util.module_from_spec(spec)
+    try:
+        sys.modules["custom_module"] = module
+        spec.loader.exec_module(module)
+    except Exception as e:
+        raise RuntimeError(f"Error loading module from '{file_path}': {e}") from e
+
+    function_name = reward_fn_config.get("name")
+    if not hasattr(module, function_name):
+        raise AttributeError(f"Reward function '{function_name}' not found in '{file_path}'.")
+
+    print(f"using customized reward function '{function_name}' from '{file_path}'")
+    raw_fn = getattr(module, function_name)
+
+    reward_kwargs = dict(reward_fn_config.get("reward_kwargs", {}))
+
+    def wrapped_fn(*args, **kwargs):
+        return raw_fn(*args, **kwargs, **reward_kwargs)
+
+    return wrapped_fn
+
+
+def load_reward_manager(config, tokenizer, num_examine, **reward_kwargs):
+    """
+    Load and initialize a reward manager based on the configuration.
+
+    Args:
+        config: PPO trainer configuration object containing reward_model fields.
+        tokenizer: Tokenizer object used for processing text.
+        num_examine: Number of samples to examine.
+        **reward_kwargs: Additional keyword arguments for the reward manager.
+
+    Returns:
+        An instance of the specified reward manager class.
+    """
+    from verl.workers.reward_manager import get_reward_manager_cls
+
+    # The list of pre-defined reward managers are defined in `verl/workers/reward_manager/`:
+    # naive: NaiveRewardManager
+    # prime: PrimeRewardManager
+    # batch: BatchRewardManager
+    # dapo: DAPORewardManager
+    # Note(haibin.lin): For custom reward managers, please make sure they are imported and
+    # registered via `verl.workers.reward_manager.register`
+    # By default reward_manager is set to naive (NaiveRewardManager)
+    reward_manager_name = config.reward_model.get("reward_manager", "naive")
+    reward_manager_cls = get_reward_manager_cls(reward_manager_name)
+
+    # Try to get a custom reward function based on the configuration
+    compute_score = get_custom_reward_fn(config)
+    final_compute_score = compute_score
+
+    if compute_score is None:
+        sandbox_config = config.reward_model.get("sandbox_fusion")
+        sandbox_url = sandbox_config.get("url") if sandbox_config else None
+        if sandbox_url:
+            sandbox_manager = multiprocessing.Manager()
+            # Create a semaphore to control concurrent access to the sandbox
+            _concurrent_semaphore = sandbox_manager.Semaphore(sandbox_config.get("max_concurrent", 64))
+            final_compute_score = partial(default_compute_score, sandbox_fusion_url=sandbox_url, concurrent_semaphore=_concurrent_semaphore)
+        else:
+            final_compute_score = default_compute_score
+
+    # Instantiate and return the reward manager with the specified parameters
+    return reward_manager_cls(
+        tokenizer=tokenizer,
+        num_examine=num_examine,
+        compute_score=final_compute_score,
+        reward_fn_key=config.data.reward_fn_key,
+        **reward_kwargs,
+    )
+
+
+def compute_reward(data: DataProto, reward_fn):
+    """
+    Compute reward for a batch of data.
+    Args:
+        data: DataProto object containing the input data.
+        reward_fn: Reward function to compute the reward.
+    Returns:
+        Tuple of reward tensor and extra info dictionary.
+    """
+    try:
+        reward_result = reward_fn(data, return_dict=True)
+        reward_tensor = reward_result["reward_tensor"]
+        reward_extra_infos_dict = reward_result["reward_extra_info"]
+    except Exception as e:
+        print(f"Error in reward_fn: {e}")
+        reward_tensor = reward_fn(data)
+        reward_extra_infos_dict = {}
+
+    return reward_tensor, reward_extra_infos_dict
+
+
+@ray.remote(num_cpus=1)
+def compute_reward_async(data: DataProto, config, tokenizer):
+    """
+    Load the reward manager and compute the reward for a batch of data.
+    This is meant to be run in a separate Ray worker.
+    """
+    reward_fn = load_reward_manager(config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}))
+    return compute_reward(data, reward_fn)
diff --git a/trainer_integration/verl/verl_custom/utils/__init__.py b/trainer_integration/verl/verl_custom/utils/__init__.py
new file mode 100644
index 000000000..c4f74a03a
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/utils/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/trainer_integration/verl/verl_custom/utils/dataset/__init__.py b/trainer_integration/verl/verl_custom/utils/dataset/__init__.py
new file mode 100644
index 000000000..53a767051
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/utils/dataset/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .rl_dataset import RLHFDataset
+
+
+__all__ = ["RLHFDataset"]
diff --git a/trainer_integration/verl/verl_custom/utils/dataset/rl_dataset.py b/trainer_integration/verl/verl_custom/utils/dataset/rl_dataset.py
new file mode 100644
index 000000000..da607e91f
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/utils/dataset/rl_dataset.py
@@ -0,0 +1,377 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023-2024 SGLang Team
+# Copyright 2025 ModelBest Inc. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import logging
+import os
+import re
+from collections import defaultdict
+from typing import List, Optional, Union
+
+import datasets
+import numpy as np
+import pandas as pd
+import torch
+from omegaconf import DictConfig, ListConfig
+from torch.utils.data import Dataset
+from transformers import PreTrainedTokenizer, ProcessorMixin
+
+import verl.utils.torch_functional as verl_F
+from verl.utils.model import compute_position_id_with_mask
+import json
+
+def normalize_prompt(x):
+    def convert_numpy_recursive(obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, dict):
+            return {k: convert_numpy_recursive(v) for k, v in obj.items()}
+        elif isinstance(obj, (list, tuple)):
+            return [convert_numpy_recursive(item) for item in obj]
+        else:
+            return obj
+    
+    x = convert_numpy_recursive(x)
+    
+    if isinstance(x, str):
+        return x
+    elif isinstance(x, (dict, list, tuple)):
+        try:
+            return json.dumps(x)
+        except Exception:
+            raise ValueError(f"x is not a string: {x}")
+    elif pd.isnull(x):
+        raise ValueError(f"x is null: {x}")
+    else:
+        raise ValueError(f"x is not a string: {x}")
+        
+logger = logging.getLogger(__name__)
+
+
+def collate_fn(data_list: list[dict]) -> dict:
+    """
+    Collate a batch of sample dicts into batched tensors and arrays.
+
+    Args:
+        data_list: List of dicts mapping feature names to torch.Tensor or other values.
+
+    Returns:
+        Dict where tensor entries are stacked into a torch.Tensor of shape
+        (batch_size, *dims) and non-tensor entries are converted to
+        np.ndarray of dtype object with shape (batch_size,).
+    """
+    tensors = defaultdict(list)
+    non_tensors = defaultdict(list)
+
+    for data in data_list:
+        for key, val in data.items():
+            if isinstance(val, torch.Tensor):
+                tensors[key].append(val)
+            else:
+                non_tensors[key].append(val)
+
+    for key, val in tensors.items():
+        tensors[key] = torch.stack(val, dim=0)
+
+    for key, val in non_tensors.items():
+        non_tensors[key] = np.array(val, dtype=object)
+
+    return {**tensors, **non_tensors}
+
+
+class RLHFDataset(Dataset):
+    """
+    Load and preprocess RLHF data from Parquet files.
+
+    - Caches files locally.
+    - Reads into a HuggingFace Dataset and tokenizes prompts.
+    - Optionally handles images/videos via a ProcessorMixin.
+    - Filters prompts over a max length.
+    - Supports resuming from checkpoints.
+
+    Args:
+        data_files (str or list): Path(s) to Parquet file(s).
+        tokenizer (PreTrainedTokenizer): For the tokenization of text to token IDs.
+        config (DictConfig): Options like cache_dir, prompt_key, max_prompt_length, truncation, etc.
+        processor (ProcessorMixin, optional): Multimodal preprocessor for images/videos.
+    """
+
+    def __init__(
+        self,
+        data_files: Union[str, List[str]],
+        tokenizer: PreTrainedTokenizer,
+        config: DictConfig,
+        processor: Optional[ProcessorMixin] = None,
+    ):
+        if not isinstance(data_files, (List, ListConfig)):
+            data_files = [data_files]
+
+        self.data_files = copy.deepcopy(data_files)
+        self.original_data_files = copy.deepcopy(data_files)  # use for resume
+        self.tokenizer = tokenizer
+        self.processor = processor
+        self.config = config
+
+        self.cache_dir = os.path.expanduser(config.get("cache_dir", "~/.cache/verl/rlhf"))
+        self.prompt_key = config.get("prompt_key", "prompt")
+        self.image_key = config.get("image_key", "images")
+        self.video_key = config.get("video_key", "videos")
+        self.max_prompt_length = config.get("max_prompt_length", 1024)
+        self.return_raw_chat = config.get("return_raw_chat", False)
+        self.return_full_prompt = config.get("return_full_prompt", False)
+        self.truncation = config.get("truncation", "error")
+        self.filter_overlong_prompts = config.get("filter_overlong_prompts", True)
+
+        self.num_workers = config.get("filter_overlong_prompts_workers", max(1, os.cpu_count() // 4))
+        self.num_workers = min(self.num_workers, os.cpu_count())
+        self.use_shm = config.get('use_shm', False)
+        self.chat_template_func = config.get("chat_template_func", None)
+        self.need_tools_kwargs = config.get("need_tools_kwargs", False)
+        self.filter_prompts = config.get("filter_prompts", True)
+        self.serialize_dataset = False
+        self._download()
+        self._read_files_and_tokenize()
+
+    def _download(self, use_origin_parquet=False):
+        from verl.utils.fs import copy_to_local
+
+        data_files = self.data_files if not use_origin_parquet else self.original_data_files
+        for i, parquet_file in enumerate(data_files):
+            self.data_files[i] = copy_to_local(src=parquet_file, cache_dir=self.cache_dir, use_shm=self.use_shm)
+
+    def _read_files_and_tokenize(self):
+        dataframes = []
+        for parquet_file in self.data_files:
+            try:
+                # read parquet files and cache
+                dataframe = pd.read_parquet(parquet_file)
+            except Exception as e:
+                # large parquet files fail due to paarrow error
+                json_file = parquet_file.replace('.parquet', '.json')
+                print(f"Failed to read file {parquet_file}. Try to read {json_file}.")
+                try:
+                    dataframe = pd.read_json(json_file, orient='records')
+                except Exception as json_e:
+                    print(f"Failed to read file {json_file}.")
+                    raise e
+
+            dataframe['prompt'] = dataframe['prompt'].apply(normalize_prompt)
+            if 'reward_model' in dataframe.columns:
+                dataframe['reward_model'] = dataframe['reward_model'].apply(normalize_prompt)
+            if 'tools' in dataframe.columns:
+                dataframe['tools'] = dataframe['tools'].apply(normalize_prompt)
+
+            dataframes.append(dataframe)
+            
+        # Convert pandas DataFrame to datasets.Dataset
+        self.dataframe = datasets.Dataset.from_pandas(pd.concat(dataframes, ignore_index=True))
+
+        print(f"dataset len: {len(self.dataframe)}")
+
+        # filter out too long prompts
+        if self.filter_overlong_prompts:
+            tokenizer = self.tokenizer
+            prompt_key = self.prompt_key
+            self.dataframe = self.dataframe.filter(
+                lambda doc: len(tokenizer.apply_chat_template(json.loads(doc[prompt_key]), add_generation_prompt=True)) <= self.max_prompt_length,
+                num_proc=self.num_workers,
+                desc=f"Filtering prompts longer than {self.max_prompt_length} tokens",
+            )
+
+    def resume_dataset_state(self):
+        self.serialize_dataset = not hasattr(self, "original_data_files")
+        # resume dataframe if not it's serialized in data.pt
+        if not self.serialize_dataset:
+            self._download(use_origin_parquet=True)  # download and resume from original parquet files
+            self._read_files_and_tokenize()
+        else:
+            print(r"old dataloader ckpt file is used, please train from scratch for better ckpt performance")
+
+    def __len__(self):
+        return len(self.dataframe)
+
+    def _build_messages(self, example: dict):
+        if isinstance(example[self.prompt_key], str):
+            chat = json.loads(example.pop(self.prompt_key))
+        else:
+            chat = example.pop(self.prompt_key)
+
+        if 'tools' in example and pd.notna(example['tools']) and example['tools'] != "":
+            tools = json.loads(example.pop('tools')) # tools from the function-calling training data (tool-access)
+        else:
+            if 'tools' in example:
+                example.pop('tools')
+            tools = None
+
+        messages: list = chat
+
+        if self.image_key in example or self.video_key in example:
+            for message in messages:
+                content = message["content"]
+                content_list = []
+                for segment in re.split("(<image>|<video>)", content):
+                    if segment == "<image>":
+                        content_list.append({"type": "image"})
+                    elif segment == "<video>":
+                        content_list.append({"type": "video"})
+                    else:
+                        content_list.append({"type": "text", "text": segment})
+
+                message["content"] = content_list
+
+        return messages, tools
+
+    def __getitem__(self, item):
+        """
+        Note that we also return the raw_input_ids so that it can be combined with other chat template
+        """
+        row_dict: dict = self.dataframe[item]
+
+        messages, tools = self._build_messages(row_dict)
+        
+        if 'reward_model' in row_dict and row_dict['reward_model'] and isinstance(row_dict['reward_model'], str):
+            row_dict['reward_model'] = json.loads(row_dict['reward_model'])
+
+        model_inputs = {}
+
+        if self.processor is not None:
+            from verl.utils.dataset.vision_utils import process_image, process_video
+
+            raw_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+            multi_modal_data = {}
+
+            images = None
+            if self.image_key in row_dict:
+                images = [process_image(image) for image in row_dict.pop(self.image_key)]
+                multi_modal_data["image"] = images
+
+            videos = None
+            if self.video_key in row_dict:
+                videos = [process_video(video) for video in row_dict.pop(self.video_key)]
+                multi_modal_data["video"] = [video.numpy() for video in videos]
+
+            model_inputs = self.processor(text=[raw_prompt], images=images, videos=videos, return_tensors="pt")
+
+            input_ids = model_inputs.pop("input_ids")
+            attention_mask = model_inputs.pop("attention_mask")
+
+            if "second_per_grid_ts" in model_inputs:
+                model_inputs.pop("second_per_grid_ts")
+
+            # There's a trap here, multi_modal_inputs has to be a dict, not BatchFeature
+            row_dict["multi_modal_data"] = multi_modal_data
+            row_dict["multi_modal_inputs"] = dict(model_inputs)
+
+            # second_per_grid_ts isn't used for training, just for mrope
+            row_dict["multi_modal_inputs"].pop("second_per_grid_ts", None)
+
+        else:
+            raw_prompt = self.tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, tokenize=False)
+            model_inputs = self.tokenizer(raw_prompt, return_tensors="pt", add_special_tokens=False)
+            input_ids = model_inputs.pop("input_ids")
+            attention_mask = model_inputs.pop("attention_mask")
+
+
+        input_ids, attention_mask = verl_F.postprocess_data(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            max_length=self.max_prompt_length,
+            pad_token_id=self.tokenizer.pad_token_id,
+            left_pad=True,
+            truncation=self.truncation,
+        )
+
+        if self.processor is not None and self.processor.image_processor.__class__.__name__ == "Qwen2VLImageProcessor":
+            from verl.models.transformers.qwen2_vl import get_rope_index
+
+            position_ids = [
+                get_rope_index(
+                    self.processor,
+                    input_ids=input_ids[0],
+                    image_grid_thw=model_inputs.get("image_grid_thw"),
+                    video_grid_thw=model_inputs.get("video_grid_thw"),
+                    second_per_grid_ts=model_inputs.get("second_per_grid_ts"),
+                    attention_mask=attention_mask[0],
+                )
+            ]  # (1, 3, seq_len)
+
+        else:
+            position_ids = compute_position_id_with_mask(attention_mask)
+
+        row_dict["input_ids"] = input_ids[0]
+        row_dict["attention_mask"] = attention_mask[0]
+        row_dict["position_ids"] = position_ids[0]
+
+        raw_prompt_ids = self.tokenizer.encode(raw_prompt, add_special_tokens=False)
+        if len(raw_prompt_ids) > self.max_prompt_length:
+            if self.truncation == "left":
+                raw_prompt_ids = raw_prompt_ids[-self.max_prompt_length :]
+            elif self.truncation == "right":
+                raw_prompt_ids = raw_prompt_ids[: self.max_prompt_length]
+            elif self.truncation == "middle":
+                left_half = self.max_prompt_length // 2
+                right_half = self.max_prompt_length - left_half
+                raw_prompt_ids = raw_prompt_ids[:left_half] + raw_prompt_ids[-right_half:]
+            elif self.truncation == "error":
+                raise RuntimeError(f"Prompt length {len(raw_prompt_ids)} is longer than {self.max_prompt_length}.")
+
+        row_dict["raw_prompt_ids"] = raw_prompt_ids
+        # encode prompts without chat template
+        if self.return_raw_chat:
+            row_dict["raw_prompt"] = messages
+
+        # get prompts with chat template
+        if self.return_full_prompt:
+            row_dict["full_prompts"] = raw_prompt  # array of strings
+
+        # add index for each prompt
+        index = row_dict.get("extra_info", {}).get("index", 0)
+        tools_kwargs = row_dict.get("extra_info", {}).get("tools_kwargs", {})
+        need_tools_kwargs = row_dict.get("extra_info", {}).get("need_tools_kwargs", self.need_tools_kwargs)
+        if need_tools_kwargs and not tools_kwargs:
+            logger.warning("tools_kwargs is empty for index {}, data source: {}", index, row_dict["data_source"])
+        row_dict["index"] = index
+        row_dict["tools_kwargs"] = tools_kwargs
+
+        # we copy instance data to 'instance' key for agent training delegating to openhands.
+        # swebench tasks already have 'instance' key.
+        if "instance" not in row_dict:
+            row_dict["instance"] = dict(self.dataframe[item])
+
+            # add instance_id to the instance
+            instance = row_dict["instance"]
+            if 'instance_id' not in instance:
+                data_source = instance.get("data_source", "unknown")
+                if 'extra_info' in instance:
+                    split = instance['extra_info'].get('split', 'unknown')
+                    index = instance['extra_info'].get('index', 'unknown')
+                    name = instance['extra_info'].get('name', 'unknown')
+                else:
+                    split = 'unknown'
+                    index = 'unknown'
+                    name = 'unknown'
+                row_dict["instance"]["instance_id"] = f'{data_source}_{name}_{split}_{index}'
+        return row_dict
+
+    def __getstate__(self):
+        if not self.serialize_dataset:
+            state = self.__dict__.copy()
+
+            if "dataframe" in state:
+                del state["dataframe"]
+            return state
+
+        return self.__dict__.copy()
\ No newline at end of file
diff --git a/trainer_integration/verl/verl_custom/workers/__init__.py b/trainer_integration/verl/verl_custom/workers/__init__.py
new file mode 100644
index 000000000..1ce90c5eb
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/workers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/trainer_integration/verl/verl_custom/workers/actor/__init__.py b/trainer_integration/verl/verl_custom/workers/actor/__init__.py
new file mode 100644
index 000000000..f3ec3c456
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/workers/actor/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .dp_actor import DataParallelPPOActor
+
+__all__ = ["DataParallelPPOActor"]
diff --git a/trainer_integration/verl/verl_custom/workers/actor/dp_actor.py b/trainer_integration/verl/verl_custom/workers/actor/dp_actor.py
new file mode 100644
index 000000000..64fa29ea4
--- /dev/null
+++ b/trainer_integration/verl/verl_custom/workers/actor/dp_actor.py
@@ -0,0 +1,484 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023-2024 SGLang Team
+# Copyright 2025 ModelBest Inc. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Single Process Actor
+"""
+
+import itertools
+import logging
+import os
+from typing import Tuple
+
+import torch
+from torch import nn
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+
+import verl.utils.torch_functional as verl_F
+from verl import DataProto
+from verl_custom.nvidia.utils.utils import boost_high_score_advantages
+from verl_custom.trainer.ppo.core_algos import agg_loss, compute_policy_loss, compute_gspo_policy_loss, kl_penalty
+from verl.utils.debug import GPUMemoryLogger
+from verl.utils.device import get_device_name, get_torch_device, is_cuda_available, is_npu_available
+from verl.utils.fsdp_utils import FSDPModule, fsdp2_clip_grad_norm_
+from verl.utils.py_functional import append_to_dict
+from verl.utils.seqlen_balancing import get_reverse_idx, rearrange_micro_batches
+from verl.utils.torch_functional import logprobs_from_logits
+from verl.utils.ulysses import gather_outpus_and_unpad, ulysses_pad, ulysses_pad_and_slice_inputs
+from verl.workers.actor import BasePPOActor
+
+if is_cuda_available:
+    from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input
+elif is_npu_available:
+    from transformers.integrations.npu_flash_attention import index_first_axis, pad_input, rearrange, unpad_input
+
+
+__all__ = ["DataParallelPPOActor"]
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+
+class DataParallelPPOActor(BasePPOActor):
+    def __init__(self, config, actor_module: nn.Module, actor_optimizer: torch.optim.Optimizer = None):
+        """When optimizer is None, it is Reference Policy"""
+        super().__init__(config)
+        self.actor_module = actor_module
+        self.actor_optimizer = actor_optimizer
+
+        self.use_remove_padding = self.config.get("use_remove_padding", False)
+        if torch.distributed.get_rank() == 0:
+            print(f"Actor use_remove_padding={self.use_remove_padding}")
+        self.use_fused_kernels = self.config.get("use_fused_kernels", False)
+        if torch.distributed.get_rank() == 0:
+            print(f"Actor use_fused_kernels={self.use_fused_kernels}")
+
+        self.ulysses_sequence_parallel_size = self.config.ulysses_sequence_parallel_size
+        self.use_ulysses_sp = self.ulysses_sequence_parallel_size > 1
+
+        self.compute_entropy_from_logits = (
+            torch.compile(verl_F.entropy_from_logits, dynamic=True)
+            if self.config.get("use_torch_compile", True)  #  use torch compile by default
+            else verl_F.entropy_from_logits
+        )
+        self.device_name = get_device_name()
+
+    def _forward_micro_batch(self, micro_batch, temperature, calculate_entropy=False) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Returns:
+            entropy: # (bs, response_len)
+            log_probs: # (bs, response_len)
+        """
+        response_length = micro_batch["responses"].size(-1)
+        multi_modal_inputs = {}
+        if "multi_modal_inputs" in micro_batch.keys():
+            for key in micro_batch["multi_modal_inputs"][0].keys():
+                multi_modal_inputs[key] = torch.cat([inputs[key] for inputs in micro_batch["multi_modal_inputs"]], dim=0)
+
+        with torch.autocast(device_type=self.device_name, dtype=torch.bfloat16):
+            input_ids = micro_batch["input_ids"]
+            batch_size, seqlen = input_ids.shape
+            attention_mask = micro_batch["attention_mask"]
+            position_ids = micro_batch["position_ids"]
+            entropy = None
+            if position_ids.dim() == 3:  # qwen2vl mrope
+                position_ids = position_ids.transpose(0, 1)  # (bsz, 3, seqlen) -> (3, bsz, seqlen)
+
+            if self.use_remove_padding:
+                input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1), attention_mask)  # input_ids_rmpad (total_nnz, ...)
+                input_ids_rmpad = input_ids_rmpad.transpose(0, 1)  # (1, total_nnz)
+
+                # unpad the position_ids to align the rotary
+                if position_ids.dim() == 3:
+                    position_ids_rmpad = index_first_axis(rearrange(position_ids, "c b s ... -> (b s) c ..."), indices).transpose(0, 1).unsqueeze(1)  # (3, bsz, seqlen) -> (3, 1, bsz * seqlen)
+                else:
+                    position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), indices).transpose(0, 1)
+
+                # for compute the log_prob
+                input_ids_rmpad_rolled = torch.roll(input_ids_rmpad, shifts=-1, dims=1)  # (1, total_nnz)
+
+                # pad and slice the inputs if sp > 1
+                if self.use_ulysses_sp:
+                    is_vlm_model = "multi_modal_inputs" in micro_batch
+                    if is_vlm_model:
+                        # vlm model's inputs will be sliced after embedding
+                        input_ids_rmpad, position_ids_rmpad, pad_size = ulysses_pad(
+                            input_ids_rmpad,
+                            position_ids_rmpad=position_ids_rmpad,
+                            sp_size=self.ulysses_sequence_parallel_size,
+                        )
+                    else:
+                        input_ids_rmpad, position_ids_rmpad, pad_size = ulysses_pad_and_slice_inputs(
+                            input_ids_rmpad,
+                            position_ids_rmpad=position_ids_rmpad,
+                            sp_size=self.ulysses_sequence_parallel_size,
+                        )
+                    input_ids_rmpad_rolled, _, _ = ulysses_pad_and_slice_inputs(
+                        input_ids_rmpad_rolled,
+                        position_ids_rmpad=None,
+                        sp_size=self.ulysses_sequence_parallel_size,
+                    )
+
+                input_ids_rmpad_rolled = input_ids_rmpad_rolled.squeeze(0)  # ((total_nnz / sp) + pad)
+
+                # only pass input_ids and position_ids to enable flash_attn_varlen
+                extra_args = {}
+                if self.use_fused_kernels:
+                    extra_args["temperature"] = temperature
+
+                output = self.actor_module(
+                    input_ids=input_ids_rmpad,
+                    attention_mask=None,
+                    position_ids=position_ids_rmpad,
+                    **multi_modal_inputs,
+                    use_cache=False,
+                    **extra_args,
+                )  # prevent model thinks we are generating
+
+                if self.use_fused_kernels:
+                    log_probs = output.log_probs.squeeze(0)  # (total_nnz,)
+                    entropy_rmpad = output.entropy.squeeze(0)  # (total_nnz,)
+
+                else:
+                    logits_rmpad = output.logits.squeeze(0)  # (total_nnz, vocab_size)
+                    logits_rmpad.div_(temperature)
+
+                    # if use_sp: ((total_nnz / sp) + pad) ; if not use_sp: (batch, seqlen)
+                    inplace_backward = True
+                    if calculate_entropy:
+                        inplace_backward = False
+                    log_probs = logprobs_from_logits(
+                        logits=logits_rmpad,
+                        labels=input_ids_rmpad_rolled,
+                        inplace_backward=inplace_backward,
+                    )
+
+                    # compute entropy
+                    if calculate_entropy:
+                        entropy_rmpad = self.compute_entropy_from_logits(logits_rmpad)  # ((total_nnz / sp) + pad)
+
+                # gather log_prob if sp > 1
+                if self.use_ulysses_sp:
+                    # gather and unpad for the ulysses sp
+                    log_probs = gather_outpus_and_unpad(
+                        log_probs,
+                        gather_dim=0,
+                        unpad_dim=0,
+                        padding_size=pad_size,
+                    )
+                    if calculate_entropy:
+                        entropy_rmpad = gather_outpus_and_unpad(
+                            entropy_rmpad,
+                            gather_dim=0,
+                            unpad_dim=0,
+                            padding_size=pad_size,
+                        )
+                # pad back to (bsz, seqlen)
+                if calculate_entropy:
+                    full_entropy = pad_input(
+                        hidden_states=entropy_rmpad.unsqueeze(-1),
+                        indices=indices,
+                        batch=batch_size,
+                        seqlen=seqlen,
+                    )
+                full_log_probs = pad_input(
+                    hidden_states=log_probs.unsqueeze(-1),
+                    indices=indices,
+                    batch=batch_size,
+                    seqlen=seqlen,
+                )
+
+                # only return response part:
+                if calculate_entropy:
+                    entropy = full_entropy.squeeze(-1)[:, -response_length - 1 : -1]  # (bsz, response_length)
+                log_probs = full_log_probs.squeeze(-1)[:, -response_length - 1 : -1]  # (bsz, response_length)
+
+            else:  # not using rmpad and no ulysses sp
+                extra_args = {}
+                if self.use_fused_kernels:
+                    extra_args["temperature"] = temperature
+                output = self.actor_module(
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    **multi_modal_inputs,
+                    use_cache=False,
+                    **extra_args,
+                )  # prevent model thinks we are generating
+
+                if self.use_fused_kernels:
+                    log_probs = output.log_probs[:, -response_length - 1 : -1]
+                    entropy = output.entropy[:, -response_length - 1 : -1]  # (bsz, response_length)
+
+                else:
+                    logits = output.logits
+
+                    logits.div_(temperature)
+                    logits = logits[:, -response_length - 1 : -1, :]  # (bsz, response_length, vocab_size)
+                    log_probs = logprobs_from_logits(logits, micro_batch["responses"])
+                    if calculate_entropy:
+                        entropy = verl_F.entropy_from_logits(logits)  # (bsz, response_length)
+
+            return entropy, log_probs
+
+    def _optimizer_step(self):
+        assert self.config.grad_clip is not None
+
+        if isinstance(self.actor_module, FSDP):
+            grad_norm = self.actor_module.clip_grad_norm_(max_norm=self.config.grad_clip)
+        elif isinstance(self.actor_module, FSDPModule):
+            grad_norm = fsdp2_clip_grad_norm_(self.actor_module.parameters(), max_norm=self.config.grad_clip)
+        else:
+            grad_norm = torch.nn.utils.clip_grad_norm_(self.actor_module.parameters(), max_norm=self.config.grad_clip)
+
+        # if grad_norm is not finite, skip the update
+        if not torch.isfinite(grad_norm):
+            print(f"WARN: rank {torch.distributed.get_rank()} grad_norm is not finite: {grad_norm}")
+            self.actor_optimizer.zero_grad()
+        else:
+            self.actor_optimizer.step()
+        return grad_norm
+
+    @GPUMemoryLogger(role="dp actor", logger=logger)
+    def compute_log_prob(self, data: DataProto, calculate_entropy=False) -> torch.Tensor:
+        """Compute the log probability of the responses given input_ids, attention_mask and position_ids
+
+        Args:
+            data (DataProto): a DataProto containing keys
+
+                ``input_ids``: tensor of shape [batch_size, sequence_length]. torch.int64. Note that input_ids is the
+                concatenation of prompt and response. Note that ``sequence_length = prompt_length + response_length``.
+
+                ``attention_mask``: tensor of shape [batch_size, sequence_length]. torch.int64.
+
+                ``position_ids``: tensor of shape [batch_size, sequence_length]. torch.int64.
+
+                ``responses``:  tensor of shape [batch_size, response_length]. torch.int64.
+
+        Returns:
+            torch.Tensor: the log_prob tensor
+        """
+        # set to eval
+        self.actor_module.eval()
+
+        micro_batch_size = data.meta_info["micro_batch_size"]
+        temperature = data.meta_info["temperature"]  # temperature must be in the data.meta_info to avoid silent error
+        use_dynamic_bsz = data.meta_info["use_dynamic_bsz"]
+
+        select_keys = ["responses", "input_ids", "attention_mask", "position_ids"]
+        batch = data.select(batch_keys=select_keys).batch
+        has_multi_modal_inputs = "multi_modal_inputs" in data.non_tensor_batch.keys()
+
+        if has_multi_modal_inputs:
+            num_micro_batches = data.batch.batch_size[0] // micro_batch_size
+            non_tensor_select_keys = ["multi_modal_inputs"]
+            micro_batches = data.select(select_keys, non_tensor_select_keys).chunk(num_micro_batches)
+        elif use_dynamic_bsz:
+            # split using dynamic bsz
+            max_token_len = data.meta_info["max_token_len"] * self.ulysses_sequence_parallel_size
+            micro_batches, indices = rearrange_micro_batches(batch=batch, max_token_len=max_token_len)
+        else:
+            micro_batches = batch.split(micro_batch_size)
+
+        log_probs_lst = []
+        entropy_lst = []
+        for micro_batch in micro_batches:
+            if isinstance(micro_batch, DataProto):
+                micro_batch = {**micro_batch.batch, **micro_batch.non_tensor_batch}
+            with torch.no_grad():
+                entropy, log_probs = self._forward_micro_batch(micro_batch, temperature=temperature, calculate_entropy=calculate_entropy)
+            log_probs_lst.append(log_probs)
+            if calculate_entropy:
+                entropy_lst.append(entropy)
+
+        log_probs = torch.concat(log_probs_lst, dim=0)
+        entropys = None
+        if calculate_entropy:
+            entropys = torch.concat(entropy_lst, dim=0)
+        if use_dynamic_bsz:
+            indices = list(itertools.chain.from_iterable(indices))
+            assert len(indices) == log_probs.size(0), f"{len(indices)} vs. {log_probs.size()}"
+            revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long)
+            log_probs = log_probs[revert_indices]
+
+        return log_probs, entropys
+
+    @GPUMemoryLogger(role="dp actor", logger=logger)
+    def update_policy(self, data: DataProto):
+        # make sure we are in training mode
+        self.actor_module.train()
+
+        # Apply advantage boosting if needed
+        correct_sample_adv_boost_value = self.config.get('correct_sample_advantage_boost_value', 0)
+        if correct_sample_adv_boost_value > 0:
+            correct_sample_adv_boost_threshold = self.config.get('correct_sample_advantage_boost_threshold', 1.0)
+            scores = data.batch['token_level_scores'].sum(dim=-1)
+            data.batch['advantages'] = boost_high_score_advantages(
+                data.batch['advantages'], scores, correct_sample_adv_boost_value, correct_sample_adv_boost_threshold)
+
+        temperature = data.meta_info["temperature"]  # temperature must be in the data.meta_info to avoid silent error
+        multi_turn = data.meta_info.get("multi_turn", False)
+
+        select_keys = ["responses", "input_ids", "attention_mask", "position_ids", "old_log_probs", "advantages"]
+        if multi_turn:
+            select_keys.append("loss_mask")
+        if self.config.use_kl_loss:
+            select_keys.append("ref_log_prob")
+        if "is_padded" in data.batch.keys():
+            select_keys.append("is_padded")
+        if "error_mask" in data.batch.keys():
+            select_keys.append("error_mask")
+        
+        if self.config.tis_imp_ratio_cap > 0 and "rollout_log_probs" in data.batch.keys():
+            select_keys.append("rollout_log_probs")
+        batch = data.select(batch_keys=select_keys).batch
+        has_multi_modal_inputs = "multi_modal_inputs" in data.non_tensor_batch.keys()
+
+        # Split to make minibatch iterator for updating the actor
+        # See PPO paper for details. https://arxiv.org/abs/1707.06347
+        if has_multi_modal_inputs:
+            num_mini_batches = data.batch.batch_size[0] // self.config.ppo_mini_batch_size
+            non_tensor_select_keys = ["multi_modal_inputs"]
+            dataloader = data.select(select_keys, non_tensor_select_keys).chunk(num_mini_batches)
+        else:
+            dataloader = batch.split(self.config.ppo_mini_batch_size)
+
+        metrics = {}
+        for epoch in range(self.config.ppo_epochs):
+            for batch_idx, data in enumerate(dataloader):
+                # split batch into micro_batches
+                mini_batch = data
+                if has_multi_modal_inputs:
+                    self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size_per_gpu
+                    num_micro_batches = mini_batch.batch.batch_size[0] // self.config.ppo_micro_batch_size_per_gpu
+                    micro_batches = data.select(select_keys, non_tensor_select_keys).chunk(num_micro_batches)
+                elif self.config.use_dynamic_bsz:
+                    max_token_len = self.config.ppo_max_token_len_per_gpu * self.ulysses_sequence_parallel_size
+                    micro_batches, _ = rearrange_micro_batches(batch=mini_batch, max_token_len=max_token_len)
+                else:
+                    self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size_per_gpu
+                    # split batch into micro_batches
+                    micro_batches = mini_batch.split(self.config.ppo_micro_batch_size_per_gpu)
+
+                self.actor_optimizer.zero_grad()
+
+                for data in micro_batches:
+                    # Support all hardwares
+                    if isinstance(data, DataProto):
+                        data = {**data.batch.to(get_torch_device().current_device()), **data.non_tensor_batch}
+                    else:
+                        data = data.to(get_torch_device().current_device())  # actor device is cpu when using offload
+                    responses = data["responses"]
+                    response_length = responses.size(1)
+                    attention_mask = data["attention_mask"]
+                    if multi_turn:
+                        response_mask = data["loss_mask"][:, -response_length:]
+                    else:
+                        response_mask = attention_mask[:, -response_length:]
+                    
+                    if "is_padded" in data:
+                        from verl_custom.nvidia.utils.utils import apply_padding_mask
+                        response_mask = apply_padding_mask(response_mask, data["is_padded"], log_stats=True)
+
+                    if "error_mask" in data and self.config.get("use_error_mask", False):
+                        response_mask = apply_padding_mask(response_mask, data["error_mask"], log_stats=True)
+
+                    old_log_prob = data["old_log_probs"]
+                    advantages = data["advantages"]
+
+                    clip_ratio = self.config.clip_ratio
+                    clip_ratio_low = self.config.clip_ratio_low if self.config.clip_ratio_low is not None else clip_ratio
+                    clip_ratio_high = self.config.clip_ratio_high if self.config.clip_ratio_high is not None else clip_ratio
+                    clip_ratio_c = self.config.get("clip_ratio_c", 3.0)
+                    entropy_coeff = self.config.entropy_coeff
+                    loss_agg_mode = self.config.loss_agg_mode
+
+                    # all return: (bsz, response_length)
+                    calculate_entropy = False
+                    if entropy_coeff != 0:
+                        calculate_entropy = True
+                    entropy, log_prob = self._forward_micro_batch(micro_batch=data, temperature=temperature, calculate_entropy=calculate_entropy)
+
+                    # Get policy loss type from config, default to "ppo"
+                    policy_loss_type = getattr(self.config, 'policy_loss_type', 'ppo')
+                    
+                    if policy_loss_type == "ppo":
+                        pg_loss, pg_clipfrac, ppo_kl, pg_clipfrac_lower = compute_policy_loss(
+                            old_log_prob=old_log_prob,
+                            log_prob=log_prob,
+                            advantages=advantages,
+                            response_mask=response_mask,
+                            cliprange=clip_ratio,
+                            cliprange_low=clip_ratio_low,
+                            cliprange_high=clip_ratio_high,
+                            clip_ratio_c=clip_ratio_c,
+                            loss_agg_mode=loss_agg_mode,
+                            config=self.config,
+                            rollout_log_probs=data.get("rollout_log_probs", None) if self.config.tis_imp_ratio_cap > 0 else None,
+                        )
+                    elif policy_loss_type == "gspo":
+                        pg_loss, pg_clipfrac, ppo_kl, pg_clipfrac_lower = compute_gspo_policy_loss(
+                            old_log_prob=old_log_prob,
+                            log_prob=log_prob,
+                            advantages=advantages,
+                            response_mask=response_mask,
+                            cliprange=clip_ratio,
+                            cliprange_low=clip_ratio_low,
+                            cliprange_high=clip_ratio_high,
+                            clip_ratio_c=clip_ratio_c,
+                            loss_agg_mode=loss_agg_mode,
+                            rollout_log_probs=data.get("rollout_log_probs", None) if self.config.tis_imp_ratio_cap > 0 else None,
+                        )
+                    else:
+                        raise ValueError(f"Unsupported policy_loss_type: {policy_loss_type}. Supported types: 'ppo', 'gspo'")
+
+                    if entropy_coeff != 0:
+                        entropy_loss = agg_loss(loss_mat=entropy, loss_mask=response_mask, loss_agg_mode=loss_agg_mode)
+
+                        # compute policy loss
+                        policy_loss = pg_loss - entropy_loss * entropy_coeff
+                    else:
+                        policy_loss = pg_loss
+
+                    if self.config.use_kl_loss:
+                        ref_log_prob = data["ref_log_prob"]
+                        # compute kl loss
+                        kld = kl_penalty(logprob=log_prob, ref_logprob=ref_log_prob, kl_penalty=self.config.kl_loss_type)
+                        kl_loss = agg_loss(loss_mat=kld, loss_mask=response_mask, loss_agg_mode=loss_agg_mode)
+
+                        policy_loss = policy_loss + kl_loss * self.config.kl_loss_coef
+                        metrics["actor/kl_loss"] = kl_loss.detach().item()
+                        metrics["actor/kl_coef"] = self.config.kl_loss_coef
+
+                    if self.config.use_dynamic_bsz:
+                        # relative to the dynamic bsz
+                        loss = policy_loss * (len(data) / self.config.ppo_mini_batch_size)
+                    else:
+                        loss = policy_loss / self.gradient_accumulation
+                    loss.backward()
+
+                    data = {
+                        "actor/pg_loss": pg_loss.detach().item(),
+                        "actor/pg_clipfrac": pg_clipfrac.detach().item(),
+                        "actor/ppo_kl": ppo_kl.detach().item(),
+                        "actor/pg_clipfrac_lower": pg_clipfrac_lower.detach().item(),
+                    }
+                    append_to_dict(metrics, data)
+
+                grad_norm = self._optimizer_step()
+                data = {"actor/grad_norm": grad_norm.detach().item()}
+                append_to_dict(metrics, data)
+        self.actor_optimizer.zero_grad()
+        return metrics