diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml new file mode 100644 index 00000000000..1be42960da9 --- /dev/null +++ b/.github/workflows/skill-eval.yml @@ -0,0 +1,206 @@ +name: Skill Eval + +on: + pull_request: + paths: + - 'skills/**' + - 'evals/**' + workflow_dispatch: + +permissions: + contents: read + pull-requests: write + +jobs: + # Job 1: Validate graders against reference solutions + validate_graders: + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '22' + + - name: Validate graders against reference solutions + working-directory: evals + run: npm run validate + + - name: Upload validation results + if: always() + uses: actions/upload-artifact@v4 + with: + name: skill-eval-validation-results + path: evals/results/ + retention-days: 30 + + # Job 2: Run evals against the Copilot agent + agent_eval_copilot: + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '22' + + - name: Install Copilot CLI + run: npm install -g @github/copilot + + - name: Run eval against Copilot + working-directory: evals + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: npm run agent:copilot + + - name: Upload Copilot eval results + if: always() + uses: actions/upload-artifact@v4 + with: + name: skill-eval-agent-copilot-results + path: evals/results/ + retention-days: 30 + + # Job 3: Run evals against the Gemini agent + agent_eval_gemini: + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '22' + + - name: Install Gemini CLI + run: npm install -g @google/gemini-cli + + - name: Run eval against Gemini + 
working-directory: evals + env: + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + run: npm run agent:gemini + + - name: Upload Gemini eval results + if: always() + uses: actions/upload-artifact@v4 + with: + name: skill-eval-agent-gemini-results + path: evals/results/ + retention-days: 30 + + # Job 4: Post combined summary comment on PRs + post_summary: + if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false + needs: [validate_graders, agent_eval_copilot, agent_eval_gemini] + runs-on: ubuntu-latest + + steps: + - name: Download validation results + uses: actions/download-artifact@v4 + with: + name: skill-eval-validation-results + path: evals/results/validation + continue-on-error: true + + - name: Download Copilot results + uses: actions/download-artifact@v4 + with: + name: skill-eval-agent-copilot-results + path: evals/results/copilot + continue-on-error: true + + - name: Download Gemini results + uses: actions/download-artifact@v4 + with: + name: skill-eval-agent-gemini-results + path: evals/results/gemini + continue-on-error: true + + - name: Post summary comment + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const path = require('path'); + + function readResults(dir) { + const results = []; + try { + if (!fs.existsSync(dir)) return results; + const files = fs.readdirSync(dir).filter(f => f.endsWith('.json') && f !== 'baseline.json'); + for (const file of files) { + try { + results.push(JSON.parse(fs.readFileSync(path.join(dir, file), 'utf8'))); + } catch (e) { + results.push({ task: file.replace('.json', ''), error: true }); + } + } + } catch (e) { /* dir doesn't exist */ } + return results; + } + + let summary = '## 📊 Skill Eval Results\n\n'; + + // --- Validation results --- + const validation = readResults('evals/results/validation'); + if (validation.length > 0) { + summary += '### Grader Validation (reference solutions)\n\n'; + summary += '| Task | Pass Rate | Status 
|\n'; + summary += '|---|---|---|\n'; + for (const r of validation) { + if (r.error) { summary += `| ${r.task} | Error | ❌ |\n`; continue; } + const passRate = r.passRate != null ? `${(r.passRate * 100).toFixed(0)}%` : 'N/A'; + const status = r.passRate >= 1.0 ? '✅' : '❌'; + summary += `| ${r.task} | ${passRate} | ${status} |\n`; + } + summary += '\n'; + } + + // --- Agent results --- + const copilot = readResults('evals/results/copilot'); + const gemini = readResults('evals/results/gemini'); + + if (copilot.length > 0 || gemini.length > 0) { + summary += '### Agent Evaluation\n\n'; + summary += '| Task | Agent | Pass Rate | pass@k | Status |\n'; + summary += '|---|---|---|---|---|\n'; + + for (const r of [...copilot, ...gemini]) { + if (r.error) { summary += `| ${r.task} | — | Error | Error | ❌ |\n`; continue; } + const taskName = r.task || 'unknown'; + const agent = r.agent || 'unknown'; + const passRate = r.passRate != null ? `${(r.passRate * 100).toFixed(0)}%` : 'N/A'; + const passAtK = r.passAtK != null ? `${(r.passAtK * 100).toFixed(0)}%` : 'N/A'; + const status = r.passAtK >= 0.8 ? '✅' : r.passAtK >= 0.6 ? '⚠️' : '❌'; + summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`; + } + summary += '\n'; + } + + if (validation.length === 0 && copilot.length === 0 && gemini.length === 0) { + summary += '> ⚠️ No eval results found. 
The eval runs may have failed.\n'; + } + + summary += '### Thresholds\n'; + summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n'; + summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n'; + summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n'; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: summary, + }); diff --git a/.gitignore b/.gitignore index a4542ab1403..b0820022330 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,8 @@ extras/docs/themes/sassdoc/sassdoc/* # Localization sources i18nRepo + +# Eval artifacts (keep baseline results) +evals/node_modules +evals/results/*.json +!evals/results/baseline.json diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 00000000000..196948924df --- /dev/null +++ b/evals/README.md @@ -0,0 +1,228 @@ +# Ignite UI for Angular — Skill Evals + +Automated evaluation suite for the Ignite UI for Angular agent skills. +Inspired by the [skill-eval](https://github.com/mgechev/skill-eval) reference +architecture and extended with patterns from +[Anthropic's agent eval research](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents). + +The infrastructure is **self-contained** — there are no external eval-framework +dependencies. A lightweight shell runner (`run-eval.sh`) executes each task's +reference solution and deterministic grader, and can also dispatch tasks to +AI coding agents (GitHub Copilot CLI or Google Gemini CLI) for end-to-end +evaluation. 
+ +## Overview + +The suite tests three skills: + +| Skill | Task ID | What it tests | +|---|---|---| +| `igniteui-angular-grids` | `grid-basic-setup` | Flat grid with sorting and pagination on flat employee data | +| `igniteui-angular-components` | `component-combo-reactive-form` | Multi-select combo bound to a reactive form control | +| `igniteui-angular-theming` | `theming-palette-generation` | Custom branded palette with `palette()` and `theme()` | + +Each task includes: + +- **`prompt.md`** — the agent prompt sent to the CLI (concise, actionable) +- **`instruction.md`** — human-readable task description (detailed requirements) +- **`tests/test.sh`** — deterministic grader (file checks, import validation, ordering) +- **`prompts/quality.md`** — LLM rubric grader (intent routing, API usage) +- **`solution/solve.sh`** — reference solution for baseline validation +- **`environment/Dockerfile`** — isolated environment for agent execution +- **`skills/`** — symlinked skill files under test + +## Prerequisites + +- Bash 4+ +- `bc` (installed by default on most Linux / macOS systems) +- Node.js 20+ (for config parsing and agent CLI installation) + +**For agent-based evaluation (optional):** + +| Agent | Install | Auth | +|---|---|---| +| GitHub Copilot | `npm install -g @github/copilot` | Active Copilot subscription; `GITHUB_TOKEN` env var | +| Google Gemini | `npm install -g @google/gemini-cli` | `GEMINI_API_KEY` env var | + +## Running Evals Locally + +### Validate graders against reference solutions + +This applies each task's `solution/solve.sh`, then runs `tests/test.sh` to +confirm the grader scores 100%. Use this to catch grader regressions. 
+ +```bash +cd evals + +# Validate all tasks +npm run validate + +# Validate a single task +npm run validate:grid +npm run validate:combo +npm run validate:theming +``` + +### Run evals against an AI agent + +Send the `prompt.md` to a coding agent CLI, let the agent generate code +in an isolated workspace, then run the deterministic grader on the output. + +```bash +cd evals + +# Run all tasks with GitHub Copilot CLI +npm run agent:copilot + +# Run all tasks with Gemini CLI +npm run agent:gemini + +# Run a single task with a specific agent +npm run agent:copilot:grid +npm run agent:gemini:theming +``` + +### All npm scripts + +```bash +cd evals + +# Validation (reference solutions) +npm run validate # all tasks +npm run validate:grid # grid-basic-setup only +npm run validate:combo # component-combo-reactive-form only +npm run validate:theming # theming-palette-generation only + +# Agent-based evaluation +npm run agent:copilot # all tasks with Copilot +npm run agent:copilot:grid # grid task with Copilot +npm run agent:gemini # all tasks with Gemini +npm run agent:gemini:theming # theming task with Gemini +``` + +## Agent Configuration + +Agent settings are stored in `eval-config.json`: + +```json +{ + "defaultAgent": "copilot", + "agents": { + "copilot": { + "command": "copilot", + "installCommand": "npm install -g @github/copilot", + "promptArgs": ["-p"], + "autoApproveArgs": ["--yes"], + "envAuth": "GITHUB_TOKEN" + }, + "gemini": { + "command": "gemini", + "installCommand": "npm install -g @google/gemini-cli", + "promptArgs": ["-p"], + "autoApproveArgs": ["--sandbox"], + "envAuth": "GEMINI_API_KEY" + } + }, + "trialCount": 1, + "timeoutSec": 600 +} +``` + +You can customize the agent command, flags, and timeouts by editing this file. +To switch the default agent, change `defaultAgent`. + +## Adding a New Task + +1. 
Create a directory under `evals/tasks/<task-id>/` with the standard structure: + + ``` + tasks/<task-id>/ + ├── task.toml # Config: grader metadata, weights, timeouts + ├── prompt.md # Agent prompt (sent to CLI agents) + ├── instruction.md # Human-readable task description + ├── environment/Dockerfile # Container setup (for future Docker-based runs) + ├── tests/test.sh # Deterministic grader + ├── prompts/quality.md # LLM rubric grader + ├── solution/solve.sh # Reference solution + └── skills/ # Skill files under test + └── <skill-name>/SKILL.md + ``` + +2. Write a clear, unambiguous `instruction.md` with full task requirements. + +3. Write a concise `prompt.md` that is sent directly to the agent CLI. This + should be a focused, actionable prompt derived from the instruction. + +4. Write `tests/test.sh` to check **outcomes** (files exist, correct selectors + and entry-point imports are present, correct API call ordering) rather than + specific steps. The grader must write a reward (0.0–1.0) to + `logs/verifier/reward.txt`. + +5. Write `prompts/quality.md` with rubric dimensions that sum to 1.0. + +6. Write `solution/solve.sh` — a shell script that proves the task is solvable + and validates that the graders work correctly. + +7. Validate graders before submitting (add a matching `validate:<task>` script to + `package.json`, or run `bash run-eval.sh <task-id> --validate` directly): + + ```bash + npm run validate:<task> + ``` + +8. Test against at least one agent (likewise via an `agent:copilot:<task>` script, or + `bash run-eval.sh <task-id> --agent copilot`): + + ```bash + npm run agent:copilot:<task> + ``` + +## Pass / Fail Thresholds + +Following [Anthropic's recommendations](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents): + +| Metric | Threshold | Effect | +|---|---|---| +| `pass@k ≥ 80%` | **Merge gate** | At least 1 success in k trials required | +| `pass@k ≥ 60%` | **Tracked** | Flags flaky skills for investigation | +| `pass@k < 60%` | **Blocks merge** | On PRs touching the relevant skill | + +## CI Integration + +The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs +both on PRs (that modify `skills/**` or `evals/**`) and via manual +`workflow_dispatch`. 
Every run executes three parallel jobs: + +1. **Grader validation** — applies reference solutions, verifies graders score 100% +2. **Copilot agent eval** — installs `@github/copilot`, runs all tasks against Copilot CLI +3. **Gemini agent eval** — installs `@google/gemini-cli`, runs all tasks against Gemini CLI + +A fourth summary job collects results from all three and posts a combined +PR comment showing pass rates per task per agent. + +**Secrets required:** +- `GITHUB_TOKEN` — automatically available (for Copilot) +- `GEMINI_API_KEY` — must be added as a repository secret (for Gemini) + +## Grading Strategy + +**Deterministic grader (60% weight)** — checks: +- Expected component files exist +- Correct Ignite UI selector is present in the generated template +- Required entry-point imports exist (not root barrel) +- No use of forbidden alternatives +- Correct API call ordering (e.g. `core()` before `theme()`) + +**LLM rubric grader (40% weight)** — evaluates: +- Correct intent routing +- Idiomatic API usage +- Absence of hallucinated APIs +- Following the skill's guidance + +## Results + +Baseline results are stored in `evals/results/baseline.json` and used for +regression comparison on PRs. The CI workflow uploads per-run results as +GitHub Actions artifacts. + +Agent-based results are suffixed with the agent name (e.g., +`grid-basic-setup-copilot.json`) to distinguish them from reference +validation results. 
diff --git a/evals/eval-config.json b/evals/eval-config.json new file mode 100644 index 00000000000..3c073c7832b --- /dev/null +++ b/evals/eval-config.json @@ -0,0 +1,23 @@ +{ + "defaultAgent": "copilot", + "agents": { + "copilot": { + "command": "copilot", + "installCommand": "npm install -g @github/copilot", + "promptArgs": ["-p"], + "autoApproveArgs": ["--yes"], + "envAuth": "GITHUB_TOKEN", + "description": "GitHub Copilot CLI (requires active Copilot subscription)" + }, + "gemini": { + "command": "gemini", + "installCommand": "npm install -g @google/gemini-cli", + "promptArgs": ["-p"], + "autoApproveArgs": ["--sandbox"], + "envAuth": "GEMINI_API_KEY", + "description": "Google Gemini CLI (requires GEMINI_API_KEY)" + } + }, + "trialCount": 1, + "timeoutSec": 600 +} diff --git a/evals/package.json b/evals/package.json new file mode 100644 index 00000000000..b6d79561471 --- /dev/null +++ b/evals/package.json @@ -0,0 +1,28 @@ +{ + "name": "igniteui-angular-skill-evals", + "version": "1.0.0", + "description": "Evaluation suite for Ignite UI for Angular agent skills", + "private": true, + "scripts": { + "eval": "bash run-eval.sh", + "eval:grid": "bash run-eval.sh grid-basic-setup", + "eval:combo": "bash run-eval.sh component-combo-reactive-form", + "eval:theming": "bash run-eval.sh theming-palette-generation", + "eval:all": "bash run-eval.sh --all", + "validate": "bash run-eval.sh --all --validate", + "validate:grid": "bash run-eval.sh grid-basic-setup --validate", + "validate:combo": "bash run-eval.sh component-combo-reactive-form --validate", + "validate:theming": "bash run-eval.sh theming-palette-generation --validate", + "agent:copilot": "bash run-eval.sh --all --agent copilot", + "agent:copilot:grid": "bash run-eval.sh grid-basic-setup --agent copilot", + "agent:copilot:combo": "bash run-eval.sh component-combo-reactive-form --agent copilot", + "agent:copilot:theming": "bash run-eval.sh theming-palette-generation --agent copilot", + "agent:gemini": "bash 
run-eval.sh --all --agent gemini", + "agent:gemini:grid": "bash run-eval.sh grid-basic-setup --agent gemini", + "agent:gemini:combo": "bash run-eval.sh component-combo-reactive-form --agent gemini", + "agent:gemini:theming": "bash run-eval.sh theming-palette-generation --agent gemini" + }, + "engines": { + "node": ">=20.0.0" + } +} diff --git a/evals/results/baseline.json b/evals/results/baseline.json new file mode 100644 index 00000000000..0bdcc9d6469 --- /dev/null +++ b/evals/results/baseline.json @@ -0,0 +1,36 @@ +{ + "generated_at": "2026-03-08T07:00:00.000Z", + "framework_version": "1.0.0", + "description": "Initial baseline results for skill evals. Actual scores will be populated after the first full eval run with an API key.", + "thresholds": { + "pass_at_5_merge_gate": 0.8, + "pass_at_5_block": 0.6, + "pass_pow_5_tracked": 0.6 + }, + "tasks": { + "grid-basic-setup": { + "skill": "igniteui-angular-grids", + "trials": 5, + "pass_rate": null, + "pass_at_5": null, + "pass_pow_5": null, + "status": "pending_first_run" + }, + "component-combo-reactive-form": { + "skill": "igniteui-angular-components", + "trials": 5, + "pass_rate": null, + "pass_at_5": null, + "pass_pow_5": null, + "status": "pending_first_run" + }, + "theming-palette-generation": { + "skill": "igniteui-angular-theming", + "trials": 5, + "pass_rate": null, + "pass_at_5": null, + "pass_pow_5": null, + "status": "pending_first_run" + } + } +} diff --git a/evals/run-eval.sh b/evals/run-eval.sh new file mode 100755 index 00000000000..9d7f026dca3 --- /dev/null +++ b/evals/run-eval.sh @@ -0,0 +1,439 @@ +#!/bin/bash +# run-eval.sh — Self-contained eval runner for Ignite UI Angular skills. +# Inspired by https://github.com/mgechev/skill-eval (a reference architecture, +# not an installable package). 
+# +# Usage: +# bash run-eval.sh # validate one task (reference solution) +# bash run-eval.sh --all # validate all tasks +# bash run-eval.sh --validate # run reference solution then grade +# bash run-eval.sh --agent copilot # run task using copilot CLI agent +# bash run-eval.sh --agent gemini # run task using gemini CLI agent +# bash run-eval.sh --all --agent copilot # run all tasks with copilot agent +# bash run-eval.sh --all --agent gemini --trials 3 # 3 trials per task with gemini + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TASKS_DIR="$SCRIPT_DIR/tasks" +RESULTS_DIR="$SCRIPT_DIR/results" +CONFIG_FILE="$SCRIPT_DIR/eval-config.json" + +# --- helpers --------------------------------------------------------------- # + +usage() { + cat < [--validate] [--agent ] [--trials ] + +Arguments: + Name of the task directory under tasks/ + --all Run all tasks + +Options: + --validate Apply the reference solution before grading (sanity-check mode) + --agent NAME Run task using an AI agent CLI (copilot | gemini) + --trials N Number of trials per task when using --agent (default: 1) + +Examples: + $(basename "$0") grid-basic-setup --validate + $(basename "$0") --all + $(basename "$0") grid-basic-setup --agent copilot + $(basename "$0") --all --agent gemini --trials 3 +EOF + exit 1 +} + +# Read a JSON string field from eval-config.json +# Usage: read_config '.agents.copilot.command' +read_config() { + local QUERY="$1" + if [ ! -f "$CONFIG_FILE" ]; then + echo "" + return + fi + # Use node to parse JSON (available in CI and most dev environments) + node -e " + const fs = require('fs'); + const cfg = JSON.parse(fs.readFileSync('$CONFIG_FILE', 'utf8')); + const keys = '${QUERY}'.replace(/^\\./, '').split('.'); + let val = cfg; + for (const k of keys) { val = val?.[k]; } + if (Array.isArray(val)) { console.log(val.join(' ')); } + else { console.log(val ?? 
''); } + " 2>/dev/null || echo "" +} + +# Resolve the agent CLI command and flags from config. +# Sets the globals AGENT_CMD, AGENT_PROMPT_ARGS, AGENT_APPROVE_ARGS and AGENT_ENV_AUTH +# from the agents.<name> entry via read_config. +# Exits the script with status 1 when the agent has no configured command or its CLI +# is not on PATH; an unset auth variable only produces a warning. +resolve_agent() { + local AGENT_NAME="$1" + AGENT_CMD=$(read_config "agents.${AGENT_NAME}.command") + AGENT_PROMPT_ARGS=$(read_config "agents.${AGENT_NAME}.promptArgs") + AGENT_APPROVE_ARGS=$(read_config "agents.${AGENT_NAME}.autoApproveArgs") + AGENT_ENV_AUTH=$(read_config "agents.${AGENT_NAME}.envAuth") + + # read_config prints an empty string for a missing key, so an empty command means the agent is not configured + if [ -z "$AGENT_CMD" ]; then + echo "ERROR: Unknown agent '$AGENT_NAME'. Check eval-config.json" >&2 + exit 1 + fi + + # Verify the CLI is installed + if ! command -v "$AGENT_CMD" &>/dev/null; then + local INSTALL_CMD + INSTALL_CMD=$(read_config "agents.${AGENT_NAME}.installCommand") + echo "ERROR: '$AGENT_CMD' is not installed." >&2 + echo " Install with: $INSTALL_CMD" >&2 + exit 1 + fi + + # Verify the auth env var is set + # (indirect expansion: look up the variable whose NAME is stored in AGENT_ENV_AUTH; ':-' keeps set -u quiet) + if [ -n "$AGENT_ENV_AUTH" ]; then + if [ -z "${!AGENT_ENV_AUTH:-}" ]; then + echo "WARNING: $AGENT_ENV_AUTH is not set. The agent may fail to authenticate." >&2 + fi + fi +} + +# Run a single task using the agent CLI +# Args: $1 = task directory, $2 = workspace directory the agent runs in, $3 = agent name (used in log messages). +# Relies on the AGENT_* globals set by resolve_agent. +run_agent_task() { + local TASK_DIR="$1" + local WORK_DIR="$2" + local AGENT_NAME="$3" + + # Prefer prompt.md (agent-oriented prompt) over instruction.md (human-oriented task description) + local PROMPT_FILE="$TASK_DIR/prompt.md" + if [ ! -f "$PROMPT_FILE" ]; then + PROMPT_FILE="$TASK_DIR/instruction.md" + fi + if [ ! 
-f "$PROMPT_FILE" ]; then + echo "ERROR: No prompt.md or instruction.md found in $TASK_DIR" >&2 + return 1 + fi + + local PROMPT + PROMPT=$(cat "$PROMPT_FILE") + + # Build the skill context preamble if skills/ directory exists + local SKILL_CONTEXT="" + if [ -d "$TASK_DIR/skills" ]; then + for SKILL_FILE in "$TASK_DIR"/skills/*/SKILL.md; do + if [ -f "$SKILL_FILE" ]; then + SKILL_CONTEXT="${SKILL_CONTEXT}$(cat "$SKILL_FILE")\n\n" + fi + done + fi + + # Combine skill context + prompt into a single agent instruction + local FULL_PROMPT="" + if [ -n "$SKILL_CONTEXT" ]; then + FULL_PROMPT="Use the following skill reference when completing the task:\n\n${SKILL_CONTEXT}---\n\n${PROMPT}" + else + FULL_PROMPT="$PROMPT" + fi + + echo " → Sending instruction to $AGENT_NAME agent …" + + local TIMEOUT_SEC + TIMEOUT_SEC=$(read_config "timeoutSec") + TIMEOUT_SEC="${TIMEOUT_SEC:-600}" + + # Build the agent command + local CMD_ARGS=() + CMD_ARGS+=("$AGENT_CMD") + + # Add prompt args (e.g., -p) + if [ -n "$AGENT_PROMPT_ARGS" ]; then + read -ra _PROMPT_PARTS <<< "$AGENT_PROMPT_ARGS" + CMD_ARGS+=("${_PROMPT_PARTS[@]}") + fi + CMD_ARGS+=("$FULL_PROMPT") + + # Add auto-approve args (e.g., --yes, --sandbox) + if [ -n "$AGENT_APPROVE_ARGS" ]; then + read -ra _APPROVE_PARTS <<< "$AGENT_APPROVE_ARGS" + CMD_ARGS+=("${_APPROVE_PARTS[@]}") + fi + + # Run the agent in the work directory with a timeout + local AGENT_EXIT=0 + ( + cd "$WORK_DIR" + timeout "${TIMEOUT_SEC}s" "${CMD_ARGS[@]}" 2>&1 || true + ) > "$WORK_DIR/agent-output.log" 2>&1 || AGENT_EXIT=$? + + if [ "$AGENT_EXIT" -eq 124 ]; then + echo " ⚠ Agent timed out after ${TIMEOUT_SEC}s" + elif [ "$AGENT_EXIT" -ne 0 ]; then + echo " ⚠ Agent exited with code $AGENT_EXIT" + fi + + echo " → Agent output saved to $WORK_DIR/agent-output.log" +} + +run_task() { + local TASK_ID="$1" + local MODE="${2:-validate}" # validate | agent + local AGENT_NAME="${3:-}" + local TASK_DIR="$TASKS_DIR/$TASK_ID" + + if [ ! 
-d "$TASK_DIR" ]; then + echo "ERROR: Task directory not found: $TASK_DIR" >&2 + return 1 + fi + + echo "═══════════════════════════════════════════════════════" + echo " Task: $TASK_ID" + if [ "$MODE" = "agent" ]; then + echo " Agent: $AGENT_NAME" + fi + echo "═══════════════════════════════════════════════════════" + + # Create a temporary workspace so graders run in isolation + # (the RETURN trap removes it when this function returns, including the early 'return 1' paths below) + local WORK_DIR + WORK_DIR=$(mktemp -d) + trap "rm -rf '$WORK_DIR'" RETURN + + # Seed the workspace with a minimal src/ tree + mkdir -p "$WORK_DIR/src" + + if [ "$MODE" = "validate" ]; then + # --validate: apply the reference solution first + if [ ! -f "$TASK_DIR/solution/solve.sh" ]; then + echo "ERROR: No reference solution at $TASK_DIR/solution/solve.sh" >&2 + return 1 + fi + echo "→ Applying reference solution …" + (cd "$WORK_DIR" && bash "$TASK_DIR/solution/solve.sh") + elif [ "$MODE" = "agent" ]; then + # --agent: send the instruction to the agent CLI + run_agent_task "$TASK_DIR" "$WORK_DIR" "$AGENT_NAME" + fi + + # Run deterministic grader + if [ ! -f "$TASK_DIR/tests/test.sh" ]; then + echo "ERROR: No deterministic grader at $TASK_DIR/tests/test.sh" >&2 + return 1 + fi + + # The grader runs with the workspace as its working directory; its exit status is captured rather than aborting the script + echo "→ Running deterministic grader …" + local GRADER_EXIT=0 + (cd "$WORK_DIR" && bash "$TASK_DIR/tests/test.sh") || GRADER_EXIT=$? + + # Read reward + # (stays "0" when the grader did not write logs/verifier/reward.txt) + local REWARD="0" + if [ -f "$WORK_DIR/logs/verifier/reward.txt" ]; then + REWARD=$(cat "$WORK_DIR/logs/verifier/reward.txt") + fi + + # Pass/fail is decided by the grader's exit status alone; with a single run the rates are either 0 or 1 + local STATUS="fail" + local PASS_RATE="0" + local PASS_AT_K="0" + if [ "$GRADER_EXIT" -eq 0 ]; then + STATUS="pass" + PASS_RATE="1" + PASS_AT_K="1" + fi + + echo "" + echo " Result: $STATUS (reward=$REWARD)" + echo "" + + # Persist result — includes passRate/passAtK so the CI summary comment can + # read them directly (these are the fields the workflow script expects). 
+ mkdir -p "$RESULTS_DIR" + local RESULT_SUFFIX="" + if [ "$MODE" = "agent" ]; then + RESULT_SUFFIX="-${AGENT_NAME}" + fi + cat > "$RESULTS_DIR/${TASK_ID}${RESULT_SUFFIX}.json" <&2 + return 1 + fi + + local PASS_COUNT=0 + local TOTAL_REWARD=0 + + for i in $(seq 1 "$TRIALS"); do + echo "" + echo " ── Trial $i/$TRIALS ──" + + # Create a temporary workspace for each trial + local WORK_DIR + WORK_DIR=$(mktemp -d) + + mkdir -p "$WORK_DIR/src" + + # Send to agent + run_agent_task "$TASK_DIR" "$WORK_DIR" "$AGENT_NAME" + + # Run grader + local GRADER_EXIT=0 + (cd "$WORK_DIR" && bash "$TASK_DIR/tests/test.sh") || GRADER_EXIT=$? + + local REWARD="0" + if [ -f "$WORK_DIR/logs/verifier/reward.txt" ]; then + REWARD=$(cat "$WORK_DIR/logs/verifier/reward.txt") + fi + + if [ "$GRADER_EXIT" -eq 0 ]; then + PASS_COUNT=$((PASS_COUNT + 1)) + fi + TOTAL_REWARD=$(echo "$TOTAL_REWARD + $REWARD" | bc) + + # Cleanup trial workspace + rm -rf "$WORK_DIR" + + echo " Trial $i: reward=$REWARD $([ "$GRADER_EXIT" -eq 0 ] && echo "✅" || echo "❌")" + done + + # Calculate aggregate metrics + if [ "$TRIALS" -le 0 ]; then + echo "ERROR: TRIALS must be > 0" >&2 + return 1 + fi + local PASS_RATE + PASS_RATE=$(echo "scale=2; $PASS_COUNT / $TRIALS" | bc) + # pass@k = 1 if at least one trial passed, else 0 + local PASS_AT_K=0 + if [ "$PASS_COUNT" -gt 0 ]; then + PASS_AT_K=1 + fi + local AVG_REWARD + AVG_REWARD=$(echo "scale=2; $TOTAL_REWARD / $TRIALS" | bc) + + echo "" + echo " ═══ Aggregate ($TRIALS trials) ═══" + echo " Pass rate: $PASS_COUNT/$TRIALS ($PASS_RATE)" + echo " pass@$TRIALS: $PASS_AT_K" + echo " Avg reward: $AVG_REWARD" + echo "" + + # Persist aggregated result + mkdir -p "$RESULTS_DIR" + cat > "$RESULTS_DIR/${TASK_ID}-${AGENT_NAME}.json" <&2 + exit 1 + fi + shift 2 + ;; + --trials) + TRIALS="${2:-1}" + shift 2 + ;; + -h|--help) + usage + ;; + *) + if [ -z "$TASK_ARG" ]; then + TASK_ARG="$1" + fi + shift + ;; + esac +done + +if [ -z "$TASK_ARG" ]; then + usage +fi + +# If using agent mode, 
resolve and verify agent CLI +if [ "$MODE" = "agent" ]; then + # Default to configured agent if none specified + if [ -z "$AGENT_NAME" ]; then + AGENT_NAME=$(read_config "defaultAgent") + AGENT_NAME="${AGENT_NAME:-copilot}" + fi + resolve_agent "$AGENT_NAME" + echo "Using agent: $AGENT_NAME ($AGENT_CMD)" + echo "" +fi + +OVERALL_EXIT=0 + +if [ "$TASK_ARG" = "--all" ]; then + for TASK_PATH in "$TASKS_DIR"/*/; do + TASK_NAME=$(basename "$TASK_PATH") + if [ "$MODE" = "agent" ] && [ "$TRIALS" -gt 1 ]; then + echo "═══════════════════════════════════════════════════════" + echo " Task: $TASK_NAME (Agent: $AGENT_NAME, $TRIALS trials)" + echo "═══════════════════════════════════════════════════════" + run_task_trials "$TASK_NAME" "$AGENT_NAME" "$TRIALS" || OVERALL_EXIT=1 + else + run_task "$TASK_NAME" "$MODE" "$AGENT_NAME" || OVERALL_EXIT=1 + fi + done +else + if [ "$MODE" = "agent" ] && [ "$TRIALS" -gt 1 ]; then + echo "═══════════════════════════════════════════════════════" + echo " Task: $TASK_ARG (Agent: $AGENT_NAME, $TRIALS trials)" + echo "═══════════════════════════════════════════════════════" + run_task_trials "$TASK_ARG" "$AGENT_NAME" "$TRIALS" || OVERALL_EXIT=1 + else + run_task "$TASK_ARG" "$MODE" "$AGENT_NAME" || OVERALL_EXIT=1 + fi +fi + +exit "$OVERALL_EXIT" diff --git a/evals/tasks/component-combo-reactive-form/environment/Dockerfile b/evals/tasks/component-combo-reactive-form/environment/Dockerfile new file mode 100644 index 00000000000..4cfd43a762c --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/environment/Dockerfile @@ -0,0 +1,17 @@ +FROM node:20-slim + +WORKDIR /workspace + +RUN npm install -g @angular/cli@latest + +RUN ng new eval-app --skip-git --skip-install --style=scss --ssr=false && \ + cd eval-app && \ + npm install && \ + npm install igniteui-angular + +WORKDIR /workspace/eval-app + +COPY . . 
+ +RUN mkdir -p logs/verifier +CMD ["bash"] diff --git a/evals/tasks/component-combo-reactive-form/instruction.md b/evals/tasks/component-combo-reactive-form/instruction.md new file mode 100644 index 00000000000..9e02aba05c5 --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/instruction.md @@ -0,0 +1,40 @@ +# Task: Add a Multi-Select Combo in a Reactive Form + +You are working in an Angular 20+ project that already has `igniteui-angular` installed and a theme applied. + +## Requirements + +Create a `UserSettingsComponent` with a reactive form that includes a multi-select combo for choosing notification channels. + +1. **Component location**: `src/app/user-settings/user-settings.component.ts` (with its template) + +2. **Form structure**: Create a reactive form (`FormGroup`) with a `notificationChannels` control + +3. **Data source**: Use the following list of notification channels: + + ```typescript + channels = [ + { id: 1, name: 'Email', icon: 'email' }, + { id: 2, name: 'SMS', icon: 'sms' }, + { id: 3, name: 'Push Notification', icon: 'notifications' }, + { id: 4, name: 'Slack', icon: 'chat' }, + { id: 5, name: 'Microsoft Teams', icon: 'groups' }, + ]; + ``` + +4. **Combo configuration**: + - Use the Ignite UI for Angular Combo component for multi-selection + - Bind it to the `notificationChannels` form control + - Display the `name` field in the dropdown + - Use the `id` field as the value key + +5. **Form validation**: The `notificationChannels` control must be required (at least one channel must be selected) + +6. **Submit button**: Add a submit button that is disabled when the form is invalid + +## Constraints + +- Use the Ignite UI `igx-combo` component — do NOT use a native ``, Angular Material `mat-select`, or other third-party select components? +- Did the agent correctly identify that multi-select requires the Combo component, not the Select component? 
+ +## Skill Routing & Reference File Usage (0–0.3) +- Did the agent read the components skill SKILL.md to identify the correct component? +- Did the agent read `references/form-controls.md` for Combo API details? +- Did the agent follow the mandatory protocol (identify component → read references → produce output)? +- Did the agent avoid writing code from memory without consulting references? + +## Idiomatic API Usage (0–0.25) +- Did the agent bind data using `[data]` input on the combo? +- Did the agent configure `[displayKey]` and `[valueKey]` correctly? +- Did the agent use `[formControlName]` or `[formControl]` to bind to the reactive form? +- Did the agent import from the correct igniteui-angular entry point? +- Did the agent import `ReactiveFormsModule` or use standalone form directives? + +## Code Quality (0–0.15) +- Is the component standalone with `ChangeDetectionStrategy.OnPush`? +- Did the agent set up form validation (required validator)? +- Did the agent avoid hallucinated API names or non-existent inputs/outputs? +- Is the code clean, well-structured, and following Angular best practices? 
diff --git a/evals/tasks/component-combo-reactive-form/skills/igniteui-angular-components/SKILL.md b/evals/tasks/component-combo-reactive-form/skills/igniteui-angular-components/SKILL.md new file mode 120000 index 00000000000..40a2d1a6e84 --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/skills/igniteui-angular-components/SKILL.md @@ -0,0 +1 @@ +../../../../../skills/igniteui-angular-components/SKILL.md \ No newline at end of file diff --git a/evals/tasks/component-combo-reactive-form/solution/solve.sh b/evals/tasks/component-combo-reactive-form/solution/solve.sh new file mode 100755 index 00000000000..396e07382f5 --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/solution/solve.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Reference solution for component-combo-reactive-form +# Proves the task is solvable and validates grader correctness + +set -euo pipefail + +mkdir -p src/app/user-settings + +# Create the component TypeScript file +cat > src/app/user-settings/user-settings.component.ts << 'EOF' +import { ChangeDetectionStrategy, Component } from '@angular/core'; +import { FormGroup, FormControl, Validators, ReactiveFormsModule } from '@angular/forms'; +import { IgxComboComponent } from 'igniteui-angular/combo'; + +@Component({ + selector: 'app-user-settings', + templateUrl: './user-settings.component.html', + changeDetection: ChangeDetectionStrategy.OnPush, + imports: [ReactiveFormsModule, IgxComboComponent], +}) +export class UserSettingsComponent { + channels = [ + { id: 1, name: 'Email', icon: 'email' }, + { id: 2, name: 'SMS', icon: 'sms' }, + { id: 3, name: 'Push Notification', icon: 'notifications' }, + { id: 4, name: 'Slack', icon: 'chat' }, + { id: 5, name: 'Microsoft Teams', icon: 'groups' }, + ]; + + settingsForm = new FormGroup({ + notificationChannels: new FormControl([], Validators.required), + }); + + onSubmit() { + if (this.settingsForm.valid) { + console.log('Selected channels:', this.settingsForm.value.notificationChannels); + } + } 
+} +EOF + +# Create the template +cat > src/app/user-settings/user-settings.component.html << 'EOF' +
+ + + +
+EOF diff --git a/evals/tasks/component-combo-reactive-form/task.toml b/evals/tasks/component-combo-reactive-form/task.toml new file mode 100644 index 00000000000..111a254676d --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/task.toml @@ -0,0 +1,26 @@ +version = "1.0" + +[metadata] +author_name = "Ignite UI Team" +difficulty = "medium" +category = "component-forms" +tags = ["combo", "reactive-forms", "multi-select", "igx-combo"] + +[agent] +timeout_sec = 600.0 + +[environment] +build_timeout_sec = 300.0 +cpus = 2 +memory_mb = 4096 +storage_mb = 1000 + +[[graders]] +type = "deterministic" +command = "bash tests/test.sh" +weight = 0.6 + +[[graders]] +type = "llm_rubric" +rubric = "prompts/quality.md" +weight = 0.4 diff --git a/evals/tasks/component-combo-reactive-form/tests/test.sh b/evals/tasks/component-combo-reactive-form/tests/test.sh new file mode 100755 index 00000000000..d82caf36bbe --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/tests/test.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Deterministic grader for component-combo-reactive-form +# Checks outcomes: correct files exist, correct selectors, reactive form usage + +set -euo pipefail + +mkdir -p logs/verifier + +SCORE=0 +TOTAL=5 +DETAILS="" + +# --- Check 1: Component file exists --- +COMPONENT_FILE=$(find src -name "user-settings.component.ts" 2>/dev/null | head -1) +if [ -n "$COMPONENT_FILE" ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: user-settings.component.ts exists\n" +else + DETAILS="${DETAILS}FAIL: user-settings.component.ts not found\n" +fi + +# --- Check 2: igx-combo selector is present in the template --- +TEMPLATE_FILE=$(find src -name "user-settings.component.html" 2>/dev/null | head -1) +COMBO_FOUND=0 + +if [ -n "${TEMPLATE_FILE:-}" ] && grep -q "igx-combo" "$TEMPLATE_FILE" 2>/dev/null; then + COMBO_FOUND=1 +elif [ -n "${COMPONENT_FILE:-}" ] && grep -q "igx-combo" "$COMPONENT_FILE" 2>/dev/null; then + COMBO_FOUND=1 +fi + +if [ "$COMBO_FOUND" -eq 1 ]; then + 
SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: igx-combo selector found\n" +else + DETAILS="${DETAILS}FAIL: igx-combo selector not found in template\n" +fi + +# --- Check 3: Reactive form usage (FormGroup, FormControl, or formControlName) --- +REACTIVE_FOUND=0 +SEARCH_FILES="" +[ -n "${TEMPLATE_FILE:-}" ] && SEARCH_FILES="$TEMPLATE_FILE" +[ -n "${COMPONENT_FILE:-}" ] && SEARCH_FILES="$SEARCH_FILES $COMPONENT_FILE" + +for f in $SEARCH_FILES; do + if grep -qE 'FormGroup|FormControl|formControlName|formControl|ReactiveFormsModule' "$f" 2>/dev/null; then + REACTIVE_FOUND=1 + break + fi +done + +if [ "$REACTIVE_FOUND" -eq 1 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Reactive form usage found\n" +else + DETAILS="${DETAILS}FAIL: No reactive form usage found\n" +fi + +# --- Check 4: No forbidden alternatives --- +ALL_FILES=$(find src -name "*.ts" -o -name "*.html" 2>/dev/null) +FORBIDDEN=0 +for f in $ALL_FILES; do + if grep -qE ' ].*multiple|mat-select|MatSelectModule|igx-select' "$f" 2>/dev/null; then + FORBIDDEN=1 + break + fi +done + +if [ "$FORBIDDEN" -eq 0 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: No forbidden alternatives found\n" +else + DETAILS="${DETAILS}FAIL: Forbidden alternative (native select, mat-select, igx-select) detected\n" +fi + +# --- Check 5: Correct entry-point import from igniteui-angular/combo --- +# The skill requires entry-point imports (not the root barrel). 
+COMBO_IMPORT_PATTERN="from ['\"](@infragistics/)?igniteui-angular/combo['\"]" +IMPORT_FOUND=0 +if [ -n "${COMPONENT_FILE:-}" ]; then + if grep -qE "$COMBO_IMPORT_PATTERN" "$COMPONENT_FILE" 2>/dev/null; then + IMPORT_FOUND=1 + fi +fi + +if [ "$IMPORT_FOUND" -eq 1 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Correct combo entry-point import found\n" +else + DETAILS="${DETAILS}FAIL: Missing import from igniteui-angular/combo entry point\n" +fi + +# --- Calculate reward --- +REWARD=$(echo "scale=2; $SCORE / $TOTAL" | bc) + +echo "$REWARD" > logs/verifier/reward.txt +printf "Score: %d/%d (%.0f%%)\n" "$SCORE" "$TOTAL" "$(echo "$REWARD * 100" | bc)" +printf "$DETAILS" + +if [ "$SCORE" -lt "$TOTAL" ]; then + exit 1 +fi diff --git a/evals/tasks/grid-basic-setup/environment/Dockerfile b/evals/tasks/grid-basic-setup/environment/Dockerfile new file mode 100644 index 00000000000..4cfd43a762c --- /dev/null +++ b/evals/tasks/grid-basic-setup/environment/Dockerfile @@ -0,0 +1,17 @@ +FROM node:20-slim + +WORKDIR /workspace + +RUN npm install -g @angular/cli@latest + +RUN ng new eval-app --skip-git --skip-install --style=scss --ssr=false && \ + cd eval-app && \ + npm install && \ + npm install igniteui-angular + +WORKDIR /workspace/eval-app + +COPY . . + +RUN mkdir -p logs/verifier +CMD ["bash"] diff --git a/evals/tasks/grid-basic-setup/instruction.md b/evals/tasks/grid-basic-setup/instruction.md new file mode 100644 index 00000000000..3a9880564e5 --- /dev/null +++ b/evals/tasks/grid-basic-setup/instruction.md @@ -0,0 +1,36 @@ +# Task: Add a Data Grid with Sorting and Pagination + +You are working in an Angular 20+ project that already has `igniteui-angular` installed and a theme applied. + +## Requirements + +Add a data grid to the `EmployeeListComponent` that displays employee data with the following features: + +1. 
**Data source**: Use the following flat employee data (add it as a property in the component): + + ```typescript + employees = [ + { id: 1, name: 'Alice Johnson', department: 'Engineering', salary: 95000, hireDate: new Date('2020-03-15') }, + { id: 2, name: 'Bob Smith', department: 'Marketing', salary: 72000, hireDate: new Date('2019-07-22') }, + { id: 3, name: 'Carol Davis', department: 'Engineering', salary: 105000, hireDate: new Date('2018-01-10') }, + { id: 4, name: 'David Wilson', department: 'Sales', salary: 68000, hireDate: new Date('2021-11-05') }, + { id: 5, name: 'Eva Martinez', department: 'Engineering', salary: 98000, hireDate: new Date('2020-09-18') }, + { id: 6, name: 'Frank Brown', department: 'Marketing', salary: 75000, hireDate: new Date('2017-04-30') }, + { id: 7, name: 'Grace Lee', department: 'Sales', salary: 82000, hireDate: new Date('2019-12-01') }, + { id: 8, name: 'Henry Taylor', department: 'Engineering', salary: 110000, hireDate: new Date('2016-06-14') }, + ]; + ``` + +2. **Columns**: Display all fields — `id`, `name`, `department`, `salary`, `hireDate` + +3. **Sorting**: Enable sorting on all columns + +4. **Pagination**: Add a paginator with a page size of 5 + +5. **Component**: Create or edit the file at `src/app/employee-list/employee-list.component.ts` (with its template and styles) + +## Constraints + +- Use the Ignite UI for Angular `igx-grid` component — do NOT use a native HTML `<table>`, Angular Material table, or any other grid library. +- Import from the correct `igniteui-angular` entry point. +- The component must be standalone and use `ChangeDetectionStrategy.OnPush`. diff --git a/evals/tasks/grid-basic-setup/prompt.md b/evals/tasks/grid-basic-setup/prompt.md new file mode 100644 index 00000000000..b019190c21c --- /dev/null +++ b/evals/tasks/grid-basic-setup/prompt.md @@ -0,0 +1,29 @@ +# Agent Prompt: Grid Basic Setup + +You are working in an Angular 20+ project that already has `igniteui-angular` installed. 
+ +Create an `EmployeeListComponent` at `src/app/employee-list/employee-list.component.ts` that shows a data grid with employee data, sorting on all columns, and pagination with 5 items per page. + +Use this flat employee data: + +```typescript +employees = [ + { id: 1, name: 'Alice Johnson', department: 'Engineering', salary: 95000, hireDate: new Date('2020-03-15') }, + { id: 2, name: 'Bob Smith', department: 'Marketing', salary: 72000, hireDate: new Date('2019-07-22') }, + { id: 3, name: 'Carol Davis', department: 'Engineering', salary: 105000, hireDate: new Date('2018-01-10') }, + { id: 4, name: 'David Wilson', department: 'Sales', salary: 68000, hireDate: new Date('2021-11-05') }, + { id: 5, name: 'Eva Martinez', department: 'Engineering', salary: 98000, hireDate: new Date('2020-09-18') }, + { id: 6, name: 'Frank Brown', department: 'Marketing', salary: 75000, hireDate: new Date('2017-04-30') }, + { id: 7, name: 'Grace Lee', department: 'Sales', salary: 82000, hireDate: new Date('2019-12-01') }, + { id: 8, name: 'Henry Taylor', department: 'Engineering', salary: 110000, hireDate: new Date('2016-06-14') }, +]; +``` + +Requirements: +- Use the Ignite UI for Angular `igx-grid` component (NOT tree-grid, hierarchical-grid, or pivot-grid) +- Display columns: id, name, department, salary, hireDate +- Enable sorting on all columns +- Add a paginator with page size of 5 +- Import from the `igniteui-angular/grids/grid` entry point (not the root barrel) +- Component must be standalone with ChangeDetectionStrategy.OnPush +- Create both a `.ts` file and a `.html` template file diff --git a/evals/tasks/grid-basic-setup/prompts/quality.md b/evals/tasks/grid-basic-setup/prompts/quality.md new file mode 100644 index 00000000000..fc65eede86f --- /dev/null +++ b/evals/tasks/grid-basic-setup/prompts/quality.md @@ -0,0 +1,25 @@ +# Grid Basic Setup — LLM Rubric + +Evaluate the agent's approach to adding a flat data grid with sorting and pagination. 
+ +## Correct Grid Type Selection (0–0.3) +- Did the agent choose `igx-grid` (Flat Grid) for the flat employee data? +- Did the agent avoid `igx-tree-grid`, `igx-hierarchical-grid`, or `igx-pivot-grid` — which are wrong for flat, non-hierarchical data? +- Did the agent avoid native HTML `<table>`, Angular Material `mat-table`, or other third-party grids? + +## Skill Routing & Reference File Usage (0–0.3) +- Did the agent read the grids skill SKILL.md to identify the correct grid type? +- Did the agent read the relevant reference files (`structure.md` for columns/sorting, `paging-remote.md` for pagination) before writing code? +- Did the agent follow the mandatory protocol (identify grid type → read references → produce output)? + +## Idiomatic API Usage (0–0.25) +- Did the agent bind data correctly using the `[data]` input? +- Did the agent use `igx-column` elements with correct `[field]` bindings for each data field? +- Did the agent enable sorting correctly (e.g., `[sortable]="true"` on columns or grid-level `[allowSorting]`)? +- Did the agent import from the correct entry point (`igniteui-angular/grids/grid`)? +- Did the agent use `IGX_GRID_DIRECTIVES` or individual component imports? + +## Code Quality (0–0.15) +- Is the component standalone with `ChangeDetectionStrategy.OnPush`? +- Did the agent avoid hallucinated API names or non-existent inputs/outputs? +- Is the code clean and well-structured? 
diff --git a/evals/tasks/grid-basic-setup/skills/igniteui-angular-grids/SKILL.md b/evals/tasks/grid-basic-setup/skills/igniteui-angular-grids/SKILL.md new file mode 120000 index 00000000000..0ba573d65d2 --- /dev/null +++ b/evals/tasks/grid-basic-setup/skills/igniteui-angular-grids/SKILL.md @@ -0,0 +1 @@ +../../../../../skills/igniteui-angular-grids/SKILL.md \ No newline at end of file diff --git a/evals/tasks/grid-basic-setup/solution/solve.sh b/evals/tasks/grid-basic-setup/solution/solve.sh new file mode 100755 index 00000000000..0466ce7d623 --- /dev/null +++ b/evals/tasks/grid-basic-setup/solution/solve.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Reference solution for grid-basic-setup +# Proves the task is solvable and validates grader correctness + +set -euo pipefail + +mkdir -p src/app/employee-list + +# Create the component TypeScript file +cat > src/app/employee-list/employee-list.component.ts << 'EOF' +import { ChangeDetectionStrategy, Component } from '@angular/core'; +import { IGX_GRID_DIRECTIVES } from 'igniteui-angular/grids/grid'; +import { IgxPaginatorComponent } from 'igniteui-angular/grids/grid'; + +@Component({ + selector: 'app-employee-list', + templateUrl: './employee-list.component.html', + changeDetection: ChangeDetectionStrategy.OnPush, + imports: [IGX_GRID_DIRECTIVES, IgxPaginatorComponent], +}) +export class EmployeeListComponent { + employees = [ + { id: 1, name: 'Alice Johnson', department: 'Engineering', salary: 95000, hireDate: new Date('2020-03-15') }, + { id: 2, name: 'Bob Smith', department: 'Marketing', salary: 72000, hireDate: new Date('2019-07-22') }, + { id: 3, name: 'Carol Davis', department: 'Engineering', salary: 105000, hireDate: new Date('2018-01-10') }, + { id: 4, name: 'David Wilson', department: 'Sales', salary: 68000, hireDate: new Date('2021-11-05') }, + { id: 5, name: 'Eva Martinez', department: 'Engineering', salary: 98000, hireDate: new Date('2020-09-18') }, + { id: 6, name: 'Frank Brown', department: 'Marketing', salary: 
75000, hireDate: new Date('2017-04-30') }, + { id: 7, name: 'Grace Lee', department: 'Sales', salary: 82000, hireDate: new Date('2019-12-01') }, + { id: 8, name: 'Henry Taylor', department: 'Engineering', salary: 110000, hireDate: new Date('2016-06-14') }, + ]; +} +EOF + +# Create the template +cat > src/app/employee-list/employee-list.component.html << 'EOF' + + + + + + + + +EOF diff --git a/evals/tasks/grid-basic-setup/task.toml b/evals/tasks/grid-basic-setup/task.toml new file mode 100644 index 00000000000..07e25fdd0aa --- /dev/null +++ b/evals/tasks/grid-basic-setup/task.toml @@ -0,0 +1,26 @@ +version = "1.0" + +[metadata] +author_name = "Ignite UI Team" +difficulty = "medium" +category = "grid-setup" +tags = ["grid", "flat-grid", "sorting", "pagination", "igx-grid"] + +[agent] +timeout_sec = 600.0 + +[environment] +build_timeout_sec = 300.0 +cpus = 2 +memory_mb = 4096 +storage_mb = 1000 + +[[graders]] +type = "deterministic" +command = "bash tests/test.sh" +weight = 0.6 + +[[graders]] +type = "llm_rubric" +rubric = "prompts/quality.md" +weight = 0.4 diff --git a/evals/tasks/grid-basic-setup/tests/test.sh b/evals/tasks/grid-basic-setup/tests/test.sh new file mode 100755 index 00000000000..72eff909578 --- /dev/null +++ b/evals/tasks/grid-basic-setup/tests/test.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# Deterministic grader for grid-basic-setup +# Checks outcomes via static text checks only (no build is run): files exist, igx-grid selector, entry-point import, no forbidden grids, pagination. Writes SCORE/TOTAL to logs/verifier/reward.txt. 
+ +set -euo pipefail + +mkdir -p logs/verifier + +SCORE=0 +TOTAL=5 +DETAILS="" + +# --- Check 1: Component file exists --- +COMPONENT_FILE=$(find src -name "employee-list.component.ts" 2>/dev/null | head -1 || true) # "|| true": a failing find must not abort the grader under errexit/pipefail before the reward is written +if [ -n "$COMPONENT_FILE" ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: employee-list.component.ts exists\n" +else + DETAILS="${DETAILS}FAIL: employee-list.component.ts not found\n" +fi + +# --- Check 2: igx-grid selector is present in the template --- +TEMPLATE_FILE=$(find src -name "employee-list.component.html" 2>/dev/null | 
head -1 || true) +INLINE_TEMPLATE="" +if [ -z "$TEMPLATE_FILE" ] && [ -n "$COMPONENT_FILE" ]; then + # No external template: look for the selector inside the component file (inline template) + INLINE_TEMPLATE=$(grep -l "igx-grid" "$COMPONENT_FILE" 2>/dev/null || true) +fi + +if [ -n "$TEMPLATE_FILE" ] && grep -q "igx-grid" "$TEMPLATE_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: igx-grid selector found in template\n" +elif [ -n "$INLINE_TEMPLATE" ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: igx-grid selector found in inline template\n" +else + DETAILS="${DETAILS}FAIL: igx-grid selector not found in template\n" +fi + +# --- Check 3: Correct import from igniteui-angular entry point --- +# Accepts either the OSS or licensed package path +GRID_IMPORT_PATTERN="from ['\"](@infragistics/)?igniteui-angular/grids/grid['\"]" +if [ -n "$COMPONENT_FILE" ]; then + if grep -qE "$GRID_IMPORT_PATTERN" "$COMPONENT_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Correct grid entry-point import found\n" + else + DETAILS="${DETAILS}FAIL: Missing import from igniteui-angular/grids/grid entry point\n" + fi +else + DETAILS="${DETAILS}FAIL: Cannot check imports — component file not found\n" +fi + +# --- Check 4: No forbidden alternatives --- +ALL_TS_FILES=$(find src -name "*.ts" -o -name "*.html" 2>/dev/null || true) +FORBIDDEN=0 +for f in $ALL_TS_FILES; do + # Check for native table, Angular Material table, or other grid libs + if grep -qE '<table[ >]|MatTableModule|mat-table|ag-grid|kendo-grid' "$f" 2>/dev/null; then + FORBIDDEN=1 + break + fi +done + +if [ "$FORBIDDEN" -eq 0 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: No forbidden alternatives found\n" +else + DETAILS="${DETAILS}FAIL: Forbidden alternative (native table, Material table, etc.) 
detected\n" +fi + +# --- Check 5: Pagination is configured --- +PAGING_FOUND=0 +SEARCH_FILES="" +[ -n "$TEMPLATE_FILE" ] && SEARCH_FILES="$TEMPLATE_FILE" +[ -n "$COMPONENT_FILE" ] && SEARCH_FILES="$SEARCH_FILES $COMPONENT_FILE" + +for f in $SEARCH_FILES; do + if grep -qE 'igx-paginator|IgxPaginatorComponent|paging|perPage|\[perPage\]' "$f" 2>/dev/null; then + PAGING_FOUND=1 + break + fi +done + +if [ "$PAGING_FOUND" -eq 1 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Pagination configuration found\n" +else + DETAILS="${DETAILS}FAIL: No pagination configuration found\n" +fi + +# --- Calculate reward (fraction of checks passed, two decimals) --- +REWARD=$(echo "scale=2; $SCORE / $TOTAL" | bc) + +echo "$REWARD" > logs/verifier/reward.txt +printf "Score: %d/%d (%.0f%%)\n" "$SCORE" "$TOTAL" "$(echo "$REWARD * 100" | bc)" +printf '%b' "$DETAILS" + +if [ "$SCORE" -lt "$TOTAL" ]; then + exit 1 +fi diff --git a/evals/tasks/theming-palette-generation/environment/Dockerfile b/evals/tasks/theming-palette-generation/environment/Dockerfile new file mode 100644 index 00000000000..4cfd43a762c --- /dev/null +++ b/evals/tasks/theming-palette-generation/environment/Dockerfile @@ -0,0 +1,17 @@ +FROM node:20-slim + +WORKDIR /workspace + +RUN npm install -g @angular/cli@latest + +RUN ng new eval-app --skip-git --skip-install --style=scss --ssr=false && \ + cd eval-app && \ + npm install && \ + npm install igniteui-angular + +WORKDIR /workspace/eval-app + +COPY . . + +RUN mkdir -p logs/verifier +CMD ["bash"] diff --git a/evals/tasks/theming-palette-generation/instruction.md b/evals/tasks/theming-palette-generation/instruction.md new file mode 100644 index 00000000000..cb3a03360e3 --- /dev/null +++ b/evals/tasks/theming-palette-generation/instruction.md @@ -0,0 +1,27 @@ +# Task: Create a Custom Branded Theme + +You are working in an Angular 20+ project that already has `igniteui-angular` installed with Sass support enabled. 
+ +## Requirements + +Create a custom Ignite UI for Angular theme with a blue primary color and orange secondary color. + +1. **Theme file location**: `src/styles.scss` (or update the existing global styles file) + +2. **Palette**: + - Primary color: `#1976D2` (Material Blue) + - Secondary color: `#FF9800` (Material Orange) + - Surface color appropriate for a light theme + +3. **Theme application**: + - Generate a complete theme using the Ignite UI theming functions + - Apply the theme globally + +4. **Typography**: Include typography configuration with a sans-serif font family + +## Constraints + +- Use the Ignite UI Sass theming API (`palette()`, `theme()`) — do NOT hardcode individual CSS custom properties or use plain CSS variables to replicate the palette. +- Import from `igniteui-angular/theming` (or `@infragistics/igniteui-angular/theming` for licensed packages). +- The theme must include both `palette()` and `theme()` function calls. +- Include `core()` mixin invocation before the `theme()` mixin. diff --git a/evals/tasks/theming-palette-generation/prompt.md b/evals/tasks/theming-palette-generation/prompt.md new file mode 100644 index 00000000000..a665e386564 --- /dev/null +++ b/evals/tasks/theming-palette-generation/prompt.md @@ -0,0 +1,14 @@ +# Agent Prompt: Custom Branded Theme + +You are working in an Angular 20+ project that already has `igniteui-angular` installed with Sass support. + +Create a custom Ignite UI for Angular theme in `src/styles.scss` with a blue primary and orange secondary palette. 
+ +Requirements: +- Import from `igniteui-angular/theming` using `@use` syntax +- Create a palette with primary #1976D2, secondary #FF9800, and a light surface color +- Configure typography with a sans-serif font family +- Call `@include core()` BEFORE `@include theme()` +- Pass the palette to the `theme()` mixin +- Use the `palette()` function (do NOT hardcode CSS custom properties) +- Use `@use` module syntax (not deprecated `@import`) diff --git a/evals/tasks/theming-palette-generation/prompts/quality.md b/evals/tasks/theming-palette-generation/prompts/quality.md new file mode 100644 index 00000000000..d5400180ba6 --- /dev/null +++ b/evals/tasks/theming-palette-generation/prompts/quality.md @@ -0,0 +1,27 @@ +# Theming Palette Generation — LLM Rubric + +Evaluate the agent's approach to creating a custom branded Ignite UI theme. + +## Correct Theming Approach (0–0.3) +- Did the agent use the Ignite UI Sass theming API (`palette()`, `theme()`) instead of hardcoding CSS custom properties? +- Did the agent use `@use 'igniteui-angular/theming'` (modern Sass module syntax) rather than deprecated `@import`? +- Did the agent include `core()` mixin before `theme()` mixin as required by the theming system? + +## Skill Routing & Reference Usage (0–0.3) +- Did the agent read the theming skill SKILL.md for theming guidance? +- Did the agent follow the correct theming sequence: palette → typography → theme? +- Did the agent check for MCP server availability before writing SCSS manually? +- If MCP tools were available, did the agent prefer using them over manual SCSS? + +## Idiomatic API Usage (0–0.25) +- Did the agent pass `$primary` and `$secondary` parameters to `palette()`? +- Did the agent pass a `$surface` color appropriate for a light theme? +- Did the agent configure typography with a font family? +- Did the agent pass the `$palette` variable to the `theme()` mixin? +- Did the agent use the `$schema` parameter or rely on the correct default schema? 
+ +## Code Quality (0–0.15) +- Is the SCSS well-structured and readable? +- Did the agent use `@use` with a namespace (e.g., `as *` or a custom namespace)? +- Did the agent avoid hallucinated function names or non-existent parameters? +- Did the agent avoid mixing Sass theming with manual CSS overrides unnecessarily? diff --git a/evals/tasks/theming-palette-generation/skills/igniteui-angular-theming/SKILL.md b/evals/tasks/theming-palette-generation/skills/igniteui-angular-theming/SKILL.md new file mode 120000 index 00000000000..05e9980f01a --- /dev/null +++ b/evals/tasks/theming-palette-generation/skills/igniteui-angular-theming/SKILL.md @@ -0,0 +1 @@ +../../../../../skills/igniteui-angular-theming/SKILL.md \ No newline at end of file diff --git a/evals/tasks/theming-palette-generation/solution/solve.sh b/evals/tasks/theming-palette-generation/solution/solve.sh new file mode 100755 index 00000000000..c032689ba43 --- /dev/null +++ b/evals/tasks/theming-palette-generation/solution/solve.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Reference solution for theming-palette-generation +# Proves the task is solvable and validates grader correctness + +set -euo pipefail + +# Write the themed styles.scss +cat > src/styles.scss << 'SCSS' +@use 'igniteui-angular/theming' as *; + +$custom-palette: palette( + $primary: #1976D2, + $secondary: #FF9800, + $surface: #FAFAFA, +); + +$custom-typography: typography( + $font-family: 'Roboto, "Helvetica Neue", sans-serif', +); + +@include core(); +@include typography($custom-typography); +@include theme( + $palette: $custom-palette, + $schema: $light-material-schema, +); +SCSS diff --git a/evals/tasks/theming-palette-generation/task.toml b/evals/tasks/theming-palette-generation/task.toml new file mode 100644 index 00000000000..459be454723 --- /dev/null +++ b/evals/tasks/theming-palette-generation/task.toml @@ -0,0 +1,26 @@ +version = "1.0" + +[metadata] +author_name = "Ignite UI Team" +difficulty = "medium" +category = "theming" +tags = 
["theming", "palette", "scss", "sass", "custom-theme"] + +[agent] +timeout_sec = 600.0 + +[environment] +build_timeout_sec = 300.0 +cpus = 2 +memory_mb = 4096 +storage_mb = 1000 + +[[graders]] +type = "deterministic" +command = "bash tests/test.sh" +weight = 0.6 + +[[graders]] +type = "llm_rubric" +rubric = "prompts/quality.md" +weight = 0.4 diff --git a/evals/tasks/theming-palette-generation/tests/test.sh b/evals/tasks/theming-palette-generation/tests/test.sh new file mode 100755 index 00000000000..2a992f39aea --- /dev/null +++ b/evals/tasks/theming-palette-generation/tests/test.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# Deterministic grader for theming-palette-generation +# Checks outcomes: correct SCSS structure, palette/theme calls present. Writes SCORE/TOTAL to logs/verifier/reward.txt and exits 1 unless every check passes. + +set -euo pipefail + +mkdir -p logs/verifier + +SCORE=0 +TOTAL=5 +DETAILS="" + +# Find the main styles file (could be styles.scss or another scss file) +STYLES_FILE=$(find src -name "styles.scss" -o -name "styles.sass" 2>/dev/null | head -1 || true) # "|| true": a failed lookup must reach the explicit reward-0 branch below instead of aborting under errexit/pipefail +if [ -z "$STYLES_FILE" ]; then + # Also check for any scss file that might contain the theme; grep exits 1 on no match, so mask it + STYLES_FILE=$(grep -rl "palette\|theme()" src/ --include="*.scss" 2>/dev/null | head -1 || true) +fi + +if [ -z "${STYLES_FILE:-}" ]; then + echo "0" > logs/verifier/reward.txt + printf "FAIL: No SCSS file with theming code found\n" + exit 1 +fi + +# --- Check 1: Import from igniteui-angular/theming --- +# Accepts @use or @import with either the OSS or licensed package path +THEMING_IMPORT_PATTERN="@(use|import) ['\"](@infragistics/)?igniteui-angular/theming['\"]" +LEGACY_IMPORT_PATTERN="@import ['\"]~igniteui-angular/lib/core/styles/themes" +if grep -qE "$THEMING_IMPORT_PATTERN|$LEGACY_IMPORT_PATTERN" "$STYLES_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Correct theming import found\n" +else + DETAILS="${DETAILS}FAIL: Missing import from igniteui-angular/theming\n" +fi + +# --- Check 2: palette() function call is present (arguments are not inspected) --- +if grep -qE 'palette\(' 
"$STYLES_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: palette() function call found\n" +else + DETAILS="${DETAILS}FAIL: No palette() function call found\n" +fi + +# --- Check 3: theme() mixin call (css-vars() is also accepted) --- +if grep -qE '@include.*theme\(|@include.*css-vars\(' "$STYLES_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: theme() mixin call found\n" +else + DETAILS="${DETAILS}FAIL: No theme() mixin call found\n" +fi + +# --- Check 4: core() mixin call must appear before theme() (compares first matching line numbers) --- +CORE_LINE=$(grep -nE '@include.*core\(' "$STYLES_FILE" 2>/dev/null | head -1 | cut -d: -f1 || true) +THEME_LINE=$(grep -nE '@include.*theme\(' "$STYLES_FILE" 2>/dev/null | head -1 | cut -d: -f1 || true) + +if [ -z "${CORE_LINE:-}" ]; then + DETAILS="${DETAILS}FAIL: No core() mixin call found\n" +elif [ -n "${THEME_LINE:-}" ] && [ "$CORE_LINE" -gt "$THEME_LINE" ]; then + DETAILS="${DETAILS}FAIL: core() must be called before theme() (core on line $CORE_LINE, theme on line $THEME_LINE)\n" +else + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: core() mixin call found before theme()\n" +fi + +# --- Check 5: No hardcoded CSS custom properties as the sole theming approach --- +# Allow CSS vars if palette() is also used, but fail if ONLY css vars without palette() +PALETTE_USED=$(grep -c 'palette(' "$STYLES_FILE" 2>/dev/null || true) # grep -c prints 0 itself on no match; only its exit status 1 needs masking +CSS_VARS_ONLY=$(grep -cE '^\s*--ig-' "$STYLES_FILE" 2>/dev/null || true) + +if [ "${PALETTE_USED:-0}" -gt 0 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Uses palette() function (not hardcoded CSS variables)\n" +elif [ "${CSS_VARS_ONLY:-0}" -gt 0 ]; then + DETAILS="${DETAILS}FAIL: Only hardcoded CSS custom properties found without palette()\n" +else + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: No hardcoded-only CSS variables approach\n" +fi + +# --- Calculate reward (fraction of checks passed, two decimals) --- +REWARD=$(echo "scale=2; $SCORE / $TOTAL" | bc) + +echo "$REWARD" > logs/verifier/reward.txt +printf "Score: %d/%d (%.0f%%)\n" "$SCORE" "$TOTAL" "$(echo 
"$REWARD * 100" | bc)" +printf '%b' "$DETAILS" + +if [ "$SCORE" -lt "$TOTAL" ]; then + exit 1 +fi