From 5b7cca0ebebb4e4e018fddb4f94ca949ff3ea810 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 07:05:56 +0000 Subject: [PATCH 01/17] Initial plan From 23aecf0adb50c983c55667e57baffbecfb2deca2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 07:12:41 +0000 Subject: [PATCH 02/17] feat: scaffold eval test suite with three skill tasks and CI workflow Add automated eval test suite for Angular Skills using the skill-eval framework: - grid-basic-setup task (igniteui-angular-grids skill) - component-combo-reactive-form task (igniteui-angular-components skill) - theming-palette-generation task (igniteui-angular-theming skill) - GitHub Actions workflow for CI integration - Baseline results JSON for regression comparison - README with local run and task authoring docs Co-authored-by: zdrawku <11193764+zdrawku@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 88 ++++++++++ .gitignore | 5 + evals/README.md | 155 ++++++++++++++++++ evals/package.json | 21 +++ evals/results/baseline.json | 36 ++++ .../environment/Dockerfile | 17 ++ .../instruction.md | 40 +++++ .../prompts/quality.md | 28 ++++ .../igniteui-angular-components/SKILL.md | 1 + .../solution/solve.sh | 55 +++++++ .../component-combo-reactive-form/task.toml | 26 +++ .../tests/test.sh | 100 +++++++++++ .../grid-basic-setup/environment/Dockerfile | 17 ++ evals/tasks/grid-basic-setup/instruction.md | 36 ++++ .../tasks/grid-basic-setup/prompts/quality.md | 25 +++ .../skills/igniteui-angular-grids/SKILL.md | 1 + .../tasks/grid-basic-setup/solution/solve.sh | 45 +++++ evals/tasks/grid-basic-setup/task.toml | 26 +++ evals/tasks/grid-basic-setup/tests/test.sh | 99 +++++++++++ .../environment/Dockerfile | 17 ++ .../theming-palette-generation/instruction.md | 27 +++ .../prompts/quality.md | 27 +++ .../skills/igniteui-angular-theming/SKILL.md | 1 + .../solution/solve.sh 
| 27 +++ .../theming-palette-generation/task.toml | 26 +++ .../theming-palette-generation/tests/test.sh | 82 +++++++++ 26 files changed, 1028 insertions(+) create mode 100644 .github/workflows/skill-eval.yml create mode 100644 evals/README.md create mode 100644 evals/package.json create mode 100644 evals/results/baseline.json create mode 100644 evals/tasks/component-combo-reactive-form/environment/Dockerfile create mode 100644 evals/tasks/component-combo-reactive-form/instruction.md create mode 100644 evals/tasks/component-combo-reactive-form/prompts/quality.md create mode 120000 evals/tasks/component-combo-reactive-form/skills/igniteui-angular-components/SKILL.md create mode 100755 evals/tasks/component-combo-reactive-form/solution/solve.sh create mode 100644 evals/tasks/component-combo-reactive-form/task.toml create mode 100755 evals/tasks/component-combo-reactive-form/tests/test.sh create mode 100644 evals/tasks/grid-basic-setup/environment/Dockerfile create mode 100644 evals/tasks/grid-basic-setup/instruction.md create mode 100644 evals/tasks/grid-basic-setup/prompts/quality.md create mode 120000 evals/tasks/grid-basic-setup/skills/igniteui-angular-grids/SKILL.md create mode 100755 evals/tasks/grid-basic-setup/solution/solve.sh create mode 100644 evals/tasks/grid-basic-setup/task.toml create mode 100755 evals/tasks/grid-basic-setup/tests/test.sh create mode 100644 evals/tasks/theming-palette-generation/environment/Dockerfile create mode 100644 evals/tasks/theming-palette-generation/instruction.md create mode 100644 evals/tasks/theming-palette-generation/prompts/quality.md create mode 120000 evals/tasks/theming-palette-generation/skills/igniteui-angular-theming/SKILL.md create mode 100755 evals/tasks/theming-palette-generation/solution/solve.sh create mode 100644 evals/tasks/theming-palette-generation/task.toml create mode 100755 evals/tasks/theming-palette-generation/tests/test.sh diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml 
new file mode 100644 index 00000000000..7e172c404a6 --- /dev/null +++ b/.github/workflows/skill-eval.yml @@ -0,0 +1,88 @@ +name: Skill Eval + +on: + pull_request: + paths: + - 'skills/**' + - 'evals/**' + +jobs: + eval: + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Install eval dependencies + working-directory: evals + run: npm install + + - name: Run skill evals + working-directory: evals + run: npx skill-eval _ --suite=all --trials=5 + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: skill-eval-results + path: evals/results/ + retention-days: 30 + + - name: Post summary comment + if: always() && github.event_name == 'pull_request' + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const path = require('path'); + + const resultsDir = 'evals/results'; + let summary = '## 📊 Skill Eval Results\n\n'; + + try { + const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json')); + if (files.length === 0) { + summary += '> ⚠️ No eval results found. The eval run may have failed.\n'; + } else { + summary += '| Task | Pass Rate | pass@5 | Status |\n'; + summary += '|---|---|---|---|\n'; + + for (const file of files) { + try { + const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8')); + const taskName = data.task || file.replace('.json', ''); + const passRate = data.passRate != null ? `${(data.passRate * 100).toFixed(0)}%` : 'N/A'; + const passAtK = data.passAtK != null ? `${(data.passAtK * 100).toFixed(0)}%` : 'N/A'; + const status = data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? 
'⚠️' : '❌'; + summary += `| ${taskName} | ${passRate} | ${passAtK} | ${status} |\n`; + } catch (e) { + summary += `| ${file} | Error | Error | ❌ |\n`; + } + } + + summary += '\n### Thresholds\n'; + summary += '- ✅ `pass@5 ≥ 80%` — merge gate passed\n'; + summary += '- ⚠️ `pass@5 ≥ 60%` — needs investigation\n'; + summary += '- ❌ `pass@5 < 60%` — blocks merge for affected skill\n'; + } + } catch (e) { + summary += `> ⚠️ Could not read results: ${e.message}\n`; + } + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: summary, + }); diff --git a/.gitignore b/.gitignore index a4542ab1403..b0820022330 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,8 @@ extras/docs/themes/sassdoc/sassdoc/* # Localization sources i18nRepo + +# Eval artifacts (keep baseline results) +evals/node_modules +evals/results/*.json +!evals/results/baseline.json diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 00000000000..631ea8741c9 --- /dev/null +++ b/evals/README.md @@ -0,0 +1,155 @@ +# Ignite UI for Angular — Skill Evals + +Automated evaluation suite for the Ignite UI for Angular agent skills. Uses the +[skill-eval](https://github.com/mgechev/skill-eval) framework to measure skill +quality, detect regressions, and gate merges. 
+ +## Overview + +The suite tests three skills: + +| Skill | Task ID | What it tests | +|---|---|---| +| `igniteui-angular-grids` | `grid-basic-setup` | Flat grid with sorting and pagination on flat employee data | +| `igniteui-angular-components` | `component-combo-reactive-form` | Multi-select combo bound to a reactive form control | +| `igniteui-angular-theming` | `theming-palette-generation` | Custom branded palette with `palette()` and `theme()` | + +Each task includes: + +- **`instruction.md`** — the prompt given to the agent +- **`tests/test.sh`** — deterministic grader (file checks, compilation, lint) +- **`prompts/quality.md`** — LLM rubric grader (intent routing, API usage) +- **`solution/solve.sh`** — reference solution for baseline validation +- **`environment/Dockerfile`** — isolated environment for agent execution +- **`skills/`** — symlinked or copied skill files under test + +## Prerequisites + +- Node.js 20+ +- Docker (for isolated agent execution) +- An API key for the agent provider (Gemini or Anthropic) + +## Running Evals Locally + +### Install dependencies + +```bash +cd evals +npm install +``` + +### Run a single task + +```bash +# Gemini (default) +GEMINI_API_KEY=your-key npm run eval -- grid-basic-setup + +# Claude +ANTHROPIC_API_KEY=your-key npm run eval -- grid-basic-setup --agent=claude +``` + +### Run all tasks + +```bash +GEMINI_API_KEY=your-key npm run eval:all +``` + +### Options + +```bash +# Adjust trials (default: 5) +npm run eval -- grid-basic-setup --trials=5 + +# Run locally without Docker +npm run eval -- grid-basic-setup --provider=local + +# Validate graders against the reference solution +npm run eval -- grid-basic-setup --validate --provider=local + +# Run multiple trials in parallel +npm run eval -- grid-basic-setup --parallel=3 +``` + +### Preview results + +```bash +# CLI report +npm run preview + +# Web UI at http://localhost:3847 +npm run preview:browser +``` + +## Adding a New Task + +1. 
Create a directory under `evals/tasks//` with the standard structure: + + ``` + tasks// + ├── task.toml # Config: graders, timeouts, resource limits + ├── instruction.md # Agent prompt + ├── environment/Dockerfile # Container setup + ├── tests/test.sh # Deterministic grader + ├── prompts/quality.md # LLM rubric grader + ├── solution/solve.sh # Reference solution + └── skills/ # Skill files under test + └── /SKILL.md + ``` + +2. Write a clear, unambiguous `instruction.md` that tells the agent exactly what + to build. + +3. Write `tests/test.sh` to check **outcomes** (files exist, project compiles, + correct selectors are present) rather than specific steps. + +4. Write `prompts/quality.md` with rubric dimensions that sum to 1.0. + +5. Write `solution/solve.sh` — a shell script that proves the task is solvable + and validates that the graders work correctly. + +6. Validate graders before submitting: + + ```bash + npm run eval -- --validate --provider=local + ``` + +## Pass / Fail Thresholds + +Following [Anthropic's recommendations](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents): + +| Metric | Threshold | Effect | +|---|---|---| +| `pass@5 ≥ 80%` | **Merge gate** | At least 1 success in 5 trials required | +| `pass^5 ≥ 60%` | **Tracked** | Flags flaky skills for investigation | +| `pass@5 < 60%` | **Blocks merge** | On PRs touching the relevant skill | + +## CI Integration + +The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs +automatically on PRs that modify `skills/**` or `evals/**`. It: + +1. Checks out the repo +2. Installs eval dependencies +3. Runs all tasks with 5 trials +4. Uploads results as an artifact +5. 
Posts a summary comment on the PR + +## Grading Strategy + +**Deterministic grader (60% weight)** — checks: +- Project builds without errors +- Correct Ignite UI selector is present in the generated template +- Required imports exist +- No use of forbidden alternatives + +**LLM rubric grader (40% weight)** — evaluates: +- Correct intent routing +- Idiomatic API usage +- Absence of hallucinated APIs +- Following the skill's guidance + +## Results + +Baseline results are stored in `evals/results/baseline.json` and used for +regression comparison on PRs. The CI workflow uploads per-run results as +GitHub Actions artifacts. diff --git a/evals/package.json b/evals/package.json new file mode 100644 index 00000000000..9a945614306 --- /dev/null +++ b/evals/package.json @@ -0,0 +1,21 @@ +{ + "name": "igniteui-angular-skill-evals", + "version": "1.0.0", + "description": "Evaluation suite for Ignite UI for Angular agent skills", + "private": true, + "scripts": { + "eval": "npx skill-eval", + "eval:grid": "npx skill-eval grid-basic-setup", + "eval:combo": "npx skill-eval component-combo-reactive-form", + "eval:theming": "npx skill-eval theming-palette-generation", + "eval:all": "npx skill-eval _ --suite=all", + "preview": "npx skill-eval preview", + "preview:browser": "npx skill-eval preview browser" + }, + "dependencies": { + "skill-eval": "^1.0.0" + }, + "engines": { + "node": ">=20.0.0" + } +} diff --git a/evals/results/baseline.json b/evals/results/baseline.json new file mode 100644 index 00000000000..0bdcc9d6469 --- /dev/null +++ b/evals/results/baseline.json @@ -0,0 +1,36 @@ +{ + "generated_at": "2026-03-08T07:00:00.000Z", + "framework_version": "1.0.0", + "description": "Initial baseline results for skill evals. 
Actual scores will be populated after the first full eval run with an API key.", + "thresholds": { + "pass_at_5_merge_gate": 0.8, + "pass_at_5_block": 0.6, + "pass_pow_5_tracked": 0.6 + }, + "tasks": { + "grid-basic-setup": { + "skill": "igniteui-angular-grids", + "trials": 5, + "pass_rate": null, + "pass_at_5": null, + "pass_pow_5": null, + "status": "pending_first_run" + }, + "component-combo-reactive-form": { + "skill": "igniteui-angular-components", + "trials": 5, + "pass_rate": null, + "pass_at_5": null, + "pass_pow_5": null, + "status": "pending_first_run" + }, + "theming-palette-generation": { + "skill": "igniteui-angular-theming", + "trials": 5, + "pass_rate": null, + "pass_at_5": null, + "pass_pow_5": null, + "status": "pending_first_run" + } + } +} diff --git a/evals/tasks/component-combo-reactive-form/environment/Dockerfile b/evals/tasks/component-combo-reactive-form/environment/Dockerfile new file mode 100644 index 00000000000..4cfd43a762c --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/environment/Dockerfile @@ -0,0 +1,17 @@ +FROM node:20-slim + +WORKDIR /workspace + +RUN npm install -g @angular/cli@latest + +RUN ng new eval-app --skip-git --skip-install --style=scss --ssr=false && \ + cd eval-app && \ + npm install && \ + npm install igniteui-angular + +WORKDIR /workspace/eval-app + +COPY . . + +RUN mkdir -p logs/verifier +CMD ["bash"] diff --git a/evals/tasks/component-combo-reactive-form/instruction.md b/evals/tasks/component-combo-reactive-form/instruction.md new file mode 100644 index 00000000000..9e02aba05c5 --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/instruction.md @@ -0,0 +1,40 @@ +# Task: Add a Multi-Select Combo in a Reactive Form + +You are working in an Angular 20+ project that already has `igniteui-angular` installed and a theme applied. + +## Requirements + +Create a `UserSettingsComponent` with a reactive form that includes a multi-select combo for choosing notification channels. + +1. 
**Component location**: `src/app/user-settings/user-settings.component.ts` (with its template) + +2. **Form structure**: Create a reactive form (`FormGroup`) with a `notificationChannels` control + +3. **Data source**: Use the following list of notification channels: + + ```typescript + channels = [ + { id: 1, name: 'Email', icon: 'email' }, + { id: 2, name: 'SMS', icon: 'sms' }, + { id: 3, name: 'Push Notification', icon: 'notifications' }, + { id: 4, name: 'Slack', icon: 'chat' }, + { id: 5, name: 'Microsoft Teams', icon: 'groups' }, + ]; + ``` + +4. **Combo configuration**: + - Use the Ignite UI for Angular Combo component for multi-selection + - Bind it to the `notificationChannels` form control + - Display the `name` field in the dropdown + - Use the `id` field as the value key + +5. **Form validation**: The `notificationChannels` control must be required (at least one channel must be selected) + +6. **Submit button**: Add a submit button that is disabled when the form is invalid + +## Constraints + +- Use the Ignite UI `igx-combo` component — do NOT use a native ``, Angular Material `mat-select`, or other third-party select components? +- Did the agent correctly identify that multi-select requires the Combo component, not the Select component? + +## Skill Routing & Reference File Usage (0–0.3) +- Did the agent read the components skill SKILL.md to identify the correct component? +- Did the agent read `references/form-controls.md` for Combo API details? +- Did the agent follow the mandatory protocol (identify component → read references → produce output)? +- Did the agent avoid writing code from memory without consulting references? + +## Idiomatic API Usage (0–0.25) +- Did the agent bind data using `[data]` input on the combo? +- Did the agent configure `[displayKey]` and `[valueKey]` correctly? +- Did the agent use `[formControlName]` or `[formControl]` to bind to the reactive form? +- Did the agent import from the correct igniteui-angular entry point? 
+- Did the agent import `ReactiveFormsModule` or use standalone form directives? + +## Code Quality (0–0.15) +- Is the component standalone with `ChangeDetectionStrategy.OnPush`? +- Did the agent set up form validation (required validator)? +- Did the agent avoid hallucinated API names or non-existent inputs/outputs? +- Is the code clean, well-structured, and following Angular best practices? diff --git a/evals/tasks/component-combo-reactive-form/skills/igniteui-angular-components/SKILL.md b/evals/tasks/component-combo-reactive-form/skills/igniteui-angular-components/SKILL.md new file mode 120000 index 00000000000..40a2d1a6e84 --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/skills/igniteui-angular-components/SKILL.md @@ -0,0 +1 @@ +../../../../../skills/igniteui-angular-components/SKILL.md \ No newline at end of file diff --git a/evals/tasks/component-combo-reactive-form/solution/solve.sh b/evals/tasks/component-combo-reactive-form/solution/solve.sh new file mode 100755 index 00000000000..6b284457490 --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/solution/solve.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Reference solution for component-combo-reactive-form +# Proves the task is solvable and validates grader correctness + +set -euo pipefail + +mkdir -p src/app/user-settings + +# Create the component TypeScript file +cat > src/app/user-settings/user-settings.component.ts << 'EOF' +import { ChangeDetectionStrategy, Component } from '@angular/core'; +import { FormGroup, FormControl, Validators, ReactiveFormsModule } from '@angular/forms'; +import { IgxComboComponent } from 'igniteui-angular'; + +@Component({ + selector: 'app-user-settings', + templateUrl: './user-settings.component.html', + changeDetection: ChangeDetectionStrategy.OnPush, + imports: [ReactiveFormsModule, IgxComboComponent], +}) +export class UserSettingsComponent { + channels = [ + { id: 1, name: 'Email', icon: 'email' }, + { id: 2, name: 'SMS', icon: 'sms' }, + { id: 3, name: 
'Push Notification', icon: 'notifications' }, + { id: 4, name: 'Slack', icon: 'chat' }, + { id: 5, name: 'Microsoft Teams', icon: 'groups' }, + ]; + + settingsForm = new FormGroup({ + notificationChannels: new FormControl([], Validators.required), + }); + + onSubmit() { + if (this.settingsForm.valid) { + console.log('Selected channels:', this.settingsForm.value.notificationChannels); + } + } +} +EOF + +# Create the template +cat > src/app/user-settings/user-settings.component.html << 'EOF' +
+ + + +
+EOF diff --git a/evals/tasks/component-combo-reactive-form/task.toml b/evals/tasks/component-combo-reactive-form/task.toml new file mode 100644 index 00000000000..111a254676d --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/task.toml @@ -0,0 +1,26 @@ +version = "1.0" + +[metadata] +author_name = "Ignite UI Team" +difficulty = "medium" +category = "component-forms" +tags = ["combo", "reactive-forms", "multi-select", "igx-combo"] + +[agent] +timeout_sec = 600.0 + +[environment] +build_timeout_sec = 300.0 +cpus = 2 +memory_mb = 4096 +storage_mb = 1000 + +[[graders]] +type = "deterministic" +command = "bash tests/test.sh" +weight = 0.6 + +[[graders]] +type = "llm_rubric" +rubric = "prompts/quality.md" +weight = 0.4 diff --git a/evals/tasks/component-combo-reactive-form/tests/test.sh b/evals/tasks/component-combo-reactive-form/tests/test.sh new file mode 100755 index 00000000000..23022ab241b --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/tests/test.sh @@ -0,0 +1,100 @@ +#!/bin/bash +# Deterministic grader for component-combo-reactive-form +# Checks outcomes: correct files exist, correct selectors, reactive form usage + +set -euo pipefail + +mkdir -p logs/verifier + +SCORE=0 +TOTAL=5 +DETAILS="" + +# --- Check 1: Component file exists --- +COMPONENT_FILE=$(find src -name "user-settings.component.ts" 2>/dev/null | head -1) +if [ -n "$COMPONENT_FILE" ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: user-settings.component.ts exists\n" +else + DETAILS="${DETAILS}FAIL: user-settings.component.ts not found\n" +fi + +# --- Check 2: igx-combo selector is present in the template --- +TEMPLATE_FILE=$(find src -name "user-settings.component.html" 2>/dev/null | head -1) +COMBO_FOUND=0 + +if [ -n "${TEMPLATE_FILE:-}" ] && grep -q "igx-combo" "$TEMPLATE_FILE" 2>/dev/null; then + COMBO_FOUND=1 +elif [ -n "${COMPONENT_FILE:-}" ] && grep -q "igx-combo" "$COMPONENT_FILE" 2>/dev/null; then + COMBO_FOUND=1 +fi + +if [ "$COMBO_FOUND" -eq 1 ]; then + 
SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: igx-combo selector found\n" +else + DETAILS="${DETAILS}FAIL: igx-combo selector not found in template\n" +fi + +# --- Check 3: Reactive form usage (FormGroup, FormControl, or formControlName) --- +REACTIVE_FOUND=0 +SEARCH_FILES="" +[ -n "${TEMPLATE_FILE:-}" ] && SEARCH_FILES="$TEMPLATE_FILE" +[ -n "${COMPONENT_FILE:-}" ] && SEARCH_FILES="$SEARCH_FILES $COMPONENT_FILE" + +for f in $SEARCH_FILES; do + if grep -qE 'FormGroup|FormControl|formControlName|formControl|ReactiveFormsModule' "$f" 2>/dev/null; then + REACTIVE_FOUND=1 + break + fi +done + +if [ "$REACTIVE_FOUND" -eq 1 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Reactive form usage found\n" +else + DETAILS="${DETAILS}FAIL: No reactive form usage found\n" +fi + +# --- Check 4: No forbidden alternatives --- +ALL_FILES=$(find src -name "*.ts" -o -name "*.html" 2>/dev/null) +FORBIDDEN=0 +for f in $ALL_FILES; do + if grep -qE ' ].*multiple|mat-select|MatSelectModule|igx-select' "$f" 2>/dev/null; then + FORBIDDEN=1 + break + fi +done + +if [ "$FORBIDDEN" -eq 0 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: No forbidden alternatives found\n" +else + DETAILS="${DETAILS}FAIL: Forbidden alternative (native select, mat-select, igx-select) detected\n" +fi + +# --- Check 5: Correct import from igniteui-angular --- +IMPORT_FOUND=0 +if [ -n "${COMPONENT_FILE:-}" ]; then + if grep -qE "from ['\"]igniteui-angular|from ['\"]@infragistics/igniteui-angular" "$COMPONENT_FILE" 2>/dev/null; then + IMPORT_FOUND=1 + fi +fi + +if [ "$IMPORT_FOUND" -eq 1 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: igniteui-angular import found\n" +else + DETAILS="${DETAILS}FAIL: No igniteui-angular import found\n" +fi + +# --- Calculate reward --- +REWARD=$(echo "scale=2; $SCORE / $TOTAL" | bc) + +echo "$REWARD" > logs/verifier/reward.txt +printf "Score: %d/%d (%.0f%%)\n" "$SCORE" "$TOTAL" "$(echo "$REWARD * 100" | bc)" +printf "$DETAILS" + +if [ "$SCORE" -lt "$TOTAL" 
]; then + exit 1 +fi diff --git a/evals/tasks/grid-basic-setup/environment/Dockerfile b/evals/tasks/grid-basic-setup/environment/Dockerfile new file mode 100644 index 00000000000..4cfd43a762c --- /dev/null +++ b/evals/tasks/grid-basic-setup/environment/Dockerfile @@ -0,0 +1,17 @@ +FROM node:20-slim + +WORKDIR /workspace + +RUN npm install -g @angular/cli@latest + +RUN ng new eval-app --skip-git --skip-install --style=scss --ssr=false && \ + cd eval-app && \ + npm install && \ + npm install igniteui-angular + +WORKDIR /workspace/eval-app + +COPY . . + +RUN mkdir -p logs/verifier +CMD ["bash"] diff --git a/evals/tasks/grid-basic-setup/instruction.md b/evals/tasks/grid-basic-setup/instruction.md new file mode 100644 index 00000000000..3a9880564e5 --- /dev/null +++ b/evals/tasks/grid-basic-setup/instruction.md @@ -0,0 +1,36 @@ +# Task: Add a Data Grid with Sorting and Pagination + +You are working in an Angular 20+ project that already has `igniteui-angular` installed and a theme applied. + +## Requirements + +Add a data grid to the `EmployeeListComponent` that displays employee data with the following features: + +1. 
**Data source**: Use the following flat employee data (add it as a property in the component): + + ```typescript + employees = [ + { id: 1, name: 'Alice Johnson', department: 'Engineering', salary: 95000, hireDate: new Date('2020-03-15') }, + { id: 2, name: 'Bob Smith', department: 'Marketing', salary: 72000, hireDate: new Date('2019-07-22') }, + { id: 3, name: 'Carol Davis', department: 'Engineering', salary: 105000, hireDate: new Date('2018-01-10') }, + { id: 4, name: 'David Wilson', department: 'Sales', salary: 68000, hireDate: new Date('2021-11-05') }, + { id: 5, name: 'Eva Martinez', department: 'Engineering', salary: 98000, hireDate: new Date('2020-09-18') }, + { id: 6, name: 'Frank Brown', department: 'Marketing', salary: 75000, hireDate: new Date('2017-04-30') }, + { id: 7, name: 'Grace Lee', department: 'Sales', salary: 82000, hireDate: new Date('2019-12-01') }, + { id: 8, name: 'Henry Taylor', department: 'Engineering', salary: 110000, hireDate: new Date('2016-06-14') }, + ]; + ``` + +2. **Columns**: Display all fields — `id`, `name`, `department`, `salary`, `hireDate` + +3. **Sorting**: Enable sorting on all columns + +4. **Pagination**: Add a paginator with a page size of 5 + +5. **Component**: Create or edit the file at `src/app/employee-list/employee-list.component.ts` (with its template and styles) + +## Constraints + +- Use the Ignite UI for Angular `igx-grid` component — do NOT use a native HTML ``, Angular Material table, or any other grid library. +- Import from the correct `igniteui-angular` entry point. +- The component must be standalone and use `ChangeDetectionStrategy.OnPush`. diff --git a/evals/tasks/grid-basic-setup/prompts/quality.md b/evals/tasks/grid-basic-setup/prompts/quality.md new file mode 100644 index 00000000000..fc65eede86f --- /dev/null +++ b/evals/tasks/grid-basic-setup/prompts/quality.md @@ -0,0 +1,25 @@ +# Grid Basic Setup — LLM Rubric + +Evaluate the agent's approach to adding a flat data grid with sorting and pagination. 
+ +## Correct Grid Type Selection (0–0.3) +- Did the agent choose `igx-grid` (Flat Grid) for the flat employee data? +- Did the agent avoid `igx-tree-grid`, `igx-hierarchical-grid`, or `igx-pivot-grid` — which are wrong for flat, non-hierarchical data? +- Did the agent avoid native HTML `
`, Angular Material `mat-table`, or other third-party grids? + +## Skill Routing & Reference File Usage (0–0.3) +- Did the agent read the grids skill SKILL.md to identify the correct grid type? +- Did the agent read the relevant reference files (`structure.md` for columns/sorting, `paging-remote.md` for pagination) before writing code? +- Did the agent follow the mandatory protocol (identify grid type → read references → produce output)? + +## Idiomatic API Usage (0–0.25) +- Did the agent bind data correctly using the `[data]` input? +- Did the agent use `igx-column` elements with correct `[field]` bindings for each data field? +- Did the agent enable sorting correctly (e.g., `[sortable]="true"` on columns or grid-level `[allowSorting]`)? +- Did the agent import from the correct entry point (`igniteui-angular/grids/grid`)? +- Did the agent use `IGX_GRID_DIRECTIVES` or individual component imports? + +## Code Quality (0–0.15) +- Is the component standalone with `ChangeDetectionStrategy.OnPush`? +- Did the agent avoid hallucinated API names or non-existent inputs/outputs? +- Is the code clean and well-structured? 
diff --git a/evals/tasks/grid-basic-setup/skills/igniteui-angular-grids/SKILL.md b/evals/tasks/grid-basic-setup/skills/igniteui-angular-grids/SKILL.md new file mode 120000 index 00000000000..0ba573d65d2 --- /dev/null +++ b/evals/tasks/grid-basic-setup/skills/igniteui-angular-grids/SKILL.md @@ -0,0 +1 @@ +../../../../../skills/igniteui-angular-grids/SKILL.md \ No newline at end of file diff --git a/evals/tasks/grid-basic-setup/solution/solve.sh b/evals/tasks/grid-basic-setup/solution/solve.sh new file mode 100755 index 00000000000..0466ce7d623 --- /dev/null +++ b/evals/tasks/grid-basic-setup/solution/solve.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Reference solution for grid-basic-setup +# Proves the task is solvable and validates grader correctness + +set -euo pipefail + +mkdir -p src/app/employee-list + +# Create the component TypeScript file +cat > src/app/employee-list/employee-list.component.ts << 'EOF' +import { ChangeDetectionStrategy, Component } from '@angular/core'; +import { IGX_GRID_DIRECTIVES } from 'igniteui-angular/grids/grid'; +import { IgxPaginatorComponent } from 'igniteui-angular/grids/grid'; + +@Component({ + selector: 'app-employee-list', + templateUrl: './employee-list.component.html', + changeDetection: ChangeDetectionStrategy.OnPush, + imports: [IGX_GRID_DIRECTIVES, IgxPaginatorComponent], +}) +export class EmployeeListComponent { + employees = [ + { id: 1, name: 'Alice Johnson', department: 'Engineering', salary: 95000, hireDate: new Date('2020-03-15') }, + { id: 2, name: 'Bob Smith', department: 'Marketing', salary: 72000, hireDate: new Date('2019-07-22') }, + { id: 3, name: 'Carol Davis', department: 'Engineering', salary: 105000, hireDate: new Date('2018-01-10') }, + { id: 4, name: 'David Wilson', department: 'Sales', salary: 68000, hireDate: new Date('2021-11-05') }, + { id: 5, name: 'Eva Martinez', department: 'Engineering', salary: 98000, hireDate: new Date('2020-09-18') }, + { id: 6, name: 'Frank Brown', department: 'Marketing', salary: 
75000, hireDate: new Date('2017-04-30') }, + { id: 7, name: 'Grace Lee', department: 'Sales', salary: 82000, hireDate: new Date('2019-12-01') }, + { id: 8, name: 'Henry Taylor', department: 'Engineering', salary: 110000, hireDate: new Date('2016-06-14') }, + ]; +} +EOF + +# Create the template +cat > src/app/employee-list/employee-list.component.html << 'EOF' + + + + + + + + +EOF diff --git a/evals/tasks/grid-basic-setup/task.toml b/evals/tasks/grid-basic-setup/task.toml new file mode 100644 index 00000000000..07e25fdd0aa --- /dev/null +++ b/evals/tasks/grid-basic-setup/task.toml @@ -0,0 +1,26 @@ +version = "1.0" + +[metadata] +author_name = "Ignite UI Team" +difficulty = "medium" +category = "grid-setup" +tags = ["grid", "flat-grid", "sorting", "pagination", "igx-grid"] + +[agent] +timeout_sec = 600.0 + +[environment] +build_timeout_sec = 300.0 +cpus = 2 +memory_mb = 4096 +storage_mb = 1000 + +[[graders]] +type = "deterministic" +command = "bash tests/test.sh" +weight = 0.6 + +[[graders]] +type = "llm_rubric" +rubric = "prompts/quality.md" +weight = 0.4 diff --git a/evals/tasks/grid-basic-setup/tests/test.sh b/evals/tasks/grid-basic-setup/tests/test.sh new file mode 100755 index 00000000000..455c56ae96a --- /dev/null +++ b/evals/tasks/grid-basic-setup/tests/test.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Deterministic grader for grid-basic-setup +# Checks outcomes: correct files exist, project compiles, correct selectors used + +set -euo pipefail + +mkdir -p logs/verifier + +SCORE=0 +TOTAL=5 +DETAILS="" + +# --- Check 1: Component file exists --- +COMPONENT_FILE=$(find src -name "employee-list.component.ts" 2>/dev/null | head -1) +if [ -n "$COMPONENT_FILE" ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: employee-list.component.ts exists\n" +else + DETAILS="${DETAILS}FAIL: employee-list.component.ts not found\n" +fi + +# --- Check 2: igx-grid selector is present in the template --- +TEMPLATE_FILE=$(find src -name "employee-list.component.html" 2>/dev/null | head 
-1) +INLINE_TEMPLATE="" +if [ -z "$TEMPLATE_FILE" ] && [ -n "$COMPONENT_FILE" ]; then + # Check for inline template + INLINE_TEMPLATE=$(grep -l "igx-grid" "$COMPONENT_FILE" 2>/dev/null || true) +fi + +if [ -n "$TEMPLATE_FILE" ] && grep -q "igx-grid" "$TEMPLATE_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: igx-grid selector found in template\n" +elif [ -n "$INLINE_TEMPLATE" ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: igx-grid selector found in inline template\n" +else + DETAILS="${DETAILS}FAIL: igx-grid selector not found in template\n" +fi + +# --- Check 3: Correct import from igniteui-angular entry point --- +if [ -n "$COMPONENT_FILE" ]; then + if grep -qE "from ['\"]igniteui-angular/grids/grid['\"]|from ['\"]@infragistics/igniteui-angular/grids/grid['\"]" "$COMPONENT_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Correct grid entry-point import found\n" + else + DETAILS="${DETAILS}FAIL: Missing import from igniteui-angular/grids/grid entry point\n" + fi +else + DETAILS="${DETAILS}FAIL: Cannot check imports — component file not found\n" +fi + +# --- Check 4: No forbidden alternatives --- +ALL_TS_FILES=$(find src -name "*.ts" -o -name "*.html" 2>/dev/null) +FORBIDDEN=0 +for f in $ALL_TS_FILES; do + # Check for native table, Angular Material table, or other grid libs + if grep -qE ' ]|MatTableModule|mat-table|ag-grid|kendo-grid' "$f" 2>/dev/null; then + FORBIDDEN=1 + break + fi +done + +if [ "$FORBIDDEN" -eq 0 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: No forbidden alternatives found\n" +else + DETAILS="${DETAILS}FAIL: Forbidden alternative (native table, Material table, etc.) 
detected\n" +fi + +# --- Check 5: Pagination is configured --- +PAGING_FOUND=0 +SEARCH_FILES="" +[ -n "$TEMPLATE_FILE" ] && SEARCH_FILES="$TEMPLATE_FILE" +[ -n "$COMPONENT_FILE" ] && SEARCH_FILES="$SEARCH_FILES $COMPONENT_FILE" + +for f in $SEARCH_FILES; do + if grep -qE 'igx-paginator|IgxPaginatorComponent|paging|perPage|\[perPage\]' "$f" 2>/dev/null; then + PAGING_FOUND=1 + break + fi +done + +if [ "$PAGING_FOUND" -eq 1 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Pagination configuration found\n" +else + DETAILS="${DETAILS}FAIL: No pagination configuration found\n" +fi + +# --- Calculate reward --- +REWARD=$(echo "scale=2; $SCORE / $TOTAL" | bc) + +echo "$REWARD" > logs/verifier/reward.txt +printf "Score: %d/%d (%.0f%%)\n" "$SCORE" "$TOTAL" "$(echo "$REWARD * 100" | bc)" +printf "$DETAILS" + +if [ "$SCORE" -lt "$TOTAL" ]; then + exit 1 +fi diff --git a/evals/tasks/theming-palette-generation/environment/Dockerfile b/evals/tasks/theming-palette-generation/environment/Dockerfile new file mode 100644 index 00000000000..4cfd43a762c --- /dev/null +++ b/evals/tasks/theming-palette-generation/environment/Dockerfile @@ -0,0 +1,17 @@ +FROM node:20-slim + +WORKDIR /workspace + +RUN npm install -g @angular/cli@latest + +RUN ng new eval-app --skip-git --skip-install --style=scss --ssr=false && \ + cd eval-app && \ + npm install && \ + npm install igniteui-angular + +WORKDIR /workspace/eval-app + +COPY . . + +RUN mkdir -p logs/verifier +CMD ["bash"] diff --git a/evals/tasks/theming-palette-generation/instruction.md b/evals/tasks/theming-palette-generation/instruction.md new file mode 100644 index 00000000000..cb3a03360e3 --- /dev/null +++ b/evals/tasks/theming-palette-generation/instruction.md @@ -0,0 +1,27 @@ +# Task: Create a Custom Branded Theme + +You are working in an Angular 20+ project that already has `igniteui-angular` installed with Sass support enabled. 
+ +## Requirements + +Create a custom Ignite UI for Angular theme with a blue primary color and orange secondary color. + +1. **Theme file location**: `src/styles.scss` (or update the existing global styles file) + +2. **Palette**: + - Primary color: `#1976D2` (Material Blue) + - Secondary color: `#FF9800` (Material Orange) + - Surface color appropriate for a light theme + +3. **Theme application**: + - Generate a complete theme using the Ignite UI theming functions + - Apply the theme globally + +4. **Typography**: Include typography configuration with a sans-serif font family + +## Constraints + +- Use the Ignite UI Sass theming API (`palette()`, `theme()`) — do NOT hardcode individual CSS custom properties or use plain CSS variables to replicate the palette. +- Import from `igniteui-angular/theming` (or `@infragistics/igniteui-angular/theming` for licensed packages). +- The theme must include both `palette()` and `theme()` function calls. +- Include `core()` mixin invocation before the `theme()` mixin. diff --git a/evals/tasks/theming-palette-generation/prompts/quality.md b/evals/tasks/theming-palette-generation/prompts/quality.md new file mode 100644 index 00000000000..d5400180ba6 --- /dev/null +++ b/evals/tasks/theming-palette-generation/prompts/quality.md @@ -0,0 +1,27 @@ +# Theming Palette Generation — LLM Rubric + +Evaluate the agent's approach to creating a custom branded Ignite UI theme. + +## Correct Theming Approach (0–0.3) +- Did the agent use the Ignite UI Sass theming API (`palette()`, `theme()`) instead of hardcoding CSS custom properties? +- Did the agent use `@use 'igniteui-angular/theming'` (modern Sass module syntax) rather than deprecated `@import`? +- Did the agent include `core()` mixin before `theme()` mixin as required by the theming system? + +## Skill Routing & Reference Usage (0–0.3) +- Did the agent read the theming skill SKILL.md for theming guidance? +- Did the agent follow the correct theming sequence: palette → typography → theme? 
+- Did the agent check for MCP server availability before writing SCSS manually? +- If MCP tools were available, did the agent prefer using them over manual SCSS? + +## Idiomatic API Usage (0–0.25) +- Did the agent pass `$primary` and `$secondary` parameters to `palette()`? +- Did the agent pass a `$surface` color appropriate for a light theme? +- Did the agent configure typography with a font family? +- Did the agent pass the `$palette` variable to the `theme()` mixin? +- Did the agent use the `$schema` parameter or rely on the correct default schema? + +## Code Quality (0–0.15) +- Is the SCSS well-structured and readable? +- Did the agent use `@use` with a namespace (e.g., `as *` or a custom namespace)? +- Did the agent avoid hallucinated function names or non-existent parameters? +- Did the agent avoid mixing Sass theming with manual CSS overrides unnecessarily? diff --git a/evals/tasks/theming-palette-generation/skills/igniteui-angular-theming/SKILL.md b/evals/tasks/theming-palette-generation/skills/igniteui-angular-theming/SKILL.md new file mode 120000 index 00000000000..05e9980f01a --- /dev/null +++ b/evals/tasks/theming-palette-generation/skills/igniteui-angular-theming/SKILL.md @@ -0,0 +1 @@ +../../../../../skills/igniteui-angular-theming/SKILL.md \ No newline at end of file diff --git a/evals/tasks/theming-palette-generation/solution/solve.sh b/evals/tasks/theming-palette-generation/solution/solve.sh new file mode 100755 index 00000000000..c032689ba43 --- /dev/null +++ b/evals/tasks/theming-palette-generation/solution/solve.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Reference solution for theming-palette-generation +# Proves the task is solvable and validates grader correctness + +set -euo pipefail + +# Write the themed styles.scss +cat > src/styles.scss << 'SCSS' +@use 'igniteui-angular/theming' as *; + +$custom-palette: palette( + $primary: #1976D2, + $secondary: #FF9800, + $surface: #FAFAFA, +); + +$custom-typography: typography( + $font-family: 'Roboto, 
"Helvetica Neue", sans-serif', +); + +@include core(); +@include typography($custom-typography); +@include theme( + $palette: $custom-palette, + $schema: $light-material-schema, +); +SCSS diff --git a/evals/tasks/theming-palette-generation/task.toml b/evals/tasks/theming-palette-generation/task.toml new file mode 100644 index 00000000000..459be454723 --- /dev/null +++ b/evals/tasks/theming-palette-generation/task.toml @@ -0,0 +1,26 @@ +version = "1.0" + +[metadata] +author_name = "Ignite UI Team" +difficulty = "medium" +category = "theming" +tags = ["theming", "palette", "scss", "sass", "custom-theme"] + +[agent] +timeout_sec = 600.0 + +[environment] +build_timeout_sec = 300.0 +cpus = 2 +memory_mb = 4096 +storage_mb = 1000 + +[[graders]] +type = "deterministic" +command = "bash tests/test.sh" +weight = 0.6 + +[[graders]] +type = "llm_rubric" +rubric = "prompts/quality.md" +weight = 0.4 diff --git a/evals/tasks/theming-palette-generation/tests/test.sh b/evals/tasks/theming-palette-generation/tests/test.sh new file mode 100755 index 00000000000..769d60af99e --- /dev/null +++ b/evals/tasks/theming-palette-generation/tests/test.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# Deterministic grader for theming-palette-generation +# Checks outcomes: correct SCSS structure, palette/theme calls present + +set -euo pipefail + +mkdir -p logs/verifier + +SCORE=0 +TOTAL=5 +DETAILS="" + +# Find the main styles file (could be styles.scss or another scss file) +STYLES_FILE=$(find src -name "styles.scss" -o -name "styles.sass" 2>/dev/null | head -1) +if [ -z "$STYLES_FILE" ]; then + # Also check for any scss file that might contain the theme + STYLES_FILE=$(grep -rl "palette\|theme()" src/ --include="*.scss" 2>/dev/null | head -1) +fi + +if [ -z "${STYLES_FILE:-}" ]; then + echo "0" > logs/verifier/reward.txt + printf "FAIL: No SCSS file with theming code found\n" + exit 1 +fi + +# --- Check 1: Import from igniteui-angular/theming --- +if grep -qE "@use ['\"]igniteui-angular/theming['\"]|@use 
['\"]@infragistics/igniteui-angular/theming['\"]|@import ['\"]igniteui-angular/theming['\"]|@import ['\"]@infragistics/igniteui-angular/theming['\"]|@import ['\"]~igniteui-angular/lib/core/styles/themes" "$STYLES_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Correct theming import found\n" +else + DETAILS="${DETAILS}FAIL: Missing import from igniteui-angular/theming\n" +fi + +# --- Check 2: palette() function call with primary and secondary --- +if grep -qE 'palette\(' "$STYLES_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: palette() function call found\n" +else + DETAILS="${DETAILS}FAIL: No palette() function call found\n" +fi + +# --- Check 3: theme() mixin call --- +if grep -qE '@include.*theme\(|@include.*css-vars\(' "$STYLES_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: theme() mixin call found\n" +else + DETAILS="${DETAILS}FAIL: No theme() mixin call found\n" +fi + +# --- Check 4: core() mixin call (must be before theme) --- +if grep -qE '@include.*core\(' "$STYLES_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: core() mixin call found\n" +else + DETAILS="${DETAILS}FAIL: No core() mixin call found\n" +fi + +# --- Check 5: No hardcoded CSS custom properties as the sole theming approach --- +# Allow CSS vars if palette() is also used, but fail if ONLY css vars without palette() +PALETTE_USED=$(grep -c 'palette(' "$STYLES_FILE" 2>/dev/null || echo "0") +CSS_VARS_ONLY=$(grep -cE '^\s*--ig-' "$STYLES_FILE" 2>/dev/null || echo "0") + +if [ "$PALETTE_USED" -gt 0 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Uses palette() function (not hardcoded CSS variables)\n" +elif [ "$CSS_VARS_ONLY" -gt 0 ]; then + DETAILS="${DETAILS}FAIL: Only hardcoded CSS custom properties found without palette()\n" +else + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: No hardcoded-only CSS variables approach\n" +fi + +# --- Calculate reward --- +REWARD=$(echo "scale=2; 
$SCORE / $TOTAL" | bc) + +echo "$REWARD" > logs/verifier/reward.txt +printf "Score: %d/%d (%.0f%%)\n" "$SCORE" "$TOTAL" "$(echo "$REWARD * 100" | bc)" +printf "$DETAILS" + +if [ "$SCORE" -lt "$TOTAL" ]; then + exit 1 +fi From ac1335a316ff24fece6db887e2c6ab5468577b9f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 07:13:34 +0000 Subject: [PATCH 03/17] refactor: improve regex readability in grader scripts per code review Co-authored-by: zdrawku <11193764+zdrawku@users.noreply.github.com> --- evals/tasks/grid-basic-setup/tests/test.sh | 4 +++- evals/tasks/theming-palette-generation/tests/test.sh | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/evals/tasks/grid-basic-setup/tests/test.sh b/evals/tasks/grid-basic-setup/tests/test.sh index 455c56ae96a..72eff909578 100755 --- a/evals/tasks/grid-basic-setup/tests/test.sh +++ b/evals/tasks/grid-basic-setup/tests/test.sh @@ -38,8 +38,10 @@ else fi # --- Check 3: Correct import from igniteui-angular entry point --- +# Accepts either the OSS or licensed package path +GRID_IMPORT_PATTERN="from ['\"](@infragistics/)?igniteui-angular/grids/grid['\"]" if [ -n "$COMPONENT_FILE" ]; then - if grep -qE "from ['\"]igniteui-angular/grids/grid['\"]|from ['\"]@infragistics/igniteui-angular/grids/grid['\"]" "$COMPONENT_FILE" 2>/dev/null; then + if grep -qE "$GRID_IMPORT_PATTERN" "$COMPONENT_FILE" 2>/dev/null; then SCORE=$((SCORE + 1)) DETAILS="${DETAILS}PASS: Correct grid entry-point import found\n" else diff --git a/evals/tasks/theming-palette-generation/tests/test.sh b/evals/tasks/theming-palette-generation/tests/test.sh index 769d60af99e..0fa64771be2 100755 --- a/evals/tasks/theming-palette-generation/tests/test.sh +++ b/evals/tasks/theming-palette-generation/tests/test.sh @@ -24,7 +24,10 @@ if [ -z "${STYLES_FILE:-}" ]; then fi # --- Check 1: Import from igniteui-angular/theming --- -if grep -qE "@use ['\"]igniteui-angular/theming['\"]|@use 
['\"]@infragistics/igniteui-angular/theming['\"]|@import ['\"]igniteui-angular/theming['\"]|@import ['\"]@infragistics/igniteui-angular/theming['\"]|@import ['\"]~igniteui-angular/lib/core/styles/themes" "$STYLES_FILE" 2>/dev/null; then +# Accepts @use or @import with either the OSS or licensed package path +THEMING_IMPORT_PATTERN="@(use|import) ['\"](@infragistics/)?igniteui-angular/theming['\"]" +LEGACY_IMPORT_PATTERN="@import ['\"]~igniteui-angular/lib/core/styles/themes" +if grep -qE "$THEMING_IMPORT_PATTERN|$LEGACY_IMPORT_PATTERN" "$STYLES_FILE" 2>/dev/null; then SCORE=$((SCORE + 1)) DETAILS="${DETAILS}PASS: Correct theming import found\n" else From f807aa3c4fb78940231f30237ad8a9326be7a2ed Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 07:14:16 +0000 Subject: [PATCH 04/17] fix: add explicit permissions to skill-eval workflow (CodeQL alert) Co-authored-by: zdrawku <11193764+zdrawku@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 7e172c404a6..4e3aafd5bdc 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -6,6 +6,10 @@ on: - 'skills/**' - 'evals/**' +permissions: + contents: read + pull-requests: write + jobs: eval: runs-on: ubuntu-latest From 6e7b838ef291aec21eeca3ef9f8f2e5b9bbb04ef Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 9 Mar 2026 17:42:33 +0000 Subject: [PATCH 05/17] fix: replace skill-eval package dep with self-contained local runner skill-eval is a reference repo, not an installable npm package. Replaced with a local run-eval.sh script that executes reference solutions and deterministic graders directly. 
Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 9 +-- evals/README.md | 86 ++++++++------------ evals/package.json | 19 +++-- evals/run-eval.sh | 131 +++++++++++++++++++++++++++++++ 4 files changed, 176 insertions(+), 69 deletions(-) create mode 100755 evals/run-eval.sh diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 4e3aafd5bdc..136259fc1e1 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -26,14 +26,11 @@ jobs: - name: Install eval dependencies working-directory: evals - run: npm install + run: npm install --ignore-scripts - - name: Run skill evals + - name: Validate graders against reference solutions working-directory: evals - run: npx skill-eval _ --suite=all --trials=5 - env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + run: bash run-eval.sh --all --validate - name: Upload results if: always() diff --git a/evals/README.md b/evals/README.md index 631ea8741c9..ca8b31cba6f 100644 --- a/evals/README.md +++ b/evals/README.md @@ -1,8 +1,13 @@ # Ignite UI for Angular — Skill Evals -Automated evaluation suite for the Ignite UI for Angular agent skills. Uses the -[skill-eval](https://github.com/mgechev/skill-eval) framework to measure skill -quality, detect regressions, and gate merges. +Automated evaluation suite for the Ignite UI for Angular agent skills. +Inspired by the [skill-eval](https://github.com/mgechev/skill-eval) reference +architecture and extended with patterns from +[Anthropic's agent eval research](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents). + +The infrastructure is **self-contained** — there are no external eval-framework +dependencies. A lightweight shell runner (`run-eval.sh`) executes each task's +reference solution and deterministic grader. 
## Overview @@ -21,63 +26,38 @@ Each task includes: - **`prompts/quality.md`** — LLM rubric grader (intent routing, API usage) - **`solution/solve.sh`** — reference solution for baseline validation - **`environment/Dockerfile`** — isolated environment for agent execution -- **`skills/`** — symlinked or copied skill files under test +- **`skills/`** — symlinked skill files under test ## Prerequisites -- Node.js 20+ -- Docker (for isolated agent execution) -- An API key for the agent provider (Gemini or Anthropic) +- Bash 4+ +- `bc` (installed by default on most Linux / macOS systems) ## Running Evals Locally -### Install dependencies - -```bash -cd evals -npm install -``` - -### Run a single task - -```bash -# Gemini (default) -GEMINI_API_KEY=your-key npm run eval -- grid-basic-setup - -# Claude -ANTHROPIC_API_KEY=your-key npm run eval -- grid-basic-setup --agent=claude -``` - -### Run all tasks - -```bash -GEMINI_API_KEY=your-key npm run eval:all -``` +### Validate graders against reference solutions -### Options +This applies each task's `solution/solve.sh`, then runs `tests/test.sh` to +confirm the grader scores 100%. Use this to catch grader regressions. 
```bash -# Adjust trials (default: 5) -npm run eval -- grid-basic-setup --trials=5 - -# Run locally without Docker -npm run eval -- grid-basic-setup --provider=local +cd evals -# Validate graders against the reference solution -npm run eval -- grid-basic-setup --validate --provider=local +# Validate all tasks +bash run-eval.sh --all --validate -# Run multiple trials in parallel -npm run eval -- grid-basic-setup --parallel=3 +# Validate a single task +bash run-eval.sh grid-basic-setup --validate ``` -### Preview results +### npm scripts (convenience wrappers) ```bash -# CLI report -npm run preview - -# Web UI at http://localhost:3847 -npm run preview:browser +cd evals +npm run validate # all tasks +npm run validate:grid # grid-basic-setup only +npm run validate:combo # component-combo-reactive-form only +npm run validate:theming # theming-palette-generation only ``` ## Adding a New Task @@ -86,9 +66,9 @@ npm run preview:browser ``` tasks// - ├── task.toml # Config: graders, timeouts, resource limits + ├── task.toml # Config: grader metadata, weights, timeouts ├── instruction.md # Agent prompt - ├── environment/Dockerfile # Container setup + ├── environment/Dockerfile # Container setup (for future Docker-based runs) ├── tests/test.sh # Deterministic grader ├── prompts/quality.md # LLM rubric grader ├── solution/solve.sh # Reference solution @@ -100,7 +80,8 @@ npm run preview:browser to build. 3. Write `tests/test.sh` to check **outcomes** (files exist, project compiles, - correct selectors are present) rather than specific steps. + correct selectors are present) rather than specific steps. The grader must + write a reward (0.0–1.0) to `logs/verifier/reward.txt`. 4. Write `prompts/quality.md` with rubric dimensions that sum to 1.0. @@ -110,7 +91,7 @@ npm run preview:browser 6. 
Validate graders before submitting: ```bash - npm run eval -- --validate --provider=local + bash run-eval.sh --validate ``` ## Pass / Fail Thresholds @@ -129,10 +110,9 @@ The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs automatically on PRs that modify `skills/**` or `evals/**`. It: 1. Checks out the repo -2. Installs eval dependencies -3. Runs all tasks with 5 trials -4. Uploads results as an artifact -5. Posts a summary comment on the PR +2. Validates all graders against their reference solutions +3. Uploads results as an artifact +4. Posts a summary comment on the PR ## Grading Strategy diff --git a/evals/package.json b/evals/package.json index 9a945614306..b660ea782d8 100644 --- a/evals/package.json +++ b/evals/package.json @@ -4,16 +4,15 @@ "description": "Evaluation suite for Ignite UI for Angular agent skills", "private": true, "scripts": { - "eval": "npx skill-eval", - "eval:grid": "npx skill-eval grid-basic-setup", - "eval:combo": "npx skill-eval component-combo-reactive-form", - "eval:theming": "npx skill-eval theming-palette-generation", - "eval:all": "npx skill-eval _ --suite=all", - "preview": "npx skill-eval preview", - "preview:browser": "npx skill-eval preview browser" - }, - "dependencies": { - "skill-eval": "^1.0.0" + "eval": "bash run-eval.sh", + "eval:grid": "bash run-eval.sh grid-basic-setup", + "eval:combo": "bash run-eval.sh component-combo-reactive-form", + "eval:theming": "bash run-eval.sh theming-palette-generation", + "eval:all": "bash run-eval.sh --all", + "validate": "bash run-eval.sh --all --validate", + "validate:grid": "bash run-eval.sh grid-basic-setup --validate", + "validate:combo": "bash run-eval.sh component-combo-reactive-form --validate", + "validate:theming": "bash run-eval.sh theming-palette-generation --validate" }, "engines": { "node": ">=20.0.0" diff --git a/evals/run-eval.sh b/evals/run-eval.sh new file mode 100755 index 00000000000..8ac0dc5fc66 --- /dev/null +++ b/evals/run-eval.sh @@ -0,0 +1,131 
@@ +#!/bin/bash +# run-eval.sh — Self-contained eval runner for Ignite UI Angular skills. +# Inspired by https://github.com/mgechev/skill-eval (a reference architecture, +# not an installable package). +# +# Usage: +# bash run-eval.sh # validate one task +# bash run-eval.sh --all # validate all tasks +# bash run-eval.sh --validate # run reference solution then grade + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TASKS_DIR="$SCRIPT_DIR/tasks" +RESULTS_DIR="$SCRIPT_DIR/results" + +# --- helpers --------------------------------------------------------------- # + +usage() { + cat < [--validate] + +Arguments: + Name of the task directory under tasks/ + --all Run all tasks + +Options: + --validate Apply the reference solution before grading (sanity-check mode) + +Examples: + $(basename "$0") grid-basic-setup --validate + $(basename "$0") --all +EOF + exit 1 +} + +run_task() { + local TASK_ID="$1" + local VALIDATE="${2:-false}" + local TASK_DIR="$TASKS_DIR/$TASK_ID" + + if [ ! -d "$TASK_DIR" ]; then + echo "ERROR: Task directory not found: $TASK_DIR" >&2 + return 1 + fi + + echo "═══════════════════════════════════════════════════════" + echo " Task: $TASK_ID" + echo "═══════════════════════════════════════════════════════" + + # Create a temporary workspace so graders run in isolation + local WORK_DIR + WORK_DIR=$(mktemp -d) + trap "rm -rf '$WORK_DIR'" RETURN + + # Seed the workspace with a minimal src/ tree + mkdir -p "$WORK_DIR/src" + + # If --validate, apply the reference solution first + if [ "$VALIDATE" = "true" ]; then + if [ ! -f "$TASK_DIR/solution/solve.sh" ]; then + echo "ERROR: No reference solution at $TASK_DIR/solution/solve.sh" >&2 + return 1 + fi + echo "→ Applying reference solution …" + (cd "$WORK_DIR" && bash "$TASK_DIR/solution/solve.sh") + fi + + # Run deterministic grader + if [ ! 
-f "$TASK_DIR/tests/test.sh" ]; then + echo "ERROR: No deterministic grader at $TASK_DIR/tests/test.sh" >&2 + return 1 + fi + + echo "→ Running deterministic grader …" + local GRADER_EXIT=0 + (cd "$WORK_DIR" && bash "$TASK_DIR/tests/test.sh") || GRADER_EXIT=$? + + # Read reward + local REWARD="0" + if [ -f "$WORK_DIR/logs/verifier/reward.txt" ]; then + REWARD=$(cat "$WORK_DIR/logs/verifier/reward.txt") + fi + + local STATUS="fail" + if [ "$GRADER_EXIT" -eq 0 ]; then + STATUS="pass" + fi + + echo "" + echo " Result: $STATUS (reward=$REWARD)" + echo "" + + # Persist result + mkdir -p "$RESULTS_DIR" + cat > "$RESULTS_DIR/${TASK_ID}.json" < Date: Tue, 10 Mar 2026 08:22:33 +0000 Subject: [PATCH 06/17] fix: emit passRate/passAtK in result JSON so CI summary shows actual scores The workflow summary comment reads passRate and passAtK from per-task JSON files. The runner was only writing reward/status, causing N/A. Also skip baseline.json in the summary since it uses a different schema. Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 2 +- evals/run-eval.sh | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 136259fc1e1..c942bce4c76 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -52,7 +52,7 @@ jobs: let summary = '## 📊 Skill Eval Results\n\n'; try { - const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json')); + const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json') && f !== 'baseline.json'); if (files.length === 0) { summary += '> ⚠️ No eval results found. 
The eval run may have failed.\n'; } else { diff --git a/evals/run-eval.sh b/evals/run-eval.sh index 8ac0dc5fc66..0fb93b9527a 100755 --- a/evals/run-eval.sh +++ b/evals/run-eval.sh @@ -83,21 +83,28 @@ run_task() { fi local STATUS="fail" + local PASS_RATE="0" + local PASS_AT_K="0" if [ "$GRADER_EXIT" -eq 0 ]; then STATUS="pass" + PASS_RATE="1" + PASS_AT_K="1" fi echo "" echo " Result: $STATUS (reward=$REWARD)" echo "" - # Persist result + # Persist result — includes passRate/passAtK so the CI summary comment can + # read them directly (these are the fields the workflow script expects). mkdir -p "$RESULTS_DIR" cat > "$RESULTS_DIR/${TASK_ID}.json" < Date: Tue, 10 Mar 2026 10:37:35 +0200 Subject: [PATCH 07/17] Update evals/tasks/component-combo-reactive-form/solution/solve.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- evals/tasks/component-combo-reactive-form/solution/solve.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/tasks/component-combo-reactive-form/solution/solve.sh b/evals/tasks/component-combo-reactive-form/solution/solve.sh index 6b284457490..396e07382f5 100755 --- a/evals/tasks/component-combo-reactive-form/solution/solve.sh +++ b/evals/tasks/component-combo-reactive-form/solution/solve.sh @@ -10,7 +10,7 @@ mkdir -p src/app/user-settings cat > src/app/user-settings/user-settings.component.ts << 'EOF' import { ChangeDetectionStrategy, Component } from '@angular/core'; import { FormGroup, FormControl, Validators, ReactiveFormsModule } from '@angular/forms'; -import { IgxComboComponent } from 'igniteui-angular'; +import { IgxComboComponent } from 'igniteui-angular/combo'; @Component({ selector: 'app-user-settings', From 2df335efc01177f4a6931e4819386b841555fa4e Mon Sep 17 00:00:00 2001 From: Konstantin Dinev Date: Tue, 10 Mar 2026 10:38:43 +0200 Subject: [PATCH 08/17] Update .github/workflows/skill-eval.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- 
.github/workflows/skill-eval.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index c942bce4c76..467f73a0768 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -41,7 +41,7 @@ jobs: retention-days: 30 - name: Post summary comment - if: always() && github.event_name == 'pull_request' + if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false uses: actions/github-script@v7 with: script: | From b22b13f37cbcb9d0365dc1969da27df013eb26ed Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 08:41:55 +0000 Subject: [PATCH 09/17] fix: tighten grader checks per review feedback - Combo import check now specifically requires the entry-point import (igniteui-angular/combo) and rejects root barrel imports - Theming grader now enforces core() must appear before theme() by comparing line numbers - README updated to match actual grader behavior (no build step) Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- evals/README.md | 12 +++++++----- .../component-combo-reactive-form/tests/test.sh | 10 ++++++---- .../theming-palette-generation/tests/test.sh | 15 ++++++++++----- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/evals/README.md b/evals/README.md index ca8b31cba6f..f69d72a4d19 100644 --- a/evals/README.md +++ b/evals/README.md @@ -79,9 +79,10 @@ npm run validate:theming # theming-palette-generation only 2. Write a clear, unambiguous `instruction.md` that tells the agent exactly what to build. -3. Write `tests/test.sh` to check **outcomes** (files exist, project compiles, - correct selectors are present) rather than specific steps. The grader must - write a reward (0.0–1.0) to `logs/verifier/reward.txt`. +3. 
Write `tests/test.sh` to check **outcomes** (files exist, correct selectors + and entry-point imports are present, correct API call ordering) rather than + specific steps. The grader must write a reward (0.0–1.0) to + `logs/verifier/reward.txt`. 4. Write `prompts/quality.md` with rubric dimensions that sum to 1.0. @@ -117,10 +118,11 @@ automatically on PRs that modify `skills/**` or `evals/**`. It: ## Grading Strategy **Deterministic grader (60% weight)** — checks: -- Project builds without errors +- Expected component files exist - Correct Ignite UI selector is present in the generated template -- Required imports exist +- Required entry-point imports exist (not root barrel) - No use of forbidden alternatives +- Correct API call ordering (e.g. `core()` before `theme()`) **LLM rubric grader (40% weight)** — evaluates: - Correct intent routing diff --git a/evals/tasks/component-combo-reactive-form/tests/test.sh b/evals/tasks/component-combo-reactive-form/tests/test.sh index 23022ab241b..d82caf36bbe 100755 --- a/evals/tasks/component-combo-reactive-form/tests/test.sh +++ b/evals/tasks/component-combo-reactive-form/tests/test.sh @@ -73,19 +73,21 @@ else DETAILS="${DETAILS}FAIL: Forbidden alternative (native select, mat-select, igx-select) detected\n" fi -# --- Check 5: Correct import from igniteui-angular --- +# --- Check 5: Correct entry-point import from igniteui-angular/combo --- +# The skill requires entry-point imports (not the root barrel). 
+COMBO_IMPORT_PATTERN="from ['\"](@infragistics/)?igniteui-angular/combo['\"]" IMPORT_FOUND=0 if [ -n "${COMPONENT_FILE:-}" ]; then - if grep -qE "from ['\"]igniteui-angular|from ['\"]@infragistics/igniteui-angular" "$COMPONENT_FILE" 2>/dev/null; then + if grep -qE "$COMBO_IMPORT_PATTERN" "$COMPONENT_FILE" 2>/dev/null; then IMPORT_FOUND=1 fi fi if [ "$IMPORT_FOUND" -eq 1 ]; then SCORE=$((SCORE + 1)) - DETAILS="${DETAILS}PASS: igniteui-angular import found\n" + DETAILS="${DETAILS}PASS: Correct combo entry-point import found\n" else - DETAILS="${DETAILS}FAIL: No igniteui-angular import found\n" + DETAILS="${DETAILS}FAIL: Missing import from igniteui-angular/combo entry point\n" fi # --- Calculate reward --- diff --git a/evals/tasks/theming-palette-generation/tests/test.sh b/evals/tasks/theming-palette-generation/tests/test.sh index 0fa64771be2..2a992f39aea 100755 --- a/evals/tasks/theming-palette-generation/tests/test.sh +++ b/evals/tasks/theming-palette-generation/tests/test.sh @@ -50,12 +50,17 @@ else DETAILS="${DETAILS}FAIL: No theme() mixin call found\n" fi -# --- Check 4: core() mixin call (must be before theme) --- -if grep -qE '@include.*core\(' "$STYLES_FILE" 2>/dev/null; then - SCORE=$((SCORE + 1)) - DETAILS="${DETAILS}PASS: core() mixin call found\n" -else +# --- Check 4: core() mixin call must appear before theme() --- +CORE_LINE=$(grep -nE '@include.*core\(' "$STYLES_FILE" 2>/dev/null | head -1 | cut -d: -f1) +THEME_LINE=$(grep -nE '@include.*theme\(' "$STYLES_FILE" 2>/dev/null | head -1 | cut -d: -f1) + +if [ -z "${CORE_LINE:-}" ]; then DETAILS="${DETAILS}FAIL: No core() mixin call found\n" +elif [ -n "${THEME_LINE:-}" ] && [ "$CORE_LINE" -gt "$THEME_LINE" ]; then + DETAILS="${DETAILS}FAIL: core() must be called before theme() (core on line $CORE_LINE, theme on line $THEME_LINE)\n" +else + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: core() mixin call found before theme()\n" fi # --- Check 5: No hardcoded CSS custom properties as the sole theming 
approach --- From 94d4bf831fbdecff1ff14c222869eead1c826a5d Mon Sep 17 00:00:00 2001 From: Konstantin Dinev Date: Tue, 10 Mar 2026 10:45:24 +0200 Subject: [PATCH 10/17] Update Node.js version in skill-eval workflow --- .github/workflows/skill-eval.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 467f73a0768..45a7d608c61 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -22,7 +22,7 @@ jobs: - name: Set up Node.js uses: actions/setup-node@v4 with: - node-version: '20' + node-version: '22' - name: Install eval dependencies working-directory: evals From 18f3e2570b837323cfb03d940d6aeacda11aa78b Mon Sep 17 00:00:00 2001 From: Konstantin Dinev Date: Tue, 10 Mar 2026 10:47:56 +0200 Subject: [PATCH 11/17] Update skill-eval.yml --- .github/workflows/skill-eval.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 45a7d608c61..26294ace0dd 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -11,7 +11,7 @@ permissions: pull-requests: write jobs: - eval: + skill_eval: runs-on: ubuntu-latest timeout-minutes: 30 From 568b04d54ce9f1250af896088b50244b60456009 Mon Sep 17 00:00:00 2001 From: Konstantin Dinev Date: Tue, 10 Mar 2026 11:05:31 +0200 Subject: [PATCH 12/17] Remove eval dependencies installation step Removed installation of eval dependencies from workflow. 
--- .github/workflows/skill-eval.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 26294ace0dd..c7ab06b4c75 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -24,10 +24,6 @@ jobs: with: node-version: '22' - - name: Install eval dependencies - working-directory: evals - run: npm install --ignore-scripts - - name: Validate graders against reference solutions working-directory: evals run: bash run-eval.sh --all --validate From b181ca0600f3bcaf0120cd18871264a6a935e4e9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 09:32:17 +0000 Subject: [PATCH 13/17] feat: add copilot-cli and gemini-cli agent modes to eval runner Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 95 +++++++-- evals/README.md | 109 ++++++++++- evals/eval-config.json | 23 +++ evals/package.json | 10 +- evals/run-eval.sh | 321 +++++++++++++++++++++++++++++-- 5 files changed, 521 insertions(+), 37 deletions(-) create mode 100644 evals/eval-config.json diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index c7ab06b4c75..8109bb6106d 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -5,15 +5,31 @@ on: paths: - 'skills/**' - 'evals/**' + workflow_dispatch: + inputs: + agent: + description: 'Agent to run evals against (copilot or gemini)' + required: true + default: 'copilot' + type: choice + options: + - copilot + - gemini + trials: + description: 'Number of trials per task' + required: false + default: '1' + type: string permissions: contents: read pull-requests: write jobs: - skill_eval: + # Job 1: Always validate graders against reference solutions + validate_graders: runs-on: ubuntu-latest - timeout-minutes: 30 + timeout-minutes: 10 steps: - name: Checkout repository @@ -28,16 +44,70 @@ jobs: 
working-directory: evals run: bash run-eval.sh --all --validate - - name: Upload results + - name: Upload validation results if: always() uses: actions/upload-artifact@v4 with: - name: skill-eval-results + name: skill-eval-validation-results path: evals/results/ retention-days: 30 + # Job 2: Run evals against an AI agent (copilot or gemini) + # Triggered manually via workflow_dispatch, or can be called from other workflows + agent_eval: + if: github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '22' + + - name: Install Copilot CLI + if: inputs.agent == 'copilot' + run: npm install -g @github/copilot + + - name: Install Gemini CLI + if: inputs.agent == 'gemini' + run: npm install -g @google/gemini-cli + + - name: Run agent-based eval + working-directory: evals + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + run: | + bash run-eval.sh --all \ + --agent ${{ inputs.agent }} \ + --trials ${{ inputs.trials || '1' }} + + - name: Upload agent eval results + if: always() + uses: actions/upload-artifact@v4 + with: + name: skill-eval-agent-${{ inputs.agent }}-results + path: evals/results/ + retention-days: 30 + + # Job 3: Post summary comment on PRs + post_summary: + if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false + needs: [validate_graders] + runs-on: ubuntu-latest + + steps: + - name: Download validation results + uses: actions/download-artifact@v4 + with: + name: skill-eval-validation-results + path: evals/results/ + - name: Post summary comment - if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false uses: actions/github-script@v7 with: script: | @@ -52,26 +122,27 @@ jobs: if (files.length === 0) { summary += '> ⚠️ No eval results 
found. The eval run may have failed.\n'; } else { - summary += '| Task | Pass Rate | pass@5 | Status |\n'; - summary += '|---|---|---|---|\n'; + summary += '| Task | Agent | Pass Rate | pass@k | Status |\n'; + summary += '|---|---|---|---|---|\n'; for (const file of files) { try { const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8')); const taskName = data.task || file.replace('.json', ''); + const agent = data.agent || 'reference'; const passRate = data.passRate != null ? `${(data.passRate * 100).toFixed(0)}%` : 'N/A'; const passAtK = data.passAtK != null ? `${(data.passAtK * 100).toFixed(0)}%` : 'N/A'; const status = data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? '⚠️' : '❌'; - summary += `| ${taskName} | ${passRate} | ${passAtK} | ${status} |\n`; + summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`; } catch (e) { - summary += `| ${file} | Error | Error | ❌ |\n`; + summary += `| ${file} | — | Error | Error | ❌ |\n`; } } summary += '\n### Thresholds\n'; - summary += '- ✅ `pass@5 ≥ 80%` — merge gate passed\n'; - summary += '- ⚠️ `pass@5 ≥ 60%` — needs investigation\n'; - summary += '- ❌ `pass@5 < 60%` — blocks merge for affected skill\n'; + summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n'; + summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n'; + summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n'; } } catch (e) { summary += `> ⚠️ Could not read results: ${e.message}\n`; diff --git a/evals/README.md b/evals/README.md index f69d72a4d19..2c7160237bc 100644 --- a/evals/README.md +++ b/evals/README.md @@ -7,7 +7,9 @@ architecture and extended with patterns from The infrastructure is **self-contained** — there are no external eval-framework dependencies. A lightweight shell runner (`run-eval.sh`) executes each task's -reference solution and deterministic grader. 
+reference solution and deterministic grader, and can also dispatch tasks to +AI coding agents (GitHub Copilot CLI or Google Gemini CLI) for end-to-end +evaluation. ## Overview @@ -32,6 +34,14 @@ Each task includes: - Bash 4+ - `bc` (installed by default on most Linux / macOS systems) +- Node.js 20+ (for config parsing and agent CLI installation) + +**For agent-based evaluation (optional):** + +| Agent | Install | Auth | +|---|---|---| +| GitHub Copilot | `npm install -g @github/copilot` | Active Copilot subscription; `GITHUB_TOKEN` env var | +| Google Gemini | `npm install -g @google/gemini-cli` | `GEMINI_API_KEY` env var | ## Running Evals Locally @@ -50,16 +60,73 @@ bash run-eval.sh --all --validate bash run-eval.sh grid-basic-setup --validate ``` +### Run evals against an AI agent + +Send the `instruction.md` to a coding agent CLI, let the agent generate code +in an isolated workspace, then run the deterministic grader on the output. + +```bash +cd evals + +# Run all tasks with GitHub Copilot CLI +bash run-eval.sh --all --agent copilot + +# Run a single task with Gemini CLI +bash run-eval.sh grid-basic-setup --agent gemini + +# Run 3 trials per task for statistical robustness +bash run-eval.sh --all --agent copilot --trials 3 +``` + ### npm scripts (convenience wrappers) ```bash cd evals + +# Validation (reference solutions) npm run validate # all tasks npm run validate:grid # grid-basic-setup only npm run validate:combo # component-combo-reactive-form only npm run validate:theming # theming-palette-generation only + +# Agent-based evaluation +npm run agent:copilot # all tasks with Copilot +npm run agent:copilot:grid # grid task with Copilot +npm run agent:gemini # all tasks with Gemini +npm run agent:gemini:theming # theming task with Gemini +``` + +## Agent Configuration + +Agent settings are stored in `eval-config.json`: + +```json +{ + "defaultAgent": "copilot", + "agents": { + "copilot": { + "command": "copilot", + "installCommand": "npm install -g 
@github/copilot", + "promptArgs": ["-p"], + "autoApproveArgs": ["--yes"], + "envAuth": "GITHUB_TOKEN" + }, + "gemini": { + "command": "gemini", + "installCommand": "npm install -g @google/gemini-cli", + "promptArgs": ["-p"], + "autoApproveArgs": ["--sandbox"], + "envAuth": "GEMINI_API_KEY" + } + }, + "trialCount": 1, + "timeoutSec": 600 +} ``` +You can customize the agent command, flags, and timeouts by editing this file. +To switch the default agent, change `defaultAgent`. + ## Adding a New Task 1. Create a directory under `evals/tasks//` with the standard structure: @@ -95,25 +162,43 @@ npm run validate:theming # theming-palette-generation only bash run-eval.sh --validate ``` +7. Test against at least one agent: + + ```bash + bash run-eval.sh --agent copilot + ``` + ## Pass / Fail Thresholds Following [Anthropic's recommendations](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents): | Metric | Threshold | Effect | |---|---|---| -| `pass@5 ≥ 80%` | **Merge gate** | At least 1 success in 5 trials required | -| `pass^5 ≥ 60%` | **Tracked** | Flags flaky skills for investigation | -| `pass@5 < 60%` | **Blocks merge** | On PRs touching the relevant skill | +| `pass@k ≥ 80%` | **Merge gate** | At least 1 success in k trials required | +| `pass@k ≥ 60%` | **Tracked** | Flags flaky skills for investigation | +| `pass@k < 60%` | **Blocks merge** | On PRs touching the relevant skill | ## CI Integration -The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs -automatically on PRs that modify `skills/**` or `evals/**`. It: +The GitHub Actions workflow at `.github/workflows/skill-eval.yml` provides two +evaluation modes: -1. Checks out the repo -2. Validates all graders against their reference solutions -3. Uploads results as an artifact -4. Posts a summary comment on the PR +### Automatic (on PR) +Runs on every PR that modifies `skills/**` or `evals/**`: +1. Validates all graders against their reference solutions +2. 
Uploads results as an artifact +3. Posts a summary comment on the PR + +### Manual (workflow_dispatch) +Triggered manually from the Actions tab to run agent-based evaluation: +1. Select the agent (`copilot` or `gemini`) and number of trials +2. Installs the selected agent CLI +3. Runs all tasks against the agent +4. Uploads results as an artifact + +**Secrets required for agent-based CI:** +- `GITHUB_TOKEN` — automatically available (for Copilot) +- `GEMINI_API_KEY` — must be added as a repository secret (for Gemini) ## Grading Strategy @@ -135,3 +220,7 @@ automatically on PRs that modify `skills/**` or `evals/**`. It: Baseline results are stored in `evals/results/baseline.json` and used for regression comparison on PRs. The CI workflow uploads per-run results as GitHub Actions artifacts. + +Agent-based results are suffixed with the agent name (e.g., +`grid-basic-setup-copilot.json`) to distinguish them from reference +validation results. diff --git a/evals/eval-config.json b/evals/eval-config.json new file mode 100644 index 00000000000..3c073c7832b --- /dev/null +++ b/evals/eval-config.json @@ -0,0 +1,23 @@ +{ + "defaultAgent": "copilot", + "agents": { + "copilot": { + "command": "copilot", + "installCommand": "npm install -g @github/copilot", + "promptArgs": ["-p"], + "autoApproveArgs": ["--yes"], + "envAuth": "GITHUB_TOKEN", + "description": "GitHub Copilot CLI (requires active Copilot subscription)" + }, + "gemini": { + "command": "gemini", + "installCommand": "npm install -g @google/gemini-cli", + "promptArgs": ["-p"], + "autoApproveArgs": ["--sandbox"], + "envAuth": "GEMINI_API_KEY", + "description": "Google Gemini CLI (requires GEMINI_API_KEY)" + } + }, + "trialCount": 1, + "timeoutSec": 600 +} diff --git a/evals/package.json b/evals/package.json index b660ea782d8..b6d79561471 100644 --- a/evals/package.json +++ b/evals/package.json @@ -12,7 +12,15 @@ "validate": "bash run-eval.sh --all --validate", "validate:grid": "bash run-eval.sh grid-basic-setup 
--validate", "validate:combo": "bash run-eval.sh component-combo-reactive-form --validate", - "validate:theming": "bash run-eval.sh theming-palette-generation --validate" + "validate:theming": "bash run-eval.sh theming-palette-generation --validate", + "agent:copilot": "bash run-eval.sh --all --agent copilot", + "agent:copilot:grid": "bash run-eval.sh grid-basic-setup --agent copilot", + "agent:copilot:combo": "bash run-eval.sh component-combo-reactive-form --agent copilot", + "agent:copilot:theming": "bash run-eval.sh theming-palette-generation --agent copilot", + "agent:gemini": "bash run-eval.sh --all --agent gemini", + "agent:gemini:grid": "bash run-eval.sh grid-basic-setup --agent gemini", + "agent:gemini:combo": "bash run-eval.sh component-combo-reactive-form --agent gemini", + "agent:gemini:theming": "bash run-eval.sh theming-palette-generation --agent gemini" }, "engines": { "node": ">=20.0.0" diff --git a/evals/run-eval.sh b/evals/run-eval.sh index 0fb93b9527a..802a9bc8c47 100755 --- a/evals/run-eval.sh +++ b/evals/run-eval.sh @@ -4,21 +4,26 @@ # not an installable package). 
# # Usage: -# bash run-eval.sh # validate one task -# bash run-eval.sh --all # validate all tasks -# bash run-eval.sh --validate # run reference solution then grade +# bash run-eval.sh # validate one task (reference solution) +# bash run-eval.sh --all # validate all tasks +# bash run-eval.sh --validate # run reference solution then grade +# bash run-eval.sh --agent copilot # run task using copilot CLI agent +# bash run-eval.sh --agent gemini # run task using gemini CLI agent +# bash run-eval.sh --all --agent copilot # run all tasks with copilot agent +# bash run-eval.sh --all --agent gemini --trials 3 # 3 trials per task with gemini set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TASKS_DIR="$SCRIPT_DIR/tasks" RESULTS_DIR="$SCRIPT_DIR/results" +CONFIG_FILE="$SCRIPT_DIR/eval-config.json" # --- helpers --------------------------------------------------------------- # usage() { cat < [--validate] +Usage: $(basename "$0") [--validate] [--agent ] [--trials ] Arguments: Name of the task directory under tasks/ @@ -26,17 +31,144 @@ Arguments: Options: --validate Apply the reference solution before grading (sanity-check mode) + --agent NAME Run task using an AI agent CLI (copilot | gemini) + --trials N Number of trials per task when using --agent (default: 1) Examples: $(basename "$0") grid-basic-setup --validate $(basename "$0") --all + $(basename "$0") grid-basic-setup --agent copilot + $(basename "$0") --all --agent gemini --trials 3 EOF exit 1 } +# Read a JSON string field from eval-config.json +# Usage: read_config '.agents.copilot.command' +read_config() { + local QUERY="$1" + if [ ! 
-f "$CONFIG_FILE" ]; then + echo "" + return + fi + # Use node to parse JSON (available in CI and most dev environments) + node -e " + const fs = require('fs'); + const cfg = JSON.parse(fs.readFileSync('$CONFIG_FILE', 'utf8')); + const keys = '${QUERY}'.replace(/^\\./, '').split('.'); + let val = cfg; + for (const k of keys) { val = val?.[k]; } + if (Array.isArray(val)) { console.log(val.join(' ')); } + else { console.log(val ?? ''); } + " 2>/dev/null || echo "" +} + +# Resolve the agent CLI command and flags from config +resolve_agent() { + local AGENT_NAME="$1" + AGENT_CMD=$(read_config "agents.${AGENT_NAME}.command") + AGENT_PROMPT_ARGS=$(read_config "agents.${AGENT_NAME}.promptArgs") + AGENT_APPROVE_ARGS=$(read_config "agents.${AGENT_NAME}.autoApproveArgs") + AGENT_ENV_AUTH=$(read_config "agents.${AGENT_NAME}.envAuth") + + if [ -z "$AGENT_CMD" ]; then + echo "ERROR: Unknown agent '$AGENT_NAME'. Check eval-config.json" >&2 + exit 1 + fi + + # Verify the CLI is installed + if ! command -v "$AGENT_CMD" &>/dev/null; then + local INSTALL_CMD + INSTALL_CMD=$(read_config "agents.${AGENT_NAME}.installCommand") + echo "ERROR: '$AGENT_CMD' is not installed." >&2 + echo " Install with: $INSTALL_CMD" >&2 + exit 1 + fi + + # Verify the auth env var is set + if [ -n "$AGENT_ENV_AUTH" ]; then + if [ -z "${!AGENT_ENV_AUTH:-}" ]; then + echo "WARNING: $AGENT_ENV_AUTH is not set. The agent may fail to authenticate." >&2 + fi + fi +} + +# Run a single task using the agent CLI +run_agent_task() { + local TASK_DIR="$1" + local WORK_DIR="$2" + local AGENT_NAME="$3" + + local INSTRUCTION_FILE="$TASK_DIR/instruction.md" + if [ ! 
-f "$INSTRUCTION_FILE" ]; then + echo "ERROR: No instruction.md found at $INSTRUCTION_FILE" >&2 + return 1 + fi + + local PROMPT + PROMPT=$(cat "$INSTRUCTION_FILE") + + # Build the skill context preamble if skills/ directory exists + local SKILL_CONTEXT="" + if [ -d "$TASK_DIR/skills" ]; then + for SKILL_FILE in "$TASK_DIR"/skills/*/SKILL.md; do + if [ -f "$SKILL_FILE" ]; then + SKILL_CONTEXT="${SKILL_CONTEXT}$(cat "$SKILL_FILE")\n\n" + fi + done + fi + + # Combine skill context + instruction into a single prompt + local FULL_PROMPT="" + if [ -n "$SKILL_CONTEXT" ]; then + FULL_PROMPT="Use the following skill reference when completing the task:\n\n${SKILL_CONTEXT}---\n\n${PROMPT}" + else + FULL_PROMPT="$PROMPT" + fi + + echo " → Sending instruction to $AGENT_NAME agent …" + + local TIMEOUT_SEC + TIMEOUT_SEC=$(read_config "timeoutSec") + TIMEOUT_SEC="${TIMEOUT_SEC:-600}" + + # Build the agent command + local CMD_ARGS=() + CMD_ARGS+=("$AGENT_CMD") + + # Add prompt args (e.g., -p) + if [ -n "$AGENT_PROMPT_ARGS" ]; then + # shellcheck disable=SC2206 + CMD_ARGS+=($AGENT_PROMPT_ARGS) + fi + CMD_ARGS+=("$FULL_PROMPT") + + # Add auto-approve args (e.g., --yes, --sandbox) + if [ -n "$AGENT_APPROVE_ARGS" ]; then + # shellcheck disable=SC2206 + CMD_ARGS+=($AGENT_APPROVE_ARGS) + fi + + # Run the agent in the work directory with a timeout + local AGENT_EXIT=0 + ( + cd "$WORK_DIR" + timeout "${TIMEOUT_SEC}s" "${CMD_ARGS[@]}" 2>&1 || true + ) > "$WORK_DIR/agent-output.log" 2>&1 || AGENT_EXIT=$? + + if [ "$AGENT_EXIT" -eq 124 ]; then + echo " ⚠ Agent timed out after ${TIMEOUT_SEC}s" + elif [ "$AGENT_EXIT" -ne 0 ]; then + echo " ⚠ Agent exited with code $AGENT_EXIT" + fi + + echo " → Agent output saved to $WORK_DIR/agent-output.log" +} + run_task() { local TASK_ID="$1" - local VALIDATE="${2:-false}" + local MODE="${2:-validate}" # validate | agent + local AGENT_NAME="${3:-}" local TASK_DIR="$TASKS_DIR/$TASK_ID" if [ ! 
-d "$TASK_DIR" ]; then @@ -46,6 +178,9 @@ run_task() { echo "═══════════════════════════════════════════════════════" echo " Task: $TASK_ID" + if [ "$MODE" = "agent" ]; then + echo " Agent: $AGENT_NAME" + fi echo "═══════════════════════════════════════════════════════" # Create a temporary workspace so graders run in isolation @@ -56,14 +191,17 @@ run_task() { # Seed the workspace with a minimal src/ tree mkdir -p "$WORK_DIR/src" - # If --validate, apply the reference solution first - if [ "$VALIDATE" = "true" ]; then + if [ "$MODE" = "validate" ]; then + # --validate: apply the reference solution first if [ ! -f "$TASK_DIR/solution/solve.sh" ]; then echo "ERROR: No reference solution at $TASK_DIR/solution/solve.sh" >&2 return 1 fi echo "→ Applying reference solution …" (cd "$WORK_DIR" && bash "$TASK_DIR/solution/solve.sh") + elif [ "$MODE" = "agent" ]; then + # --agent: send the instruction to the agent CLI + run_agent_task "$TASK_DIR" "$WORK_DIR" "$AGENT_NAME" fi # Run deterministic grader @@ -98,9 +236,14 @@ run_task() { # Persist result — includes passRate/passAtK so the CI summary comment can # read them directly (these are the fields the workflow script expects). mkdir -p "$RESULTS_DIR" - cat > "$RESULTS_DIR/${TASK_ID}.json" < "$RESULTS_DIR/${TASK_ID}${RESULT_SUFFIX}.json" <&2 + return 1 + fi + + local PASS_COUNT=0 + local TOTAL_REWARD=0 + + for i in $(seq 1 "$TRIALS"); do + echo "" + echo " ── Trial $i/$TRIALS ──" + + # Create a temporary workspace for each trial + local WORK_DIR + WORK_DIR=$(mktemp -d) + + mkdir -p "$WORK_DIR/src" + + # Send to agent + run_agent_task "$TASK_DIR" "$WORK_DIR" "$AGENT_NAME" + + # Run grader + local GRADER_EXIT=0 + (cd "$WORK_DIR" && bash "$TASK_DIR/tests/test.sh") || GRADER_EXIT=$? 
+ + local REWARD="0" + if [ -f "$WORK_DIR/logs/verifier/reward.txt" ]; then + REWARD=$(cat "$WORK_DIR/logs/verifier/reward.txt") + fi + + if [ "$GRADER_EXIT" -eq 0 ]; then + PASS_COUNT=$((PASS_COUNT + 1)) + fi + TOTAL_REWARD=$(echo "$TOTAL_REWARD + $REWARD" | bc) + + # Cleanup trial workspace + rm -rf "$WORK_DIR" + + echo " Trial $i: reward=$REWARD $([ "$GRADER_EXIT" -eq 0 ] && echo "✅" || echo "❌")" + done + + # Calculate aggregate metrics + local PASS_RATE + PASS_RATE=$(echo "scale=2; $PASS_COUNT / $TRIALS" | bc) + # pass@k = 1 if at least one trial passed, else 0 + local PASS_AT_K=0 + if [ "$PASS_COUNT" -gt 0 ]; then + PASS_AT_K=1 + fi + local AVG_REWARD + AVG_REWARD=$(echo "scale=2; $TOTAL_REWARD / $TRIALS" | bc) + + echo "" + echo " ═══ Aggregate ($TRIALS trials) ═══" + echo " Pass rate: $PASS_COUNT/$TRIALS ($PASS_RATE)" + echo " pass@$TRIALS: $PASS_AT_K" + echo " Avg reward: $AVG_REWARD" + echo "" + + # Persist aggregated result + mkdir -p "$RESULTS_DIR" + cat > "$RESULTS_DIR/${TASK_ID}-${AGENT_NAME}.json" <&2 + exit 1 + fi + shift 2 + ;; + --trials) + TRIALS="${2:-1}" + shift 2 + ;; + -h|--help) + usage + ;; + *) + if [ -z "$TASK_ARG" ]; then + TASK_ARG="$1" + fi + shift + ;; + esac +done + +if [ -z "$TASK_ARG" ]; then + usage +fi + +# If using agent mode, resolve and verify agent CLI +if [ "$MODE" = "agent" ]; then + # Default to configured agent if none specified + if [ -z "$AGENT_NAME" ]; then + AGENT_NAME=$(read_config "defaultAgent") + AGENT_NAME="${AGENT_NAME:-copilot}" + fi + resolve_agent "$AGENT_NAME" + echo "Using agent: $AGENT_NAME ($AGENT_CMD)" + echo "" fi OVERALL_EXIT=0 @@ -129,10 +408,24 @@ OVERALL_EXIT=0 if [ "$TASK_ARG" = "--all" ]; then for TASK_PATH in "$TASKS_DIR"/*/; do TASK_NAME=$(basename "$TASK_PATH") - run_task "$TASK_NAME" "$VALIDATE" || OVERALL_EXIT=1 + if [ "$MODE" = "agent" ] && [ "$TRIALS" -gt 1 ]; then + echo "═══════════════════════════════════════════════════════" + echo " Task: $TASK_NAME (Agent: $AGENT_NAME, $TRIALS 
trials)" + echo "═══════════════════════════════════════════════════════" + run_task_trials "$TASK_NAME" "$AGENT_NAME" "$TRIALS" || OVERALL_EXIT=1 + else + run_task "$TASK_NAME" "$MODE" "$AGENT_NAME" || OVERALL_EXIT=1 + fi done else - run_task "$TASK_ARG" "$VALIDATE" || OVERALL_EXIT=1 + if [ "$MODE" = "agent" ] && [ "$TRIALS" -gt 1 ]; then + echo "═══════════════════════════════════════════════════════" + echo " Task: $TASK_ARG (Agent: $AGENT_NAME, $TRIALS trials)" + echo "═══════════════════════════════════════════════════════" + run_task_trials "$TASK_ARG" "$AGENT_NAME" "$TRIALS" || OVERALL_EXIT=1 + else + run_task "$TASK_ARG" "$MODE" "$AGENT_NAME" || OVERALL_EXIT=1 + fi fi exit "$OVERALL_EXIT" From 665264b892ec37e00b09439753fdeaf4f2dabf80 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 09:33:53 +0000 Subject: [PATCH 14/17] fix: use read -ra for safe array parsing, add TRIALS guard Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- evals/run-eval.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/evals/run-eval.sh b/evals/run-eval.sh index 802a9bc8c47..348788dc861 100755 --- a/evals/run-eval.sh +++ b/evals/run-eval.sh @@ -138,15 +138,15 @@ run_agent_task() { # Add prompt args (e.g., -p) if [ -n "$AGENT_PROMPT_ARGS" ]; then - # shellcheck disable=SC2206 - CMD_ARGS+=($AGENT_PROMPT_ARGS) + read -ra _PROMPT_PARTS <<< "$AGENT_PROMPT_ARGS" + CMD_ARGS+=("${_PROMPT_PARTS[@]}") fi CMD_ARGS+=("$FULL_PROMPT") # Add auto-approve args (e.g., --yes, --sandbox) if [ -n "$AGENT_APPROVE_ARGS" ]; then - # shellcheck disable=SC2206 - CMD_ARGS+=($AGENT_APPROVE_ARGS) + read -ra _APPROVE_PARTS <<< "$AGENT_APPROVE_ARGS" + CMD_ARGS+=("${_APPROVE_PARTS[@]}") fi # Run the agent in the work directory with a timeout @@ -304,6 +304,10 @@ run_task_trials() { done # Calculate aggregate metrics + if [ "$TRIALS" -le 0 ]; then + echo "ERROR: TRIALS must be > 0" >&2 + return 
1 + fi local PASS_RATE PASS_RATE=$(echo "scale=2; $PASS_COUNT / $TRIALS" | bc) # pass@k = 1 if at least one trial passed, else 0 From b3fa97391e69df7653bf1dac213d17b7d2fd6dee Mon Sep 17 00:00:00 2001 From: Konstantin Dinev Date: Tue, 10 Mar 2026 11:46:27 +0200 Subject: [PATCH 15/17] Update skill-eval.yml --- .github/workflows/skill-eval.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 8109bb6106d..63fba463be7 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -55,7 +55,6 @@ jobs: # Job 2: Run evals against an AI agent (copilot or gemini) # Triggered manually via workflow_dispatch, or can be called from other workflows agent_eval: - if: github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest timeout-minutes: 60 From 1330989356b35150d862272597d3ad31492507f7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 09:54:04 +0000 Subject: [PATCH 16/17] rework CI workflow to always run against both copilot and gemini agents Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 163 ++++++++++++++++++++----------- evals/README.md | 29 +++--- 2 files changed, 119 insertions(+), 73 deletions(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 63fba463be7..95541d6c576 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -6,27 +6,13 @@ on: - 'skills/**' - 'evals/**' workflow_dispatch: - inputs: - agent: - description: 'Agent to run evals against (copilot or gemini)' - required: true - default: 'copilot' - type: choice - options: - - copilot - - gemini - trials: - description: 'Number of trials per task' - required: false - default: '1' - type: string permissions: contents: read pull-requests: write jobs: - # Job 1: Always validate graders against reference solutions + 
# Job 1: Validate graders against reference solutions validate_graders: runs-on: ubuntu-latest timeout-minutes: 10 @@ -52,9 +38,8 @@ jobs: path: evals/results/ retention-days: 30 - # Job 2: Run evals against an AI agent (copilot or gemini) - # Triggered manually via workflow_dispatch, or can be called from other workflows - agent_eval: + # Job 2: Run evals against the Copilot agent + agent_eval_copilot: runs-on: ubuntu-latest timeout-minutes: 60 @@ -68,35 +53,57 @@ jobs: node-version: '22' - name: Install Copilot CLI - if: inputs.agent == 'copilot' run: npm install -g @github/copilot + - name: Run eval against Copilot + working-directory: evals + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: bash run-eval.sh --all --agent copilot + + - name: Upload Copilot eval results + if: always() + uses: actions/upload-artifact@v4 + with: + name: skill-eval-agent-copilot-results + path: evals/results/ + retention-days: 30 + + # Job 3: Run evals against the Gemini agent + agent_eval_gemini: + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '22' + - name: Install Gemini CLI - if: inputs.agent == 'gemini' run: npm install -g @google/gemini-cli - - name: Run agent-based eval + - name: Run eval against Gemini working-directory: evals env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} - run: | - bash run-eval.sh --all \ - --agent ${{ inputs.agent }} \ - --trials ${{ inputs.trials || '1' }} + run: bash run-eval.sh --all --agent gemini - - name: Upload agent eval results + - name: Upload Gemini eval results if: always() uses: actions/upload-artifact@v4 with: - name: skill-eval-agent-${{ inputs.agent }}-results + name: skill-eval-agent-gemini-results path: evals/results/ retention-days: 30 - # Job 3: Post summary comment on PRs + # Job 4: Post combined summary comment on PRs post_summary: 
if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false - needs: [validate_graders] + needs: [validate_graders, agent_eval_copilot, agent_eval_gemini] runs-on: ubuntu-latest steps: @@ -104,7 +111,22 @@ jobs: uses: actions/download-artifact@v4 with: name: skill-eval-validation-results - path: evals/results/ + path: evals/results/validation + continue-on-error: true + + - name: Download Copilot results + uses: actions/download-artifact@v4 + with: + name: skill-eval-agent-copilot-results + path: evals/results/copilot + continue-on-error: true + + - name: Download Gemini results + uses: actions/download-artifact@v4 + with: + name: skill-eval-agent-gemini-results + path: evals/results/gemini + continue-on-error: true - name: Post summary comment uses: actions/github-script@v7 @@ -113,40 +135,69 @@ jobs: const fs = require('fs'); const path = require('path'); - const resultsDir = 'evals/results'; - let summary = '## 📊 Skill Eval Results\n\n'; - - try { - const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json') && f !== 'baseline.json'); - if (files.length === 0) { - summary += '> ⚠️ No eval results found. The eval run may have failed.\n'; - } else { - summary += '| Task | Agent | Pass Rate | pass@k | Status |\n'; - summary += '|---|---|---|---|---|\n'; - + function readResults(dir) { + const results = []; + try { + if (!fs.existsSync(dir)) return results; + const files = fs.readdirSync(dir).filter(f => f.endsWith('.json') && f !== 'baseline.json'); for (const file of files) { try { - const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8')); - const taskName = data.task || file.replace('.json', ''); - const agent = data.agent || 'reference'; - const passRate = data.passRate != null ? `${(data.passRate * 100).toFixed(0)}%` : 'N/A'; - const passAtK = data.passAtK != null ? `${(data.passAtK * 100).toFixed(0)}%` : 'N/A'; - const status = data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? 
'⚠️' : '❌'; - summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`; + results.push(JSON.parse(fs.readFileSync(path.join(dir, file), 'utf8'))); } catch (e) { - summary += `| ${file} | — | Error | Error | ❌ |\n`; + results.push({ task: file.replace('.json', ''), error: true }); } } + } catch (e) { /* dir doesn't exist */ } + return results; + } + + let summary = '## 📊 Skill Eval Results\n\n'; + + // --- Validation results --- + const validation = readResults('evals/results/validation'); + if (validation.length > 0) { + summary += '### Grader Validation (reference solutions)\n\n'; + summary += '| Task | Pass Rate | Status |\n'; + summary += '|---|---|---|\n'; + for (const r of validation) { + if (r.error) { summary += `| ${r.task} | Error | ❌ |\n`; continue; } + const passRate = r.passRate != null ? `${(r.passRate * 100).toFixed(0)}%` : 'N/A'; + const status = r.passRate >= 1.0 ? '✅' : '❌'; + summary += `| ${r.task} | ${passRate} | ${status} |\n`; + } + summary += '\n'; + } - summary += '\n### Thresholds\n'; - summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n'; - summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n'; - summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n'; + // --- Agent results --- + const copilot = readResults('evals/results/copilot'); + const gemini = readResults('evals/results/gemini'); + + if (copilot.length > 0 || gemini.length > 0) { + summary += '### Agent Evaluation\n\n'; + summary += '| Task | Agent | Pass Rate | pass@k | Status |\n'; + summary += '|---|---|---|---|---|\n'; + + for (const r of [...copilot, ...gemini]) { + if (r.error) { summary += `| ${r.task} | — | Error | Error | ❌ |\n`; continue; } + const taskName = r.task || 'unknown'; + const agent = r.agent || 'unknown'; + const passRate = r.passRate != null ? `${(r.passRate * 100).toFixed(0)}%` : 'N/A'; + const passAtK = r.passAtK != null ? `${(r.passAtK * 100).toFixed(0)}%` : 'N/A'; + const status = r.passAtK >= 0.8 ? 
'✅' : r.passAtK >= 0.6 ? '⚠️' : '❌'; + summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`; } - } catch (e) { - summary += `> ⚠️ Could not read results: ${e.message}\n`; + summary += '\n'; } + if (validation.length === 0 && copilot.length === 0 && gemini.length === 0) { + summary += '> ⚠️ No eval results found. The eval runs may have failed.\n'; + } + + summary += '### Thresholds\n'; + summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n'; + summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n'; + summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n'; + await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, diff --git a/evals/README.md b/evals/README.md index 2c7160237bc..3bdab554e0d 100644 --- a/evals/README.md +++ b/evals/README.md @@ -180,23 +180,18 @@ Following [Anthropic's recommendations](https://www.anthropic.com/engineering/de ## CI Integration -The GitHub Actions workflow at `.github/workflows/skill-eval.yml` provides two -evaluation modes: - -### Automatic (on PR) -Runs on every PR that modifies `skills/**` or `evals/**`: -1. Validates all graders against their reference solutions -2. Uploads results as an artifact -3. Posts a summary comment on the PR - -### Manual (workflow_dispatch) -Triggered manually from the Actions tab to run agent-based evaluation: -1. Select the agent (`copilot` or `gemini`) and number of trials -2. Installs the selected agent CLI -3. Runs all tasks against the agent -4. Uploads results as an artifact - -**Secrets required for agent-based CI:** +The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs +both on PRs (that modify `skills/**` or `evals/**`) and via manual +`workflow_dispatch`. Every run executes three parallel jobs: + +1. **Grader validation** — applies reference solutions, verifies graders score 100% +2. **Copilot agent eval** — installs `@github/copilot`, runs all tasks against Copilot CLI +3. 
**Gemini agent eval** — installs `@google/gemini-cli`, runs all tasks against Gemini CLI + +A fourth summary job collects results from all three and posts a combined +PR comment showing pass rates per task per agent. + +**Secrets required:** - `GITHUB_TOKEN` — automatically available (for Copilot) - `GEMINI_API_KEY` — must be added as a repository secret (for Gemini) From a9da524cf3c1e3b9b1b30d00c9311dee8dafb805 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:06:46 +0000 Subject: [PATCH 17/17] add agent prompt files, switch CI to npm scripts, clean up README Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 6 +-- evals/README.md | 49 +++++++++++-------- evals/run-eval.sh | 14 ++++-- .../component-combo-reactive-form/prompt.md | 28 +++++++++++ evals/tasks/grid-basic-setup/prompt.md | 29 +++++++++++ .../theming-palette-generation/prompt.md | 14 ++++++ 6 files changed, 111 insertions(+), 29 deletions(-) create mode 100644 evals/tasks/component-combo-reactive-form/prompt.md create mode 100644 evals/tasks/grid-basic-setup/prompt.md create mode 100644 evals/tasks/theming-palette-generation/prompt.md diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 95541d6c576..1be42960da9 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -28,7 +28,7 @@ jobs: - name: Validate graders against reference solutions working-directory: evals - run: bash run-eval.sh --all --validate + run: npm run validate - name: Upload validation results if: always() @@ -59,7 +59,7 @@ jobs: working-directory: evals env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: bash run-eval.sh --all --agent copilot + run: npm run agent:copilot - name: Upload Copilot eval results if: always() @@ -90,7 +90,7 @@ jobs: working-directory: evals env: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} - run: bash run-eval.sh 
--all --agent gemini + run: npm run agent:gemini - name: Upload Gemini eval results if: always() diff --git a/evals/README.md b/evals/README.md index 3bdab554e0d..196948924df 100644 --- a/evals/README.md +++ b/evals/README.md @@ -23,8 +23,9 @@ The suite tests three skills: Each task includes: -- **`instruction.md`** — the prompt given to the agent -- **`tests/test.sh`** — deterministic grader (file checks, compilation, lint) +- **`prompt.md`** — the agent prompt sent to the CLI (concise, actionable) +- **`instruction.md`** — human-readable task description (detailed requirements) +- **`tests/test.sh`** — deterministic grader (file checks, import validation, ordering) - **`prompts/quality.md`** — LLM rubric grader (intent routing, API usage) - **`solution/solve.sh`** — reference solution for baseline validation - **`environment/Dockerfile`** — isolated environment for agent execution @@ -54,31 +55,34 @@ confirm the grader scores 100%. Use this to catch grader regressions. cd evals # Validate all tasks -bash run-eval.sh --all --validate +npm run validate # Validate a single task -bash run-eval.sh grid-basic-setup --validate +npm run validate:grid +npm run validate:combo +npm run validate:theming ``` ### Run evals against an AI agent -Send the `instruction.md` to a coding agent CLI, let the agent generate code +Send the `prompt.md` to a coding agent CLI, let the agent generate code in an isolated workspace, then run the deterministic grader on the output. 
 ```bash
 cd evals
 
 # Run all tasks with GitHub Copilot CLI
-bash run-eval.sh --all --agent copilot
+npm run agent:copilot
 
-# Run a single task with Gemini CLI
-bash run-eval.sh grid-basic-setup --agent gemini
+# Run all tasks with Gemini CLI
+npm run agent:gemini
 
-# Run 3 trials per task for statistical robustness
-bash run-eval.sh --all --agent copilot --trials 3
+# Run a single task with a specific agent
+npm run agent:copilot:grid
+npm run agent:gemini:theming
 ```
 
-### npm scripts (convenience wrappers)
+### All npm scripts
 
 ```bash
 cd evals
@@ -134,7 +138,8 @@ To switch the default agent, change `defaultAgent`.
 
 ```
 tasks/<task-name>/
   ├── task.toml              # Config: grader metadata, weights, timeouts
-  ├── instruction.md         # Agent prompt
+  ├── prompt.md              # Agent prompt (sent to CLI agents)
+  ├── instruction.md         # Human-readable task description
   ├── environment/Dockerfile # Container setup (for future Docker-based runs)
   ├── tests/test.sh          # Deterministic grader
   ├── prompts/quality.md     # LLM rubric grader
@@ -143,29 +148,31 @@ To switch the default agent, change `defaultAgent`.
        └── <skill-name>/SKILL.md
 ```
 
-2. Write a clear, unambiguous `instruction.md` that tells the agent exactly what
-   to build.
+2. Write a clear, unambiguous `instruction.md` with full task requirements.
 
-3. Write `tests/test.sh` to check **outcomes** (files exist, correct selectors
+3. Write a concise `prompt.md` that is sent directly to the agent CLI. This
+   should be a focused, actionable prompt derived from the instruction.
+
+4. Write `tests/test.sh` to check **outcomes** (files exist, correct selectors
    and entry-point imports are present, correct API call ordering) rather than
    specific steps. The grader must write a reward (0.0–1.0) to
    `logs/verifier/reward.txt`.
 
-4. Write `prompts/quality.md` with rubric dimensions that sum to 1.0.
+5. Write `prompts/quality.md` with rubric dimensions that sum to 1.0.
 
-5. Write `solution/solve.sh` — a shell script that proves the task is solvable
+6. 
Write `solution/solve.sh` — a shell script that proves the task is solvable
    and validates that the graders work correctly.
 
-6. Validate graders before submitting:
+7. Validate graders before submitting:
 
    ```bash
-   bash run-eval.sh <task-name> --validate
+   npm run validate:<task>
    ```
 
-7. Test against at least one agent:
+8. Test against at least one agent:
 
    ```bash
-   bash run-eval.sh <task-name> --agent copilot
+   npm run agent:copilot:<task>
    ```
 
 ## Pass / Fail Thresholds
diff --git a/evals/run-eval.sh b/evals/run-eval.sh
index 348788dc861..9d7f026dca3 100755
--- a/evals/run-eval.sh
+++ b/evals/run-eval.sh
@@ -99,14 +99,18 @@ run_agent_task() {
   local WORK_DIR="$2"
   local AGENT_NAME="$3"
 
-  local INSTRUCTION_FILE="$TASK_DIR/instruction.md"
-  if [ ! -f "$INSTRUCTION_FILE" ]; then
-    echo "ERROR: No instruction.md found at $INSTRUCTION_FILE" >&2
+  # Prefer prompt.md (agent-oriented prompt) over instruction.md (human-oriented task description)
+  local PROMPT_FILE="$TASK_DIR/prompt.md"
+  if [ ! -f "$PROMPT_FILE" ]; then
+    PROMPT_FILE="$TASK_DIR/instruction.md"
+  fi
+  if [ ! 
-f "$PROMPT_FILE" ]; then + echo "ERROR: No prompt.md or instruction.md found in $TASK_DIR" >&2 return 1 fi local PROMPT - PROMPT=$(cat "$INSTRUCTION_FILE") + PROMPT=$(cat "$PROMPT_FILE") # Build the skill context preamble if skills/ directory exists local SKILL_CONTEXT="" @@ -118,7 +122,7 @@ run_agent_task() { done fi - # Combine skill context + instruction into a single prompt + # Combine skill context + prompt into a single agent instruction local FULL_PROMPT="" if [ -n "$SKILL_CONTEXT" ]; then FULL_PROMPT="Use the following skill reference when completing the task:\n\n${SKILL_CONTEXT}---\n\n${PROMPT}" diff --git a/evals/tasks/component-combo-reactive-form/prompt.md b/evals/tasks/component-combo-reactive-form/prompt.md new file mode 100644 index 00000000000..299eb44b413 --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/prompt.md @@ -0,0 +1,28 @@ +# Agent Prompt: Combo with Reactive Form + +You are working in an Angular 20+ project that already has `igniteui-angular` installed. + +Create a `UserSettingsComponent` at `src/app/user-settings/user-settings.component.ts` with a reactive form containing a multi-select combo for notification channel selection. 
+ +Use this data: + +```typescript +channels = [ + { id: 1, name: 'Email', icon: 'email' }, + { id: 2, name: 'SMS', icon: 'sms' }, + { id: 3, name: 'Push Notification', icon: 'notifications' }, + { id: 4, name: 'Slack', icon: 'chat' }, + { id: 5, name: 'Microsoft Teams', icon: 'groups' }, +]; +``` + +Requirements: +- Use the Ignite UI for Angular `igx-combo` component (NOT igx-select, native select, or mat-select) +- Bind the combo to a `notificationChannels` FormControl inside a FormGroup +- Set displayKey to 'name' and valueKey to 'id' +- Add required validation (at least one channel must be selected) +- Add a submit button disabled when form is invalid +- Import IgxComboComponent from the `igniteui-angular/combo` entry point (not the root barrel) +- Import ReactiveFormsModule for form support +- Component must be standalone with ChangeDetectionStrategy.OnPush +- Create both a `.ts` file and a `.html` template file diff --git a/evals/tasks/grid-basic-setup/prompt.md b/evals/tasks/grid-basic-setup/prompt.md new file mode 100644 index 00000000000..b019190c21c --- /dev/null +++ b/evals/tasks/grid-basic-setup/prompt.md @@ -0,0 +1,29 @@ +# Agent Prompt: Grid Basic Setup + +You are working in an Angular 20+ project that already has `igniteui-angular` installed. + +Create an `EmployeeListComponent` at `src/app/employee-list/employee-list.component.ts` that shows a data grid with employee data, sorting on all columns, and pagination with 5 items per page. 
+ +Use this flat employee data: + +```typescript +employees = [ + { id: 1, name: 'Alice Johnson', department: 'Engineering', salary: 95000, hireDate: new Date('2020-03-15') }, + { id: 2, name: 'Bob Smith', department: 'Marketing', salary: 72000, hireDate: new Date('2019-07-22') }, + { id: 3, name: 'Carol Davis', department: 'Engineering', salary: 105000, hireDate: new Date('2018-01-10') }, + { id: 4, name: 'David Wilson', department: 'Sales', salary: 68000, hireDate: new Date('2021-11-05') }, + { id: 5, name: 'Eva Martinez', department: 'Engineering', salary: 98000, hireDate: new Date('2020-09-18') }, + { id: 6, name: 'Frank Brown', department: 'Marketing', salary: 75000, hireDate: new Date('2017-04-30') }, + { id: 7, name: 'Grace Lee', department: 'Sales', salary: 82000, hireDate: new Date('2019-12-01') }, + { id: 8, name: 'Henry Taylor', department: 'Engineering', salary: 110000, hireDate: new Date('2016-06-14') }, +]; +``` + +Requirements: +- Use the Ignite UI for Angular `igx-grid` component (NOT tree-grid, hierarchical-grid, or pivot-grid) +- Display columns: id, name, department, salary, hireDate +- Enable sorting on all columns +- Add a paginator with page size of 5 +- Import from the `igniteui-angular/grids/grid` entry point (not the root barrel) +- Component must be standalone with ChangeDetectionStrategy.OnPush +- Create both a `.ts` file and a `.html` template file diff --git a/evals/tasks/theming-palette-generation/prompt.md b/evals/tasks/theming-palette-generation/prompt.md new file mode 100644 index 00000000000..a665e386564 --- /dev/null +++ b/evals/tasks/theming-palette-generation/prompt.md @@ -0,0 +1,14 @@ +# Agent Prompt: Custom Branded Theme + +You are working in an Angular 20+ project that already has `igniteui-angular` installed with Sass support. + +Create a custom Ignite UI for Angular theme in `src/styles.scss` with a blue primary and orange secondary palette. 
+ +Requirements: +- Import from `igniteui-angular/theming` using `@use` syntax +- Create a palette with primary #1976D2, secondary #FF9800, and a light surface color +- Configure typography with a sans-serif font family +- Call `@include core()` BEFORE `@include theme()` +- Pass the palette to the `theme()` mixin +- Use the `palette()` function (do NOT hardcode CSS custom properties) +- Use `@use` module syntax (not deprecated `@import`)