From 5b7cca0ebebb4e4e018fddb4f94ca949ff3ea810 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 07:05:56 +0000 Subject: [PATCH 01/17] Initial plan From 23aecf0adb50c983c55667e57baffbecfb2deca2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 07:12:41 +0000 Subject: [PATCH 02/17] feat: scaffold eval test suite with three skill tasks and CI workflow Add automated eval test suite for Angular Skills using the skill-eval framework: - grid-basic-setup task (igniteui-angular-grids skill) - component-combo-reactive-form task (igniteui-angular-components skill) - theming-palette-generation task (igniteui-angular-theming skill) - GitHub Actions workflow for CI integration - Baseline results JSON for regression comparison - README with local run and task authoring docs Co-authored-by: zdrawku <11193764+zdrawku@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 88 ++++++++++ .gitignore | 5 + evals/README.md | 155 ++++++++++++++++++ evals/package.json | 21 +++ evals/results/baseline.json | 36 ++++ .../environment/Dockerfile | 17 ++ .../instruction.md | 40 +++++ .../prompts/quality.md | 28 ++++ .../igniteui-angular-components/SKILL.md | 1 + .../solution/solve.sh | 55 +++++++ .../component-combo-reactive-form/task.toml | 26 +++ .../tests/test.sh | 100 +++++++++++ .../grid-basic-setup/environment/Dockerfile | 17 ++ evals/tasks/grid-basic-setup/instruction.md | 36 ++++ .../tasks/grid-basic-setup/prompts/quality.md | 25 +++ .../skills/igniteui-angular-grids/SKILL.md | 1 + .../tasks/grid-basic-setup/solution/solve.sh | 45 +++++ evals/tasks/grid-basic-setup/task.toml | 26 +++ evals/tasks/grid-basic-setup/tests/test.sh | 99 +++++++++++ .../environment/Dockerfile | 17 ++ .../theming-palette-generation/instruction.md | 27 +++ .../prompts/quality.md | 27 +++ .../skills/igniteui-angular-theming/SKILL.md | 1 + .../solution/solve.sh 
| 27 +++ .../theming-palette-generation/task.toml | 26 +++ .../theming-palette-generation/tests/test.sh | 82 +++++++++ 26 files changed, 1028 insertions(+) create mode 100644 .github/workflows/skill-eval.yml create mode 100644 evals/README.md create mode 100644 evals/package.json create mode 100644 evals/results/baseline.json create mode 100644 evals/tasks/component-combo-reactive-form/environment/Dockerfile create mode 100644 evals/tasks/component-combo-reactive-form/instruction.md create mode 100644 evals/tasks/component-combo-reactive-form/prompts/quality.md create mode 120000 evals/tasks/component-combo-reactive-form/skills/igniteui-angular-components/SKILL.md create mode 100755 evals/tasks/component-combo-reactive-form/solution/solve.sh create mode 100644 evals/tasks/component-combo-reactive-form/task.toml create mode 100755 evals/tasks/component-combo-reactive-form/tests/test.sh create mode 100644 evals/tasks/grid-basic-setup/environment/Dockerfile create mode 100644 evals/tasks/grid-basic-setup/instruction.md create mode 100644 evals/tasks/grid-basic-setup/prompts/quality.md create mode 120000 evals/tasks/grid-basic-setup/skills/igniteui-angular-grids/SKILL.md create mode 100755 evals/tasks/grid-basic-setup/solution/solve.sh create mode 100644 evals/tasks/grid-basic-setup/task.toml create mode 100755 evals/tasks/grid-basic-setup/tests/test.sh create mode 100644 evals/tasks/theming-palette-generation/environment/Dockerfile create mode 100644 evals/tasks/theming-palette-generation/instruction.md create mode 100644 evals/tasks/theming-palette-generation/prompts/quality.md create mode 120000 evals/tasks/theming-palette-generation/skills/igniteui-angular-theming/SKILL.md create mode 100755 evals/tasks/theming-palette-generation/solution/solve.sh create mode 100644 evals/tasks/theming-palette-generation/task.toml create mode 100755 evals/tasks/theming-palette-generation/tests/test.sh diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml 
new file mode 100644 index 00000000000..7e172c404a6 --- /dev/null +++ b/.github/workflows/skill-eval.yml @@ -0,0 +1,88 @@ +name: Skill Eval + +on: + pull_request: + paths: + - 'skills/**' + - 'evals/**' + +jobs: + eval: + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Install eval dependencies + working-directory: evals + run: npm install + + - name: Run skill evals + working-directory: evals + run: npx skill-eval _ --suite=all --trials=5 + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: skill-eval-results + path: evals/results/ + retention-days: 30 + + - name: Post summary comment + if: always() && github.event_name == 'pull_request' + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const path = require('path'); + + const resultsDir = 'evals/results'; + let summary = '## 📊 Skill Eval Results\n\n'; + + try { + const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json')); + if (files.length === 0) { + summary += '> ⚠️ No eval results found. The eval run may have failed.\n'; + } else { + summary += '| Task | Pass Rate | pass@5 | Status |\n'; + summary += '|---|---|---|---|\n'; + + for (const file of files) { + try { + const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8')); + const taskName = data.task || file.replace('.json', ''); + const passRate = data.passRate != null ? `${(data.passRate * 100).toFixed(0)}%` : 'N/A'; + const passAtK = data.passAtK != null ? `${(data.passAtK * 100).toFixed(0)}%` : 'N/A'; + const status = data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? 
'⚠️' : '❌'; + summary += `| ${taskName} | ${passRate} | ${passAtK} | ${status} |\n`; + } catch (e) { + summary += `| ${file} | Error | Error | ❌ |\n`; + } + } + + summary += '\n### Thresholds\n'; + summary += '- ✅ `pass@5 ≥ 80%` — merge gate passed\n'; + summary += '- ⚠️ `pass@5 ≥ 60%` — needs investigation\n'; + summary += '- ❌ `pass@5 < 60%` — blocks merge for affected skill\n'; + } + } catch (e) { + summary += `> ⚠️ Could not read results: ${e.message}\n`; + } + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: summary, + }); diff --git a/.gitignore b/.gitignore index a4542ab1403..b0820022330 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,8 @@ extras/docs/themes/sassdoc/sassdoc/* # Localization sources i18nRepo + +# Eval artifacts (keep baseline results) +evals/node_modules +evals/results/*.json +!evals/results/baseline.json diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 00000000000..631ea8741c9 --- /dev/null +++ b/evals/README.md @@ -0,0 +1,155 @@ +# Ignite UI for Angular — Skill Evals + +Automated evaluation suite for the Ignite UI for Angular agent skills. Uses the +[skill-eval](https://github.com/mgechev/skill-eval) framework to measure skill +quality, detect regressions, and gate merges. 
+ +## Overview + +The suite tests three skills: + +| Skill | Task ID | What it tests | +|---|---|---| +| `igniteui-angular-grids` | `grid-basic-setup` | Flat grid with sorting and pagination on flat employee data | +| `igniteui-angular-components` | `component-combo-reactive-form` | Multi-select combo bound to a reactive form control | +| `igniteui-angular-theming` | `theming-palette-generation` | Custom branded palette with `palette()` and `theme()` | + +Each task includes: + +- **`instruction.md`** — the prompt given to the agent +- **`tests/test.sh`** — deterministic grader (file checks, compilation, lint) +- **`prompts/quality.md`** — LLM rubric grader (intent routing, API usage) +- **`solution/solve.sh`** — reference solution for baseline validation +- **`environment/Dockerfile`** — isolated environment for agent execution +- **`skills/`** — symlinked or copied skill files under test + +## Prerequisites + +- Node.js 20+ +- Docker (for isolated agent execution) +- An API key for the agent provider (Gemini or Anthropic) + +## Running Evals Locally + +### Install dependencies + +```bash +cd evals +npm install +``` + +### Run a single task + +```bash +# Gemini (default) +GEMINI_API_KEY=your-key npm run eval -- grid-basic-setup + +# Claude +ANTHROPIC_API_KEY=your-key npm run eval -- grid-basic-setup --agent=claude +``` + +### Run all tasks + +```bash +GEMINI_API_KEY=your-key npm run eval:all +``` + +### Options + +```bash +# Adjust trials (default: 5) +npm run eval -- grid-basic-setup --trials=5 + +# Run locally without Docker +npm run eval -- grid-basic-setup --provider=local + +# Validate graders against the reference solution +npm run eval -- grid-basic-setup --validate --provider=local + +# Run multiple trials in parallel +npm run eval -- grid-basic-setup --parallel=3 +``` + +### Preview results + +```bash +# CLI report +npm run preview + +# Web UI at http://localhost:3847 +npm run preview:browser +``` + +## Adding a New Task + +1. 
Create a directory under `evals/tasks//` with the standard structure: + + ``` + tasks// + ├── task.toml # Config: graders, timeouts, resource limits + ├── instruction.md # Agent prompt + ├── environment/Dockerfile # Container setup + ├── tests/test.sh # Deterministic grader + ├── prompts/quality.md # LLM rubric grader + ├── solution/solve.sh # Reference solution + └── skills/ # Skill files under test + └── /SKILL.md + ``` + +2. Write a clear, unambiguous `instruction.md` that tells the agent exactly what + to build. + +3. Write `tests/test.sh` to check **outcomes** (files exist, project compiles, + correct selectors are present) rather than specific steps. + +4. Write `prompts/quality.md` with rubric dimensions that sum to 1.0. + +5. Write `solution/solve.sh` — a shell script that proves the task is solvable + and validates that the graders work correctly. + +6. Validate graders before submitting: + + ```bash + npm run eval -- --validate --provider=local + ``` + +## Pass / Fail Thresholds + +Following [Anthropic's recommendations](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents): + +| Metric | Threshold | Effect | +|---|---|---| +| `pass@5 ≥ 80%` | **Merge gate** | At least 1 success in 5 trials required | +| `pass^5 ≥ 60%` | **Tracked** | Flags flaky skills for investigation | +| `pass@5 < 60%` | **Blocks merge** | On PRs touching the relevant skill | + +## CI Integration + +The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs +automatically on PRs that modify `skills/**` or `evals/**`. It: + +1. Checks out the repo +2. Installs eval dependencies +3. Runs all tasks with 5 trials +4. Uploads results as an artifact +5. 
Posts a summary comment on the PR + +## Grading Strategy + +**Deterministic grader (60% weight)** — checks: +- Project builds without errors +- Correct Ignite UI selector is present in the generated template +- Required imports exist +- No use of forbidden alternatives + +**LLM rubric grader (40% weight)** — evaluates: +- Correct intent routing +- Idiomatic API usage +- Absence of hallucinated APIs +- Following the skill's guidance + +## Results + +Baseline results are stored in `evals/results/baseline.json` and used for +regression comparison on PRs. The CI workflow uploads per-run results as +GitHub Actions artifacts. diff --git a/evals/package.json b/evals/package.json new file mode 100644 index 00000000000..9a945614306 --- /dev/null +++ b/evals/package.json @@ -0,0 +1,21 @@ +{ + "name": "igniteui-angular-skill-evals", + "version": "1.0.0", + "description": "Evaluation suite for Ignite UI for Angular agent skills", + "private": true, + "scripts": { + "eval": "npx skill-eval", + "eval:grid": "npx skill-eval grid-basic-setup", + "eval:combo": "npx skill-eval component-combo-reactive-form", + "eval:theming": "npx skill-eval theming-palette-generation", + "eval:all": "npx skill-eval _ --suite=all", + "preview": "npx skill-eval preview", + "preview:browser": "npx skill-eval preview browser" + }, + "dependencies": { + "skill-eval": "^1.0.0" + }, + "engines": { + "node": ">=20.0.0" + } +} diff --git a/evals/results/baseline.json b/evals/results/baseline.json new file mode 100644 index 00000000000..0bdcc9d6469 --- /dev/null +++ b/evals/results/baseline.json @@ -0,0 +1,36 @@ +{ + "generated_at": "2026-03-08T07:00:00.000Z", + "framework_version": "1.0.0", + "description": "Initial baseline results for skill evals. 
Actual scores will be populated after the first full eval run with an API key.", + "thresholds": { + "pass_at_5_merge_gate": 0.8, + "pass_at_5_block": 0.6, + "pass_pow_5_tracked": 0.6 + }, + "tasks": { + "grid-basic-setup": { + "skill": "igniteui-angular-grids", + "trials": 5, + "pass_rate": null, + "pass_at_5": null, + "pass_pow_5": null, + "status": "pending_first_run" + }, + "component-combo-reactive-form": { + "skill": "igniteui-angular-components", + "trials": 5, + "pass_rate": null, + "pass_at_5": null, + "pass_pow_5": null, + "status": "pending_first_run" + }, + "theming-palette-generation": { + "skill": "igniteui-angular-theming", + "trials": 5, + "pass_rate": null, + "pass_at_5": null, + "pass_pow_5": null, + "status": "pending_first_run" + } + } +} diff --git a/evals/tasks/component-combo-reactive-form/environment/Dockerfile b/evals/tasks/component-combo-reactive-form/environment/Dockerfile new file mode 100644 index 00000000000..4cfd43a762c --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/environment/Dockerfile @@ -0,0 +1,17 @@ +FROM node:20-slim + +WORKDIR /workspace + +RUN npm install -g @angular/cli@latest + +RUN ng new eval-app --skip-git --skip-install --style=scss --ssr=false && \ + cd eval-app && \ + npm install && \ + npm install igniteui-angular + +WORKDIR /workspace/eval-app + +COPY . . + +RUN mkdir -p logs/verifier +CMD ["bash"] diff --git a/evals/tasks/component-combo-reactive-form/instruction.md b/evals/tasks/component-combo-reactive-form/instruction.md new file mode 100644 index 00000000000..9e02aba05c5 --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/instruction.md @@ -0,0 +1,40 @@ +# Task: Add a Multi-Select Combo in a Reactive Form + +You are working in an Angular 20+ project that already has `igniteui-angular` installed and a theme applied. + +## Requirements + +Create a `UserSettingsComponent` with a reactive form that includes a multi-select combo for choosing notification channels. + +1. 
**Component location**: `src/app/user-settings/user-settings.component.ts` (with its template) + +2. **Form structure**: Create a reactive form (`FormGroup`) with a `notificationChannels` control + +3. **Data source**: Use the following list of notification channels: + + ```typescript + channels = [ + { id: 1, name: 'Email', icon: 'email' }, + { id: 2, name: 'SMS', icon: 'sms' }, + { id: 3, name: 'Push Notification', icon: 'notifications' }, + { id: 4, name: 'Slack', icon: 'chat' }, + { id: 5, name: 'Microsoft Teams', icon: 'groups' }, + ]; + ``` + +4. **Combo configuration**: + - Use the Ignite UI for Angular Combo component for multi-selection + - Bind it to the `notificationChannels` form control + - Display the `name` field in the dropdown + - Use the `id` field as the value key + +5. **Form validation**: The `notificationChannels` control must be required (at least one channel must be selected) + +6. **Submit button**: Add a submit button that is disabled when the form is invalid + +## Constraints + +- Use the Ignite UI `igx-combo` component — do NOT use a native ``, Angular Material `mat-select`, or other third-party select components? +- Did the agent correctly identify that multi-select requires the Combo component, not the Select component? + +## Skill Routing & Reference File Usage (0–0.3) +- Did the agent read the components skill SKILL.md to identify the correct component? +- Did the agent read `references/form-controls.md` for Combo API details? +- Did the agent follow the mandatory protocol (identify component → read references → produce output)? +- Did the agent avoid writing code from memory without consulting references? + +## Idiomatic API Usage (0–0.25) +- Did the agent bind data using `[data]` input on the combo? +- Did the agent configure `[displayKey]` and `[valueKey]` correctly? +- Did the agent use `[formControlName]` or `[formControl]` to bind to the reactive form? +- Did the agent import from the correct igniteui-angular entry point? 
+- Did the agent import `ReactiveFormsModule` or use standalone form directives? + +## Code Quality (0–0.15) +- Is the component standalone with `ChangeDetectionStrategy.OnPush`? +- Did the agent set up form validation (required validator)? +- Did the agent avoid hallucinated API names or non-existent inputs/outputs? +- Is the code clean, well-structured, and following Angular best practices? diff --git a/evals/tasks/component-combo-reactive-form/skills/igniteui-angular-components/SKILL.md b/evals/tasks/component-combo-reactive-form/skills/igniteui-angular-components/SKILL.md new file mode 120000 index 00000000000..40a2d1a6e84 --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/skills/igniteui-angular-components/SKILL.md @@ -0,0 +1 @@ +../../../../../skills/igniteui-angular-components/SKILL.md \ No newline at end of file diff --git a/evals/tasks/component-combo-reactive-form/solution/solve.sh b/evals/tasks/component-combo-reactive-form/solution/solve.sh new file mode 100755 index 00000000000..6b284457490 --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/solution/solve.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Reference solution for component-combo-reactive-form +# Proves the task is solvable and validates grader correctness + +set -euo pipefail + +mkdir -p src/app/user-settings + +# Create the component TypeScript file +cat > src/app/user-settings/user-settings.component.ts << 'EOF' +import { ChangeDetectionStrategy, Component } from '@angular/core'; +import { FormGroup, FormControl, Validators, ReactiveFormsModule } from '@angular/forms'; +import { IgxComboComponent } from 'igniteui-angular'; + +@Component({ + selector: 'app-user-settings', + templateUrl: './user-settings.component.html', + changeDetection: ChangeDetectionStrategy.OnPush, + imports: [ReactiveFormsModule, IgxComboComponent], +}) +export class UserSettingsComponent { + channels = [ + { id: 1, name: 'Email', icon: 'email' }, + { id: 2, name: 'SMS', icon: 'sms' }, + { id: 3, name: 
'Push Notification', icon: 'notifications' }, + { id: 4, name: 'Slack', icon: 'chat' }, + { id: 5, name: 'Microsoft Teams', icon: 'groups' }, + ]; + + settingsForm = new FormGroup({ + notificationChannels: new FormControl([], Validators.required), + }); + + onSubmit() { + if (this.settingsForm.valid) { + console.log('Selected channels:', this.settingsForm.value.notificationChannels); + } + } +} +EOF + +# Create the template +cat > src/app/user-settings/user-settings.component.html << 'EOF' +
+ + + +
+EOF diff --git a/evals/tasks/component-combo-reactive-form/task.toml b/evals/tasks/component-combo-reactive-form/task.toml new file mode 100644 index 00000000000..111a254676d --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/task.toml @@ -0,0 +1,26 @@ +version = "1.0" + +[metadata] +author_name = "Ignite UI Team" +difficulty = "medium" +category = "component-forms" +tags = ["combo", "reactive-forms", "multi-select", "igx-combo"] + +[agent] +timeout_sec = 600.0 + +[environment] +build_timeout_sec = 300.0 +cpus = 2 +memory_mb = 4096 +storage_mb = 1000 + +[[graders]] +type = "deterministic" +command = "bash tests/test.sh" +weight = 0.6 + +[[graders]] +type = "llm_rubric" +rubric = "prompts/quality.md" +weight = 0.4 diff --git a/evals/tasks/component-combo-reactive-form/tests/test.sh b/evals/tasks/component-combo-reactive-form/tests/test.sh new file mode 100755 index 00000000000..23022ab241b --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/tests/test.sh @@ -0,0 +1,100 @@ +#!/bin/bash +# Deterministic grader for component-combo-reactive-form +# Checks outcomes: correct files exist, correct selectors, reactive form usage + +set -euo pipefail + +mkdir -p logs/verifier + +SCORE=0 +TOTAL=5 +DETAILS="" + +# --- Check 1: Component file exists --- +COMPONENT_FILE=$(find src -name "user-settings.component.ts" 2>/dev/null | head -1) +if [ -n "$COMPONENT_FILE" ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: user-settings.component.ts exists\n" +else + DETAILS="${DETAILS}FAIL: user-settings.component.ts not found\n" +fi + +# --- Check 2: igx-combo selector is present in the template --- +TEMPLATE_FILE=$(find src -name "user-settings.component.html" 2>/dev/null | head -1) +COMBO_FOUND=0 + +if [ -n "${TEMPLATE_FILE:-}" ] && grep -q "igx-combo" "$TEMPLATE_FILE" 2>/dev/null; then + COMBO_FOUND=1 +elif [ -n "${COMPONENT_FILE:-}" ] && grep -q "igx-combo" "$COMPONENT_FILE" 2>/dev/null; then + COMBO_FOUND=1 +fi + +if [ "$COMBO_FOUND" -eq 1 ]; then + 
SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: igx-combo selector found\n" +else + DETAILS="${DETAILS}FAIL: igx-combo selector not found in template\n" +fi + +# --- Check 3: Reactive form usage (FormGroup, FormControl, or formControlName) --- +REACTIVE_FOUND=0 +SEARCH_FILES="" +[ -n "${TEMPLATE_FILE:-}" ] && SEARCH_FILES="$TEMPLATE_FILE" +[ -n "${COMPONENT_FILE:-}" ] && SEARCH_FILES="$SEARCH_FILES $COMPONENT_FILE" + +for f in $SEARCH_FILES; do + if grep -qE 'FormGroup|FormControl|formControlName|formControl|ReactiveFormsModule' "$f" 2>/dev/null; then + REACTIVE_FOUND=1 + break + fi +done + +if [ "$REACTIVE_FOUND" -eq 1 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Reactive form usage found\n" +else + DETAILS="${DETAILS}FAIL: No reactive form usage found\n" +fi + +# --- Check 4: No forbidden alternatives --- +ALL_FILES=$(find src -name "*.ts" -o -name "*.html" 2>/dev/null) +FORBIDDEN=0 +for f in $ALL_FILES; do + if grep -qE ' ].*multiple|mat-select|MatSelectModule|igx-select' "$f" 2>/dev/null; then + FORBIDDEN=1 + break + fi +done + +if [ "$FORBIDDEN" -eq 0 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: No forbidden alternatives found\n" +else + DETAILS="${DETAILS}FAIL: Forbidden alternative (native select, mat-select, igx-select) detected\n" +fi + +# --- Check 5: Correct import from igniteui-angular --- +IMPORT_FOUND=0 +if [ -n "${COMPONENT_FILE:-}" ]; then + if grep -qE "from ['\"]igniteui-angular|from ['\"]@infragistics/igniteui-angular" "$COMPONENT_FILE" 2>/dev/null; then + IMPORT_FOUND=1 + fi +fi + +if [ "$IMPORT_FOUND" -eq 1 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: igniteui-angular import found\n" +else + DETAILS="${DETAILS}FAIL: No igniteui-angular import found\n" +fi + +# --- Calculate reward --- +REWARD=$(echo "scale=2; $SCORE / $TOTAL" | bc) + +echo "$REWARD" > logs/verifier/reward.txt +printf "Score: %d/%d (%.0f%%)\n" "$SCORE" "$TOTAL" "$(echo "$REWARD * 100" | bc)" +printf "$DETAILS" + +if [ "$SCORE" -lt "$TOTAL" 
]; then + exit 1 +fi diff --git a/evals/tasks/grid-basic-setup/environment/Dockerfile b/evals/tasks/grid-basic-setup/environment/Dockerfile new file mode 100644 index 00000000000..4cfd43a762c --- /dev/null +++ b/evals/tasks/grid-basic-setup/environment/Dockerfile @@ -0,0 +1,17 @@ +FROM node:20-slim + +WORKDIR /workspace + +RUN npm install -g @angular/cli@latest + +RUN ng new eval-app --skip-git --skip-install --style=scss --ssr=false && \ + cd eval-app && \ + npm install && \ + npm install igniteui-angular + +WORKDIR /workspace/eval-app + +COPY . . + +RUN mkdir -p logs/verifier +CMD ["bash"] diff --git a/evals/tasks/grid-basic-setup/instruction.md b/evals/tasks/grid-basic-setup/instruction.md new file mode 100644 index 00000000000..3a9880564e5 --- /dev/null +++ b/evals/tasks/grid-basic-setup/instruction.md @@ -0,0 +1,36 @@ +# Task: Add a Data Grid with Sorting and Pagination + +You are working in an Angular 20+ project that already has `igniteui-angular` installed and a theme applied. + +## Requirements + +Add a data grid to the `EmployeeListComponent` that displays employee data with the following features: + +1. 
**Data source**: Use the following flat employee data (add it as a property in the component): + + ```typescript + employees = [ + { id: 1, name: 'Alice Johnson', department: 'Engineering', salary: 95000, hireDate: new Date('2020-03-15') }, + { id: 2, name: 'Bob Smith', department: 'Marketing', salary: 72000, hireDate: new Date('2019-07-22') }, + { id: 3, name: 'Carol Davis', department: 'Engineering', salary: 105000, hireDate: new Date('2018-01-10') }, + { id: 4, name: 'David Wilson', department: 'Sales', salary: 68000, hireDate: new Date('2021-11-05') }, + { id: 5, name: 'Eva Martinez', department: 'Engineering', salary: 98000, hireDate: new Date('2020-09-18') }, + { id: 6, name: 'Frank Brown', department: 'Marketing', salary: 75000, hireDate: new Date('2017-04-30') }, + { id: 7, name: 'Grace Lee', department: 'Sales', salary: 82000, hireDate: new Date('2019-12-01') }, + { id: 8, name: 'Henry Taylor', department: 'Engineering', salary: 110000, hireDate: new Date('2016-06-14') }, + ]; + ``` + +2. **Columns**: Display all fields — `id`, `name`, `department`, `salary`, `hireDate` + +3. **Sorting**: Enable sorting on all columns + +4. **Pagination**: Add a paginator with a page size of 5 + +5. **Component**: Create or edit the file at `src/app/employee-list/employee-list.component.ts` (with its template and styles) + +## Constraints + +- Use the Ignite UI for Angular `igx-grid` component — do NOT use a native HTML ``, Angular Material table, or any other grid library. +- Import from the correct `igniteui-angular` entry point. +- The component must be standalone and use `ChangeDetectionStrategy.OnPush`. diff --git a/evals/tasks/grid-basic-setup/prompts/quality.md b/evals/tasks/grid-basic-setup/prompts/quality.md new file mode 100644 index 00000000000..fc65eede86f --- /dev/null +++ b/evals/tasks/grid-basic-setup/prompts/quality.md @@ -0,0 +1,25 @@ +# Grid Basic Setup — LLM Rubric + +Evaluate the agent's approach to adding a flat data grid with sorting and pagination. 
+ +## Correct Grid Type Selection (0–0.3) +- Did the agent choose `igx-grid` (Flat Grid) for the flat employee data? +- Did the agent avoid `igx-tree-grid`, `igx-hierarchical-grid`, or `igx-pivot-grid` — which are wrong for flat, non-hierarchical data? +- Did the agent avoid native HTML `
`, Angular Material `mat-table`, or other third-party grids? + +## Skill Routing & Reference File Usage (0–0.3) +- Did the agent read the grids skill SKILL.md to identify the correct grid type? +- Did the agent read the relevant reference files (`structure.md` for columns/sorting, `paging-remote.md` for pagination) before writing code? +- Did the agent follow the mandatory protocol (identify grid type → read references → produce output)? + +## Idiomatic API Usage (0–0.25) +- Did the agent bind data correctly using the `[data]` input? +- Did the agent use `igx-column` elements with correct `[field]` bindings for each data field? +- Did the agent enable sorting correctly (e.g., `[sortable]="true"` on columns or grid-level `[allowSorting]`)? +- Did the agent import from the correct entry point (`igniteui-angular/grids/grid`)? +- Did the agent use `IGX_GRID_DIRECTIVES` or individual component imports? + +## Code Quality (0–0.15) +- Is the component standalone with `ChangeDetectionStrategy.OnPush`? +- Did the agent avoid hallucinated API names or non-existent inputs/outputs? +- Is the code clean and well-structured? 
diff --git a/evals/tasks/grid-basic-setup/skills/igniteui-angular-grids/SKILL.md b/evals/tasks/grid-basic-setup/skills/igniteui-angular-grids/SKILL.md new file mode 120000 index 00000000000..0ba573d65d2 --- /dev/null +++ b/evals/tasks/grid-basic-setup/skills/igniteui-angular-grids/SKILL.md @@ -0,0 +1 @@ +../../../../../skills/igniteui-angular-grids/SKILL.md \ No newline at end of file diff --git a/evals/tasks/grid-basic-setup/solution/solve.sh b/evals/tasks/grid-basic-setup/solution/solve.sh new file mode 100755 index 00000000000..0466ce7d623 --- /dev/null +++ b/evals/tasks/grid-basic-setup/solution/solve.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Reference solution for grid-basic-setup +# Proves the task is solvable and validates grader correctness + +set -euo pipefail + +mkdir -p src/app/employee-list + +# Create the component TypeScript file +cat > src/app/employee-list/employee-list.component.ts << 'EOF' +import { ChangeDetectionStrategy, Component } from '@angular/core'; +import { IGX_GRID_DIRECTIVES } from 'igniteui-angular/grids/grid'; +import { IgxPaginatorComponent } from 'igniteui-angular/grids/grid'; + +@Component({ + selector: 'app-employee-list', + templateUrl: './employee-list.component.html', + changeDetection: ChangeDetectionStrategy.OnPush, + imports: [IGX_GRID_DIRECTIVES, IgxPaginatorComponent], +}) +export class EmployeeListComponent { + employees = [ + { id: 1, name: 'Alice Johnson', department: 'Engineering', salary: 95000, hireDate: new Date('2020-03-15') }, + { id: 2, name: 'Bob Smith', department: 'Marketing', salary: 72000, hireDate: new Date('2019-07-22') }, + { id: 3, name: 'Carol Davis', department: 'Engineering', salary: 105000, hireDate: new Date('2018-01-10') }, + { id: 4, name: 'David Wilson', department: 'Sales', salary: 68000, hireDate: new Date('2021-11-05') }, + { id: 5, name: 'Eva Martinez', department: 'Engineering', salary: 98000, hireDate: new Date('2020-09-18') }, + { id: 6, name: 'Frank Brown', department: 'Marketing', salary: 
75000, hireDate: new Date('2017-04-30') }, + { id: 7, name: 'Grace Lee', department: 'Sales', salary: 82000, hireDate: new Date('2019-12-01') }, + { id: 8, name: 'Henry Taylor', department: 'Engineering', salary: 110000, hireDate: new Date('2016-06-14') }, + ]; +} +EOF + +# Create the template +cat > src/app/employee-list/employee-list.component.html << 'EOF' + + + + + + + + +EOF diff --git a/evals/tasks/grid-basic-setup/task.toml b/evals/tasks/grid-basic-setup/task.toml new file mode 100644 index 00000000000..07e25fdd0aa --- /dev/null +++ b/evals/tasks/grid-basic-setup/task.toml @@ -0,0 +1,26 @@ +version = "1.0" + +[metadata] +author_name = "Ignite UI Team" +difficulty = "medium" +category = "grid-setup" +tags = ["grid", "flat-grid", "sorting", "pagination", "igx-grid"] + +[agent] +timeout_sec = 600.0 + +[environment] +build_timeout_sec = 300.0 +cpus = 2 +memory_mb = 4096 +storage_mb = 1000 + +[[graders]] +type = "deterministic" +command = "bash tests/test.sh" +weight = 0.6 + +[[graders]] +type = "llm_rubric" +rubric = "prompts/quality.md" +weight = 0.4 diff --git a/evals/tasks/grid-basic-setup/tests/test.sh b/evals/tasks/grid-basic-setup/tests/test.sh new file mode 100755 index 00000000000..455c56ae96a --- /dev/null +++ b/evals/tasks/grid-basic-setup/tests/test.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Deterministic grader for grid-basic-setup +# Checks outcomes: correct files exist, project compiles, correct selectors used + +set -euo pipefail + +mkdir -p logs/verifier + +SCORE=0 +TOTAL=5 +DETAILS="" + +# --- Check 1: Component file exists --- +COMPONENT_FILE=$(find src -name "employee-list.component.ts" 2>/dev/null | head -1) +if [ -n "$COMPONENT_FILE" ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: employee-list.component.ts exists\n" +else + DETAILS="${DETAILS}FAIL: employee-list.component.ts not found\n" +fi + +# --- Check 2: igx-grid selector is present in the template --- +TEMPLATE_FILE=$(find src -name "employee-list.component.html" 2>/dev/null | head 
-1) +INLINE_TEMPLATE="" +if [ -z "$TEMPLATE_FILE" ] && [ -n "$COMPONENT_FILE" ]; then + # Check for inline template + INLINE_TEMPLATE=$(grep -l "igx-grid" "$COMPONENT_FILE" 2>/dev/null || true) +fi + +if [ -n "$TEMPLATE_FILE" ] && grep -q "igx-grid" "$TEMPLATE_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: igx-grid selector found in template\n" +elif [ -n "$INLINE_TEMPLATE" ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: igx-grid selector found in inline template\n" +else + DETAILS="${DETAILS}FAIL: igx-grid selector not found in template\n" +fi + +# --- Check 3: Correct import from igniteui-angular entry point --- +if [ -n "$COMPONENT_FILE" ]; then + if grep -qE "from ['\"]igniteui-angular/grids/grid['\"]|from ['\"]@infragistics/igniteui-angular/grids/grid['\"]" "$COMPONENT_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Correct grid entry-point import found\n" + else + DETAILS="${DETAILS}FAIL: Missing import from igniteui-angular/grids/grid entry point\n" + fi +else + DETAILS="${DETAILS}FAIL: Cannot check imports — component file not found\n" +fi + +# --- Check 4: No forbidden alternatives --- +ALL_TS_FILES=$(find src -name "*.ts" -o -name "*.html" 2>/dev/null) +FORBIDDEN=0 +for f in $ALL_TS_FILES; do + # Check for native table, Angular Material table, or other grid libs + if grep -qE ' ]|MatTableModule|mat-table|ag-grid|kendo-grid' "$f" 2>/dev/null; then + FORBIDDEN=1 + break + fi +done + +if [ "$FORBIDDEN" -eq 0 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: No forbidden alternatives found\n" +else + DETAILS="${DETAILS}FAIL: Forbidden alternative (native table, Material table, etc.) 
detected\n" +fi + +# --- Check 5: Pagination is configured --- +PAGING_FOUND=0 +SEARCH_FILES="" +[ -n "$TEMPLATE_FILE" ] && SEARCH_FILES="$TEMPLATE_FILE" +[ -n "$COMPONENT_FILE" ] && SEARCH_FILES="$SEARCH_FILES $COMPONENT_FILE" + +for f in $SEARCH_FILES; do + if grep -qE 'igx-paginator|IgxPaginatorComponent|paging|perPage|\[perPage\]' "$f" 2>/dev/null; then + PAGING_FOUND=1 + break + fi +done + +if [ "$PAGING_FOUND" -eq 1 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Pagination configuration found\n" +else + DETAILS="${DETAILS}FAIL: No pagination configuration found\n" +fi + +# --- Calculate reward --- +REWARD=$(echo "scale=2; $SCORE / $TOTAL" | bc) + +echo "$REWARD" > logs/verifier/reward.txt +printf "Score: %d/%d (%.0f%%)\n" "$SCORE" "$TOTAL" "$(echo "$REWARD * 100" | bc)" +printf "$DETAILS" + +if [ "$SCORE" -lt "$TOTAL" ]; then + exit 1 +fi diff --git a/evals/tasks/theming-palette-generation/environment/Dockerfile b/evals/tasks/theming-palette-generation/environment/Dockerfile new file mode 100644 index 00000000000..4cfd43a762c --- /dev/null +++ b/evals/tasks/theming-palette-generation/environment/Dockerfile @@ -0,0 +1,17 @@ +FROM node:20-slim + +WORKDIR /workspace + +RUN npm install -g @angular/cli@latest + +RUN ng new eval-app --skip-git --skip-install --style=scss --ssr=false && \ + cd eval-app && \ + npm install && \ + npm install igniteui-angular + +WORKDIR /workspace/eval-app + +COPY . . + +RUN mkdir -p logs/verifier +CMD ["bash"] diff --git a/evals/tasks/theming-palette-generation/instruction.md b/evals/tasks/theming-palette-generation/instruction.md new file mode 100644 index 00000000000..cb3a03360e3 --- /dev/null +++ b/evals/tasks/theming-palette-generation/instruction.md @@ -0,0 +1,27 @@ +# Task: Create a Custom Branded Theme + +You are working in an Angular 20+ project that already has `igniteui-angular` installed with Sass support enabled. 
+ +## Requirements + +Create a custom Ignite UI for Angular theme with a blue primary color and orange secondary color. + +1. **Theme file location**: `src/styles.scss` (or update the existing global styles file) + +2. **Palette**: + - Primary color: `#1976D2` (Material Blue) + - Secondary color: `#FF9800` (Material Orange) + - Surface color appropriate for a light theme + +3. **Theme application**: + - Generate a complete theme using the Ignite UI theming functions + - Apply the theme globally + +4. **Typography**: Include typography configuration with a sans-serif font family + +## Constraints + +- Use the Ignite UI Sass theming API (`palette()`, `theme()`) — do NOT hardcode individual CSS custom properties or use plain CSS variables to replicate the palette. +- Import from `igniteui-angular/theming` (or `@infragistics/igniteui-angular/theming` for licensed packages). +- The theme must include both `palette()` and `theme()` function calls. +- Include `core()` mixin invocation before the `theme()` mixin. diff --git a/evals/tasks/theming-palette-generation/prompts/quality.md b/evals/tasks/theming-palette-generation/prompts/quality.md new file mode 100644 index 00000000000..d5400180ba6 --- /dev/null +++ b/evals/tasks/theming-palette-generation/prompts/quality.md @@ -0,0 +1,27 @@ +# Theming Palette Generation — LLM Rubric + +Evaluate the agent's approach to creating a custom branded Ignite UI theme. + +## Correct Theming Approach (0–0.3) +- Did the agent use the Ignite UI Sass theming API (`palette()`, `theme()`) instead of hardcoding CSS custom properties? +- Did the agent use `@use 'igniteui-angular/theming'` (modern Sass module syntax) rather than deprecated `@import`? +- Did the agent include `core()` mixin before `theme()` mixin as required by the theming system? + +## Skill Routing & Reference Usage (0–0.3) +- Did the agent read the theming skill SKILL.md for theming guidance? +- Did the agent follow the correct theming sequence: palette → typography → theme? 
+- Did the agent check for MCP server availability before writing SCSS manually? +- If MCP tools were available, did the agent prefer using them over manual SCSS? + +## Idiomatic API Usage (0–0.25) +- Did the agent pass `$primary` and `$secondary` parameters to `palette()`? +- Did the agent pass a `$surface` color appropriate for a light theme? +- Did the agent configure typography with a font family? +- Did the agent pass the `$palette` variable to the `theme()` mixin? +- Did the agent use the `$schema` parameter or rely on the correct default schema? + +## Code Quality (0–0.15) +- Is the SCSS well-structured and readable? +- Did the agent use `@use` with a namespace (e.g., `as *` or a custom namespace)? +- Did the agent avoid hallucinated function names or non-existent parameters? +- Did the agent avoid mixing Sass theming with manual CSS overrides unnecessarily? diff --git a/evals/tasks/theming-palette-generation/skills/igniteui-angular-theming/SKILL.md b/evals/tasks/theming-palette-generation/skills/igniteui-angular-theming/SKILL.md new file mode 120000 index 00000000000..05e9980f01a --- /dev/null +++ b/evals/tasks/theming-palette-generation/skills/igniteui-angular-theming/SKILL.md @@ -0,0 +1 @@ +../../../../../skills/igniteui-angular-theming/SKILL.md \ No newline at end of file diff --git a/evals/tasks/theming-palette-generation/solution/solve.sh b/evals/tasks/theming-palette-generation/solution/solve.sh new file mode 100755 index 00000000000..c032689ba43 --- /dev/null +++ b/evals/tasks/theming-palette-generation/solution/solve.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Reference solution for theming-palette-generation +# Proves the task is solvable and validates grader correctness + +set -euo pipefail + +# Write the themed styles.scss +cat > src/styles.scss << 'SCSS' +@use 'igniteui-angular/theming' as *; + +$custom-palette: palette( + $primary: #1976D2, + $secondary: #FF9800, + $surface: #FAFAFA, +); + +$custom-typography: typography( + $font-family: 'Roboto, 
"Helvetica Neue", sans-serif', +); + +@include core(); +@include typography($custom-typography); +@include theme( + $palette: $custom-palette, + $schema: $light-material-schema, +); +SCSS diff --git a/evals/tasks/theming-palette-generation/task.toml b/evals/tasks/theming-palette-generation/task.toml new file mode 100644 index 00000000000..459be454723 --- /dev/null +++ b/evals/tasks/theming-palette-generation/task.toml @@ -0,0 +1,26 @@ +version = "1.0" + +[metadata] +author_name = "Ignite UI Team" +difficulty = "medium" +category = "theming" +tags = ["theming", "palette", "scss", "sass", "custom-theme"] + +[agent] +timeout_sec = 600.0 + +[environment] +build_timeout_sec = 300.0 +cpus = 2 +memory_mb = 4096 +storage_mb = 1000 + +[[graders]] +type = "deterministic" +command = "bash tests/test.sh" +weight = 0.6 + +[[graders]] +type = "llm_rubric" +rubric = "prompts/quality.md" +weight = 0.4 diff --git a/evals/tasks/theming-palette-generation/tests/test.sh b/evals/tasks/theming-palette-generation/tests/test.sh new file mode 100755 index 00000000000..769d60af99e --- /dev/null +++ b/evals/tasks/theming-palette-generation/tests/test.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# Deterministic grader for theming-palette-generation +# Checks outcomes: correct SCSS structure, palette/theme calls present + +set -euo pipefail + +mkdir -p logs/verifier + +SCORE=0 +TOTAL=5 +DETAILS="" + +# Find the main styles file (could be styles.scss or another scss file) +STYLES_FILE=$(find src -name "styles.scss" -o -name "styles.sass" 2>/dev/null | head -1) +if [ -z "$STYLES_FILE" ]; then + # Also check for any scss file that might contain the theme + STYLES_FILE=$(grep -rl "palette\|theme()" src/ --include="*.scss" 2>/dev/null | head -1) +fi + +if [ -z "${STYLES_FILE:-}" ]; then + echo "0" > logs/verifier/reward.txt + printf "FAIL: No SCSS file with theming code found\n" + exit 1 +fi + +# --- Check 1: Import from igniteui-angular/theming --- +if grep -qE "@use ['\"]igniteui-angular/theming['\"]|@use 
['\"]@infragistics/igniteui-angular/theming['\"]|@import ['\"]igniteui-angular/theming['\"]|@import ['\"]@infragistics/igniteui-angular/theming['\"]|@import ['\"]~igniteui-angular/lib/core/styles/themes" "$STYLES_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Correct theming import found\n" +else + DETAILS="${DETAILS}FAIL: Missing import from igniteui-angular/theming\n" +fi + +# --- Check 2: palette() function call with primary and secondary --- +if grep -qE 'palette\(' "$STYLES_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: palette() function call found\n" +else + DETAILS="${DETAILS}FAIL: No palette() function call found\n" +fi + +# --- Check 3: theme() mixin call --- +if grep -qE '@include.*theme\(|@include.*css-vars\(' "$STYLES_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: theme() mixin call found\n" +else + DETAILS="${DETAILS}FAIL: No theme() mixin call found\n" +fi + +# --- Check 4: core() mixin call (must be before theme) --- +if grep -qE '@include.*core\(' "$STYLES_FILE" 2>/dev/null; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: core() mixin call found\n" +else + DETAILS="${DETAILS}FAIL: No core() mixin call found\n" +fi + +# --- Check 5: No hardcoded CSS custom properties as the sole theming approach --- +# Allow CSS vars if palette() is also used, but fail if ONLY css vars without palette() +PALETTE_USED=$(grep -c 'palette(' "$STYLES_FILE" 2>/dev/null || echo "0") +CSS_VARS_ONLY=$(grep -cE '^\s*--ig-' "$STYLES_FILE" 2>/dev/null || echo "0") + +if [ "$PALETTE_USED" -gt 0 ]; then + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: Uses palette() function (not hardcoded CSS variables)\n" +elif [ "$CSS_VARS_ONLY" -gt 0 ]; then + DETAILS="${DETAILS}FAIL: Only hardcoded CSS custom properties found without palette()\n" +else + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: No hardcoded-only CSS variables approach\n" +fi + +# --- Calculate reward --- +REWARD=$(echo "scale=2; 
$SCORE / $TOTAL" | bc) + +echo "$REWARD" > logs/verifier/reward.txt +printf "Score: %d/%d (%.0f%%)\n" "$SCORE" "$TOTAL" "$(echo "$REWARD * 100" | bc)" +printf "$DETAILS" + +if [ "$SCORE" -lt "$TOTAL" ]; then + exit 1 +fi From ac1335a316ff24fece6db887e2c6ab5468577b9f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 07:13:34 +0000 Subject: [PATCH 03/17] refactor: improve regex readability in grader scripts per code review Co-authored-by: zdrawku <11193764+zdrawku@users.noreply.github.com> --- evals/tasks/grid-basic-setup/tests/test.sh | 4 +++- evals/tasks/theming-palette-generation/tests/test.sh | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/evals/tasks/grid-basic-setup/tests/test.sh b/evals/tasks/grid-basic-setup/tests/test.sh index 455c56ae96a..72eff909578 100755 --- a/evals/tasks/grid-basic-setup/tests/test.sh +++ b/evals/tasks/grid-basic-setup/tests/test.sh @@ -38,8 +38,10 @@ else fi # --- Check 3: Correct import from igniteui-angular entry point --- +# Accepts either the OSS or licensed package path +GRID_IMPORT_PATTERN="from ['\"](@infragistics/)?igniteui-angular/grids/grid['\"]" if [ -n "$COMPONENT_FILE" ]; then - if grep -qE "from ['\"]igniteui-angular/grids/grid['\"]|from ['\"]@infragistics/igniteui-angular/grids/grid['\"]" "$COMPONENT_FILE" 2>/dev/null; then + if grep -qE "$GRID_IMPORT_PATTERN" "$COMPONENT_FILE" 2>/dev/null; then SCORE=$((SCORE + 1)) DETAILS="${DETAILS}PASS: Correct grid entry-point import found\n" else diff --git a/evals/tasks/theming-palette-generation/tests/test.sh b/evals/tasks/theming-palette-generation/tests/test.sh index 769d60af99e..0fa64771be2 100755 --- a/evals/tasks/theming-palette-generation/tests/test.sh +++ b/evals/tasks/theming-palette-generation/tests/test.sh @@ -24,7 +24,10 @@ if [ -z "${STYLES_FILE:-}" ]; then fi # --- Check 1: Import from igniteui-angular/theming --- -if grep -qE "@use ['\"]igniteui-angular/theming['\"]|@use 
['\"]@infragistics/igniteui-angular/theming['\"]|@import ['\"]igniteui-angular/theming['\"]|@import ['\"]@infragistics/igniteui-angular/theming['\"]|@import ['\"]~igniteui-angular/lib/core/styles/themes" "$STYLES_FILE" 2>/dev/null; then +# Accepts @use or @import with either the OSS or licensed package path +THEMING_IMPORT_PATTERN="@(use|import) ['\"](@infragistics/)?igniteui-angular/theming['\"]" +LEGACY_IMPORT_PATTERN="@import ['\"]~igniteui-angular/lib/core/styles/themes" +if grep -qE "$THEMING_IMPORT_PATTERN|$LEGACY_IMPORT_PATTERN" "$STYLES_FILE" 2>/dev/null; then SCORE=$((SCORE + 1)) DETAILS="${DETAILS}PASS: Correct theming import found\n" else From f807aa3c4fb78940231f30237ad8a9326be7a2ed Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 07:14:16 +0000 Subject: [PATCH 04/17] fix: add explicit permissions to skill-eval workflow (CodeQL alert) Co-authored-by: zdrawku <11193764+zdrawku@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 7e172c404a6..4e3aafd5bdc 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -6,6 +6,10 @@ on: - 'skills/**' - 'evals/**' +permissions: + contents: read + pull-requests: write + jobs: eval: runs-on: ubuntu-latest From 6e7b838ef291aec21eeca3ef9f8f2e5b9bbb04ef Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 9 Mar 2026 17:42:33 +0000 Subject: [PATCH 05/17] fix: replace skill-eval package dep with self-contained local runner skill-eval is a reference repo, not an installable npm package. Replaced with a local run-eval.sh script that executes reference solutions and deterministic graders directly. 
Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 9 +-- evals/README.md | 86 ++++++++------------ evals/package.json | 19 +++-- evals/run-eval.sh | 131 +++++++++++++++++++++++++++++++ 4 files changed, 176 insertions(+), 69 deletions(-) create mode 100755 evals/run-eval.sh diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 4e3aafd5bdc..136259fc1e1 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -26,14 +26,11 @@ jobs: - name: Install eval dependencies working-directory: evals - run: npm install + run: npm install --ignore-scripts - - name: Run skill evals + - name: Validate graders against reference solutions working-directory: evals - run: npx skill-eval _ --suite=all --trials=5 - env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + run: bash run-eval.sh --all --validate - name: Upload results if: always() diff --git a/evals/README.md b/evals/README.md index 631ea8741c9..ca8b31cba6f 100644 --- a/evals/README.md +++ b/evals/README.md @@ -1,8 +1,13 @@ # Ignite UI for Angular — Skill Evals -Automated evaluation suite for the Ignite UI for Angular agent skills. Uses the -[skill-eval](https://github.com/mgechev/skill-eval) framework to measure skill -quality, detect regressions, and gate merges. +Automated evaluation suite for the Ignite UI for Angular agent skills. +Inspired by the [skill-eval](https://github.com/mgechev/skill-eval) reference +architecture and extended with patterns from +[Anthropic's agent eval research](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents). + +The infrastructure is **self-contained** — there are no external eval-framework +dependencies. A lightweight shell runner (`run-eval.sh`) executes each task's +reference solution and deterministic grader. 
## Overview @@ -21,63 +26,38 @@ Each task includes: - **`prompts/quality.md`** — LLM rubric grader (intent routing, API usage) - **`solution/solve.sh`** — reference solution for baseline validation - **`environment/Dockerfile`** — isolated environment for agent execution -- **`skills/`** — symlinked or copied skill files under test +- **`skills/`** — symlinked skill files under test ## Prerequisites -- Node.js 20+ -- Docker (for isolated agent execution) -- An API key for the agent provider (Gemini or Anthropic) +- Bash 4+ +- `bc` (installed by default on most Linux / macOS systems) ## Running Evals Locally -### Install dependencies - -```bash -cd evals -npm install -``` - -### Run a single task - -```bash -# Gemini (default) -GEMINI_API_KEY=your-key npm run eval -- grid-basic-setup - -# Claude -ANTHROPIC_API_KEY=your-key npm run eval -- grid-basic-setup --agent=claude -``` - -### Run all tasks - -```bash -GEMINI_API_KEY=your-key npm run eval:all -``` +### Validate graders against reference solutions -### Options +This applies each task's `solution/solve.sh`, then runs `tests/test.sh` to +confirm the grader scores 100%. Use this to catch grader regressions. 
```bash -# Adjust trials (default: 5) -npm run eval -- grid-basic-setup --trials=5 - -# Run locally without Docker -npm run eval -- grid-basic-setup --provider=local +cd evals -# Validate graders against the reference solution -npm run eval -- grid-basic-setup --validate --provider=local +# Validate all tasks +bash run-eval.sh --all --validate -# Run multiple trials in parallel -npm run eval -- grid-basic-setup --parallel=3 +# Validate a single task +bash run-eval.sh grid-basic-setup --validate ``` -### Preview results +### npm scripts (convenience wrappers) ```bash -# CLI report -npm run preview - -# Web UI at http://localhost:3847 -npm run preview:browser +cd evals +npm run validate # all tasks +npm run validate:grid # grid-basic-setup only +npm run validate:combo # component-combo-reactive-form only +npm run validate:theming # theming-palette-generation only ``` ## Adding a New Task @@ -86,9 +66,9 @@ npm run preview:browser ``` tasks// - ├── task.toml # Config: graders, timeouts, resource limits + ├── task.toml # Config: grader metadata, weights, timeouts ├── instruction.md # Agent prompt - ├── environment/Dockerfile # Container setup + ├── environment/Dockerfile # Container setup (for future Docker-based runs) ├── tests/test.sh # Deterministic grader ├── prompts/quality.md # LLM rubric grader ├── solution/solve.sh # Reference solution @@ -100,7 +80,8 @@ npm run preview:browser to build. 3. Write `tests/test.sh` to check **outcomes** (files exist, project compiles, - correct selectors are present) rather than specific steps. + correct selectors are present) rather than specific steps. The grader must + write a reward (0.0–1.0) to `logs/verifier/reward.txt`. 4. Write `prompts/quality.md` with rubric dimensions that sum to 1.0. @@ -110,7 +91,7 @@ npm run preview:browser 6. 
Validate graders before submitting: ```bash - npm run eval -- --validate --provider=local + bash run-eval.sh --validate ``` ## Pass / Fail Thresholds @@ -129,10 +110,9 @@ The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs automatically on PRs that modify `skills/**` or `evals/**`. It: 1. Checks out the repo -2. Installs eval dependencies -3. Runs all tasks with 5 trials -4. Uploads results as an artifact -5. Posts a summary comment on the PR +2. Validates all graders against their reference solutions +3. Uploads results as an artifact +4. Posts a summary comment on the PR ## Grading Strategy diff --git a/evals/package.json b/evals/package.json index 9a945614306..b660ea782d8 100644 --- a/evals/package.json +++ b/evals/package.json @@ -4,16 +4,15 @@ "description": "Evaluation suite for Ignite UI for Angular agent skills", "private": true, "scripts": { - "eval": "npx skill-eval", - "eval:grid": "npx skill-eval grid-basic-setup", - "eval:combo": "npx skill-eval component-combo-reactive-form", - "eval:theming": "npx skill-eval theming-palette-generation", - "eval:all": "npx skill-eval _ --suite=all", - "preview": "npx skill-eval preview", - "preview:browser": "npx skill-eval preview browser" - }, - "dependencies": { - "skill-eval": "^1.0.0" + "eval": "bash run-eval.sh", + "eval:grid": "bash run-eval.sh grid-basic-setup", + "eval:combo": "bash run-eval.sh component-combo-reactive-form", + "eval:theming": "bash run-eval.sh theming-palette-generation", + "eval:all": "bash run-eval.sh --all", + "validate": "bash run-eval.sh --all --validate", + "validate:grid": "bash run-eval.sh grid-basic-setup --validate", + "validate:combo": "bash run-eval.sh component-combo-reactive-form --validate", + "validate:theming": "bash run-eval.sh theming-palette-generation --validate" }, "engines": { "node": ">=20.0.0" diff --git a/evals/run-eval.sh b/evals/run-eval.sh new file mode 100755 index 00000000000..8ac0dc5fc66 --- /dev/null +++ b/evals/run-eval.sh @@ -0,0 +1,131 
@@ +#!/bin/bash +# run-eval.sh — Self-contained eval runner for Ignite UI Angular skills. +# Inspired by https://github.com/mgechev/skill-eval (a reference architecture, +# not an installable package). +# +# Usage: +# bash run-eval.sh # validate one task +# bash run-eval.sh --all # validate all tasks +# bash run-eval.sh --validate # run reference solution then grade + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TASKS_DIR="$SCRIPT_DIR/tasks" +RESULTS_DIR="$SCRIPT_DIR/results" + +# --- helpers --------------------------------------------------------------- # + +usage() { + cat < [--validate] + +Arguments: + Name of the task directory under tasks/ + --all Run all tasks + +Options: + --validate Apply the reference solution before grading (sanity-check mode) + +Examples: + $(basename "$0") grid-basic-setup --validate + $(basename "$0") --all +EOF + exit 1 +} + +run_task() { + local TASK_ID="$1" + local VALIDATE="${2:-false}" + local TASK_DIR="$TASKS_DIR/$TASK_ID" + + if [ ! -d "$TASK_DIR" ]; then + echo "ERROR: Task directory not found: $TASK_DIR" >&2 + return 1 + fi + + echo "═══════════════════════════════════════════════════════" + echo " Task: $TASK_ID" + echo "═══════════════════════════════════════════════════════" + + # Create a temporary workspace so graders run in isolation + local WORK_DIR + WORK_DIR=$(mktemp -d) + trap "rm -rf '$WORK_DIR'" RETURN + + # Seed the workspace with a minimal src/ tree + mkdir -p "$WORK_DIR/src" + + # If --validate, apply the reference solution first + if [ "$VALIDATE" = "true" ]; then + if [ ! -f "$TASK_DIR/solution/solve.sh" ]; then + echo "ERROR: No reference solution at $TASK_DIR/solution/solve.sh" >&2 + return 1 + fi + echo "→ Applying reference solution …" + (cd "$WORK_DIR" && bash "$TASK_DIR/solution/solve.sh") + fi + + # Run deterministic grader + if [ ! 
-f "$TASK_DIR/tests/test.sh" ]; then + echo "ERROR: No deterministic grader at $TASK_DIR/tests/test.sh" >&2 + return 1 + fi + + echo "→ Running deterministic grader …" + local GRADER_EXIT=0 + (cd "$WORK_DIR" && bash "$TASK_DIR/tests/test.sh") || GRADER_EXIT=$? + + # Read reward + local REWARD="0" + if [ -f "$WORK_DIR/logs/verifier/reward.txt" ]; then + REWARD=$(cat "$WORK_DIR/logs/verifier/reward.txt") + fi + + local STATUS="fail" + if [ "$GRADER_EXIT" -eq 0 ]; then + STATUS="pass" + fi + + echo "" + echo " Result: $STATUS (reward=$REWARD)" + echo "" + + # Persist result + mkdir -p "$RESULTS_DIR" + cat > "$RESULTS_DIR/${TASK_ID}.json" < Date: Tue, 10 Mar 2026 08:22:33 +0000 Subject: [PATCH 06/17] fix: emit passRate/passAtK in result JSON so CI summary shows actual scores The workflow summary comment reads passRate and passAtK from per-task JSON files. The runner was only writing reward/status, causing N/A. Also skip baseline.json in the summary since it uses a different schema. Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 2 +- evals/run-eval.sh | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 136259fc1e1..c942bce4c76 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -52,7 +52,7 @@ jobs: let summary = '## 📊 Skill Eval Results\n\n'; try { - const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json')); + const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json') && f !== 'baseline.json'); if (files.length === 0) { summary += '> ⚠️ No eval results found. 
The eval run may have failed.\n'; } else { diff --git a/evals/run-eval.sh b/evals/run-eval.sh index 8ac0dc5fc66..0fb93b9527a 100755 --- a/evals/run-eval.sh +++ b/evals/run-eval.sh @@ -83,21 +83,28 @@ run_task() { fi local STATUS="fail" + local PASS_RATE="0" + local PASS_AT_K="0" if [ "$GRADER_EXIT" -eq 0 ]; then STATUS="pass" + PASS_RATE="1" + PASS_AT_K="1" fi echo "" echo " Result: $STATUS (reward=$REWARD)" echo "" - # Persist result + # Persist result — includes passRate/passAtK so the CI summary comment can + # read them directly (these are the fields the workflow script expects). mkdir -p "$RESULTS_DIR" cat > "$RESULTS_DIR/${TASK_ID}.json" < Date: Tue, 10 Mar 2026 10:37:35 +0200 Subject: [PATCH 07/17] Update evals/tasks/component-combo-reactive-form/solution/solve.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- evals/tasks/component-combo-reactive-form/solution/solve.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/tasks/component-combo-reactive-form/solution/solve.sh b/evals/tasks/component-combo-reactive-form/solution/solve.sh index 6b284457490..396e07382f5 100755 --- a/evals/tasks/component-combo-reactive-form/solution/solve.sh +++ b/evals/tasks/component-combo-reactive-form/solution/solve.sh @@ -10,7 +10,7 @@ mkdir -p src/app/user-settings cat > src/app/user-settings/user-settings.component.ts << 'EOF' import { ChangeDetectionStrategy, Component } from '@angular/core'; import { FormGroup, FormControl, Validators, ReactiveFormsModule } from '@angular/forms'; -import { IgxComboComponent } from 'igniteui-angular'; +import { IgxComboComponent } from 'igniteui-angular/combo'; @Component({ selector: 'app-user-settings', From 2df335efc01177f4a6931e4819386b841555fa4e Mon Sep 17 00:00:00 2001 From: Konstantin Dinev Date: Tue, 10 Mar 2026 10:38:43 +0200 Subject: [PATCH 08/17] Update .github/workflows/skill-eval.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- 
.github/workflows/skill-eval.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index c942bce4c76..467f73a0768 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -41,7 +41,7 @@ jobs: retention-days: 30 - name: Post summary comment - if: always() && github.event_name == 'pull_request' + if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false uses: actions/github-script@v7 with: script: | From b22b13f37cbcb9d0365dc1969da27df013eb26ed Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 08:41:55 +0000 Subject: [PATCH 09/17] fix: tighten grader checks per review feedback - Combo import check now specifically requires the entry-point import (igniteui-angular/combo) and rejects root barrel imports - Theming grader now enforces core() must appear before theme() by comparing line numbers - README updated to match actual grader behavior (no build step) Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- evals/README.md | 12 +++++++----- .../component-combo-reactive-form/tests/test.sh | 10 ++++++---- .../theming-palette-generation/tests/test.sh | 15 ++++++++++----- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/evals/README.md b/evals/README.md index ca8b31cba6f..f69d72a4d19 100644 --- a/evals/README.md +++ b/evals/README.md @@ -79,9 +79,10 @@ npm run validate:theming # theming-palette-generation only 2. Write a clear, unambiguous `instruction.md` that tells the agent exactly what to build. -3. Write `tests/test.sh` to check **outcomes** (files exist, project compiles, - correct selectors are present) rather than specific steps. The grader must - write a reward (0.0–1.0) to `logs/verifier/reward.txt`. +3. 
Write `tests/test.sh` to check **outcomes** (files exist, correct selectors + and entry-point imports are present, correct API call ordering) rather than + specific steps. The grader must write a reward (0.0–1.0) to + `logs/verifier/reward.txt`. 4. Write `prompts/quality.md` with rubric dimensions that sum to 1.0. @@ -117,10 +118,11 @@ automatically on PRs that modify `skills/**` or `evals/**`. It: ## Grading Strategy **Deterministic grader (60% weight)** — checks: -- Project builds without errors +- Expected component files exist - Correct Ignite UI selector is present in the generated template -- Required imports exist +- Required entry-point imports exist (not root barrel) - No use of forbidden alternatives +- Correct API call ordering (e.g. `core()` before `theme()`) **LLM rubric grader (40% weight)** — evaluates: - Correct intent routing diff --git a/evals/tasks/component-combo-reactive-form/tests/test.sh b/evals/tasks/component-combo-reactive-form/tests/test.sh index 23022ab241b..d82caf36bbe 100755 --- a/evals/tasks/component-combo-reactive-form/tests/test.sh +++ b/evals/tasks/component-combo-reactive-form/tests/test.sh @@ -73,19 +73,21 @@ else DETAILS="${DETAILS}FAIL: Forbidden alternative (native select, mat-select, igx-select) detected\n" fi -# --- Check 5: Correct import from igniteui-angular --- +# --- Check 5: Correct entry-point import from igniteui-angular/combo --- +# The skill requires entry-point imports (not the root barrel). 
+COMBO_IMPORT_PATTERN="from ['\"](@infragistics/)?igniteui-angular/combo['\"]" IMPORT_FOUND=0 if [ -n "${COMPONENT_FILE:-}" ]; then - if grep -qE "from ['\"]igniteui-angular|from ['\"]@infragistics/igniteui-angular" "$COMPONENT_FILE" 2>/dev/null; then + if grep -qE "$COMBO_IMPORT_PATTERN" "$COMPONENT_FILE" 2>/dev/null; then IMPORT_FOUND=1 fi fi if [ "$IMPORT_FOUND" -eq 1 ]; then SCORE=$((SCORE + 1)) - DETAILS="${DETAILS}PASS: igniteui-angular import found\n" + DETAILS="${DETAILS}PASS: Correct combo entry-point import found\n" else - DETAILS="${DETAILS}FAIL: No igniteui-angular import found\n" + DETAILS="${DETAILS}FAIL: Missing import from igniteui-angular/combo entry point\n" fi # --- Calculate reward --- diff --git a/evals/tasks/theming-palette-generation/tests/test.sh b/evals/tasks/theming-palette-generation/tests/test.sh index 0fa64771be2..2a992f39aea 100755 --- a/evals/tasks/theming-palette-generation/tests/test.sh +++ b/evals/tasks/theming-palette-generation/tests/test.sh @@ -50,12 +50,17 @@ else DETAILS="${DETAILS}FAIL: No theme() mixin call found\n" fi -# --- Check 4: core() mixin call (must be before theme) --- -if grep -qE '@include.*core\(' "$STYLES_FILE" 2>/dev/null; then - SCORE=$((SCORE + 1)) - DETAILS="${DETAILS}PASS: core() mixin call found\n" -else +# --- Check 4: core() mixin call must appear before theme() --- +CORE_LINE=$(grep -nE '@include.*core\(' "$STYLES_FILE" 2>/dev/null | head -1 | cut -d: -f1) +THEME_LINE=$(grep -nE '@include.*theme\(' "$STYLES_FILE" 2>/dev/null | head -1 | cut -d: -f1) + +if [ -z "${CORE_LINE:-}" ]; then DETAILS="${DETAILS}FAIL: No core() mixin call found\n" +elif [ -n "${THEME_LINE:-}" ] && [ "$CORE_LINE" -gt "$THEME_LINE" ]; then + DETAILS="${DETAILS}FAIL: core() must be called before theme() (core on line $CORE_LINE, theme on line $THEME_LINE)\n" +else + SCORE=$((SCORE + 1)) + DETAILS="${DETAILS}PASS: core() mixin call found before theme()\n" fi # --- Check 5: No hardcoded CSS custom properties as the sole theming 
approach --- From 94d4bf831fbdecff1ff14c222869eead1c826a5d Mon Sep 17 00:00:00 2001 From: Konstantin Dinev Date: Tue, 10 Mar 2026 10:45:24 +0200 Subject: [PATCH 10/17] Update Node.js version in skill-eval workflow --- .github/workflows/skill-eval.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 467f73a0768..45a7d608c61 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -22,7 +22,7 @@ jobs: - name: Set up Node.js uses: actions/setup-node@v4 with: - node-version: '20' + node-version: '22' - name: Install eval dependencies working-directory: evals From 18f3e2570b837323cfb03d940d6aeacda11aa78b Mon Sep 17 00:00:00 2001 From: Konstantin Dinev Date: Tue, 10 Mar 2026 10:47:56 +0200 Subject: [PATCH 11/17] Update skill-eval.yml --- .github/workflows/skill-eval.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 45a7d608c61..26294ace0dd 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -11,7 +11,7 @@ permissions: pull-requests: write jobs: - eval: + skill_eval: runs-on: ubuntu-latest timeout-minutes: 30 From 568b04d54ce9f1250af896088b50244b60456009 Mon Sep 17 00:00:00 2001 From: Konstantin Dinev Date: Tue, 10 Mar 2026 11:05:31 +0200 Subject: [PATCH 12/17] Remove eval dependencies installation step Removed installation of eval dependencies from workflow. 
--- .github/workflows/skill-eval.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 26294ace0dd..c7ab06b4c75 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -24,10 +24,6 @@ jobs: with: node-version: '22' - - name: Install eval dependencies - working-directory: evals - run: npm install --ignore-scripts - - name: Validate graders against reference solutions working-directory: evals run: bash run-eval.sh --all --validate From b181ca0600f3bcaf0120cd18871264a6a935e4e9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 09:32:17 +0000 Subject: [PATCH 13/17] feat: add copilot-cli and gemini-cli agent modes to eval runner Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 95 +++++++-- evals/README.md | 109 ++++++++++- evals/eval-config.json | 23 +++ evals/package.json | 10 +- evals/run-eval.sh | 321 +++++++++++++++++++++++++++++-- 5 files changed, 521 insertions(+), 37 deletions(-) create mode 100644 evals/eval-config.json diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index c7ab06b4c75..8109bb6106d 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -5,15 +5,31 @@ on: paths: - 'skills/**' - 'evals/**' + workflow_dispatch: + inputs: + agent: + description: 'Agent to run evals against (copilot or gemini)' + required: true + default: 'copilot' + type: choice + options: + - copilot + - gemini + trials: + description: 'Number of trials per task' + required: false + default: '1' + type: string permissions: contents: read pull-requests: write jobs: - skill_eval: + # Job 1: Always validate graders against reference solutions + validate_graders: runs-on: ubuntu-latest - timeout-minutes: 30 + timeout-minutes: 10 steps: - name: Checkout repository @@ -28,16 +44,70 @@ jobs: 
working-directory: evals run: bash run-eval.sh --all --validate - - name: Upload results + - name: Upload validation results if: always() uses: actions/upload-artifact@v4 with: - name: skill-eval-results + name: skill-eval-validation-results path: evals/results/ retention-days: 30 + # Job 2: Run evals against an AI agent (copilot or gemini) + # Triggered manually via workflow_dispatch, or can be called from other workflows + agent_eval: + if: github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '22' + + - name: Install Copilot CLI + if: inputs.agent == 'copilot' + run: npm install -g @github/copilot + + - name: Install Gemini CLI + if: inputs.agent == 'gemini' + run: npm install -g @google/gemini-cli + + - name: Run agent-based eval + working-directory: evals + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + run: | + bash run-eval.sh --all \ + --agent ${{ inputs.agent }} \ + --trials ${{ inputs.trials || '1' }} + + - name: Upload agent eval results + if: always() + uses: actions/upload-artifact@v4 + with: + name: skill-eval-agent-${{ inputs.agent }}-results + path: evals/results/ + retention-days: 30 + + # Job 3: Post summary comment on PRs + post_summary: + if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false + needs: [validate_graders] + runs-on: ubuntu-latest + + steps: + - name: Download validation results + uses: actions/download-artifact@v4 + with: + name: skill-eval-validation-results + path: evals/results/ + - name: Post summary comment - if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false uses: actions/github-script@v7 with: script: | @@ -52,26 +122,27 @@ jobs: if (files.length === 0) { summary += '> ⚠️ No eval results 
found. The eval run may have failed.\n'; } else { - summary += '| Task | Pass Rate | pass@5 | Status |\n'; - summary += '|---|---|---|---|\n'; + summary += '| Task | Agent | Pass Rate | pass@k | Status |\n'; + summary += '|---|---|---|---|---|\n'; for (const file of files) { try { const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8')); const taskName = data.task || file.replace('.json', ''); + const agent = data.agent || 'reference'; const passRate = data.passRate != null ? `${(data.passRate * 100).toFixed(0)}%` : 'N/A'; const passAtK = data.passAtK != null ? `${(data.passAtK * 100).toFixed(0)}%` : 'N/A'; const status = data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? '⚠️' : '❌'; - summary += `| ${taskName} | ${passRate} | ${passAtK} | ${status} |\n`; + summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`; } catch (e) { - summary += `| ${file} | Error | Error | ❌ |\n`; + summary += `| ${file} | — | Error | Error | ❌ |\n`; } } summary += '\n### Thresholds\n'; - summary += '- ✅ `pass@5 ≥ 80%` — merge gate passed\n'; - summary += '- ⚠️ `pass@5 ≥ 60%` — needs investigation\n'; - summary += '- ❌ `pass@5 < 60%` — blocks merge for affected skill\n'; + summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n'; + summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n'; + summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n'; } } catch (e) { summary += `> ⚠️ Could not read results: ${e.message}\n`; diff --git a/evals/README.md b/evals/README.md index f69d72a4d19..2c7160237bc 100644 --- a/evals/README.md +++ b/evals/README.md @@ -7,7 +7,9 @@ architecture and extended with patterns from The infrastructure is **self-contained** — there are no external eval-framework dependencies. A lightweight shell runner (`run-eval.sh`) executes each task's -reference solution and deterministic grader. 
+reference solution and deterministic grader, and can also dispatch tasks to +AI coding agents (GitHub Copilot CLI or Google Gemini CLI) for end-to-end +evaluation. ## Overview @@ -32,6 +34,14 @@ Each task includes: - Bash 4+ - `bc` (installed by default on most Linux / macOS systems) +- Node.js 20+ (for config parsing and agent CLI installation) + +**For agent-based evaluation (optional):** + +| Agent | Install | Auth | +|---|---|---| +| GitHub Copilot | `npm install -g @github/copilot` | Active Copilot subscription; `GITHUB_TOKEN` env var | +| Google Gemini | `npm install -g @google/gemini-cli` | `GEMINI_API_KEY` env var | ## Running Evals Locally @@ -50,16 +60,73 @@ bash run-eval.sh --all --validate bash run-eval.sh grid-basic-setup --validate ``` +### Run evals against an AI agent + +Send the `instruction.md` to a coding agent CLI, let the agent generate code +in an isolated workspace, then run the deterministic grader on the output. + +```bash +cd evals + +# Run all tasks with GitHub Copilot CLI +bash run-eval.sh --all --agent copilot + +# Run a single task with Gemini CLI +bash run-eval.sh grid-basic-setup --agent gemini + +# Run 3 trials per task for statistical robustness +bash run-eval.sh --all --agent copilot --trials 3 +``` + ### npm scripts (convenience wrappers) ```bash cd evals + +# Validation (reference solutions) npm run validate # all tasks npm run validate:grid # grid-basic-setup only npm run validate:combo # component-combo-reactive-form only npm run validate:theming # theming-palette-generation only + +# Agent-based evaluation +npm run agent:copilot # all tasks with Copilot +npm run agent:copilot:grid # grid task with Copilot +npm run agent:gemini # all tasks with Gemini +npm run agent:gemini:theming # theming task with Gemini +``` + +## Agent Configuration + +Agent settings are stored in `eval-config.json`: + +```json +{ + "defaultAgent": "copilot", + "agents": { + "copilot": { + "command": "copilot", + "installCommand": "npm install -g 
@github/copilot", + "promptArgs": ["-p"], + "autoApproveArgs": ["--yes"], + "envAuth": "GITHUB_TOKEN" + }, + "gemini": { + "command": "gemini", + "installCommand": "npm install -g @google/gemini-cli", + "promptArgs": ["-p"], + "autoApproveArgs": ["--sandbox"], + "envAuth": "GEMINI_API_KEY" + } + }, + "trialCount": 1, + "timeoutSec": 600 +} ``` +You can customize the agent command, flags, and timeouts by editing this file. +To switch the default agent, change `defaultAgent`. + ## Adding a New Task 1. Create a directory under `evals/tasks//` with the standard structure: @@ -95,25 +162,43 @@ npm run validate:theming # theming-palette-generation only bash run-eval.sh --validate ``` +7. Test against at least one agent: + + ```bash + bash run-eval.sh --agent copilot + ``` + ## Pass / Fail Thresholds Following [Anthropic's recommendations](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents): | Metric | Threshold | Effect | |---|---|---| -| `pass@5 ≥ 80%` | **Merge gate** | At least 1 success in 5 trials required | -| `pass^5 ≥ 60%` | **Tracked** | Flags flaky skills for investigation | -| `pass@5 < 60%` | **Blocks merge** | On PRs touching the relevant skill | +| `pass@k ≥ 80%` | **Merge gate** | At least 1 success in k trials required | +| `pass@k ≥ 60%` | **Tracked** | Flags flaky skills for investigation | +| `pass@k < 60%` | **Blocks merge** | On PRs touching the relevant skill | ## CI Integration -The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs -automatically on PRs that modify `skills/**` or `evals/**`. It: +The GitHub Actions workflow at `.github/workflows/skill-eval.yml` provides two +evaluation modes: -1. Checks out the repo -2. Validates all graders against their reference solutions -3. Uploads results as an artifact -4. Posts a summary comment on the PR +### Automatic (on PR) +Runs on every PR that modifies `skills/**` or `evals/**`: +1. Validates all graders against their reference solutions +2. 
Uploads results as an artifact +3. Posts a summary comment on the PR + +### Manual (workflow_dispatch) +Triggered manually from the Actions tab to run agent-based evaluation: +1. Select the agent (`copilot` or `gemini`) and number of trials +2. Installs the selected agent CLI +3. Runs all tasks against the agent +4. Uploads results as an artifact + +**Secrets required for agent-based CI:** +- `GITHUB_TOKEN` — automatically available (for Copilot) +- `GEMINI_API_KEY` — must be added as a repository secret (for Gemini) ## Grading Strategy @@ -135,3 +220,7 @@ automatically on PRs that modify `skills/**` or `evals/**`. It: Baseline results are stored in `evals/results/baseline.json` and used for regression comparison on PRs. The CI workflow uploads per-run results as GitHub Actions artifacts. + +Agent-based results are suffixed with the agent name (e.g., +`grid-basic-setup-copilot.json`) to distinguish them from reference +validation results. diff --git a/evals/eval-config.json b/evals/eval-config.json new file mode 100644 index 00000000000..3c073c7832b --- /dev/null +++ b/evals/eval-config.json @@ -0,0 +1,23 @@ +{ + "defaultAgent": "copilot", + "agents": { + "copilot": { + "command": "copilot", + "installCommand": "npm install -g @github/copilot", + "promptArgs": ["-p"], + "autoApproveArgs": ["--yes"], + "envAuth": "GITHUB_TOKEN", + "description": "GitHub Copilot CLI (requires active Copilot subscription)" + }, + "gemini": { + "command": "gemini", + "installCommand": "npm install -g @google/gemini-cli", + "promptArgs": ["-p"], + "autoApproveArgs": ["--sandbox"], + "envAuth": "GEMINI_API_KEY", + "description": "Google Gemini CLI (requires GEMINI_API_KEY)" + } + }, + "trialCount": 1, + "timeoutSec": 600 +} diff --git a/evals/package.json b/evals/package.json index b660ea782d8..b6d79561471 100644 --- a/evals/package.json +++ b/evals/package.json @@ -12,7 +12,15 @@ "validate": "bash run-eval.sh --all --validate", "validate:grid": "bash run-eval.sh grid-basic-setup 
--validate", "validate:combo": "bash run-eval.sh component-combo-reactive-form --validate", - "validate:theming": "bash run-eval.sh theming-palette-generation --validate" + "validate:theming": "bash run-eval.sh theming-palette-generation --validate", + "agent:copilot": "bash run-eval.sh --all --agent copilot", + "agent:copilot:grid": "bash run-eval.sh grid-basic-setup --agent copilot", + "agent:copilot:combo": "bash run-eval.sh component-combo-reactive-form --agent copilot", + "agent:copilot:theming": "bash run-eval.sh theming-palette-generation --agent copilot", + "agent:gemini": "bash run-eval.sh --all --agent gemini", + "agent:gemini:grid": "bash run-eval.sh grid-basic-setup --agent gemini", + "agent:gemini:combo": "bash run-eval.sh component-combo-reactive-form --agent gemini", + "agent:gemini:theming": "bash run-eval.sh theming-palette-generation --agent gemini" }, "engines": { "node": ">=20.0.0" diff --git a/evals/run-eval.sh b/evals/run-eval.sh index 0fb93b9527a..802a9bc8c47 100755 --- a/evals/run-eval.sh +++ b/evals/run-eval.sh @@ -4,21 +4,26 @@ # not an installable package). 
# # Usage: -# bash run-eval.sh # validate one task -# bash run-eval.sh --all # validate all tasks -# bash run-eval.sh --validate # run reference solution then grade +# bash run-eval.sh # validate one task (reference solution) +# bash run-eval.sh --all # validate all tasks +# bash run-eval.sh --validate # run reference solution then grade +# bash run-eval.sh --agent copilot # run task using copilot CLI agent +# bash run-eval.sh --agent gemini # run task using gemini CLI agent +# bash run-eval.sh --all --agent copilot # run all tasks with copilot agent +# bash run-eval.sh --all --agent gemini --trials 3 # 3 trials per task with gemini set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TASKS_DIR="$SCRIPT_DIR/tasks" RESULTS_DIR="$SCRIPT_DIR/results" +CONFIG_FILE="$SCRIPT_DIR/eval-config.json" # --- helpers --------------------------------------------------------------- # usage() { cat < [--validate] +Usage: $(basename "$0") [--validate] [--agent ] [--trials ] Arguments: Name of the task directory under tasks/ @@ -26,17 +31,144 @@ Arguments: Options: --validate Apply the reference solution before grading (sanity-check mode) + --agent NAME Run task using an AI agent CLI (copilot | gemini) + --trials N Number of trials per task when using --agent (default: 1) Examples: $(basename "$0") grid-basic-setup --validate $(basename "$0") --all + $(basename "$0") grid-basic-setup --agent copilot + $(basename "$0") --all --agent gemini --trials 3 EOF exit 1 } +# Read a JSON string field from eval-config.json +# Usage: read_config '.agents.copilot.command' +read_config() { + local QUERY="$1" + if [ ! 
-f "$CONFIG_FILE" ]; then + echo "" + return + fi + # Use node to parse JSON (available in CI and most dev environments) + node -e " + const fs = require('fs'); + const cfg = JSON.parse(fs.readFileSync('$CONFIG_FILE', 'utf8')); + const keys = '${QUERY}'.replace(/^\\./, '').split('.'); + let val = cfg; + for (const k of keys) { val = val?.[k]; } + if (Array.isArray(val)) { console.log(val.join(' ')); } + else { console.log(val ?? ''); } + " 2>/dev/null || echo "" +} + +# Resolve the agent CLI command and flags from config +resolve_agent() { + local AGENT_NAME="$1" + AGENT_CMD=$(read_config "agents.${AGENT_NAME}.command") + AGENT_PROMPT_ARGS=$(read_config "agents.${AGENT_NAME}.promptArgs") + AGENT_APPROVE_ARGS=$(read_config "agents.${AGENT_NAME}.autoApproveArgs") + AGENT_ENV_AUTH=$(read_config "agents.${AGENT_NAME}.envAuth") + + if [ -z "$AGENT_CMD" ]; then + echo "ERROR: Unknown agent '$AGENT_NAME'. Check eval-config.json" >&2 + exit 1 + fi + + # Verify the CLI is installed + if ! command -v "$AGENT_CMD" &>/dev/null; then + local INSTALL_CMD + INSTALL_CMD=$(read_config "agents.${AGENT_NAME}.installCommand") + echo "ERROR: '$AGENT_CMD' is not installed." >&2 + echo " Install with: $INSTALL_CMD" >&2 + exit 1 + fi + + # Verify the auth env var is set + if [ -n "$AGENT_ENV_AUTH" ]; then + if [ -z "${!AGENT_ENV_AUTH:-}" ]; then + echo "WARNING: $AGENT_ENV_AUTH is not set. The agent may fail to authenticate." >&2 + fi + fi +} + +# Run a single task using the agent CLI +run_agent_task() { + local TASK_DIR="$1" + local WORK_DIR="$2" + local AGENT_NAME="$3" + + local INSTRUCTION_FILE="$TASK_DIR/instruction.md" + if [ ! 
-f "$INSTRUCTION_FILE" ]; then + echo "ERROR: No instruction.md found at $INSTRUCTION_FILE" >&2 + return 1 + fi + + local PROMPT + PROMPT=$(cat "$INSTRUCTION_FILE") + + # Build the skill context preamble if skills/ directory exists + local SKILL_CONTEXT="" + if [ -d "$TASK_DIR/skills" ]; then + for SKILL_FILE in "$TASK_DIR"/skills/*/SKILL.md; do + if [ -f "$SKILL_FILE" ]; then + SKILL_CONTEXT="${SKILL_CONTEXT}$(cat "$SKILL_FILE")\n\n" + fi + done + fi + + # Combine skill context + instruction into a single prompt + local FULL_PROMPT="" + if [ -n "$SKILL_CONTEXT" ]; then + FULL_PROMPT="Use the following skill reference when completing the task:\n\n${SKILL_CONTEXT}---\n\n${PROMPT}" + else + FULL_PROMPT="$PROMPT" + fi + + echo " → Sending instruction to $AGENT_NAME agent …" + + local TIMEOUT_SEC + TIMEOUT_SEC=$(read_config "timeoutSec") + TIMEOUT_SEC="${TIMEOUT_SEC:-600}" + + # Build the agent command + local CMD_ARGS=() + CMD_ARGS+=("$AGENT_CMD") + + # Add prompt args (e.g., -p) + if [ -n "$AGENT_PROMPT_ARGS" ]; then + # shellcheck disable=SC2206 + CMD_ARGS+=($AGENT_PROMPT_ARGS) + fi + CMD_ARGS+=("$FULL_PROMPT") + + # Add auto-approve args (e.g., --yes, --sandbox) + if [ -n "$AGENT_APPROVE_ARGS" ]; then + # shellcheck disable=SC2206 + CMD_ARGS+=($AGENT_APPROVE_ARGS) + fi + + # Run the agent in the work directory with a timeout + local AGENT_EXIT=0 + ( + cd "$WORK_DIR" + timeout "${TIMEOUT_SEC}s" "${CMD_ARGS[@]}" 2>&1 || true + ) > "$WORK_DIR/agent-output.log" 2>&1 || AGENT_EXIT=$? + + if [ "$AGENT_EXIT" -eq 124 ]; then + echo " ⚠ Agent timed out after ${TIMEOUT_SEC}s" + elif [ "$AGENT_EXIT" -ne 0 ]; then + echo " ⚠ Agent exited with code $AGENT_EXIT" + fi + + echo " → Agent output saved to $WORK_DIR/agent-output.log" +} + run_task() { local TASK_ID="$1" - local VALIDATE="${2:-false}" + local MODE="${2:-validate}" # validate | agent + local AGENT_NAME="${3:-}" local TASK_DIR="$TASKS_DIR/$TASK_ID" if [ ! 
-d "$TASK_DIR" ]; then @@ -46,6 +178,9 @@ run_task() { echo "═══════════════════════════════════════════════════════" echo " Task: $TASK_ID" + if [ "$MODE" = "agent" ]; then + echo " Agent: $AGENT_NAME" + fi echo "═══════════════════════════════════════════════════════" # Create a temporary workspace so graders run in isolation @@ -56,14 +191,17 @@ run_task() { # Seed the workspace with a minimal src/ tree mkdir -p "$WORK_DIR/src" - # If --validate, apply the reference solution first - if [ "$VALIDATE" = "true" ]; then + if [ "$MODE" = "validate" ]; then + # --validate: apply the reference solution first if [ ! -f "$TASK_DIR/solution/solve.sh" ]; then echo "ERROR: No reference solution at $TASK_DIR/solution/solve.sh" >&2 return 1 fi echo "→ Applying reference solution …" (cd "$WORK_DIR" && bash "$TASK_DIR/solution/solve.sh") + elif [ "$MODE" = "agent" ]; then + # --agent: send the instruction to the agent CLI + run_agent_task "$TASK_DIR" "$WORK_DIR" "$AGENT_NAME" fi # Run deterministic grader @@ -98,9 +236,14 @@ run_task() { # Persist result — includes passRate/passAtK so the CI summary comment can # read them directly (these are the fields the workflow script expects). mkdir -p "$RESULTS_DIR" - cat > "$RESULTS_DIR/${TASK_ID}.json" < "$RESULTS_DIR/${TASK_ID}${RESULT_SUFFIX}.json" <&2 + return 1 + fi + + local PASS_COUNT=0 + local TOTAL_REWARD=0 + + for i in $(seq 1 "$TRIALS"); do + echo "" + echo " ── Trial $i/$TRIALS ──" + + # Create a temporary workspace for each trial + local WORK_DIR + WORK_DIR=$(mktemp -d) + + mkdir -p "$WORK_DIR/src" + + # Send to agent + run_agent_task "$TASK_DIR" "$WORK_DIR" "$AGENT_NAME" + + # Run grader + local GRADER_EXIT=0 + (cd "$WORK_DIR" && bash "$TASK_DIR/tests/test.sh") || GRADER_EXIT=$? 
+ + local REWARD="0" + if [ -f "$WORK_DIR/logs/verifier/reward.txt" ]; then + REWARD=$(cat "$WORK_DIR/logs/verifier/reward.txt") + fi + + if [ "$GRADER_EXIT" -eq 0 ]; then + PASS_COUNT=$((PASS_COUNT + 1)) + fi + TOTAL_REWARD=$(echo "$TOTAL_REWARD + $REWARD" | bc) + + # Cleanup trial workspace + rm -rf "$WORK_DIR" + + echo " Trial $i: reward=$REWARD $([ "$GRADER_EXIT" -eq 0 ] && echo "✅" || echo "❌")" + done + + # Calculate aggregate metrics + local PASS_RATE + PASS_RATE=$(echo "scale=2; $PASS_COUNT / $TRIALS" | bc) + # pass@k = 1 if at least one trial passed, else 0 + local PASS_AT_K=0 + if [ "$PASS_COUNT" -gt 0 ]; then + PASS_AT_K=1 + fi + local AVG_REWARD + AVG_REWARD=$(echo "scale=2; $TOTAL_REWARD / $TRIALS" | bc) + + echo "" + echo " ═══ Aggregate ($TRIALS trials) ═══" + echo " Pass rate: $PASS_COUNT/$TRIALS ($PASS_RATE)" + echo " pass@$TRIALS: $PASS_AT_K" + echo " Avg reward: $AVG_REWARD" + echo "" + + # Persist aggregated result + mkdir -p "$RESULTS_DIR" + cat > "$RESULTS_DIR/${TASK_ID}-${AGENT_NAME}.json" <&2 + exit 1 + fi + shift 2 + ;; + --trials) + TRIALS="${2:-1}" + shift 2 + ;; + -h|--help) + usage + ;; + *) + if [ -z "$TASK_ARG" ]; then + TASK_ARG="$1" + fi + shift + ;; + esac +done + +if [ -z "$TASK_ARG" ]; then + usage +fi + +# If using agent mode, resolve and verify agent CLI +if [ "$MODE" = "agent" ]; then + # Default to configured agent if none specified + if [ -z "$AGENT_NAME" ]; then + AGENT_NAME=$(read_config "defaultAgent") + AGENT_NAME="${AGENT_NAME:-copilot}" + fi + resolve_agent "$AGENT_NAME" + echo "Using agent: $AGENT_NAME ($AGENT_CMD)" + echo "" fi OVERALL_EXIT=0 @@ -129,10 +408,24 @@ OVERALL_EXIT=0 if [ "$TASK_ARG" = "--all" ]; then for TASK_PATH in "$TASKS_DIR"/*/; do TASK_NAME=$(basename "$TASK_PATH") - run_task "$TASK_NAME" "$VALIDATE" || OVERALL_EXIT=1 + if [ "$MODE" = "agent" ] && [ "$TRIALS" -gt 1 ]; then + echo "═══════════════════════════════════════════════════════" + echo " Task: $TASK_NAME (Agent: $AGENT_NAME, $TRIALS 
trials)" + echo "═══════════════════════════════════════════════════════" + run_task_trials "$TASK_NAME" "$AGENT_NAME" "$TRIALS" || OVERALL_EXIT=1 + else + run_task "$TASK_NAME" "$MODE" "$AGENT_NAME" || OVERALL_EXIT=1 + fi done else - run_task "$TASK_ARG" "$VALIDATE" || OVERALL_EXIT=1 + if [ "$MODE" = "agent" ] && [ "$TRIALS" -gt 1 ]; then + echo "═══════════════════════════════════════════════════════" + echo " Task: $TASK_ARG (Agent: $AGENT_NAME, $TRIALS trials)" + echo "═══════════════════════════════════════════════════════" + run_task_trials "$TASK_ARG" "$AGENT_NAME" "$TRIALS" || OVERALL_EXIT=1 + else + run_task "$TASK_ARG" "$MODE" "$AGENT_NAME" || OVERALL_EXIT=1 + fi fi exit "$OVERALL_EXIT" From 665264b892ec37e00b09439753fdeaf4f2dabf80 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 09:33:53 +0000 Subject: [PATCH 14/17] fix: use read -ra for safe array parsing, add TRIALS guard Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- evals/run-eval.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/evals/run-eval.sh b/evals/run-eval.sh index 802a9bc8c47..348788dc861 100755 --- a/evals/run-eval.sh +++ b/evals/run-eval.sh @@ -138,15 +138,15 @@ run_agent_task() { # Add prompt args (e.g., -p) if [ -n "$AGENT_PROMPT_ARGS" ]; then - # shellcheck disable=SC2206 - CMD_ARGS+=($AGENT_PROMPT_ARGS) + read -ra _PROMPT_PARTS <<< "$AGENT_PROMPT_ARGS" + CMD_ARGS+=("${_PROMPT_PARTS[@]}") fi CMD_ARGS+=("$FULL_PROMPT") # Add auto-approve args (e.g., --yes, --sandbox) if [ -n "$AGENT_APPROVE_ARGS" ]; then - # shellcheck disable=SC2206 - CMD_ARGS+=($AGENT_APPROVE_ARGS) + read -ra _APPROVE_PARTS <<< "$AGENT_APPROVE_ARGS" + CMD_ARGS+=("${_APPROVE_PARTS[@]}") fi # Run the agent in the work directory with a timeout @@ -304,6 +304,10 @@ run_task_trials() { done # Calculate aggregate metrics + if [ "$TRIALS" -le 0 ]; then + echo "ERROR: TRIALS must be > 0" >&2 + return 
1 + fi local PASS_RATE PASS_RATE=$(echo "scale=2; $PASS_COUNT / $TRIALS" | bc) # pass@k = 1 if at least one trial passed, else 0 From b3fa97391e69df7653bf1dac213d17b7d2fd6dee Mon Sep 17 00:00:00 2001 From: Konstantin Dinev Date: Tue, 10 Mar 2026 11:46:27 +0200 Subject: [PATCH 15/17] Update skill-eval.yml --- .github/workflows/skill-eval.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 8109bb6106d..63fba463be7 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -55,7 +55,6 @@ jobs: # Job 2: Run evals against an AI agent (copilot or gemini) # Triggered manually via workflow_dispatch, or can be called from other workflows agent_eval: - if: github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest timeout-minutes: 60 From 1330989356b35150d862272597d3ad31492507f7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 09:54:04 +0000 Subject: [PATCH 16/17] rework CI workflow to always run against both copilot and gemini agents Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 163 ++++++++++++++++++++----------- evals/README.md | 29 +++--- 2 files changed, 119 insertions(+), 73 deletions(-) diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 63fba463be7..95541d6c576 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -6,27 +6,13 @@ on: - 'skills/**' - 'evals/**' workflow_dispatch: - inputs: - agent: - description: 'Agent to run evals against (copilot or gemini)' - required: true - default: 'copilot' - type: choice - options: - - copilot - - gemini - trials: - description: 'Number of trials per task' - required: false - default: '1' - type: string permissions: contents: read pull-requests: write jobs: - # Job 1: Always validate graders against reference solutions + 
# Job 1: Validate graders against reference solutions validate_graders: runs-on: ubuntu-latest timeout-minutes: 10 @@ -52,9 +38,8 @@ jobs: path: evals/results/ retention-days: 30 - # Job 2: Run evals against an AI agent (copilot or gemini) - # Triggered manually via workflow_dispatch, or can be called from other workflows - agent_eval: + # Job 2: Run evals against the Copilot agent + agent_eval_copilot: runs-on: ubuntu-latest timeout-minutes: 60 @@ -68,35 +53,57 @@ jobs: node-version: '22' - name: Install Copilot CLI - if: inputs.agent == 'copilot' run: npm install -g @github/copilot + - name: Run eval against Copilot + working-directory: evals + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: bash run-eval.sh --all --agent copilot + + - name: Upload Copilot eval results + if: always() + uses: actions/upload-artifact@v4 + with: + name: skill-eval-agent-copilot-results + path: evals/results/ + retention-days: 30 + + # Job 3: Run evals against the Gemini agent + agent_eval_gemini: + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '22' + - name: Install Gemini CLI - if: inputs.agent == 'gemini' run: npm install -g @google/gemini-cli - - name: Run agent-based eval + - name: Run eval against Gemini working-directory: evals env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} - run: | - bash run-eval.sh --all \ - --agent ${{ inputs.agent }} \ - --trials ${{ inputs.trials || '1' }} + run: bash run-eval.sh --all --agent gemini - - name: Upload agent eval results + - name: Upload Gemini eval results if: always() uses: actions/upload-artifact@v4 with: - name: skill-eval-agent-${{ inputs.agent }}-results + name: skill-eval-agent-gemini-results path: evals/results/ retention-days: 30 - # Job 3: Post summary comment on PRs + # Job 4: Post combined summary comment on PRs post_summary: 
if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false - needs: [validate_graders] + needs: [validate_graders, agent_eval_copilot, agent_eval_gemini] runs-on: ubuntu-latest steps: @@ -104,7 +111,22 @@ jobs: uses: actions/download-artifact@v4 with: name: skill-eval-validation-results - path: evals/results/ + path: evals/results/validation + continue-on-error: true + + - name: Download Copilot results + uses: actions/download-artifact@v4 + with: + name: skill-eval-agent-copilot-results + path: evals/results/copilot + continue-on-error: true + + - name: Download Gemini results + uses: actions/download-artifact@v4 + with: + name: skill-eval-agent-gemini-results + path: evals/results/gemini + continue-on-error: true - name: Post summary comment uses: actions/github-script@v7 @@ -113,40 +135,69 @@ jobs: const fs = require('fs'); const path = require('path'); - const resultsDir = 'evals/results'; - let summary = '## 📊 Skill Eval Results\n\n'; - - try { - const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json') && f !== 'baseline.json'); - if (files.length === 0) { - summary += '> ⚠️ No eval results found. The eval run may have failed.\n'; - } else { - summary += '| Task | Agent | Pass Rate | pass@k | Status |\n'; - summary += '|---|---|---|---|---|\n'; - + function readResults(dir) { + const results = []; + try { + if (!fs.existsSync(dir)) return results; + const files = fs.readdirSync(dir).filter(f => f.endsWith('.json') && f !== 'baseline.json'); for (const file of files) { try { - const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8')); - const taskName = data.task || file.replace('.json', ''); - const agent = data.agent || 'reference'; - const passRate = data.passRate != null ? `${(data.passRate * 100).toFixed(0)}%` : 'N/A'; - const passAtK = data.passAtK != null ? `${(data.passAtK * 100).toFixed(0)}%` : 'N/A'; - const status = data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? 
'⚠️' : '❌'; - summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`; + results.push(JSON.parse(fs.readFileSync(path.join(dir, file), 'utf8'))); } catch (e) { - summary += `| ${file} | — | Error | Error | ❌ |\n`; + results.push({ task: file.replace('.json', ''), error: true }); } } + } catch (e) { /* dir doesn't exist */ } + return results; + } + + let summary = '## 📊 Skill Eval Results\n\n'; + + // --- Validation results --- + const validation = readResults('evals/results/validation'); + if (validation.length > 0) { + summary += '### Grader Validation (reference solutions)\n\n'; + summary += '| Task | Pass Rate | Status |\n'; + summary += '|---|---|---|\n'; + for (const r of validation) { + if (r.error) { summary += `| ${r.task} | Error | ❌ |\n`; continue; } + const passRate = r.passRate != null ? `${(r.passRate * 100).toFixed(0)}%` : 'N/A'; + const status = r.passRate >= 1.0 ? '✅' : '❌'; + summary += `| ${r.task} | ${passRate} | ${status} |\n`; + } + summary += '\n'; + } - summary += '\n### Thresholds\n'; - summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n'; - summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n'; - summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n'; + // --- Agent results --- + const copilot = readResults('evals/results/copilot'); + const gemini = readResults('evals/results/gemini'); + + if (copilot.length > 0 || gemini.length > 0) { + summary += '### Agent Evaluation\n\n'; + summary += '| Task | Agent | Pass Rate | pass@k | Status |\n'; + summary += '|---|---|---|---|---|\n'; + + for (const r of [...copilot, ...gemini]) { + if (r.error) { summary += `| ${r.task} | — | Error | Error | ❌ |\n`; continue; } + const taskName = r.task || 'unknown'; + const agent = r.agent || 'unknown'; + const passRate = r.passRate != null ? `${(r.passRate * 100).toFixed(0)}%` : 'N/A'; + const passAtK = r.passAtK != null ? `${(r.passAtK * 100).toFixed(0)}%` : 'N/A'; + const status = r.passAtK >= 0.8 ? 
'✅' : r.passAtK >= 0.6 ? '⚠️' : '❌'; + summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`; } - } catch (e) { - summary += `> ⚠️ Could not read results: ${e.message}\n`; + summary += '\n'; } + if (validation.length === 0 && copilot.length === 0 && gemini.length === 0) { + summary += '> ⚠️ No eval results found. The eval runs may have failed.\n'; + } + + summary += '### Thresholds\n'; + summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n'; + summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n'; + summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n'; + await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, diff --git a/evals/README.md b/evals/README.md index 2c7160237bc..3bdab554e0d 100644 --- a/evals/README.md +++ b/evals/README.md @@ -180,23 +180,18 @@ Following [Anthropic's recommendations](https://www.anthropic.com/engineering/de ## CI Integration -The GitHub Actions workflow at `.github/workflows/skill-eval.yml` provides two -evaluation modes: - -### Automatic (on PR) -Runs on every PR that modifies `skills/**` or `evals/**`: -1. Validates all graders against their reference solutions -2. Uploads results as an artifact -3. Posts a summary comment on the PR - -### Manual (workflow_dispatch) -Triggered manually from the Actions tab to run agent-based evaluation: -1. Select the agent (`copilot` or `gemini`) and number of trials -2. Installs the selected agent CLI -3. Runs all tasks against the agent -4. Uploads results as an artifact - -**Secrets required for agent-based CI:** +The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs +both on PRs (that modify `skills/**` or `evals/**`) and via manual +`workflow_dispatch`. Every run executes three parallel jobs: + +1. **Grader validation** — applies reference solutions, verifies graders score 100% +2. **Copilot agent eval** — installs `@github/copilot`, runs all tasks against Copilot CLI +3. 
**Gemini agent eval** — installs `@google/gemini-cli`, runs all tasks against Gemini CLI + +A fourth summary job collects results from all three and posts a combined +PR comment showing pass rates per task per agent. + +**Secrets required:** - `GITHUB_TOKEN` — automatically available (for Copilot) - `GEMINI_API_KEY` — must be added as a repository secret (for Gemini) From a9da524cf3c1e3b9b1b30d00c9311dee8dafb805 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:06:46 +0000 Subject: [PATCH 17/17] add agent prompt files, switch CI to npm scripts, clean up README Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com> --- .github/workflows/skill-eval.yml | 6 +-- evals/README.md | 49 +++++++++++-------- evals/run-eval.sh | 14 ++++-- .../component-combo-reactive-form/prompt.md | 28 +++++++++++ evals/tasks/grid-basic-setup/prompt.md | 29 +++++++++++ .../theming-palette-generation/prompt.md | 14 ++++++ 6 files changed, 111 insertions(+), 29 deletions(-) create mode 100644 evals/tasks/component-combo-reactive-form/prompt.md create mode 100644 evals/tasks/grid-basic-setup/prompt.md create mode 100644 evals/tasks/theming-palette-generation/prompt.md diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml index 95541d6c576..1be42960da9 100644 --- a/.github/workflows/skill-eval.yml +++ b/.github/workflows/skill-eval.yml @@ -28,7 +28,7 @@ jobs: - name: Validate graders against reference solutions working-directory: evals - run: bash run-eval.sh --all --validate + run: npm run validate - name: Upload validation results if: always() @@ -59,7 +59,7 @@ jobs: working-directory: evals env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: bash run-eval.sh --all --agent copilot + run: npm run agent:copilot - name: Upload Copilot eval results if: always() @@ -90,7 +90,7 @@ jobs: working-directory: evals env: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} - run: bash run-eval.sh 
--all --agent gemini + run: npm run agent:gemini - name: Upload Gemini eval results if: always() diff --git a/evals/README.md b/evals/README.md index 3bdab554e0d..196948924df 100644 --- a/evals/README.md +++ b/evals/README.md @@ -23,8 +23,9 @@ The suite tests three skills: Each task includes: -- **`instruction.md`** — the prompt given to the agent -- **`tests/test.sh`** — deterministic grader (file checks, compilation, lint) +- **`prompt.md`** — the agent prompt sent to the CLI (concise, actionable) +- **`instruction.md`** — human-readable task description (detailed requirements) +- **`tests/test.sh`** — deterministic grader (file checks, import validation, ordering) - **`prompts/quality.md`** — LLM rubric grader (intent routing, API usage) - **`solution/solve.sh`** — reference solution for baseline validation - **`environment/Dockerfile`** — isolated environment for agent execution @@ -54,31 +55,34 @@ confirm the grader scores 100%. Use this to catch grader regressions. cd evals # Validate all tasks -bash run-eval.sh --all --validate +npm run validate # Validate a single task -bash run-eval.sh grid-basic-setup --validate +npm run validate:grid +npm run validate:combo +npm run validate:theming ``` ### Run evals against an AI agent -Send the `instruction.md` to a coding agent CLI, let the agent generate code +Send the `prompt.md` to a coding agent CLI, let the agent generate code in an isolated workspace, then run the deterministic grader on the output. 
 ```bash
 cd evals
 
 # Run all tasks with GitHub Copilot CLI
-bash run-eval.sh --all --agent copilot
+npm run agent:copilot
 
-# Run a single task with Gemini CLI
-bash run-eval.sh grid-basic-setup --agent gemini
+# Run all tasks with Gemini CLI
+npm run agent:gemini
 
-# Run 3 trials per task for statistical robustness
-bash run-eval.sh --all --agent copilot --trials 3
+# Run a single task with a specific agent
+npm run agent:copilot:grid
+npm run agent:gemini:theming
 ```
 
-### npm scripts (convenience wrappers)
+### All npm scripts
 
 ```bash
 cd evals
@@ -134,7 +138,8 @@ To switch the default agent, change `defaultAgent`.
 
 ```
 tasks/<task-name>/
   ├── task.toml              # Config: grader metadata, weights, timeouts
-  ├── instruction.md         # Agent prompt
+  ├── prompt.md              # Agent prompt (sent to CLI agents)
+  ├── instruction.md         # Human-readable task description
   ├── environment/Dockerfile # Container setup (for future Docker-based runs)
   ├── tests/test.sh          # Deterministic grader
   ├── prompts/quality.md     # LLM rubric grader
@@ -143,29 +148,31 @@ To switch the default agent, change `defaultAgent`.
        └── <skill-name>/SKILL.md
 ```
 
-2. Write a clear, unambiguous `instruction.md` that tells the agent exactly what
-   to build.
+2. Write a clear, unambiguous `instruction.md` with full task requirements.
 
-3. Write `tests/test.sh` to check **outcomes** (files exist, correct selectors
+3. Write a concise `prompt.md` that is sent directly to the agent CLI. This
+   should be a focused, actionable prompt derived from the instruction.
+
+4. Write `tests/test.sh` to check **outcomes** (files exist, correct selectors
    and entry-point imports are present, correct API call ordering) rather than
    specific steps. The grader must write a reward (0.0–1.0) to
    `logs/verifier/reward.txt`.
 
-4. Write `prompts/quality.md` with rubric dimensions that sum to 1.0.
+5. Write `prompts/quality.md` with rubric dimensions that sum to 1.0.
 
-5. Write `solution/solve.sh` — a shell script that proves the task is solvable
+6. 
Write `solution/solve.sh` — a shell script that proves the task is solvable
    and validates that the graders work correctly.
 
-6. Validate graders before submitting:
+7. Validate graders before submitting:
 
    ```bash
-   bash run-eval.sh <task-name> --validate
+   npm run validate:<task>
    ```
 
-7. Test against at least one agent:
+8. Test against at least one agent:
 
    ```bash
-   bash run-eval.sh <task-name> --agent copilot
+   npm run agent:copilot:<task>
    ```
 
 ## Pass / Fail Thresholds
diff --git a/evals/run-eval.sh b/evals/run-eval.sh
index 348788dc861..9d7f026dca3 100755
--- a/evals/run-eval.sh
+++ b/evals/run-eval.sh
@@ -99,14 +99,18 @@ run_agent_task() {
   local WORK_DIR="$2"
   local AGENT_NAME="$3"
 
-  local INSTRUCTION_FILE="$TASK_DIR/instruction.md"
-  if [ ! -f "$INSTRUCTION_FILE" ]; then
-    echo "ERROR: No instruction.md found at $INSTRUCTION_FILE" >&2
+  # Prefer prompt.md (agent-oriented prompt) over instruction.md (human-oriented task description)
+  local PROMPT_FILE="$TASK_DIR/prompt.md"
+  if [ ! -f "$PROMPT_FILE" ]; then
+    PROMPT_FILE="$TASK_DIR/instruction.md"
+  fi
+  if [ ! 
-f "$PROMPT_FILE" ]; then + echo "ERROR: No prompt.md or instruction.md found in $TASK_DIR" >&2 return 1 fi local PROMPT - PROMPT=$(cat "$INSTRUCTION_FILE") + PROMPT=$(cat "$PROMPT_FILE") # Build the skill context preamble if skills/ directory exists local SKILL_CONTEXT="" @@ -118,7 +122,7 @@ run_agent_task() { done fi - # Combine skill context + instruction into a single prompt + # Combine skill context + prompt into a single agent instruction local FULL_PROMPT="" if [ -n "$SKILL_CONTEXT" ]; then FULL_PROMPT="Use the following skill reference when completing the task:\n\n${SKILL_CONTEXT}---\n\n${PROMPT}" diff --git a/evals/tasks/component-combo-reactive-form/prompt.md b/evals/tasks/component-combo-reactive-form/prompt.md new file mode 100644 index 00000000000..299eb44b413 --- /dev/null +++ b/evals/tasks/component-combo-reactive-form/prompt.md @@ -0,0 +1,28 @@ +# Agent Prompt: Combo with Reactive Form + +You are working in an Angular 20+ project that already has `igniteui-angular` installed. + +Create a `UserSettingsComponent` at `src/app/user-settings/user-settings.component.ts` with a reactive form containing a multi-select combo for notification channel selection. 
+ +Use this data: + +```typescript +channels = [ + { id: 1, name: 'Email', icon: 'email' }, + { id: 2, name: 'SMS', icon: 'sms' }, + { id: 3, name: 'Push Notification', icon: 'notifications' }, + { id: 4, name: 'Slack', icon: 'chat' }, + { id: 5, name: 'Microsoft Teams', icon: 'groups' }, +]; +``` + +Requirements: +- Use the Ignite UI for Angular `igx-combo` component (NOT igx-select, native select, or mat-select) +- Bind the combo to a `notificationChannels` FormControl inside a FormGroup +- Set displayKey to 'name' and valueKey to 'id' +- Add required validation (at least one channel must be selected) +- Add a submit button disabled when form is invalid +- Import IgxComboComponent from the `igniteui-angular/combo` entry point (not the root barrel) +- Import ReactiveFormsModule for form support +- Component must be standalone with ChangeDetectionStrategy.OnPush +- Create both a `.ts` file and a `.html` template file diff --git a/evals/tasks/grid-basic-setup/prompt.md b/evals/tasks/grid-basic-setup/prompt.md new file mode 100644 index 00000000000..b019190c21c --- /dev/null +++ b/evals/tasks/grid-basic-setup/prompt.md @@ -0,0 +1,29 @@ +# Agent Prompt: Grid Basic Setup + +You are working in an Angular 20+ project that already has `igniteui-angular` installed. + +Create an `EmployeeListComponent` at `src/app/employee-list/employee-list.component.ts` that shows a data grid with employee data, sorting on all columns, and pagination with 5 items per page. 
+ +Use this flat employee data: + +```typescript +employees = [ + { id: 1, name: 'Alice Johnson', department: 'Engineering', salary: 95000, hireDate: new Date('2020-03-15') }, + { id: 2, name: 'Bob Smith', department: 'Marketing', salary: 72000, hireDate: new Date('2019-07-22') }, + { id: 3, name: 'Carol Davis', department: 'Engineering', salary: 105000, hireDate: new Date('2018-01-10') }, + { id: 4, name: 'David Wilson', department: 'Sales', salary: 68000, hireDate: new Date('2021-11-05') }, + { id: 5, name: 'Eva Martinez', department: 'Engineering', salary: 98000, hireDate: new Date('2020-09-18') }, + { id: 6, name: 'Frank Brown', department: 'Marketing', salary: 75000, hireDate: new Date('2017-04-30') }, + { id: 7, name: 'Grace Lee', department: 'Sales', salary: 82000, hireDate: new Date('2019-12-01') }, + { id: 8, name: 'Henry Taylor', department: 'Engineering', salary: 110000, hireDate: new Date('2016-06-14') }, +]; +``` + +Requirements: +- Use the Ignite UI for Angular `igx-grid` component (NOT tree-grid, hierarchical-grid, or pivot-grid) +- Display columns: id, name, department, salary, hireDate +- Enable sorting on all columns +- Add a paginator with page size of 5 +- Import from the `igniteui-angular/grids/grid` entry point (not the root barrel) +- Component must be standalone with ChangeDetectionStrategy.OnPush +- Create both a `.ts` file and a `.html` template file diff --git a/evals/tasks/theming-palette-generation/prompt.md b/evals/tasks/theming-palette-generation/prompt.md new file mode 100644 index 00000000000..a665e386564 --- /dev/null +++ b/evals/tasks/theming-palette-generation/prompt.md @@ -0,0 +1,14 @@ +# Agent Prompt: Custom Branded Theme + +You are working in an Angular 20+ project that already has `igniteui-angular` installed with Sass support. + +Create a custom Ignite UI for Angular theme in `src/styles.scss` with a blue primary and orange secondary palette. 
+ +Requirements: +- Import from `igniteui-angular/theming` using `@use` syntax +- Create a palette with primary #1976D2, secondary #FF9800, and a light surface color +- Configure typography with a sans-serif font family +- Call `@include core()` BEFORE `@include theme()` +- Pass the palette to the `theme()` mixin +- Use the `palette()` function (do NOT hardcode CSS custom properties) +- Use `@use` module syntax (not deprecated `@import`)