diff --git a/skills/data-designer/evals/evals.json b/skills/data-designer/evals/evals.json
new file mode 100644
index 000000000..92152275f
--- /dev/null
+++ b/skills/data-designer/evals/evals.json
@@ -0,0 +1,86 @@
+[
+  {
+    "id": "data-designer-autopilot-support-tickets",
+    "question": "Use the data-designer skill to create synthetic customer support tickets with category, priority, customer sentiment, issue summary, and resolution time. Just build it with sensible defaults and do not ask me follow-up questions.",
+    "expected_skill": "data-designer",
+    "expected_script": null,
+    "ground_truth": "The agent selected the Autopilot workflow and built a Data Designer script for support tickets with appropriate sampler and generated columns, then validated and previewed the configuration.",
+    "expected_behavior": [
+      "The agent read workflows/autopilot.md",
+      "The agent did not ask the user a clarifying question before building the script",
+      "The agent ran data-designer agent context before writing the script",
+      "The generated script defines load_config_builder() and it returns a DataDesignerConfigBuilder",
+      "The agent ran data-designer validate on the generated script",
+      "The agent ran data-designer preview with --save-results on the generated script"
+    ]
+  },
+  {
+    "id": "data-designer-autopilot-person-reviews",
+    "question": "Create a synthetic e-commerce product review dataset with star ratings, review text, product categories, reviewer full names, city, age bracket, and persona-driven review tone. Be opinionated and make the decisions yourself.",
+    "expected_skill": "data-designer",
+    "expected_script": "get_person_object_schema.py",
+    "ground_truth": "The agent used Autopilot and the person sampling reference to create product reviews with person-derived reviewer attributes and persona-driven review tone.",
+    "expected_behavior": [
+      "The agent read workflows/autopilot.md",
+      "The agent read references/person-sampling.md",
+      "The agent ran python scripts/get_person_object_schema.py with a locale argument",
+      "The generated script includes a SamplerColumnConfig for person data",
+      "The agent ran data-designer validate on the generated script",
+      "The agent ran data-designer preview with --save-results on the generated script"
+    ]
+  },
+  {
+    "id": "data-designer-autopilot-llm-judge-scores",
+    "question": "Build me a synthetic instruction-following dataset in autopilot: each row should have a user request, an assistant response, an LLM judge quality rubric with correctness and helpfulness scores, and a final accepted boolean based on those numeric scores. Make reasonable assumptions.",
+    "expected_skill": "data-designer",
+    "expected_script": null,
+    "ground_truth": "The agent built an Autopilot Data Designer script using an LLM judge column and correctly referenced nested numeric judge scores when deriving the accepted boolean.",
+    "expected_behavior": [
+      "The agent read workflows/autopilot.md",
+      "The agent inspected the LLM judge column config schema before writing the script",
+      "The generated script includes an LLM judge column",
+      "The generated script includes correctness as an LLM judge score",
+      "The generated script includes helpfulness as an LLM judge score",
+      "The derivation of the accepted boolean references the nested numeric judge scores via .score"
+    ]
+  },
+  {
+    "id": "data-designer-autopilot-sampler-params",
+    "question": "Generate a synthetic IoT sensor telemetry dataset with device_id, site, timestamp, temperature, vibration, status, and anomaly_label. Just take it from here and use your best judgment.",
+    "expected_skill": "data-designer",
+    "expected_script": null,
+    "ground_truth": "The agent built an Autopilot Data Designer script using appropriate built-in samplers and parameters for telemetry data, then validated and previewed the configuration.",
+    "expected_behavior": [
+      "The agent read workflows/autopilot.md",
+      "The site column is generated by a category sampler",
+      "The timestamp column is generated by a datetime sampler",
+      "Every SamplerColumnConfig includes sampler_type",
+      "Every SamplerColumnConfig includes params",
+      "No SamplerColumnConfig in the generated script uses sampler_params"
+    ]
+  },
+  {
+    "id": "data-designer-negative-database-admin",
+    "question": "How do I set up a PostgreSQL database with proper indexing for my transaction logs? Be decisive and handle the recommendation without asking me questions.",
+    "expected_skill": null,
+    "expected_script": null,
+    "ground_truth": "The agent provided guidance on PostgreSQL database setup and indexing strategies without invoking the data-designer skill, as this is a database administration question unrelated to synthetic data generation.",
+    "expected_behavior": [
+      "The agent did not read the data-designer SKILL.md",
+      "The agent did not create synthetic data",
+      "The agent answered with PostgreSQL indexing guidance"
+    ]
+  },
+  {
+    "id": "data-designer-negative-react-component",
+    "question": "Build a React settings page with a dark mode toggle, notification preferences, and a save button. Make reasonable UI choices and do not ask follow-up questions.",
+    "expected_skill": null,
+    "expected_script": null,
+    "ground_truth": "The agent implemented or described a React settings page without invoking the data-designer skill, because this is a UI task unrelated to creating synthetic datasets or data generation pipelines.",
+    "expected_behavior": [
+      "The agent did not read the data-designer SKILL.md",
+      "The agent did not create synthetic data",
+      "The agent's answer contains React UI code"
+    ]
+  }
+]