diff --git a/skills/data-designer/evals/evals.json b/skills/data-designer/evals/evals.json new file mode 100644 index 000000000..92152275f --- /dev/null +++ b/skills/data-designer/evals/evals.json @@ -0,0 +1,86 @@ +[ + { + "id": "data-designer-autopilot-support-tickets", + "question": "Use the data-designer skill to create synthetic customer support tickets with category, priority, customer sentiment, issue summary, and resolution time. Just build it with sensible defaults and do not ask me follow-up questions.", + "expected_skill": "data-designer", + "expected_script": null, + "ground_truth": "The agent selected the Autopilot workflow and built a Data Designer script for support tickets with appropriate sampler and generated columns, then validated and previewed the configuration.", + "expected_behavior": [ + "The agent read workflows/autopilot.md", + "The agent did not ask the user a clarifying question before building the script", + "The agent ran data-designer agent context before writing the script", + "load_config_builder() returns a DataDesignerConfigBuilder", + "The agent ran data-designer validate on the generated script", + "The agent ran data-designer preview with --save-results on the generated script" + ] + }, + { + "id": "data-designer-autopilot-person-reviews", + "question": "Create a synthetic e-commerce product review dataset with star ratings, review text, product categories, reviewer full names, city, age bracket, and persona-driven review tone. Be opinionated and make the decisions yourself.", + "expected_skill": "data-designer", + "expected_script": "get_person_object_schema.py", + "ground_truth": "The agent used Autopilot and the person sampling reference to create product reviews with person-derived reviewer attributes and persona-driven review tone.", + "expected_behavior": [ + "The agent read workflows/autopilot.md", + "The agent read references/person-sampling.md", + "The agent ran python scripts/get_person_object_schema.py with a locale argument", + "The generated script includes a SamplerColumnConfig for person data", + "The agent ran data-designer validate on the generated script", + "The agent ran data-designer preview with --save-results on the generated script" + ] + }, + { + "id": "data-designer-autopilot-llm-judge-scores", + "question": "Build me a synthetic instruction-following dataset in autopilot: each row should have a user request, an assistant response, an LLM judge quality rubric with correctness and helpfulness scores, and a final accepted boolean based on those numeric scores. Make reasonable assumptions.", + "expected_skill": "data-designer", + "expected_script": null, + "ground_truth": "The agent built an Autopilot Data Designer script using an LLM judge column and correctly referenced nested numeric judge scores when deriving the accepted boolean.", + "expected_behavior": [ + "The agent read workflows/autopilot.md", + "The agent inspected the LLM judge column config schema before writing the script", + "The generated script includes an LLM judge column", + "The generated script includes correctness as an LLM judge score", + "The generated script includes helpfulness as an LLM judge score", + "The accepted boolean derivation references judge scores with .score" + ] + }, + { + "id": "data-designer-autopilot-sampler-params", + "question": "Generate a synthetic IoT sensor telemetry dataset with device_id, site, timestamp, temperature, vibration, status, and anomaly_label. Just take it from here and use your best judgment.", + "expected_skill": "data-designer", + "expected_script": null, + "ground_truth": "The agent built an Autopilot Data Designer script using appropriate built-in samplers and parameters for telemetry data, then validated and previewed the configuration.", + "expected_behavior": [ + "The agent read workflows/autopilot.md", + "The site column is generated by a category sampler", + "The timestamp column is generated by a datetime sampler", + "Every SamplerColumnConfig includes sampler_type", + "Every SamplerColumnConfig includes params", + "No SamplerColumnConfig in the generated script uses sampler_params" + ] + }, + { + "id": "data-designer-negative-database-admin", + "question": "How do I set up a PostgreSQL database with proper indexing for my transaction logs? Be decisive and handle the recommendation without asking me questions.", + "expected_skill": null, + "expected_script": null, + "ground_truth": "The agent provided guidance on PostgreSQL database setup and indexing strategies without invoking the data-designer skill, as this is a database administration question unrelated to synthetic data generation.", + "expected_behavior": [ + "The agent did not read the data-designer SKILL.md", + "The agent did not create synthetic data", + "The agent answered with PostgreSQL indexing guidance" + ] + }, + { + "id": "data-designer-negative-react-component", + "question": "Build a React settings page with a dark mode toggle, notification preferences, and a save button. Make reasonable UI choices and do not ask follow-up questions.", + "expected_skill": null, + "expected_script": null, + "ground_truth": "The agent implemented or described a React settings page without invoking the data-designer skill, because this is a UI task unrelated to creating synthetic datasets or data generation pipelines.", + "expected_behavior": [ + "The agent did not read the data-designer SKILL.md", + "The agent did not create synthetic data", + "The agent's answer contains React UI code" + ] + } +]