From 5832f8798787964bacd8d70b985478419bbc99d9 Mon Sep 17 00:00:00 2001 From: Patrick Date: Sun, 12 Oct 2025 20:33:44 +0200 Subject: [PATCH 1/6] Listing generic pipeline in index --- docs/guides/index.md | 1 + docs/index.md | 2 ++ 2 files changed, 3 insertions(+) diff --git a/docs/guides/index.md b/docs/guides/index.md index 2928407..0bf0add 100644 --- a/docs/guides/index.md +++ b/docs/guides/index.md @@ -11,5 +11,6 @@ Datafast supports generating the following types of datasets: - [Multiple Choice](generating_mcq_datasets.md) - Build datasets with multiple choice questions and answers - [Instruction](generating_ultrachat_datasets.md) - Develop instruction-following datasets - [Preference](generating_preference_datasets.md) - Generate datasets for preference-based learning +- [Generic Pipeline](generating_generic_pipeline_datasets.md) - Build custom input-output LLM synthetic data generation pipelines Select a guide from the navigation menu to learn more about each dataset type. diff --git a/docs/index.md b/docs/index.md index 4a74d5d..ace0ba3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -17,6 +17,7 @@ Create high-quality and diverse synthetic text datasets in minutes, not weeks. - ✅ Instruction Dataset (Ultrachat-like) - ✅ Multiple Choice Question (MCQ) Dataset - ✅ Preference Dataset +- ✅ Generic Pipeline Dataset - ⏳ more to come... ## Supported LLM Providers @@ -118,6 +119,7 @@ Check out our comprehensive guides for different dataset types: - [Multiple Choice Questions](guides/generating_mcq_datasets.md) - Build datasets with multiple choice questions and answers - [Instruction Following](guides/generating_ultrachat_datasets.md) - Develop instruction-following conversation datasets - [Preference Pairs](guides/generating_preference_datasets.md) - Generate datasets for preference-based learning +- [Generic Pipeline](guides/generating_generic_pipeline_datasets.md) - Build custom input-output LLM synthetic data generation pipelines To understand the core concepts behind Datafast, visit our [Concepts](concepts.md) page. From b057486b75c0d1e268257f18f98640a440b45a96 Mon Sep 17 00:00:00 2001 From: Patrick Date: Sun, 12 Oct 2025 20:34:14 +0200 Subject: [PATCH 2/6] docs: add full guide for LLM transformations --- .../generating_generic_pipeline_datasets.md | 305 ++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100644 docs/guides/generating_generic_pipeline_datasets.md diff --git a/docs/guides/generating_generic_pipeline_datasets.md b/docs/guides/generating_generic_pipeline_datasets.md new file mode 100644 index 0000000..dd70e73 --- /dev/null +++ b/docs/guides/generating_generic_pipeline_datasets.md @@ -0,0 +1,305 @@ +# How to Create a Generic Pipeline Dataset + +!!! example "Use Case" + You have an existing dataset with structured information and you want to **transform or augment it using LLMs** with custom prompts. + For example, you might have a dataset of personas and want to generate tweets and CVs for each persona, or you might have product descriptions and want to generate marketing copy in multiple styles. + +The Generic Pipeline Dataset is designed for **maximum flexibility**. 
Unlike other dataset types in Datafast that have predefined structures (like classification labels or MCQ questions), the Generic Pipeline Dataset allows you to:

- **Process any existing dataset** (from Hugging Face Hub or local files)
- **Define custom input-output transformations** using your own prompts
- **Specify exactly which columns** to use as input, to forward, or to generate as output

This makes it ideal for creating custom input-output LLM synthetic data generation steps, which you can even chain together (a minimal chaining sketch follows Step 6).

## Why Use the Generic Pipeline Dataset?

The Generic Pipeline Dataset is perfect when:

1. **You need custom transformations**: The predefined dataset types (classification, MCQ, etc.) don't fit your use case
2. **You have some existing data**: You have seed rows (such as a list of topics) and want to transform them or generate more data for each seed row
3. **You need flexibility**: You want full control over the input columns, output structure, and processing logic

## Step 1: Import Required Modules

Generating a dataset with `datafast` requires three types of imports:

* Dataset
* Configs
* LLM Providers

```python
from datafast.datasets import GenericPipelineDataset
from datafast.schema.config import GenericPipelineDatasetConfig
from datafast.llms import OpenRouterProvider
```

In addition, we'll use `dotenv` to load environment variables containing API keys:

```python
from dotenv import load_dotenv

# Load environment variables containing API keys
load_dotenv("secrets.env")
```

Make sure you have created a `secrets.env` file with your API keys:

```
OPENROUTER_API_KEY=XXXX
HF_TOKEN=hf_XXXXX
```

## Step 2: Understand the Configuration

The `GenericPipelineDatasetConfig` class provides flexible configuration for processing any dataset:

### Key Parameters

- **`hf_dataset_name`** or **`local_file_path`**: Specify your data source
    - Use `hf_dataset_name` for Hugging Face datasets (e.g., `"organization/dataset-name"`)
    - Use `local_file_path` for local files (CSV, TXT, PARQUET, or JSONL)
    - You must provide exactly one of these

- **`input_columns`**: List of column names from your source dataset to use as input
    - These columns will be available as placeholders in your prompts
    - At least one column is required
    - Example: `["persona", "background"]`

- **`forward_columns`**: (Optional) Columns to pass through unchanged to the output
    - Useful for IDs, labels, or metadata you want to preserve
    - Example: `["user_id", "category"]`

- **`output_columns`**: (Optional) Names for the generated data fields
    - If specified, your LLM will generate JSON with these field names
    - If not specified, defaults to a single `"generated_text"` field
    - Example: `["tweet", "cv"]`

- **`prompts`**: List of custom prompt templates
    - **Mandatory placeholders** (use single curly braces):
        - `{num_samples}`: Number of samples to generate per input
        - `{language}`: Language name from the languages config
        - At least one input column as placeholder (e.g., `{persona}`, `{background}`)
        - You can use multiple input columns in your prompt; unused columns will trigger a warning
    - **Optional placeholders** (use double curly braces):
        - Any custom placeholders for prompt expansion (e.g., `{{style}}`)

- **`num_samples_per_prompt`**: Number of outputs to generate for each input row
    - Default: 1
    - Recommended: keep this low (1-3) for complex outputs

- **`sample_count`**: (Optional) Limit the number of rows to process from the source dataset
    - Useful for testing before running on the full dataset

- **`skip_function`**: (Optional) Custom function to skip certain rows
    - Must be a callable that takes a row dict (with all source dataset columns) and returns `True` to skip
    - Example use cases: skip rows with certain keywords, filter by length, exclude based on metadata
    - Example: `lambda row: len(row.get("text", "")) < 100` (skip short texts)

## Step 3: Configure Your Dataset

Let's create a simple example that generates tweets from personas:

```python
# Define a simple prompt template
PROMPT_TEMPLATE = """I will give you a persona description.
Generate {num_samples} authentic tweets in {language} that this person might write.
Make each tweet engaging and true to their character.

Persona: {persona}

Your response should be formatted in valid JSON."""

# Configure the dataset
config = GenericPipelineDatasetConfig(
    # Data source
    hf_dataset_name="patrickfleith/FinePersonas-v0.1-100k-space-filtered",

    # Define input/output structure
    input_columns=["persona"],  # Use persona column as input
    forward_columns=["summary_label"],  # Keep summary_label in output
    output_columns=["tweet"],  # Generate a tweet

    # Processing parameters
    sample_count=10,  # Process only 10 rows for testing
    num_samples_per_prompt=1,  # Generate 1 tweet per persona

    # Prompt configuration
    prompts=[PROMPT_TEMPLATE],  # We could define a variety of prompts here

    # Output file
    output_file="persona_tweets.jsonl",

    # Languages
    languages={"en": "English", "fr": "French"}  # Languages to generate tweets in
)
```

!!! note
    If you need to generate multiple fields at once, specify them in `output_columns`. Make sure your prompt clearly instructs the LLM to generate valid JSON with those exact field names.

    ```python
    MULTI_OUTPUT_PROMPT = """Given this persona, generate {num_samples} complete profiles in {language}:
    1. A tweet they might write
    2. A brief CV/resume

    Persona: {persona}

    Respond with valid JSON containing both fields."""
    ```

## Step 4: Set Up LLM Providers

Configure one or more LLM providers; using several increases output diversity. Read more about LLM providers [here](../llms.md).

```python
providers = [
    OpenRouterProvider(model_id="z-ai/glm-4.6")
]
```

## Step 5: Generate the Dataset

Now generate your dataset:

```python
# Initialize dataset
dataset = GenericPipelineDataset(config)

# Check expected output size
num_expected = dataset.get_num_expected_rows(providers)
print(f"Expected rows: {num_expected}")

# Generate
dataset.generate(providers)

# Print summary
print(f"Generated {len(dataset.data_rows)} examples")
print(f"Saved to {config.output_file}")
```

The generation process:

1. Loads the source dataset
2. For each row in the source dataset:
    - Extracts input columns
    - Formats prompts with input data
    - Applies prompt expansion (if configured)
    - Calls each LLM provider
    - Saves generated outputs to file

## Step 6: Publish to Hugging Face Hub (Optional)

Push your generated dataset to Hugging Face:

```python
url = dataset.push_to_hub(
    repo_id="YOUR_USERNAME/persona-tweets",
    train_size=0.8,
    seed=42,
    shuffle=True
)
print(f"Dataset published at: {url}")
```

!!! warning
    Ensure your `HF_TOKEN` environment variable is set before pushing to the Hub.
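
## Optional: Chain Multiple Pipelines

Because each run writes a plain JSONL file, the output of one pipeline can become the local source of the next. The sketch below is a minimal illustration rather than a prescribed workflow: it assumes the `persona_tweets.jsonl` file generated above exists, and the second-stage prompt and the `linkedin_post` output column are hypothetical names chosen for this example.

```python
# Hypothetical second stage: turn each generated tweet into a LinkedIn-style post.
# Assumes persona_tweets.jsonl was produced by the pipeline configured above.
REWRITE_PROMPT = """I will give you a tweet.
Rewrite it as {num_samples} professional LinkedIn-style post(s) in {language}.

Tweet: {tweet}

Your response should be formatted in valid JSON."""

stage2_config = GenericPipelineDatasetConfig(
    local_file_path="persona_tweets.jsonl",  # output of the first pipeline
    input_columns=["tweet"],                 # a generated column becomes the new input
    forward_columns=["summary_label"],       # keep forwarding the original metadata
    output_columns=["linkedin_post"],
    num_samples_per_prompt=1,
    prompts=[REWRITE_PROMPT],
    output_file="persona_linkedin_posts.jsonl",
    languages={"en": "English"},
)

stage2_dataset = GenericPipelineDataset(stage2_config)
stage2_dataset.generate(providers)
```

Each stage stays independently configurable, so you can test a stage in isolation with a small `sample_count` before wiring it into a longer chain.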
## Complete Example

Here's a complete working example:

```python
from datafast.datasets import GenericPipelineDataset
from datafast.schema.config import GenericPipelineDatasetConfig
from datafast.llms import OpenRouterProvider
from dotenv import load_dotenv

# Load API keys
load_dotenv("secrets.env")

# Define prompt
PROMPT = """I will give you a persona description.
Generate {num_samples} authentic tweets in {language} that this person might write.
Make each tweet engaging and true to their character.

Persona: {persona}

Your response should be formatted in valid JSON."""

# Configure dataset
config = GenericPipelineDatasetConfig(
    hf_dataset_name="patrickfleith/FinePersonas-v0.1-100k-space-filtered",
    input_columns=["persona"],
    forward_columns=["summary_label"],
    output_columns=["tweet"],
    sample_count=10,
    num_samples_per_prompt=1,
    prompts=[PROMPT],
    output_file="persona_tweets.jsonl",
    languages={"en": "English", "fr": "French"}
)

# Set up provider
providers = [
    OpenRouterProvider(model_id="z-ai/glm-4.6")
]

# Generate dataset
dataset = GenericPipelineDataset(config)
num_expected = dataset.get_num_expected_rows(providers)
print(f"Expected rows: {num_expected}")
dataset.generate(providers)
print(f"Generated {len(dataset.data_rows)} examples")
print(f"Saved to {config.output_file}")

# Optional: Push to Hub
url = dataset.push_to_hub(
    repo_id="YOUR_USERNAME/persona-tweets",  # <--- Your Hugging Face username
    train_size=0.8,
    seed=42,
    shuffle=True
)
print(f"Dataset published at: {url}")
```

## Understanding the Generated Data

Each generated example is stored as a `GenericPipelineRow` with:

- **Input columns**: Original data from your source (e.g., `persona`)
- **Forward columns**: Pass-through data without modification (e.g., `summary_label`)
- **Output columns**: Generated text data (e.g., `tweet`, `cv`)
- **`model_id`**: The LLM model that generated this output
- **`pipeline_source`**: Source type (always `SYNTHETIC`)
- **`language`**: Language code for the generated content
- **`metadata`**: Additional info like prompt index and source row index
- **`uuid`**: Unique identifier

## Advanced: Skip Function

Filter which rows to process using a skip function:

```python
def skip_short_personas(row):
    """Skip personas with very short descriptions"""
    persona = row.get("persona", "")
    return len(persona) < 50

config = GenericPipelineDatasetConfig(
    hf_dataset_name="patrickfleith/FinePersonas-v0.1-100k-space-filtered",
    input_columns=["persona"],
    output_columns=["tweet"],
    skip_function=skip_short_personas,  # Apply filter
    prompts=["Generate {num_samples} tweets in {language} for: {persona}"],
    output_file="filtered_tweets.jsonl",
    languages={"en": "English"}
)
```

Note that the prompt here still includes the mandatory `{num_samples}` and `{language}` placeholders alongside the input column.

## Next Steps

- Explore [Prompt Expansion](prompt_expansion.md) for more advanced diversity techniques
- Learn about other dataset types in the [Guides Index](index.md)
- Check out the [Concepts](../concepts.md) page to understand Datafast's architecture

From e67260c057995d583d1a07ba72c47a365554af85 Mon Sep 17 00:00:00 2001
From: Patrick
Date: Sun, 12 Oct 2025 22:45:35 +0200
Subject: [PATCH 3/6] docs: add OpenRouter provider documentation and update model examples

---
 docs/index.md |  9 ++++++---
 docs/llms.md  | 20 +++++++++++++++++++-
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/docs/index.md b/docs/index.md
index ace0ba3..01fdce7 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -27,7 +27,8 @@ Currently we support the following LLM providers:
- ✔︎ OpenAI
- ✔︎ Anthropic
- ✔︎ Google Gemini
-- ✔︎ Ollama
+- ✔︎ Ollama (your local LLM server)
+- ✔︎ OpenRouter (almost any LLM, including open-source models)
- ⏳ more to come...

## Quick Start
@@ -41,6 +42,7 @@ Other keys depend on which LLM providers you use.
GEMINI_API_KEY=XXXX
OPENAI_API_KEY=sk-XXXX
ANTHROPIC_API_KEY=sk-ant-XXXXX
+OPENROUTER_API_KEY=XXXXX
HF_TOKEN=hf_XXXXX
```
@@ -48,7 +50,7 @@ HF_TOKEN=hf_XXXXX
```python
from datafast.datasets import ClassificationDataset
from datafast.schema.config import ClassificationDatasetConfig, PromptExpansionConfig
-from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider
+from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider, OpenRouterProvider
from dotenv import load_dotenv

# Load environment variables
@@ -93,7 +95,8 @@ config = ClassificationDatasetConfig(
providers = [
    OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"),
    AnthropicProvider(model_id="claude-3-5-haiku-latest"),
-    GeminiProvider(model_id="gemini-2.0-flash")
+    GeminiProvider(model_id="gemini-2.5-flash"),
+    OpenRouterProvider(model_id="z-ai/glm-4.6")
]
```

diff --git a/docs/llms.md b/docs/llms.md
index 458165a..e34edc2 100644
--- a/docs/llms.md
+++ b/docs/llms.md
@@ -41,7 +41,7 @@ openrouter_llm = OpenRouterProvider()
```python
openai_llm = OpenAIProvider(
-    model_id="gpt-4o-mini", # Custom model
+    model_id="gpt-5-mini-2025-08-07", # Custom model
    temperature=0.2, # Lower temperature for more deterministic outputs
    max_completion_tokens=100, # Limit token generation
    top_p=0.9, # Nucleus sampling parameter
@@ -53,6 +53,17 @@ ollama_llm = OllamaProvider(
    model_id="llama3.2:latest",
    api_base="http://localhost:11434" # <--- this is the default url
)
+
+# OpenRouter with different models
+openrouter_llm = OpenRouterProvider(
+    model_id="z-ai/glm-4.6", # Access glm-4.6 via OpenRouter
+    temperature=0.7,
+    max_completion_tokens=500
+)
+
+# You can access many models through OpenRouter
+openrouter_deepseek = OpenRouterProvider(model_id="deepseek/deepseek-r1-0528")
+openrouter_qwen = OpenRouterProvider(model_id="qwen/qwen3-next-80b-a3b-instruct")
```

## API Keys
@@ -74,6 +85,13 @@ openrouter_llm = OpenRouterProvider(api_key="your-openrouter-key")
```

**Note**: Ollama typically runs locally and doesn't require an API key. You can set `OLLAMA_API_BASE` to specify a custom endpoint (defaults to `http://localhost:11434`).

+!!! warning
+    Note that `gpt-oss:20b` or `gpt-oss:120b` do not work well with structured output. Therefore we recommend not using them with datafast.
+
+## About OpenRouter
+
+[OpenRouter](https://openrouter.ai/) provides access to a wide variety of LLMs through a single API key. Model IDs follow the format `provider/model-name` (e.g., `deepseek/deepseek-r1-0528`, `qwen/qwen3-next-80b-a3b-instruct`). Visit [OpenRouter's models page](https://openrouter.ai/models) for the complete list.
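+
+As a quick sketch, OpenRouter-hosted models can also be mixed with first-party providers in a single `providers` list (the model IDs below are the examples cited above and may change over time):
+
+```python
+from datafast.llms import OpenAIProvider, OpenRouterProvider
+
+# Illustrative only: one first-party provider plus two OpenRouter-hosted models,
+# useful for diversifying generations across model families.
+providers = [
+    OpenAIProvider(model_id="gpt-5-mini-2025-08-07"),
+    OpenRouterProvider(model_id="deepseek/deepseek-r1-0528"),
+    OpenRouterProvider(model_id="qwen/qwen3-next-80b-a3b-instruct"),
+]
+```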
+ ## Generation Methods ### Simple Text Generation From 9a47e6b64e1cef9b38d2dfd57fd4c140bbd2c854 Mon Sep 17 00:00:00 2001 From: Patrick Date: Tue, 14 Oct 2025 23:30:03 +0200 Subject: [PATCH 4/6] update OpenAI model ID from gpt-4.1-mini to gpt-5-mini-2025-08-07 across all files --- README.md | 2 +- .../examples/classification_trail_conditions_example.py | 2 +- datafast/examples/generic_pipeline_row_model_example.py | 2 +- datafast/examples/inspect_dataset_example.py | 2 +- datafast/examples/mcq_contextual_example.py | 4 ++-- datafast/examples/mcq_example.py | 2 +- datafast/examples/preference_dataset_example.py | 4 ++-- datafast/examples/raw_text_space_engineering_example.py | 2 +- datafast/llms.py | 8 ++++---- docs/guides/generating_mcq_datasets.md | 4 ++-- docs/guides/generating_preference_datasets.md | 8 ++++---- docs/guides/generating_text_classification_datasets.md | 4 ++-- docs/guides/generating_text_datasets.md | 4 ++-- docs/index.md | 2 +- docs/llms.md | 2 +- tests/test_llms.py | 4 ++-- 16 files changed, 28 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index a80d368..93c77d0 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ config = ClassificationDatasetConfig( ```python # Create LLM providers providers = [ - OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"), + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), AnthropicProvider(model_id="claude-3-5-haiku-latest"), GeminiProvider(model_id="gemini-2.0-flash") ] diff --git a/datafast/examples/classification_trail_conditions_example.py b/datafast/examples/classification_trail_conditions_example.py index 1fe6264..5ec037a 100644 --- a/datafast/examples/classification_trail_conditions_example.py +++ b/datafast/examples/classification_trail_conditions_example.py @@ -58,7 +58,7 @@ # Set up providers providers = [ - OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"), + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), AnthropicProvider(model_id="claude-3-5-haiku-latest") ] diff --git a/datafast/examples/generic_pipeline_row_model_example.py b/datafast/examples/generic_pipeline_row_model_example.py index 41323b1..6536605 100644 --- a/datafast/examples/generic_pipeline_row_model_example.py +++ b/datafast/examples/generic_pipeline_row_model_example.py @@ -32,7 +32,7 @@ question="Qu'est ce qui t'a plu?", # System fields - model_id="gpt-4", + model_id="gpt-5-mini-2025-08-07", language="fr" ) diff --git a/datafast/examples/inspect_dataset_example.py b/datafast/examples/inspect_dataset_example.py index 104d72a..2d84cd4 100644 --- a/datafast/examples/inspect_dataset_example.py +++ b/datafast/examples/inspect_dataset_example.py @@ -46,7 +46,7 @@ from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider providers = [ - OpenAIProvider(model_id="gpt-4.1-nano"), + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # Uncomment to use additional providers # AnthropicProvider(model_id="claude-3-5-haiku-latest"), # GeminiProvider(model_id="gemini-2.0-flash"), diff --git a/datafast/examples/mcq_contextual_example.py b/datafast/examples/mcq_contextual_example.py index 48cc9d9..e8e853f 100644 --- a/datafast/examples/mcq_contextual_example.py +++ b/datafast/examples/mcq_contextual_example.py @@ -50,9 +50,9 @@ def main(): output_file="mcq_ar6_contextual_dataset.jsonl", ) - # 3. Initialize OpenAI provider with gpt-4.1-mini + # 3. Initialize OpenAI provider with gpt-5-mini-2025-08-07 providers = [ - OpenAIProvider(model_id="gpt-4.1-mini"), + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), ] # 4. 
Generate the dataset diff --git a/datafast/examples/mcq_example.py b/datafast/examples/mcq_example.py index 6eb8d65..1103aa0 100644 --- a/datafast/examples/mcq_example.py +++ b/datafast/examples/mcq_example.py @@ -25,7 +25,7 @@ def main(): # 2. Initialize LLM providers providers = [ - OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"), + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # AnthropicProvider(model_id="claude-3-5-haiku-latest"), # GeminiProvider(model_id="gemini-2.0-flash"), ] diff --git a/datafast/examples/preference_dataset_example.py b/datafast/examples/preference_dataset_example.py index 54c1170..d162791 100644 --- a/datafast/examples/preference_dataset_example.py +++ b/datafast/examples/preference_dataset_example.py @@ -44,10 +44,10 @@ def main(): ) # 2. Initialize LLM providers - question_gen_llm = OpenAIProvider(model_id="gpt-4.1-mini") + question_gen_llm = OpenAIProvider(model_id="gpt-5-mini-2025-08-07") chosen_response_gen_llm = AnthropicProvider(model_id="claude-3-7-sonnet-latest") rejected_response_gen_llm = GeminiProvider(model_id="gemini-2.0-flash") - judge_llm = OpenAIProvider(model_id="gpt-4.1") + judge_llm = OpenAIProvider(model_id="gpt-5-mini-2025-08-07") # 3. Generate the dataset dataset = PreferenceDataset(config) diff --git a/datafast/examples/raw_text_space_engineering_example.py b/datafast/examples/raw_text_space_engineering_example.py index a1eca16..7e8b879 100644 --- a/datafast/examples/raw_text_space_engineering_example.py +++ b/datafast/examples/raw_text_space_engineering_example.py @@ -42,7 +42,7 @@ def main(): # 2. Create LLM providers with specific models providers = [ - OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"), # You may want to use stronger models + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # You may want to use stronger models AnthropicProvider(model_id="claude-3-5-haiku-latest"), ] diff --git a/datafast/llms.py b/datafast/llms.py index b812cae..fdb2a8b 100644 --- a/datafast/llms.py +++ b/datafast/llms.py @@ -242,7 +242,7 @@ def env_key_name(self) -> str: def __init__( self, - model_id: str = "gpt-4.1-mini-2025-04-14", + model_id: str = "gpt-5-mini-2025-08-07", api_key: str | None = None, temperature: float | None = None, max_completion_tokens: int | None = None, @@ -252,7 +252,7 @@ def __init__( """Initialize the OpenAI provider. Args: - model_id: The model ID (defaults to gpt-4.1-mini-2025-04-14) + model_id: The model ID (defaults to gpt-5-mini-2025-08-07) api_key: API key (if None, will get from environment) temperature: The sampling temperature to be used, between 0 and 2. Higher values like 0.8 produce more random outputs, while lower values like 0.2 make outputs more focused and deterministic max_completion_tokens: An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens. @@ -419,7 +419,7 @@ def env_key_name(self) -> str: def __init__( self, - model_id: str = "openai/gpt-4.1-mini", # for default model + model_id: str = "openai/gpt-5-mini", # for default model api_key: str | None = None, temperature: float | None = None, max_completion_tokens: int | None = None, @@ -429,7 +429,7 @@ def __init__( """Initialize the OpenRouter provider. 
Args: - model_id: The model ID (defaults to openai/gpt-4.1-mini) + model_id: The model ID (defaults to openai/gpt-5-mini) api_key: API key (if None, will get from environment) temperature: Temperature for generation (0.0 to 1.0) max_completion_tokens: Maximum tokens to generate diff --git a/docs/guides/generating_mcq_datasets.md b/docs/guides/generating_mcq_datasets.md index c8a5b21..bfb9a73 100644 --- a/docs/guides/generating_mcq_datasets.md +++ b/docs/guides/generating_mcq_datasets.md @@ -183,7 +183,7 @@ Configure one or more LLM providers to generate your dataset: ```python providers = [ - OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"), + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), AnthropicProvider(model_id="claude-3-5-haiku-latest"), GeminiProvider(model_id="gemini-2.0-flash") ] @@ -272,7 +272,7 @@ def main(): # 2. Initialize LLM providers providers = [ - OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"), + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # Add more providers as needed # AnthropicProvider(model_id="claude-3-5-haiku-latest"), # GeminiProvider(model_id="gemini-2.0-flash"), diff --git a/docs/guides/generating_preference_datasets.md b/docs/guides/generating_preference_datasets.md index 1764cc4..0cebb12 100644 --- a/docs/guides/generating_preference_datasets.md +++ b/docs/guides/generating_preference_datasets.md @@ -151,7 +151,7 @@ Configure LLM providers for different aspects of dataset generation: ```python # For generating questions from NASA lessons learned documents -question_gen_llm = OpenAIProvider(model_id="gpt-4.1-mini") +question_gen_llm = OpenAIProvider(model_id="gpt-5-mini-2025-08-07") # For generating high-quality (chosen) responses chosen_response_gen_llm = AnthropicProvider(model_id="claude-3-7-sonnet-latest") @@ -160,7 +160,7 @@ chosen_response_gen_llm = AnthropicProvider(model_id="claude-3-7-sonnet-latest") rejected_response_gen_llm = GeminiProvider(model_id="gemini-2.0-flash") # For scoring responses (only needed if llm_as_judge=True) -judge_llm = OpenAIProvider(model_id="gpt-4.1") +judge_llm = OpenAIProvider(model_id="gpt-5-mini-2025-08-07") ``` Using different providers for different aspects of generation helps create more diverse and realistic preference pairs. @@ -265,10 +265,10 @@ config = PreferenceDatasetConfig( ) # 2. Initialize LLM providers -question_gen_llm = OpenAIProvider(model_id="gpt-4.1-mini") +question_gen_llm = OpenAIProvider(model_id="gpt-5-mini-2025-08-07") chosen_response_gen_llm = AnthropicProvider(model_id="claude-3-7-sonnet-latest") rejected_response_gen_llm = GeminiProvider(model_id="gemini-2.0-flash") -judge_llm = OpenAIProvider(model_id="gpt-4.1") +judge_llm = OpenAIProvider(model_id="gpt-5-mini-2025-08-07") # 3. 
Generate the dataset dataset = PreferenceDataset(config) diff --git a/docs/guides/generating_text_classification_datasets.md b/docs/guides/generating_text_classification_datasets.md index d99d7ad..81e983e 100644 --- a/docs/guides/generating_text_classification_datasets.md +++ b/docs/guides/generating_text_classification_datasets.md @@ -166,7 +166,7 @@ Configure one or more LLM providers to generate your dataset: ```python providers = [ - OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"), + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), AnthropicProvider(model_id="claude-3-5-haiku-latest") ] ``` @@ -293,7 +293,7 @@ config = ClassificationDatasetConfig( # Set up providers providers = [ - OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"), + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), AnthropicProvider(model_id="claude-3-5-haiku-latest") ] diff --git a/docs/guides/generating_text_datasets.md b/docs/guides/generating_text_datasets.md index 4f5736b..a834c71 100644 --- a/docs/guides/generating_text_datasets.md +++ b/docs/guides/generating_text_datasets.md @@ -156,7 +156,7 @@ Configure one or more LLM providers to generate your dataset: ```python providers = [ - OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"), # You may want to use stronger models + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # You may want to use stronger models AnthropicProvider(model_id="claude-3-5-haiku-latest"), ] ``` @@ -280,7 +280,7 @@ def main(): # 2. Create LLM providers with specific models providers = [ - OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"), # You may want to use stronger models + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # You may want to use stronger models AnthropicProvider(model_id="claude-3-5-haiku-latest"), ] diff --git a/docs/index.md b/docs/index.md index 01fdce7..6e18634 100644 --- a/docs/index.md +++ b/docs/index.md @@ -93,7 +93,7 @@ config = ClassificationDatasetConfig( ```python # Create LLM providers providers = [ - OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"), + OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), AnthropicProvider(model_id="claude-3-5-haiku-latest"), GeminiProvider(model_id="gemini-2.5-flash"), OpenRouterProvider(model_id="z-ai/glm-4.6") diff --git a/docs/llms.md b/docs/llms.md index e34edc2..c829d25 100644 --- a/docs/llms.md +++ b/docs/llms.md @@ -21,7 +21,7 @@ from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider, Oll Each provider can be instantiated with default parameters: ```python -# OpenAI (default: gpt-4.1-mini-2025-04-14) +# OpenAI (default: gpt-5-mini-2025-08-07) openai_llm = OpenAIProvider() # Anthropic (default: claude-3-5-haiku-latest) diff --git a/tests/test_llms.py b/tests/test_llms.py index eb61ad8..2038681 100644 --- a/tests/test_llms.py +++ b/tests/test_llms.py @@ -268,7 +268,7 @@ def test_openrouter_messages_with_structured_output(): def test_openai_with_all_parameters(): """Test OpenAI provider with all optional parameters specified.""" provider = OpenAIProvider( - model_id="gpt-4.1-mini-2025-04-14", + model_id="gpt-5-mini-2025-08-07", temperature=0.2, max_completion_tokens=100, top_p=0.9, @@ -976,7 +976,7 @@ def test_gemini_batch_messages_with_structured_output(): def test_openai_batch_with_all_parameters(): """Test OpenAI provider with batch processing and all optional parameters.""" provider = OpenAIProvider( - model_id="gpt-4.1-mini-2025-04-14", + model_id="gpt-5-mini-2025-08-07", temperature=0.1, max_completion_tokens=50, top_p=0.9, From 41b7dc9dfbd6559b42861119f204ae06e1425f7e Mon Sep 17 
00:00:00 2001 From: Patrick Date: Tue, 14 Oct 2025 23:40:33 +0200 Subject: [PATCH 5/6] update Anthropic default model from claude-3-5-haiku-latest to claude-sonnet-4-5-20250929 across codebase --- README.md | 2 +- datafast/examples/classification_trail_conditions_example.py | 2 +- datafast/examples/generic_pipeline_example.py | 2 +- datafast/examples/inspect_dataset_example.py | 2 +- datafast/examples/mcq_example.py | 2 +- datafast/examples/quickstart_example.py | 2 +- datafast/examples/raw_text_space_engineering_example.py | 2 +- datafast/examples/show_dataset_examples.py | 4 ++-- datafast/examples/ultrachat_materials_science.py | 2 +- datafast/llms.py | 4 ++-- docs/guides/generating_mcq_datasets.md | 4 ++-- docs/guides/generating_text_classification_datasets.md | 4 ++-- docs/guides/generating_text_datasets.md | 4 ++-- docs/guides/generating_ultrachat_datasets.md | 4 ++-- docs/index.md | 2 +- docs/llms.md | 2 +- tests/test_llms.py | 4 ++-- 17 files changed, 24 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 93c77d0..84efcc7 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ config = ClassificationDatasetConfig( # Create LLM providers providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - AnthropicProvider(model_id="claude-3-5-haiku-latest"), + AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), GeminiProvider(model_id="gemini-2.0-flash") ] ``` diff --git a/datafast/examples/classification_trail_conditions_example.py b/datafast/examples/classification_trail_conditions_example.py index 5ec037a..5789267 100644 --- a/datafast/examples/classification_trail_conditions_example.py +++ b/datafast/examples/classification_trail_conditions_example.py @@ -59,7 +59,7 @@ # Set up providers providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - AnthropicProvider(model_id="claude-3-5-haiku-latest") + AnthropicProvider(model_id="claude-sonnet-4-5-20250929") ] # Generate dataset diff --git a/datafast/examples/generic_pipeline_example.py b/datafast/examples/generic_pipeline_example.py index b010d41..0b31035 100644 --- a/datafast/examples/generic_pipeline_example.py +++ b/datafast/examples/generic_pipeline_example.py @@ -42,7 +42,7 @@ def main(): model_id="gpt-5-mini-2025-08-07", temperature=1 ), - # AnthropicProvider(model_id="claude-3-5-haiku-latest"), + # AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), # GeminiProvider(model_id="gemini-2.5-flash-lite", rpm_limit=15), # OllamaProvider(model_id="gemma3:4b"), ] diff --git a/datafast/examples/inspect_dataset_example.py b/datafast/examples/inspect_dataset_example.py index 2d84cd4..a7fff6d 100644 --- a/datafast/examples/inspect_dataset_example.py +++ b/datafast/examples/inspect_dataset_example.py @@ -48,7 +48,7 @@ providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # Uncomment to use additional providers - # AnthropicProvider(model_id="claude-3-5-haiku-latest"), + # AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), # GeminiProvider(model_id="gemini-2.0-flash"), ] diff --git a/datafast/examples/mcq_example.py b/datafast/examples/mcq_example.py index 1103aa0..1e67b60 100644 --- a/datafast/examples/mcq_example.py +++ b/datafast/examples/mcq_example.py @@ -26,7 +26,7 @@ def main(): # 2. 
Initialize LLM providers providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - # AnthropicProvider(model_id="claude-3-5-haiku-latest"), + # AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), # GeminiProvider(model_id="gemini-2.0-flash"), ] diff --git a/datafast/examples/quickstart_example.py b/datafast/examples/quickstart_example.py index 7565adf..f0e3af2 100644 --- a/datafast/examples/quickstart_example.py +++ b/datafast/examples/quickstart_example.py @@ -35,7 +35,7 @@ providers = [ OpenAIProvider(model_id="gpt-5-nano-2025-08-07"), - # AnthropicProvider(model_id="claude-3-5-haiku-latest"), + # AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), # GeminiProvider(model_id="gemini-2.0-flash"), # OllamaProvider(model_id="gemma3:12b") ] diff --git a/datafast/examples/raw_text_space_engineering_example.py b/datafast/examples/raw_text_space_engineering_example.py index 7e8b879..18f8de8 100644 --- a/datafast/examples/raw_text_space_engineering_example.py +++ b/datafast/examples/raw_text_space_engineering_example.py @@ -43,7 +43,7 @@ def main(): # 2. Create LLM providers with specific models providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # You may want to use stronger models - AnthropicProvider(model_id="claude-3-5-haiku-latest"), + AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), ] # 3. Generate the dataset diff --git a/datafast/examples/show_dataset_examples.py b/datafast/examples/show_dataset_examples.py index a8ea3a2..d06cc68 100644 --- a/datafast/examples/show_dataset_examples.py +++ b/datafast/examples/show_dataset_examples.py @@ -46,7 +46,7 @@ classification_row2 = TextClassificationRow( text="The trail is well maintained and easy to follow.", label="positive_conditions", - model_id="claude-3-5-haiku-latest", + model_id="claude-sonnet-4-5-20250929", language="en", ) classification_dataset.data_rows = [classification_row, classification_row2] @@ -85,7 +85,7 @@ question="What was the main goal of the Mars 2020 mission?", chosen_response="To search for signs of ancient life and collect samples.", rejected_response="To launch a satellite.", - chosen_model_id="claude-3-5-haiku-latest", + chosen_model_id="claude-sonnet-4-5-20250929", rejected_model_id="gpt-4.1-nano", chosen_response_score=9, rejected_response_score=3, diff --git a/datafast/examples/ultrachat_materials_science.py b/datafast/examples/ultrachat_materials_science.py index 3cfc88e..7bc3d4d 100644 --- a/datafast/examples/ultrachat_materials_science.py +++ b/datafast/examples/ultrachat_materials_science.py @@ -25,7 +25,7 @@ def main(): # 2. Initialize LLM providers - using just one for simplicity providers = [ - AnthropicProvider(model_id="claude-3-5-haiku-latest"), + AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), ] # 3. Get expected number of rows diff --git a/datafast/llms.py b/datafast/llms.py index fdb2a8b..7999b38 100644 --- a/datafast/llms.py +++ b/datafast/llms.py @@ -282,7 +282,7 @@ def env_key_name(self) -> str: def __init__( self, - model_id: str = "claude-3-5-haiku-latest", + model_id: str = "claude-sonnet-4-5-20250929", api_key: str | None = None, temperature: float | None = None, max_completion_tokens: int | None = None, @@ -292,7 +292,7 @@ def __init__( """Initialize the Anthropic provider. 
Args: - model_id: The model ID (defaults to claude-3-5-haiku-latest) + model_id: The model ID (defaults to claude-sonnet-4-5-20250929) api_key: API key (if None, will get from environment) temperature: Temperature for generation (0.0 to 1.0) max_completion_tokens: Maximum tokens to generate diff --git a/docs/guides/generating_mcq_datasets.md b/docs/guides/generating_mcq_datasets.md index bfb9a73..77ac0e3 100644 --- a/docs/guides/generating_mcq_datasets.md +++ b/docs/guides/generating_mcq_datasets.md @@ -184,7 +184,7 @@ Configure one or more LLM providers to generate your dataset: ```python providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - AnthropicProvider(model_id="claude-3-5-haiku-latest"), + AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), GeminiProvider(model_id="gemini-2.0-flash") ] ``` @@ -274,7 +274,7 @@ def main(): providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # Add more providers as needed - # AnthropicProvider(model_id="claude-3-5-haiku-latest"), + # AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), # GeminiProvider(model_id="gemini-2.0-flash"), ] diff --git a/docs/guides/generating_text_classification_datasets.md b/docs/guides/generating_text_classification_datasets.md index 81e983e..ab504a0 100644 --- a/docs/guides/generating_text_classification_datasets.md +++ b/docs/guides/generating_text_classification_datasets.md @@ -167,7 +167,7 @@ Configure one or more LLM providers to generate your dataset: ```python providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - AnthropicProvider(model_id="claude-3-5-haiku-latest") + AnthropicProvider(model_id="claude-sonnet-4-5-20250929") ] ``` @@ -294,7 +294,7 @@ config = ClassificationDatasetConfig( # Set up providers providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - AnthropicProvider(model_id="claude-3-5-haiku-latest") + AnthropicProvider(model_id="claude-sonnet-4-5-20250929") ] # Generate dataset diff --git a/docs/guides/generating_text_datasets.md b/docs/guides/generating_text_datasets.md index a834c71..fb8c7ef 100644 --- a/docs/guides/generating_text_datasets.md +++ b/docs/guides/generating_text_datasets.md @@ -157,7 +157,7 @@ Configure one or more LLM providers to generate your dataset: ```python providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # You may want to use stronger models - AnthropicProvider(model_id="claude-3-5-haiku-latest"), + AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), ] ``` @@ -281,7 +281,7 @@ def main(): # 2. Create LLM providers with specific models providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # You may want to use stronger models - AnthropicProvider(model_id="claude-3-5-haiku-latest"), + AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), ] # 3. Generate the dataset diff --git a/docs/guides/generating_ultrachat_datasets.md b/docs/guides/generating_ultrachat_datasets.md index ff8673f..6458fe2 100644 --- a/docs/guides/generating_ultrachat_datasets.md +++ b/docs/guides/generating_ultrachat_datasets.md @@ -153,7 +153,7 @@ Configure one or more LLM providers to generate your dataset: ```python providers = [ - AnthropicProvider(model_id="claude-3-5-haiku-latest"), + AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), ] ``` @@ -259,7 +259,7 @@ def main(): # 2. Initialize LLM providers - using just one for simplicity providers = [ - AnthropicProvider(model_id="claude-3-5-haiku-latest"), + AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), ] # 3. 
Get expected number of rows diff --git a/docs/index.md b/docs/index.md index 6e18634..4b68f54 100644 --- a/docs/index.md +++ b/docs/index.md @@ -94,7 +94,7 @@ config = ClassificationDatasetConfig( # Create LLM providers providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - AnthropicProvider(model_id="claude-3-5-haiku-latest"), + AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), GeminiProvider(model_id="gemini-2.5-flash"), OpenRouterProvider(model_id="z-ai/glm-4.6") ] diff --git a/docs/llms.md b/docs/llms.md index c829d25..1783cf1 100644 --- a/docs/llms.md +++ b/docs/llms.md @@ -24,7 +24,7 @@ Each provider can be instantiated with default parameters: # OpenAI (default: gpt-5-mini-2025-08-07) openai_llm = OpenAIProvider() -# Anthropic (default: claude-3-5-haiku-latest) +# Anthropic (default:claude-sonnet-4-5-20250929) anthropic_llm = AnthropicProvider() # Gemini (default: gemini-2.0-flash) diff --git a/tests/test_llms.py b/tests/test_llms.py index 2038681..064c110 100644 --- a/tests/test_llms.py +++ b/tests/test_llms.py @@ -327,7 +327,7 @@ def test_gemini_messages_with_structured_output(): def test_anthropic_with_all_parameters(): """Test Anthropic provider with all optional parameters specified.""" provider = AnthropicProvider( - model_id="claude-3-5-haiku-latest", + model_id="claude-sonnet-4-5-20250929", temperature=0.3, max_completion_tokens=200, top_p=0.95, @@ -999,7 +999,7 @@ def test_openai_batch_with_all_parameters(): def test_anthropic_batch_with_all_parameters(): """Test Anthropic provider with batch processing and all optional parameters.""" provider = AnthropicProvider( - model_id="claude-3-5-haiku-latest", + model_id="claude-sonnet-4-5-20250929", temperature=0.1, max_completion_tokens=50, top_p=0.9 From c495b753e3ed1d32252cb26515723747fc155b53 Mon Sep 17 00:00:00 2001 From: Patrick Date: Wed, 15 Oct 2025 23:16:56 +0200 Subject: [PATCH 6/6] Update default anthropic model to Claude Haiku 4.5 --- README.md | 2 +- datafast/examples/classification_trail_conditions_example.py | 2 +- datafast/examples/generic_pipeline_example.py | 2 +- datafast/examples/inspect_dataset_example.py | 2 +- datafast/examples/mcq_example.py | 2 +- datafast/examples/quickstart_example.py | 2 +- datafast/examples/raw_text_space_engineering_example.py | 2 +- datafast/examples/show_dataset_examples.py | 4 ++-- datafast/examples/ultrachat_materials_science.py | 2 +- datafast/llms.py | 4 ++-- docs/guides/generating_mcq_datasets.md | 4 ++-- docs/guides/generating_text_classification_datasets.md | 4 ++-- docs/guides/generating_text_datasets.md | 4 ++-- docs/guides/generating_ultrachat_datasets.md | 4 ++-- docs/index.md | 2 +- docs/llms.md | 2 +- tests/test_llms.py | 4 ++-- 17 files changed, 24 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 84efcc7..fd76aa7 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ config = ClassificationDatasetConfig( # Create LLM providers providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + AnthropicProvider(model_id="claude-haiku-4-5-20251001"), GeminiProvider(model_id="gemini-2.0-flash") ] ``` diff --git a/datafast/examples/classification_trail_conditions_example.py b/datafast/examples/classification_trail_conditions_example.py index 5789267..c6a33a0 100644 --- a/datafast/examples/classification_trail_conditions_example.py +++ b/datafast/examples/classification_trail_conditions_example.py @@ -59,7 +59,7 @@ # Set up providers providers = [ 
OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - AnthropicProvider(model_id="claude-sonnet-4-5-20250929") + AnthropicProvider(model_id="claude-haiku-4-5-20251001") ] # Generate dataset diff --git a/datafast/examples/generic_pipeline_example.py b/datafast/examples/generic_pipeline_example.py index 0b31035..e09a685 100644 --- a/datafast/examples/generic_pipeline_example.py +++ b/datafast/examples/generic_pipeline_example.py @@ -42,7 +42,7 @@ def main(): model_id="gpt-5-mini-2025-08-07", temperature=1 ), - # AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + # AnthropicProvider(model_id="claude-haiku-4-5-20251001"), # GeminiProvider(model_id="gemini-2.5-flash-lite", rpm_limit=15), # OllamaProvider(model_id="gemma3:4b"), ] diff --git a/datafast/examples/inspect_dataset_example.py b/datafast/examples/inspect_dataset_example.py index a7fff6d..6225b65 100644 --- a/datafast/examples/inspect_dataset_example.py +++ b/datafast/examples/inspect_dataset_example.py @@ -48,7 +48,7 @@ providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # Uncomment to use additional providers - # AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + # AnthropicProvider(model_id="claude-haiku-4-5-20251001"), # GeminiProvider(model_id="gemini-2.0-flash"), ] diff --git a/datafast/examples/mcq_example.py b/datafast/examples/mcq_example.py index 1e67b60..dba76f8 100644 --- a/datafast/examples/mcq_example.py +++ b/datafast/examples/mcq_example.py @@ -26,7 +26,7 @@ def main(): # 2. Initialize LLM providers providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - # AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + # AnthropicProvider(model_id="claude-haiku-4-5-20251001"), # GeminiProvider(model_id="gemini-2.0-flash"), ] diff --git a/datafast/examples/quickstart_example.py b/datafast/examples/quickstart_example.py index f0e3af2..4216082 100644 --- a/datafast/examples/quickstart_example.py +++ b/datafast/examples/quickstart_example.py @@ -35,7 +35,7 @@ providers = [ OpenAIProvider(model_id="gpt-5-nano-2025-08-07"), - # AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + # AnthropicProvider(model_id="claude-haiku-4-5-20251001"), # GeminiProvider(model_id="gemini-2.0-flash"), # OllamaProvider(model_id="gemma3:12b") ] diff --git a/datafast/examples/raw_text_space_engineering_example.py b/datafast/examples/raw_text_space_engineering_example.py index 18f8de8..cccead7 100644 --- a/datafast/examples/raw_text_space_engineering_example.py +++ b/datafast/examples/raw_text_space_engineering_example.py @@ -43,7 +43,7 @@ def main(): # 2. Create LLM providers with specific models providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # You may want to use stronger models - AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + AnthropicProvider(model_id="claude-haiku-4-5-20251001"), ] # 3. 
Generate the dataset diff --git a/datafast/examples/show_dataset_examples.py b/datafast/examples/show_dataset_examples.py index d06cc68..b49cce7 100644 --- a/datafast/examples/show_dataset_examples.py +++ b/datafast/examples/show_dataset_examples.py @@ -46,7 +46,7 @@ classification_row2 = TextClassificationRow( text="The trail is well maintained and easy to follow.", label="positive_conditions", - model_id="claude-sonnet-4-5-20250929", + model_id="claude-haiku-4-5-20251001", language="en", ) classification_dataset.data_rows = [classification_row, classification_row2] @@ -85,7 +85,7 @@ question="What was the main goal of the Mars 2020 mission?", chosen_response="To search for signs of ancient life and collect samples.", rejected_response="To launch a satellite.", - chosen_model_id="claude-sonnet-4-5-20250929", + chosen_model_id="claude-haiku-4-5-20251001", rejected_model_id="gpt-4.1-nano", chosen_response_score=9, rejected_response_score=3, diff --git a/datafast/examples/ultrachat_materials_science.py b/datafast/examples/ultrachat_materials_science.py index 7bc3d4d..d6477fc 100644 --- a/datafast/examples/ultrachat_materials_science.py +++ b/datafast/examples/ultrachat_materials_science.py @@ -25,7 +25,7 @@ def main(): # 2. Initialize LLM providers - using just one for simplicity providers = [ - AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + AnthropicProvider(model_id="claude-haiku-4-5-20251001"), ] # 3. Get expected number of rows diff --git a/datafast/llms.py b/datafast/llms.py index 7999b38..8949686 100644 --- a/datafast/llms.py +++ b/datafast/llms.py @@ -282,7 +282,7 @@ def env_key_name(self) -> str: def __init__( self, - model_id: str = "claude-sonnet-4-5-20250929", + model_id: str = "claude-haiku-4-5-20251001", api_key: str | None = None, temperature: float | None = None, max_completion_tokens: int | None = None, @@ -292,7 +292,7 @@ def __init__( """Initialize the Anthropic provider. 
Args: - model_id: The model ID (defaults to claude-sonnet-4-5-20250929) + model_id: The model ID (defaults to claude-haiku-4-5-20251001) api_key: API key (if None, will get from environment) temperature: Temperature for generation (0.0 to 1.0) max_completion_tokens: Maximum tokens to generate diff --git a/docs/guides/generating_mcq_datasets.md b/docs/guides/generating_mcq_datasets.md index 77ac0e3..2e9c0db 100644 --- a/docs/guides/generating_mcq_datasets.md +++ b/docs/guides/generating_mcq_datasets.md @@ -184,7 +184,7 @@ Configure one or more LLM providers to generate your dataset: ```python providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + AnthropicProvider(model_id="claude-haiku-4-5-20251001"), GeminiProvider(model_id="gemini-2.0-flash") ] ``` @@ -274,7 +274,7 @@ def main(): providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # Add more providers as needed - # AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + # AnthropicProvider(model_id="claude-haiku-4-5-20251001"), # GeminiProvider(model_id="gemini-2.0-flash"), ] diff --git a/docs/guides/generating_text_classification_datasets.md b/docs/guides/generating_text_classification_datasets.md index ab504a0..4ce36e7 100644 --- a/docs/guides/generating_text_classification_datasets.md +++ b/docs/guides/generating_text_classification_datasets.md @@ -167,7 +167,7 @@ Configure one or more LLM providers to generate your dataset: ```python providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - AnthropicProvider(model_id="claude-sonnet-4-5-20250929") + AnthropicProvider(model_id="claude-haiku-4-5-20251001") ] ``` @@ -294,7 +294,7 @@ config = ClassificationDatasetConfig( # Set up providers providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - AnthropicProvider(model_id="claude-sonnet-4-5-20250929") + AnthropicProvider(model_id="claude-haiku-4-5-20251001") ] # Generate dataset diff --git a/docs/guides/generating_text_datasets.md b/docs/guides/generating_text_datasets.md index fb8c7ef..ad6b717 100644 --- a/docs/guides/generating_text_datasets.md +++ b/docs/guides/generating_text_datasets.md @@ -157,7 +157,7 @@ Configure one or more LLM providers to generate your dataset: ```python providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # You may want to use stronger models - AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + AnthropicProvider(model_id="claude-haiku-4-5-20251001"), ] ``` @@ -281,7 +281,7 @@ def main(): # 2. Create LLM providers with specific models providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), # You may want to use stronger models - AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + AnthropicProvider(model_id="claude-haiku-4-5-20251001"), ] # 3. Generate the dataset diff --git a/docs/guides/generating_ultrachat_datasets.md b/docs/guides/generating_ultrachat_datasets.md index 6458fe2..1b9d99c 100644 --- a/docs/guides/generating_ultrachat_datasets.md +++ b/docs/guides/generating_ultrachat_datasets.md @@ -153,7 +153,7 @@ Configure one or more LLM providers to generate your dataset: ```python providers = [ - AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + AnthropicProvider(model_id="claude-haiku-4-5-20251001"), ] ``` @@ -259,7 +259,7 @@ def main(): # 2. Initialize LLM providers - using just one for simplicity providers = [ - AnthropicProvider(model_id="claude-sonnet-4-5-20250929"), + AnthropicProvider(model_id="claude-haiku-4-5-20251001"), ] # 3. 
Get expected number of rows
diff --git a/docs/index.md b/docs/index.md
index 4b68f54..ef53fc7 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -94,7 +94,7 @@ config = ClassificationDatasetConfig(
# Create LLM providers
providers = [
    OpenAIProvider(model_id="gpt-5-mini-2025-08-07"),
-    AnthropicProvider(model_id="claude-sonnet-4-5-20250929"),
+    AnthropicProvider(model_id="claude-haiku-4-5-20251001"),
    GeminiProvider(model_id="gemini-2.5-flash"),
    OpenRouterProvider(model_id="z-ai/glm-4.6")
]
diff --git a/docs/llms.md b/docs/llms.md
index 1783cf1..e5d1939 100644
--- a/docs/llms.md
+++ b/docs/llms.md
@@ -24,7 +24,7 @@ Each provider can be instantiated with default parameters:
# OpenAI (default: gpt-5-mini-2025-08-07)
openai_llm = OpenAIProvider()

-# Anthropic (default:claude-sonnet-4-5-20250929)
+# Anthropic (default: claude-haiku-4-5-20251001)
anthropic_llm = AnthropicProvider()

# Gemini (default: gemini-2.0-flash)
diff --git a/tests/test_llms.py b/tests/test_llms.py
index 064c110..93e01ae 100644
--- a/tests/test_llms.py
+++ b/tests/test_llms.py
@@ -327,7 +327,7 @@ def test_gemini_messages_with_structured_output():
def test_anthropic_with_all_parameters():
    """Test Anthropic provider with all optional parameters specified."""
    provider = AnthropicProvider(
-        model_id="claude-sonnet-4-5-20250929",
+        model_id="claude-haiku-4-5-20251001",
        temperature=0.3,
        max_completion_tokens=200,
        top_p=0.95,
@@ -999,7 +999,7 @@ def test_openai_batch_with_all_parameters():
def test_anthropic_batch_with_all_parameters():
    """Test Anthropic provider with batch processing and all optional parameters."""
    provider = AnthropicProvider(
-        model_id="claude-sonnet-4-5-20250929",
+        model_id="claude-haiku-4-5-20251001",
        temperature=0.1,
        max_completion_tokens=50,
        top_p=0.9