From 4c2428d801c7ad91de33098d24d1a69f68213641 Mon Sep 17 00:00:00 2001
From: Patrick
Date: Mon, 14 Jul 2025 21:51:03 +0200
Subject: [PATCH 1/8] update prompt for contextualised question generation

---
 datafast/prompts/mcq_prompts.py | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/datafast/prompts/mcq_prompts.py b/datafast/prompts/mcq_prompts.py
index 4338396..25e10b1 100644
--- a/datafast/prompts/mcq_prompts.py
+++ b/datafast/prompts/mcq_prompts.py
@@ -1,5 +1,4 @@
-DEFAULT_TEMPLATES = [
-    """You are an expert at creating exam questions. Your task is to come up with {num_samples} \
+DEFAULT_TEMPLATES = ["""You are an expert at creating exam questions. Your task is to come up with {num_samples} \
 difficult multiple choice questions written in {language_name} in relation to the following document along with the correct answer.
 The question should be self-contained, short and answerable.
 It is very important to have unique questions. No questions should be like 'what is X and what about Y?' or 'what is X and when did Y happen?'.
@@ -11,7 +10,27 @@

 Now come up with {num_samples} questions in relation to the document.
 Make sure the questions are difficult, but answerable with a short answer.
-Provide the correct answer for each question."""
+Provide the correct answer for each question."""]
+
+# Template for more contextualized questions
+CONTEXTUALISED_TEMPLATES = ["""You are an expert at creating exam questions. Your task is to come up with {num_samples} \
+multiple choice questions written in {language_name} in relation to the following document along with the correct answer.
+The question should be self-contained, short and answerable.
+It is very important to have unique questions. No questions should be like 'what is X and what about Y?' or 'what is X and when did Y happen?'.
+The answer must be short.
+It must relate to the details of the document. However, questions should never refer to the document itself with wording like "according to the report", "in this paper", or "in the document".
+Write each question so that it includes some very brief context, as if the person asking it were concisely explaining the situation in which the question arises. This is only to remove ambiguity, as if the question were given in an exam.
+
+### Context
+{context}
+
+### Document
+{document}
+
+Now come up with {num_samples} contextualized questions in relation to the document.
+Make sure the questions are difficult, but answerable with a short answer.
+Provide the correct answer for each question.
+""" ] DISTRACTOR_TEMPLATE = """ From 630713c0e221e4699921770226db1ddade8901fa Mon Sep 17 00:00:00 2001 From: Patrick Date: Mon, 14 Jul 2025 21:51:44 +0200 Subject: [PATCH 2/8] Updating MCQDatasetConfig and MCQDataset to support optionally a context_column --- datafast/datasets.py | 38 +++++++++++++++++++++++++++++--------- datafast/schema/config.py | 6 ++++++ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/datafast/datasets.py b/datafast/datasets.py index 96c181a..d31dafd 100644 --- a/datafast/datasets.py +++ b/datafast/datasets.py @@ -833,17 +833,37 @@ def generate(self, llms: list[LLMProvider]) -> "MCQDataset": if len(document.strip()) > self.config.max_document_length: # Skip very long documents continue + # Check if context_column exists and extract context if available + context = None + if self.config.context_column and self.config.context_column in sample: + context = sample[self.config.context_column] + for lang_code, language_name in languages.items(): # 1. First call: Generate questions and correct answers - question_prompts = self.config.prompts or self._get_default_prompts() - question_prompts = [ - prompt.format( - num_samples=self.config.num_samples_per_prompt, - language_name=language_name, - document=document, - ) - for prompt in question_prompts - ] + if context and isinstance(context, str): + # Use contextualized templates if context is available + from datafast.prompts.mcq_prompts import CONTEXTUALISED_TEMPLATES + question_prompts = self.config.prompts or CONTEXTUALISED_TEMPLATES + question_prompts = [ + prompt.format( + num_samples=self.config.num_samples_per_prompt, + language_name=language_name, + document=document, + context=context + ) + for prompt in question_prompts + ] + else: + # Use default templates if no context is available + question_prompts = self.config.prompts or self._get_default_prompts() + question_prompts = [ + prompt.format( + num_samples=self.config.num_samples_per_prompt, + language_name=language_name, + document=document, + ) + for prompt in question_prompts + ] # Expand prompts with configured variations question_expansions = expand_prompts( diff --git a/datafast/schema/config.py b/datafast/schema/config.py index 05c376b..fb7f736 100644 --- a/datafast/schema/config.py +++ b/datafast/schema/config.py @@ -350,6 +350,12 @@ class MCQDatasetConfig(BaseModel): description="Column name containing the text to generate questions from" ) + context_column: str | None = Field( + default=None, + description="Optional column name containing contextual information to enhance question generation. \ + When provided, questions will be generated with this contextual information." + ) + # MCQ Generation parameters num_samples_per_prompt: int = Field( default=3, From 6bda0c05b115ca7e5fcf4e46a422bcc8017fb76b Mon Sep 17 00:00:00 2001 From: Patrick Date: Mon, 14 Jul 2025 21:51:56 +0200 Subject: [PATCH 3/8] Created an example script --- datafast/examples/mcq_contextual_example.py | 85 +++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 datafast/examples/mcq_contextual_example.py diff --git a/datafast/examples/mcq_contextual_example.py b/datafast/examples/mcq_contextual_example.py new file mode 100644 index 0000000..75af10f --- /dev/null +++ b/datafast/examples/mcq_contextual_example.py @@ -0,0 +1,85 @@ +""" +Example script for generating MCQ questions from the AR6 dataset using context information. +This script demonstrates the use of the context_column parameter to enhance question generation. 
+""" + +import os +import json +import random +from pathlib import Path +import pandas as pd +from dotenv import load_dotenv + +load_dotenv("secrets.env") + +from datafast.schema.config import MCQDatasetConfig +from datafast.datasets import MCQDataset +from datafast.llms import OpenAIProvider + +def main(): + # 1. Create a temporary filtered version of the dataset + ar6_file_path = Path("datafast/examples/data/mcq/ar6.jsonl") + filtered_file_path = Path("datafast/examples/data/mcq/ar6_filtered.jsonl") + + # Read the ar6.jsonl file + with open(ar6_file_path, "r") as f: + data = [json.loads(line) for line in f if line.strip()] + + # Filter for rows where chunk_grade is "OK" or "GREAT" + filtered_data = [row for row in data if row.get("chunk_grade") in ["OK", "GREAT"]] + + # Randomly select 10 examples + selected_data = random.sample(filtered_data, min(10, len(filtered_data))) + + # Write the selected data to a temporary file + with open(filtered_file_path, "w") as f: + for row in selected_data: + f.write(json.dumps(row) + "\n") + + print(f"Selected {len(selected_data)} examples from AR6 dataset") + + # 2. Create MCQ dataset config + config = MCQDatasetConfig( + local_file_path=str(filtered_file_path), + text_column="chunk_text", # Column containing the text to generate questions from + context_column="document_summary", # Column containing context information + num_samples_per_prompt=2, # Generate 2 questions per document + min_document_length=100, # Skip documents shorter than 100 chars + max_document_length=20000, # Skip documents longer than 20000 chars + sample_count=len(selected_data), # Number of samples to process + output_file="mcq_ar6_contextual_dataset.jsonl", + ) + + # 3. Initialize OpenAI provider with gpt-4.1-mini + providers = [ + OpenAIProvider(model_id="gpt-4.1-mini"), + ] + + # 4. Generate the dataset + dataset = MCQDataset(config) + num_expected_rows = dataset.get_num_expected_rows(providers, source_data_num_rows=len(selected_data)) + print(f"\nExpected number of rows: {num_expected_rows}") + dataset.generate(providers) + + # 5. Print results summary + print(f"\nGenerated {len(dataset.data_rows)} MCQs") + print(f"Results saved to {config.output_file}") + + # 6. Cleanup temporary file + os.remove(filtered_file_path) + print(f"Cleaned up temporary file {filtered_file_path}") + # 5. Optional: Push to HF hub + USERNAME = "patrickfleith" # <--- Your hugging face username + DATASET_NAME = "mcq_ar6_contextual_dataset" # <--- Your hugging face dataset name + url = dataset.push_to_hub( + repo_id=f"{USERNAME}/{DATASET_NAME}", + train_size=0.7, + shuffle=True, + upload_card=True, + ) + print(f"\nDataset pushed to Hugging Face Hub: {url}") + + dataset.inspect() + +if __name__ == "__main__": + main() From 9c53722b6e6a672ba92d3b6114a330cd313de10b Mon Sep 17 00:00:00 2001 From: Patrick Date: Mon, 14 Jul 2025 21:53:41 +0200 Subject: [PATCH 4/8] commented out auto push --- datafast/examples/mcq_contextual_example.py | 23 +++++++++++---------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/datafast/examples/mcq_contextual_example.py b/datafast/examples/mcq_contextual_example.py index 75af10f..d3263fd 100644 --- a/datafast/examples/mcq_contextual_example.py +++ b/datafast/examples/mcq_contextual_example.py @@ -68,18 +68,19 @@ def main(): # 6. Cleanup temporary file os.remove(filtered_file_path) print(f"Cleaned up temporary file {filtered_file_path}") - # 5. 
-    USERNAME = "patrickfleith"  # <--- Your hugging face username
-    DATASET_NAME = "mcq_ar6_contextual_dataset"  # <--- Your hugging face dataset name
-    url = dataset.push_to_hub(
-        repo_id=f"{USERNAME}/{DATASET_NAME}",
-        train_size=0.7,
-        shuffle=True,
-        upload_card=True,
-    )
-    print(f"\nDataset pushed to Hugging Face Hub: {url}")
+    # # 7. Optional: Push to HF hub
+    # USERNAME = "your_username"  # <--- Your hugging face username
+    # DATASET_NAME = "your_dataset_name"  # <--- Your hugging face dataset name
+    # url = dataset.push_to_hub(
+    #     repo_id=f"{USERNAME}/{DATASET_NAME}",
+    #     train_size=0.7,
+    #     shuffle=True,
+    #     upload_card=True,
+    # )
+    # print(f"\nDataset pushed to Hugging Face Hub: {url}")

-    dataset.inspect()
+    ## Uncomment to inspect the dataset in a gradio app
+    # dataset.inspect()

 if __name__ == "__main__":
     main()

From 49bcc9bea9e45b3db4ac0aab06f658b5d3dc3cc9 Mon Sep 17 00:00:00 2001
From: Patrick
Date: Mon, 14 Jul 2025 21:53:50 +0200
Subject: [PATCH 5/8] documentation update

---
 datafast/examples/mcq_contextual_example.py | 3 +--
 docs/guides/generating_mcq_datasets.md      | 3 +++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/datafast/examples/mcq_contextual_example.py b/datafast/examples/mcq_contextual_example.py
index d3263fd..48cc9d9 100644
--- a/datafast/examples/mcq_contextual_example.py
+++ b/datafast/examples/mcq_contextual_example.py
@@ -79,8 +79,7 @@ def main():
     #     )
     #     print(f"\nDataset pushed to Hugging Face Hub: {url}")

-    ## Uncomment to inspect the dataset in a gradio app
-    # dataset.inspect()
+    dataset.inspect()

 if __name__ == "__main__":
     main()
diff --git a/docs/guides/generating_mcq_datasets.md b/docs/guides/generating_mcq_datasets.md
index a3c9d43..c8a5b21 100644
--- a/docs/guides/generating_mcq_datasets.md
+++ b/docs/guides/generating_mcq_datasets.md
@@ -72,6 +72,7 @@ The `MCQDatasetConfig` class defines all parameters for your MCQ dataset generat
 - **`hf_dataset_name`**: (Optional) Name of a Hugging Face dataset to use as source material
 - **`local_file_path`**: (Optional) Path to a local file to use as source material
 - **`text_column`**: (Required) Column name containing the text to generate questions from
+- **`context_column`**: (Optional) Column name containing contextual information to enhance question generation with domain-specific context
 - **`num_samples_per_prompt`**: Number of questions to generate for each text document
 - **`sample_count`**: (Optional) Number of samples to process from the source text dataset (useful for testing)
 - **`min_document_length`**: Minimum text length (in characters) for processing (skips shorter documents)
@@ -295,6 +296,7 @@ def main():
     #     train_size=0.7,
     #     seed=42,
     #     shuffle=True,
+    #     upload_card=True,
     # )
     # print(f"\nDataset pushed to Hugging Face Hub: {url}")

@@ -322,3 +324,4 @@ Each generated question is stored as an `MCQRow` with these properties:
 3. **Model Selection**: Larger, more capable models generally produce better questions and answers.
 4. **Validation**: Review a sample of the generated questions to ensure quality and accuracy, then edit prompt.
 5. **Start Small**: Begin with a small sample_count to test the configuration before scaling up.
+6. **Use Context**: When available, use the `context_column` parameter to provide additional domain-specific context that helps generate more self-contained questions. Good contexts include document summaries or topic descriptions.
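For reference, a minimal sketch of how the `context_column` option introduced in PATCH 2 and documented in PATCH 5 might be used. The file path and column names below are illustrative placeholders rather than values taken from the patches; the imports and config fields mirror the example script added in PATCH 3.

```python
from datafast.schema.config import MCQDatasetConfig
from datafast.datasets import MCQDataset
from datafast.llms import OpenAIProvider

# Hypothetical JSONL source: each row has a "text" column (the document to
# question) and a "summary" column passed to the model as extra context.
config = MCQDatasetConfig(
    local_file_path="data/my_documents.jsonl",  # illustrative path
    text_column="text",                         # document to generate questions from
    context_column="summary",                   # context column: contextualised templates are used
    num_samples_per_prompt=2,
    output_file="mcq_with_context.jsonl",
)

dataset = MCQDataset(config)
dataset.generate([OpenAIProvider(model_id="gpt-4.1-mini")])
```

When `context_column` is omitted, or a row's context value is not a string, generation falls back to the default templates, as implemented in PATCH 2.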
From 97f152a656c7d9264e89bda5a6ea8972f5992224 Mon Sep 17 00:00:00 2001
From: Patrick
Date: Sat, 19 Jul 2025 12:09:22 +0200
Subject: [PATCH 6/8] Adding optional rpm_limit parameters to LLMProvider and
 to Gemini and Ollama.

---
 datafast/llms.py | 40 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/datafast/llms.py b/datafast/llms.py
index b7b01be..a8a2fb1 100644
--- a/datafast/llms.py
+++ b/datafast/llms.py
@@ -7,6 +7,7 @@ from typing import Any, Type, TypeVar
 from abc import ABC, abstractmethod
 import os
+import time
 import traceback

 # Pydantic
@@ -35,6 +36,7 @@ def __init__(
         max_completion_tokens: int | None = None,
         top_p: float | None = None,
         frequency_penalty: float | None = None,
+        rpm_limit: int | None = None,
     ):
         """Initialize the LLM provider with common parameters.

@@ -54,6 +56,10 @@ def __init__(
         self.max_completion_tokens = max_completion_tokens
         self.top_p = top_p
         self.frequency_penalty = frequency_penalty
+
+        # Rate limiting
+        self.rpm_limit = rpm_limit
+        self._request_timestamps: list[float] = []

         # Configure environment with API key if needed
         self._configure_env()
@@ -88,6 +94,23 @@ def _configure_env(self) -> None:
     def _get_model_string(self) -> str:
         """Get the full model string for LiteLLM."""
         return f"{self.provider_name}/{self.model_id}"
+
+    def _respect_rate_limit(self) -> None:
+        """Block execution to ensure we do not exceed the rpm_limit."""
+        if self.rpm_limit is None:
+            return
+        current = time.monotonic()
+        # Keep only timestamps within the last minute
+        self._request_timestamps = [ts for ts in self._request_timestamps if current - ts < 60]
+        if len(self._request_timestamps) < self.rpm_limit:
+            return
+        # Need to wait until the earliest request is outside the 60-second window
+        earliest = self._request_timestamps[0]
+        # Add a 1s margin to avoid accidental rate limit exceedance
+        sleep_time = 61 - (current - earliest)
+        if sleep_time > 0:
+            print("Waiting for rate limit...")
+            time.sleep(sleep_time)

     def generate(
         self,
@@ -122,6 +145,8 @@ def generate(
         else:
             messages_to_send = messages

+        # Enforce rate limit if set
+        self._respect_rate_limit()
         # Prepare completion parameters
         completion_params = {
             "model": self._get_model_string(),
@@ -138,6 +163,9 @@ def generate(

         # Call LiteLLM completion
         response: ModelResponse = litellm.completion(**completion_params)
+        # Record timestamp for rate limiting
+        if self.rpm_limit is not None:
+            self._request_timestamps.append(time.monotonic())

         # Extract content from response
         content = response.choices[0].message.content
@@ -172,7 +200,7 @@ def __init__(
         max_completion_tokens: int | None = None,
         top_p: float | None = None,
         frequency_penalty: float | None = None,
-        ):
+    ):
         """Initialize the OpenAI provider.

         Args:
@@ -212,7 +240,7 @@ def __init__(
         max_completion_tokens: int | None = None,
         top_p: float | None = None,
         # frequency_penalty: float | None = None, # Not supported by anthropic
-        ):
+    ):
         """Initialize the Anthropic provider.

         Args:
@@ -250,7 +278,8 @@ def __init__(
         max_completion_tokens: int | None = None,
         top_p: float | None = None,
         frequency_penalty: float | None = None,
-        ):
+        rpm_limit: int | None = None,
+    ):
         """Initialize the Gemini provider.

         Args:
@@ -268,6 +297,7 @@ def __init__(
             max_completion_tokens=max_completion_tokens,
             top_p=top_p,
             frequency_penalty=frequency_penalty,
+            rpm_limit=rpm_limit,
         )


@@ -301,7 +331,8 @@ def __init__(
         top_p: float | None = None,
         frequency_penalty: float | None = None,
         api_base: str | None = None,
-        ):
+        rpm_limit: int | None = None,
+    ):
         """Initialize the Ollama provider.

         Args:
@@ -323,4 +354,5 @@ def __init__(
             max_completion_tokens=max_completion_tokens,
             top_p=top_p,
             frequency_penalty=frequency_penalty,
+            rpm_limit=rpm_limit,
         )

From 346c973ec75782238811e722dee788a7ef845250 Mon Sep 17 00:00:00 2001
From: Patrick
Date: Sat, 19 Jul 2025 12:09:44 +0200
Subject: [PATCH 7/8] Writing test with real Gemini and waiting time.

---
 tests/test_llms.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tests/test_llms.py b/tests/test_llms.py
index 4e0f1e9..bd04df8 100644
--- a/tests/test_llms.py
+++ b/tests/test_llms.py
@@ -66,7 +66,22 @@ def test_gemini_provider():
     response = provider.generate(prompt="What is the capital of France? Answer in one word.")
     assert "Paris" in response

-
+@pytest.mark.slow
+@pytest.mark.integration
+def test_gemini_rpm_limit_real():
+    """Test GeminiProvider RPM limit (15 requests/minute) is enforced with real waiting."""
+    import time
+    prompts_count = 17
+    rpm = 15
+    provider = GeminiProvider(model_id="gemini-2.5-flash-lite-preview-06-17", rpm_limit=rpm)
+    prompts = [f"Test request {i}" for i in range(prompts_count)]
+    start = time.monotonic()
+    for prompt in prompts:
+        provider.generate(prompt=prompt)
+    elapsed = time.monotonic() - start
+    # 17 requests with rpm=15, so we must wait at least ~60s for the 2 requests beyond the limit
+    assert elapsed >= 59, f"Elapsed time too short for RPM limit: {elapsed:.2f}s for {prompts_count} requests with rpm={rpm}"
+
 @pytest.mark.integration
 def test_openai_structured_output():
     """Test the OpenAI provider with structured output."""

From d801c7f68e00da59b605494bc092595a2b8cbed0 Mon Sep 17 00:00:00 2001
From: Patrick
Date: Thu, 7 Aug 2025 19:55:58 +0200
Subject: [PATCH 8/8] update .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index d230bb5..f8ec391 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ __pycache__/
 *.py[cod]
 *$py.class
 LAUNCH.md
+projects/*

 # C extensions
 *.so
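Taken together, PATCH 6 and PATCH 7 add and exercise a client-side requests-per-minute throttle. A minimal sketch of how the new `rpm_limit` parameter might be used, assuming the Gemini API key is already configured in the environment; the model id and loop below are illustrative, mirroring the test in PATCH 7:

```python
from datafast.llms import GeminiProvider

# Allow at most 15 requests in any rolling 60-second window; once the window is
# full, generate() sleeps inside _respect_rate_limit() before sending the call.
provider = GeminiProvider(
    model_id="gemini-2.5-flash-lite-preview-06-17",
    rpm_limit=15,
)

for i in range(20):  # illustrative loop; requests beyond the limit are paced automatically
    answer = provider.generate(prompt=f"In one word, what is 2 + {i}?")
    print(answer)
```

The same parameter is accepted by `OllamaProvider`, and providers constructed without `rpm_limit` keep the previous unthrottled behaviour.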