diff --git a/README.md b/README.md index fd76aa7..1168df9 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ pip install datafast ### 1. Environment Setup -Make sure you have created a `secrets.env` file with your API keys. +Make sure you have created a `.env` file with your API keys. HF token is needed if you want to push the dataset to your HF hub. Other keys depends on which LLM providers you use. ``` @@ -64,7 +64,7 @@ from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider from dotenv import load_dotenv # Load environment variables -load_dotenv("secrets.env") # <--- your API keys +load_dotenv() # <--- your API keys ``` ### 3. Configure Dataset diff --git a/datafast/__init__.py b/datafast/__init__.py index df09d15..4f58b0f 100644 --- a/datafast/__init__.py +++ b/datafast/__init__.py @@ -1,6 +1,7 @@ """Datafast - A Python package for synthetic text dataset generation""" import importlib.metadata +from datafast.logger_config import configure_logger try: __version__ = importlib.metadata.version("datafast") @@ -11,3 +12,5 @@ def get_version(): """Return the current version of the datafast package.""" return __version__ + +__all__ = ["configure_logger", "get_version"] diff --git a/datafast/datasets.py b/datafast/datasets.py index 907a38d..ddc1416 100644 --- a/datafast/datasets.py +++ b/datafast/datasets.py @@ -35,6 +35,7 @@ ) from datafast.expanders import expand_prompts import os +import time from datafast import utils from loguru import logger @@ -268,7 +269,10 @@ def push_to_hub( ValueError: If invalid split parameters are provided """ if not self.data_rows: + logger.error("No data rows to push") raise ValueError("No data rows to push. 
Generate data first.") + + logger.info(f"Pushing dataset to {repo_id}...") # Convert Pydantic models to dictionaries and handle UUID serialization data = [] @@ -302,6 +306,7 @@ def push_to_hub( # Get token from env if not provided token = token or os.getenv("HF_TOKEN") if token is None: + logger.error("No HuggingFace token found | Set HF_TOKEN environment variable") raise ValueError( "No token provided and HF_TOKEN environment variable not set" ) @@ -334,12 +339,14 @@ def push_to_hub( try: from datafast.card_utils import upload_dataset_card upload_dataset_card(repo_id=repo_id, token=token) - print("Dataset card uploaded successfully") + logger.info("Dataset card uploaded successfully") except Exception as e: - print(f"Warning: Failed to upload dataset card: {e}") + logger.warning(f"Failed to upload dataset card: {e}") # Continue even if card upload fails - return f"https://huggingface.co/datasets/{repo_id}" + url = f"https://huggingface.co/datasets/{repo_id}" + logger.success(f"Dataset pushed to Hub | URL: {url}") + return url class ClassificationDataset(DatasetBase): @@ -372,10 +379,20 @@ def generate(self, llms: list[LLMProvider]) -> "ClassificationDataset": ValueError: If no LLM providers are supplied or if no classes are defined. 
""" if not llms: + logger.error("No LLM providers supplied") raise ValueError("At least one LLM provider must be supplied") if not self.config.classes: + logger.error("No classification classes provided in config") raise ValueError("No classification classes provided in config") + + start_time = time.time() + expected_rows = self.get_num_expected_rows(llms) + logger.info( + f"Starting ClassificationDataset.generate() | " + f"Expected rows: {expected_rows} | " + f"Providers: {len(llms)}" + ) # Get labels listing for context in prompts labels_listing = [label["name"] for label in self.config.classes] @@ -408,6 +425,9 @@ def generate(self, llms: list[LLMProvider]) -> "ClassificationDataset": for expanded_prompt, meta in expansions: for llm in llms: try: + # Track batch start time + batch_start_time = time.time() + # Generate multiple examples using the LLM response = llm.generate( expanded_prompt, response_format=TextEntries @@ -426,12 +446,36 @@ def generate(self, llms: list[LLMProvider]) -> "ClassificationDataset": self.data_rows.append(row) new_rows.append(row) + # Calculate batch duration + batch_duration = time.time() - batch_start_time + # Save this batch - self.to_jsonl(self.config.output_file, new_rows, append=True) - print(f" Generated and saved {len(self.data_rows)} examples total") + try: + self.to_jsonl(self.config.output_file, new_rows, append=True) + utils.log_generation_progress( + len(self.data_rows), + llm.provider_name, + llm.model_id, + batch_duration, + "examples" + ) + except IOError as e: + logger.error( + f"Failed to save to {self.config.output_file} | Error: {e}" + ) + raise except Exception as e: - print(f"Error with llm provider {llm.provider_name}: {e}") + logger.warning( + f"Provider {llm.provider_name} failed, continuing | Error: {e}" + ) + + duration = time.time() - start_time + logger.success( + f"ClassificationDataset.generate() completed | " + f"Rows: {len(self.data_rows)} | " + f"Duration: {duration:.1f}s" + ) return self def 
_get_default_prompts(self) -> list[str]: @@ -467,7 +511,16 @@ def generate(self, llms: list[LLMProvider]) -> "RawDataset": ValueError: If no LLM providers are supplied or if text_attributes are missing. """ if not llms: + logger.error("No LLM providers supplied") raise ValueError("At least one LLM provider must be supplied") + + start_time = time.time() + expected_rows = self.get_num_expected_rows(llms) + logger.info( + f"Starting RawDataset.generate() | " + f"Expected rows: {expected_rows} | " + f"Providers: {len(llms)}" + ) # Get languages from config, default to English if not specified languages = self.config.languages or {"en": "English"} @@ -503,6 +556,9 @@ def generate(self, llms: list[LLMProvider]) -> "RawDataset": for expanded_prompt, meta in expansions: for llm in llms: try: + # Track batch start time + batch_start_time = time.time() + # Generate multiple examples using the LLM response = llm.generate( expanded_prompt, response_format=TextExamples @@ -524,13 +580,36 @@ def generate(self, llms: list[LLMProvider]) -> "RawDataset": self.data_rows.append(row) new_rows.append(row) + # Calculate batch duration + batch_duration = time.time() - batch_start_time + # Save this batch - self.to_jsonl(self.config.output_file, new_rows, append=True) - print(f" Generated and saved {len(self.data_rows)} examples total") + try: + self.to_jsonl(self.config.output_file, new_rows, append=True) + utils.log_generation_progress( + len(self.data_rows), + llm.provider_name, + llm.model_id, + batch_duration, + "examples" + ) + except IOError as e: + logger.error( + f"Failed to save to {self.config.output_file} | Error: {e}" + ) + raise except Exception as e: - print(f"Error with llm provider {llm.provider_name}: {e}") + logger.warning( + f"Provider {llm.provider_name} failed, continuing | Error: {e}" + ) + duration = time.time() - start_time + logger.success( + f"RawDataset.generate() completed | " + f"Rows: {len(self.data_rows)} | " + f"Duration: {duration:.1f}s" + ) return self 
def _get_default_prompts(self) -> list[str]: @@ -559,7 +638,16 @@ def get_num_expected_rows(self, llms: list[LLMProvider]) -> int: def generate(self, llms: list[LLMProvider]) -> "UltrachatDataset": if not llms: + logger.error("No LLM providers supplied") raise ValueError("At least one LLM provider must be supplied") + + start_time = time.time() + expected_rows = self.get_num_expected_rows(llms) + logger.info( + f"Starting UltrachatDataset.generate() | " + f"Expected rows: {expected_rows} | " + f"Providers: {len(llms)}" + ) # Get languages from config, default to English if not specified languages = self.config.languages or {"en": "English"} @@ -602,6 +690,8 @@ def generate(self, llms: list[LLMProvider]) -> "UltrachatDataset": ) for opening_question in opening_questions.questions: + # Track conversation start time + conversation_start_time = time.time() random_persona = np.random.choice( self.config.personas ) @@ -700,18 +790,39 @@ def generate(self, llms: list[LLMProvider]) -> "UltrachatDataset": persona=random_persona, ) self.data_rows.append(row) + + # Calculate conversation duration + conversation_duration = time.time() - conversation_start_time + # Save each chat conversation as it's generated - self.to_jsonl(self.config.output_file, [row], append=True) - print(f" Generated and saved {len(self.data_rows)} chat conversations total") + try: + self.to_jsonl(self.config.output_file, [row], append=True) + utils.log_generation_progress( + len(self.data_rows), + llm.provider_name, + llm.model_id, + conversation_duration, + "chat conversations" + ) + except IOError as e: + logger.error( + f"Failed to save to {self.config.output_file} | Error: {e}" + ) + raise except Exception as e: import traceback error_trace = traceback.format_exc() - print(f"\nError with llm provider {llm.provider_name}:\n{error_trace}") - print(f"Error occurred at response type: {response_format.__name__ if 'response_format' in locals() else 'unknown'}") - if 'reformulated_question' in locals(): - 
print(f"Last reformulated_question: {reformulated_question}") + logger.warning( + f"Provider {llm.provider_name} failed, continuing | Error: {str(e)}" + ) + duration = time.time() - start_time + logger.success( + f"UltrachatDataset.generate() completed | " + f"Rows: {len(self.data_rows)} | " + f"Duration: {duration:.1f}s" + ) return self def _get_default_question_generation_prompts(self) -> list[str]: @@ -762,20 +873,34 @@ def generate(self, llms: list[LLMProvider]) -> "MCQDataset": ValueError: If no LLM providers are supplied or if required configuration is missing. """ if not llms: + logger.error("No LLM providers supplied") raise ValueError("At least one LLM provider must be supplied") - + + start_time = time.time() + # Load the dataset using shared utility try: + source = self.config.hf_dataset_name or self.config.local_file_path + logger.info(f"Loading source dataset from {source}") dataset = utils.load_dataset_from_source( hf_dataset_name=self.config.hf_dataset_name, local_file_path=self.config.local_file_path, sample_count=self.config.sample_count, text_column=self.config.text_column ) + logger.info(f"Loaded {len(dataset)} documents from source") except Exception as e: source = self.config.hf_dataset_name or self.config.local_file_path + logger.error(f"Failed to load dataset from {source} | Error: {e}") raise ValueError(f"Error loading data from {source}: {e}") + + expected_rows = self.get_num_expected_rows(llms, len(dataset)) + logger.info( + f"Starting MCQDataset.generate() | " + f"Expected rows: {expected_rows} | " + f"Providers: {len(llms)}" + ) # Get languages from config, default to English if not specified languages = self.config.languages or {"en": "English"} @@ -839,6 +964,9 @@ def generate(self, llms: list[LLMProvider]) -> "MCQDataset": response = llm.generate(expanded_prompt, response_format=QAEntries) for qa_entry in response.entries: + # Track MCQ generation start time + mcq_start_time = time.time() + # Extract question and correct answer from 
the QAEntry try: # QAEntry already has question and answer fields @@ -880,18 +1008,44 @@ def generate(self, llms: list[LLMProvider]) -> "MCQDataset": }, ) self.data_rows.append(row) + + # Calculate MCQ generation duration + mcq_duration = time.time() - mcq_start_time + # Save each MCQ as it's generated - self.to_jsonl(self.config.output_file, [row], append=True) + try: + self.to_jsonl(self.config.output_file, [row], append=True) + utils.log_generation_progress( + len(self.data_rows), + llm.provider_name, + llm.model_id, + mcq_duration, + "MCQs" + ) + except IOError as e: + logger.error( + f"Failed to save to {self.config.output_file} | Error: {e}" + ) + raise else: - print(f"Warning: Not enough incorrect answers generated (got {len(incorrect_answers)}, need 3)") + logger.warning( + f"Not enough incorrect answers generated (got {len(incorrect_answers)}, need 3)" + ) except Exception as e: - print(f"Error generating distractors: {e}") + logger.warning(f"Error generating distractors: {e}") except Exception as e: - print(f"Error processing entry: {e}") - print(f" Generated and saved {len(self.data_rows)} MCQs total") + logger.warning(f"Error processing entry: {e}") except Exception as e: - print(f"Error with llm provider {llm.provider_name}: {e}") + logger.warning( + f"Provider {llm.provider_name} failed, continuing | Error: {e}" + ) + duration = time.time() - start_time + logger.success( + f"MCQDataset.generate() completed | " + f"Rows: {len(self.data_rows)} | " + f"Duration: {duration:.1f}s" + ) return self def _get_default_prompts(self) -> list[str]: @@ -963,8 +1117,16 @@ def generate(self, ValueError: If input_documents are missing in the configuration. 
""" if not self.config.input_documents: + logger.error("No input documents provided in configuration") raise ValueError("input_documents must be provided in the configuration") + start_time = time.time() + expected_rows = self.get_num_expected_rows([question_gen_llm, chosen_response_gen_llm, rejected_response_gen_llm]) + logger.info( + f"Starting PreferenceDataset.generate() | " + f"Expected rows: {expected_rows}" + ) + # Get languages from config, default to English if not specified languages = self.config.languages or {"en": "English"} @@ -977,6 +1139,9 @@ def generate(self, # For each question, generate chosen and rejected responses for question in questions: + # Track preference pair generation start time + pair_start_time = time.time() + # Generate chosen response chosen_response = self._generate_chosen_response( doc, @@ -1069,10 +1234,31 @@ def generate(self, row = PreferenceRow(**row_data) self.data_rows.append(row) + # Calculate preference pair generation duration + pair_duration = time.time() - pair_start_time + # Save each preference pair immediately - self.to_jsonl(self.config.output_file, [row], append=True) - print(f" Generated and saved {len(self.data_rows)} preference pairs total") + try: + self.to_jsonl(self.config.output_file, [row], append=True) + utils.log_generation_progress( + len(self.data_rows), + question_gen_llm.provider_name, + question_gen_llm.model_id, + pair_duration, + "preference pairs" + ) + except IOError as e: + logger.error( + f"Failed to save to {self.config.output_file} | Error: {e}" + ) + raise + duration = time.time() - start_time + logger.success( + f"PreferenceDataset.generate() completed | " + f"Rows: {len(self.data_rows)} | " + f"Duration: {duration:.1f}s" + ) return self def _generate_questions(self, document: str, llm: LLMProvider, language_name: str) -> list[str]: @@ -1296,16 +1482,26 @@ def generate(self, llms: list[LLMProvider]) -> "GenericPipelineDataset": Self for method chaining. 
""" if not llms: + logger.error("No LLM providers supplied") raise ValueError("At least one LLM provider must be supplied") + start_time = time.time() + # Load source dataset source_dataset = self._load_source_dataset() - print(f"Loaded source dataset with {len(source_dataset)} rows") + logger.info(f"Loaded source dataset with {len(source_dataset)} rows") # Apply sample limit if specified if self.config.sample_count: source_dataset = source_dataset[:min(self.config.sample_count, len(source_dataset))] - print(f"Limited to {len(source_dataset)} rows") + logger.info(f"Limited to {len(source_dataset)} rows") + + expected_rows = self.get_num_expected_rows(llms) + logger.info( + f"Starting GenericPipelineDataset.generate() | " + f"Expected rows: {expected_rows} | " + f"Providers: {len(llms)}" + ) # Get languages from config languages = self.config.languages or {"en": "English"} @@ -1314,7 +1510,7 @@ def generate(self, llms: list[LLMProvider]) -> "GenericPipelineDataset": for row_idx, source_row in enumerate(source_dataset): # Apply skip function if provided if self.config.skip_function and self.config.skip_function(source_row): - print(f"Skipping row {row_idx} due to skip_function") + logger.debug(f"Skipping row {row_idx} due to skip_function") continue # Extract input data based on input_columns @@ -1347,6 +1543,9 @@ def generate(self, llms: list[LLMProvider]) -> "GenericPipelineDataset": # Process with each LLM for llm in llms: try: + # Track batch start time + batch_start_time = time.time() + # Create dynamic response model based on output_columns configuration response_model = utils.create_response_model(self.config) @@ -1390,12 +1589,35 @@ def generate(self, llms: list[LLMProvider]) -> "GenericPipelineDataset": self.data_rows.append(row) new_rows.append(row) + # Calculate batch duration + batch_duration = time.time() - batch_start_time + # Save this batch - self.to_jsonl(self.config.output_file, new_rows, append=True) - logger.success(f"Generated and saved 
{len(self.data_rows)} examples total") + try: + self.to_jsonl(self.config.output_file, new_rows, append=True) + utils.log_generation_progress( + len(self.data_rows), + llm.provider_name, + llm.model_id, + batch_duration, + "examples" + ) + except IOError as e: + logger.error( + f"Failed to save to {self.config.output_file} | Error: {e}" + ) + raise except Exception as e: - logger.error(f"Error with llm provider {llm.provider_name} on row {row_idx}: {e}") + logger.warning( + f"Provider {llm.provider_name} failed on row {row_idx}, continuing | Error: {e}" + ) continue + duration = time.time() - start_time + logger.success( + f"GenericPipelineDataset.generate() completed | " + f"Rows: {len(self.data_rows)} | " + f"Duration: {duration:.1f}s" + ) return self \ No newline at end of file diff --git a/datafast/examples/classification_trail_conditions_example.py b/datafast/examples/classification_trail_conditions_example.py index c6a33a0..1a43c60 100644 --- a/datafast/examples/classification_trail_conditions_example.py +++ b/datafast/examples/classification_trail_conditions_example.py @@ -1,10 +1,14 @@ from datafast.datasets import ClassificationDataset from datafast.schema.config import ClassificationDatasetConfig, PromptExpansionConfig from datafast.llms import OpenAIProvider, AnthropicProvider +from datafast.logger_config import configure_logger from dotenv import load_dotenv # Load API keys -load_dotenv("secrets.env") +load_dotenv() + +# Configure logger +configure_logger() # Configure dataset config = ClassificationDatasetConfig( diff --git a/datafast/examples/generic_pipeline_example.py b/datafast/examples/generic_pipeline_example.py index e09a685..e7a242b 100644 --- a/datafast/examples/generic_pipeline_example.py +++ b/datafast/examples/generic_pipeline_example.py @@ -7,6 +7,8 @@ from datafast.schema.config import GenericPipelineDatasetConfig from datafast.datasets import GenericPipelineDataset from datafast.llms import OpenAIProvider, GeminiProvider, OllamaProvider 
+from datafast.logger_config import configure_logger +from dotenv import load_dotenv PROMPT_TEMPLATE = """I will give you a persona. @@ -80,7 +82,6 @@ def main(): if __name__ == "__main__": - from dotenv import load_dotenv - - load_dotenv("secrets.env") + load_dotenv() + configure_logger() main() diff --git a/datafast/examples/generic_pipeline_response_format_example.py b/datafast/examples/generic_pipeline_response_format_example.py index 627cc74..87e2f26 100644 --- a/datafast/examples/generic_pipeline_response_format_example.py +++ b/datafast/examples/generic_pipeline_response_format_example.py @@ -2,6 +2,10 @@ from datafast.schema.config import GenericPipelineDatasetConfig from datafast.utils import create_response_model +from datafast.logger_config import configure_logger + +# Configure logger +configure_logger() # Test with multiple columns and num_samples_per_prompt = 3 config = GenericPipelineDatasetConfig( diff --git a/datafast/examples/generic_pipeline_row_model_example.py b/datafast/examples/generic_pipeline_row_model_example.py index 6536605..49e6a81 100644 --- a/datafast/examples/generic_pipeline_row_model_example.py +++ b/datafast/examples/generic_pipeline_row_model_example.py @@ -2,6 +2,10 @@ from datafast.schema.config import GenericPipelineDatasetConfig from datafast.utils import create_generic_pipeline_row_model +from datafast.logger_config import configure_logger + +# Configure logger +configure_logger() # Test with multiple input, forward, and output columns config = GenericPipelineDatasetConfig( diff --git a/datafast/examples/inspect_dataset_example.py b/datafast/examples/inspect_dataset_example.py index 6225b65..77161ab 100644 --- a/datafast/examples/inspect_dataset_example.py +++ b/datafast/examples/inspect_dataset_example.py @@ -5,15 +5,19 @@ python -m datafast.examples.inspect_dataset_example Requires: - - OpenAI API key in secrets.env or environment + - OpenAI API key in .env or environment - gradio package (pip install gradio) """ from 
datafast.datasets import ClassificationDataset from datafast.schema.config import ClassificationDatasetConfig, PromptExpansionConfig +from datafast.logger_config import configure_logger from dotenv import load_dotenv -# Load API keys from environment or secrets.env -load_dotenv("secrets.env") +# Load API keys from environment or .env +load_dotenv() + +# Configure logger +configure_logger() # Configure the dataset generation config = ClassificationDatasetConfig( diff --git a/datafast/examples/keywords_extraction_example.py b/datafast/examples/keywords_extraction_example.py index c91d7bc..54e69d2 100644 --- a/datafast/examples/keywords_extraction_example.py +++ b/datafast/examples/keywords_extraction_example.py @@ -7,6 +7,12 @@ from datafast.schema.config import GenericPipelineDatasetConfig from datafast.datasets import GenericPipelineDataset from datafast.llms import OpenAIProvider, GeminiProvider, OllamaProvider, OpenRouterProvider +from datafast.logger_config import configure_logger +from dotenv import load_dotenv + +# Load environment variables and configure logger +load_dotenv() +configure_logger() PROMPT_TEMPLATE = """I will give you a tweet. Generate a comma separated list of 3 keywords for the tweet. Avoid multi-word keywords. @@ -63,7 +69,4 @@ def main(): if __name__ == "__main__": - from dotenv import load_dotenv - - load_dotenv("secrets.env") main() diff --git a/datafast/examples/mcq_contextual_example.py b/datafast/examples/mcq_contextual_example.py index e8e853f..0cec495 100644 --- a/datafast/examples/mcq_contextual_example.py +++ b/datafast/examples/mcq_contextual_example.py @@ -10,11 +10,15 @@ import pandas as pd from dotenv import load_dotenv -load_dotenv("secrets.env") +load_dotenv() from datafast.schema.config import MCQDatasetConfig from datafast.datasets import MCQDataset from datafast.llms import OpenAIProvider +from datafast.logger_config import configure_logger + +# Configure logger +configure_logger() def main(): # 1. 
Create a temporary filtered version of the dataset diff --git a/datafast/examples/mcq_example.py b/datafast/examples/mcq_example.py index 0a12312..0656ec3 100644 --- a/datafast/examples/mcq_example.py +++ b/datafast/examples/mcq_example.py @@ -5,7 +5,13 @@ import os from datafast.schema.config import MCQDatasetConfig from datafast.datasets import MCQDataset -from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider +from datafast.llms import OpenAIProvider, OpenRouterProvider +from datafast.logger_config import configure_logger +from dotenv import load_dotenv + +# Load environment variables and configure logger +load_dotenv() +configure_logger() def main(): @@ -16,8 +22,8 @@ def main(): # local_file_path="datafast/examples/data/mcq/sample.txt", #local_file_path="datafast/examples/data/mcq/sample.jsonl", text_column="text", # Column containing the text to generate questions from - sample_count=2, # Process only 3 samples for testing - num_samples_per_prompt=2,# Generate 2 questions per document + sample_count=20, # Process only 20 samples for testing + num_samples_per_prompt=3,# Generate 3 questions per document min_document_length=100, # Skip documents shorter than 100 chars max_document_length=20000,# Skip documents longer than 20000 chars output_file="mcq_test_dataset.jsonl", @@ -26,13 +32,12 @@ def main(): # 2. Initialize LLM providers providers = [ OpenAIProvider(model_id="gpt-5-mini-2025-08-07"), - # AnthropicProvider(model_id="claude-haiku-4-5-20251001"), - # GeminiProvider(model_id="gemini-2.0-flash"), + OpenRouterProvider(model_id="qwen/qwen3-next-80b-a3b-instruct"), ] # 3. 
Generate the dataset dataset = MCQDataset(config) - num_expected_rows = dataset.get_num_expected_rows(providers, source_data_num_rows=2) + num_expected_rows = dataset.get_num_expected_rows(providers, source_data_num_rows=20) print(f"\nExpected number of rows: {num_expected_rows}") dataset.generate(providers) @@ -53,7 +58,4 @@ def main(): if __name__ == "__main__": - from dotenv import load_dotenv - - load_dotenv() main() diff --git a/datafast/examples/preference_dataset_example.py b/datafast/examples/preference_dataset_example.py index d162791..ee0360b 100644 --- a/datafast/examples/preference_dataset_example.py +++ b/datafast/examples/preference_dataset_example.py @@ -78,5 +78,5 @@ def main(): if __name__ == "__main__": from dotenv import load_dotenv - load_dotenv("secrets.env") + load_dotenv() main() diff --git a/datafast/examples/quickstart_example.py b/datafast/examples/quickstart_example.py index 4216082..0a3ae13 100644 --- a/datafast/examples/quickstart_example.py +++ b/datafast/examples/quickstart_example.py @@ -1,7 +1,10 @@ from datafast.datasets import ClassificationDataset from datafast.schema.config import ClassificationDatasetConfig, PromptExpansionConfig from dotenv import load_dotenv -load_dotenv("secrets.env") +from datafast.logger_config import configure_logger +load_dotenv() + +configure_logger() config = ClassificationDatasetConfig( classes=[ diff --git a/datafast/examples/raw_text_space_engineering_example.py b/datafast/examples/raw_text_space_engineering_example.py index cccead7..aae5794 100644 --- a/datafast/examples/raw_text_space_engineering_example.py +++ b/datafast/examples/raw_text_space_engineering_example.py @@ -1,23 +1,26 @@ from datafast.datasets import RawDataset from datafast.schema.config import RawDatasetConfig, PromptExpansionConfig from datafast.llms import OpenAIProvider, AnthropicProvider +from datafast.logger_config import configure_logger + +configure_logger() def main(): # 1. 
Configure the dataset generation config = RawDatasetConfig( document_types=[ - "space engineering textbook", - "spacecraft design justification document", + # "space engineering textbook", + # "spacecraft design justification document", "personal blog of a space engineer" ], topics=[ - "Microgravity", - "Vacuum", - "Heavy Ions", - "Thermal Extremes", - "Atomic Oxygen", - "Debris Impact", + # "Microgravity", + # "Vacuum", + # "Heavy Ions", + # "Thermal Extremes", + # "Atomic Oxygen", + # "Debris Impact", "Electrostatic Charging", "Propellant Boil-off", # ... You can pour hundreds of topics here. 8 is enough for this example @@ -67,5 +70,5 @@ def main(): if __name__ == "__main__": from dotenv import load_dotenv - load_dotenv("secrets.env") + load_dotenv() main() \ No newline at end of file diff --git a/datafast/examples/ultrachat_materials_science.py b/datafast/examples/ultrachat_materials_science.py index d6477fc..6d20c32 100644 --- a/datafast/examples/ultrachat_materials_science.py +++ b/datafast/examples/ultrachat_materials_science.py @@ -53,5 +53,5 @@ def main(): if __name__ == "__main__": from dotenv import load_dotenv - load_dotenv("secrets.env") + load_dotenv() main() \ No newline at end of file diff --git a/datafast/llms.py b/datafast/llms.py index 6220c05..7d0bcee 100644 --- a/datafast/llms.py +++ b/datafast/llms.py @@ -10,6 +10,7 @@ import time import traceback import warnings +from loguru import logger # Pydantic from pydantic import BaseModel @@ -65,6 +66,9 @@ def __init__( # Configure environment with API key if needed self._configure_env() + + # Log successful initialization + logger.info(f"Initialized {self.provider_name} | Model: {self.model_id}") @property @abstractmethod @@ -82,6 +86,9 @@ def _get_api_key(self) -> str: """Get API key from environment variables.""" api_key = os.getenv(self.env_key_name) if not api_key: + logger.error( + f"Missing API key | Set {self.env_key_name} environment variable" + ) raise ValueError( f"{self.env_key_name} 
environment variable not set. " f"Please set it or provide an API key when initializing the provider." @@ -112,7 +119,7 @@ def _respect_rate_limit(self) -> None: # Add a 1s margin to avoid accidental rate limit exceedance sleep_time = 61 - (current - earliest) if sleep_time > 0: - print("Waiting for rate limit...") + logger.warning(f"Rate limit reached | Waiting {sleep_time:.1f}s") time.sleep(sleep_time) @staticmethod @@ -264,8 +271,23 @@ def generate( if response_format is not None: # Strip code fences before validation content = self._strip_code_fences(content) - results.append( - response_format.model_validate_json(content)) + try: + results.append( + response_format.model_validate_json(content)) + except Exception as validation_error: + # Show the content that failed to parse for debugging + content_preview = content[:200] + "..." if len(content) > 200 else content + logger.warning( + f"JSON parsing failed, skipping response | " + f"Model: {self.model_id} | " + f"Format: {response_format.__name__} | " + f"Content preview: {content_preview}" + ) + raise ValueError( + f"Failed to parse JSON response into {response_format.__name__}.\n" + f"Validation error: {validation_error}\n" + f"Content received (first 200 chars):\n{content_preview}" + ) from validation_error else: # Strip leading/trailing whitespace for text responses results.append(content.strip() if content else content) @@ -277,6 +299,10 @@ def generate( except Exception as e: error_trace = traceback.format_exc() + logger.error( + f"Generation failed | Provider: {self.provider_name} | " + f"Model: {self.model_id} | Error: {str(e)}" + ) raise RuntimeError( f"Error generating batch response with {self.provider_name}:\n{error_trace}" ) @@ -471,7 +497,22 @@ def generate( if response_format is not None: # Strip code fences before validation content = self._strip_code_fences(content) - results.append(response_format.model_validate_json(content)) + try: + 
results.append(response_format.model_validate_json(content)) + except Exception as validation_error: + # Show the content that failed to parse for debugging + content_preview = content[:200] + "..." if len(content) > 200 else content + logger.warning( + f"JSON parsing failed, skipping response | " + f"Model: {self.model_id} | " + f"Format: {response_format.__name__} | " + f"Content preview: {content_preview}" + ) + raise ValueError( + f"Failed to parse JSON response into {response_format.__name__}.\n" + f"Validation error: {validation_error}\n" + f"Content received (first 200 chars):\n{content_preview}" + ) from validation_error else: # Strip leading/trailing whitespace for text responses results.append(content.strip() if content else content) @@ -483,6 +524,10 @@ def generate( except Exception as e: error_trace = traceback.format_exc() + logger.error( + f"Generation failed | Provider: {self.provider_name} | " + f"Model: {self.model_id} | Error: {str(e)}" + ) raise RuntimeError( f"Error generating response with {self.provider_name}:\n{error_trace}" ) diff --git a/datafast/logger_config.py b/datafast/logger_config.py new file mode 100644 index 0000000..083d8b7 --- /dev/null +++ b/datafast/logger_config.py @@ -0,0 +1,88 @@ +"""Logger configuration for datafast using loguru. + +This module provides centralized logging configuration with support for both +console and file output. +""" + +from loguru import logger +import sys + + +def configure_logger( + level: str = "INFO", + log_file: str | None = None, + format_string: str | None = None, + colorize: bool = True, + serialize: bool = False, +) -> None: + """Configure the global logger for datafast. + + Args: + level: Minimum log level (DEBUG, INFO, SUCCESS, WARNING, ERROR) + log_file: Optional path to log file. If provided, logs will be written to both + console and file with automatic rotation. 
+ format_string: Custom format string (uses default if None) + colorize: Enable colored output for console + serialize: Output logs as JSON (useful for production monitoring) + + Examples: + >>> # Default: INFO level, console only + >>> configure_logger() + + >>> # With file logging + >>> configure_logger(level="INFO", log_file="datafast.log") + + >>> # Debug mode with file + >>> configure_logger(level="DEBUG", log_file="debug.log") + + >>> # Production: JSON format + >>> configure_logger(level="WARNING", serialize=True, log_file="prod.log") + """ + # Remove default handler + logger.remove() + + if format_string is None: + if serialize: + # JSON format for production - add to both console and file + logger.add(sys.stderr, serialize=True, level=level) + if log_file: + logger.add( + log_file, + serialize=True, + level=level, + rotation="10 MB", + retention="1 week", + compression="zip", + ) + else: + # Human-readable format + format_string = ( + "{time:YYYY-MM-DD HH:mm:ss} | " + "{level: <8} | " + "{name}:{function} - " + "{message}" + ) + + # Console handler (colorized) + logger.add( + sys.stderr, + format=format_string, + level=level, + colorize=colorize, + ) + + # File handler (no colors, with rotation) + if log_file: + logger.add( + log_file, + format=format_string, + level=level, + colorize=False, # No ANSI colors in file + rotation="10 MB", # Rotate when file reaches 10MB + retention="1 week", # Keep logs for 1 week + compression="zip", # Compress rotated logs + ) + + +# Initialize with default configuration +configure_logger() diff --git a/datafast/prompts/mcq_prompts.py b/datafast/prompts/mcq_prompts.py index 25e10b1..9b9e4d6 100644 --- a/datafast/prompts/mcq_prompts.py +++ b/datafast/prompts/mcq_prompts.py @@ -1,8 +1,8 @@ DEFAULT_TEMPLATES = ["""You are an expert at creating exam questions. 
Your task is to come up with {num_samples} \ difficult multiple choice questions written in {language_name} in relation to the following document along with the correct answer. -The question should be self-contained, short and answerable. +The questions should be self-contained, short and answerable. It is very important to have unique questions. No questions should be like 'what is X and what about Y?' or 'what is X and when did Y happen?'. -The answer must be short. +The correct answers must be short. It must relate to the details of the document, but the question should contain enough context to be answerable for a person without the document. ### Document @@ -10,14 +10,17 @@ Now come up with {num_samples} questions in relation to the document. Make sure the questions are difficult, but answerable with a short answer. -Provide the correct answer for each question."""] +Provide the correct answer for each question. +Do not list candidate answers as part of the questions. +Do not include numbers or letters (A, B, C, D, etc.) as prefix in the questions or answers. +"""] # Template for more contextualized questions CONTEXTUALISED_TEMPLATES = ["""You are an expert at creating exam questions. Your task is to come up with {num_samples} \ - multiple choice questions written in {language_name} in relation to the following document along with the correct answer. -The question should be self-contained, short and answerable. + multiple choice questions written in {language_name} in relation to the following document along with the correct answers. +The questions should be self-contained, short and answerable. It is very important to have unique questions. No questions should be like 'what is X and what about Y?' or 'what is X and when did Y happen?'. -The answer must be short. +The answers must be short. It must relate to the details of the document. 
However questions should never contain wording as reference to the document like "according to the report, 'in this paper', 'in the document', etc. Make sure to write the questions to include some very brief context, like if the person asking the questions would be explaining the context in which the question arise very concisely. This is just to remove ambiguity like if the question was provided in an exam. @@ -30,6 +33,8 @@ Now come up with {num_samples} contextualized questions in relation to the document. Make sure the questions are difficult, but answerable with a short answer. Provide the correct answer for each question. +Do not list candidate answers as part of the questions. +Do not include numbers or letters (A, B, C, D, etc.) as prefix in the questions or answers. """ ] @@ -45,4 +50,6 @@ Correct Answer: {correct_answer} Now provide exactly 3 short, plausible but incorrect answers: +Make sure the the length, structure, or writing style of the incorrect answers are not all at the same time obviously different from the correct answer (e.g. if the correct answer does not contain a trailing period, also avoid trailing periods in the incorrect answers, or make sure their length are similar, the way numbers or names are written, etc.) +Do not include numbers or letters (A, B, C, D, etc.) as prefix in the incorrect answers. """ diff --git a/datafast/utils.py b/datafast/utils.py index 576b78e..f773e23 100644 --- a/datafast/utils.py +++ b/datafast/utils.py @@ -2,6 +2,37 @@ from datafast.llms import LLMProvider from datasets import Dataset, load_dataset from pydantic import BaseModel, Field, create_model +from loguru import logger +import time + + +def log_generation_progress( + total_rows: int, + provider_name: str, + model_id: str, + duration: float, + item_type: str = "examples" +) -> None: + """Log generation progress with provider and timing information. 
+ + Args: + total_rows: Total number of rows generated so far + provider_name: Name of the LLM provider (e.g., "openai", "anthropic") + model_id: Model identifier (e.g., "gpt-4", "claude-3-sonnet") + duration: Duration in seconds for this generation step + item_type: Type of items being generated (e.g., "examples", "chat conversations", "MCQs") + + Example: + >>> log_generation_progress(25, "openai", "gpt-4", 3.2, "examples") + # Logs: Generated and saved 25 examples total | Provider: openai | Model: gpt-4 | Duration: 3.2s + """ + logger.success( + f"Generated and saved {total_rows} {item_type} total | " + f"Provider: {provider_name} | " + f"Model: {model_id} | " + f"Duration: {duration:.1f}s" + ) + def calculate_num_prompt_expansions(base_prompts: list[str], expansion_config: PromptExpansionConfig) -> int: """Calculate the number of prompt expansions based on the expansion configuration. @@ -177,10 +208,12 @@ def load_dataset_from_source(hf_dataset_name: str | None = None, try: if hf_dataset_name: # Load from Hugging Face + logger.info(f"Loading dataset from HuggingFace: {hf_dataset_name}") hf_dataset = load_dataset(hf_dataset_name) # Most datasets have a 'train' split, but fallback to first available split split_names = list(hf_dataset.keys()) if not split_names: + logger.error(f"No splits found in dataset {hf_dataset_name}") raise ValueError(f"No splits found in dataset {hf_dataset_name}") main_split = "train" if "train" in split_names else split_names[0] @@ -190,6 +223,7 @@ def load_dataset_from_source(hf_dataset_name: str | None = None, elif local_file_path: # Load from local file based on extension + logger.info(f"Loading dataset from local file: {local_file_path}") file_ext = local_file_path.lower().split('.')[-1] if file_ext == 'csv': @@ -217,17 +251,23 @@ def load_dataset_from_source(hf_dataset_name: str | None = None, dataset.append(json.loads(line)) else: + logger.error(f"Unsupported file extension: {file_ext}") raise ValueError(f"Unsupported file 
extension: {file_ext}. Supported extensions are: csv, txt, parquet, jsonl, json") else: + logger.error("No dataset source specified") raise ValueError("Either hf_dataset_name or local_file_path must be specified") # Limit the number of samples if specified if sample_count is not None: dataset = dataset[:min(sample_count, len(dataset))] + logger.info(f"Dataset loaded successfully | Rows: {len(dataset)}") + else: + logger.info(f"Dataset loaded successfully | Rows: {len(dataset)}") return dataset except Exception as e: + logger.error(f"Failed to load dataset | Error: {str(e)}") raise ValueError(f"Error loading dataset: {str(e)}") @@ -238,6 +278,7 @@ def _get_generic_pipeline_specific_factors(config: GenericPipelineDatasetConfig) def _get_generic_pipeline_num_expected_rows(config: GenericPipelineDatasetConfig, llms: list[LLMProvider]) -> int: """Calculate expected rows for GenericPipelineDataset including prompt expansions.""" # Load source dataset to get row count + logger.debug("Calculating expected rows for GenericPipelineDataset") source_dataset = load_dataset_from_source( hf_dataset_name=config.hf_dataset_name, local_file_path=config.local_file_path, diff --git a/docs/concepts.md b/docs/concepts.md index 542c118..84c9685 100644 --- a/docs/concepts.md +++ b/docs/concepts.md @@ -69,13 +69,41 @@ The prompt expansion system is key and enables: The datafast workflow follows a consistent pattern across all dataset types: 1. **Configuration**: Define the dataset parameters, classes/topics, and generation settings -2. **Prompt Design**: Create base prompts with mandatory and optional placeholders -3. **Provider Setup**: Initialize one or more LLM providers -4. **Generation**: Execute the generation process, which: +2. **Logging Setup**: Configure logging to monitor the generation process (recommended) +3. **Prompt Design**: Create base prompts with mandatory and optional placeholders +4. **Provider Setup**: Initialize one or more LLM providers +5. 
**Generation**: Execute the generation process, which: - Expands prompts based on configuration - Distributes generation across providers - Collects and processes responses -5. **Output**: Save the resulting dataset to a file and optionally push to Hugging Face Hub +6. **Output**: Save the resulting dataset to a file and optionally push to Hugging Face Hub + +## Logging and Monitoring + +Datafast includes comprehensive logging to provide visibility into the generation process: + +### Why Configure Logging? + +Without `configure_logger()`, your datafast scripts will run silently without: +- Progress indicators during generation +- Rate limiting warnings +- Success completion messages +- Detailed error information + +### Basic Usage + +```python +from datafast.logger_config import configure_logger + +# Default: INFO level, console output with colors +configure_logger() + +# With file logging for long-running jobs +configure_logger(level="INFO", log_file="generation.log") + +# Debug mode for troubleshooting +configure_logger(level="DEBUG", log_file="debug.log") +``` ## Dataset Diversity Mechanisms diff --git a/docs/guides/generating_generic_pipeline_datasets.md b/docs/guides/generating_generic_pipeline_datasets.md index dd70e73..4559891 100644 --- a/docs/guides/generating_generic_pipeline_datasets.md +++ b/docs/guides/generating_generic_pipeline_datasets.md @@ -34,16 +34,20 @@ from datafast.schema.config import GenericPipelineDatasetConfig from datafast.llms import OpenRouterProvider ``` -In addition, we'll use `dotenv` to load environment variables containing API keys: +In addition, we'll use `dotenv` to load environment variables containing API keys and configure logging to monitor the generation process: ```python from dotenv import load_dotenv +from datafast.logger_config import configure_logger # Load environment variables containing API keys -load_dotenv("secrets.env") +load_dotenv() + +# Configure logger to see progress, warnings, and success messages 
+configure_logger() ``` -Make sure you have created a `secrets.env` file with your API keys: +Make sure you have created a `.env` file with your API keys: ``` OPENROUTER_API_KEY=XXXX @@ -214,10 +218,14 @@ Here's a complete working example: from datafast.datasets import GenericPipelineDataset from datafast.schema.config import GenericPipelineDatasetConfig from datafast.llms import OpenRouterProvider +from datafast.logger_config import configure_logger from dotenv import load_dotenv # Load API keys -load_dotenv("secrets.env") +load_dotenv() + +# Configure logger +configure_logger() # Define prompt PROMPT = """I will give you a persona description. diff --git a/docs/guides/generating_mcq_datasets.md b/docs/guides/generating_mcq_datasets.md index 2e9c0db..dd78b01 100644 --- a/docs/guides/generating_mcq_datasets.md +++ b/docs/guides/generating_mcq_datasets.md @@ -31,15 +31,19 @@ from datafast.schema.config import MCQDatasetConfig, PromptExpansionConfig from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider ``` -In addition, we'll use `dotenv` to load environment variables containing API keys. +In addition, we'll use `dotenv` to load environment variables containing API keys and configure logging to monitor the generation process. ```python from dotenv import load_dotenv +from datafast.logger_config import configure_logger # Load environment variables containing API keys -load_dotenv("secrets.env") +load_dotenv() + +# Configure logger to see progress, warnings, and success messages +configure_logger() ``` -Make sure you have created a `secrets.env` file with your API keys. HF token is needed if you want to push the dataset to your HF hub. Other keys depend on which LLM providers you use. +Make sure you have created a `.env` file with your API keys. HF token is needed if you want to push the dataset to your HF hub. Other keys depend on which LLM providers you use. 
``` GEMINI_API_KEY=XXXX @@ -253,10 +257,14 @@ Here's a complete example for creating an MCQ dataset from a local JSONL file: from datafast.datasets import MCQDataset from datafast.schema.config import MCQDatasetConfig, PromptExpansionConfig from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider +from datafast.logger_config import configure_logger from dotenv import load_dotenv # Load environment variables -load_dotenv("secrets.env") +load_dotenv() + +# Configure logger +configure_logger() def main(): # 1. Define the configuration diff --git a/docs/guides/generating_preference_datasets.md b/docs/guides/generating_preference_datasets.md index 0cebb12..1ea9f59 100644 --- a/docs/guides/generating_preference_datasets.md +++ b/docs/guides/generating_preference_datasets.md @@ -21,19 +21,23 @@ Generating a preference dataset with `datafast` requires these imports: from datafast.datasets import PreferenceDataset from datafast.schema.config import PreferenceDatasetConfig from datafast.llms import OpenAIProvider, GeminiProvider, AnthropicProvider +from datafast.logger_config import configure_logger from dotenv import load_dotenv import json from pathlib import Path ``` -You'll need to load environment variables containing API keys: +You'll need to load environment variables containing API keys and configure logging: ```python # Load environment variables containing API keys -load_dotenv("secrets.env") +load_dotenv() + +# Configure logger to see progress, warnings, and success messages +configure_logger() ``` -Make sure you have created a `secrets.env` file with your API keys for the LLM providers you plan to use: +Make sure you have created a `.env` file with your API keys for the LLM providers you plan to use: ``` OPENAI_API_KEY=sk-XXXX @@ -235,10 +239,14 @@ from pathlib import Path from datafast.schema.config import PreferenceDatasetConfig from datafast.datasets import PreferenceDataset from datafast.llms import OpenAIProvider, GeminiProvider, 
AnthropicProvider +from datafast.logger_config import configure_logger from dotenv import load_dotenv # Load environment variables with API keys -load_dotenv("secrets.env") +load_dotenv() + +# Configure logger +configure_logger() # Load NASA lessons learned documents from JSONL file def load_documents_from_jsonl(jsonl_path: str | Path) -> list[str]: diff --git a/docs/guides/generating_text_classification_datasets.md b/docs/guides/generating_text_classification_datasets.md index 4ce36e7..bd1818e 100644 --- a/docs/guides/generating_text_classification_datasets.md +++ b/docs/guides/generating_text_classification_datasets.md @@ -26,15 +26,19 @@ from datafast.schema.config import ClassificationDatasetConfig, PromptExpansionC from datafast.llms import OpenAIProvider, AnthropicProvider ``` -In addition, we'll use `dotenv` to load environment variables containing API keys. +In addition, we'll use `dotenv` to load environment variables containing API keys and configure logging to monitor the generation process. ```python from dotenv import load_dotenv +from datafast.logger_config import configure_logger # Load environment variables containing API keys -load_dotenv("secrets.env") +load_dotenv() + +# Configure logger to see progress, warnings, and success messages +configure_logger() ``` -Make sure you have created a `secrets.env` file with your API keys. HF token is needed if you want to push the dataset to your HF hub. Other keys depends on which LLM providers you use. In our example, we use OpenAI and Anthropic. +Make sure you have created a `.env` file with your API keys. HF token is needed if you want to push the dataset to your HF hub. Other keys depend on which LLM providers you use. In our example, we use OpenAI and Anthropic. 
``` GEMINI_API_KEY=XXXX @@ -236,10 +240,14 @@ Here's a complete example for creating a trail conditions classification dataset from datafast.datasets import ClassificationDataset from datafast.schema.config import ClassificationDatasetConfig, PromptExpansionConfig from datafast.llms import OpenAIProvider, AnthropicProvider +from datafast.logger_config import configure_logger from dotenv import load_dotenv # Load API keys -load_dotenv("secrets.env") +load_dotenv() + +# Configure logger +configure_logger() # Configure dataset config = ClassificationDatasetConfig( diff --git a/docs/guides/generating_text_datasets.md b/docs/guides/generating_text_datasets.md index ad6b717..b0930c1 100644 --- a/docs/guides/generating_text_datasets.md +++ b/docs/guides/generating_text_datasets.md @@ -31,15 +31,19 @@ from datafast.schema.config import RawDatasetConfig, PromptExpansionConfig from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider ``` -In addition, we'll use `dotenv` to load environment variables containing API keys. +In addition, we'll use `dotenv` to load environment variables containing API keys and configure logging to monitor the generation process. ```python from dotenv import load_dotenv +from datafast.logger_config import configure_logger # Load environment variables containing API keys -load_dotenv("secrets.env") +load_dotenv() + +# Configure logger to see progress, warnings, and success messages +configure_logger() ``` -Make sure you have created a secrets.env file with your API keys. HF token is needed if you want to push the dataset to your HF hub. Other keys depend on which LLM providers you use. In our example, we use OpenAI and Anthropic. +Make sure you have created a .env file with your API keys. HF token is needed if you want to push the dataset to your HF hub. Other keys depend on which LLM providers you use. In our example, we use OpenAI and Anthropic. 
``` GEMINI_API_KEY=XXXX @@ -239,6 +243,14 @@ Here's a complete example script that generates a text dataset across multiple d from datafast.datasets import RawDataset from datafast.schema.config import RawDatasetConfig, PromptExpansionConfig from datafast.llms import OpenAIProvider, AnthropicProvider +from datafast.logger_config import configure_logger +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Configure logger +configure_logger() def main(): @@ -303,9 +315,6 @@ def main(): if __name__ == "__main__": - from dotenv import load_dotenv - - load_dotenv("secrets.env") main() ``` diff --git a/docs/guides/generating_ultrachat_datasets.md b/docs/guides/generating_ultrachat_datasets.md index 1b9d99c..d5872f2 100644 --- a/docs/guides/generating_ultrachat_datasets.md +++ b/docs/guides/generating_ultrachat_datasets.md @@ -31,15 +31,19 @@ from datafast.schema.config import UltrachatDatasetConfig, PromptExpansionConfig from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider ``` -In addition, use `dotenv` to load environment variables containing API keys: +In addition, use `dotenv` to load environment variables containing API keys and configure logging to monitor the generation process: ```python from dotenv import load_dotenv +from datafast.logger_config import configure_logger # Load environment variables containing API keys -load_dotenv("secrets.env") +load_dotenv() + +# Configure logger to see progress, warnings, and success messages +configure_logger() ``` -Make sure you have created a `secrets.env` file with your API keys. A Hugging Face token (HF_TOKEN) is needed if you want to push the dataset to your HF hub. Other keys depend on which LLM providers you use. +Make sure you have created a `.env` file with your API keys. A Hugging Face token (HF_TOKEN) is needed if you want to push the dataset to your HF hub. Other keys depend on which LLM providers you use. 
``` GEMINI_API_KEY=XXXX @@ -231,10 +235,14 @@ Here's a complete example for creating an Ultrachat dataset: from datafast.datasets import UltrachatDataset from datafast.schema.config import UltrachatDatasetConfig from datafast.llms import AnthropicProvider +from datafast.logger_config import configure_logger from dotenv import load_dotenv # Load environment variables -load_dotenv("secrets.env") +load_dotenv() + +# Configure logger +configure_logger() def main(): # 1. Define the configuration diff --git a/docs/index.md b/docs/index.md index c11ac6d..3cb3881 100644 --- a/docs/index.md +++ b/docs/index.md @@ -35,7 +35,7 @@ Currently we support the following LLM providers: ### 1. Environment Setup -Make sure you have created a `secrets.env` file with your API keys. +Make sure you have created a `.env` file with your API keys. HF token is needed if you want to push the dataset to your HF hub. Other keys depends on which LLM providers you use. ``` @@ -51,10 +51,14 @@ HF_TOKEN=hf_XXXXX from datafast.datasets import ClassificationDataset from datafast.schema.config import ClassificationDatasetConfig, PromptExpansionConfig from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider, OpenRouterProvider +from datafast.logger_config import configure_logger from dotenv import load_dotenv # Load environment variables -load_dotenv("secrets.env") # <--- your API keys +load_dotenv() # <--- your API keys + +# Configure logger for visibility into generation process +configure_logger() # <--- see progress, warnings, and success messages ``` ### 3. 
Configure Dataset @@ -135,6 +139,7 @@ Star this package to send positive vibes and support 🌟 * **Multiple LLMs** used to boost dataset diversity 🤖 * **Flexible prompt**: use our default prompts or provide your own custom prompts 📝 * **Prompt expansion**: Combinatorial variation of prompts to maximize diversity 🔄 +* **Built-in logging**: Comprehensive logging with progress tracking, rate limiting warnings, and success messages 📊 * **Hugging Face Integration**: Push generated datasets to the Hub 🤗 !!! warning diff --git a/tests/test_get_num_expected_rows.py b/tests/test_get_num_expected_rows.py index 6dcf551..da5dd0f 100644 --- a/tests/test_get_num_expected_rows.py +++ b/tests/test_get_num_expected_rows.py @@ -6,7 +6,7 @@ from dotenv import load_dotenv -load_dotenv('secrets.env') +load_dotenv('.env') @pytest.fixture def providers():