From d4404a8aa86e08487f9713895be862fb9dc56e29 Mon Sep 17 00:00:00 2001 From: Patrick Date: Sat, 1 Feb 2025 11:57:31 +0100 Subject: [PATCH 1/2] draft of documentation is ready --- .github/workflows/deploy-docs.yml | 30 +++++++++++ .gitignore | 3 +- docs/api.md | 8 +++ docs/getting_started.md | 8 +++ docs/index.md | 83 +++++++++++++++++++++++++++++++ mkdocs.yml | 68 +++++++++++++++++++++++++ 6 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/deploy-docs.yml create mode 100644 docs/api.md create mode 100644 docs/getting_started.md create mode 100644 docs/index.md create mode 100644 mkdocs.yml diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml new file mode 100644 index 0000000..6de2e7d --- /dev/null +++ b/.github/workflows/deploy-docs.yml @@ -0,0 +1,30 @@ +name: Deploy Documentation + +on: + push: + branches: + - main # or master, depending on your default branch name + pull_request: + branches: + - main # or master + +permissions: + contents: write + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install mkdocs-material + pip install -e . + + - name: Deploy + run: mkdocs gh-deploy --force diff --git a/.gitignore b/.gitignore index dc9bf5e..0b75816 100644 --- a/.gitignore +++ b/.gitignore @@ -176,4 +176,5 @@ secrets.env .windsurfrules IDEAS.md -*.jsonl \ No newline at end of file +*.jsonl +.DS_Store diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 0000000..e74a1fa --- /dev/null +++ b/docs/api.md @@ -0,0 +1,8 @@ +# API Reference + +🚧 **Under Construction** 🚧 + +This API reference documentation is currently being developed. +Please refer to the [Quick Start](index.md) guide for basic usage examples in the meantime. + +Check back soon for updates! 
\ No newline at end of file diff --git a/docs/getting_started.md b/docs/getting_started.md new file mode 100644 index 0000000..a9989df --- /dev/null +++ b/docs/getting_started.md @@ -0,0 +1,8 @@ +# Getting Started + +🚧 **Under Construction** 🚧 + +This guide is currently being developed. +Please refer to the [Quick Start](index.md) guide for basic usage examples in the meantime. + +Check back soon for updates! diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..db800b0 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,83 @@ +# Datafast: Synthetic Text Dataset Generator + +Datafast is a powerful Python package for generating synthetic text datasets, designed to help you: + +* Experiment and test LLM-based applications +* Fine-tune and evaluate language models (LLMs / NLP) + +!!! warning + This library is in its early stages of development and might change significantly. + +### Key Features + +🚀 **Easy-to-use** and simple interface + +🌍 **Multi-lingual** dataset generation + +🤖 **Multiple LLMs** used to boost dataset **diversity** + +📝 **Flexible prompt**: default or custom + +🔄 **Prompt expansion** to maximize diversity + +🤗 **Hugging Face Integration**: Push generated datasets to the Hub, soon to Argilla + +## Quick Start + +### 1. Configuration +```python +from datafast import ClassificationConfig, TextClassificationDataset +from datafast.schema.config import PromptExpansionConfig + +config = ClassificationConfig( + classes=[ + {"name": "positive", "description": "Text expressing positive emotions or approval"}, + {"name": "negative", "description": "Text expressing negative emotions or criticism"} + ], + num_samples_per_prompt=5, + output_file="sentiment_dataset.jsonl", + languages={"en": "English"}, + expansion=PromptExpansionConfig( + placeholders={ + "context": ["product", "movie", "restaurant"], + "style": ["brief", "detailed"] + }, + combinatorial=True + ) +) +``` + +### 2. 
LLM Providers +```python +from datafast.llms import OpenAIProvider, AnthropicProvider, GoogleProvider + +providers = [ + OpenAIProvider(model_id="gpt-4o-mini"), + AnthropicProvider(model_id="claude-3-5-haiku-latest"), + GoogleProvider(model_id="gemini-1.5-flash") +] +``` + +### 3. Dataset Generation +```python +# Generate dataset +dataset = TextClassificationDataset(config) +dataset.generate(providers) + +# Optional: Push to Hugging Face Hub +dataset.push_to_hub( + repo_id="YOUR_USERNAME/sentiment-dataset", + train_size=0.8 +) +``` + +## Supported Dataset Types + +Currently supported dataset types: + +* ✅ Text Classification +* 📋 More coming soon! + +## Next Steps + +* Visit our [GitHub repository](https://github.com/patrickfleith/datafast) for the latest updates diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..9256e33 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,68 @@ +site_name: Datafast +site_description: A Python package for synthetic text dataset generation +site_author: Patrick Fleith + +# Repository +repo_name: patrickfleith/datafast +repo_url: https://github.com/patrickfleith/datafast + +# Theme configuration +theme: + name: material + features: + - navigation.sections + - navigation.top + - search.highlight + palette: + - scheme: default + primary: black + accent: indigo + toggle: + icon: material/weather-sunny + name: Switch to dark mode + - scheme: slate + primary: black + accent: indigo + toggle: + icon: material/weather-night + name: Switch to light mode + +# Extensions +markdown_extensions: + - admonition + - pymdownx.details + - pymdownx.superfences + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences + - tables + - toc: + permalink: true + - def_list + - attr_list + - md_in_html + - pymdownx.tasklist: + custom_checkbox: true + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: 
!!python/name:material.extensions.emoji.to_svg + - pymdownx.arithmatex: + generic: true + +# Navigation structure +nav: + - Home: index.md + - Getting Started: getting_started.md + - API Reference: api.md + +# Plugins +plugins: + - search + - mkdocstrings: + handlers: + python: + options: + show_source: true + show_root_heading: true From 2ff02104e3d3a04b97c461a27984b3c89fdafefc Mon Sep 17 00:00:00 2001 From: Patrick Date: Sat, 1 Feb 2025 12:00:58 +0100 Subject: [PATCH 2/2] forgot mkdocstring --- .github/workflows/deploy-docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index 6de2e7d..99ec86a 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -23,7 +23,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install mkdocs-material + pip install mkdocs-material mkdocstrings[python] mkdocs-material-extensions pip install -e . - name: Deploy