Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@


*openai-keys.env
*api-keys.env
**/*.ipynb_checkpoints/

.DS_Store
Expand Down
9 changes: 7 additions & 2 deletions DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,21 @@ How to set up your local machine.
```bash
pip install -r requirements.txt
```
- **Configure environment variables (optional)**
    - copy `api-keys.env.template` to `api-keys.env` and add your API keys.
- required fields for different providers are different, please refer to the [LiteLLM setup](https://docs.litellm.ai/docs#litellm-python-sdk) guide for more details.
- currently only endpoint, model, api_key, api_base, api_version are supported.
    - this helps Data Formulator automatically load the API keys when you run the app, so you don't need to set them in the app UI.

- **Run**
- **Run the app**
- **Windows**
```bash
.\local_server.bat
```

- **Unix-based**
```bash
.\local_server.sh
./local_server.sh
```

## Frontend (TypeScript)
Expand Down
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ Transform data and create rich visualizations iteratively with AI 🪄. Try Data

## News 🔥🔥🔥

- [02-12-2025] More models supported now! Powered by [LiteLLM](https://github.com/BerriAI/litellm)!
- Now supports OpenAI, Azure, Ollama, and Anthropic models (and more based on LiteLLM);
- Models with strong code generation capabilities are recommended (gpt-4o, claude-3-5-sonnet etc.);
- You can store API keys in `api-keys.env` to avoid typing them every time (see template `api-keys.env.template`).
- Let us know which models you have good/bad experiences with, and what models you would like to see supported! [[comment here]](https://github.com/microsoft/data-formulator/issues/49)

- [11-07-2024] Minor fun update: data visualization challenges!
- We added a few visualization challenges with the sample datasets. Can you complete them all? [[try them out!]](https://github.com/microsoft/data-formulator/issues/53#issue-2641841252)
- Comment in the issue when you did, or share your results/questions with others! [[comment here]](https://github.com/microsoft/data-formulator/issues/53)
Expand Down Expand Up @@ -77,7 +83,7 @@ Play with Data Formulator with one of the following options:

## Using Data Formulator

Once youve completed the setup using either option, follow these steps to start using Data Formulator:
Once you've completed the setup using either option, follow these steps to start using Data Formulator:

### The basics of data visualization
* Provide OpenAI keys and select a model (GPT-4o suggested) and choose a dataset.
Expand Down
24 changes: 24 additions & 0 deletions api-keys.env.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# OpenAI Configuration
OPENAI_ENABLED=true
OPENAI_API_KEY=#your-openai-api-key
OPENAI_MODELS=gpt-4o,gpt-4o-mini # comma separated list of models

# Azure OpenAI Configuration
AZURE_ENABLED=true
AZURE_API_KEY=#your-azure-openai-api-key
AZURE_API_BASE=https://your-azure-openai-endpoint.openai.azure.com/
AZURE_API_VERSION=2024-02-15-preview
AZURE_MODELS=gpt-4o

# Anthropic Configuration
ANTHROPIC_ENABLED=true
ANTHROPIC_API_KEY=#your-anthropic-api-key
ANTHROPIC_MODELS=claude-3-5-sonnet-20241022,claude-3-5-haiku-20241022

# Ollama Configuration
OLLAMA_ENABLED=true
OLLAMA_API_BASE=http://localhost:11434
OLLAMA_MODELS=codellama:7b # models with good code generation capabilities recommended

# if you want to add other models, you can add them with PROVIDER_API_KEY=your-api-key, PROVIDER_MODELS=model1,model2 etc
# (replacing PROVIDER with the provider name like GEMINI, ANTHROPIC, AZURE, OPENAI, OLLAMA etc. as long as they are supported by LiteLLM)
10 changes: 3 additions & 7 deletions py-src/data_formulator/agents/agent_code_explanation.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,8 @@ def transform_data(df_0):

class CodeExplanationAgent(object):

def __init__(self, client, model):
def __init__(self, client):
self.client = client
self.model = model

def run(self, input_tables, code):

Expand All @@ -82,11 +81,8 @@ def run(self, input_tables, code):
{"role":"user","content": user_query}]

###### the part that calls open_ai
response = self.client.chat.completions.create(
model=self.model, messages = messages, temperature=0.7, max_tokens=1200,
top_p=0.95, n=1, frequency_penalty=0, presence_penalty=0, stop=None)
response = self.client.get_completion(messages = messages)

logger.info('\n=== explanation output ===>\n')
logger.info(response.choices[0].message.content)
logger.info(f"=== explanation output ===>\n{response.choices[0].message.content}\n")

return response.choices[0].message.content
7 changes: 2 additions & 5 deletions py-src/data_formulator/agents/agent_concept_derive.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,9 +167,8 @@

class ConceptDeriveAgent(object):

def __init__(self, client, model):
def __init__(self, client):
self.client = client
self.model = model

def run(self, input_table, input_fields, output_field, description, n=1):
"""derive a new concept based on input table, input fields, and output field name, (and description)
Expand All @@ -190,9 +189,7 @@ def run(self, input_table, input_fields, output_field, description, n=1):
{"role":"user","content": user_query}]

###### the part that calls open_ai
response = self.client.chat.completions.create(
model=self.model, messages = messages, temperature=0.7, max_tokens=1200,
top_p=0.95, n=n, frequency_penalty=0, presence_penalty=0, stop=None)
response = self.client.get_completion(messages = messages)

#log = {'messages': messages, 'response': response.model_dump(mode='json')}

Expand Down
7 changes: 2 additions & 5 deletions py-src/data_formulator/agents/agent_data_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,7 @@

class DataCleanAgent(object):

def __init__(self, client, model):
self.model = model
def __init__(self, client):
self.client = client

def run(self, content_type, raw_data, image_cleaning_instruction):
Expand Down Expand Up @@ -129,9 +128,7 @@ def run(self, content_type, raw_data, image_cleaning_instruction):
messages = [system_message, user_prompt]

###### the part that calls open_ai
response = self.client.chat.completions.create(
model=self.model, messages = messages, temperature=0.7, max_tokens=1200,
top_p=0.95, n=1, frequency_penalty=0, presence_penalty=0, stop=None)
response = self.client.get_completion(messages = messages)

candidates = []
for choice in response.choices:
Expand Down
11 changes: 3 additions & 8 deletions py-src/data_formulator/agents/agent_data_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,8 @@ def filter_row(row, df):

class DataFilterAgent(object):

def __init__(self, client, model):
def __init__(self, client):
self.client = client
self.model = model

def process_gpt_result(self, input_table, response, messages):
#log = {'messages': messages, 'response': response.model_dump(mode='json')}
Expand Down Expand Up @@ -177,9 +176,7 @@ def run(self, input_table, description):
{"role":"user","content": user_query}]

###### the part that calls open_ai
response = self.client.chat.completions.create(
model=self.model, messages = messages, temperature=0.7, max_tokens=1200,
top_p=0.95, n=1, frequency_penalty=0, presence_penalty=0, stop=None)
response = self.client.get_completion(messages = messages)

return self.process_gpt_result(input_table, response, messages)

Expand All @@ -190,8 +187,6 @@ def followup(self, input_table, dialog, new_instruction: str, n=1):
"content": new_instruction + '\nupdate the filter function accordingly'}]

##### the part that calls open_ai
response = self.client.chat.completions.create(
model=self.model, messages=messages, temperature=0.7, max_tokens=1200,
top_p=0.95, n=n, frequency_penalty=0, presence_penalty=0, stop=None)
response = self.client.get_completion(messages = messages)

return self.process_gpt_result(input_table, response, messages)
7 changes: 2 additions & 5 deletions py-src/data_formulator/agents/agent_data_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,8 @@

class DataLoadAgent(object):

def __init__(self, client, model):
def __init__(self, client):
self.client = client
self.model = model

def run(self, input_data, n=1):

Expand All @@ -140,9 +139,7 @@ def run(self, input_data, n=1):
{"role":"user","content": user_query}]

###### the part that calls open_ai
response = self.client.chat.completions.create(
model=self.model, messages=messages, temperature=0.2, max_tokens=4096,
top_p=0.95, n=n, frequency_penalty=0, presence_penalty=0, stop=None)
response = self.client.get_completion(messages = messages)

#log = {'messages': messages, 'response': response.model_dump(mode='json')}

Expand Down
10 changes: 4 additions & 6 deletions py-src/data_formulator/agents/agent_data_rec.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,8 @@ def transform_data(df):

class DataRecAgent(object):

def __init__(self, client, model, system_prompt=None):
def __init__(self, client, system_prompt=None):
self.client = client
self.model = model
self.system_prompt = system_prompt if system_prompt is not None else SYSTEM_PROMPT

def process_gpt_response(self, input_tables, messages, response):
Expand Down Expand Up @@ -171,7 +170,7 @@ def process_gpt_response(self, input_tables, messages, response):
logger.warning(error_message)
result = {'status': 'other error', 'code': code_str, 'content': f"Unexpected error: {error_message}"}
else:
result = {'status': 'no transformation', 'code': "", 'content': input_tables[0]['rows']}
result = {'status': 'error', 'code': "", 'content': "No code block found in the response. The model is unable to generate code to complete the task."}

result['dialog'] = [*messages, {"role": choice.message.role, "content": choice.message.content}]
result['agent'] = 'DataRecAgent'
Expand All @@ -192,7 +191,7 @@ def run(self, input_tables, description, n=1):
messages = [{"role":"system", "content": self.system_prompt},
{"role":"user","content": user_query}]

response = completion_response_wrapper(self.client, self.model, messages, n)
response = completion_response_wrapper(self.client, messages, n)

return self.process_gpt_response(input_tables, messages, response)

Expand All @@ -204,7 +203,6 @@ def followup(self, input_tables, dialog, new_instruction: str, n=1):

messages = [*dialog, {"role":"user", "content": f"Update: \n\n{new_instruction}"}]

##### the part that calls open_ai
response = completion_response_wrapper(self.client, self.model, messages, n)
response = completion_response_wrapper(self.client, messages, n)

return self.process_gpt_response(input_tables, messages, response)
29 changes: 17 additions & 12 deletions py-src/data_formulator/agents/agent_data_transform_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Licensed under the MIT License.

import json
import sys

from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary, extract_code_from_gpt_response
import data_formulator.py_sandbox as py_sandbox
Expand All @@ -10,6 +11,7 @@

import logging

# Replace/update the logger configuration
logger = logging.getLogger(__name__)

SYSTEM_PROMPT = '''You are a data scientist to help user to transform data that will be used for visualization.
Expand Down Expand Up @@ -178,12 +180,10 @@ def transform_data(df):
```
'''

def completion_response_wrapper(client, model, messages, n):
def completion_response_wrapper(client, messages, n):
### wrapper for completion response, especially handling errors
try:
response = client.chat.completions.create(
model=model, messages=messages, temperature=0.7, max_tokens=1200,
top_p=0.95, n=n, frequency_penalty=0, presence_penalty=0, stop=None)
response = client.get_completion(messages = messages)
except Exception as e:
response = e

Expand All @@ -192,9 +192,8 @@ def completion_response_wrapper(client, model, messages, n):

class DataTransformationAgentV2(object):

def __init__(self, client, model, system_prompt=None):
def __init__(self, client, system_prompt=None):
self.client = client
self.model = model
self.system_prompt = system_prompt if system_prompt is not None else SYSTEM_PROMPT

def process_gpt_response(self, input_tables, messages, response):
Expand All @@ -210,8 +209,8 @@ def process_gpt_response(self, input_tables, messages, response):

candidates = []
for choice in response.choices:
# logger.info("\n=== Data transformation result ===>\n")
# logger.info(choice.message.content + "\n")
logger.info("=== Data transformation result ===>")
logger.info(choice.message.content + "\n")

json_blocks = extract_json_objects(choice.message.content + "\n")
if len(json_blocks) > 0:
Expand All @@ -221,6 +220,9 @@ def process_gpt_response(self, input_tables, messages, response):

code_blocks = extract_code_from_gpt_response(choice.message.content + "\n", "python")

logger.info("=== Code blocks ===>")
logger.info(code_blocks)

if len(code_blocks) > 0:
code_str = code_blocks[-1]

Expand All @@ -237,15 +239,18 @@ def process_gpt_response(self, input_tables, messages, response):
logger.warning('Error occurred during code execution:')
error_message = f"An error occurred during code execution. Error type: {type(e).__name__}"
logger.warning(error_message)
result = {'status': 'other error', 'code': code_str, 'content': error_message}
result = {'status': 'error', 'code': code_str, 'content': error_message}
else:
result = {'status': 'no transformation', 'code': "", 'content': input_tables[0]['rows']}
result = {'status': 'error', 'code': "", 'content': "No code block found in the response. The model is unable to generate code to complete the task."}

result['dialog'] = [*messages, {"role": choice.message.role, "content": choice.message.content}]
result['agent'] = 'DataTransformationAgent'
result['refined_goal'] = refined_goal
candidates.append(result)

logger.info("=== Candidates ===>")
logger.info(candidates)

return candidates


Expand All @@ -265,7 +270,7 @@ def run(self, input_tables, description, expected_fields: list[str], n=1):
messages = [{"role":"system", "content": self.system_prompt},
{"role":"user","content": user_query}]

response = completion_response_wrapper(self.client, self.model, messages, n)
response = completion_response_wrapper(self.client, messages, n)

return self.process_gpt_response(input_tables, messages, response)

Expand All @@ -287,6 +292,6 @@ def followup(self, input_tables, dialog, output_fields: list[str], new_instructi
messages = [*updated_dialog, {"role":"user",
"content": f"Update the code above based on the following instruction:\n\n{json.dumps(goal, indent=4)}"}]

response = completion_response_wrapper(self.client, self.model, messages, n)
response = completion_response_wrapper(self.client, messages, n)

return self.process_gpt_response(input_tables, messages, response)
11 changes: 3 additions & 8 deletions py-src/data_formulator/agents/agent_data_transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,8 @@ def transform_data(df_0):

class DataTransformationAgent(object):

def __init__(self, client, model):
def __init__(self, client):
self.client = client
self.model = model

def process_gpt_response(self, input_tables, messages, response):
"""process gpt response to handle execution"""
Expand Down Expand Up @@ -185,9 +184,7 @@ def run(self, input_tables, description, expected_fields: list[str], n=1, enrich
{"role":"user","content": user_query}]

###### the part that calls open_ai
response = self.client.chat.completions.create(
model=self.model, messages = messages, temperature=0.7, max_tokens=1200,
top_p=0.95, n=n, frequency_penalty=0, presence_penalty=0, stop=None)
response = self.client.get_completion(messages = messages)

return self.process_gpt_response(input_tables, messages, response)

Expand All @@ -207,9 +204,7 @@ def followup(self, input_tables, dialog, output_fields: list[str], new_instructi
"content": "Update the code above based on the following instruction:\n\n" + new_instruction + output_fields_instr}]

##### the part that calls open_ai
response = self.client.chat.completions.create(
model=self.model, messages=messages, temperature=0.7, max_tokens=1200,
top_p=0.95, n=n, frequency_penalty=0, presence_penalty=0, stop=None)
response = self.client.get_completion(messages = messages)

logger.info(response)

Expand Down
Loading
Loading