Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 1 addition & 13 deletions .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,4 @@
DISABLE_DISPLAY_KEYS=false # if true, the display keys will not be shown in the frontend
EXEC_PYTHON_IN_SUBPROCESS=false # if true, the python code will be executed in a subprocess to avoid crashing the main app, but it will increase the time of response

LOCAL_DB_DIR= # the directory to store the local database, if not provided, the app will use the temp directory

# External database connection settings
# check https://duckdb.org/docs/stable/extensions/mysql.html
# and https://duckdb.org/docs/stable/extensions/postgres.html
USE_EXTERNAL_DB=false # if true, the app will use an external database instead of the one in the app
DB_NAME=mysql_db # the name to refer to this database connection
DB_TYPE=mysql # mysql or postgresql
DB_HOST=localhost
DB_PORT=0
DB_DATABASE=mysql
DB_USER=root
DB_PASSWORD=
LOCAL_DB_DIR= # the directory to store the local database, if not provided, the app will use the temp directory
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 
[![YouTube](https://img.shields.io/badge/YouTube-white?logo=youtube&logoColor=%23FF0000)](https://youtu.be/3ndlwt0Wi3c) 
[![build](https://github.com/microsoft/data-formulator/actions/workflows/python-build.yml/badge.svg)](https://github.com/microsoft/data-formulator/actions/workflows/python-build.yml)
[![Discord](https://img.shields.io/badge/discord-chat-green?logo=discord)](https://discord.gg/mYCZMQKYZb)

</div>

Expand All @@ -22,6 +23,14 @@ Transform data and create rich visualizations iteratively with AI 🪄. Try Data

## News 🔥🔥🔥

- [05-13-2025] Data Formulator 0.2.1: External Data Loader
- We introduced external data loader class to make import data easier. [Readme](https://github.com/microsoft/data-formulator/tree/main/py-src/data_formulator/data_loader) and [Demo](https://github.com/microsoft/data-formulator/pull/155)
- Example data loaders from MySQL and Azure Data Explorer (Kusto) are provided.
- Call to action [link](https://github.com/microsoft/data-formulator/issues/156):
- Users: let us know which data source you'd like to load data from.
- Developers: let's build more data loaders.
- Discord channel for discussions: join us! [![Discord](https://img.shields.io/badge/discord-chat-green?logo=discord)](https://discord.gg/mYCZMQKYZb)

- [04-23-2025] Data Formulator 0.2: working with *large* data 📦📦📦
- Explore large data by:
1. Upload large data file to the local database (powered by [DuckDB](https://github.com/duckdb/duckdb)).
Expand Down
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
"version": "0.1.0",
"private": true,
"dependencies": {
"@emotion/react": "^11.9.0",
"@emotion/styled": "^11.8.1",
"@emotion/react": "^11.14.0",
"@emotion/styled": "^11.14.0",
"@fontsource/roboto": "^4.5.5",
"@mui/icons-material": "^5.14.0",
"@mui/material": "^5.6.0",
"@mui/material": "^7.0.2",
"@reduxjs/toolkit": "^1.8.6",
"@types/dompurify": "^3.0.5",
"@types/validator": "^13.12.2",
Expand Down
25 changes: 23 additions & 2 deletions py-src/data_formulator/agent_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from data_formulator.agents.agent_data_load import DataLoadAgent
from data_formulator.agents.agent_data_clean import DataCleanAgent
from data_formulator.agents.agent_code_explanation import CodeExplanationAgent

from data_formulator.agents.agent_query_completion import QueryCompletionAgent
from data_formulator.agents.client_utils import Client

from data_formulator.db_manager import db_manager
Expand Down Expand Up @@ -437,4 +437,25 @@ def request_code_expl():
expl = code_expl_agent.run(input_tables, code)
else:
expl = ""
return expl
return expl

@agent_bp.route('/query-completion', methods=['POST'])
def query_completion():
    """Complete a natural-language or partial query for a data source.

    Expects a JSON body with `model`, `data_source_metadata`, and `query`;
    responds with the agent's reasoning (JSON object) and the completed query.
    Non-JSON requests get an error payload instead.
    """
    if not request.is_json:
        payload = {"token": "", "status": "error", "reasoning": "unable to complete query", "query": ""}
    else:
        logger.info("# request data: ")
        body = request.get_json()

        agent = QueryCompletionAgent(client=get_client(body['model']))
        reasoning, completed_query = agent.run(body["data_source_metadata"], body["query"])

        payload = {"token": "", "status": "ok", "reasoning": reasoning, "query": completed_query}

    response = flask.jsonify(payload)
    # allow the frontend (possibly served from another origin) to call this endpoint
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
2 changes: 1 addition & 1 deletion py-src/data_formulator/agents/agent_py_data_rec.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def process_gpt_response(self, input_tables, messages, response):
if result['status'] == 'ok':
result_df = result['content']
result['content'] = {
'rows': result_df.to_dict(orient='records'),
'rows': json.loads(result_df.to_json(orient='records')),
}
else:
logger.info(result['content'])
Expand Down
4 changes: 1 addition & 3 deletions py-src/data_formulator/agents/agent_py_data_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,13 +221,11 @@ def process_gpt_response(self, input_tables, messages, response):
result = py_sandbox.run_transform_in_sandbox2020(code_str, [pd.DataFrame.from_records(t['rows']) for t in input_tables], self.exec_python_in_subprocess)
result['code'] = code_str

print(f"result: {result}")

if result['status'] == 'ok':
# parse the content
result_df = result['content']
result['content'] = {
'rows': result_df.to_dict(orient='records'),
'rows': json.loads(result_df.to_json(orient='records')),
}
else:
logger.info(result['content'])
Expand Down
80 changes: 80 additions & 0 deletions py-src/data_formulator/agents/agent_query_completion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import pandas as pd
import json

from data_formulator.agents.agent_utils import extract_code_from_gpt_response, extract_json_objects
import re
import logging


logger = logging.getLogger(__name__)


SYSTEM_PROMPT = '''You are a data scientist to help with data queries.
The user will provide you with a description of the data source and tables available in the [DATA SOURCE] section and a query in the [USER INPUTS] section.
You will need to help the user complete the query and provide reasoning for the query you generated in the [OUTPUT] section.

Input format:
* The data source description is a json object with the following fields:
* `data_source`: the name of the data source
* `tables`: a list of tables in the data source, which maps the table name to the list of columns available in the table.
* The user input is a natural language description of the query or a partial query you need to complete.

Steps:
* Based on data source description and user input, you should first decide on what language should be used to query the data.
* Then, describe the logic for the query you generated in a json object in a block ```json``` with the following fields:
* `language`: the language of the query you generated
* `tables`: the names of the tables you will use in the query
* `logic`: the reasoning behind why you chose the tables and the logic for the query you generated
* Finally, generate the complete query in the language specified in a code block ```{language}```.

Output format:
* The output should be in the following format, no other text should be included:

[REASONING]
```json
{
"language": {language},
"tables": {tables},
"logic": {logic}
}
```

[QUERY]
```{language}
{query}
```
'''

class QueryCompletionAgent(object):
    """Agent that completes a natural-language or partial query against a
    described data source, using an LLM client.

    The model is prompted (see SYSTEM_PROMPT) to answer with a [REASONING]
    json block followed by a [QUERY] code block; `run` parses that structure.
    """

    def __init__(self, client):
        # client must expose get_completion(messages=...) returning an
        # OpenAI-style response (response.choices[0].message.content)
        self.client = client

    def run(self, data_source_metadata, query):
        """Complete `query` for the data source described by `data_source_metadata`.

        Args:
            data_source_metadata: JSON-serializable description of the data
                source (name + tables/columns); embedded verbatim in the prompt.
            query: natural-language description or partial query text.

        Returns:
            (reasoning, output_query): the parsed reasoning json object and the
            completed query string with the ```{language}``` fence stripped.

        Raises:
            ValueError: if the model response does not contain the expected
                [QUERY] section or a parsable reasoning json object.
        """
        user_query = f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}\n\n[USER INPUTS]\n\n{query}\n\n[REASONING]\n"

        logger.info(user_query)

        messages = [{"role":"system", "content": SYSTEM_PROMPT},
                    {"role":"user","content": user_query}]

        ###### the part that calls open_ai
        response = self.client.get_completion(messages = messages)
        # the user message ends with "[REASONING]" to prime the model, so that
        # marker is absent from the completion itself; re-attach it for parsing
        response_content = '[REASONING]\n' + response.choices[0].message.content

        logger.info(f"=== query completion output ===>\n{response_content}\n")

        # guard against the model deviating from the prescribed output format:
        # the original unguarded split(...)[1] raised a bare IndexError here
        if "[QUERY]" not in response_content:
            raise ValueError("query completion response is missing the [QUERY] section")

        reasoning_section = response_content.split("[REASONING]")[1].split("[QUERY]")[0].strip()
        json_objects = extract_json_objects(reasoning_section)
        if not json_objects:
            raise ValueError("query completion response contains no reasoning json object")
        reasoning = json_objects[0]

        output_query = response_content.split("[QUERY]")[1].strip()

        # Extract the query by removing the ```{language} ... ``` fence markers
        language_pattern = r"```(\w+)\s+(.*?)```"
        match = re.search(language_pattern, output_query, re.DOTALL)
        if match:
            output_query = match.group(2).strip()

        return reasoning, output_query
1 change: 1 addition & 0 deletions py-src/data_formulator/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from data_formulator.tables_routes import tables_bp
from data_formulator.agent_routes import agent_bp


app = Flask(__name__, static_url_path='', static_folder=os.path.join(APP_ROOT, "dist"))
app.secret_key = secrets.token_hex(16) # Generate a random secret key for sessions

Expand Down
36 changes: 36 additions & 0 deletions py-src/data_formulator/data_loader/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
## Data Loader Module

This module provides a framework for loading data from various external sources into DuckDB. It follows an abstract base class pattern to ensure consistent implementation across different data sources.

### Building a New Data Loader

The abstract class `ExternalDataLoader` defines the data loader interface. Each concrete implementation (e.g., `KustoDataLoader`, `MySQLDataLoader`) handles specific data source connections and data ingestion.

To create a new data loader:

1. Create a new class that inherits from `ExternalDataLoader`
2. Implement the required abstract methods:
- `list_params()`: Define required connection parameters
- `__init__()`: Initialize connection to data source
- `list_tables()`: List available tables/views
- `ingest_data()`: Load data from source
- `view_query_sample()`: Preview query results
- `ingest_data_from_query()`: Load data from custom query
3. Register the new class into `__init__.py` so that the front-end can automatically discover the new data loader.

The UI automatically provides the query completion option to help users generate queries for the given data loader (from natural language or partial queries).

### Example Implementations

- `KustoDataLoader`: Azure Data Explorer (Kusto) integration
- `MySQLDataLoader`: MySQL database integration

### Testing

Ensure your implementation:
- Handles connection errors gracefully
- Properly sanitizes table names
- Respects size limits for data ingestion
- Returns consistent metadata format

Launch the front-end and test the data loader.
10 changes: 10 additions & 0 deletions py-src/data_formulator/data_loader/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from data_formulator.data_loader.external_data_loader import ExternalDataLoader
from data_formulator.data_loader.mysql_data_loader import MySQLDataLoader
from data_formulator.data_loader.kusto_data_loader import KustoDataLoader

# Registry mapping a data-source key to its loader class; register new
# loaders here so the front-end can automatically discover them.
DATA_LOADERS = {
    "mysql": MySQLDataLoader,
    "kusto": KustoDataLoader
}

__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "DATA_LOADERS"]
90 changes: 90 additions & 0 deletions py-src/data_formulator/data_loader/external_data_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from abc import ABC, abstractmethod
from typing import Dict, Any, List
import pandas as pd
import json
import duckdb
import random
import string
import re

def sanitize_table_name(name_as: str) -> str:
    """Sanitize a user-supplied table name into a safe SQL identifier.

    Strips common SQL-injection tokens, replaces non-alphanumeric characters
    with underscores, prefixes names that start with a digit or collide with a
    SQL keyword, and truncates to 63 characters.

    Args:
        name_as: the proposed table name.

    Returns:
        A sanitized identifier matching [A-Za-z_][A-Za-z0-9_]*, at most 63 chars.

    Raises:
        ValueError: if the name is empty, or empty after sanitization.
    """
    if not name_as:
        raise ValueError("Table name cannot be empty")

    # Remove any SQL injection attempts
    name_as = name_as.replace(";", "").replace("--", "").replace("/*", "").replace("*/", "")

    # Replace invalid characters with underscores
    # This includes special characters, spaces, dots, dashes, and other non-alphanumeric chars
    sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', name_as)

    # BUGFIX: input made up solely of injection tokens (e.g. ";;") becomes
    # empty here; the original code then raised IndexError at sanitized[0]
    if not sanitized:
        raise ValueError("Table name cannot be empty")

    # Ensure the name starts with a letter or underscore
    if not sanitized[0].isalpha() and sanitized[0] != '_':
        sanitized = '_' + sanitized

    # Ensure the name is not a SQL keyword
    sql_keywords = {
        'SELECT', 'FROM', 'WHERE', 'GROUP', 'BY', 'ORDER', 'HAVING', 'LIMIT',
        'OFFSET', 'JOIN', 'INNER', 'LEFT', 'RIGHT', 'FULL', 'OUTER', 'ON',
        'AND', 'OR', 'NOT', 'NULL', 'TRUE', 'FALSE', 'UNION', 'ALL', 'DISTINCT',
        'INSERT', 'UPDATE', 'DELETE', 'CREATE', 'DROP', 'TABLE', 'VIEW', 'INDEX',
        'ALTER', 'ADD', 'COLUMN', 'PRIMARY', 'KEY', 'FOREIGN', 'REFERENCES',
        'CONSTRAINT', 'DEFAULT', 'CHECK', 'UNIQUE', 'CASCADE', 'RESTRICT'
    }

    if sanitized.upper() in sql_keywords:
        sanitized = '_' + sanitized

    # Ensure the name is not too long (common SQL limit is 63 characters)
    if len(sanitized) > 63:
        sanitized = sanitized[:63]

    return sanitized

class ExternalDataLoader(ABC):
    """Abstract base class for loaders that ingest data from an external
    source (e.g. MySQL, Kusto) into a local DuckDB connection.

    Concrete subclasses implement the connection lifecycle and the query /
    ingestion methods declared abstract below; `ingest_df_to_duckdb` is the
    shared helper they use to materialize a DataFrame as a DuckDB table.
    """

    def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str):
        """Create a new DuckDB table from `df` under `table_name`.

        If the name is taken, appends a numeric suffix (name_1, name_2, ...)
        until a free name is found.

        NOTE(review): `table_name` is interpolated into the CREATE TABLE
        statement below; callers are expected to pass it through
        sanitize_table_name first — confirm at each call site.
        """
        base_name = table_name
        counter = 1
        while True:
            # Check if table exists — use a prepared-statement parameter
            # rather than interpolating the name into the SQL string
            exists = self.duck_db_conn.execute(
                "SELECT COUNT(*) FROM duckdb_tables() WHERE table_name = ?",
                [table_name]
            ).fetchone()[0] > 0
            if not exists:
                break
            # If exists, append counter to base name
            table_name = f"{base_name}_{counter}"
            counter += 1

        # Register the frame under a collision-resistant temporary view name,
        # copy it into a real table, then drop the view
        random_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
        self.duck_db_conn.register(f'df_temp_{random_suffix}', df)
        self.duck_db_conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df_temp_{random_suffix}")
        self.duck_db_conn.execute(f"DROP VIEW df_temp_{random_suffix}")  # registered DataFrames appear as views

    @staticmethod
    @abstractmethod
    def list_params() -> List[Dict[str, Any]]:
        """Return the connection parameters this loader requires (shown in the UI)."""
        pass

    @abstractmethod
    def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
        """Open the connection to the external source and keep `duck_db_conn`."""
        pass

    @abstractmethod
    def list_tables(self) -> List[Dict[str, Any]]:
        # should include: table_name, column_names, column_types, sample_data
        pass

    @abstractmethod
    def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000):
        """Load up to `size` rows of `table_name` from the source into DuckDB."""
        pass

    @abstractmethod
    def view_query_sample(self, query: str) -> str:
        """Run `query` against the source and return a small preview of results."""
        pass

    @abstractmethod
    def ingest_data_from_query(self, query: str, name_as: str):
        """Run `query` against the source and ingest the result as `name_as`."""
        pass

Loading