Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 1 addition & 13 deletions .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,4 @@
DISABLE_DISPLAY_KEYS=false # if true, the display keys will not be shown in the frontend
EXEC_PYTHON_IN_SUBPROCESS=false # if true, the python code will be executed in a subprocess to avoid crashing the main app, but it will increase the time of response

LOCAL_DB_DIR= # the directory to store the local database, if not provided, the app will use the temp directory

# External database connection settings
# check https://duckdb.org/docs/stable/extensions/mysql.html
# and https://duckdb.org/docs/stable/extensions/postgres.html
USE_EXTERNAL_DB=false # if true, the app will use an external database instead of the one in the app
DB_NAME=mysql_db # the name to refer to this database connection
DB_TYPE=mysql # mysql or postgresql
DB_HOST=localhost
DB_PORT=0
DB_DATABASE=mysql
DB_USER=root
DB_PASSWORD=
LOCAL_DB_DIR= # the directory to store the local database, if not provided, the app will use the temp directory
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 
[![YouTube](https://img.shields.io/badge/YouTube-white?logo=youtube&logoColor=%23FF0000)](https://youtu.be/3ndlwt0Wi3c) 
[![build](https://github.com/microsoft/data-formulator/actions/workflows/python-build.yml/badge.svg)](https://github.com/microsoft/data-formulator/actions/workflows/python-build.yml)
[![Discord](https://img.shields.io/badge/discord-chat-green?logo=discord)](https://discord.gg/mYCZMQKYZb)

</div>

Expand All @@ -22,6 +23,14 @@ Transform data and create rich visualizations iteratively with AI 🪄. Try Data

## News 🔥🔥🔥

- [05-13-2025] Data Formulator 0.2.1: External Data Loader
- We introduced external data loader class to make import data easier. [Readme](https://github.com/microsoft/data-formulator/tree/main/py-src/data_formulator/data_loader) and [Demo](https://github.com/microsoft/data-formulator/pull/155)
- Example data loaders from MySQL and Azure Data Explorer (Kusto) are provided.
- Call to action [link](https://github.com/microsoft/data-formulator/issues/156):
- Users: let us know which data source you'd like to load data from.
- Developers: let's build more data loaders.
- Discord channel for discussions: join us! [![Discord](https://img.shields.io/badge/discord-chat-green?logo=discord)](https://discord.gg/mYCZMQKYZb)

- [04-23-2025] Data Formulator 0.2: working with *large* data 📦📦📦
- Explore large data by:
1. Upload large data file to the local database (powered by [DuckDB](https://github.com/duckdb/duckdb)).
Expand Down
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
"version": "0.1.0",
"private": true,
"dependencies": {
"@emotion/react": "^11.9.0",
"@emotion/styled": "^11.8.1",
"@emotion/react": "^11.14.0",
"@emotion/styled": "^11.14.0",
"@fontsource/roboto": "^4.5.5",
"@mui/icons-material": "^5.14.0",
"@mui/material": "^5.6.0",
"@mui/material": "^7.0.2",
"@reduxjs/toolkit": "^1.8.6",
"@types/dompurify": "^3.0.5",
"@types/validator": "^13.12.2",
Expand Down
25 changes: 23 additions & 2 deletions py-src/data_formulator/agent_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from data_formulator.agents.agent_data_load import DataLoadAgent
from data_formulator.agents.agent_data_clean import DataCleanAgent
from data_formulator.agents.agent_code_explanation import CodeExplanationAgent

from data_formulator.agents.agent_query_completion import QueryCompletionAgent
from data_formulator.agents.client_utils import Client

from data_formulator.db_manager import db_manager
Expand Down Expand Up @@ -437,4 +437,25 @@ def request_code_expl():
expl = code_expl_agent.run(input_tables, code)
else:
expl = ""
return expl
return expl

@agent_bp.route('/query-completion', methods=['POST'])
def query_completion():
    """Complete a natural-language or partial query for a data source.

    Expects a JSON body with `model`, `data_source_metadata`, and `query`;
    responds with the agent's reasoning (JSON object) and the completed query.
    Non-JSON requests get an error payload instead.
    """
    if not request.is_json:
        payload = {"token": "", "status": "error", "reasoning": "unable to complete query", "query": ""}
    else:
        logger.info("# request data: ")
        body = request.get_json()

        agent = QueryCompletionAgent(client=get_client(body['model']))
        reasoning, completed_query = agent.run(body["data_source_metadata"], body["query"])

        payload = {"token": "", "status": "ok", "reasoning": reasoning, "query": completed_query}

    response = flask.jsonify(payload)
    # allow the frontend (possibly served from another origin) to call this endpoint
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
2 changes: 1 addition & 1 deletion py-src/data_formulator/agents/agent_py_data_rec.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def process_gpt_response(self, input_tables, messages, response):
if result['status'] == 'ok':
result_df = result['content']
result['content'] = {
'rows': result_df.to_dict(orient='records'),
'rows': json.loads(result_df.to_json(orient='records')),
}
else:
logger.info(result['content'])
Expand Down
4 changes: 1 addition & 3 deletions py-src/data_formulator/agents/agent_py_data_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,13 +221,11 @@ def process_gpt_response(self, input_tables, messages, response):
result = py_sandbox.run_transform_in_sandbox2020(code_str, [pd.DataFrame.from_records(t['rows']) for t in input_tables], self.exec_python_in_subprocess)
result['code'] = code_str

print(f"result: {result}")

if result['status'] == 'ok':
# parse the content
result_df = result['content']
result['content'] = {
'rows': result_df.to_dict(orient='records'),
'rows': json.loads(result_df.to_json(orient='records')),
}
else:
logger.info(result['content'])
Expand Down
80 changes: 80 additions & 0 deletions py-src/data_formulator/agents/agent_query_completion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import pandas as pd
import json

from data_formulator.agents.agent_utils import extract_code_from_gpt_response, extract_json_objects
import re
import logging


logger = logging.getLogger(__name__)


SYSTEM_PROMPT = '''You are a data scientist to help with data queries.
The user will provide you with a description of the data source and tables available in the [DATA SOURCE] section and a query in the [USER INPUTS] section.
You will need to help the user complete the query and provide reasoning for the query you generated in the [OUTPUT] section.

Input format:
* The data source description is a json object with the following fields:
* `data_source`: the name of the data source
* `tables`: a list of tables in the data source, which maps the table name to the list of columns available in the table.
* The user input is a natural language description of the query or a partial query you need to complete.

Steps:
* Based on data source description and user input, you should first decide on what language should be used to query the data.
* Then, describe the logic for the query you generated in a json object in a block ```json``` with the following fields:
* `language`: the language of the query you generated
* `tables`: the names of the tables you will use in the query
* `logic`: the reasoning behind why you chose the tables and the logic for the query you generated
* Finally, generate the complete query in the language specified in a code block ```{language}```.

Output format:
* The output should be in the following format, no other text should be included:

[REASONING]
```json
{
"language": {language},
"tables": {tables},
"logic": {logic}
}
```

[QUERY]
```{language}
{query}
```
'''

class QueryCompletionAgent(object):
    """Agent that completes a natural-language or partial query against a
    described data source, using an LLM client.

    The model is prompted (see SYSTEM_PROMPT) to answer with a [REASONING]
    json block followed by a [QUERY] code block; `run` parses that structure.
    """

    def __init__(self, client):
        # client must expose get_completion(messages=...) returning an
        # OpenAI-style response (response.choices[0].message.content)
        self.client = client

    def run(self, data_source_metadata, query):
        """Complete `query` for the data source described by `data_source_metadata`.

        Args:
            data_source_metadata: JSON-serializable description of the data
                source (name + tables/columns); embedded verbatim in the prompt.
            query: natural-language description or partial query text.

        Returns:
            (reasoning, output_query): the parsed reasoning json object and the
            completed query string with the ```{language}``` fence stripped.

        Raises:
            ValueError: if the model response does not contain the expected
                [QUERY] section or a parsable reasoning json object.
        """
        user_query = f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}\n\n[USER INPUTS]\n\n{query}\n\n[REASONING]\n"

        logger.info(user_query)

        messages = [{"role":"system", "content": SYSTEM_PROMPT},
                    {"role":"user","content": user_query}]

        ###### the part that calls open_ai
        response = self.client.get_completion(messages = messages)
        # the user message ends with "[REASONING]" to prime the model, so that
        # marker is absent from the completion itself; re-attach it for parsing
        response_content = '[REASONING]\n' + response.choices[0].message.content

        logger.info(f"=== query completion output ===>\n{response_content}\n")

        # guard against the model deviating from the prescribed output format:
        # the original unguarded split(...)[1] raised a bare IndexError here
        if "[QUERY]" not in response_content:
            raise ValueError("query completion response is missing the [QUERY] section")

        reasoning_section = response_content.split("[REASONING]")[1].split("[QUERY]")[0].strip()
        json_objects = extract_json_objects(reasoning_section)
        if not json_objects:
            raise ValueError("query completion response contains no reasoning json object")
        reasoning = json_objects[0]

        output_query = response_content.split("[QUERY]")[1].strip()

        # Extract the query by removing the ```{language} ... ``` fence markers
        language_pattern = r"```(\w+)\s+(.*?)```"
        match = re.search(language_pattern, output_query, re.DOTALL)
        if match:
            output_query = match.group(2).strip()

        return reasoning, output_query
1 change: 1 addition & 0 deletions py-src/data_formulator/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from data_formulator.tables_routes import tables_bp
from data_formulator.agent_routes import agent_bp


app = Flask(__name__, static_url_path='', static_folder=os.path.join(APP_ROOT, "dist"))
app.secret_key = secrets.token_hex(16) # Generate a random secret key for sessions

Expand Down
36 changes: 36 additions & 0 deletions py-src/data_formulator/data_loader/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
## Data Loader Module

This module provides a framework for loading data from various external sources into DuckDB. It follows an abstract base class pattern to ensure consistent implementation across different data sources.

### Building a New Data Loader

The abstract class `ExternalDataLoader` defines the data loader interface. Each concrete implementation (e.g., `KustoDataLoader`, `MySQLDataLoader`) handles specific data source connections and data ingestion.

To create a new data loader:

1. Create a new class that inherits from `ExternalDataLoader`
2. Implement the required abstract methods:
- `list_params()`: Define required connection parameters
- `__init__()`: Initialize connection to data source
- `list_tables()`: List available tables/views
- `ingest_data()`: Load data from source
- `view_query_sample()`: Preview query results
- `ingest_data_from_query()`: Load data from custom query
3. Register the new class into `__init__.py` so that the front-end can automatically discover the new data loader.

The UI automatically provides the query completion option to help users generate queries for the given data loader (from natural language or partial queries).

### Example Implementations

- `KustoDataLoader`: Azure Data Explorer (Kusto) integration
- `MySQLDataLoader`: MySQL database integration

### Testing

Ensure your implementation:
- Handles connection errors gracefully
- Properly sanitizes table names
- Respects size limits for data ingestion
- Returns consistent metadata format

Launch the front-end and test the data loader.
10 changes: 10 additions & 0 deletions py-src/data_formulator/data_loader/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from data_formulator.data_loader.external_data_loader import ExternalDataLoader
from data_formulator.data_loader.mysql_data_loader import MySQLDataLoader
from data_formulator.data_loader.kusto_data_loader import KustoDataLoader

# Registry mapping a data-source key to its loader class; register new
# loaders here so the front-end can automatically discover them.
DATA_LOADERS = {
    "mysql": MySQLDataLoader,
    "kusto": KustoDataLoader
}

__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "DATA_LOADERS"]
90 changes: 90 additions & 0 deletions py-src/data_formulator/data_loader/external_data_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from abc import ABC, abstractmethod
from typing import Dict, Any, List
import pandas as pd
import json
import duckdb
import random
import string
import re

def sanitize_table_name(name_as: str) -> str:
    """Sanitize a user-supplied table name into a safe SQL identifier.

    Strips common SQL-injection tokens, replaces non-alphanumeric characters
    with underscores, prefixes names that start with a digit or collide with a
    SQL keyword, and truncates to 63 characters.

    Args:
        name_as: the proposed table name.

    Returns:
        A sanitized identifier matching [A-Za-z_][A-Za-z0-9_]*, at most 63 chars.

    Raises:
        ValueError: if the name is empty, or empty after sanitization.
    """
    if not name_as:
        raise ValueError("Table name cannot be empty")

    # Remove any SQL injection attempts
    name_as = name_as.replace(";", "").replace("--", "").replace("/*", "").replace("*/", "")

    # Replace invalid characters with underscores
    # This includes special characters, spaces, dots, dashes, and other non-alphanumeric chars
    sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', name_as)

    # BUGFIX: input made up solely of injection tokens (e.g. ";;") becomes
    # empty here; the original code then raised IndexError at sanitized[0]
    if not sanitized:
        raise ValueError("Table name cannot be empty")

    # Ensure the name starts with a letter or underscore
    if not sanitized[0].isalpha() and sanitized[0] != '_':
        sanitized = '_' + sanitized

    # Ensure the name is not a SQL keyword
    sql_keywords = {
        'SELECT', 'FROM', 'WHERE', 'GROUP', 'BY', 'ORDER', 'HAVING', 'LIMIT',
        'OFFSET', 'JOIN', 'INNER', 'LEFT', 'RIGHT', 'FULL', 'OUTER', 'ON',
        'AND', 'OR', 'NOT', 'NULL', 'TRUE', 'FALSE', 'UNION', 'ALL', 'DISTINCT',
        'INSERT', 'UPDATE', 'DELETE', 'CREATE', 'DROP', 'TABLE', 'VIEW', 'INDEX',
        'ALTER', 'ADD', 'COLUMN', 'PRIMARY', 'KEY', 'FOREIGN', 'REFERENCES',
        'CONSTRAINT', 'DEFAULT', 'CHECK', 'UNIQUE', 'CASCADE', 'RESTRICT'
    }

    if sanitized.upper() in sql_keywords:
        sanitized = '_' + sanitized

    # Ensure the name is not too long (common SQL limit is 63 characters)
    if len(sanitized) > 63:
        sanitized = sanitized[:63]

    return sanitized

class ExternalDataLoader(ABC):
    """Abstract base class for loaders that ingest data from an external
    source (e.g. MySQL, Kusto) into a local DuckDB connection.

    Concrete subclasses implement the connection lifecycle and the query /
    ingestion methods declared abstract below; `ingest_df_to_duckdb` is the
    shared helper they use to materialize a DataFrame as a DuckDB table.
    """

    def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str):
        """Create a new DuckDB table from `df` under `table_name`.

        If the name is taken, appends a numeric suffix (name_1, name_2, ...)
        until a free name is found.

        NOTE(review): `table_name` is interpolated into the CREATE TABLE
        statement below; callers are expected to pass it through
        sanitize_table_name first — confirm at each call site.
        """
        base_name = table_name
        counter = 1
        while True:
            # Check if table exists — use a prepared-statement parameter
            # rather than interpolating the name into the SQL string
            exists = self.duck_db_conn.execute(
                "SELECT COUNT(*) FROM duckdb_tables() WHERE table_name = ?",
                [table_name]
            ).fetchone()[0] > 0
            if not exists:
                break
            # If exists, append counter to base name
            table_name = f"{base_name}_{counter}"
            counter += 1

        # Register the frame under a collision-resistant temporary view name,
        # copy it into a real table, then drop the view
        random_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
        self.duck_db_conn.register(f'df_temp_{random_suffix}', df)
        self.duck_db_conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df_temp_{random_suffix}")
        self.duck_db_conn.execute(f"DROP VIEW df_temp_{random_suffix}")  # registered DataFrames appear as views

    @staticmethod
    @abstractmethod
    def list_params() -> List[Dict[str, Any]]:
        """Return the connection parameters this loader requires (shown in the UI)."""
        pass

    @abstractmethod
    def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
        """Open the connection to the external source and keep `duck_db_conn`."""
        pass

    @abstractmethod
    def list_tables(self) -> List[Dict[str, Any]]:
        # should include: table_name, column_names, column_types, sample_data
        pass

    @abstractmethod
    def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000):
        """Load up to `size` rows of `table_name` from the source into DuckDB."""
        pass

    @abstractmethod
    def view_query_sample(self, query: str) -> str:
        """Run `query` against the source and return a small preview of results."""
        pass

    @abstractmethod
    def ingest_data_from_query(self, query: str, name_as: str):
        """Run `query` against the source and ingest the result as `name_as`."""
        pass

Loading