diff --git a/README.md b/README.md
index d244ad77..324caa96 100644
--- a/README.md
+++ b/README.md
@@ -14,22 +14,24 @@
Transform data and create rich visualizations iteratively with AI 🪄. Try Data Formulator now!
-[](https://codespaces.new/microsoft/data-formulator?quickstart=1)
+Any questions? Ask on the Discord channel! [Discord](https://discord.gg/mYCZMQKYZb)
+
+
+
## News 🔥🔥🔥
-- [05-13-2025] Data Formulator 0.2.1: External Data Loader
+- [05-13-2025] Data Formulator 0.2.3: External Data Loader
  - We introduced an external data loader class to make importing data easier. [Readme](https://github.com/microsoft/data-formulator/tree/main/py-src/data_formulator/data_loader) and [Demo](https://github.com/microsoft/data-formulator/pull/155)
- - Example data loaders from MySQL and Azure Data Explorer (Kusto) are provided.
+  - Current data loaders: MySQL, Azure Data Explorer (Kusto), Azure Blob Storage, and Amazon S3 (JSON, Parquet, CSV).
- Call for action [link](https://github.com/microsoft/data-formulator/issues/156):
- Users: let us know which data source you'd like to load data from.
- Developers: let's build more data loaders.
- - Discord channel for discussions: join us! [](https://discord.gg/mYCZMQKYZb)
- [04-23-2025] Data Formulator 0.2: working with *large* data 📦📦📦
- Explore large data by:
@@ -68,8 +70,6 @@ Transform data and create rich visualizations iteratively with AI 🪄. Try Data
- [10-01-2024] Initial release of Data Formulator, check out our [[blog]](https://www.microsoft.com/en-us/research/blog/data-formulator-exploring-how-ai-can-help-analysts-create-rich-data-visualizations/) and [[video]](https://youtu.be/3ndlwt0Wi3c)!
-
-
## Overview
**Data Formulator** is an application from Microsoft Research that uses large language models to transform data, expediting the practice of data visualization.
diff --git a/package.json b/package.json
index 962241e0..453c9dea 100644
--- a/package.json
+++ b/package.json
@@ -82,6 +82,6 @@
"globals": "^15.12.0",
"sass": "^1.77.6",
"typescript-eslint": "^8.16.0",
- "vite": "^5.4.15"
+ "vite": "^5.4.19"
}
}
diff --git a/py-src/data_formulator/agents/agent_code_explanation.py b/py-src/data_formulator/agents/agent_code_explanation.py
index 8d16a968..af348e3b 100644
--- a/py-src/data_formulator/agents/agent_code_explanation.py
+++ b/py-src/data_formulator/agents/agent_code_explanation.py
@@ -1,8 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
-import pandas as pd
-from data_formulator.agents.agent_utils import generate_data_summary, extract_code_from_gpt_response
+from data_formulator.agents.agent_utils import generate_data_summary
import logging
diff --git a/py-src/data_formulator/agents/agent_py_concept_derive.py b/py-src/data_formulator/agents/agent_py_concept_derive.py
index 58181d85..f6e3e77f 100644
--- a/py-src/data_formulator/agents/agent_py_concept_derive.py
+++ b/py-src/data_formulator/agents/agent_py_concept_derive.py
@@ -1,7 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
-import json
import time
from data_formulator.agents.agent_utils import generate_data_summary, extract_code_from_gpt_response
@@ -10,7 +9,6 @@
import traceback
import logging
-import datetime
logger = logging.getLogger(__name__)
diff --git a/py-src/data_formulator/agents/agent_py_data_transform.py b/py-src/data_formulator/agents/agent_py_data_transform.py
index b3cc999e..8cc86daf 100644
--- a/py-src/data_formulator/agents/agent_py_data_transform.py
+++ b/py-src/data_formulator/agents/agent_py_data_transform.py
@@ -2,7 +2,6 @@
# Licensed under the MIT License.
import json
-import sys
from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary, extract_code_from_gpt_response
import data_formulator.py_sandbox as py_sandbox
diff --git a/py-src/data_formulator/agents/agent_query_completion.py b/py-src/data_formulator/agents/agent_query_completion.py
index 8beed90c..0dd6f494 100644
--- a/py-src/data_formulator/agents/agent_query_completion.py
+++ b/py-src/data_formulator/agents/agent_query_completion.py
@@ -1,10 +1,9 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
-import pandas as pd
import json
-from data_formulator.agents.agent_utils import extract_code_from_gpt_response, extract_json_objects
+from data_formulator.agents.agent_utils import extract_json_objects
import re
import logging
diff --git a/py-src/data_formulator/agents/agent_utils.py b/py-src/data_formulator/agents/agent_utils.py
index 2d518932..e18962e0 100644
--- a/py-src/data_formulator/agents/agent_utils.py
+++ b/py-src/data_formulator/agents/agent_utils.py
@@ -6,10 +6,6 @@
import pandas as pd
import numpy as np
-import base64
-
-from pprint import pprint
-
import re
def string_to_py_varname(var_str):
diff --git a/py-src/data_formulator/agents/client_utils.py b/py-src/data_formulator/agents/client_utils.py
index 94334aee..a0eb3558 100644
--- a/py-src/data_formulator/agents/client_utils.py
+++ b/py-src/data_formulator/agents/client_utils.py
@@ -1,4 +1,3 @@
-import os
import litellm
import openai
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
diff --git a/py-src/data_formulator/data_loader/__init__.py b/py-src/data_formulator/data_loader/__init__.py
index 145ac806..6aa797c7 100644
--- a/py-src/data_formulator/data_loader/__init__.py
+++ b/py-src/data_formulator/data_loader/__init__.py
@@ -1,10 +1,14 @@
from data_formulator.data_loader.external_data_loader import ExternalDataLoader
from data_formulator.data_loader.mysql_data_loader import MySQLDataLoader
from data_formulator.data_loader.kusto_data_loader import KustoDataLoader
+from data_formulator.data_loader.s3_data_loader import S3DataLoader
+from data_formulator.data_loader.azure_blob_data_loader import AzureBlobDataLoader
DATA_LOADERS = {
"mysql": MySQLDataLoader,
- "kusto": KustoDataLoader
+ "kusto": KustoDataLoader,
+ "s3": S3DataLoader,
+ "azure_blob": AzureBlobDataLoader,
}
-__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "DATA_LOADERS"]
\ No newline at end of file
+__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "S3DataLoader", "AzureBlobDataLoader", "DATA_LOADERS"]
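
For orientation, a minimal sketch (not part of this diff) of how the registry above can be used to instantiate one of the new loaders by name. The in-memory DuckDB connection and the credential/bucket values are placeholders for illustration only.

```python
import duckdb
from data_formulator.data_loader import DATA_LOADERS

conn = duckdb.connect()              # hypothetical in-memory connection
loader_cls = DATA_LOADERS["s3"]

print(loader_cls.auth_instructions())  # new in this diff: per-loader auth guidance
print(loader_cls.list_params())        # parameter definitions rendered by the frontend form

loader = loader_cls(
    params={
        "aws_access_key_id": "AKIA...",        # placeholder credentials
        "aws_secret_access_key": "...",
        "region_name": "us-east-1",
        "bucket": "my-bucket",                 # placeholder bucket name
    },
    duck_db_conn=conn,
)
tables = loader.list_tables()                        # s3:// paths of readable csv/parquet/json files
loader.ingest_data(tables[0]["name"], "first_table")  # copies the file into the local DuckDB
```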
diff --git a/py-src/data_formulator/data_loader/azure_blob_data_loader.py b/py-src/data_formulator/data_loader/azure_blob_data_loader.py
new file mode 100644
index 00000000..e2e2f78f
--- /dev/null
+++ b/py-src/data_formulator/data_loader/azure_blob_data_loader.py
@@ -0,0 +1,369 @@
+import json
+import pandas as pd
+import duckdb
+import os
+
+from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name
+from typing import Dict, Any, List
+
+class AzureBlobDataLoader(ExternalDataLoader):
+
+ @staticmethod
+ def list_params() -> List[Dict[str, Any]]:
+ params_list = [
+ {"name": "account_name", "type": "string", "required": True, "default": "", "description": "Azure storage account name"},
+ {"name": "container_name", "type": "string", "required": True, "default": "", "description": "Azure blob container name"},
+ {"name": "connection_string", "type": "string", "required": False, "default": "", "description": "Azure storage connection string (alternative to account_name + credentials)"},
+ {"name": "credential_chain", "type": "string", "required": False, "default": "cli;managed_identity;env", "description": "Ordered list of Azure credential providers (cli;managed_identity;env)"},
+ {"name": "account_key", "type": "string", "required": False, "default": "", "description": "Azure storage account key"},
+ {"name": "sas_token", "type": "string", "required": False, "default": "", "description": "Azure SAS token"},
+ {"name": "endpoint", "type": "string", "required": False, "default": "blob.core.windows.net", "description": "Azure endpoint override"}
+ ]
+ return params_list
+
+ @staticmethod
+ def auth_instructions() -> str:
+ return """Authentication Options (choose one)
+
+Option 1 - Connection String (Simplest)
+ - Get connection string from Azure Portal > Storage Account > Access keys
+ - Use `connection_string` parameter with full connection string
+ - `account_name` can be omitted when using connection string
+
+Option 2 - Account Key
+ - Get account key from Azure Portal > Storage Account > Access keys
+ - Use `account_name` + `account_key` parameters
+ - Provides full access to storage account
+
+Option 3 - SAS Token (Recommended for limited access)
+ - Generate SAS token from Azure Portal > Storage Account > Shared access signature
+ - Use `account_name` + `sas_token` parameters
+ - Can be time-limited and permission-scoped
+
+Option 4 - Credential Chain (Most Secure)
+ - Use `account_name` + `container_name` only (no explicit credentials)
+ - Requires Azure CLI login (`az login` in terminal) or Managed Identity
+ - Default chain: `cli;managed_identity;env`
+ - Customize with `credential_chain` parameter
+
+Additional Options
+ - `endpoint`: Custom endpoint (default: `blob.core.windows.net`)
+ - For Azure Government: `blob.core.usgovcloudapi.net`
+ - For Azure China: `blob.core.chinacloudapi.cn`
+
+Supported File Formats:
+ - CSV files (.csv)
+ - Parquet files (.parquet)
+ - JSON files (.json, .jsonl)
+"""
+
+ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
+ self.params = params
+ self.duck_db_conn = duck_db_conn
+
+ # Extract parameters
+ self.account_name = params.get("account_name", "")
+ self.container_name = params.get("container_name", "")
+ self.connection_string = params.get("connection_string", "")
+ self.credential_chain = params.get("credential_chain", "cli;managed_identity;env")
+ self.account_key = params.get("account_key", "")
+ self.sas_token = params.get("sas_token", "")
+ self.endpoint = params.get("endpoint", "blob.core.windows.net")
+
+ # Install and load the azure extension
+ self.duck_db_conn.install_extension("azure")
+ self.duck_db_conn.load_extension("azure")
+
+ # Set up Azure authentication using secrets (preferred method)
+ self._setup_azure_authentication()
+
+ def _setup_azure_authentication(self):
+ """Set up Azure authentication using DuckDB secrets."""
+ if self.connection_string:
+ # Use connection string authentication
+ self.duck_db_conn.execute(f"""
+ CREATE OR REPLACE SECRET azure_secret (
+ TYPE AZURE,
+ CONNECTION_STRING '{self.connection_string}'
+ )
+ """)
+ elif self.account_key:
+ # Use account key authentication
+ self.duck_db_conn.execute(f"""
+ CREATE OR REPLACE SECRET azure_secret (
+ TYPE AZURE,
+ ACCOUNT_NAME '{self.account_name}',
+ ACCOUNT_KEY '{self.account_key}'
+ )
+ """)
+ elif self.sas_token:
+ # Use SAS token authentication
+ self.duck_db_conn.execute(f"""
+ CREATE OR REPLACE SECRET azure_secret (
+ TYPE AZURE,
+ ACCOUNT_NAME '{self.account_name}',
+ SAS_TOKEN '{self.sas_token}'
+ )
+ """)
+ else:
+ # Use credential chain authentication (default)
+ self.duck_db_conn.execute(f"""
+ CREATE OR REPLACE SECRET azure_secret (
+ TYPE AZURE,
+ PROVIDER credential_chain,
+ ACCOUNT_NAME '{self.account_name}',
+ CHAIN '{self.credential_chain}'
+ )
+ """)
+
+ def list_tables(self) -> List[Dict[str, Any]]:
+ # Use Azure SDK to list blobs in the container
+ from azure.storage.blob import BlobServiceClient
+
+ # Create blob service client based on authentication method
+ if self.connection_string:
+ blob_service_client = BlobServiceClient.from_connection_string(self.connection_string)
+ elif self.account_key:
+ blob_service_client = BlobServiceClient(
+ account_url=f"https://{self.account_name}.{self.endpoint}",
+ credential=self.account_key
+ )
+ elif self.sas_token:
+ blob_service_client = BlobServiceClient(
+ account_url=f"https://{self.account_name}.{self.endpoint}",
+ credential=self.sas_token
+ )
+ else:
+ # Use default credential chain
+ from azure.identity import DefaultAzureCredential
+ credential = DefaultAzureCredential()
+ blob_service_client = BlobServiceClient(
+ account_url=f"https://{self.account_name}.{self.endpoint}",
+ credential=credential
+ )
+
+ container_client = blob_service_client.get_container_client(self.container_name)
+
+ # List blobs in the container
+ blob_list = container_client.list_blobs()
+
+ results = []
+
+ for blob in blob_list:
+ blob_name = blob.name
+
+ # Skip directories and non-data files
+ if blob_name.endswith('/') or not self._is_supported_file(blob_name):
+ continue
+
+ # Create Azure blob URL
+ azure_url = f"az://{self.account_name}.{self.endpoint}/{self.container_name}/{blob_name}"
+
+ try:
+ # Choose the appropriate read function based on file extension
+ if azure_url.lower().endswith('.parquet'):
+ sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_parquet('{azure_url}') LIMIT 10").df()
+ elif azure_url.lower().endswith('.json') or azure_url.lower().endswith('.jsonl'):
+ sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_json_auto('{azure_url}') LIMIT 10").df()
+ elif azure_url.lower().endswith('.csv'):
+ sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{azure_url}') LIMIT 10").df()
+
+ # Get column information
+ columns = [{
+ 'name': col,
+ 'type': str(sample_df[col].dtype)
+ } for col in sample_df.columns]
+
+ # Get sample data
+ sample_rows = json.loads(sample_df.to_json(orient="records"))
+
+ # Estimate row count
+ row_count = self._estimate_row_count(azure_url, blob)
+
+ table_metadata = {
+ "row_count": row_count,
+ "columns": columns,
+ "sample_rows": sample_rows
+ }
+
+ results.append({
+ "name": azure_url,
+ "metadata": table_metadata
+ })
+ except Exception as e:
+ # Skip files that can't be read
+ print(f"Error reading {azure_url}: {e}")
+ continue
+
+ return results
+
+ def _is_supported_file(self, blob_name: str) -> bool:
+ """Check if the file type is supported by DuckDB."""
+ supported_extensions = ['.csv', '.parquet', '.json', '.jsonl']
+ return any(blob_name.lower().endswith(ext) for ext in supported_extensions)
+
+ def _estimate_row_count(self, azure_url: str, blob_properties=None) -> int:
+ """Estimate the number of rows in a file using intelligent strategies."""
+ try:
+ file_extension = azure_url.lower().split('.')[-1]
+
+ # For parquet files, use metadata to get exact count efficiently
+ if file_extension == 'parquet':
+ try:
+ # Use DuckDB's parquet_file_metadata to get exact row count without full scan
+ metadata = self.duck_db_conn.execute(
+ f"SELECT num_rows FROM parquet_file_metadata('{azure_url}')"
+ ).fetchone()
+ if metadata and metadata[0] is not None:
+ return metadata[0]
+ except Exception as parquet_error:
+ print(f"Failed to get parquet metadata for {azure_url}: {parquet_error}")
+ # Fall back to counting (expensive but accurate)
+ try:
+ count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM read_parquet('{azure_url}')").fetchone()[0]
+ return count
+ except Exception:
+ pass
+
+ # For CSV, JSON, and JSONL files, use intelligent sampling
+ elif file_extension in ['csv', 'json', 'jsonl']:
+ return self._estimate_rows_by_sampling(azure_url, blob_properties, file_extension)
+
+ return 0
+
+ except Exception as e:
+ print(f"Error estimating row count for {azure_url}: {e}")
+ return 0
+
+ def _estimate_rows_by_sampling(self, azure_url: str, blob_properties, file_extension: str) -> int:
+ """Estimate row count for text-based files using sampling and file size."""
+ try:
+ # Get file size from blob properties if available
+ file_size_bytes = None
+ if blob_properties and hasattr(blob_properties, 'size'):
+ file_size_bytes = blob_properties.size
+
+ # If no file size available, try a different approach
+ if file_size_bytes is None:
+                # Fall back to row sampling without file size information
+ return self._estimate_by_row_sampling(azure_url, file_extension)
+
+ # Sample approach: read first N rows and estimate based on size
+ sample_size = min(10000, file_size_bytes // 100) # Adaptive sample size
+ sample_size = max(1000, sample_size) # At least 1000 rows
+
+ try:
+ if file_extension == 'csv':
+ sample_df = self.duck_db_conn.execute(
+ f"SELECT * FROM read_csv_auto('{azure_url}') LIMIT {sample_size}"
+ ).df()
+ elif file_extension in ['json', 'jsonl']:
+ sample_df = self.duck_db_conn.execute(
+ f"SELECT * FROM read_json_auto('{azure_url}') LIMIT {sample_size}"
+ ).df()
+ else:
+ return 0
+
+ sample_rows = len(sample_df)
+ if sample_rows == 0:
+ return 0
+
+ # If we got fewer rows than requested, that's probably all there is
+ if sample_rows < sample_size:
+ return sample_rows
+
+                # Estimate bytes per row by serializing the sample back to its source format,
+                # then extrapolate using the total file size reported by the blob properties.
+                if file_extension == 'csv':
+                    sample_bytes = len(sample_df.to_csv(index=False).encode('utf-8'))
+                    # Guard against tiny/degenerate samples: assume at least 50 bytes per CSV row
+                    estimated_bytes_per_row = max(sample_bytes / sample_rows, 50)
+                else:
+                    # JSON rows repeat key names and punctuation, so use a higher floor of 100 bytes
+                    sample_bytes = len(sample_df.to_json(orient='records', lines=True).encode('utf-8'))
+                    estimated_bytes_per_row = max(sample_bytes / sample_rows, 100)
+                estimated_total_rows = int(file_size_bytes / estimated_bytes_per_row)
+
+ # Apply reasonable bounds
+ estimated_total_rows = max(sample_rows, estimated_total_rows) # At least as many as we sampled
+ estimated_total_rows = min(estimated_total_rows, file_size_bytes // 10) # Max based on very small rows
+
+ return estimated_total_rows
+
+ except Exception as e:
+ print(f"Error in size-based estimation for {azure_url}: {e}")
+ return self._estimate_by_row_sampling(azure_url, file_extension)
+
+ except Exception as e:
+ print(f"Error in sampling estimation for {azure_url}: {e}")
+ return 0
+
+ def _estimate_by_row_sampling(self, azure_url: str, file_extension: str) -> int:
+ """Fallback method: sample rows without file size info."""
+ try:
+ # Try to read a reasonable sample and see if we get less than requested
+ # This indicates we've read the whole file
+ test_limit = 50000
+
+ if file_extension == 'csv':
+ sample_df = self.duck_db_conn.execute(
+ f"SELECT * FROM read_csv_auto('{azure_url}') LIMIT {test_limit}"
+ ).df()
+ elif file_extension in ['json', 'jsonl']:
+ sample_df = self.duck_db_conn.execute(
+ f"SELECT * FROM read_json_auto('{azure_url}') LIMIT {test_limit}"
+ ).df()
+ else:
+ return 0
+
+ sample_rows = len(sample_df)
+
+ # If we got fewer rows than the limit, that's likely the total
+ if sample_rows < test_limit:
+ return sample_rows
+
+ # Otherwise, we can't estimate accurately without more information
+ # Return the sample size as a lower bound
+ return sample_rows
+
+ except Exception as e:
+ print(f"Error in row sampling for {azure_url}: {e}")
+ return 0
+
+ def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000):
+ if name_as is None:
+ name_as = table_name.split('/')[-1].split('.')[0]
+
+ name_as = sanitize_table_name(name_as)
+
+ # Determine file type and use appropriate DuckDB function
+ if table_name.lower().endswith('.csv'):
+ self.duck_db_conn.execute(f"""
+ CREATE OR REPLACE TABLE main.{name_as} AS
+ SELECT * FROM read_csv_auto('{table_name}')
+ LIMIT {size}
+ """)
+ elif table_name.lower().endswith('.parquet'):
+ self.duck_db_conn.execute(f"""
+ CREATE OR REPLACE TABLE main.{name_as} AS
+ SELECT * FROM read_parquet('{table_name}')
+ LIMIT {size}
+ """)
+ elif table_name.lower().endswith('.json') or table_name.lower().endswith('.jsonl'):
+ self.duck_db_conn.execute(f"""
+ CREATE OR REPLACE TABLE main.{name_as} AS
+ SELECT * FROM read_json_auto('{table_name}')
+ LIMIT {size}
+ """)
+ else:
+ raise ValueError(f"Unsupported file type: {table_name}")
+
+ def view_query_sample(self, query: str) -> List[Dict[str, Any]]:
+ return self.duck_db_conn.execute(query).df().head(10).to_dict(orient="records")
+
+ def ingest_data_from_query(self, query: str, name_as: str):
+ # Execute the query and get results as a DataFrame
+ df = self.duck_db_conn.execute(query).df()
+ # Use the base class's method to ingest the DataFrame
+ self.ingest_df_to_duckdb(df, name_as)
\ No newline at end of file
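
The loader above is essentially a DuckDB Azure secret plus the `az://` read functions. A condensed sketch of the same flow in plain DuckDB, assuming connection-string auth and a Parquet blob; the account, container, blob name, and connection string below are placeholders.

```python
import duckdb

conn = duckdb.connect()
conn.install_extension("azure")
conn.load_extension("azure")

# register the credential once; the loader does this in _setup_azure_authentication()
conn.execute("""
    CREATE OR REPLACE SECRET azure_secret (
        TYPE AZURE,
        CONNECTION_STRING 'DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...'
    )
""")

# any az:// path in the container can then be queried directly,
# which is what list_tables() and ingest_data() rely on
df = conn.execute(
    "SELECT * FROM read_parquet('az://myaccount.blob.core.windows.net/mycontainer/data.parquet') LIMIT 10"
).df()
print(df.head())
```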
diff --git a/py-src/data_formulator/data_loader/external_data_loader.py b/py-src/data_formulator/data_loader/external_data_loader.py
index 540f1748..5d55796f 100644
--- a/py-src/data_formulator/data_loader/external_data_loader.py
+++ b/py-src/data_formulator/data_loader/external_data_loader.py
@@ -67,6 +67,11 @@ def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str):
def list_params() -> List[Dict[str, Any]]:
pass
+ @staticmethod
+ @abstractmethod
+ def auth_instructions() -> str:
+ pass
+
@abstractmethod
def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
pass
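
Together with the new abstract `auth_instructions()`, the surface a new loader has to cover is small. Based on the methods the concrete loaders in this diff implement (the base class is only partially shown in this hunk), a hypothetical local-folder loader might look roughly like this; it is a sketch, not part of the PR.

```python
import os
import duckdb
from typing import Any, Dict, List

from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name

# Hypothetical "local folder" loader, purely to illustrate the interface.
class LocalFolderDataLoader(ExternalDataLoader):

    @staticmethod
    def list_params() -> List[Dict[str, Any]]:
        return [{"name": "folder", "type": "string", "required": True, "default": "",
                 "description": "Local folder containing csv/parquet files"}]

    @staticmethod
    def auth_instructions() -> str:
        return "No credentials needed; the folder just has to be readable by the server process."

    def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
        self.folder = params["folder"]
        self.duck_db_conn = duck_db_conn

    def list_tables(self) -> List[Dict[str, Any]]:
        # a real loader would also report columns, row counts, and sample rows (see the S3/Azure loaders)
        files = [f for f in os.listdir(self.folder) if f.endswith((".csv", ".parquet"))]
        return [{"name": os.path.join(self.folder, f), "metadata": {}} for f in files]

    def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000):
        name_as = sanitize_table_name(name_as or os.path.splitext(os.path.basename(table_name))[0])
        reader = "read_parquet" if table_name.endswith(".parquet") else "read_csv_auto"
        self.duck_db_conn.execute(
            f"CREATE OR REPLACE TABLE main.{name_as} AS SELECT * FROM {reader}('{table_name}') LIMIT {size}"
        )

    def view_query_sample(self, query: str) -> List[Dict[str, Any]]:
        return self.duck_db_conn.execute(query).df().head(10).to_dict(orient="records")

    def ingest_data_from_query(self, query: str, name_as: str):
        self.ingest_df_to_duckdb(self.duck_db_conn.execute(query).df(), name_as)
```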
diff --git a/py-src/data_formulator/data_loader/kusto_data_loader.py b/py-src/data_formulator/data_loader/kusto_data_loader.py
index 210c3e68..b0ed936c 100644
--- a/py-src/data_formulator/data_loader/kusto_data_loader.py
+++ b/py-src/data_formulator/data_loader/kusto_data_loader.py
@@ -23,6 +23,33 @@ def list_params() -> bool:
{"name": "tenant_id", "type": "string", "required": False, "description": "only necessary for AppKey auth"}
]
return params_list
+
+ @staticmethod
+ def auth_instructions() -> str:
+ return """Azure Kusto Authentication Instructions
+
+Method 1: Azure CLI Authentication
+ 1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli
+ 2. Run `az login` in your terminal to authenticate
+ 3. Ensure you have access to the specified Kusto cluster and database
+ 4. Leave client_id, client_secret, and tenant_id parameters empty
+
+Method 2: Application Key Authentication
+ 1. Register an Azure AD application in your tenant
+ 2. Generate a client secret for the application
+ 3. Grant the application appropriate permissions to your Kusto cluster:
+ - Go to your Kusto cluster in Azure Portal
+ - Navigate to Permissions > Add
+ - Add your application as a user with appropriate role (e.g., "AllDatabasesViewer" for read access)
+ 4. Provide the following parameters:
+ - client_id: Application (client) ID from your Azure AD app registration
+ - client_secret: Client secret value you generated
+ - tenant_id: Directory (tenant) ID from your Azure AD
+
+Required Parameters:
+ - kusto_cluster: Your Kusto cluster URI (e.g., "https://mycluster.region.kusto.windows.net")
+ - kusto_database: Name of the database you want to access
+"""
def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
@@ -51,41 +78,6 @@ def query(self, kql: str) -> pd.DataFrame:
return dataframe_from_result_table(result.primary_results[0])
def list_tables(self) -> List[Dict[str, Any]]:
- # first list functions (views)
- query = ".show functions"
- function_result_df = self.query(query)
-
- functions = []
- for func in function_result_df.to_dict(orient="records"):
- func_name = func['Name']
- result = self.query(f".show function ['{func_name}'] schema as json").to_dict(orient="records")
- schema = json.loads(result[0]['Schema'])
- parameters = schema['InputParameters']
- columns = [{
- 'name': r["Name"],
- 'type': r["Type"]
- } for r in schema['OutputColumns']]
-
- # skip functions with parameters at the moment
- if len(parameters) > 0:
- continue
-
- sample_query = f"['{func_name}'] | take {10}"
- sample_result = self.query(sample_query).to_dict(orient="records")
-
- function_metadata = {
- "row_count": 0,
- "columns": columns,
- "parameters": parameters,
- "sample_rows": sample_result
- }
- functions.append({
- "type": "function",
- "name": func_name,
- "metadata": function_metadata
- })
-
- # then list tables
query = ".show tables"
tables_df = self.query(query)
@@ -101,8 +93,8 @@ def list_tables(self) -> List[Dict[str, Any]]:
row_count_result = self.query(f".show table ['{table_name}'] details").to_dict(orient="records")
row_count = row_count_result[0]["TotalRowCount"]
- sample_query = f"['{table_name}'] | take {10}"
- sample_result = self.query(sample_query).to_dict(orient="records")
+ sample_query = f"['{table_name}'] | take {5}"
+ sample_result = json.loads(self.query(sample_query).to_json(orient="records"))
table_metadata = {
"row_count": row_count,
@@ -116,7 +108,7 @@ def list_tables(self) -> List[Dict[str, Any]]:
"metadata": table_metadata
})
- return functions + tables
+ return tables
def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000) -> pd.DataFrame:
if name_as is None:
@@ -167,7 +159,7 @@ def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000)
total_rows_ingested += len(chunk_df)
def view_query_sample(self, query: str) -> str:
- return self.query(query).head(10).to_dict(orient="records")
+ return json.loads(self.query(query).head(10).to_json(orient="records"))
def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
# Sanitize the table name for SQL compatibility
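
The `to_dict` to `json.loads(df.to_json(...))` change here (and in the MySQL loader below) is presumably about JSON safety: Kusto results often contain datetime columns that the standard `json` module cannot serialize, and NaN values that would otherwise leak into the response as invalid JSON. A small illustration with an assumed datetime/NaN column:

```python
import json
import pandas as pd

df = pd.DataFrame({"ts": [pd.Timestamp("2024-01-01")], "v": [float("nan")]})

# to_dict keeps pandas objects such as Timestamp, which the standard json module rejects
rows = df.head(10).to_dict(orient="records")
# json.dumps(rows)  # TypeError: Object of type Timestamp is not JSON serializable

# the to_json round trip yields plain JSON types (epoch milliseconds, None), safe to return from the API
safe_rows = json.loads(df.head(10).to_json(orient="records"))
print(safe_rows)  # [{'ts': 1704067200000, 'v': None}]
```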
diff --git a/py-src/data_formulator/data_loader/mysql_data_loader.py b/py-src/data_formulator/data_loader/mysql_data_loader.py
index 625204e3..9eddc0cd 100644
--- a/py-src/data_formulator/data_loader/mysql_data_loader.py
+++ b/py-src/data_formulator/data_loader/mysql_data_loader.py
@@ -18,6 +18,32 @@ def list_params() -> bool:
]
return params_list
+ @staticmethod
+ def auth_instructions() -> str:
+ return """
+MySQL Connection Instructions:
+
+1. Local MySQL Setup:
+ - Ensure MySQL server is running on your machine
+ - Default connection: host='localhost', user='root'
+ - If you haven't set a root password, leave password field empty
+
+2. Remote MySQL Connection:
+ - Obtain host address, username, and password from your database administrator
+ - Ensure the MySQL server allows remote connections
+ - Check that your IP is whitelisted in MySQL's user permissions
+
+3. Common Connection Parameters:
+ - user: Your MySQL username (default: 'root')
+ - password: Your MySQL password (leave empty if no password set)
+ - host: MySQL server address (default: 'localhost')
+ - database: Target database name to connect to
+
+4. Troubleshooting:
+ - Verify MySQL service is running: `brew services list` (macOS) or `sudo systemctl status mysql` (Linux)
+ - Test connection: `mysql -u [username] -p -h [host] [database]`
+"""
+
def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
self.params = params
self.duck_db_conn = duck_db_conn
@@ -93,7 +119,7 @@ def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 1
""")
def view_query_sample(self, query: str) -> str:
- return self.duck_db_conn.execute(query).df().head(10).to_dict(orient="records")
+ return json.loads(self.duck_db_conn.execute(query).df().head(10).to_json(orient="records"))
def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
# Execute the query and get results as a DataFrame
diff --git a/py-src/data_formulator/data_loader/s3_data_loader.py b/py-src/data_formulator/data_loader/s3_data_loader.py
new file mode 100644
index 00000000..ec339c3f
--- /dev/null
+++ b/py-src/data_formulator/data_loader/s3_data_loader.py
@@ -0,0 +1,202 @@
+import json
+import pandas as pd
+import duckdb
+import os
+
+from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name
+from typing import Dict, Any, List
+
+class S3DataLoader(ExternalDataLoader):
+
+ @staticmethod
+ def list_params() -> List[Dict[str, Any]]:
+ params_list = [
+ {"name": "aws_access_key_id", "type": "string", "required": True, "default": "", "description": "AWS access key ID"},
+ {"name": "aws_secret_access_key", "type": "string", "required": True, "default": "", "description": "AWS secret access key"},
+ {"name": "aws_session_token", "type": "string", "required": False, "default": "", "description": "AWS session token (required for temporary credentials)"},
+ {"name": "region_name", "type": "string", "required": True, "default": "us-east-1", "description": "AWS region name"},
+ {"name": "bucket", "type": "string", "required": True, "default": "", "description": "S3 bucket name"}
+ ]
+ return params_list
+
+ @staticmethod
+ def auth_instructions() -> str:
+ return """
+**Required AWS Credentials:**
+- **AWS Access Key ID**: Your AWS access key identifier
+- **AWS Secret Access Key**: Your AWS secret access key
+- **Region Name**: AWS region (e.g., 'us-east-1', 'us-west-2')
+- **Bucket**: S3 bucket name
+- **AWS Session Token**: Optional, for temporary credentials only
+
+**Getting Credentials:**
+1. AWS Console → IAM → Users → Select user → Security credentials → Create access key
+2. Choose "Application running outside AWS"
+
+**Required S3 Permissions:**
+```json
+{
+ "Version": "2012-10-17",
+ "Statement": [{
+ "Effect": "Allow",
+ "Action": ["s3:GetObject", "s3:ListBucket"],
+ "Resource": [
+ "arn:aws:s3:::your-bucket-name",
+ "arn:aws:s3:::your-bucket-name/*"
+ ]
+ }]
+}
+```
+
+**Supported File Formats:**
+- CSV files (.csv)
+- Parquet files (.parquet)
+- JSON files (.json, .jsonl)
+
+**Security:** Never share secret keys, rotate them regularly, and use least-privilege permissions.
+ """
+
+ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
+ self.params = params
+ self.duck_db_conn = duck_db_conn
+
+ # Extract parameters
+ self.aws_access_key_id = params.get("aws_access_key_id", "")
+ self.aws_secret_access_key = params.get("aws_secret_access_key", "")
+ self.aws_session_token = params.get("aws_session_token", "")
+ self.region_name = params.get("region_name", "us-east-1")
+ self.bucket = params.get("bucket", "")
+
+ # Install and load the httpfs extension for S3 access
+ self.duck_db_conn.install_extension("httpfs")
+ self.duck_db_conn.load_extension("httpfs")
+
+ # Set AWS credentials for DuckDB
+ self.duck_db_conn.execute(f"SET s3_region='{self.region_name}'")
+ self.duck_db_conn.execute(f"SET s3_access_key_id='{self.aws_access_key_id}'")
+ self.duck_db_conn.execute(f"SET s3_secret_access_key='{self.aws_secret_access_key}'")
+        if self.aws_session_token:
+ self.duck_db_conn.execute(f"SET s3_session_token='{self.aws_session_token}'")
+
+ def list_tables(self) -> List[Dict[str, Any]]:
+ # Use boto3 to list objects in the bucket
+ import boto3
+
+ s3_client = boto3.client(
+ 's3',
+ aws_access_key_id=self.aws_access_key_id,
+ aws_secret_access_key=self.aws_secret_access_key,
+ aws_session_token=self.aws_session_token if self.aws_session_token else None,
+ region_name=self.region_name
+ )
+
+ # List objects in the bucket
+ response = s3_client.list_objects_v2(Bucket=self.bucket)
+
+ results = []
+
+ if 'Contents' in response:
+ for obj in response['Contents']:
+ key = obj['Key']
+
+ # Skip directories and non-data files
+ if key.endswith('/') or not self._is_supported_file(key):
+ continue
+
+ # Create S3 URL
+ s3_url = f"s3://{self.bucket}/{key}"
+
+ try:
+ # Choose the appropriate read function based on file extension
+ if s3_url.lower().endswith('.parquet'):
+ sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_parquet('{s3_url}') LIMIT 10").df()
+ elif s3_url.lower().endswith('.json') or s3_url.lower().endswith('.jsonl'):
+ sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_json_auto('{s3_url}') LIMIT 10").df()
+                    elif s3_url.lower().endswith('.csv'):
+ sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{s3_url}') LIMIT 10").df()
+
+ # Get column information
+ columns = [{
+ 'name': col,
+ 'type': str(sample_df[col].dtype)
+ } for col in sample_df.columns]
+
+ # Get sample data
+ sample_rows = json.loads(sample_df.to_json(orient="records"))
+
+                        # Estimate row count (exact for parquet via COUNT(*); returns 0 for csv/json)
+ row_count = self._estimate_row_count(s3_url)
+
+ table_metadata = {
+ "row_count": row_count,
+ "columns": columns,
+ "sample_rows": sample_rows
+ }
+
+ results.append({
+ "name": s3_url,
+ "metadata": table_metadata
+ })
+ except Exception as e:
+ # Skip files that can't be read
+ print(f"Error reading {s3_url}: {e}")
+ continue
+
+ return results
+
+ def _is_supported_file(self, key: str) -> bool:
+ """Check if the file type is supported by DuckDB."""
+ supported_extensions = ['.csv', '.parquet', '.json', '.jsonl']
+ return any(key.lower().endswith(ext) for ext in supported_extensions)
+
+ def _estimate_row_count(self, s3_url: str) -> int:
+ """Estimate the number of rows in a file."""
+ try:
+ # For parquet files, we can get the exact count
+ if s3_url.lower().endswith('.parquet'):
+ count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM read_parquet('{s3_url}')").fetchone()[0]
+ return count
+
+ # For CSV, JSON, and JSONL files, we'll skip row count
+ if s3_url.lower().endswith('.csv') or s3_url.lower().endswith('.json') or s3_url.lower().endswith('.jsonl'):
+ return 0
+ except Exception as e:
+ print(f"Error estimating row count for {s3_url}: {e}")
+ return 0
+
+ def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000):
+ if name_as is None:
+ name_as = table_name.split('/')[-1].split('.')[0]
+
+ name_as = sanitize_table_name(name_as)
+
+ # Determine file type and use appropriate DuckDB function
+ if table_name.lower().endswith('.csv'):
+ self.duck_db_conn.execute(f"""
+ CREATE OR REPLACE TABLE main.{name_as} AS
+ SELECT * FROM read_csv_auto('{table_name}')
+ LIMIT {size}
+ """)
+ elif table_name.lower().endswith('.parquet'):
+ self.duck_db_conn.execute(f"""
+ CREATE OR REPLACE TABLE main.{name_as} AS
+ SELECT * FROM read_parquet('{table_name}')
+ LIMIT {size}
+ """)
+ elif table_name.lower().endswith('.json') or table_name.lower().endswith('.jsonl'):
+ self.duck_db_conn.execute(f"""
+ CREATE OR REPLACE TABLE main.{name_as} AS
+ SELECT * FROM read_json_auto('{table_name}')
+ LIMIT {size}
+ """)
+ else:
+ raise ValueError(f"Unsupported file type: {table_name}")
+
+ def view_query_sample(self, query: str) -> List[Dict[str, Any]]:
+ return self.duck_db_conn.execute(query).df().head(10).to_dict(orient="records")
+
+ def ingest_data_from_query(self, query: str, name_as: str):
+ # Execute the query and get results as a DataFrame
+ df = self.duck_db_conn.execute(query).df()
+ # Use the base class's method to ingest the DataFrame
+ self.ingest_df_to_duckdb(df, name_as)
\ No newline at end of file
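
As with the Azure loader, the S3 loader reduces to DuckDB configuration plus `s3://` reads. A condensed sketch with placeholder credentials and bucket name:

```python
import duckdb

conn = duckdb.connect()
conn.install_extension("httpfs")
conn.load_extension("httpfs")

# the loader configures credentials with the SET options shown above; placeholder values here
conn.execute("SET s3_region='us-east-1'")
conn.execute("SET s3_access_key_id='AKIA...'")
conn.execute("SET s3_secret_access_key='...'")

# after that, s3:// paths work with the same read functions used by list_tables() and ingest_data()
df = conn.execute("SELECT * FROM read_csv_auto('s3://my-bucket/data.csv') LIMIT 10").df()
print(df.columns.tolist())
```

If a later cleanup is wanted, recent DuckDB releases also support `CREATE SECRET (TYPE S3, KEY_ID ..., SECRET ..., REGION ...)`, which would mirror the secret-based setup used by the Azure loader.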
diff --git a/py-src/data_formulator/tables_routes.py b/py-src/data_formulator/tables_routes.py
index cb04f707..5d76574d 100644
--- a/py-src/data_formulator/tables_routes.py
+++ b/py-src/data_formulator/tables_routes.py
@@ -728,7 +728,10 @@ def data_loader_list_data_loaders():
return jsonify({
"status": "success",
"data_loaders": {
- name: data_loader.list_params()
+ name: {
+ "params": data_loader.list_params(),
+ "auth_instructions": data_loader.auth_instructions()
+ }
for name, data_loader in DATA_LOADERS.items()
}
})
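
For reference, a sketch of the shape `data_loader_list_data_loaders` now returns and that the frontend stores as `dataLoaderMetadata`. The entries below are abbreviated (one parameter per loader) and only meant to show the new nesting of `params` plus `auth_instructions`; the route path itself is not shown in this hunk.

```python
example_response = {
    "status": "success",
    "data_loaders": {
        "s3": {
            "params": [{"name": "bucket", "type": "string", "required": True,
                        "default": "", "description": "S3 bucket name"}],
            "auth_instructions": "**Required AWS Credentials:** ...",
        },
        "azure_blob": {
            "params": [{"name": "container_name", "type": "string", "required": True,
                        "default": "", "description": "Azure blob container name"}],
            "auth_instructions": "Authentication Options (choose one) ...",
        },
    },
}

# the frontend reads metadata["params"] to build the connection form
# and metadata["auth_instructions"] for the new help panel
for name, metadata in example_response["data_loaders"].items():
    print(name, [p["name"] for p in metadata["params"]])
```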
diff --git a/pyproject.toml b/pyproject.toml
index 96675706..15f21d3e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "data_formulator"
-version = "0.2.1.2"
+version = "0.2.1.3"
requires-python = ">=3.9"
authors = [
@@ -21,7 +21,6 @@ classifiers = [
]
dependencies = [
- "autopep8",
"jupyter",
"pandas",
"docker",
@@ -31,6 +30,7 @@ dependencies = [
"flask-cors",
"openai",
"azure-identity",
+    "azure-kusto-data",
+    "azure-storage-blob",
+    "boto3",
"azure-keyvault-secrets",
"python-dotenv",
"vega_datasets",
diff --git a/requirements.txt b/requirements.txt
index 60068b75..01726dcb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-autopep8
jupyter
pandas
docker
@@ -7,9 +6,13 @@ matplotlib
flask
openai
azure-identity
+azure-kusto-data
azure-keyvault-secrets
+azure-storage-blob
python-dotenv
vega_datasets
litellm
duckdb
--e . #also need to install data formulator itself
\ No newline at end of file
+boto3
+-e . #also need to install data formulator itself
diff --git a/src/views/DBTableManager.tsx b/src/views/DBTableManager.tsx
index a856122d..01b710b7 100644
--- a/src/views/DBTableManager.tsx
+++ b/src/views/DBTableManager.tsx
@@ -67,12 +67,14 @@ import { DataFormulatorState } from '../app/dfSlice';
import { fetchFieldSemanticType } from '../app/dfSlice';
import { AppDispatch } from '../app/store';
import Editor from 'react-simple-code-editor';
+import Markdown from 'markdown-to-jsx';
import Prism from 'prismjs'
import 'prismjs/components/prism-javascript' // Language
import 'prismjs/themes/prism.css'; //Example style, you can use another
import PrecisionManufacturingIcon from '@mui/icons-material/PrecisionManufacturing';
import CheckIcon from '@mui/icons-material/Check';
+import MuiMarkdown from 'mui-markdown';
export const handleDBDownload = async (sessionId: string) => {
try {
@@ -273,7 +275,9 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function
const [tableAnalysisMap, setTableAnalysisMap] = useState>({});
// maps data loader type to list of param defs
- const [dataLoaderParamDefs, setDataLoaderParamDefs] = useState>({});
+ const [dataLoaderMetadata, setDataLoaderMetadata] = useState>({});
const [dbTables, setDbTables] = useState([]);
const [selectedTabKey, setSelectedTabKey] = useState("");
@@ -294,7 +298,7 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function
}, [errorMessage])
useEffect(() => {
- if (dbTables.length == 0) {
+ if (!selectedTabKey.startsWith("dataLoader:") && dbTables.length == 0) {
setSelectedTabKey("");
} else if (!selectedTabKey.startsWith("dataLoader:") && dbTables.find(t => t.name === selectedTabKey) == undefined) {
setSelectedTabKey(dbTables[0].name);
@@ -325,7 +329,7 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function
.then(response => response.json())
.then(data => {
if (data.status === "success") {
- setDataLoaderParamDefs(data.data_loaders);
+ setDataLoaderMetadata(data.data_loaders);
} else {
console.error('Failed to fetch data loader params:', data.error);
}
@@ -652,8 +656,22 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function
value={0} // not used, just to keep MUI happy
sx={{px: 0.5}}
>
- connect external data
- {["file upload", "mysql", "kusto"].map((dataLoaderType, i) => (
+
+ connect external data
+
+ {
+ fetchDataLoaders();
+ }}>
+
+
+
+
+ {["file upload", ...Object.keys(dataLoaderMetadata ?? {})].map((dataLoaderType, i) => (
= function
{uploadFileButton({isUploading ? 'uploading...' : 'upload a csv/tsv file to the local database'})}
- {dataLoaderParamDefs && Object.entries(dataLoaderParamDefs).map(([dataLoaderType, paramDefs]) => (
+ {dataLoaderMetadata && Object.entries(dataLoaderMetadata).map(([dataLoaderType, metadata]) => (
{
setIsUploading(true);
}}
@@ -857,9 +876,10 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function
export const DataLoaderForm: React.FC<{
dataLoaderType: string,
paramDefs: {name: string, default: string, type: string, required: boolean, description: string}[],
+ authInstructions: string,
onImport: () => void,
onFinish: (status: "success" | "error", message: string) => void
-}> = ({dataLoaderType, paramDefs, onImport, onFinish}) => {
+}> = ({dataLoaderType, paramDefs, authInstructions, onImport, onFinish}) => {
const dispatch = useDispatch();
@@ -868,6 +888,8 @@ export const DataLoaderForm: React.FC<{
const [tableMetadata, setTableMetadata] = useState>({});
let [displaySamples, setDisplaySamples] = useState>({});
+ const [displayAuthInstructions, setDisplayAuthInstructions] = useState(false);
+
let [isConnecting, setIsConnecting] = useState(false);
let [mode, setMode] = useState<"view tables" | "query">("view tables");
const toggleDisplaySamples = (tableName: string) => {
@@ -911,7 +933,7 @@ export const DataLoaderForm: React.FC<{
return [
toggleDisplaySamples(tableName)}>
@@ -1031,6 +1053,7 @@ export const DataLoaderForm: React.FC<{
sx={{textTransform: "none"}}
onClick={() => {
setIsConnecting(true);
+ setDisplayAuthInstructions(false);
fetch(getUrls().DATA_LOADER_LIST_TABLES, {
method: 'POST',
headers: {
@@ -1068,8 +1091,26 @@ export const DataLoaderForm: React.FC<{
}}>
disconnect
- }
+
+ }
+
+
+ {
+
+
+ {authInstructions.trim()}
+
+
+
+ }
+
{Object.keys(tableMetadata).length > 0 && tableMetadataBox }
);
diff --git a/src/views/ModelSelectionDialog.tsx b/src/views/ModelSelectionDialog.tsx
index 461386f7..4d957334 100644
--- a/src/views/ModelSelectionDialog.tsx
+++ b/src/views/ModelSelectionDialog.tsx
@@ -59,6 +59,12 @@ interface AppConfig {
DISABLE_DISPLAY_KEYS: boolean;
}
+const decodeHtmlEntities = (text: string): string => {
+ const textarea = document.createElement('textarea');
+ textarea.innerHTML = text;
+ return textarea.value;
+};
+
export const ModelSelectionButton: React.FC<{}> = ({ }) => {
const dispatch = useDispatch();
@@ -410,12 +416,15 @@ export const ModelSelectionButton: React.FC<{}> = ({ }) => {
if (status == "unknown") {
message = "Click the status icon to test again before applying.";
} else if (status == "error") {
- message = testedModels.find(t => t.id == model.id)?.message || "Unknown error";
+ const rawMessage = testedModels.find(t => t.id == model.id)?.message || "Unknown error";
+ message = decodeHtmlEntities(rawMessage);
}
const borderStyle = ['error', 'unknown'].includes(status) ? '1px dashed text.secondary' : undefined;
const noBorderStyle = ['error', 'unknown'].includes(status) ? 'none' : undefined;
return (
<>
= ({ }) => {
- {message}
+ {message}
+
diff --git a/src/views/VisualizationView.tsx b/src/views/VisualizationView.tsx
index 7460c0bc..f3443876 100644
--- a/src/views/VisualizationView.tsx
+++ b/src/views/VisualizationView.tsx
@@ -363,8 +363,6 @@ export const ChartEditorFC: FC<{ cachedCandidates: DictTable[],
const [errorMessage, setErrorMessage] = useState<{content: string, severity: "error" | "warning" | "info" | "success"}>({content: "", severity: "error"});
const [showError, setShowError] = useState(false);
-
-
let createVisTableRowsLocal = (rows: any[]) => {
if (visFields.length == 0) {
return rows;
diff --git a/yarn.lock b/yarn.lock
index 265b56d9..88db78c2 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -4479,10 +4479,10 @@ vega@^5.32.0:
vega-voronoi "~4.2.4"
vega-wordcloud "~4.1.6"
-vite@^5.4.15:
- version "5.4.15"
- resolved "https://registry.yarnpkg.com/vite/-/vite-5.4.15.tgz#2941547f10ebb4bf9b0fa0da863c06711eb7e5e5"
- integrity sha512-6ANcZRivqL/4WtwPGTKNaosuNJr5tWiftOC7liM7G9+rMb8+oeJeyzymDu4rTN93seySBmbjSfsS3Vzr19KNtA==
+vite@^5.4.19:
+ version "5.4.19"
+ resolved "https://registry.yarnpkg.com/vite/-/vite-5.4.19.tgz#20efd060410044b3ed555049418a5e7d1998f959"
+ integrity sha512-qO3aKv3HoQC8QKiNSTuUM1l9o/XX3+c+VTgLHbJWHZGeTPVAg2XwazI9UWzoxjIJCGCV2zU60uqMzjeLZuULqA==
dependencies:
esbuild "^0.21.3"
postcss "^8.4.43"