From a00d9a4d3f68be7a5abfc897ecdcaf4a6b5f682c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 13 May 2025 23:42:02 +0000 Subject: [PATCH 1/7] Bump vite from 5.4.15 to 5.4.19 Bumps [vite](https://github.com/vitejs/vite/tree/HEAD/packages/vite) from 5.4.15 to 5.4.19. - [Release notes](https://github.com/vitejs/vite/releases) - [Changelog](https://github.com/vitejs/vite/blob/v5.4.19/packages/vite/CHANGELOG.md) - [Commits](https://github.com/vitejs/vite/commits/v5.4.19/packages/vite) --- updated-dependencies: - dependency-name: vite dependency-version: 5.4.19 dependency-type: direct:development ... Signed-off-by: dependabot[bot] --- package.json | 2 +- yarn.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/package.json b/package.json index 962241e0..453c9dea 100644 --- a/package.json +++ b/package.json @@ -82,6 +82,6 @@ "globals": "^15.12.0", "sass": "^1.77.6", "typescript-eslint": "^8.16.0", - "vite": "^5.4.15" + "vite": "^5.4.19" } } diff --git a/yarn.lock b/yarn.lock index 265b56d9..88db78c2 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4479,10 +4479,10 @@ vega@^5.32.0: vega-voronoi "~4.2.4" vega-wordcloud "~4.1.6" -vite@^5.4.15: - version "5.4.15" - resolved "https://registry.yarnpkg.com/vite/-/vite-5.4.15.tgz#2941547f10ebb4bf9b0fa0da863c06711eb7e5e5" - integrity sha512-6ANcZRivqL/4WtwPGTKNaosuNJr5tWiftOC7liM7G9+rMb8+oeJeyzymDu4rTN93seySBmbjSfsS3Vzr19KNtA== +vite@^5.4.19: + version "5.4.19" + resolved "https://registry.yarnpkg.com/vite/-/vite-5.4.19.tgz#20efd060410044b3ed555049418a5e7d1998f959" + integrity sha512-qO3aKv3HoQC8QKiNSTuUM1l9o/XX3+c+VTgLHbJWHZGeTPVAg2XwazI9UWzoxjIJCGCV2zU60uqMzjeLZuULqA== dependencies: esbuild "^0.21.3" postcss "^8.4.43" From 4538055f041b1bfca5e8c0cc041250772e4b10d7 Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Thu, 15 May 2025 17:42:55 -0700 Subject: [PATCH 2/7] various fixes --- .../agents/agent_code_explanation.py | 3 +- .../agents/agent_py_concept_derive.py | 2 - .../agents/agent_py_data_transform.py | 1 - .../agents/agent_query_completion.py | 3 +- py-src/data_formulator/agents/agent_utils.py | 4 -- py-src/data_formulator/agents/client_utils.py | 1 - .../data_loader/kusto_data_loader.py | 43 ++----------------- .../data_loader/mysql_data_loader.py | 2 +- pyproject.toml | 2 +- requirements.txt | 2 +- src/views/DBTableManager.tsx | 2 +- 11 files changed, 10 insertions(+), 55 deletions(-) diff --git a/py-src/data_formulator/agents/agent_code_explanation.py b/py-src/data_formulator/agents/agent_code_explanation.py index 8d16a968..af348e3b 100644 --- a/py-src/data_formulator/agents/agent_code_explanation.py +++ b/py-src/data_formulator/agents/agent_code_explanation.py @@ -1,8 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import pandas as pd -from data_formulator.agents.agent_utils import generate_data_summary, extract_code_from_gpt_response +from data_formulator.agents.agent_utils import generate_data_summary import logging diff --git a/py-src/data_formulator/agents/agent_py_concept_derive.py b/py-src/data_formulator/agents/agent_py_concept_derive.py index 58181d85..f6e3e77f 100644 --- a/py-src/data_formulator/agents/agent_py_concept_derive.py +++ b/py-src/data_formulator/agents/agent_py_concept_derive.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-import json import time from data_formulator.agents.agent_utils import generate_data_summary, extract_code_from_gpt_response @@ -10,7 +9,6 @@ import traceback import logging -import datetime logger = logging.getLogger(__name__) diff --git a/py-src/data_formulator/agents/agent_py_data_transform.py b/py-src/data_formulator/agents/agent_py_data_transform.py index b3cc999e..8cc86daf 100644 --- a/py-src/data_formulator/agents/agent_py_data_transform.py +++ b/py-src/data_formulator/agents/agent_py_data_transform.py @@ -2,7 +2,6 @@ # Licensed under the MIT License. import json -import sys from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary, extract_code_from_gpt_response import data_formulator.py_sandbox as py_sandbox diff --git a/py-src/data_formulator/agents/agent_query_completion.py b/py-src/data_formulator/agents/agent_query_completion.py index 8beed90c..0dd6f494 100644 --- a/py-src/data_formulator/agents/agent_query_completion.py +++ b/py-src/data_formulator/agents/agent_query_completion.py @@ -1,10 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import pandas as pd import json -from data_formulator.agents.agent_utils import extract_code_from_gpt_response, extract_json_objects +from data_formulator.agents.agent_utils import extract_json_objects import re import logging diff --git a/py-src/data_formulator/agents/agent_utils.py b/py-src/data_formulator/agents/agent_utils.py index 2d518932..e18962e0 100644 --- a/py-src/data_formulator/agents/agent_utils.py +++ b/py-src/data_formulator/agents/agent_utils.py @@ -6,10 +6,6 @@ import pandas as pd import numpy as np -import base64 - -from pprint import pprint - import re def string_to_py_varname(var_str): diff --git a/py-src/data_formulator/agents/client_utils.py b/py-src/data_formulator/agents/client_utils.py index 94334aee..a0eb3558 100644 --- a/py-src/data_formulator/agents/client_utils.py +++ b/py-src/data_formulator/agents/client_utils.py @@ -1,4 +1,3 @@ -import os import litellm import openai from azure.identity import DefaultAzureCredential, get_bearer_token_provider diff --git a/py-src/data_formulator/data_loader/kusto_data_loader.py b/py-src/data_formulator/data_loader/kusto_data_loader.py index 210c3e68..9aca5dbb 100644 --- a/py-src/data_formulator/data_loader/kusto_data_loader.py +++ b/py-src/data_formulator/data_loader/kusto_data_loader.py @@ -51,41 +51,6 @@ def query(self, kql: str) -> pd.DataFrame: return dataframe_from_result_table(result.primary_results[0]) def list_tables(self) -> List[Dict[str, Any]]: - # first list functions (views) - query = ".show functions" - function_result_df = self.query(query) - - functions = [] - for func in function_result_df.to_dict(orient="records"): - func_name = func['Name'] - result = self.query(f".show function ['{func_name}'] schema as json").to_dict(orient="records") - schema = json.loads(result[0]['Schema']) - parameters = schema['InputParameters'] - columns = [{ - 'name': r["Name"], - 'type': r["Type"] - } for r in schema['OutputColumns']] - - # skip functions with parameters at the moment - if len(parameters) > 0: - continue - - sample_query = f"['{func_name}'] | take {10}" - sample_result = self.query(sample_query).to_dict(orient="records") - - function_metadata = { - "row_count": 0, - "columns": columns, - "parameters": parameters, - "sample_rows": sample_result - } - functions.append({ - "type": "function", - "name": func_name, - "metadata": function_metadata - }) - - # then list tables query = ".show tables" tables_df 
= self.query(query) @@ -101,8 +66,8 @@ def list_tables(self) -> List[Dict[str, Any]]: row_count_result = self.query(f".show table ['{table_name}'] details").to_dict(orient="records") row_count = row_count_result[0]["TotalRowCount"] - sample_query = f"['{table_name}'] | take {10}" - sample_result = self.query(sample_query).to_dict(orient="records") + sample_query = f"['{table_name}'] | take {5}" + sample_result = json.loads(self.query(sample_query).to_json(orient="records")) table_metadata = { "row_count": row_count, @@ -116,7 +81,7 @@ def list_tables(self) -> List[Dict[str, Any]]: "metadata": table_metadata }) - return functions + tables + return tables def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000) -> pd.DataFrame: if name_as is None: @@ -167,7 +132,7 @@ def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000) total_rows_ingested += len(chunk_df) def view_query_sample(self, query: str) -> str: - return self.query(query).head(10).to_dict(orient="records") + return json.loads(self.query(query).head(10).to_json(orient="records")) def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: # Sanitize the table name for SQL compatibility diff --git a/py-src/data_formulator/data_loader/mysql_data_loader.py b/py-src/data_formulator/data_loader/mysql_data_loader.py index 625204e3..37c08fc9 100644 --- a/py-src/data_formulator/data_loader/mysql_data_loader.py +++ b/py-src/data_formulator/data_loader/mysql_data_loader.py @@ -93,7 +93,7 @@ def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 1 """) def view_query_sample(self, query: str) -> str: - return self.duck_db_conn.execute(query).df().head(10).to_dict(orient="records") + return json.loads(self.duck_db_conn.execute(query).df().head(10).to_json(orient="records")) def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: # Execute the query and get results as a DataFrame diff --git a/pyproject.toml b/pyproject.toml index 96675706..2a4c3e2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,6 @@ classifiers = [ ] dependencies = [ - "autopep8", "jupyter", "pandas", "docker", @@ -31,6 +30,7 @@ dependencies = [ "flask-cors", "openai", "azure-identity", + "azure-kusto-data", "azure-keyvault-secrets", "python-dotenv", "vega_datasets", diff --git a/requirements.txt b/requirements.txt index 60068b75..e7919529 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -autopep8 jupyter pandas docker @@ -7,6 +6,7 @@ matplotlib flask openai azure-identity +azure-kusto-data azure-keyvault-secrets python-dotenv vega_datasets diff --git a/src/views/DBTableManager.tsx b/src/views/DBTableManager.tsx index a856122d..f9fac51d 100644 --- a/src/views/DBTableManager.tsx +++ b/src/views/DBTableManager.tsx @@ -294,7 +294,7 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function }, [errorMessage]) useEffect(() => { - if (dbTables.length == 0) { + if (!selectedTabKey.startsWith("dataLoader:") && dbTables.length == 0) { setSelectedTabKey(""); } else if (!selectedTabKey.startsWith("dataLoader:") && dbTables.find(t => t.name === selectedTabKey) == undefined) { setSelectedTabKey(dbTables[0].name); From 48f4ee2fcbacdf877f7927d196119cd1a716a363 Mon Sep 17 00:00:00 2001 From: slackroo Date: Wed, 28 May 2025 09:52:56 +0530 Subject: [PATCH 3/7] "Add S3 data loader support to DBTableManager and data formulator" --- .../data_formulator/data_loader/__init__.py | 6 +- .../data_loader/s3_data_loader.py | 188 
++++++++++++++++++ requirements.txt | 5 +- src/views/DBTableManager.tsx | 2 +- 4 files changed, 197 insertions(+), 4 deletions(-) create mode 100644 py-src/data_formulator/data_loader/s3_data_loader.py diff --git a/py-src/data_formulator/data_loader/__init__.py b/py-src/data_formulator/data_loader/__init__.py index 145ac806..19982df5 100644 --- a/py-src/data_formulator/data_loader/__init__.py +++ b/py-src/data_formulator/data_loader/__init__.py @@ -1,10 +1,12 @@ from data_formulator.data_loader.external_data_loader import ExternalDataLoader from data_formulator.data_loader.mysql_data_loader import MySQLDataLoader from data_formulator.data_loader.kusto_data_loader import KustoDataLoader +from data_formulator.data_loader.s3_data_loader import S3DataLoader DATA_LOADERS = { "mysql": MySQLDataLoader, - "kusto": KustoDataLoader + "kusto": KustoDataLoader, + "s3": S3DataLoader, } -__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "DATA_LOADERS"] \ No newline at end of file +__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "S3DataLoader", "DATA_LOADERS"] diff --git a/py-src/data_formulator/data_loader/s3_data_loader.py b/py-src/data_formulator/data_loader/s3_data_loader.py new file mode 100644 index 00000000..6ce9cd42 --- /dev/null +++ b/py-src/data_formulator/data_loader/s3_data_loader.py @@ -0,0 +1,188 @@ +import json +import pandas as pd +import duckdb +import os + +from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name +from typing import Dict, Any, List + +class S3DataLoader(ExternalDataLoader): + + @staticmethod + def list_params() -> List[Dict[str, Any]]: + params_list = [ + {"name": "aws_access_key_id", "type": "string", "required": True, "default": "", "description": "AWS access key ID"}, + {"name": "aws_secret_access_key", "type": "string", "required": True, "default": "", "description": "AWS secret access key"}, + {"name": "aws_session_token", "type": "string", "required": False, "default": "", "description": "AWS session token (required for temporary credentials)"}, + {"name": "region_name", "type": "string", "required": True, "default": "us-east-1", "description": "AWS region name"}, + {"name": "bucket", "type": "string", "required": True, "default": "", "description": "S3 bucket name"} + ] + return params_list + + def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + self.params = params + self.duck_db_conn = duck_db_conn + + # Extract parameters + self.aws_access_key_id = params.get("aws_access_key_id", "") + self.aws_secret_access_key = params.get("aws_secret_access_key", "") + self.aws_session_token = params.get("aws_session_token", "") + self.region_name = params.get("region_name", "us-east-1") + self.bucket = params.get("bucket", "") + + # Install and load the httpfs extension for S3 access + self.duck_db_conn.install_extension("httpfs") + self.duck_db_conn.load_extension("httpfs") + + # Set AWS credentials for DuckDB + self.duck_db_conn.execute(f"SET s3_region='{self.region_name}'") + self.duck_db_conn.execute(f"SET s3_access_key_id='{self.aws_access_key_id}'") + self.duck_db_conn.execute(f"SET s3_secret_access_key='{self.aws_secret_access_key}'") + if self.aws_session_token: # Add this block + self.duck_db_conn.execute(f"SET s3_session_token='{self.aws_session_token}'") + + def list_tables(self) -> List[Dict[str, Any]]: + # Use boto3 to list objects in the bucket + import boto3 + + s3_client = boto3.client( + 's3', + 
aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + aws_session_token=self.aws_session_token if self.aws_session_token else None, + region_name=self.region_name + ) + + # List objects in the bucket + response = s3_client.list_objects_v2(Bucket=self.bucket) + + results = [] + + if 'Contents' in response: + for obj in response['Contents']: + key = obj['Key'] + + # Skip directories and non-data files + if key.endswith('/') or not self._is_supported_file(key): + continue + + # Create S3 URL + s3_url = f"s3://{self.bucket}/{key}" + + try: + # Choose the appropriate read function based on file extension + if s3_url.lower().endswith('.parquet'): + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_parquet('{s3_url}') LIMIT 10").df() + elif s3_url.lower().endswith('.json') or s3_url.lower().endswith('.jsonl'): + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_json_auto('{s3_url}') LIMIT 10").df() + elif s3_url.lower().endswith('.csv'): # Default to CSV for other formats + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{s3_url}') LIMIT 10").df() + + # Get column information + columns = [{ + 'name': col, + 'type': str(sample_df[col].dtype) + } for col in sample_df.columns] + + # Get sample data + sample_rows = json.loads(sample_df.to_json(orient="records")) + + # Estimate row count (this is approximate for CSV files) + row_count = self._estimate_row_count(s3_url) + + table_metadata = { + "row_count": row_count, + "columns": columns, + "sample_rows": sample_rows + } + + results.append({ + "name": s3_url, + "metadata": table_metadata + }) + except Exception as e: + # Skip files that can't be read + print(f"Error reading {s3_url}: {e}") + continue + + return results + + def _is_supported_file(self, key: str) -> bool: + """Check if the file type is supported by DuckDB.""" + supported_extensions = ['.csv', '.parquet', '.json', '.jsonl'] + return any(key.lower().endswith(ext) for ext in supported_extensions) + + def _estimate_row_count(self, s3_url: str) -> int: + """Estimate the number of rows in a file.""" + try: + # For parquet files, we can get the exact count + if s3_url.lower().endswith('.parquet'): + count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM read_parquet('{s3_url}')").fetchone()[0] + return count + + # For CSV files, we'll sample the file to estimate size + sample_size = 1000 + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{s3_url}') LIMIT {sample_size}").df() + + # Get file size from S3 + import boto3 + s3_client = boto3.client( + 's3', + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + aws_session_token=self.aws_session_token if self.aws_session_token else None, + region_name=self.region_name + ) + + key = s3_url.replace(f"s3://{self.bucket}/", "") + response = s3_client.head_object(Bucket=self.bucket, Key=key) + file_size = response['ContentLength'] + + # Estimate based on sample size and file size + if len(sample_df) > 0: + # Calculate average row size in bytes + avg_row_size = file_size / len(sample_df) + estimated_rows = int(file_size / avg_row_size) + return min(estimated_rows, 1000000) # Cap at 1 million for UI performance + + return 0 + except Exception as e: + print(f"Error estimating row count for {s3_url}: {e}") + return 0 + + def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000): + if name_as is None: + name_as = table_name.split('/')[-1].split('.')[0] + + name_as = 
sanitize_table_name(name_as) + + # Determine file type and use appropriate DuckDB function + if table_name.lower().endswith('.csv'): + self.duck_db_conn.execute(f""" + CREATE OR REPLACE TABLE main.{name_as} AS + SELECT * FROM read_csv_auto('{table_name}') + LIMIT {size} + """) + elif table_name.lower().endswith('.parquet'): + self.duck_db_conn.execute(f""" + CREATE OR REPLACE TABLE main.{name_as} AS + SELECT * FROM read_parquet('{table_name}') + LIMIT {size} + """) + elif table_name.lower().endswith('.json') or table_name.lower().endswith('.jsonl'): + self.duck_db_conn.execute(f""" + CREATE OR REPLACE TABLE main.{name_as} AS + SELECT * FROM read_json_auto('{table_name}') + LIMIT {size} + """) + else: + raise ValueError(f"Unsupported file type: {table_name}") + + def view_query_sample(self, query: str) -> List[Dict[str, Any]]: + return self.duck_db_conn.execute(query).df().head(10).to_dict(orient="records") + + def ingest_data_from_query(self, query: str, name_as: str): + # Execute the query and get results as a DataFrame + df = self.duck_db_conn.execute(query).df() + # Use the base class's method to ingest the DataFrame + self.ingest_df_to_duckdb(df, name_as) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 60068b75..b936b23c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,8 +8,11 @@ flask openai azure-identity azure-keyvault-secrets +azure-kusto-data +azure-storage-blob python-dotenv vega_datasets litellm duckdb --e . #also need to install data formulator itself \ No newline at end of file +boto3 +-e . #also need to install data formulator itself diff --git a/src/views/DBTableManager.tsx b/src/views/DBTableManager.tsx index a856122d..e2902027 100644 --- a/src/views/DBTableManager.tsx +++ b/src/views/DBTableManager.tsx @@ -653,7 +653,7 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function sx={{px: 0.5}} > connect external data - {["file upload", "mysql", "kusto"].map((dataLoaderType, i) => ( + {["file upload", "mysql", "kusto","s3"].map((dataLoaderType, i) => ( Date: Thu, 29 May 2025 17:44:40 -0700 Subject: [PATCH 4/7] adding azure blob data loader --- .../data_formulator/data_loader/__init__.py | 4 +- .../data_loader/azure_blob_data_loader.py | 363 ++++++++++++++++++ .../data_loader/external_data_loader.py | 5 + .../data_loader/kusto_data_loader.py | 35 ++ .../data_loader/mysql_data_loader.py | 32 ++ .../data_loader/s3_data_loader.py | 93 +++-- src/views/DBTableManager.tsx | 4 +- src/views/VisualizationView.tsx | 2 - 8 files changed, 507 insertions(+), 31 deletions(-) create mode 100644 py-src/data_formulator/data_loader/azure_blob_data_loader.py diff --git a/py-src/data_formulator/data_loader/__init__.py b/py-src/data_formulator/data_loader/__init__.py index 19982df5..6aa797c7 100644 --- a/py-src/data_formulator/data_loader/__init__.py +++ b/py-src/data_formulator/data_loader/__init__.py @@ -2,11 +2,13 @@ from data_formulator.data_loader.mysql_data_loader import MySQLDataLoader from data_formulator.data_loader.kusto_data_loader import KustoDataLoader from data_formulator.data_loader.s3_data_loader import S3DataLoader +from data_formulator.data_loader.azure_blob_data_loader import AzureBlobDataLoader DATA_LOADERS = { "mysql": MySQLDataLoader, "kusto": KustoDataLoader, "s3": S3DataLoader, + "azure_blob": AzureBlobDataLoader, } -__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "S3DataLoader", "DATA_LOADERS"] +__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "S3DataLoader", 
"AzureBlobDataLoader", "DATA_LOADERS"] diff --git a/py-src/data_formulator/data_loader/azure_blob_data_loader.py b/py-src/data_formulator/data_loader/azure_blob_data_loader.py new file mode 100644 index 00000000..081eb6d2 --- /dev/null +++ b/py-src/data_formulator/data_loader/azure_blob_data_loader.py @@ -0,0 +1,363 @@ +import json +import pandas as pd +import duckdb +import os + +from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name +from typing import Dict, Any, List + +class AzureBlobDataLoader(ExternalDataLoader): + + @staticmethod + def list_params() -> List[Dict[str, Any]]: + params_list = [ + {"name": "account_name", "type": "string", "required": True, "default": "", "description": "Azure storage account name"}, + {"name": "container_name", "type": "string", "required": True, "default": "", "description": "Azure blob container name"}, + {"name": "connection_string", "type": "string", "required": False, "default": "", "description": "Azure storage connection string (alternative to account_name + credentials)"}, + {"name": "credential_chain", "type": "string", "required": False, "default": "cli;managed_identity;env", "description": "Ordered list of Azure credential providers (cli;managed_identity;env)"}, + {"name": "account_key", "type": "string", "required": False, "default": "", "description": "Azure storage account key"}, + {"name": "sas_token", "type": "string", "required": False, "default": "", "description": "Azure SAS token"}, + {"name": "endpoint", "type": "string", "required": False, "default": "blob.core.windows.net", "description": "Azure endpoint override"} + ] + return params_list + + @staticmethod + def auth_instructions() -> str: + return """**Authentication Options (choose one)** + +**Option 1 - Connection String (Simplest)** +- Get connection string from Azure Portal > Storage Account > Access keys +- Use `connection_string` parameter with full connection string +- `account_name` can be omitted when using connection string + +**Option 2 - Account Key** +- Get account key from Azure Portal > Storage Account > Access keys +- Use `account_name` + `account_key` parameters +- Provides full access to storage account + +**Option 3 - SAS Token (Recommended for limited access)** +- Generate SAS token from Azure Portal > Storage Account > Shared access signature +- Use `account_name` + `sas_token` parameters +- Can be time-limited and permission-scoped + +**Option 4 - Credential Chain (Most Secure)** +- Use `account_name` + `container_name` only (no explicit credentials) +- Requires Azure CLI login (`az login`) or Managed Identity +- Default chain: `cli;managed_identity;env` +- Customize with `credential_chain` parameter + +**Additional Options** +- `endpoint`: Custom endpoint (default: `blob.core.windows.net`) +- For Azure Government: `blob.core.usgovcloudapi.net` +- For Azure China: `blob.core.chinacloudapi.cn`""" + + def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + self.params = params + self.duck_db_conn = duck_db_conn + + # Extract parameters + self.account_name = params.get("account_name", "") + self.container_name = params.get("container_name", "") + self.connection_string = params.get("connection_string", "") + self.credential_chain = params.get("credential_chain", "cli;managed_identity;env") + self.account_key = params.get("account_key", "") + self.sas_token = params.get("sas_token", "") + self.endpoint = params.get("endpoint", "blob.core.windows.net") + + # Install and load the azure 
extension + self.duck_db_conn.install_extension("azure") + self.duck_db_conn.load_extension("azure") + + # Set up Azure authentication using secrets (preferred method) + self._setup_azure_authentication() + + def _setup_azure_authentication(self): + """Set up Azure authentication using DuckDB secrets.""" + if self.connection_string: + # Use connection string authentication + self.duck_db_conn.execute(f""" + CREATE OR REPLACE SECRET azure_secret ( + TYPE AZURE, + CONNECTION_STRING '{self.connection_string}' + ) + """) + elif self.account_key: + # Use account key authentication + self.duck_db_conn.execute(f""" + CREATE OR REPLACE SECRET azure_secret ( + TYPE AZURE, + ACCOUNT_NAME '{self.account_name}', + ACCOUNT_KEY '{self.account_key}' + ) + """) + elif self.sas_token: + # Use SAS token authentication + self.duck_db_conn.execute(f""" + CREATE OR REPLACE SECRET azure_secret ( + TYPE AZURE, + ACCOUNT_NAME '{self.account_name}', + SAS_TOKEN '{self.sas_token}' + ) + """) + else: + # Use credential chain authentication (default) + self.duck_db_conn.execute(f""" + CREATE OR REPLACE SECRET azure_secret ( + TYPE AZURE, + PROVIDER credential_chain, + ACCOUNT_NAME '{self.account_name}', + CHAIN '{self.credential_chain}' + ) + """) + + def list_tables(self) -> List[Dict[str, Any]]: + # Use Azure SDK to list blobs in the container + from azure.storage.blob import BlobServiceClient + + # Create blob service client based on authentication method + if self.connection_string: + blob_service_client = BlobServiceClient.from_connection_string(self.connection_string) + elif self.account_key: + blob_service_client = BlobServiceClient( + account_url=f"https://{self.account_name}.{self.endpoint}", + credential=self.account_key + ) + elif self.sas_token: + blob_service_client = BlobServiceClient( + account_url=f"https://{self.account_name}.{self.endpoint}", + credential=self.sas_token + ) + else: + # Use default credential chain + from azure.identity import DefaultAzureCredential + credential = DefaultAzureCredential() + blob_service_client = BlobServiceClient( + account_url=f"https://{self.account_name}.{self.endpoint}", + credential=credential + ) + + container_client = blob_service_client.get_container_client(self.container_name) + + # List blobs in the container + blob_list = container_client.list_blobs() + + results = [] + + for blob in blob_list: + blob_name = blob.name + + # Skip directories and non-data files + if blob_name.endswith('/') or not self._is_supported_file(blob_name): + continue + + # Create Azure blob URL + azure_url = f"az://{self.account_name}.{self.endpoint}/{self.container_name}/{blob_name}" + + try: + # Choose the appropriate read function based on file extension + if azure_url.lower().endswith('.parquet'): + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_parquet('{azure_url}') LIMIT 10").df() + elif azure_url.lower().endswith('.json') or azure_url.lower().endswith('.jsonl'): + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_json_auto('{azure_url}') LIMIT 10").df() + elif azure_url.lower().endswith('.csv'): + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{azure_url}') LIMIT 10").df() + + # Get column information + columns = [{ + 'name': col, + 'type': str(sample_df[col].dtype) + } for col in sample_df.columns] + + # Get sample data + sample_rows = json.loads(sample_df.to_json(orient="records")) + + # Estimate row count + row_count = self._estimate_row_count(azure_url, blob) + + table_metadata = { + "row_count": row_count, + "columns": columns, 
+ "sample_rows": sample_rows + } + + results.append({ + "name": azure_url, + "metadata": table_metadata + }) + except Exception as e: + # Skip files that can't be read + print(f"Error reading {azure_url}: {e}") + continue + + return results + + def _is_supported_file(self, blob_name: str) -> bool: + """Check if the file type is supported by DuckDB.""" + supported_extensions = ['.csv', '.parquet', '.json', '.jsonl'] + return any(blob_name.lower().endswith(ext) for ext in supported_extensions) + + def _estimate_row_count(self, azure_url: str, blob_properties=None) -> int: + """Estimate the number of rows in a file using intelligent strategies.""" + try: + file_extension = azure_url.lower().split('.')[-1] + + # For parquet files, use metadata to get exact count efficiently + if file_extension == 'parquet': + try: + # Use DuckDB's parquet_file_metadata to get exact row count without full scan + metadata = self.duck_db_conn.execute( + f"SELECT num_rows FROM parquet_file_metadata('{azure_url}')" + ).fetchone() + if metadata and metadata[0] is not None: + return metadata[0] + except Exception as parquet_error: + print(f"Failed to get parquet metadata for {azure_url}: {parquet_error}") + # Fall back to counting (expensive but accurate) + try: + count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM read_parquet('{azure_url}')").fetchone()[0] + return count + except Exception: + pass + + # For CSV, JSON, and JSONL files, use intelligent sampling + elif file_extension in ['csv', 'json', 'jsonl']: + return self._estimate_rows_by_sampling(azure_url, blob_properties, file_extension) + + return 0 + + except Exception as e: + print(f"Error estimating row count for {azure_url}: {e}") + return 0 + + def _estimate_rows_by_sampling(self, azure_url: str, blob_properties, file_extension: str) -> int: + """Estimate row count for text-based files using sampling and file size.""" + try: + # Get file size from blob properties if available + file_size_bytes = None + if blob_properties and hasattr(blob_properties, 'size'): + file_size_bytes = blob_properties.size + + # If no file size available, try a different approach + if file_size_bytes is None: + # Sample first 10,000 rows and extrapolate if needed + return self._estimate_by_row_sampling(azure_url, file_extension) + + # Sample approach: read first N rows and estimate based on size + sample_size = min(10000, file_size_bytes // 100) # Adaptive sample size + sample_size = max(1000, sample_size) # At least 1000 rows + + try: + if file_extension == 'csv': + sample_df = self.duck_db_conn.execute( + f"SELECT * FROM read_csv_auto('{azure_url}') LIMIT {sample_size}" + ).df() + elif file_extension in ['json', 'jsonl']: + sample_df = self.duck_db_conn.execute( + f"SELECT * FROM read_json_auto('{azure_url}') LIMIT {sample_size}" + ).df() + else: + return 0 + + sample_rows = len(sample_df) + if sample_rows == 0: + return 0 + + # If we got fewer rows than requested, that's probably all there is + if sample_rows < sample_size: + return sample_rows + + # Estimate bytes per row from sample + # For CSV: assume average line length based on file size + if file_extension == 'csv': + # Rough estimate: file_size / (sample_rows * estimated_line_overhead) + # CSV overhead includes delimiters, quotes, newlines + estimated_bytes_per_row = file_size_bytes / sample_rows * (sample_size / file_size_bytes) + estimated_total_rows = int(file_size_bytes / max(estimated_bytes_per_row, 50)) # Min 50 bytes per row + else: + # For JSON: more complex structure, use conservative estimate + # Assume 
JSON overhead is higher + estimated_bytes_per_row = file_size_bytes / sample_rows * (sample_size / file_size_bytes) + estimated_total_rows = int(file_size_bytes / max(estimated_bytes_per_row, 100)) # Min 100 bytes per row + + # Apply reasonable bounds + estimated_total_rows = max(sample_rows, estimated_total_rows) # At least as many as we sampled + estimated_total_rows = min(estimated_total_rows, file_size_bytes // 10) # Max based on very small rows + + return estimated_total_rows + + except Exception as e: + print(f"Error in size-based estimation for {azure_url}: {e}") + return self._estimate_by_row_sampling(azure_url, file_extension) + + except Exception as e: + print(f"Error in sampling estimation for {azure_url}: {e}") + return 0 + + def _estimate_by_row_sampling(self, azure_url: str, file_extension: str) -> int: + """Fallback method: sample rows without file size info.""" + try: + # Try to read a reasonable sample and see if we get less than requested + # This indicates we've read the whole file + test_limit = 50000 + + if file_extension == 'csv': + sample_df = self.duck_db_conn.execute( + f"SELECT * FROM read_csv_auto('{azure_url}') LIMIT {test_limit}" + ).df() + elif file_extension in ['json', 'jsonl']: + sample_df = self.duck_db_conn.execute( + f"SELECT * FROM read_json_auto('{azure_url}') LIMIT {test_limit}" + ).df() + else: + return 0 + + sample_rows = len(sample_df) + + # If we got fewer rows than the limit, that's likely the total + if sample_rows < test_limit: + return sample_rows + + # Otherwise, we can't estimate accurately without more information + # Return the sample size as a lower bound + return sample_rows + + except Exception as e: + print(f"Error in row sampling for {azure_url}: {e}") + return 0 + + def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000): + if name_as is None: + name_as = table_name.split('/')[-1].split('.')[0] + + name_as = sanitize_table_name(name_as) + + # Determine file type and use appropriate DuckDB function + if table_name.lower().endswith('.csv'): + self.duck_db_conn.execute(f""" + CREATE OR REPLACE TABLE main.{name_as} AS + SELECT * FROM read_csv_auto('{table_name}') + LIMIT {size} + """) + elif table_name.lower().endswith('.parquet'): + self.duck_db_conn.execute(f""" + CREATE OR REPLACE TABLE main.{name_as} AS + SELECT * FROM read_parquet('{table_name}') + LIMIT {size} + """) + elif table_name.lower().endswith('.json') or table_name.lower().endswith('.jsonl'): + self.duck_db_conn.execute(f""" + CREATE OR REPLACE TABLE main.{name_as} AS + SELECT * FROM read_json_auto('{table_name}') + LIMIT {size} + """) + else: + raise ValueError(f"Unsupported file type: {table_name}") + + def view_query_sample(self, query: str) -> List[Dict[str, Any]]: + return self.duck_db_conn.execute(query).df().head(10).to_dict(orient="records") + + def ingest_data_from_query(self, query: str, name_as: str): + # Execute the query and get results as a DataFrame + df = self.duck_db_conn.execute(query).df() + # Use the base class's method to ingest the DataFrame + self.ingest_df_to_duckdb(df, name_as) \ No newline at end of file diff --git a/py-src/data_formulator/data_loader/external_data_loader.py b/py-src/data_formulator/data_loader/external_data_loader.py index 540f1748..5d55796f 100644 --- a/py-src/data_formulator/data_loader/external_data_loader.py +++ b/py-src/data_formulator/data_loader/external_data_loader.py @@ -67,6 +67,11 @@ def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str): def list_params() -> List[Dict[str, Any]]: 
pass + @staticmethod + @abstractmethod + def auth_instructions() -> str: + pass + @abstractmethod def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): pass diff --git a/py-src/data_formulator/data_loader/kusto_data_loader.py b/py-src/data_formulator/data_loader/kusto_data_loader.py index 9aca5dbb..4d363d73 100644 --- a/py-src/data_formulator/data_loader/kusto_data_loader.py +++ b/py-src/data_formulator/data_loader/kusto_data_loader.py @@ -23,6 +23,41 @@ def list_params() -> bool: {"name": "tenant_id", "type": "string", "required": False, "description": "only necessary for AppKey auth"} ] return params_list + + @staticmethod + def auth_instructions() -> str: + return """ +Azure Kusto Authentication Instructions: + +This data loader supports two authentication methods: + +**Method 1: Azure CLI Authentication (Recommended for development)** +1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli +2. Run `az login` in your terminal to authenticate +3. Ensure you have access to the specified Kusto cluster and database +4. Leave client_id, client_secret, and tenant_id parameters empty + +**Method 2: Application Key Authentication (Recommended for production)** +1. Register an Azure AD application in your tenant +2. Generate a client secret for the application +3. Grant the application appropriate permissions to your Kusto cluster: + - Go to your Kusto cluster in Azure Portal + - Navigate to Permissions > Add + - Add your application as a user with appropriate role (e.g., "AllDatabasesViewer" for read access) +4. Provide the following parameters: + - client_id: Application (client) ID from your Azure AD app registration + - client_secret: Client secret value you generated + - tenant_id: Directory (tenant) ID from your Azure AD + +**Required Parameters:** +- kusto_cluster: Your Kusto cluster URI (e.g., "https://mycluster.region.kusto.windows.net") +- kusto_database: Name of the database you want to access + +**Troubleshooting:** +- If authentication fails, ensure you have the correct permissions on the Kusto cluster +- For CLI auth, make sure you're logged in with `az account show` +- For app key auth, verify your client_id, client_secret, and tenant_id are correct + """ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): diff --git a/py-src/data_formulator/data_loader/mysql_data_loader.py b/py-src/data_formulator/data_loader/mysql_data_loader.py index 37c08fc9..314c36f3 100644 --- a/py-src/data_formulator/data_loader/mysql_data_loader.py +++ b/py-src/data_formulator/data_loader/mysql_data_loader.py @@ -18,6 +18,38 @@ def list_params() -> bool: ] return params_list + @staticmethod + def auth_instructions() -> str: + return """ +MySQL Connection Instructions: + +1. **Local MySQL Setup:** + - Ensure MySQL server is running on your machine + - Default connection: host='localhost', user='root' + - If you haven't set a root password, leave password field empty + +2. **Remote MySQL Connection:** + - Obtain host address, username, and password from your database administrator + - Ensure the MySQL server allows remote connections + - Check that your IP is whitelisted in MySQL's user permissions + +3. **Common Connection Parameters:** + - user: Your MySQL username (default: 'root') + - password: Your MySQL password (leave empty if no password set) + - host: MySQL server address (default: 'localhost') + - database: Target database name to connect to + +4. 
**Troubleshooting:** + - Verify MySQL service is running: `brew services list` (macOS) or `sudo systemctl status mysql` (Linux) + - Test connection: `mysql -u [username] -p -h [host] [database]` + - Common issues: Wrong credentials, server not running, firewall blocking connection + +5. **Security Notes:** + - Use dedicated database users with limited privileges for applications + - Avoid using root user for application connections + - Consider using SSL connections for remote databases + """ + def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): self.params = params self.duck_db_conn = duck_db_conn diff --git a/py-src/data_formulator/data_loader/s3_data_loader.py b/py-src/data_formulator/data_loader/s3_data_loader.py index 6ce9cd42..dcfacfe3 100644 --- a/py-src/data_formulator/data_loader/s3_data_loader.py +++ b/py-src/data_formulator/data_loader/s3_data_loader.py @@ -19,6 +19,70 @@ def list_params() -> List[Dict[str, Any]]: ] return params_list + @staticmethod + def auth_instructions() -> str: + return """ +To connect to Amazon S3, you'll need the following AWS credentials and configuration: + +**Required Parameters:** +- **AWS Access Key ID**: Your AWS access key identifier +- **AWS Secret Access Key**: Your AWS secret access key +- **Region Name**: The AWS region where your S3 bucket is located (e.g., 'us-east-1', 'us-west-2') +- **Bucket**: The name of your S3 bucket + +**Optional Parameters:** +- **AWS Session Token**: Required only if using temporary credentials (e.g., from AWS STS or IAM roles) + +**How to Get AWS Credentials:** + +1. **AWS IAM User (Recommended for programmatic access):** + - Go to AWS Console → IAM → Users + - Create a new user or select existing user + - Go to "Security credentials" tab + - Click "Create access key" + - Choose "Application running outside AWS" + - Save both the Access Key ID and Secret Access Key securely + +2. **Required S3 Permissions:** + Your IAM user/role needs these permissions for the target bucket: + ```json + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::your-bucket-name", + "arn:aws:s3:::your-bucket-name/*" + ] + } + ] + } + ``` + +3. **Finding Your Region:** + - Go to S3 Console → Select your bucket → Properties + - Look for "AWS Region" in the bucket overview + +**Security Best Practices:** +- Never share your secret access key +- Use IAM roles when possible instead of long-term access keys +- Consider using temporary credentials with session tokens for enhanced security +- Regularly rotate your access keys +- Use the principle of least privilege for S3 permissions + +**Supported File Formats:** +- CSV files (.csv) +- Parquet files (.parquet) +- JSON files (.json, .jsonl) + +The connector will automatically detect file types and load them appropriately using DuckDB's S3 integration. 
+ """ + def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): self.params = params self.duck_db_conn = duck_db_conn @@ -120,32 +184,9 @@ def _estimate_row_count(self, s3_url: str) -> int: count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM read_parquet('{s3_url}')").fetchone()[0] return count - # For CSV files, we'll sample the file to estimate size - sample_size = 1000 - sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{s3_url}') LIMIT {sample_size}").df() - - # Get file size from S3 - import boto3 - s3_client = boto3.client( - 's3', - aws_access_key_id=self.aws_access_key_id, - aws_secret_access_key=self.aws_secret_access_key, - aws_session_token=self.aws_session_token if self.aws_session_token else None, - region_name=self.region_name - ) - - key = s3_url.replace(f"s3://{self.bucket}/", "") - response = s3_client.head_object(Bucket=self.bucket, Key=key) - file_size = response['ContentLength'] - - # Estimate based on sample size and file size - if len(sample_df) > 0: - # Calculate average row size in bytes - avg_row_size = file_size / len(sample_df) - estimated_rows = int(file_size / avg_row_size) - return min(estimated_rows, 1000000) # Cap at 1 million for UI performance - - return 0 + # For CSV, JSON, and JSONL files, we'll skip row count + if s3_url.lower().endswith('.csv') or s3_url.lower().endswith('.json') or s3_url.lower().endswith('.jsonl'): + return 0 except Exception as e: print(f"Error estimating row count for {s3_url}: {e}") return 0 diff --git a/src/views/DBTableManager.tsx b/src/views/DBTableManager.tsx index 6d8e5f0d..cff9b2ac 100644 --- a/src/views/DBTableManager.tsx +++ b/src/views/DBTableManager.tsx @@ -653,7 +653,7 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function sx={{px: 0.5}} > connect external data - {["file upload", "mysql", "kusto","s3"].map((dataLoaderType, i) => ( + {["file upload", ...Object.keys(dataLoaderParamDefs ?? {})].map((dataLoaderType, i) => ( toggleDisplaySamples(tableName)}> diff --git a/src/views/VisualizationView.tsx b/src/views/VisualizationView.tsx index 7460c0bc..f3443876 100644 --- a/src/views/VisualizationView.tsx +++ b/src/views/VisualizationView.tsx @@ -363,8 +363,6 @@ export const ChartEditorFC: FC<{ cachedCandidates: DictTable[], const [errorMessage, setErrorMessage] = useState<{content: string, severity: "error" | "warning" | "info" | "success"}>({content: "", severity: "error"}); const [showError, setShowError] = useState(false); - - let createVisTableRowsLocal = (rows: any[]) => { if (visFields.length == 0) { return rows; From ebdc0f0994ed9b91e11bbbacb9edbf89345e1601 Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Fri, 30 May 2025 11:24:23 -0700 Subject: [PATCH 5/7] data loader updates --- README.md | 4 +- .../data_loader/azure_blob_data_loader.py | 50 +++++++------ .../data_loader/kusto_data_loader.py | 56 ++++++-------- .../data_loader/mysql_data_loader.py | 16 ++-- .../data_loader/s3_data_loader.py | 75 ++++++------------- py-src/data_formulator/tables_routes.py | 5 +- pyproject.toml | 2 +- src/views/DBTableManager.tsx | 57 ++++++++++++-- 8 files changed, 137 insertions(+), 128 deletions(-) diff --git a/README.md b/README.md index d244ad77..fd57a5d0 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,9 @@ Transform data and create rich visualizations iteratively with AI 🪄. 
Try Data ## News 🔥🔥🔥 -- [05-13-2025] Data Formulator 0.2.1: External Data Loader +- [05-13-2025] Data Formulator 0.2.3: External Data Loader - We introduced external data loader class to make import data easier. [Readme](https://github.com/microsoft/data-formulator/tree/main/py-src/data_formulator/data_loader) and [Demo](https://github.com/microsoft/data-formulator/pull/155) - - Example data loaders from MySQL and Azure Data Explorer (Kusto) are provided. + - Current data loaders: MySQL, Azure Data Explorer (Kusto), Azure Blob and Amazon S3 (json, parquet, csv). - Call for action [link](https://github.com/microsoft/data-formulator/issues/156): - Users: let us know which data source you'd like to load data from. - Developers: let's build more data loaders. diff --git a/py-src/data_formulator/data_loader/azure_blob_data_loader.py b/py-src/data_formulator/data_loader/azure_blob_data_loader.py index 081eb6d2..e2e2f78f 100644 --- a/py-src/data_formulator/data_loader/azure_blob_data_loader.py +++ b/py-src/data_formulator/data_loader/azure_blob_data_loader.py @@ -23,33 +23,39 @@ def list_params() -> List[Dict[str, Any]]: @staticmethod def auth_instructions() -> str: - return """**Authentication Options (choose one)** + return """Authentication Options (choose one) -**Option 1 - Connection String (Simplest)** -- Get connection string from Azure Portal > Storage Account > Access keys -- Use `connection_string` parameter with full connection string -- `account_name` can be omitted when using connection string +Option 1 - Connection String (Simplest) + - Get connection string from Azure Portal > Storage Account > Access keys + - Use `connection_string` parameter with full connection string + - `account_name` can be omitted when using connection string -**Option 2 - Account Key** -- Get account key from Azure Portal > Storage Account > Access keys -- Use `account_name` + `account_key` parameters -- Provides full access to storage account +Option 2 - Account Key + - Get account key from Azure Portal > Storage Account > Access keys + - Use `account_name` + `account_key` parameters + - Provides full access to storage account -**Option 3 - SAS Token (Recommended for limited access)** -- Generate SAS token from Azure Portal > Storage Account > Shared access signature -- Use `account_name` + `sas_token` parameters -- Can be time-limited and permission-scoped +Option 3 - SAS Token (Recommended for limited access) + - Generate SAS token from Azure Portal > Storage Account > Shared access signature + - Use `account_name` + `sas_token` parameters + - Can be time-limited and permission-scoped -**Option 4 - Credential Chain (Most Secure)** -- Use `account_name` + `container_name` only (no explicit credentials) -- Requires Azure CLI login (`az login`) or Managed Identity -- Default chain: `cli;managed_identity;env` -- Customize with `credential_chain` parameter +Option 4 - Credential Chain (Most Secure) + - Use `account_name` + `container_name` only (no explicit credentials) + - Requires Azure CLI login (`az login` in terminal) or Managed Identity + - Default chain: `cli;managed_identity;env` + - Customize with `credential_chain` parameter -**Additional Options** -- `endpoint`: Custom endpoint (default: `blob.core.windows.net`) -- For Azure Government: `blob.core.usgovcloudapi.net` -- For Azure China: `blob.core.chinacloudapi.cn`""" +Additional Options + - `endpoint`: Custom endpoint (default: `blob.core.windows.net`) + - For Azure Government: `blob.core.usgovcloudapi.net` + - For Azure China: 
`blob.core.chinacloudapi.cn` + +Supported File Formats: + - CSV files (.csv) + - Parquet files (.parquet) + - JSON files (.json, .jsonl) +""" def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): self.params = params diff --git a/py-src/data_formulator/data_loader/kusto_data_loader.py b/py-src/data_formulator/data_loader/kusto_data_loader.py index 4d363d73..b0ed936c 100644 --- a/py-src/data_formulator/data_loader/kusto_data_loader.py +++ b/py-src/data_formulator/data_loader/kusto_data_loader.py @@ -26,38 +26,30 @@ def list_params() -> bool: @staticmethod def auth_instructions() -> str: - return """ -Azure Kusto Authentication Instructions: - -This data loader supports two authentication methods: - -**Method 1: Azure CLI Authentication (Recommended for development)** -1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli -2. Run `az login` in your terminal to authenticate -3. Ensure you have access to the specified Kusto cluster and database -4. Leave client_id, client_secret, and tenant_id parameters empty - -**Method 2: Application Key Authentication (Recommended for production)** -1. Register an Azure AD application in your tenant -2. Generate a client secret for the application -3. Grant the application appropriate permissions to your Kusto cluster: - - Go to your Kusto cluster in Azure Portal - - Navigate to Permissions > Add - - Add your application as a user with appropriate role (e.g., "AllDatabasesViewer" for read access) -4. Provide the following parameters: - - client_id: Application (client) ID from your Azure AD app registration - - client_secret: Client secret value you generated - - tenant_id: Directory (tenant) ID from your Azure AD - -**Required Parameters:** -- kusto_cluster: Your Kusto cluster URI (e.g., "https://mycluster.region.kusto.windows.net") -- kusto_database: Name of the database you want to access - -**Troubleshooting:** -- If authentication fails, ensure you have the correct permissions on the Kusto cluster -- For CLI auth, make sure you're logged in with `az account show` -- For app key auth, verify your client_id, client_secret, and tenant_id are correct - """ + return """Azure Kusto Authentication Instructions + +Method 1: Azure CLI Authentication + 1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli + 2. Run `az login` in your terminal to authenticate + 3. Ensure you have access to the specified Kusto cluster and database + 4. Leave client_id, client_secret, and tenant_id parameters empty + +Method 2: Application Key Authentication + 1. Register an Azure AD application in your tenant + 2. Generate a client secret for the application + 3. Grant the application appropriate permissions to your Kusto cluster: + - Go to your Kusto cluster in Azure Portal + - Navigate to Permissions > Add + - Add your application as a user with appropriate role (e.g., "AllDatabasesViewer" for read access) + 4. 
Provide the following parameters: + - client_id: Application (client) ID from your Azure AD app registration + - client_secret: Client secret value you generated + - tenant_id: Directory (tenant) ID from your Azure AD + +Required Parameters: + - kusto_cluster: Your Kusto cluster URI (e.g., "https://mycluster.region.kusto.windows.net") + - kusto_database: Name of the database you want to access +""" def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): diff --git a/py-src/data_formulator/data_loader/mysql_data_loader.py b/py-src/data_formulator/data_loader/mysql_data_loader.py index 314c36f3..9eddc0cd 100644 --- a/py-src/data_formulator/data_loader/mysql_data_loader.py +++ b/py-src/data_formulator/data_loader/mysql_data_loader.py @@ -23,32 +23,26 @@ def auth_instructions() -> str: return """ MySQL Connection Instructions: -1. **Local MySQL Setup:** +1. Local MySQL Setup: - Ensure MySQL server is running on your machine - Default connection: host='localhost', user='root' - If you haven't set a root password, leave password field empty -2. **Remote MySQL Connection:** +2. Remote MySQL Connection: - Obtain host address, username, and password from your database administrator - Ensure the MySQL server allows remote connections - Check that your IP is whitelisted in MySQL's user permissions -3. **Common Connection Parameters:** +3. Common Connection Parameters: - user: Your MySQL username (default: 'root') - password: Your MySQL password (leave empty if no password set) - host: MySQL server address (default: 'localhost') - database: Target database name to connect to -4. **Troubleshooting:** +4. Troubleshooting: - Verify MySQL service is running: `brew services list` (macOS) or `sudo systemctl status mysql` (Linux) - Test connection: `mysql -u [username] -p -h [host] [database]` - - Common issues: Wrong credentials, server not running, firewall blocking connection - -5. **Security Notes:** - - Use dedicated database users with limited privileges for applications - - Avoid using root user for application connections - - Consider using SSL connections for remote databases - """ +""" def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): self.params = params diff --git a/py-src/data_formulator/data_loader/s3_data_loader.py b/py-src/data_formulator/data_loader/s3_data_loader.py index dcfacfe3..ec339c3f 100644 --- a/py-src/data_formulator/data_loader/s3_data_loader.py +++ b/py-src/data_formulator/data_loader/s3_data_loader.py @@ -22,65 +22,38 @@ def list_params() -> List[Dict[str, Any]]: @staticmethod def auth_instructions() -> str: return """ -To connect to Amazon S3, you'll need the following AWS credentials and configuration: - -**Required Parameters:** +**Required AWS Credentials:** - **AWS Access Key ID**: Your AWS access key identifier - **AWS Secret Access Key**: Your AWS secret access key -- **Region Name**: The AWS region where your S3 bucket is located (e.g., 'us-east-1', 'us-west-2') -- **Bucket**: The name of your S3 bucket - -**Optional Parameters:** -- **AWS Session Token**: Required only if using temporary credentials (e.g., from AWS STS or IAM roles) - -**How to Get AWS Credentials:** - -1. **AWS IAM User (Recommended for programmatic access):** - - Go to AWS Console → IAM → Users - - Create a new user or select existing user - - Go to "Security credentials" tab - - Click "Create access key" - - Choose "Application running outside AWS" - - Save both the Access Key ID and Secret Access Key securely - -2. 
**Required S3 Permissions:** - Your IAM user/role needs these permissions for the target bucket: - ```json - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "s3:GetObject", - "s3:ListBucket" - ], - "Resource": [ - "arn:aws:s3:::your-bucket-name", - "arn:aws:s3:::your-bucket-name/*" - ] - } - ] - } - ``` - -3. **Finding Your Region:** - - Go to S3 Console → Select your bucket → Properties - - Look for "AWS Region" in the bucket overview - -**Security Best Practices:** -- Never share your secret access key -- Use IAM roles when possible instead of long-term access keys -- Consider using temporary credentials with session tokens for enhanced security -- Regularly rotate your access keys -- Use the principle of least privilege for S3 permissions +- **Region Name**: AWS region (e.g., 'us-east-1', 'us-west-2') +- **Bucket**: S3 bucket name +- **AWS Session Token**: Optional, for temporary credentials only + +**Getting Credentials:** +1. AWS Console → IAM → Users → Select user → Security credentials → Create access key +2. Choose "Application running outside AWS" + +**Required S3 Permissions:** +```json +{ + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Action": ["s3:GetObject", "s3:ListBucket"], + "Resource": [ + "arn:aws:s3:::your-bucket-name", + "arn:aws:s3:::your-bucket-name/*" + ] + }] +} +``` **Supported File Formats:** - CSV files (.csv) - Parquet files (.parquet) - JSON files (.json, .jsonl) -The connector will automatically detect file types and load them appropriately using DuckDB's S3 integration. +**Security:** Never share secret keys, rotate regularly, use least privilege permissions. """ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): diff --git a/py-src/data_formulator/tables_routes.py b/py-src/data_formulator/tables_routes.py index cb04f707..5d76574d 100644 --- a/py-src/data_formulator/tables_routes.py +++ b/py-src/data_formulator/tables_routes.py @@ -728,7 +728,10 @@ def data_loader_list_data_loaders(): return jsonify({ "status": "success", "data_loaders": { - name: data_loader.list_params() + name: { + "params": data_loader.list_params(), + "auth_instructions": data_loader.auth_instructions() + } for name, data_loader in DATA_LOADERS.items() } }) diff --git a/pyproject.toml b/pyproject.toml index 2a4c3e2d..15f21d3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "data_formulator" -version = "0.2.1.2" +version = "0.2.1.3" requires-python = ">=3.9" authors = [ diff --git a/src/views/DBTableManager.tsx b/src/views/DBTableManager.tsx index cff9b2ac..01b710b7 100644 --- a/src/views/DBTableManager.tsx +++ b/src/views/DBTableManager.tsx @@ -67,12 +67,14 @@ import { DataFormulatorState } from '../app/dfSlice'; import { fetchFieldSemanticType } from '../app/dfSlice'; import { AppDispatch } from '../app/store'; import Editor from 'react-simple-code-editor'; +import Markdown from 'markdown-to-jsx'; import Prism from 'prismjs' import 'prismjs/components/prism-javascript' // Language import 'prismjs/themes/prism.css'; //Example style, you can use another import PrecisionManufacturingIcon from '@mui/icons-material/PrecisionManufacturing'; import CheckIcon from '@mui/icons-material/Check'; +import MuiMarkdown from 'mui-markdown'; export const handleDBDownload = async (sessionId: string) => { try { @@ -273,7 +275,9 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function const [tableAnalysisMap, 
setTableAnalysisMap] = useState>({}); // maps data loader type to list of param defs - const [dataLoaderParamDefs, setDataLoaderParamDefs] = useState>({}); + const [dataLoaderMetadata, setDataLoaderMetadata] = useState>({}); const [dbTables, setDbTables] = useState([]); const [selectedTabKey, setSelectedTabKey] = useState(""); @@ -325,7 +329,7 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function .then(response => response.json()) .then(data => { if (data.status === "success") { - setDataLoaderParamDefs(data.data_loaders); + setDataLoaderMetadata(data.data_loaders); } else { console.error('Failed to fetch data loader params:', data.error); } @@ -652,8 +656,22 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function value={0} // not used, just to keep MUI happy sx={{px: 0.5}} > - connect external data - {["file upload", ...Object.keys(dataLoaderParamDefs ?? {})].map((dataLoaderType, i) => ( + + connect external data + + { + fetchDataLoaders(); + }}> + + + + + {["file upload", ...Object.keys(dataLoaderMetadata ?? {})].map((dataLoaderType, i) => ( = function {uploadFileButton({isUploading ? 'uploading...' : 'upload a csv/tsv file to the local database'})} - {dataLoaderParamDefs && Object.entries(dataLoaderParamDefs).map(([dataLoaderType, paramDefs]) => ( + {dataLoaderMetadata && Object.entries(dataLoaderMetadata).map(([dataLoaderType, metadata]) => ( { setIsUploading(true); }} @@ -857,9 +876,10 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function export const DataLoaderForm: React.FC<{ dataLoaderType: string, paramDefs: {name: string, default: string, type: string, required: boolean, description: string}[], + authInstructions: string, onImport: () => void, onFinish: (status: "success" | "error", message: string) => void -}> = ({dataLoaderType, paramDefs, onImport, onFinish}) => { +}> = ({dataLoaderType, paramDefs, authInstructions, onImport, onFinish}) => { const dispatch = useDispatch(); @@ -868,6 +888,8 @@ export const DataLoaderForm: React.FC<{ const [tableMetadata, setTableMetadata] = useState>({}); let [displaySamples, setDisplaySamples] = useState>({}); + const [displayAuthInstructions, setDisplayAuthInstructions] = useState(false); + let [isConnecting, setIsConnecting] = useState(false); let [mode, setMode] = useState<"view tables" | "query">("view tables"); const toggleDisplaySamples = (tableName: string) => { @@ -1031,6 +1053,7 @@ export const DataLoaderForm: React.FC<{ sx={{textTransform: "none"}} onClick={() => { setIsConnecting(true); + setDisplayAuthInstructions(false); fetch(getUrls().DATA_LOADER_LIST_TABLES, { method: 'POST', headers: { @@ -1068,8 +1091,26 @@ export const DataLoaderForm: React.FC<{ }}> disconnect - } + + } + + + { + + + {authInstructions.trim()} + + + + } + {Object.keys(tableMetadata).length > 0 && tableMetadataBox } ); From 4124ad5f5841d55e6fe4f15d6e85334b3917e869 Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Fri, 30 May 2025 11:50:24 -0700 Subject: [PATCH 6/7] fix error message display --- src/views/ModelSelectionDialog.tsx | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/views/ModelSelectionDialog.tsx b/src/views/ModelSelectionDialog.tsx index 461386f7..4d957334 100644 --- a/src/views/ModelSelectionDialog.tsx +++ b/src/views/ModelSelectionDialog.tsx @@ -59,6 +59,12 @@ interface AppConfig { DISABLE_DISPLAY_KEYS: boolean; } +const decodeHtmlEntities = (text: string): string => { + const textarea = 
document.createElement('textarea'); + textarea.innerHTML = text; + return textarea.value; +}; + export const ModelSelectionButton: React.FC<{}> = ({ }) => { const dispatch = useDispatch(); @@ -410,12 +416,15 @@ export const ModelSelectionButton: React.FC<{}> = ({ }) => { if (status == "unknown") { message = "Click the status icon to test again before applying."; } else if (status == "error") { - message = testedModels.find(t => t.id == model.id)?.message || "Unknown error"; + const rawMessage = testedModels.find(t => t.id == model.id)?.message || "Unknown error"; + message = decodeHtmlEntities(rawMessage); } const borderStyle = ['error', 'unknown'].includes(status) ? '1px dashed text.secondary' : undefined; const noBorderStyle = ['error', 'unknown'].includes(status) ? 'none' : undefined; + console.log(message) + return ( <> = ({ }) => { - {message} + {message} + From e01e5442d94369d06305d25e2bc6c527b0e83357 Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Fri, 30 May 2025 11:55:53 -0700 Subject: [PATCH 7/7] readme update --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fd57a5d0..324caa96 100644 --- a/README.md +++ b/README.md @@ -14,13 +14,16 @@ Transform data and create rich visualizations iteratively with AI 🪄. Try Data Formulator now! -[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/microsoft/data-formulator?quickstart=1) +Any questions? Ask on the Discord channel! [![Discord](https://img.shields.io/badge/discord-chat-green?logo=discord)](https://discord.gg/mYCZMQKYZb) + + + ## News 🔥🔥🔥 - [05-13-2025] Data Formulator 0.2.3: External Data Loader @@ -29,7 +32,6 @@ Transform data and create rich visualizations iteratively with AI 🪄. Try Data - Call for action [link](https://github.com/microsoft/data-formulator/issues/156): - Users: let us know which data source you'd like to load data from. - Developers: let's build more data loaders. - - Discord channel for discussions: join us! [![Discord](https://img.shields.io/badge/discord-chat-green?logo=discord)](https://discord.gg/mYCZMQKYZb) - [04-23-2025] Data Formulator 0.2: working with *large* data 📦📦📦 - Explore large data by: @@ -68,8 +70,6 @@ Transform data and create rich visualizations iteratively with AI 🪄. Try Data - [10-01-2024] Initial release of Data Formulator, check out our [[blog]](https://www.microsoft.com/en-us/research/blog/data-formulator-exploring-how-ai-can-help-analysts-create-rich-data-visualizations/) and [[video]](https://youtu.be/3ndlwt0Wi3c)! - - ## Overview **Data Formulator** is an application from Microsoft Research that uses large language models to transform data, expediting the practice of data visualization.
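
For reference, a minimal TypeScript sketch of the contract introduced by the `data_loader_list_data_loaders` change and the matching `dataLoaderMetadata` state in `DBTableManager.tsx`: each loader entry now bundles its parameter definitions with its markdown `auth_instructions`. The interfaces below mirror the field names shown in the diffs; the endpoint URL argument and the fetch helper name are illustrative assumptions, not code from the patches.

```ts
// Sketch only (assumed shapes): field names taken from tables_routes.py and
// the DataLoaderForm props in this patch series; the helper itself is illustrative.

interface DataLoaderParamDef {
  name: string;
  default: string;
  type: string;
  required: boolean;
  description: string;
}

interface DataLoaderMetadata {
  params: DataLoaderParamDef[];
  auth_instructions: string; // markdown text, rendered in the data loader form
}

interface ListDataLoadersResponse {
  status: "success" | "error";
  data_loaders?: Record<string, DataLoaderMetadata>;
  error?: string;
}

// Illustrative helper mirroring how DBTableManager.tsx consumes the response.
async function fetchDataLoaderMetadata(
  url: string
): Promise<Record<string, DataLoaderMetadata>> {
  const response = await fetch(url);
  const data: ListDataLoadersResponse = await response.json();
  if (data.status !== "success" || !data.data_loaders) {
    throw new Error(data.error ?? "failed to list data loaders");
  }
  return data.data_loaders;
}
```

Under these assumptions, the frontend can render each loader's `auth_instructions` with `markdown-to-jsx` exactly where `DataLoaderForm` receives its `authInstructions` prop, without a separate request per loader.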