From a00d9a4d3f68be7a5abfc897ecdcaf4a6b5f682c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 13 May 2025 23:42:02 +0000 Subject: [PATCH 1/7] Bump vite from 5.4.15 to 5.4.19 Bumps [vite](https://github.com/vitejs/vite/tree/HEAD/packages/vite) from 5.4.15 to 5.4.19. - [Release notes](https://github.com/vitejs/vite/releases) - [Changelog](https://github.com/vitejs/vite/blob/v5.4.19/packages/vite/CHANGELOG.md) - [Commits](https://github.com/vitejs/vite/commits/v5.4.19/packages/vite) --- updated-dependencies: - dependency-name: vite dependency-version: 5.4.19 dependency-type: direct:development ... Signed-off-by: dependabot[bot] --- package.json | 2 +- yarn.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/package.json b/package.json index 962241e0..453c9dea 100644 --- a/package.json +++ b/package.json @@ -82,6 +82,6 @@ "globals": "^15.12.0", "sass": "^1.77.6", "typescript-eslint": "^8.16.0", - "vite": "^5.4.15" + "vite": "^5.4.19" } } diff --git a/yarn.lock b/yarn.lock index 265b56d9..88db78c2 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4479,10 +4479,10 @@ vega@^5.32.0: vega-voronoi "~4.2.4" vega-wordcloud "~4.1.6" -vite@^5.4.15: - version "5.4.15" - resolved "https://registry.yarnpkg.com/vite/-/vite-5.4.15.tgz#2941547f10ebb4bf9b0fa0da863c06711eb7e5e5" - integrity sha512-6ANcZRivqL/4WtwPGTKNaosuNJr5tWiftOC7liM7G9+rMb8+oeJeyzymDu4rTN93seySBmbjSfsS3Vzr19KNtA== +vite@^5.4.19: + version "5.4.19" + resolved "https://registry.yarnpkg.com/vite/-/vite-5.4.19.tgz#20efd060410044b3ed555049418a5e7d1998f959" + integrity sha512-qO3aKv3HoQC8QKiNSTuUM1l9o/XX3+c+VTgLHbJWHZGeTPVAg2XwazI9UWzoxjIJCGCV2zU60uqMzjeLZuULqA== dependencies: esbuild "^0.21.3" postcss "^8.4.43" From 4538055f041b1bfca5e8c0cc041250772e4b10d7 Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Thu, 15 May 2025 17:42:55 -0700 Subject: [PATCH 2/7] various fixes --- .../agents/agent_code_explanation.py | 3 +- .../agents/agent_py_concept_derive.py | 2 - .../agents/agent_py_data_transform.py | 1 - .../agents/agent_query_completion.py | 3 +- py-src/data_formulator/agents/agent_utils.py | 4 -- py-src/data_formulator/agents/client_utils.py | 1 - .../data_loader/kusto_data_loader.py | 43 ++----------------- .../data_loader/mysql_data_loader.py | 2 +- pyproject.toml | 2 +- requirements.txt | 2 +- src/views/DBTableManager.tsx | 2 +- 11 files changed, 10 insertions(+), 55 deletions(-) diff --git a/py-src/data_formulator/agents/agent_code_explanation.py b/py-src/data_formulator/agents/agent_code_explanation.py index 8d16a968..af348e3b 100644 --- a/py-src/data_formulator/agents/agent_code_explanation.py +++ b/py-src/data_formulator/agents/agent_code_explanation.py @@ -1,8 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import pandas as pd -from data_formulator.agents.agent_utils import generate_data_summary, extract_code_from_gpt_response +from data_formulator.agents.agent_utils import generate_data_summary import logging diff --git a/py-src/data_formulator/agents/agent_py_concept_derive.py b/py-src/data_formulator/agents/agent_py_concept_derive.py index 58181d85..f6e3e77f 100644 --- a/py-src/data_formulator/agents/agent_py_concept_derive.py +++ b/py-src/data_formulator/agents/agent_py_concept_derive.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-import json import time from data_formulator.agents.agent_utils import generate_data_summary, extract_code_from_gpt_response @@ -10,7 +9,6 @@ import traceback import logging -import datetime logger = logging.getLogger(__name__) diff --git a/py-src/data_formulator/agents/agent_py_data_transform.py b/py-src/data_formulator/agents/agent_py_data_transform.py index b3cc999e..8cc86daf 100644 --- a/py-src/data_formulator/agents/agent_py_data_transform.py +++ b/py-src/data_formulator/agents/agent_py_data_transform.py @@ -2,7 +2,6 @@ # Licensed under the MIT License. import json -import sys from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary, extract_code_from_gpt_response import data_formulator.py_sandbox as py_sandbox diff --git a/py-src/data_formulator/agents/agent_query_completion.py b/py-src/data_formulator/agents/agent_query_completion.py index 8beed90c..0dd6f494 100644 --- a/py-src/data_formulator/agents/agent_query_completion.py +++ b/py-src/data_formulator/agents/agent_query_completion.py @@ -1,10 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import pandas as pd import json -from data_formulator.agents.agent_utils import extract_code_from_gpt_response, extract_json_objects +from data_formulator.agents.agent_utils import extract_json_objects import re import logging diff --git a/py-src/data_formulator/agents/agent_utils.py b/py-src/data_formulator/agents/agent_utils.py index 2d518932..e18962e0 100644 --- a/py-src/data_formulator/agents/agent_utils.py +++ b/py-src/data_formulator/agents/agent_utils.py @@ -6,10 +6,6 @@ import pandas as pd import numpy as np -import base64 - -from pprint import pprint - import re def string_to_py_varname(var_str): diff --git a/py-src/data_formulator/agents/client_utils.py b/py-src/data_formulator/agents/client_utils.py index 94334aee..a0eb3558 100644 --- a/py-src/data_formulator/agents/client_utils.py +++ b/py-src/data_formulator/agents/client_utils.py @@ -1,4 +1,3 @@ -import os import litellm import openai from azure.identity import DefaultAzureCredential, get_bearer_token_provider diff --git a/py-src/data_formulator/data_loader/kusto_data_loader.py b/py-src/data_formulator/data_loader/kusto_data_loader.py index 210c3e68..9aca5dbb 100644 --- a/py-src/data_formulator/data_loader/kusto_data_loader.py +++ b/py-src/data_formulator/data_loader/kusto_data_loader.py @@ -51,41 +51,6 @@ def query(self, kql: str) -> pd.DataFrame: return dataframe_from_result_table(result.primary_results[0]) def list_tables(self) -> List[Dict[str, Any]]: - # first list functions (views) - query = ".show functions" - function_result_df = self.query(query) - - functions = [] - for func in function_result_df.to_dict(orient="records"): - func_name = func['Name'] - result = self.query(f".show function ['{func_name}'] schema as json").to_dict(orient="records") - schema = json.loads(result[0]['Schema']) - parameters = schema['InputParameters'] - columns = [{ - 'name': r["Name"], - 'type': r["Type"] - } for r in schema['OutputColumns']] - - # skip functions with parameters at the moment - if len(parameters) > 0: - continue - - sample_query = f"['{func_name}'] | take {10}" - sample_result = self.query(sample_query).to_dict(orient="records") - - function_metadata = { - "row_count": 0, - "columns": columns, - "parameters": parameters, - "sample_rows": sample_result - } - functions.append({ - "type": "function", - "name": func_name, - "metadata": function_metadata - }) - - # then list tables query = ".show tables" tables_df 
= self.query(query) @@ -101,8 +66,8 @@ def list_tables(self) -> List[Dict[str, Any]]: row_count_result = self.query(f".show table ['{table_name}'] details").to_dict(orient="records") row_count = row_count_result[0]["TotalRowCount"] - sample_query = f"['{table_name}'] | take {10}" - sample_result = self.query(sample_query).to_dict(orient="records") + sample_query = f"['{table_name}'] | take {5}" + sample_result = json.loads(self.query(sample_query).to_json(orient="records")) table_metadata = { "row_count": row_count, @@ -116,7 +81,7 @@ def list_tables(self) -> List[Dict[str, Any]]: "metadata": table_metadata }) - return functions + tables + return tables def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000) -> pd.DataFrame: if name_as is None: @@ -167,7 +132,7 @@ def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000) total_rows_ingested += len(chunk_df) def view_query_sample(self, query: str) -> str: - return self.query(query).head(10).to_dict(orient="records") + return json.loads(self.query(query).head(10).to_json(orient="records")) def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: # Sanitize the table name for SQL compatibility diff --git a/py-src/data_formulator/data_loader/mysql_data_loader.py b/py-src/data_formulator/data_loader/mysql_data_loader.py index 625204e3..37c08fc9 100644 --- a/py-src/data_formulator/data_loader/mysql_data_loader.py +++ b/py-src/data_formulator/data_loader/mysql_data_loader.py @@ -93,7 +93,7 @@ def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 1 """) def view_query_sample(self, query: str) -> str: - return self.duck_db_conn.execute(query).df().head(10).to_dict(orient="records") + return json.loads(self.duck_db_conn.execute(query).df().head(10).to_json(orient="records")) def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: # Execute the query and get results as a DataFrame diff --git a/pyproject.toml b/pyproject.toml index 96675706..2a4c3e2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,6 @@ classifiers = [ ] dependencies = [ - "autopep8", "jupyter", "pandas", "docker", @@ -31,6 +30,7 @@ dependencies = [ "flask-cors", "openai", "azure-identity", + "azure-kusto-data", "azure-keyvault-secrets", "python-dotenv", "vega_datasets", diff --git a/requirements.txt b/requirements.txt index 60068b75..e7919529 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -autopep8 jupyter pandas docker @@ -7,6 +6,7 @@ matplotlib flask openai azure-identity +azure-kusto-data azure-keyvault-secrets python-dotenv vega_datasets diff --git a/src/views/DBTableManager.tsx b/src/views/DBTableManager.tsx index a856122d..f9fac51d 100644 --- a/src/views/DBTableManager.tsx +++ b/src/views/DBTableManager.tsx @@ -294,7 +294,7 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function }, [errorMessage]) useEffect(() => { - if (dbTables.length == 0) { + if (!selectedTabKey.startsWith("dataLoader:") && dbTables.length == 0) { setSelectedTabKey(""); } else if (!selectedTabKey.startsWith("dataLoader:") && dbTables.find(t => t.name === selectedTabKey) == undefined) { setSelectedTabKey(dbTables[0].name); From 48f4ee2fcbacdf877f7927d196119cd1a716a363 Mon Sep 17 00:00:00 2001 From: slackroo Date: Wed, 28 May 2025 09:52:56 +0530 Subject: [PATCH 3/7] "Add S3 data loader support to DBTableManager and data formulator" --- .../data_formulator/data_loader/__init__.py | 6 +- .../data_loader/s3_data_loader.py | 188 
++++++++++++++++++ requirements.txt | 5 +- src/views/DBTableManager.tsx | 2 +- 4 files changed, 197 insertions(+), 4 deletions(-) create mode 100644 py-src/data_formulator/data_loader/s3_data_loader.py diff --git a/py-src/data_formulator/data_loader/__init__.py b/py-src/data_formulator/data_loader/__init__.py index 145ac806..19982df5 100644 --- a/py-src/data_formulator/data_loader/__init__.py +++ b/py-src/data_formulator/data_loader/__init__.py @@ -1,10 +1,12 @@ from data_formulator.data_loader.external_data_loader import ExternalDataLoader from data_formulator.data_loader.mysql_data_loader import MySQLDataLoader from data_formulator.data_loader.kusto_data_loader import KustoDataLoader +from data_formulator.data_loader.s3_data_loader import S3DataLoader DATA_LOADERS = { "mysql": MySQLDataLoader, - "kusto": KustoDataLoader + "kusto": KustoDataLoader, + "s3": S3DataLoader, } -__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "DATA_LOADERS"] \ No newline at end of file +__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "S3DataLoader", "DATA_LOADERS"] diff --git a/py-src/data_formulator/data_loader/s3_data_loader.py b/py-src/data_formulator/data_loader/s3_data_loader.py new file mode 100644 index 00000000..6ce9cd42 --- /dev/null +++ b/py-src/data_formulator/data_loader/s3_data_loader.py @@ -0,0 +1,188 @@ +import json +import pandas as pd +import duckdb +import os + +from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name +from typing import Dict, Any, List + +class S3DataLoader(ExternalDataLoader): + + @staticmethod + def list_params() -> List[Dict[str, Any]]: + params_list = [ + {"name": "aws_access_key_id", "type": "string", "required": True, "default": "", "description": "AWS access key ID"}, + {"name": "aws_secret_access_key", "type": "string", "required": True, "default": "", "description": "AWS secret access key"}, + {"name": "aws_session_token", "type": "string", "required": False, "default": "", "description": "AWS session token (required for temporary credentials)"}, + {"name": "region_name", "type": "string", "required": True, "default": "us-east-1", "description": "AWS region name"}, + {"name": "bucket", "type": "string", "required": True, "default": "", "description": "S3 bucket name"} + ] + return params_list + + def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + self.params = params + self.duck_db_conn = duck_db_conn + + # Extract parameters + self.aws_access_key_id = params.get("aws_access_key_id", "") + self.aws_secret_access_key = params.get("aws_secret_access_key", "") + self.aws_session_token = params.get("aws_session_token", "") + self.region_name = params.get("region_name", "us-east-1") + self.bucket = params.get("bucket", "") + + # Install and load the httpfs extension for S3 access + self.duck_db_conn.install_extension("httpfs") + self.duck_db_conn.load_extension("httpfs") + + # Set AWS credentials for DuckDB + self.duck_db_conn.execute(f"SET s3_region='{self.region_name}'") + self.duck_db_conn.execute(f"SET s3_access_key_id='{self.aws_access_key_id}'") + self.duck_db_conn.execute(f"SET s3_secret_access_key='{self.aws_secret_access_key}'") + if self.aws_session_token: # Add this block + self.duck_db_conn.execute(f"SET s3_session_token='{self.aws_session_token}'") + + def list_tables(self) -> List[Dict[str, Any]]: + # Use boto3 to list objects in the bucket + import boto3 + + s3_client = boto3.client( + 's3', + 
aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + aws_session_token=self.aws_session_token if self.aws_session_token else None, + region_name=self.region_name + ) + + # List objects in the bucket + response = s3_client.list_objects_v2(Bucket=self.bucket) + + results = [] + + if 'Contents' in response: + for obj in response['Contents']: + key = obj['Key'] + + # Skip directories and non-data files + if key.endswith('/') or not self._is_supported_file(key): + continue + + # Create S3 URL + s3_url = f"s3://{self.bucket}/{key}" + + try: + # Choose the appropriate read function based on file extension + if s3_url.lower().endswith('.parquet'): + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_parquet('{s3_url}') LIMIT 10").df() + elif s3_url.lower().endswith('.json') or s3_url.lower().endswith('.jsonl'): + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_json_auto('{s3_url}') LIMIT 10").df() + elif s3_url.lower().endswith('.csv'): # Default to CSV for other formats + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{s3_url}') LIMIT 10").df() + + # Get column information + columns = [{ + 'name': col, + 'type': str(sample_df[col].dtype) + } for col in sample_df.columns] + + # Get sample data + sample_rows = json.loads(sample_df.to_json(orient="records")) + + # Estimate row count (this is approximate for CSV files) + row_count = self._estimate_row_count(s3_url) + + table_metadata = { + "row_count": row_count, + "columns": columns, + "sample_rows": sample_rows + } + + results.append({ + "name": s3_url, + "metadata": table_metadata + }) + except Exception as e: + # Skip files that can't be read + print(f"Error reading {s3_url}: {e}") + continue + + return results + + def _is_supported_file(self, key: str) -> bool: + """Check if the file type is supported by DuckDB.""" + supported_extensions = ['.csv', '.parquet', '.json', '.jsonl'] + return any(key.lower().endswith(ext) for ext in supported_extensions) + + def _estimate_row_count(self, s3_url: str) -> int: + """Estimate the number of rows in a file.""" + try: + # For parquet files, we can get the exact count + if s3_url.lower().endswith('.parquet'): + count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM read_parquet('{s3_url}')").fetchone()[0] + return count + + # For CSV files, we'll sample the file to estimate size + sample_size = 1000 + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{s3_url}') LIMIT {sample_size}").df() + + # Get file size from S3 + import boto3 + s3_client = boto3.client( + 's3', + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + aws_session_token=self.aws_session_token if self.aws_session_token else None, + region_name=self.region_name + ) + + key = s3_url.replace(f"s3://{self.bucket}/", "") + response = s3_client.head_object(Bucket=self.bucket, Key=key) + file_size = response['ContentLength'] + + # Estimate based on sample size and file size + if len(sample_df) > 0: + # Calculate average row size in bytes + avg_row_size = file_size / len(sample_df) + estimated_rows = int(file_size / avg_row_size) + return min(estimated_rows, 1000000) # Cap at 1 million for UI performance + + return 0 + except Exception as e: + print(f"Error estimating row count for {s3_url}: {e}") + return 0 + + def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000): + if name_as is None: + name_as = table_name.split('/')[-1].split('.')[0] + + name_as = 
sanitize_table_name(name_as) + + # Determine file type and use appropriate DuckDB function + if table_name.lower().endswith('.csv'): + self.duck_db_conn.execute(f""" + CREATE OR REPLACE TABLE main.{name_as} AS + SELECT * FROM read_csv_auto('{table_name}') + LIMIT {size} + """) + elif table_name.lower().endswith('.parquet'): + self.duck_db_conn.execute(f""" + CREATE OR REPLACE TABLE main.{name_as} AS + SELECT * FROM read_parquet('{table_name}') + LIMIT {size} + """) + elif table_name.lower().endswith('.json') or table_name.lower().endswith('.jsonl'): + self.duck_db_conn.execute(f""" + CREATE OR REPLACE TABLE main.{name_as} AS + SELECT * FROM read_json_auto('{table_name}') + LIMIT {size} + """) + else: + raise ValueError(f"Unsupported file type: {table_name}") + + def view_query_sample(self, query: str) -> List[Dict[str, Any]]: + return self.duck_db_conn.execute(query).df().head(10).to_dict(orient="records") + + def ingest_data_from_query(self, query: str, name_as: str): + # Execute the query and get results as a DataFrame + df = self.duck_db_conn.execute(query).df() + # Use the base class's method to ingest the DataFrame + self.ingest_df_to_duckdb(df, name_as) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 60068b75..b936b23c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,8 +8,11 @@ flask openai azure-identity azure-keyvault-secrets +azure-kusto-data +azure-storage-blob python-dotenv vega_datasets litellm duckdb --e . #also need to install data formulator itself \ No newline at end of file +boto3 +-e . #also need to install data formulator itself diff --git a/src/views/DBTableManager.tsx b/src/views/DBTableManager.tsx index a856122d..e2902027 100644 --- a/src/views/DBTableManager.tsx +++ b/src/views/DBTableManager.tsx @@ -653,7 +653,7 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function sx={{px: 0.5}} > connect external data - {["file upload", "mysql", "kusto"].map((dataLoaderType, i) => ( + {["file upload", "mysql", "kusto","s3"].map((dataLoaderType, i) => ( Date: Thu, 29 May 2025 17:44:40 -0700 Subject: [PATCH 4/7] adding azure blob data loader --- .../data_formulator/data_loader/__init__.py | 4 +- .../data_loader/azure_blob_data_loader.py | 363 ++++++++++++++++++ .../data_loader/external_data_loader.py | 5 + .../data_loader/kusto_data_loader.py | 35 ++ .../data_loader/mysql_data_loader.py | 32 ++ .../data_loader/s3_data_loader.py | 93 +++-- src/views/DBTableManager.tsx | 4 +- src/views/VisualizationView.tsx | 2 - 8 files changed, 507 insertions(+), 31 deletions(-) create mode 100644 py-src/data_formulator/data_loader/azure_blob_data_loader.py diff --git a/py-src/data_formulator/data_loader/__init__.py b/py-src/data_formulator/data_loader/__init__.py index 19982df5..6aa797c7 100644 --- a/py-src/data_formulator/data_loader/__init__.py +++ b/py-src/data_formulator/data_loader/__init__.py @@ -2,11 +2,13 @@ from data_formulator.data_loader.mysql_data_loader import MySQLDataLoader from data_formulator.data_loader.kusto_data_loader import KustoDataLoader from data_formulator.data_loader.s3_data_loader import S3DataLoader +from data_formulator.data_loader.azure_blob_data_loader import AzureBlobDataLoader DATA_LOADERS = { "mysql": MySQLDataLoader, "kusto": KustoDataLoader, "s3": S3DataLoader, + "azure_blob": AzureBlobDataLoader, } -__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "S3DataLoader", "DATA_LOADERS"] +__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "S3DataLoader", 
"AzureBlobDataLoader", "DATA_LOADERS"] diff --git a/py-src/data_formulator/data_loader/azure_blob_data_loader.py b/py-src/data_formulator/data_loader/azure_blob_data_loader.py new file mode 100644 index 00000000..081eb6d2 --- /dev/null +++ b/py-src/data_formulator/data_loader/azure_blob_data_loader.py @@ -0,0 +1,363 @@ +import json +import pandas as pd +import duckdb +import os + +from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name +from typing import Dict, Any, List + +class AzureBlobDataLoader(ExternalDataLoader): + + @staticmethod + def list_params() -> List[Dict[str, Any]]: + params_list = [ + {"name": "account_name", "type": "string", "required": True, "default": "", "description": "Azure storage account name"}, + {"name": "container_name", "type": "string", "required": True, "default": "", "description": "Azure blob container name"}, + {"name": "connection_string", "type": "string", "required": False, "default": "", "description": "Azure storage connection string (alternative to account_name + credentials)"}, + {"name": "credential_chain", "type": "string", "required": False, "default": "cli;managed_identity;env", "description": "Ordered list of Azure credential providers (cli;managed_identity;env)"}, + {"name": "account_key", "type": "string", "required": False, "default": "", "description": "Azure storage account key"}, + {"name": "sas_token", "type": "string", "required": False, "default": "", "description": "Azure SAS token"}, + {"name": "endpoint", "type": "string", "required": False, "default": "blob.core.windows.net", "description": "Azure endpoint override"} + ] + return params_list + + @staticmethod + def auth_instructions() -> str: + return """**Authentication Options (choose one)** + +**Option 1 - Connection String (Simplest)** +- Get connection string from Azure Portal > Storage Account > Access keys +- Use `connection_string` parameter with full connection string +- `account_name` can be omitted when using connection string + +**Option 2 - Account Key** +- Get account key from Azure Portal > Storage Account > Access keys +- Use `account_name` + `account_key` parameters +- Provides full access to storage account + +**Option 3 - SAS Token (Recommended for limited access)** +- Generate SAS token from Azure Portal > Storage Account > Shared access signature +- Use `account_name` + `sas_token` parameters +- Can be time-limited and permission-scoped + +**Option 4 - Credential Chain (Most Secure)** +- Use `account_name` + `container_name` only (no explicit credentials) +- Requires Azure CLI login (`az login`) or Managed Identity +- Default chain: `cli;managed_identity;env` +- Customize with `credential_chain` parameter + +**Additional Options** +- `endpoint`: Custom endpoint (default: `blob.core.windows.net`) +- For Azure Government: `blob.core.usgovcloudapi.net` +- For Azure China: `blob.core.chinacloudapi.cn`""" + + def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + self.params = params + self.duck_db_conn = duck_db_conn + + # Extract parameters + self.account_name = params.get("account_name", "") + self.container_name = params.get("container_name", "") + self.connection_string = params.get("connection_string", "") + self.credential_chain = params.get("credential_chain", "cli;managed_identity;env") + self.account_key = params.get("account_key", "") + self.sas_token = params.get("sas_token", "") + self.endpoint = params.get("endpoint", "blob.core.windows.net") + + # Install and load the azure 
extension + self.duck_db_conn.install_extension("azure") + self.duck_db_conn.load_extension("azure") + + # Set up Azure authentication using secrets (preferred method) + self._setup_azure_authentication() + + def _setup_azure_authentication(self): + """Set up Azure authentication using DuckDB secrets.""" + if self.connection_string: + # Use connection string authentication + self.duck_db_conn.execute(f""" + CREATE OR REPLACE SECRET azure_secret ( + TYPE AZURE, + CONNECTION_STRING '{self.connection_string}' + ) + """) + elif self.account_key: + # Use account key authentication + self.duck_db_conn.execute(f""" + CREATE OR REPLACE SECRET azure_secret ( + TYPE AZURE, + ACCOUNT_NAME '{self.account_name}', + ACCOUNT_KEY '{self.account_key}' + ) + """) + elif self.sas_token: + # Use SAS token authentication + self.duck_db_conn.execute(f""" + CREATE OR REPLACE SECRET azure_secret ( + TYPE AZURE, + ACCOUNT_NAME '{self.account_name}', + SAS_TOKEN '{self.sas_token}' + ) + """) + else: + # Use credential chain authentication (default) + self.duck_db_conn.execute(f""" + CREATE OR REPLACE SECRET azure_secret ( + TYPE AZURE, + PROVIDER credential_chain, + ACCOUNT_NAME '{self.account_name}', + CHAIN '{self.credential_chain}' + ) + """) + + def list_tables(self) -> List[Dict[str, Any]]: + # Use Azure SDK to list blobs in the container + from azure.storage.blob import BlobServiceClient + + # Create blob service client based on authentication method + if self.connection_string: + blob_service_client = BlobServiceClient.from_connection_string(self.connection_string) + elif self.account_key: + blob_service_client = BlobServiceClient( + account_url=f"https://{self.account_name}.{self.endpoint}", + credential=self.account_key + ) + elif self.sas_token: + blob_service_client = BlobServiceClient( + account_url=f"https://{self.account_name}.{self.endpoint}", + credential=self.sas_token + ) + else: + # Use default credential chain + from azure.identity import DefaultAzureCredential + credential = DefaultAzureCredential() + blob_service_client = BlobServiceClient( + account_url=f"https://{self.account_name}.{self.endpoint}", + credential=credential + ) + + container_client = blob_service_client.get_container_client(self.container_name) + + # List blobs in the container + blob_list = container_client.list_blobs() + + results = [] + + for blob in blob_list: + blob_name = blob.name + + # Skip directories and non-data files + if blob_name.endswith('/') or not self._is_supported_file(blob_name): + continue + + # Create Azure blob URL + azure_url = f"az://{self.account_name}.{self.endpoint}/{self.container_name}/{blob_name}" + + try: + # Choose the appropriate read function based on file extension + if azure_url.lower().endswith('.parquet'): + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_parquet('{azure_url}') LIMIT 10").df() + elif azure_url.lower().endswith('.json') or azure_url.lower().endswith('.jsonl'): + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_json_auto('{azure_url}') LIMIT 10").df() + elif azure_url.lower().endswith('.csv'): + sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{azure_url}') LIMIT 10").df() + + # Get column information + columns = [{ + 'name': col, + 'type': str(sample_df[col].dtype) + } for col in sample_df.columns] + + # Get sample data + sample_rows = json.loads(sample_df.to_json(orient="records")) + + # Estimate row count + row_count = self._estimate_row_count(azure_url, blob) + + table_metadata = { + "row_count": row_count, + "columns": columns, 
+ "sample_rows": sample_rows + } + + results.append({ + "name": azure_url, + "metadata": table_metadata + }) + except Exception as e: + # Skip files that can't be read + print(f"Error reading {azure_url}: {e}") + continue + + return results + + def _is_supported_file(self, blob_name: str) -> bool: + """Check if the file type is supported by DuckDB.""" + supported_extensions = ['.csv', '.parquet', '.json', '.jsonl'] + return any(blob_name.lower().endswith(ext) for ext in supported_extensions) + + def _estimate_row_count(self, azure_url: str, blob_properties=None) -> int: + """Estimate the number of rows in a file using intelligent strategies.""" + try: + file_extension = azure_url.lower().split('.')[-1] + + # For parquet files, use metadata to get exact count efficiently + if file_extension == 'parquet': + try: + # Use DuckDB's parquet_file_metadata to get exact row count without full scan + metadata = self.duck_db_conn.execute( + f"SELECT num_rows FROM parquet_file_metadata('{azure_url}')" + ).fetchone() + if metadata and metadata[0] is not None: + return metadata[0] + except Exception as parquet_error: + print(f"Failed to get parquet metadata for {azure_url}: {parquet_error}") + # Fall back to counting (expensive but accurate) + try: + count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM read_parquet('{azure_url}')").fetchone()[0] + return count + except Exception: + pass + + # For CSV, JSON, and JSONL files, use intelligent sampling + elif file_extension in ['csv', 'json', 'jsonl']: + return self._estimate_rows_by_sampling(azure_url, blob_properties, file_extension) + + return 0 + + except Exception as e: + print(f"Error estimating row count for {azure_url}: {e}") + return 0 + + def _estimate_rows_by_sampling(self, azure_url: str, blob_properties, file_extension: str) -> int: + """Estimate row count for text-based files using sampling and file size.""" + try: + # Get file size from blob properties if available + file_size_bytes = None + if blob_properties and hasattr(blob_properties, 'size'): + file_size_bytes = blob_properties.size + + # If no file size available, try a different approach + if file_size_bytes is None: + # Sample first 10,000 rows and extrapolate if needed + return self._estimate_by_row_sampling(azure_url, file_extension) + + # Sample approach: read first N rows and estimate based on size + sample_size = min(10000, file_size_bytes // 100) # Adaptive sample size + sample_size = max(1000, sample_size) # At least 1000 rows + + try: + if file_extension == 'csv': + sample_df = self.duck_db_conn.execute( + f"SELECT * FROM read_csv_auto('{azure_url}') LIMIT {sample_size}" + ).df() + elif file_extension in ['json', 'jsonl']: + sample_df = self.duck_db_conn.execute( + f"SELECT * FROM read_json_auto('{azure_url}') LIMIT {sample_size}" + ).df() + else: + return 0 + + sample_rows = len(sample_df) + if sample_rows == 0: + return 0 + + # If we got fewer rows than requested, that's probably all there is + if sample_rows < sample_size: + return sample_rows + + # Estimate bytes per row from sample + # For CSV: assume average line length based on file size + if file_extension == 'csv': + # Rough estimate: file_size / (sample_rows * estimated_line_overhead) + # CSV overhead includes delimiters, quotes, newlines + estimated_bytes_per_row = file_size_bytes / sample_rows * (sample_size / file_size_bytes) + estimated_total_rows = int(file_size_bytes / max(estimated_bytes_per_row, 50)) # Min 50 bytes per row + else: + # For JSON: more complex structure, use conservative estimate + # Assume 
JSON overhead is higher + estimated_bytes_per_row = file_size_bytes / sample_rows * (sample_size / file_size_bytes) + estimated_total_rows = int(file_size_bytes / max(estimated_bytes_per_row, 100)) # Min 100 bytes per row + + # Apply reasonable bounds + estimated_total_rows = max(sample_rows, estimated_total_rows) # At least as many as we sampled + estimated_total_rows = min(estimated_total_rows, file_size_bytes // 10) # Max based on very small rows + + return estimated_total_rows + + except Exception as e: + print(f"Error in size-based estimation for {azure_url}: {e}") + return self._estimate_by_row_sampling(azure_url, file_extension) + + except Exception as e: + print(f"Error in sampling estimation for {azure_url}: {e}") + return 0 + + def _estimate_by_row_sampling(self, azure_url: str, file_extension: str) -> int: + """Fallback method: sample rows without file size info.""" + try: + # Try to read a reasonable sample and see if we get less than requested + # This indicates we've read the whole file + test_limit = 50000 + + if file_extension == 'csv': + sample_df = self.duck_db_conn.execute( + f"SELECT * FROM read_csv_auto('{azure_url}') LIMIT {test_limit}" + ).df() + elif file_extension in ['json', 'jsonl']: + sample_df = self.duck_db_conn.execute( + f"SELECT * FROM read_json_auto('{azure_url}') LIMIT {test_limit}" + ).df() + else: + return 0 + + sample_rows = len(sample_df) + + # If we got fewer rows than the limit, that's likely the total + if sample_rows < test_limit: + return sample_rows + + # Otherwise, we can't estimate accurately without more information + # Return the sample size as a lower bound + return sample_rows + + except Exception as e: + print(f"Error in row sampling for {azure_url}: {e}") + return 0 + + def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000): + if name_as is None: + name_as = table_name.split('/')[-1].split('.')[0] + + name_as = sanitize_table_name(name_as) + + # Determine file type and use appropriate DuckDB function + if table_name.lower().endswith('.csv'): + self.duck_db_conn.execute(f""" + CREATE OR REPLACE TABLE main.{name_as} AS + SELECT * FROM read_csv_auto('{table_name}') + LIMIT {size} + """) + elif table_name.lower().endswith('.parquet'): + self.duck_db_conn.execute(f""" + CREATE OR REPLACE TABLE main.{name_as} AS + SELECT * FROM read_parquet('{table_name}') + LIMIT {size} + """) + elif table_name.lower().endswith('.json') or table_name.lower().endswith('.jsonl'): + self.duck_db_conn.execute(f""" + CREATE OR REPLACE TABLE main.{name_as} AS + SELECT * FROM read_json_auto('{table_name}') + LIMIT {size} + """) + else: + raise ValueError(f"Unsupported file type: {table_name}") + + def view_query_sample(self, query: str) -> List[Dict[str, Any]]: + return self.duck_db_conn.execute(query).df().head(10).to_dict(orient="records") + + def ingest_data_from_query(self, query: str, name_as: str): + # Execute the query and get results as a DataFrame + df = self.duck_db_conn.execute(query).df() + # Use the base class's method to ingest the DataFrame + self.ingest_df_to_duckdb(df, name_as) \ No newline at end of file diff --git a/py-src/data_formulator/data_loader/external_data_loader.py b/py-src/data_formulator/data_loader/external_data_loader.py index 540f1748..5d55796f 100644 --- a/py-src/data_formulator/data_loader/external_data_loader.py +++ b/py-src/data_formulator/data_loader/external_data_loader.py @@ -67,6 +67,11 @@ def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str): def list_params() -> List[Dict[str, Any]]: 
pass + @staticmethod + @abstractmethod + def auth_instructions() -> str: + pass + @abstractmethod def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): pass diff --git a/py-src/data_formulator/data_loader/kusto_data_loader.py b/py-src/data_formulator/data_loader/kusto_data_loader.py index 9aca5dbb..4d363d73 100644 --- a/py-src/data_formulator/data_loader/kusto_data_loader.py +++ b/py-src/data_formulator/data_loader/kusto_data_loader.py @@ -23,6 +23,41 @@ def list_params() -> bool: {"name": "tenant_id", "type": "string", "required": False, "description": "only necessary for AppKey auth"} ] return params_list + + @staticmethod + def auth_instructions() -> str: + return """ +Azure Kusto Authentication Instructions: + +This data loader supports two authentication methods: + +**Method 1: Azure CLI Authentication (Recommended for development)** +1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli +2. Run `az login` in your terminal to authenticate +3. Ensure you have access to the specified Kusto cluster and database +4. Leave client_id, client_secret, and tenant_id parameters empty + +**Method 2: Application Key Authentication (Recommended for production)** +1. Register an Azure AD application in your tenant +2. Generate a client secret for the application +3. Grant the application appropriate permissions to your Kusto cluster: + - Go to your Kusto cluster in Azure Portal + - Navigate to Permissions > Add + - Add your application as a user with appropriate role (e.g., "AllDatabasesViewer" for read access) +4. Provide the following parameters: + - client_id: Application (client) ID from your Azure AD app registration + - client_secret: Client secret value you generated + - tenant_id: Directory (tenant) ID from your Azure AD + +**Required Parameters:** +- kusto_cluster: Your Kusto cluster URI (e.g., "https://mycluster.region.kusto.windows.net") +- kusto_database: Name of the database you want to access + +**Troubleshooting:** +- If authentication fails, ensure you have the correct permissions on the Kusto cluster +- For CLI auth, make sure you're logged in with `az account show` +- For app key auth, verify your client_id, client_secret, and tenant_id are correct + """ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): diff --git a/py-src/data_formulator/data_loader/mysql_data_loader.py b/py-src/data_formulator/data_loader/mysql_data_loader.py index 37c08fc9..314c36f3 100644 --- a/py-src/data_formulator/data_loader/mysql_data_loader.py +++ b/py-src/data_formulator/data_loader/mysql_data_loader.py @@ -18,6 +18,38 @@ def list_params() -> bool: ] return params_list + @staticmethod + def auth_instructions() -> str: + return """ +MySQL Connection Instructions: + +1. **Local MySQL Setup:** + - Ensure MySQL server is running on your machine + - Default connection: host='localhost', user='root' + - If you haven't set a root password, leave password field empty + +2. **Remote MySQL Connection:** + - Obtain host address, username, and password from your database administrator + - Ensure the MySQL server allows remote connections + - Check that your IP is whitelisted in MySQL's user permissions + +3. **Common Connection Parameters:** + - user: Your MySQL username (default: 'root') + - password: Your MySQL password (leave empty if no password set) + - host: MySQL server address (default: 'localhost') + - database: Target database name to connect to + +4. 
**Troubleshooting:** + - Verify MySQL service is running: `brew services list` (macOS) or `sudo systemctl status mysql` (Linux) + - Test connection: `mysql -u [username] -p -h [host] [database]` + - Common issues: Wrong credentials, server not running, firewall blocking connection + +5. **Security Notes:** + - Use dedicated database users with limited privileges for applications + - Avoid using root user for application connections + - Consider using SSL connections for remote databases + """ + def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): self.params = params self.duck_db_conn = duck_db_conn diff --git a/py-src/data_formulator/data_loader/s3_data_loader.py b/py-src/data_formulator/data_loader/s3_data_loader.py index 6ce9cd42..dcfacfe3 100644 --- a/py-src/data_formulator/data_loader/s3_data_loader.py +++ b/py-src/data_formulator/data_loader/s3_data_loader.py @@ -19,6 +19,70 @@ def list_params() -> List[Dict[str, Any]]: ] return params_list + @staticmethod + def auth_instructions() -> str: + return """ +To connect to Amazon S3, you'll need the following AWS credentials and configuration: + +**Required Parameters:** +- **AWS Access Key ID**: Your AWS access key identifier +- **AWS Secret Access Key**: Your AWS secret access key +- **Region Name**: The AWS region where your S3 bucket is located (e.g., 'us-east-1', 'us-west-2') +- **Bucket**: The name of your S3 bucket + +**Optional Parameters:** +- **AWS Session Token**: Required only if using temporary credentials (e.g., from AWS STS or IAM roles) + +**How to Get AWS Credentials:** + +1. **AWS IAM User (Recommended for programmatic access):** + - Go to AWS Console → IAM → Users + - Create a new user or select existing user + - Go to "Security credentials" tab + - Click "Create access key" + - Choose "Application running outside AWS" + - Save both the Access Key ID and Secret Access Key securely + +2. **Required S3 Permissions:** + Your IAM user/role needs these permissions for the target bucket: + ```json + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::your-bucket-name", + "arn:aws:s3:::your-bucket-name/*" + ] + } + ] + } + ``` + +3. **Finding Your Region:** + - Go to S3 Console → Select your bucket → Properties + - Look for "AWS Region" in the bucket overview + +**Security Best Practices:** +- Never share your secret access key +- Use IAM roles when possible instead of long-term access keys +- Consider using temporary credentials with session tokens for enhanced security +- Regularly rotate your access keys +- Use the principle of least privilege for S3 permissions + +**Supported File Formats:** +- CSV files (.csv) +- Parquet files (.parquet) +- JSON files (.json, .jsonl) + +The connector will automatically detect file types and load them appropriately using DuckDB's S3 integration. 
+ """ + def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): self.params = params self.duck_db_conn = duck_db_conn @@ -120,32 +184,9 @@ def _estimate_row_count(self, s3_url: str) -> int: count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM read_parquet('{s3_url}')").fetchone()[0] return count - # For CSV files, we'll sample the file to estimate size - sample_size = 1000 - sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{s3_url}') LIMIT {sample_size}").df() - - # Get file size from S3 - import boto3 - s3_client = boto3.client( - 's3', - aws_access_key_id=self.aws_access_key_id, - aws_secret_access_key=self.aws_secret_access_key, - aws_session_token=self.aws_session_token if self.aws_session_token else None, - region_name=self.region_name - ) - - key = s3_url.replace(f"s3://{self.bucket}/", "") - response = s3_client.head_object(Bucket=self.bucket, Key=key) - file_size = response['ContentLength'] - - # Estimate based on sample size and file size - if len(sample_df) > 0: - # Calculate average row size in bytes - avg_row_size = file_size / len(sample_df) - estimated_rows = int(file_size / avg_row_size) - return min(estimated_rows, 1000000) # Cap at 1 million for UI performance - - return 0 + # For CSV, JSON, and JSONL files, we'll skip row count + if s3_url.lower().endswith('.csv') or s3_url.lower().endswith('.json') or s3_url.lower().endswith('.jsonl'): + return 0 except Exception as e: print(f"Error estimating row count for {s3_url}: {e}") return 0 diff --git a/src/views/DBTableManager.tsx b/src/views/DBTableManager.tsx index 6d8e5f0d..cff9b2ac 100644 --- a/src/views/DBTableManager.tsx +++ b/src/views/DBTableManager.tsx @@ -653,7 +653,7 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function sx={{px: 0.5}} > connect external data - {["file upload", "mysql", "kusto","s3"].map((dataLoaderType, i) => ( + {["file upload", ...Object.keys(dataLoaderParamDefs ?? {})].map((dataLoaderType, i) => ( toggleDisplaySamples(tableName)}> diff --git a/src/views/VisualizationView.tsx b/src/views/VisualizationView.tsx index 7460c0bc..f3443876 100644 --- a/src/views/VisualizationView.tsx +++ b/src/views/VisualizationView.tsx @@ -363,8 +363,6 @@ export const ChartEditorFC: FC<{ cachedCandidates: DictTable[], const [errorMessage, setErrorMessage] = useState<{content: string, severity: "error" | "warning" | "info" | "success"}>({content: "", severity: "error"}); const [showError, setShowError] = useState(false); - - let createVisTableRowsLocal = (rows: any[]) => { if (visFields.length == 0) { return rows; From ebdc0f0994ed9b91e11bbbacb9edbf89345e1601 Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Fri, 30 May 2025 11:24:23 -0700 Subject: [PATCH 5/7] data loader updates --- README.md | 4 +- .../data_loader/azure_blob_data_loader.py | 50 +++++++------ .../data_loader/kusto_data_loader.py | 56 ++++++-------- .../data_loader/mysql_data_loader.py | 16 ++-- .../data_loader/s3_data_loader.py | 75 ++++++------------- py-src/data_formulator/tables_routes.py | 5 +- pyproject.toml | 2 +- src/views/DBTableManager.tsx | 57 ++++++++++++-- 8 files changed, 137 insertions(+), 128 deletions(-) diff --git a/README.md b/README.md index d244ad77..fd57a5d0 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,9 @@ Transform data and create rich visualizations iteratively with AI 🪄. 
Try Data ## News 🔥🔥🔥 -- [05-13-2025] Data Formulator 0.2.1: External Data Loader +- [05-13-2025] Data Formulator 0.2.3: External Data Loader - We introduced external data loader class to make import data easier. [Readme](https://github.com/microsoft/data-formulator/tree/main/py-src/data_formulator/data_loader) and [Demo](https://github.com/microsoft/data-formulator/pull/155) - - Example data loaders from MySQL and Azure Data Explorer (Kusto) are provided. + - Current data loaders: MySQL, Azure Data Explorer (Kusto), Azure Blob and Amazon S3 (json, parquet, csv). - Call for action [link](https://github.com/microsoft/data-formulator/issues/156): - Users: let us know which data source you'd like to load data from. - Developers: let's build more data loaders. diff --git a/py-src/data_formulator/data_loader/azure_blob_data_loader.py b/py-src/data_formulator/data_loader/azure_blob_data_loader.py index 081eb6d2..e2e2f78f 100644 --- a/py-src/data_formulator/data_loader/azure_blob_data_loader.py +++ b/py-src/data_formulator/data_loader/azure_blob_data_loader.py @@ -23,33 +23,39 @@ def list_params() -> List[Dict[str, Any]]: @staticmethod def auth_instructions() -> str: - return """**Authentication Options (choose one)** + return """Authentication Options (choose one) -**Option 1 - Connection String (Simplest)** -- Get connection string from Azure Portal > Storage Account > Access keys -- Use `connection_string` parameter with full connection string -- `account_name` can be omitted when using connection string +Option 1 - Connection String (Simplest) + - Get connection string from Azure Portal > Storage Account > Access keys + - Use `connection_string` parameter with full connection string + - `account_name` can be omitted when using connection string -**Option 2 - Account Key** -- Get account key from Azure Portal > Storage Account > Access keys -- Use `account_name` + `account_key` parameters -- Provides full access to storage account +Option 2 - Account Key + - Get account key from Azure Portal > Storage Account > Access keys + - Use `account_name` + `account_key` parameters + - Provides full access to storage account -**Option 3 - SAS Token (Recommended for limited access)** -- Generate SAS token from Azure Portal > Storage Account > Shared access signature -- Use `account_name` + `sas_token` parameters -- Can be time-limited and permission-scoped +Option 3 - SAS Token (Recommended for limited access) + - Generate SAS token from Azure Portal > Storage Account > Shared access signature + - Use `account_name` + `sas_token` parameters + - Can be time-limited and permission-scoped -**Option 4 - Credential Chain (Most Secure)** -- Use `account_name` + `container_name` only (no explicit credentials) -- Requires Azure CLI login (`az login`) or Managed Identity -- Default chain: `cli;managed_identity;env` -- Customize with `credential_chain` parameter +Option 4 - Credential Chain (Most Secure) + - Use `account_name` + `container_name` only (no explicit credentials) + - Requires Azure CLI login (`az login` in terminal) or Managed Identity + - Default chain: `cli;managed_identity;env` + - Customize with `credential_chain` parameter -**Additional Options** -- `endpoint`: Custom endpoint (default: `blob.core.windows.net`) -- For Azure Government: `blob.core.usgovcloudapi.net` -- For Azure China: `blob.core.chinacloudapi.cn`""" +Additional Options + - `endpoint`: Custom endpoint (default: `blob.core.windows.net`) + - For Azure Government: `blob.core.usgovcloudapi.net` + - For Azure China: 
`blob.core.chinacloudapi.cn` + +Supported File Formats: + - CSV files (.csv) + - Parquet files (.parquet) + - JSON files (.json, .jsonl) +""" def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): self.params = params diff --git a/py-src/data_formulator/data_loader/kusto_data_loader.py b/py-src/data_formulator/data_loader/kusto_data_loader.py index 4d363d73..b0ed936c 100644 --- a/py-src/data_formulator/data_loader/kusto_data_loader.py +++ b/py-src/data_formulator/data_loader/kusto_data_loader.py @@ -26,38 +26,30 @@ def list_params() -> bool: @staticmethod def auth_instructions() -> str: - return """ -Azure Kusto Authentication Instructions: - -This data loader supports two authentication methods: - -**Method 1: Azure CLI Authentication (Recommended for development)** -1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli -2. Run `az login` in your terminal to authenticate -3. Ensure you have access to the specified Kusto cluster and database -4. Leave client_id, client_secret, and tenant_id parameters empty - -**Method 2: Application Key Authentication (Recommended for production)** -1. Register an Azure AD application in your tenant -2. Generate a client secret for the application -3. Grant the application appropriate permissions to your Kusto cluster: - - Go to your Kusto cluster in Azure Portal - - Navigate to Permissions > Add - - Add your application as a user with appropriate role (e.g., "AllDatabasesViewer" for read access) -4. Provide the following parameters: - - client_id: Application (client) ID from your Azure AD app registration - - client_secret: Client secret value you generated - - tenant_id: Directory (tenant) ID from your Azure AD - -**Required Parameters:** -- kusto_cluster: Your Kusto cluster URI (e.g., "https://mycluster.region.kusto.windows.net") -- kusto_database: Name of the database you want to access - -**Troubleshooting:** -- If authentication fails, ensure you have the correct permissions on the Kusto cluster -- For CLI auth, make sure you're logged in with `az account show` -- For app key auth, verify your client_id, client_secret, and tenant_id are correct - """ + return """Azure Kusto Authentication Instructions + +Method 1: Azure CLI Authentication + 1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli + 2. Run `az login` in your terminal to authenticate + 3. Ensure you have access to the specified Kusto cluster and database + 4. Leave client_id, client_secret, and tenant_id parameters empty + +Method 2: Application Key Authentication + 1. Register an Azure AD application in your tenant + 2. Generate a client secret for the application + 3. Grant the application appropriate permissions to your Kusto cluster: + - Go to your Kusto cluster in Azure Portal + - Navigate to Permissions > Add + - Add your application as a user with appropriate role (e.g., "AllDatabasesViewer" for read access) + 4. 
Provide the following parameters: + - client_id: Application (client) ID from your Azure AD app registration + - client_secret: Client secret value you generated + - tenant_id: Directory (tenant) ID from your Azure AD + +Required Parameters: + - kusto_cluster: Your Kusto cluster URI (e.g., "https://mycluster.region.kusto.windows.net") + - kusto_database: Name of the database you want to access +""" def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): diff --git a/py-src/data_formulator/data_loader/mysql_data_loader.py b/py-src/data_formulator/data_loader/mysql_data_loader.py index 314c36f3..9eddc0cd 100644 --- a/py-src/data_formulator/data_loader/mysql_data_loader.py +++ b/py-src/data_formulator/data_loader/mysql_data_loader.py @@ -23,32 +23,26 @@ def auth_instructions() -> str: return """ MySQL Connection Instructions: -1. **Local MySQL Setup:** +1. Local MySQL Setup: - Ensure MySQL server is running on your machine - Default connection: host='localhost', user='root' - If you haven't set a root password, leave password field empty -2. **Remote MySQL Connection:** +2. Remote MySQL Connection: - Obtain host address, username, and password from your database administrator - Ensure the MySQL server allows remote connections - Check that your IP is whitelisted in MySQL's user permissions -3. **Common Connection Parameters:** +3. Common Connection Parameters: - user: Your MySQL username (default: 'root') - password: Your MySQL password (leave empty if no password set) - host: MySQL server address (default: 'localhost') - database: Target database name to connect to -4. **Troubleshooting:** +4. Troubleshooting: - Verify MySQL service is running: `brew services list` (macOS) or `sudo systemctl status mysql` (Linux) - Test connection: `mysql -u [username] -p -h [host] [database]` - - Common issues: Wrong credentials, server not running, firewall blocking connection - -5. **Security Notes:** - - Use dedicated database users with limited privileges for applications - - Avoid using root user for application connections - - Consider using SSL connections for remote databases - """ +""" def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): self.params = params diff --git a/py-src/data_formulator/data_loader/s3_data_loader.py b/py-src/data_formulator/data_loader/s3_data_loader.py index dcfacfe3..ec339c3f 100644 --- a/py-src/data_formulator/data_loader/s3_data_loader.py +++ b/py-src/data_formulator/data_loader/s3_data_loader.py @@ -22,65 +22,38 @@ def list_params() -> List[Dict[str, Any]]: @staticmethod def auth_instructions() -> str: return """ -To connect to Amazon S3, you'll need the following AWS credentials and configuration: - -**Required Parameters:** +**Required AWS Credentials:** - **AWS Access Key ID**: Your AWS access key identifier - **AWS Secret Access Key**: Your AWS secret access key -- **Region Name**: The AWS region where your S3 bucket is located (e.g., 'us-east-1', 'us-west-2') -- **Bucket**: The name of your S3 bucket - -**Optional Parameters:** -- **AWS Session Token**: Required only if using temporary credentials (e.g., from AWS STS or IAM roles) - -**How to Get AWS Credentials:** - -1. **AWS IAM User (Recommended for programmatic access):** - - Go to AWS Console → IAM → Users - - Create a new user or select existing user - - Go to "Security credentials" tab - - Click "Create access key" - - Choose "Application running outside AWS" - - Save both the Access Key ID and Secret Access Key securely - -2. 
**Required S3 Permissions:** - Your IAM user/role needs these permissions for the target bucket: - ```json - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "s3:GetObject", - "s3:ListBucket" - ], - "Resource": [ - "arn:aws:s3:::your-bucket-name", - "arn:aws:s3:::your-bucket-name/*" - ] - } - ] - } - ``` - -3. **Finding Your Region:** - - Go to S3 Console → Select your bucket → Properties - - Look for "AWS Region" in the bucket overview - -**Security Best Practices:** -- Never share your secret access key -- Use IAM roles when possible instead of long-term access keys -- Consider using temporary credentials with session tokens for enhanced security -- Regularly rotate your access keys -- Use the principle of least privilege for S3 permissions +- **Region Name**: AWS region (e.g., 'us-east-1', 'us-west-2') +- **Bucket**: S3 bucket name +- **AWS Session Token**: Optional, for temporary credentials only + +**Getting Credentials:** +1. AWS Console → IAM → Users → Select user → Security credentials → Create access key +2. Choose "Application running outside AWS" + +**Required S3 Permissions:** +```json +{ + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Action": ["s3:GetObject", "s3:ListBucket"], + "Resource": [ + "arn:aws:s3:::your-bucket-name", + "arn:aws:s3:::your-bucket-name/*" + ] + }] +} +``` **Supported File Formats:** - CSV files (.csv) - Parquet files (.parquet) - JSON files (.json, .jsonl) -The connector will automatically detect file types and load them appropriately using DuckDB's S3 integration. +**Security:** Never share secret keys, rotate regularly, use least privilege permissions. """ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): diff --git a/py-src/data_formulator/tables_routes.py b/py-src/data_formulator/tables_routes.py index cb04f707..5d76574d 100644 --- a/py-src/data_formulator/tables_routes.py +++ b/py-src/data_formulator/tables_routes.py @@ -728,7 +728,10 @@ def data_loader_list_data_loaders(): return jsonify({ "status": "success", "data_loaders": { - name: data_loader.list_params() + name: { + "params": data_loader.list_params(), + "auth_instructions": data_loader.auth_instructions() + } for name, data_loader in DATA_LOADERS.items() } }) diff --git a/pyproject.toml b/pyproject.toml index 2a4c3e2d..15f21d3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "data_formulator" -version = "0.2.1.2" +version = "0.2.1.3" requires-python = ">=3.9" authors = [ diff --git a/src/views/DBTableManager.tsx b/src/views/DBTableManager.tsx index cff9b2ac..01b710b7 100644 --- a/src/views/DBTableManager.tsx +++ b/src/views/DBTableManager.tsx @@ -67,12 +67,14 @@ import { DataFormulatorState } from '../app/dfSlice'; import { fetchFieldSemanticType } from '../app/dfSlice'; import { AppDispatch } from '../app/store'; import Editor from 'react-simple-code-editor'; +import Markdown from 'markdown-to-jsx'; import Prism from 'prismjs' import 'prismjs/components/prism-javascript' // Language import 'prismjs/themes/prism.css'; //Example style, you can use another import PrecisionManufacturingIcon from '@mui/icons-material/PrecisionManufacturing'; import CheckIcon from '@mui/icons-material/Check'; +import MuiMarkdown from 'mui-markdown'; export const handleDBDownload = async (sessionId: string) => { try { @@ -273,7 +275,9 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function const [tableAnalysisMap, 
setTableAnalysisMap] = useState>({}); // maps data loader type to list of param defs - const [dataLoaderParamDefs, setDataLoaderParamDefs] = useState>({}); + const [dataLoaderMetadata, setDataLoaderMetadata] = useState>({}); const [dbTables, setDbTables] = useState([]); const [selectedTabKey, setSelectedTabKey] = useState(""); @@ -325,7 +329,7 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function .then(response => response.json()) .then(data => { if (data.status === "success") { - setDataLoaderParamDefs(data.data_loaders); + setDataLoaderMetadata(data.data_loaders); } else { console.error('Failed to fetch data loader params:', data.error); } @@ -652,8 +656,22 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function value={0} // not used, just to keep MUI happy sx={{px: 0.5}} > - connect external data - {["file upload", ...Object.keys(dataLoaderParamDefs ?? {})].map((dataLoaderType, i) => ( + + connect external data + + { + fetchDataLoaders(); + }}> + + + + + {["file upload", ...Object.keys(dataLoaderMetadata ?? {})].map((dataLoaderType, i) => ( = function {uploadFileButton({isUploading ? 'uploading...' : 'upload a csv/tsv file to the local database'})} - {dataLoaderParamDefs && Object.entries(dataLoaderParamDefs).map(([dataLoaderType, paramDefs]) => ( + {dataLoaderMetadata && Object.entries(dataLoaderMetadata).map(([dataLoaderType, metadata]) => ( { setIsUploading(true); }} @@ -857,9 +876,10 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function export const DataLoaderForm: React.FC<{ dataLoaderType: string, paramDefs: {name: string, default: string, type: string, required: boolean, description: string}[], + authInstructions: string, onImport: () => void, onFinish: (status: "success" | "error", message: string) => void -}> = ({dataLoaderType, paramDefs, onImport, onFinish}) => { +}> = ({dataLoaderType, paramDefs, authInstructions, onImport, onFinish}) => { const dispatch = useDispatch(); @@ -868,6 +888,8 @@ export const DataLoaderForm: React.FC<{ const [tableMetadata, setTableMetadata] = useState>({}); let [displaySamples, setDisplaySamples] = useState>({}); + const [displayAuthInstructions, setDisplayAuthInstructions] = useState(false); + let [isConnecting, setIsConnecting] = useState(false); let [mode, setMode] = useState<"view tables" | "query">("view tables"); const toggleDisplaySamples = (tableName: string) => { @@ -1031,6 +1053,7 @@ export const DataLoaderForm: React.FC<{ sx={{textTransform: "none"}} onClick={() => { setIsConnecting(true); + setDisplayAuthInstructions(false); fetch(getUrls().DATA_LOADER_LIST_TABLES, { method: 'POST', headers: { @@ -1068,8 +1091,26 @@ export const DataLoaderForm: React.FC<{ }}> disconnect - } + + } + + + { + + + {authInstructions.trim()} + + + + } + {Object.keys(tableMetadata).length > 0 && tableMetadataBox } ); From 4124ad5f5841d55e6fe4f15d6e85334b3917e869 Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Fri, 30 May 2025 11:50:24 -0700 Subject: [PATCH 6/7] fix error message display --- src/views/ModelSelectionDialog.tsx | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/views/ModelSelectionDialog.tsx b/src/views/ModelSelectionDialog.tsx index 461386f7..4d957334 100644 --- a/src/views/ModelSelectionDialog.tsx +++ b/src/views/ModelSelectionDialog.tsx @@ -59,6 +59,12 @@ interface AppConfig { DISABLE_DISPLAY_KEYS: boolean; } +const decodeHtmlEntities = (text: string): string => { + const textarea = 
document.createElement('textarea'); + textarea.innerHTML = text; + return textarea.value; +}; + export const ModelSelectionButton: React.FC<{}> = ({ }) => { const dispatch = useDispatch(); @@ -410,12 +416,15 @@ export const ModelSelectionButton: React.FC<{}> = ({ }) => { if (status == "unknown") { message = "Click the status icon to test again before applying."; } else if (status == "error") { - message = testedModels.find(t => t.id == model.id)?.message || "Unknown error"; + const rawMessage = testedModels.find(t => t.id == model.id)?.message || "Unknown error"; + message = decodeHtmlEntities(rawMessage); } const borderStyle = ['error', 'unknown'].includes(status) ? '1px dashed text.secondary' : undefined; const noBorderStyle = ['error', 'unknown'].includes(status) ? 'none' : undefined; + console.log(message) + return ( <> = ({ }) => { - {message} + {message} + From e01e5442d94369d06305d25e2bc6c527b0e83357 Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Fri, 30 May 2025 11:55:53 -0700 Subject: [PATCH 7/7] readme update --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fd57a5d0..324caa96 100644 --- a/README.md +++ b/README.md @@ -14,13 +14,16 @@ Transform data and create rich visualizations iteratively with AI 🪄. Try Data Formulator now! -[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/microsoft/data-formulator?quickstart=1) +Any questions? Ask on the Discord channel! [![Discord](https://img.shields.io/badge/discord-chat-green?logo=discord)](https://discord.gg/mYCZMQKYZb) + + + ## News 🔥🔥🔥 - [05-13-2025] Data Formulator 0.2.3: External Data Loader @@ -29,7 +32,6 @@ Transform data and create rich visualizations iteratively with AI 🪄. Try Data - Call for action [link](https://github.com/microsoft/data-formulator/issues/156): - Users: let us know which data source you'd like to load data from. - Developers: let's build more data loaders. - - Discord channel for discussions: join us! [![Discord](https://img.shields.io/badge/discord-chat-green?logo=discord)](https://discord.gg/mYCZMQKYZb) - [04-23-2025] Data Formulator 0.2: working with *large* data 📦📦📦 - Explore large data by: @@ -68,8 +70,6 @@ Transform data and create rich visualizations iteratively with AI 🪄. Try Data - [10-01-2024] Initial release of Data Formulator, check out our [[blog]](https://www.microsoft.com/en-us/research/blog/data-formulator-exploring-how-ai-can-help-analysts-create-rich-data-visualizations/) and [[video]](https://youtu.be/3ndlwt0Wi3c)! - - ## Overview **Data Formulator** is an application from Microsoft Research that uses large language models to transform data, expediting the practice of data visualization.
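
For reference, a minimal TypeScript sketch of the contract introduced by the `data_loader_list_data_loaders` change and the matching `dataLoaderMetadata` state in `DBTableManager.tsx`: each loader entry now bundles its parameter definitions with its markdown `auth_instructions`. The interfaces below mirror the field names shown in the diffs; the endpoint URL argument and the fetch helper name are illustrative assumptions, not code from the patches.

```ts
// Sketch only (assumed shapes): field names taken from tables_routes.py and
// the DataLoaderForm props in this patch series; the helper itself is illustrative.

interface DataLoaderParamDef {
  name: string;
  default: string;
  type: string;
  required: boolean;
  description: string;
}

interface DataLoaderMetadata {
  params: DataLoaderParamDef[];
  auth_instructions: string; // markdown text, rendered in the data loader form
}

interface ListDataLoadersResponse {
  status: "success" | "error";
  data_loaders?: Record<string, DataLoaderMetadata>;
  error?: string;
}

// Illustrative helper mirroring how DBTableManager.tsx consumes the response.
async function fetchDataLoaderMetadata(
  url: string
): Promise<Record<string, DataLoaderMetadata>> {
  const response = await fetch(url);
  const data: ListDataLoadersResponse = await response.json();
  if (data.status !== "success" || !data.data_loaders) {
    throw new Error(data.error ?? "failed to list data loaders");
  }
  return data.data_loaders;
}
```

Under these assumptions, the frontend can render each loader's `auth_instructions` with `markdown-to-jsx` exactly where `DataLoaderForm` receives its `authInstructions` prop, without a separate request per loader.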