Merged

Dev #208

50 commits
cc3ebfa
Bump vega from 5.33.0 to 6.2.0
dependabot[bot] Nov 17, 2025
46f3852
Merge pull request #199 from microsoft/dependabot/npm_and_yarn/vega-6…
Chenglong-MS Nov 18, 2025
bcd47a7
Merge branch 'main' into dev
Chenglong-MS Nov 19, 2025
dd900ad
Add BigQuery DataLoader
hurairahmateen Nov 20, 2025
66690b9
Merge pull request #1 from theinfinitereality/bigquery-data-loader
hurairahmateen Nov 24, 2025
2562c5a
Bump validator from 13.15.20 to 13.15.22
dependabot[bot] Dec 2, 2025
8ff7b6e
simplify ui
Chenglong-MS Dec 3, 2025
6f78c1f
some cleanup
Chenglong-MS Dec 4, 2025
13b6877
fix some db connection issue
Chenglong-MS Dec 4, 2025
900ea2d
fix and test
Chenglong-MS Dec 4, 2025
f21698a
version bump
Chenglong-MS Dec 4, 2025
8b0a523
fix accessibility issues
Chenglong-MS Dec 4, 2025
26824a4
small fixes
Chenglong-MS Dec 5, 2025
bf38f06
Merge branch 'dev' into dependabot/npm_and_yarn/validator-13.15.22
Chenglong-MS Dec 5, 2025
8fb69de
Merge pull request #207 from microsoft/dependabot/npm_and_yarn/valida…
Chenglong-MS Dec 5, 2025
0b0931a
Update py-src/data_formulator/data_loader/mysql_data_loader.py
Chenglong-MS Dec 5, 2025
c345c70
Update src/views/VisualizationView.tsx
Chenglong-MS Dec 5, 2025
f280be6
Update py-src/data_formulator/data_loader/mysql_data_loader.py
Chenglong-MS Dec 5, 2025
72b65e7
Update py-src/data_formulator/data_loader/mysql_data_loader.py
Chenglong-MS Dec 5, 2025
350c138
Initial plan
Copilot Dec 5, 2025
f4b2d7e
Initial plan
Copilot Dec 5, 2025
95dc5da
Fix password sanitization in PostgreSQL and add __del__ to MySQL data…
Copilot Dec 5, 2025
0436063
Add __del__ method to MySQLDataLoader for connection cleanup
Copilot Dec 5, 2025
3017e36
Improve error messages and add exception handling to __del__ method
Copilot Dec 5, 2025
d66f0dc
Merge pull request #210 from microsoft/copilot/sub-pr-208-again
Chenglong-MS Dec 5, 2025
0446017
Merge branch 'dev' into copilot/sub-pr-208
Chenglong-MS Dec 5, 2025
3624be1
Merge pull request #209 from microsoft/copilot/sub-pr-208
Chenglong-MS Dec 5, 2025
5912ff9
fix issues that copilot introduced, bruh
Chenglong-MS Dec 5, 2025
64d69e6
Expand chart type supports: Pie Chart
KaranPradhan266 Dec 5, 2025
bf477f4
Expand chart type supports: US Map
KaranPradhan266 Dec 5, 2025
5c3e801
simplify
Chenglong-MS Dec 6, 2025
042bf2c
fixes
Chenglong-MS Dec 6, 2025
4b04973
Update py-src/data_formulator/data_loader/bigquery_data_loader.py
hurairahmateen Dec 8, 2025
532fecb
changes
hurairahmateen Dec 8, 2025
6a53da8
Add MongoDB Support
BAIGUANGMEI Dec 8, 2025
58d86f9
Merge pull request #211 from KaranPradhan266/feat/expand-chart-support
Chenglong-MS Dec 8, 2025
5f7e227
add icon for chart templates
Chenglong-MS Dec 8, 2025
4fa8ff8
Merge pull request #206 from hurairahmateen/bigquery-data-loader
Chenglong-MS Dec 8, 2025
eee8537
Merge branch 'dev' into feature/dbsupport
Chenglong-MS Dec 8, 2025
b863916
Merge pull request #213 from BAIGUANGMEI/feature/dbsupport
Chenglong-MS Dec 8, 2025
f80a2bd
some fixes with data loaders
Chenglong-MS Dec 8, 2025
6a5f0ed
resolve dependency
Chenglong-MS Dec 8, 2025
9dab794
fix and tested mongodb data loader
Chenglong-MS Dec 8, 2025
55412f6
fixes
Chenglong-MS Dec 8, 2025
95e23e7
update readme and version
Chenglong-MS Dec 8, 2025
68ac9e6
readme style
Chenglong-MS Dec 9, 2025
33d8782
readme style
Chenglong-MS Dec 9, 2025
99ee72b
readme style
Chenglong-MS Dec 9, 2025
57676e2
finally
Chenglong-MS Dec 9, 2025
5939cc3
use image for cleaner readme
Chenglong-MS Dec 9, 2025
51 changes: 30 additions & 21 deletions README.md
@@ -1,33 +1,42 @@
<h1>
<img src="./public/favicon.ico" alt="Data Formulator icon" width="28"> <b>Data Formulator: Vibe with data, in control</b>
<h1 align="center">
<img src="./public/favicon.ico" alt="Data Formulator icon" width="28">&nbsp;
Data Formulator: AI-powered Data Visualization
</h1>

<div>

[![arxiv](https://img.shields.io/badge/Paper-arXiv:2408.16119-b31b1b.svg)](https://arxiv.org/abs/2408.16119)&ensp;
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)&ensp;
[![YouTube](https://img.shields.io/badge/YouTube-white?logo=youtube&logoColor=%23FF0000)](https://www.youtube.com/watch?v=GfTE2FLyMrs)&ensp;
[![build](https://github.com/microsoft/data-formulator/actions/workflows/python-build.yml/badge.svg)](https://github.com/microsoft/data-formulator/actions/workflows/python-build.yml)
[![Discord](https://img.shields.io/badge/discord-chat-green?logo=discord)](https://discord.gg/mYCZMQKYZb)

</div>
<p align="center">
🪄 Explore data with visualizations, powered by AI agents.
</p>

🪄 Turn data into insights with AI Agents, with the exploration paths you choose. Try Data Formulator now!
<p align="center">
<a href="https://data-formulator.ai"><img src="https://img.shields.io/badge/🚀_Try_Online_Demo-data--formulator.ai-F59E0B?style=for-the-badge" alt="Try Online Demo"></a>
&nbsp;
<a href="#get-started"><img src="https://img.shields.io/badge/💻_Install_Locally-pip_install-3776AB?style=for-the-badge" alt="Install Locally"></a>
</p>

- 🤖 New in v0.5: agent model + interactive control [(video)](https://www.youtube.com/watch?v=GfTE2FLyMrs)
- 🔥🔥🔥 Try our online demo at [https://data-formulator.ai](https://data-formulator.ai)
- Any questions, thoughts? Discuss in the Discord channel! [![Discord](https://img.shields.io/badge/discord-chat-green?logo=discord)](https://discord.gg/mYCZMQKYZb)
<p align="center">
<a href="https://arxiv.org/abs/2408.16119"><img src="https://img.shields.io/badge/Paper-arXiv:2408.16119-b31b1b.svg" alt="arXiv"></a>&ensp;
<a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License: MIT"></a>&ensp;
<a href="https://www.youtube.com/watch?v=GfTE2FLyMrs"><img src="https://img.shields.io/badge/YouTube-white?logo=youtube&logoColor=%23FF0000" alt="YouTube"></a>&ensp;
<a href="https://github.com/microsoft/data-formulator/actions/workflows/python-build.yml"><img src="https://github.com/microsoft/data-formulator/actions/workflows/python-build.yml/badge.svg" alt="build"></a>&ensp;
<a href="https://discord.gg/mYCZMQKYZb"><img src="https://img.shields.io/badge/discord-chat-green?logo=discord" alt="Discord"></a>
</p>

<!-- [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/microsoft/data-formulator?quickstart=1) -->
<!--
https://github.com/user-attachments/assets/8ca57b68-4d7a-42cb-bcce-43f8b1681ce2 -->

https://github.com/user-attachments/assets/8ca57b68-4d7a-42cb-bcce-43f8b1681ce2

<!-- <kbd>
<a target="_blank" rel="noopener noreferrer" href="https://codespaces.new/microsoft/data-formulator?quickstart=1" title="open Data Formulator in GitHub Codespaces"><img src="public/data-formulator-screenshot-v0.5.png"></a>
</kbd> -->
<kbd>
<img src="public/data-formulator-screenshot-v0.5.png">
</kbd>


## News 🔥🔥🔥
[12-08-2025] **Data Formulator 0.5.1** — Connect more, visualize more, move faster
- 🔌 **Community data loaders**: Google BigQuery, MySQL, Postgres, MongoDB
- 📊 **New chart types**: US Map & Pie Chart (more to be added soon)
- ✏️ **Editable reports**: Refine generated reports with [Chartifact](https://github.com/microsoft/chartifact) in markdown style. [demo](https://github.com/microsoft/data-formulator/pull/200#issue-3635408217)
- ⚡ **Snappier UI**: Noticeably faster interactions across the board

[11-07-2025] Data Formulator 0.5: Vibe with your data, in control

@@ -109,9 +118,9 @@ Here are milestones that lead to the current design:

## Overview

**Data Formulator** is an application from Microsoft Research that uses AI agents to make it easier to turn data into insights.
**Data Formulator** is a Microsoft Research prototype for data exploration with visualizations powered by AI agents.

Data Formulator is an AI-powered tool for analysts to iteratively explore and visualize data. Starting with data in any format (screenshot, text, CSV, or database), users work with AI agents through a novel blended interface that combines *user interface interactions (UI)* and *natural language (NL) inputs* to communicate their intents, control branching exploration directions, and create reports to share their insights.
Data Formulator enables analysts to iteratively explore and visualize data. Starting with data in any format (screenshot, text, CSV, or database), users work with AI agents through a novel blended interface that combines *user interface interactions (UI)* and *natural language (NL) inputs* to communicate their intents, control branching exploration directions, and create reports to share their insights.

## Get Started

2 changes: 1 addition & 1 deletion package.json
@@ -44,7 +44,7 @@
"redux-persist": "^6.0.0",
"typescript": "^4.9.5",
"validator": "^13.15.20",
"vega": "^5.32.0",
"vega": "^6.2.0",

Copilot AI Dec 4, 2025

Major version upgrade: Vega has been upgraded from ^5.32.0 to ^6.2.0, which is a major version change. Major version upgrades typically include breaking changes. The corresponding changes in yarn.lock show many Vega-related packages have been upgraded to version 6.x or later.

Ensure thorough testing has been done to verify:

  1. All existing visualizations still render correctly
  2. No breaking API changes affect the codebase
  3. Performance implications are acceptable
  4. The change from SVG to canvas rendering (noted elsewhere) is intentional and related to this upgrade

"vega-embed": "^6.21.0",
"vega-lite": "^5.5.0",
"vm-browserify": "^1.1.2"
34 changes: 11 additions & 23 deletions py-src/data_formulator/agents/agent_interactive_explore.py
@@ -67,54 +67,42 @@
* when the exploration context is provided, make your suggestion based on the context as well as the original dataset; otherwise leverage the original dataset to suggest questions.

Guidelines for question suggestions:
1. Suggest a list of question_groups of interesting, non-obvious analytical questions that can uncover nontrivial insights, including both breadth and depth questions.

1. Suggest a list of question_groups of interesting, non-obvious analytical questions that can uncover nontrivial insights.
2. Use a diverse language style to display the questions (can be questions, statements etc)
3. If there are multiple datasets in a thread, consider relationships between them
4. CONCISENESS: the questions should be concise and to the point
5. QUESTION GROUP GENERATION:
- different questions groups should cover different aspects of the data analysis for user to choose from.
- each question_group should include both 'breadth_questions' and 'depth_questions':
- breadth_questions: a group of questions that are all relatively simple that helps the user understand the data in a broad sense.
- depth_questions: a sequence of questions that build on top of each other to answer a specific aspect of the user's goal.
- you have a budget of generating 4 questions in total (or as directed by the user).
- allocate 2-3 questions to 'breadth_questions' and 2-3 questions to 'depth_questions' based on the user's goal and the data.
- each question group should slightly lean towards 'breadth' or 'depth' exploration, but not too much.
- the more focused area can have more questions than the other area.
- each question_group is a sequence of 'questions' that builds on top of each other to answer the user's goal.
- each question group should have a difficulty level (easy / medium / hard),
- simple questions should be short -- single sentence exploratory questions
- medium questions can be 1-2 sentences exploratory questions
- hard questions should introduce some new analysis concept but still make it concise
- if suitable, include a group of questions that are related to statistical analysis: forecasting, regression, or clustering.
6. QUESTIONS WITHIN A QUESTION GROUP:
- all questions should be a new question based on the thread of exploration the user provided, do not repeat questions that have already been explored in the thread
- raise new questions that are related to the user's goal, do not repeat questions that have already been explored in the context provided to you.
- if the user provides a start question, suggested questions should be related to the start question.
- when suggesting 'breadth_questions' in a question_group, they should be a group of questions:
- they are related to the user's goal, they should each explore a different aspect of the user's goal in parallel.
- questions should consider different fields, metrics and statistical methods.
- each question within the group should be distinct from the others so that they lead to different insights and visualizations
- when suggesting 'depth_questions' in a question_group, they should be a sequence of questions:
- start of the question should provide an overview of the data in the direction going to be explored, and it will be refined in the subsequent questions.
- they progressively dive deeper into the data, building on top of the previous question.
- each question should be related to the previous question, introducing refined analysis (e.g., updated computation, filtering, different grouping, etc.)
- the questions should progressively dive deeper into the data, building on top of the previous question.
- start of the question should provide an overview of the data in the direction going to be explored.
- followup questions should refine the previous question, introducing refined analysis to deep dive into the data (e.g., updated computation, filtering, different grouping, etc.)
- don't jump too far from the previous question so that readers can understand the flow of the questions.
- every question should be answerable with a visualization.
7. FORMATTING:
- include "breadth_questions" and "depth_questions" in the question group:
- each question group should have 2-3 questions (or as directed by the user).
- include "questions" in the question group:
- each question group should have 2-4 questions (or as directed by the user).
- For each question group, include a 'goal' that summarizes the goal of the question group.
- The goal should all be a short single sentence (<12 words).
- Meaning of the 'goal' should be clear enough that the user won't misunderstand the actual question described in 'text'.
- It should capture the key computation and exploration direction of the question (do not omit any information that may lead to ambiguity), but also keep it concise.
- include the **bold** keywords for the attributes / metrics that are important to the question, especially when the goal mentions fields / metrics in the original dataset (don't have to be exact match)
- include 'difficulty' to indicate the difficulty of the question, it should be one of 'easy', 'medium', 'hard'
- a 'focus' field to indicate whether the overall question group leans more on 'breadth' or 'depth' exploration.

Output should be a list of json objects in the following format, each line should be a json object representing a question group, starting with 'data: ':

Format:

data: {"breadth_questions": [...], "depth_questions": [...], "goal": ..., "difficulty": ..., "focus": "..."}
data: {"breadth_questions": [...], "depth_questions": [...], "goal": ..., "difficulty": ..., "focus": "..."}
data: {"questions": [...], "goal": ..., "difficulty": ...}
data: {"questions": [...], "goal": ..., "difficulty": ...}
... // more question groups
'''
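The streaming output format above emits one JSON object per `data: ` line. A small parser can consume it; this is a hedged sketch, not the project's actual client code, and the function name `parse_question_groups` is invented for illustration:

```python
import json

def parse_question_groups(stream_text: str) -> list[dict]:
    """Parse lines of the form 'data: {...}' into dicts.

    Lines without the 'data: ' prefix, or whose payload is not valid
    JSON (e.g. a '... // more question groups' placeholder), are skipped.
    """
    groups = []
    for line in stream_text.splitlines():
        line = line.strip()
        if not line.startswith("data: "):
            continue
        try:
            groups.append(json.loads(line[len("data: "):]))
        except json.JSONDecodeError:
            continue
    return groups

sample = (
    'data: {"questions": ["How do sales vary by region?"], '
    '"goal": "Compare **sales** across regions", "difficulty": "easy"}\n'
    "... // more question groups\n"
)
print(parse_question_groups(sample))
```

Skipping malformed lines keeps the consumer robust if the model emits trailing commentary alongside the `data: ` records.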

4 changes: 4 additions & 0 deletions py-src/data_formulator/agents/agent_query_completion.py
@@ -54,6 +54,10 @@ def __init__(self, client):

def run(self, data_source_metadata, query):

# For MongoDB, treat it as a SQL-like data source for query generation
if data_source_metadata['data_loader_type'] == "mongodb":
data_source_metadata['data_loader_type'] = "SQL"

user_query = f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}\n\n[USER INPUTS]\n\n{query}\n\n"

logger.info(user_query)
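Note that the hunk above rewrites `data_loader_type` in place on the caller's dict. A non-mutating variant of the same aliasing looks like this (a sketch only; `normalize_loader_type` is an invented helper, not part of the codebase):

```python
def normalize_loader_type(metadata: dict) -> dict:
    """Return a shallow copy with MongoDB aliased to SQL for query
    generation, leaving the caller's metadata untouched."""
    normalized = dict(metadata)
    if normalized.get("data_loader_type") == "mongodb":
        normalized["data_loader_type"] = "SQL"
    return normalized

source = {"data_loader_type": "mongodb", "tables": ["orders"]}
print(normalize_loader_type(source)["data_loader_type"])  # -> SQL
```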
9 changes: 6 additions & 3 deletions py-src/data_formulator/data_loader/__init__.py
@@ -5,15 +5,18 @@
from data_formulator.data_loader.s3_data_loader import S3DataLoader
from data_formulator.data_loader.azure_blob_data_loader import AzureBlobDataLoader
from data_formulator.data_loader.postgresql_data_loader import PostgreSQLDataLoader
from data_formulator.data_loader.mongodb_data_loader import MongoDBDataLoader
from data_formulator.data_loader.bigquery_data_loader import BigQueryDataLoader

DATA_LOADERS = {
"mysql": MySQLDataLoader,
"mssql": MSSQLDataLoader,
"kusto": KustoDataLoader,
"s3": S3DataLoader,
"azure_blob": AzureBlobDataLoader,
"postgresql": PostgreSQLDataLoader
"postgresql": PostgreSQLDataLoader,
"mongodb": MongoDBDataLoader,
"bigquery": BigQueryDataLoader
}

__all__ = ["ExternalDataLoader", "MySQLDataLoader", "MSSQLDataLoader", "KustoDataLoader", "S3DataLoader", "AzureBlobDataLoader","PostgreSQLDataLoader","DATA_LOADERS"]

__all__ = ["ExternalDataLoader", "MySQLDataLoader", "MSSQLDataLoader", "KustoDataLoader", "S3DataLoader", "AzureBlobDataLoader","PostgreSQLDataLoader", "MongoDBDataLoader", "BigQueryDataLoader", "DATA_LOADERS"]
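The `DATA_LOADERS` registry maps a type string to a loader class, so adding MongoDB and BigQuery support is a matter of one import and one dict entry; dispatch is a lookup. A standalone sketch of the pattern with stand-in classes (the real loaders take connection params and a DuckDB connection):

```python
class MongoDBDataLoader:  # stand-in for the real loader class
    pass

class BigQueryDataLoader:  # stand-in for the real loader class
    pass

DATA_LOADERS = {
    "mongodb": MongoDBDataLoader,
    "bigquery": BigQueryDataLoader,
}

def get_loader_class(data_loader_type: str):
    """Look up a loader class by type string, with an actionable error
    for unregistered types."""
    try:
        return DATA_LOADERS[data_loader_type]
    except KeyError:
        raise ValueError(
            f"Unknown data loader type {data_loader_type!r}; "
            f"expected one of {sorted(DATA_LOADERS)}"
        ) from None

print(get_loader_class("mongodb").__name__)  # -> MongoDBDataLoader
```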
15 changes: 14 additions & 1 deletion py-src/data_formulator/data_loader/azure_blob_data_loader.py
@@ -7,6 +7,13 @@
from typing import Dict, Any, List
from data_formulator.security import validate_sql_query

try:
from azure.storage.blob import BlobServiceClient, ContainerClient
from azure.identity import DefaultAzureCredential, AzureCliCredential, ManagedIdentityCredential, EnvironmentCredential, ChainedTokenCredential
AZURE_BLOB_AVAILABLE = True
except ImportError:
AZURE_BLOB_AVAILABLE = False

class AzureBlobDataLoader(ExternalDataLoader):

@staticmethod
@@ -59,6 +66,12 @@ def auth_instructions() -> str:
"""

def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
if not AZURE_BLOB_AVAILABLE:
raise ImportError(
"Azure storage libraries are required for Azure Blob connections. "
"Install with: pip install azure-storage-blob azure-identity"
)

self.params = params
self.duck_db_conn = duck_db_conn
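The try/except guard plus the `__init__` check above is the standard optional-dependency pattern: the import failure is recorded at module load, and a clear install hint is raised only when the feature is actually used. A self-contained sketch, using a deliberately missing module name in place of the Azure packages so it runs without them installed:

```python
try:
    import _azure_stand_in_module  # stand-in for azure.storage.blob / azure.identity
    AZURE_BLOB_AVAILABLE = True
except ImportError:
    AZURE_BLOB_AVAILABLE = False

class AzureBlobLoaderSketch:
    """Fails on use, not on import, when the dependency is absent."""

    def __init__(self):
        if not AZURE_BLOB_AVAILABLE:
            raise ImportError(
                "Azure storage libraries are required for Azure Blob connections. "
                "Install with: pip install azure-storage-blob azure-identity"
            )
```

Deferring the error this way lets the other data loaders in the registry keep working even when the Azure extras are not installed.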

@@ -368,7 +381,7 @@ def view_query_sample(self, query: str) -> List[Dict[str, Any]]:
if not result:
raise ValueError(error_message)

return self.duck_db_conn.execute(query).df().head(10).to_dict(orient="records")
return json.loads(self.duck_db_conn.execute(query).df().head(10).to_json(orient="records"))

def ingest_data_from_query(self, query: str, name_as: str):
# Execute the query and get results as a DataFrame
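The change in `view_query_sample` swaps `to_dict(orient="records")` for a `to_json` → `json.loads` round trip. The motivation: DataFrame cells can hold values with no JSON representation (NaN, timestamps, numpy scalars), and serializing through JSON text normalizes them (NaN → null, datetimes → strings). A stdlib-only sketch of the same normalization idea, with invented helper names:

```python
import json
import math
from datetime import datetime, timezone

def jsonify_records(records: list[dict]) -> list[dict]:
    """Round-trip records through JSON text so every value is JSON-safe:
    NaN becomes None and datetimes become ISO-8601 strings."""
    def default(value):
        # json.dumps calls this for values it cannot encode natively.
        if isinstance(value, datetime):
            return value.isoformat()
        raise TypeError(f"not JSON-serializable: {value!r}")

    def replace_nan(obj):
        # json.dumps would emit invalid JSON ('NaN') for float NaN,
        # so map it to None before encoding.
        if isinstance(obj, float) and math.isnan(obj):
            return None
        if isinstance(obj, dict):
            return {k: replace_nan(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [replace_nan(v) for v in obj]
        return obj

    return json.loads(json.dumps(replace_nan(records), default=default))

rows = [{"price": float("nan"),
         "ts": datetime(2025, 12, 8, tzinfo=timezone.utc)}]
print(jsonify_records(rows))
# -> [{'price': None, 'ts': '2025-12-08T00:00:00+00:00'}]
```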