Skip to content

Commit 22f6907

Browse files
Add markdown output format for execute_sql to reduce token usage by ~50% (#297)
SQL results are consumed by LLMs via MCP, but the JSON array-of-objects format repeats every column name on every row — wasting ~42% of the payload on redundant keys. For a 100-row × 10-column result, JSON produces ~27K chars vs ~14K for a markdown table. This adds an `output_format` parameter (default: "markdown") to `execute_sql` and `execute_sql_multi`. Markdown tables state column names once in the header, which LLMs parse natively. Use `output_format="json"` for backwards compatibility. Closes #296
1 parent 1a00b36 commit 22f6907

2 files changed

Lines changed: 141 additions & 5 deletions

File tree

databricks-mcp-server/databricks_mcp_server/tools/sql.py

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""SQL tools - Execute SQL queries and get table information."""
22

3-
from typing import Any, Dict, List, Optional
3+
from typing import Any, Dict, List, Optional, Union
44

55
from databricks_tools_core.sql import (
66
execute_sql as _execute_sql,
@@ -14,6 +14,45 @@
1414
from ..server import mcp
1515

1616

17+
def _format_results_markdown(rows: List[Dict[str, Any]]) -> str:
18+
"""Format SQL results as a markdown table.
19+
20+
Markdown tables state column names once in the header instead of repeating
21+
them on every row (as JSON does), reducing token usage by ~50%.
22+
23+
Args:
24+
rows: List of row dicts from the SQL executor.
25+
26+
Returns:
27+
Markdown table string, or "(no results)" if empty.
28+
"""
29+
if not rows:
30+
return "(no results)"
31+
32+
columns = list(rows[0].keys())
33+
34+
# Build header
35+
header = "| " + " | ".join(columns) + " |"
36+
separator = "| " + " | ".join("---" for _ in columns) + " |"
37+
38+
# Build rows — convert None to empty string, stringify everything
39+
data_lines = []
40+
for row in rows:
41+
cells = []
42+
for col in columns:
43+
val = row.get(col)
44+
cell = "" if val is None else str(val)
45+
# Escape pipe characters inside cell values
46+
cell = cell.replace("|", "\\|")
47+
cells.append(cell)
48+
data_lines.append("| " + " | ".join(cells) + " |")
49+
50+
parts = [header, separator] + data_lines
51+
# Append row count for awareness
52+
parts.append(f"\n({len(rows)} row{'s' if len(rows) != 1 else ''})")
53+
return "\n".join(parts)
54+
55+
1756
@mcp.tool
1857
def execute_sql(
1958
sql_query: str,
@@ -22,7 +61,8 @@ def execute_sql(
2261
schema: str = None,
2362
timeout: int = 180,
2463
query_tags: str = None,
25-
) -> List[Dict[str, Any]]:
64+
output_format: str = "markdown",
65+
) -> Union[str, List[Dict[str, Any]]]:
2666
"""
2767
Execute a SQL query on a Databricks SQL Warehouse.
2868
@@ -40,18 +80,25 @@ def execute_sql(
4080
timeout: Timeout in seconds (default: 180)
4181
query_tags: Optional query tags for cost attribution (e.g., "team:eng,cost_center:701").
4282
Appears in system.query.history and Query History UI.
83+
output_format: Result format — "markdown" (default) or "json".
84+
Markdown tables are ~50% smaller than JSON because column names appear
85+
only once in the header instead of on every row. Use "json" when you
86+
need machine-parseable output.
4387
4488
Returns:
45-
List of dictionaries, each representing a row with column names as keys.
89+
Markdown table string (default) or list of row dictionaries (if output_format="json").
4690
"""
47-
return _execute_sql(
91+
rows = _execute_sql(
4892
sql_query=sql_query,
4993
warehouse_id=warehouse_id,
5094
catalog=catalog,
5195
schema=schema,
5296
timeout=timeout,
5397
query_tags=query_tags,
5498
)
99+
if output_format == "json":
100+
return rows
101+
return _format_results_markdown(rows)
55102

56103

57104
@mcp.tool
@@ -63,6 +110,7 @@ def execute_sql_multi(
63110
timeout: int = 180,
64111
max_workers: int = 4,
65112
query_tags: str = None,
113+
output_format: str = "markdown",
66114
) -> Dict[str, Any]:
67115
"""
68116
Execute multiple SQL statements with dependency-aware parallelism.
@@ -82,11 +130,14 @@ def execute_sql_multi(
82130
timeout: Timeout per query in seconds (default: 180)
83131
max_workers: Maximum parallel queries per group (default: 4)
84132
query_tags: Optional query tags for cost attribution (e.g., "team:eng,cost_center:701").
133+
output_format: Result format — "markdown" (default) or "json".
134+
Markdown tables are ~50% smaller than JSON because column names appear
135+
only once in the header instead of on every row.
85136
86137
Returns:
87138
Dictionary with results per query and execution summary.
88139
"""
89-
return _execute_sql_multi(
140+
result = _execute_sql_multi(
90141
sql_content=sql_content,
91142
warehouse_id=warehouse_id,
92143
catalog=catalog,
@@ -95,6 +146,13 @@ def execute_sql_multi(
95146
max_workers=max_workers,
96147
query_tags=query_tags,
97148
)
149+
# Format sample_results in each query result if markdown requested
150+
if output_format != "json" and "results" in result:
151+
for query_result in result["results"].values():
152+
sample = query_result.get("sample_results")
153+
if sample and isinstance(sample, list) and len(sample) > 0:
154+
query_result["sample_results"] = _format_results_markdown(sample)
155+
return result
98156

99157

100158
@mcp.tool
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
"""Unit tests for SQL output formatting (markdown vs JSON)."""
2+
3+
from databricks_mcp_server.tools.sql import _format_results_markdown
4+
5+
6+
class TestFormatResultsMarkdown:
    """Tests for the _format_results_markdown helper."""

    def test_empty_list_returns_no_results(self):
        """An empty result set yields the placeholder string, not a table."""
        assert _format_results_markdown([]) == "(no results)"

    def test_single_row(self):
        """One row produces header, separator, one data line and a singular footer."""
        rows = [{"id": "1", "name": "Alice"}]
        result = _format_results_markdown(rows)
        lines = result.strip().split("\n")
        assert lines[0] == "| id | name |"
        assert lines[1] == "| --- | --- |"
        assert lines[2] == "| 1 | Alice |"
        assert "(1 row)" in result

    def test_multiple_rows(self):
        """Rows are emitted in input order and the footer is pluralised."""
        rows = [
            {"id": "1", "name": "Alice", "city": "NYC"},
            {"id": "2", "name": "Bob", "city": "Chicago"},
            {"id": "3", "name": "Carol", "city": "Denver"},
        ]
        result = _format_results_markdown(rows)
        lines = result.strip().split("\n")
        # Header + separator + 3 data rows + blank + count
        assert lines[0] == "| id | name | city |"
        assert lines[1] == "| --- | --- | --- |"
        assert lines[2] == "| 1 | Alice | NYC |"
        assert lines[3] == "| 2 | Bob | Chicago |"
        assert lines[4] == "| 3 | Carol | Denver |"
        assert "(3 rows)" in result

    def test_none_values_become_empty(self):
        """None renders as an empty cell."""
        rows = [{"id": "1", "name": None}]
        result = _format_results_markdown(rows)
        # An empty cell still gets the " | " delimiter and the closing " |",
        # so two spaces sit between the last two pipes.
        assert "| 1 |  |" in result

    def test_pipe_chars_escaped(self):
        """Pipes inside values are backslash-escaped so they do not split the cell."""
        rows = [{"expr": "a | b"}]
        result = _format_results_markdown(rows)
        assert "a \\| b" in result

    def test_column_names_appear_once(self):
        """The whole point: column names should appear exactly once (in the header)."""
        rows = [
            {"event_id": "1", "event_name": "Concert A"},
            {"event_id": "2", "event_name": "Concert B"},
            {"event_id": "3", "event_name": "Concert C"},
        ]
        result = _format_results_markdown(rows)
        # Column name should appear once in header, not repeated per row
        assert result.count("event_id") == 1
        assert result.count("event_name") == 1

    def test_markdown_smaller_than_json(self):
        """Markdown output should be significantly smaller than JSON for many rows."""
        import json

        rows = [
            {
                "id": str(i),
                "name": f"User {i}",
                "email": f"user{i}@example.com",
                "department": "Engineering",
                "status": "Active",
            }
            for i in range(50)
        ]
        md = _format_results_markdown(rows)
        js = json.dumps(rows)
        # Markdown should be at least 30% smaller
        assert len(md) < len(js) * 0.7, (
            f"Markdown ({len(md)} chars) should be <70% of JSON ({len(js)} chars)"
        )

0 commit comments

Comments
 (0)