Skip to content

Commit bb4516f

Browse files
committed
feat: Validate display configuration values in DataFrame
- Added validation to ensure max_table_bytes, min_table_rows, and max_cell_length are greater than 0 in the configure_display method of the DataFrame class.
- Updated test cases to cover zero and negative values, ensuring a ValueError is raised.
- Enhanced existing tests to validate extreme values and confirm the expected behavior of the display configuration.
1 parent 5aae267 commit bb4516f

File tree

3 files changed

+315
-9
lines changed

3 files changed

+315
-9
lines changed

python/datafusion/dataframe.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -828,7 +828,16 @@ def configure_display(
828828
This is used for initial display and in notebooks.
829829
max_cell_length: Maximum length of a cell before it gets minimized (default: 25).
830830
Longer cells will be truncated with an expand button.
831+
832+
Raises:
833+
ValueError: If any of the provided values are less than or equal to 0.
831834
"""
835+
if any(
836+
value is not None and value <= 0
837+
for value in (max_table_bytes, min_table_rows, max_cell_length)
838+
):
839+
raise ValueError("All values must be greater than 0.")
840+
832841
self.df.configure_display(max_table_bytes, min_table_rows, max_cell_length)
833842

834843
def reset_display_config(self) -> None:

python/tests/test_dataframe.py

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1293,6 +1293,35 @@ def test_configure_display(df):
12931293
assert config.min_table_rows == 5 # only this value changed
12941294
assert config.max_cell_length == 50 # previous value retained
12951295

1296+
# Test with extreme values: zero/negative values must be rejected, very large values are accepted
1297+
# Zero values
1298+
with pytest.raises(ValueError, match=r".*must be greater than 0.*"):
1299+
df.configure_display(max_table_bytes=0, min_table_rows=0, max_cell_length=0)
1300+
1301+
# Very large values
1302+
df.configure_display(
1303+
max_table_bytes=10**12, min_table_rows=10**6, max_cell_length=10**4
1304+
)
1305+
config = df.display_config
1306+
assert config.max_table_bytes == 10**12 # 1 TB
1307+
assert config.min_table_rows == 10**6 # 1 million rows
1308+
assert config.max_cell_length == 10**4 # 10,000 chars per cell
1309+
1310+
# Test with negative values
1311+
# This tests for expected behavior when users accidentally pass negative values
1312+
# Since these are usize in Rust, configure_display validates them first and raises a Python ValueError for negative values
1313+
with pytest.raises(ValueError, match=r".*must be greater than 0.*"):
1314+
df.configure_display(max_table_bytes=-1)
1315+
1316+
with pytest.raises(ValueError, match=r".*must be greater than 0.*"):
1317+
df.configure_display(min_table_rows=-5)
1318+
1319+
with pytest.raises(ValueError, match=r".*must be greater than 0.*"):
1320+
df.configure_display(max_cell_length=-10)
1321+
1322+
# Reset for next tests
1323+
df.reset_display_config()
1324+
12961325

12971326
def test_reset_display_config(df):
12981327
"""Test resetting display configuration to defaults."""
@@ -1313,3 +1342,175 @@ def test_reset_display_config(df):
13131342
assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB
13141343
assert config.min_table_rows == 20
13151344
assert config.max_cell_length == 25
1345+
1346+
1347+
def test_min_table_rows_display(ctx):
1348+
"""Test that at least min_table_rows rows are displayed."""
1349+
# Create a dataframe with more rows than the default min_table_rows
1350+
rows = 100
1351+
data = list(range(rows))
1352+
batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"])
1353+
df = ctx.create_dataframe([[batch]])
1354+
1355+
# Set min_table_rows to a specific value
1356+
custom_min_rows = 30
1357+
df.configure_display(min_table_rows=custom_min_rows)
1358+
1359+
# Get HTML representation
1360+
html_output = df._repr_html_()
1361+
1362+
# Count table rows in the HTML (excluding header row)
1363+
# Each row has a <tr> tag
1364+
row_count = html_output.count("<tr>") - 1 # subtract 1 for the header row
1365+
1366+
# Verify at least min_table_rows rows are displayed
1367+
assert (
1368+
row_count >= custom_min_rows
1369+
), f"Expected at least {custom_min_rows} rows, got {row_count}"
1370+
1371+
# If data was truncated, "Data truncated" message should be present
1372+
if row_count < rows:
1373+
assert "Data truncated" in html_output
1374+
1375+
1376+
def test_max_table_bytes_display(ctx):
1377+
"""Test that reducing max_table_bytes limits the amount of data displayed."""
1378+
# Create a dataframe with large string values to consume memory
1379+
# Each string is approximately 1000 bytes
1380+
large_strings = ["x" * 1000 for _ in range(50)]
1381+
batch = pa.RecordBatch.from_arrays([pa.array(large_strings)], names=["large_data"])
1382+
df = ctx.create_dataframe([[batch]])
1383+
1384+
# First test with default settings
1385+
default_html = df._repr_html_()
1386+
default_row_count = default_html.count("<tr>") - 1 # subtract header row
1387+
1388+
# Now set a very small max_table_bytes
1389+
df.configure_display(max_table_bytes=5000) # 5KB should only fit a few rows
1390+
limited_html = df._repr_html_()
1391+
limited_row_count = limited_html.count("<tr>") - 1
1392+
1393+
# Verify fewer rows are displayed with the byte limit
1394+
assert (
1395+
limited_row_count < default_row_count
1396+
), f"Expected fewer rows with byte limit. Default: {default_row_count}, Limited: {limited_row_count}"
1397+
1398+
# "Data truncated" should be present when limited
1399+
assert "Data truncated" in limited_html
1400+
1401+
1402+
def test_max_cell_length_display(ctx):
1403+
"""Test that cells longer than max_cell_length are truncated in display."""
1404+
# Create a dataframe with long string values
1405+
long_strings = [
1406+
"short",
1407+
"medium text",
1408+
"this is a very long string that should be truncated",
1409+
]
1410+
batch = pa.RecordBatch.from_arrays([pa.array(long_strings)], names=["text"])
1411+
df = ctx.create_dataframe([[batch]])
1412+
1413+
# Set a small max_cell_length
1414+
max_length = 10
1415+
df.configure_display(max_cell_length=max_length)
1416+
1417+
# Get HTML representation
1418+
html_output = df._repr_html_()
1419+
1420+
# Check for expand button for long text
1421+
assert "expandable-container" in html_output
1422+
1423+
# Check that expandable class is used for long text
1424+
assert 'class="expandable"' in html_output
1425+
1426+
# Look for the truncated text and expand button
1427+
long_text = long_strings[2]
1428+
assert long_text[:max_length] in html_output # Truncated text should be present
1429+
assert "expand-btn" in html_output # Expand button should be present
1430+
assert long_text in html_output # Full text should also be in the HTML (hidden)
1431+
1432+
1433+
def test_display_config_repr_string(ctx):
1434+
"""Test that __repr__ respects display configuration."""
1435+
# Create a dataframe with more rows than we want to show
1436+
rows = 30
1437+
data = list(range(rows))
1438+
batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"])
1439+
df = ctx.create_dataframe([[batch]])
1440+
1441+
# Lower min_table_rows to 5 so the string representation is expected to be truncated
1442+
df.configure_display(min_table_rows=5)
1443+
1444+
# Get the string representation
1445+
repr_str = df.__repr__()
1446+
1447+
# The string should contain "Data truncated"
1448+
assert "Data truncated" in repr_str
1449+
1450+
# Count the number of rows (each value should be on a separate line)
1451+
# This is an approximation since we don't parse the actual ASCII table
1452+
value_lines = 0
1453+
for i in range(rows):
1454+
if str(i) in repr_str:
1455+
value_lines += 1
1456+
1457+
# Should be fewer rows than the total
1458+
assert value_lines < rows
1459+
1460+
# Now set min_rows higher and see if more rows appear
1461+
df.configure_display(min_table_rows=20)
1462+
repr_str_more = df.__repr__()
1463+
1464+
value_lines_more = 0
1465+
for i in range(rows):
1466+
if str(i) in repr_str_more:
1467+
value_lines_more += 1
1468+
1469+
assert value_lines_more > value_lines
1470+
1471+
1472+
def test_display_config_integrated(ctx):
1473+
"""Test all display config options together in an integrated test."""
1474+
# Create a dataframe with:
1475+
# - Many rows (to test min_table_rows)
1476+
# - Large data (to test max_table_bytes)
1477+
# - Long strings (to test max_cell_length)
1478+
rows = 50
1479+
ids = list(range(rows))
1480+
# Generate strings of increasing length
1481+
texts = [f"{'A' * i}" for i in range(1, rows + 1)]
1482+
1483+
batch = pa.RecordBatch.from_arrays(
1484+
[pa.array(ids), pa.array(texts)], names=["id", "text"]
1485+
)
1486+
1487+
df = ctx.create_dataframe([[batch]])
1488+
1489+
# Set custom display configuration
1490+
df.configure_display(
1491+
max_table_bytes=2000, # Limit bytes to display
1492+
min_table_rows=15, # Show at least 15 rows
1493+
max_cell_length=10, # Truncate cells longer than 10 chars
1494+
)
1495+
1496+
# Get HTML representation
1497+
html_output = df._repr_html_()
1498+
1499+
# Check row count
1500+
row_count = html_output.count("<tr>") - 1 # subtract header
1501+
assert row_count >= 15, f"Should display at least 15 rows, got {row_count}"
1502+
1503+
# Check for truncation
1504+
assert "expandable-container" in html_output
1505+
assert "expand-btn" in html_output
1506+
1507+
# Should be truncated (not all rows displayed)
1508+
assert "Data truncated" in html_output
1509+
1510+
# Now with default settings
1511+
df.reset_display_config()
1512+
default_html = df._repr_html_()
1513+
default_row_count = default_html.count("<tr>") - 1
1514+
1515+
# Default settings should show more data
1516+
assert default_row_count > row_count

0 commit comments

Comments
 (0)