Skip to content

Commit 75ef45d

Browse files
authored
Core: Fix bin packing when target file size is smaller than a row (#2844)
## What does this change do?

When `write.target-file-size-bytes` is smaller than a single row, bin packing computed a chunk size of 0 rows and PyArrow raised a `ValueError`. This change clamps the chunk size to at least 1, so writes still succeed (one row per batch/file when needed).

## Why is this needed?

Fixes a crash when users set a small target file size and attempt to write large records.

## How was this tested?

- `make lint`
- `uv run python -m pytest tests/io/test_pyarrow.py -k "bin_pack_arrow_table" -v`
- `make test` (timed out at ~42%)

Closes #2795

Co-authored-by: Soham <010Soham@users.noreply.github.com>
1 parent 6f8f57e commit 75ef45d

File tree

2 files changed

+7
-1
lines changed

2 files changed

+7
-1
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2681,7 +2681,7 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[list[
26812681
from pyiceberg.utils.bin_packing import PackingIterator
26822682

26832683
avg_row_size_bytes = tbl.nbytes / tbl.num_rows
2684-
target_rows_per_file = target_file_size // avg_row_size_bytes
2684+
target_rows_per_file = max(1, int(target_file_size / avg_row_size_bytes))
26852685
batches = tbl.to_batches(max_chunksize=target_rows_per_file)
26862686
bin_packed_record_batches = PackingIterator(
26872687
items=batches,

tests/io/test_pyarrow.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2248,6 +2248,12 @@ def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None:
22482248
assert len(list(bin_packed)) == 5
22492249

22502250

2251+
def test_bin_pack_arrow_table_target_size_smaller_than_row(arrow_table_with_null: pa.Table) -> None:
2252+
bin_packed = list(bin_pack_arrow_table(arrow_table_with_null, target_file_size=1))
2253+
assert len(bin_packed) == arrow_table_with_null.num_rows
2254+
assert sum(batch.num_rows for bin_ in bin_packed for batch in bin_) == arrow_table_with_null.num_rows
2255+
2256+
22512257
def test_schema_mismatch_type(table_schema_simple: Schema) -> None:
22522258
other_schema = pa.schema(
22532259
(

0 commit comments

Comments
 (0)