Skip to content

Commit d6a123b

Browse files
committed
refactor: Remove spec-breaking changes, provide Parquet bloom filter utilities
- Remove bloom_filter_bytes field from manifest (spec-breaking)
- Simplify to utility functions for reading Parquet bloom filters
- Bloom filters can be read directly from Parquet files via PyArrow
- Provides foundation for future row-group level pruning without spec changes
1 parent 0c5ec51 commit d6a123b

File tree

5 files changed

+101
-734
lines changed

5 files changed

+101
-734
lines changed

pyiceberg/expressions/bloom_filter.py

Lines changed: 0 additions & 216 deletions
This file was deleted.

pyiceberg/manifest.py

Lines changed: 0 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -288,13 +288,6 @@ def __repr__(self) -> str:
288288
required=False,
289289
doc="ID representing sort order for this file",
290290
),
291-
NestedField(
292-
field_id=146,
293-
name="bloom_filter_bytes",
294-
field_type=MapType(key_id=147, key_type=IntegerType(), value_id=148, value_type=BinaryType()),
295-
required=False,
296-
doc="Map of column id to bloom filter",
297-
),
298291
),
299292
3: StructType(
300293
NestedField(
@@ -418,13 +411,6 @@ def __repr__(self) -> str:
418411
required=False,
419412
doc="The length of a referenced content stored in the file; required if content_offset is present",
420413
),
421-
NestedField(
422-
field_id=146,
423-
name="bloom_filter_bytes",
424-
field_type=MapType(key_id=147, key_type=IntegerType(), value_id=148, value_type=BinaryType()),
425-
required=False,
426-
doc="Map of column id to bloom filter",
427-
),
428414
),
429415
}
430416

@@ -528,17 +514,6 @@ def equality_ids(self) -> List[int] | None:
528514
def sort_order_id(self) -> int | None:
529515
return self._data[15]
530516

531-
@property
532-
def bloom_filter_bytes(self) -> Dict[int, bytes] | None:
533-
"""Get bloom filter bytes for all columns.
534-
535-
Returns a dict mapping column ID to bloom filter bytes.
536-
"""
537-
# Get bloom_filter_bytes which is the last field in the struct
538-
if len(self._data) > 16:
539-
return self._data[16]
540-
return None
541-
542517
# Spec ID should not be stored in the file
543518
_spec_id: int
544519

@@ -561,19 +536,6 @@ def __hash__(self) -> int:
561536
"""Return the hash of the file path."""
562537
return hash(self.file_path)
563538

564-
def get_bloom_filter(self, column_id: int) -> bytes | None:
565-
"""Get bloom filter bytes for a specific column.
566-
567-
Args:
568-
column_id: The column ID to get the bloom filter for.
569-
570-
Returns:
571-
Bloom filter bytes for the column, or None if not available.
572-
"""
573-
if self.bloom_filter_bytes and column_id in self.bloom_filter_bytes:
574-
return self.bloom_filter_bytes[column_id]
575-
return None
576-
577539
def __eq__(self, other: Any) -> bool:
578540
"""Compare the datafile with another object.
579541

pyiceberg/table/__init__.py

Lines changed: 0 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -1922,30 +1922,6 @@ def _build_residual_evaluator(self, spec_id: int) -> Callable[[DataFile], Residu
19221922
)
19231923
)
19241924

1925-
def _should_keep_file_with_bloom_filter(self, data_file: DataFile) -> bool:
1926-
"""Check if a data file should be kept based on bloom filter evaluation.
1927-
1928-
Args:
1929-
data_file: The data file to evaluate.
1930-
1931-
Returns:
1932-
True if the file should be kept, False if it can be pruned.
1933-
"""
1934-
if data_file.bloom_filter_bytes is None:
1935-
# No bloom filter for this file
1936-
return True
1937-
1938-
try:
1939-
from pyiceberg.expressions.bloom_filter import BloomFilterEvaluator
1940-
from pyiceberg.expressions.visitors import visit
1941-
1942-
# Use the bloom filter evaluator to check if the file might contain matching rows
1943-
evaluator = BloomFilterEvaluator(data_file, self.table_metadata.schema())
1944-
return visit(self.row_filter, evaluator)
1945-
except Exception:
1946-
# If there's any error evaluating bloom filters, be conservative and keep the file
1947-
return True
1948-
19491925
@staticmethod
19501926
def _check_sequence_number(min_sequence_number: int, manifest: ManifestFile) -> bool:
19511927
"""Ensure that no manifests are loaded that contain deletes that are older than the data.
@@ -2021,10 +1997,6 @@ def plan_files(self) -> Iterable[FileScanTask]:
20211997
for manifest_entry in chain.from_iterable(self.scan_plan_helper()):
20221998
data_file = manifest_entry.data_file
20231999
if data_file.content == DataFileContent.DATA:
2024-
# Apply bloom filter evaluation to prune files that definitely don't match the filter
2025-
if not self._should_keep_file_with_bloom_filter(data_file):
2026-
# Skip this file as it cannot contain matching rows
2027-
continue
20282000
data_entries.append(manifest_entry)
20292001
elif data_file.content == DataFileContent.POSITION_DELETES:
20302002
positional_delete_entries.add(manifest_entry)

0 commit comments

Comments
 (0)