Skip to content

Commit 55e8516

Browse files
committed
feat: Add utilities for reading Parquet bloom filters
Add utility functions to read and check bloom filters directly from Parquet files using PyArrow, without requiring Iceberg spec changes.
- get_parquet_bloom_filter_for_column(): Extract the bloom filter for a column from a Parquet row group.
- bloom_filter_might_contain(): Check whether a value might be in a bloom filter.
This provides a foundation for future bloom filter integration without modifying the Iceberg manifest specification.
1 parent bb41a6d commit 55e8516

File tree

2 files changed

+157
-0
lines changed

2 files changed

+157
-0
lines changed

pyiceberg/table/bloom_filter.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
"""Bloom filter support for reading from Parquet files."""
18+
19+
from __future__ import annotations
20+
21+
from typing import TYPE_CHECKING, Any
22+
23+
if TYPE_CHECKING:
24+
import pyarrow.parquet as pq
25+
26+
27+
def get_parquet_bloom_filter_for_column(parquet_file: pq.ParquetFile, column_name: str, row_group_index: int) -> Any | None:
    """Look up the bloom filter of one column chunk in a Parquet row group.

    The lookup is best-effort: any error raised while walking the metadata is
    swallowed and reported as "no bloom filter available".

    Args:
        parquet_file: PyArrow ParquetFile whose metadata is inspected.
        column_name: Name compared against each column chunk's ``path_in_schema``.
        row_group_index: Index of the row group to inspect.

    Returns:
        The ``bloom_filter`` attribute of the first matching column chunk, or
        None when no column matches, the attribute is absent, or an error occurs.
    """
    try:
        row_group_meta = parquet_file.metadata.row_group(row_group_index)

        # Lazily walk the column chunks so the scan stops at the first match.
        chunks = (row_group_meta.column(position) for position in range(row_group_meta.num_columns))
        matching_chunk = next((chunk for chunk in chunks if chunk.path_in_schema == column_name), None)
        if matching_chunk is None:
            return None

        # The attribute is probed at runtime because the metadata object is not
        # guaranteed to expose ``bloom_filter``; a missing attribute yields None.
        return getattr(matching_chunk, "bloom_filter", None)
    except Exception:
        # Reading the bloom filter is optional; treat any failure as "not available".
        return None
55+
56+
57+
def bloom_filter_might_contain(bloom_filter: Any, value: Any) -> bool:
    """Check if a bloom filter might contain a value.

    The answer is conservative: whenever the filter cannot be consulted
    (missing filter, missing value, unsupported filter object, or an error
    during the lookup) the function reports that the value might be present.

    Args:
        bloom_filter: Bloom filter object. It is queried through a ``check``
            method when it has one, otherwise through ``__contains__``.
        value: Value to check.

    Returns:
        True if the value might be in the filter, False if the filter reports
        that it is definitely not. The result is always a real ``bool``.
    """
    if bloom_filter is None or value is None:
        return True  # Conservative: nothing to consult, assume it might contain

    try:
        if hasattr(bloom_filter, "check"):
            # Coerce the result so the declared ``bool`` return type holds even
            # when ``check`` returns another truthy/falsy object (e.g. 1 or 0).
            return bool(bloom_filter.check(value))
        if hasattr(bloom_filter, "__contains__"):
            return value in bloom_filter
        return True  # Conservative: no known way to query this object
    except Exception:
        return True  # On error, be conservative

tests/table/test_bloom_filter.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
"""Tests for bloom filter utility functions."""
18+
19+
from unittest.mock import MagicMock
20+
21+
from pyiceberg.table.bloom_filter import bloom_filter_might_contain, get_parquet_bloom_filter_for_column
22+
23+
24+
class TestBloomFilterUtilities:
    """Test cases for Parquet bloom filter reading utilities."""

    def test_get_parquet_bloom_filter_returns_none_when_not_available(self) -> None:
        """Test that getting a bloom filter returns None when not available."""
        # Mock a ParquetFile whose only column has no bloom_filter attribute
        mock_parquet_file = MagicMock()
        mock_row_group = MagicMock()
        mock_column = MagicMock()
        mock_column.path_in_schema = "test_column"
        del mock_column.bloom_filter  # Ensure bloom_filter attribute doesn't exist

        mock_row_group.num_columns = 1
        mock_row_group.column.return_value = mock_column
        mock_parquet_file.metadata.row_group.return_value = mock_row_group

        result = get_parquet_bloom_filter_for_column(mock_parquet_file, "test_column", 0)
        assert result is None

    def test_get_parquet_bloom_filter_returns_filter_when_available(self) -> None:
        """Test that the column's bloom filter object is returned when present."""
        mock_bloom_filter = MagicMock()
        mock_parquet_file = MagicMock()
        mock_row_group = MagicMock()
        mock_column = MagicMock()
        mock_column.path_in_schema = "test_column"
        mock_column.bloom_filter = mock_bloom_filter

        mock_row_group.num_columns = 1
        mock_row_group.column.return_value = mock_column
        mock_parquet_file.metadata.row_group.return_value = mock_row_group

        result = get_parquet_bloom_filter_for_column(mock_parquet_file, "test_column", 0)
        assert result is mock_bloom_filter
        mock_parquet_file.metadata.row_group.assert_called_once_with(0)

    def test_get_parquet_bloom_filter_returns_none_when_column_not_found(self) -> None:
        """Test that getting a bloom filter returns None when no column matches."""
        mock_parquet_file = MagicMock()
        mock_row_group = MagicMock()
        mock_column = MagicMock()
        mock_column.path_in_schema = "other_column"
        mock_column.bloom_filter = MagicMock()

        mock_row_group.num_columns = 1
        mock_row_group.column.return_value = mock_column
        mock_parquet_file.metadata.row_group.return_value = mock_row_group

        result = get_parquet_bloom_filter_for_column(mock_parquet_file, "test_column", 0)
        assert result is None

    def test_bloom_filter_might_contain_returns_true_when_filter_is_none(self) -> None:
        """Test that might_contain returns True conservatively when filter is None."""
        result = bloom_filter_might_contain(None, "test_value")
        assert result is True

    def test_bloom_filter_might_contain_returns_true_when_value_is_none(self) -> None:
        """Test that might_contain returns True conservatively when value is None."""
        mock_filter = MagicMock()
        result = bloom_filter_might_contain(mock_filter, None)
        assert result is True

    def test_bloom_filter_might_contain_uses_check_method(self) -> None:
        """Test that might_contain uses the check method if available."""
        mock_filter = MagicMock()
        mock_filter.check.return_value = True

        result = bloom_filter_might_contain(mock_filter, "test_value")
        assert result is True
        mock_filter.check.assert_called_once_with("test_value")

    def test_bloom_filter_might_contain_uses_contains_method(self) -> None:
        """Test that might_contain uses __contains__ if check is not available."""
        mock_filter = MagicMock()
        del mock_filter.check  # Remove check method
        # Return False so the result cannot be confused with the conservative
        # default of True that is used when the filter cannot be queried.
        mock_filter.__contains__.return_value = False

        result = bloom_filter_might_contain(mock_filter, "test_value")
        assert result is False
        mock_filter.__contains__.assert_called_once_with("test_value")

    def test_bloom_filter_might_contain_returns_true_on_exception(self) -> None:
        """Test that might_contain returns True conservatively on exception."""
        mock_filter = MagicMock()
        mock_filter.check.side_effect = Exception("Test error")

        result = bloom_filter_might_contain(mock_filter, "test_value")
        assert result is True

0 commit comments

Comments
 (0)