
Commit 46fd88b

Merge branch 'apache:main' into feat/update-sort-order
2 parents 03f0a1b + 05f07ee commit 46fd88b

File tree: 15 files changed (+699, -228 lines)

.github/workflows/pypi-build-artifacts.yml

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ jobs:
         if: startsWith(matrix.os, 'ubuntu')
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.23.2
+        uses: pypa/cibuildwheel@v2.23.3
         with:
           output-dir: wheelhouse
           config-file: "pyproject.toml"

.github/workflows/svn-build-artifacts.yml

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ jobs:
         if: startsWith(matrix.os, 'ubuntu')
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.23.2
+        uses: pypa/cibuildwheel@v2.23.3
         with:
          output-dir: wheelhouse
          config-file: "pyproject.toml"

dev/Dockerfile

Lines changed: 3 additions & 3 deletions
@@ -39,20 +39,20 @@ WORKDIR ${SPARK_HOME}
 # Remember to also update `tests/conftest`'s spark setting
 ENV SPARK_VERSION=3.5.4
 ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
-ENV ICEBERG_VERSION=1.9.0-SNAPSHOT
+ENV ICEBERG_VERSION=1.9.0
 ENV PYICEBERG_VERSION=0.9.0
 
 RUN curl --retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
     && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \
     && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz
 
 # Download iceberg spark runtime
-RUN curl --retry 5 -s https://repository.apache.org/content/groups/snapshots/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.9.0-SNAPSHOT/iceberg-spark-runtime-3.5_2.12-1.9.0-20250409.001855-44.jar \
+RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
     -Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar
 
 
 # Download AWS bundle
-RUN curl --retry 5 -s https://repository.apache.org/content/groups/snapshots/org/apache/iceberg/iceberg-aws-bundle/1.9.0-SNAPSHOT/iceberg-aws-bundle-1.9.0-20250409.002731-88.jar \
+RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
     -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
 
 COPY spark-defaults.conf /opt/spark/conf

poetry.lock

Lines changed: 155 additions & 155 deletions
Some generated files are not rendered by default.

pyiceberg/exceptions.py

Lines changed: 4 additions & 0 deletions
@@ -122,3 +122,7 @@ class CommitStateUnknownException(RESTError):
 
 class WaitingForLockException(Exception):
     """Need to wait for a lock, try again."""
+
+
+class ValidationException(Exception):
+    """Raised when validation fails."""

pyiceberg/table/snapshots.py

Lines changed: 15 additions & 2 deletions
@@ -272,10 +272,10 @@ class SnapshotSummaryCollector:
     partition_metrics: DefaultDict[str, UpdateMetrics]
     max_changed_partitions_for_summaries: int
 
-    def __init__(self) -> None:
+    def __init__(self, partition_summary_limit: int = 0) -> None:
         self.metrics = UpdateMetrics()
         self.partition_metrics = defaultdict(UpdateMetrics)
-        self.max_changed_partitions_for_summaries = 0
+        self.max_changed_partitions_for_summaries = partition_summary_limit
 
     def set_partition_summary_limit(self, limit: int) -> None:
         self.max_changed_partitions_for_summaries = limit
@@ -435,3 +435,16 @@ def ancestors_of(current_snapshot: Optional[Snapshot], table_metadata: TableMetadata
         if snapshot.parent_snapshot_id is None:
             break
         snapshot = table_metadata.snapshot_by_id(snapshot.parent_snapshot_id)
+
+
+def ancestors_between(
+    from_snapshot: Optional[Snapshot], to_snapshot: Snapshot, table_metadata: TableMetadata
+) -> Iterable[Snapshot]:
+    """Get the ancestors of and including the given snapshot between the to and from snapshots."""
+    if from_snapshot is not None:
+        for snapshot in ancestors_of(to_snapshot, table_metadata):
+            yield snapshot
+            if snapshot == from_snapshot:
+                break
+    else:
+        yield from ancestors_of(to_snapshot, table_metadata)
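A hedged usage sketch of the new `ancestors_between` helper, assuming a configured catalog and a table with at least two snapshots (the catalog and table names below are placeholders):

```python
from pyiceberg.catalog import load_catalog
from pyiceberg.table.snapshots import ancestors_between

# Hypothetical catalog/table; any table with snapshot history would do.
tbl = load_catalog("default").load_table("db.events")

to_snapshot = tbl.current_snapshot()
from_snapshot = tbl.snapshot_by_id(to_snapshot.parent_snapshot_id)

# Walks from `to_snapshot` back through its ancestors, stopping once
# `from_snapshot` has been yielded; with from_snapshot=None it walks all ancestors.
for snapshot in ancestors_between(from_snapshot, to_snapshot, tbl.metadata):
    print(snapshot.snapshot_id, snapshot.summary.operation if snapshot.summary else None)
```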

pyiceberg/table/update/snapshot.py

Lines changed: 1 addition & 2 deletions
@@ -203,13 +203,12 @@ def _write_delete_manifest() -> List[ManifestFile]:
     def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary:
         from pyiceberg.table import TableProperties
 
-        ssc = SnapshotSummaryCollector()
         partition_summary_limit = int(
             self._transaction.table_metadata.properties.get(
                 TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, TableProperties.WRITE_PARTITION_SUMMARY_LIMIT_DEFAULT
             )
         )
-        ssc.set_partition_summary_limit(partition_summary_limit)
+        ssc = SnapshotSummaryCollector(partition_summary_limit=partition_summary_limit)
 
         for data_file in self._added_data_files:
             ssc.add_file(
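A minimal sketch of what this refactor means for callers: the partition summary limit can now be passed at construction time instead of being set afterwards (the limit value below is arbitrary):

```python
from pyiceberg.table.snapshots import SnapshotSummaryCollector

# Previously: construct first, then set the limit.
ssc = SnapshotSummaryCollector()
ssc.set_partition_summary_limit(100)

# Now: the same limit can be supplied up front; the setter still exists.
ssc = SnapshotSummaryCollector(partition_summary_limit=100)
```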

pyiceberg/table/update/validate.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
from pyiceberg.exceptions import ValidationException
19+
from pyiceberg.manifest import ManifestContent, ManifestFile
20+
from pyiceberg.table import Table
21+
from pyiceberg.table.snapshots import Operation, Snapshot, ancestors_between
22+
23+
24+
def validation_history(
25+
table: Table,
26+
from_snapshot: Snapshot,
27+
to_snapshot: Snapshot,
28+
matching_operations: set[Operation],
29+
manifest_content_filter: ManifestContent,
30+
) -> tuple[list[ManifestFile], set[int]]:
31+
"""Return newly added manifests and snapshot IDs between the starting snapshot and parent snapshot.
32+
33+
Args:
34+
table: Table to get the history from
35+
from_snapshot: Parent snapshot to get the history from
36+
to_snapshot: Starting snapshot
37+
matching_operations: Operations to match on
38+
manifest_content_filter: Manifest content type to filter
39+
40+
Raises:
41+
ValidationException: If no matching snapshot is found or only one snapshot is found
42+
43+
Returns:
44+
List of manifest files and set of snapshots ID's matching conditions
45+
"""
46+
manifests_files: list[ManifestFile] = []
47+
snapshots: set[int] = set()
48+
49+
last_snapshot = None
50+
for snapshot in ancestors_between(from_snapshot, to_snapshot, table.metadata):
51+
last_snapshot = snapshot
52+
summary = snapshot.summary
53+
if summary is None:
54+
raise ValidationException(f"No summary found for snapshot {snapshot}!")
55+
if summary.operation not in matching_operations:
56+
continue
57+
58+
snapshots.add(snapshot.snapshot_id)
59+
# TODO: Maybe do the IO in a separate thread at some point, and collect at the bottom (we can easily merge the sets
60+
manifests_files.extend(
61+
[
62+
manifest
63+
for manifest in snapshot.manifests(table.io)
64+
if manifest.added_snapshot_id == snapshot.snapshot_id and manifest.content == manifest_content_filter
65+
]
66+
)
67+
68+
if last_snapshot is not None and last_snapshot.snapshot_id != from_snapshot.snapshot_id:
69+
raise ValidationException("No matching snapshot found.")
70+
71+
return manifests_files, snapshots
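A hedged usage sketch of the new helper, assuming an existing table whose last two snapshots are available (the catalog and table names are placeholders, and the chosen operations and content filter are only one example):

```python
from pyiceberg.catalog import load_catalog
from pyiceberg.manifest import ManifestContent
from pyiceberg.table.snapshots import Operation
from pyiceberg.table.update.validate import validation_history

tbl = load_catalog("default").load_table("db.events")  # hypothetical table

to_snapshot = tbl.current_snapshot()
from_snapshot = tbl.snapshot_by_id(to_snapshot.parent_snapshot_id)

# Collect delete manifests added by DELETE/OVERWRITE snapshots between the two points.
manifests, snapshot_ids = validation_history(
    tbl,
    from_snapshot,
    to_snapshot,
    {Operation.DELETE, Operation.OVERWRITE},
    ManifestContent.DELETES,
)
print(len(manifests), snapshot_ids)
```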

pyiceberg/table/upsert_util.py

Lines changed: 48 additions & 21 deletions
@@ -62,7 +62,8 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
     """
     all_columns = set(source_table.column_names)
     join_cols_set = set(join_cols)
-    non_key_cols = all_columns - join_cols_set
+
+    non_key_cols = list(all_columns - join_cols_set)
 
     if has_duplicate_rows(target_table, join_cols):
         raise ValueError("Target table has duplicate rows, aborting upsert")
@@ -71,25 +72,51 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
         # When the target table is empty, there is nothing to update :)
         return source_table.schema.empty_table()
 
-    diff_expr = functools.reduce(
-        operator.or_,
-        [
-            pc.or_kleene(
-                pc.not_equal(pc.field(f"{col}-lhs"), pc.field(f"{col}-rhs")),
-                pc.is_null(pc.not_equal(pc.field(f"{col}-lhs"), pc.field(f"{col}-rhs"))),
-            )
-            for col in non_key_cols
-        ],
+    # We need to compare non_key_cols in Python as PyArrow
+    # 1. Cannot do a join when non-join columns have complex types
+    # 2. Cannot compare columns with complex types
+    # See: https://github.com/apache/arrow/issues/35785
+    SOURCE_INDEX_COLUMN_NAME = "__source_index"
+    TARGET_INDEX_COLUMN_NAME = "__target_index"
+
+    if SOURCE_INDEX_COLUMN_NAME in join_cols or TARGET_INDEX_COLUMN_NAME in join_cols:
+        raise ValueError(
+            f"{SOURCE_INDEX_COLUMN_NAME} and {TARGET_INDEX_COLUMN_NAME} are reserved for joining "
+            f"DataFrames, and cannot be used as column names"
+        ) from None
+
+    # Step 1: Prepare source index with join keys and a marker index
+    # Cast to target table schema, so we can do the join
+    # See: https://github.com/apache/arrow/issues/37542
+    source_index = (
+        source_table.cast(target_table.schema)
+        .select(join_cols_set)
+        .append_column(SOURCE_INDEX_COLUMN_NAME, pa.array(range(len(source_table))))
     )
 
-    return (
-        source_table
-        # We already know that the schema is compatible, this is to fix large_ types
-        .cast(target_table.schema)
-        .join(target_table, keys=list(join_cols_set), join_type="inner", left_suffix="-lhs", right_suffix="-rhs")
-        .filter(diff_expr)
-        .drop_columns([f"{col}-rhs" for col in non_key_cols])
-        .rename_columns({f"{col}-lhs" if col not in join_cols else col: col for col in source_table.column_names})
-        # Finally cast to the original schema since it doesn't carry nullability:
-        # https://github.com/apache/arrow/issues/45557
-    ).cast(target_table.schema)
+    # Step 2: Prepare target index with join keys and a marker
+    target_index = target_table.select(join_cols_set).append_column(TARGET_INDEX_COLUMN_NAME, pa.array(range(len(target_table))))
+
+    # Step 3: Perform an inner join to find which rows from source exist in target
+    matching_indices = source_index.join(target_index, keys=list(join_cols_set), join_type="inner")
+
+    # Step 4: Compare all rows using Python
+    to_update_indices = []
+    for source_idx, target_idx in zip(
+        matching_indices[SOURCE_INDEX_COLUMN_NAME].to_pylist(), matching_indices[TARGET_INDEX_COLUMN_NAME].to_pylist()
+    ):
+        source_row = source_table.slice(source_idx, 1)
+        target_row = target_table.slice(target_idx, 1)
+
+        for key in non_key_cols:
+            source_val = source_row.column(key)[0].as_py()
+            target_val = target_row.column(key)[0].as_py()
+            if source_val != target_val:
+                to_update_indices.append(source_idx)
+                break
+
+    # Step 5: Take rows from source table using the indices and cast to target schema
+    if to_update_indices:
+        return source_table.take(to_update_indices)
+    else:
+        return source_table.schema.empty_table()
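The rewritten logic can be exercised outside PyIceberg as well. Below is a minimal standalone sketch of the same index-join-then-compare approach, using only PyArrow and two tiny tables with a struct (complex) column; table contents and column names are invented for illustration:

```python
import pyarrow as pa

source = pa.table({"id": [1, 2, 3], "payload": [{"v": 1}, {"v": 2}, {"v": 3}]})
target = pa.table({"id": [1, 2], "payload": [{"v": 1}, {"v": 99}]})
join_cols = ["id"]
non_key_cols = [c for c in source.column_names if c not in join_cols]

# Join only the key columns plus positional markers, since PyArrow cannot
# join on or compare struct columns directly.
source_index = source.select(join_cols).append_column("__source_index", pa.array(range(len(source))))
target_index = target.select(join_cols).append_column("__target_index", pa.array(range(len(target))))
matches = source_index.join(target_index, keys=join_cols, join_type="inner")

# Compare the non-key columns row by row in plain Python.
to_update = [
    s
    for s, t in zip(matches["__source_index"].to_pylist(), matches["__target_index"].to_pylist())
    if any(source[c][s].as_py() != target[c][t].as_py() for c in non_key_cols)
]
print(source.take(to_update))  # only id=2 differs, so only that row is returned
```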

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -105,7 +105,7 @@ docutils = "!=0.21.post1" # https://github.com/python-poetry/poetry/issues/924
 [tool.poetry.group.docs.dependencies]
 # for mkdocs
 mkdocs = "1.6.1"
-griffe = "1.7.2"
+griffe = "1.7.3"
 jinja2 = "3.1.6"
 mkdocstrings = "0.29.1"
 mkdocstrings-python = "1.16.10"
