@@ -62,7 +62,8 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
6262 """
6363 all_columns = set (source_table .column_names )
6464 join_cols_set = set (join_cols )
65- non_key_cols = all_columns - join_cols_set
65+
66+ non_key_cols = list (all_columns - join_cols_set )
6667
6768 if has_duplicate_rows (target_table , join_cols ):
6869 raise ValueError ("Target table has duplicate rows, aborting upsert" )
@@ -71,25 +72,51 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
         # When the target table is empty, there is nothing to update :)
         return source_table.schema.empty_table()
 
-    diff_expr = functools.reduce(
-        operator.or_,
-        [
-            pc.or_kleene(
-                pc.not_equal(pc.field(f"{col}-lhs"), pc.field(f"{col}-rhs")),
-                pc.is_null(pc.not_equal(pc.field(f"{col}-lhs"), pc.field(f"{col}-rhs"))),
-            )
-            for col in non_key_cols
-        ],
+    # We need to compare non_key_cols in Python, as PyArrow
+    # 1. Cannot do a join when non-join columns have complex types
+    # 2. Cannot compare columns with complex types
+    # See: https://github.com/apache/arrow/issues/35785
+    SOURCE_INDEX_COLUMN_NAME = "__source_index"
+    TARGET_INDEX_COLUMN_NAME = "__target_index"
+
+    if SOURCE_INDEX_COLUMN_NAME in join_cols or TARGET_INDEX_COLUMN_NAME in join_cols:
+        raise ValueError(
+            f"{SOURCE_INDEX_COLUMN_NAME} and {TARGET_INDEX_COLUMN_NAME} are reserved for joining "
+            "DataFrames, and cannot be used as column names"
+        )
+
+    # Step 1: Prepare source index with join keys and a marker index
+    # Cast to target table schema, so we can do the join
+    # See: https://github.com/apache/arrow/issues/37542
+    source_index = (
+        source_table.cast(target_table.schema)
+        .select(join_cols_set)
+        .append_column(SOURCE_INDEX_COLUMN_NAME, pa.array(range(len(source_table))))
     )
 
-    return (
-        source_table
-        # We already know that the schema is compatible, this is to fix large_ types
-        .cast(target_table.schema)
-        .join(target_table, keys=list(join_cols_set), join_type="inner", left_suffix="-lhs", right_suffix="-rhs")
-        .filter(diff_expr)
-        .drop_columns([f"{col}-rhs" for col in non_key_cols])
-        .rename_columns({f"{col}-lhs" if col not in join_cols else col: col for col in source_table.column_names})
-        # Finally cast to the original schema since it doesn't carry nullability:
-        # https://github.com/apache/arrow/issues/45557
-    ).cast(target_table.schema)
+    # Step 2: Prepare target index with join keys and a marker
+    target_index = target_table.select(join_cols_set).append_column(TARGET_INDEX_COLUMN_NAME, pa.array(range(len(target_table))))
+
+    # Step 3: Perform an inner join to find which rows from source exist in target
+    matching_indices = source_index.join(target_index, keys=list(join_cols_set), join_type="inner")
+
+    # Step 4: Compare all rows using Python
+    to_update_indices = []
+    for source_idx, target_idx in zip(
+        matching_indices[SOURCE_INDEX_COLUMN_NAME].to_pylist(), matching_indices[TARGET_INDEX_COLUMN_NAME].to_pylist()
+    ):
+        source_row = source_table.slice(source_idx, 1)
+        target_row = target_table.slice(target_idx, 1)
+
+        for key in non_key_cols:
+            source_val = source_row.column(key)[0].as_py()
+            target_val = target_row.column(key)[0].as_py()
+            if source_val != target_val:
+                to_update_indices.append(source_idx)
+                break
+
+    # Step 5: Take rows from source table using the indices and cast to target schema
+    if to_update_indices:
+        return source_table.take(to_update_indices).cast(target_table.schema)
+    else:
+        return source_table.schema.empty_table()
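For context, the limitation cited in the new comment block can be reproduced in a few lines. This is a minimal sketch with made-up tables, and it assumes (per apache/arrow#35785) that Acero's hash join rejects nested types in non-key columns; the exact exception message varies by pyarrow version:

```python
import pyarrow as pa

# Made-up tables where the non-key column "props" has a nested (struct) type.
source = pa.table({
    "id": pa.array([1, 2], type=pa.int64()),
    "props": pa.array([{"a": 1}, {"a": 2}], type=pa.struct([("a", pa.int64())])),
})
target = pa.table({
    "id": pa.array([1, 2], type=pa.int64()),
    "props": pa.array([{"a": 1}, {"a": 99}], type=pa.struct([("a", pa.int64())])),
})

# The hash join cannot carry the struct column as a non-key (payload) field,
# so the old join-then-filter approach fails before diff_expr is ever evaluated.
try:
    source.join(target, keys=["id"], join_type="inner", left_suffix="-lhs", right_suffix="-rhs")
except pa.ArrowInvalid as exc:
    print(exc)
```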
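And a usage sketch of the reworked function, assuming the `get_rows_to_update` above and its `has_duplicate_rows` helper are importable, again with hypothetical data: because the join now only carries the key columns plus integer row markers, and the non-key comparison happens in Python, nested types work:

```python
import pyarrow as pa

source = pa.table({
    "id": [1, 2, 3],
    "props": pa.array([{"a": 1}, {"a": 2}, {"a": 3}], type=pa.struct([("a", pa.int64())])),
})
target = pa.table({
    "id": [1, 2],
    "props": pa.array([{"a": 1}, {"a": 99}], type=pa.struct([("a", pa.int64())])),
})

# Only id=2 matches on the key and differs in a non-key column;
# id=1 is unchanged and id=3 has no match in the target.
rows = get_rows_to_update(source, target, join_cols=["id"])
assert rows.to_pylist() == [{"id": 2, "props": {"a": 2}}]
```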