Skip to content

Commit 80c4e1f

Browse files
committed
feat: add null_safe_eq parameter to upsert
1 parent 8c87df2 commit 80c4e1f

File tree

3 files changed

+19
-10
lines changed

3 files changed

+19
-10
lines changed

pyiceberg/table/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,7 @@ def upsert(
718718
when_matched_update_all: bool = True,
719719
when_not_matched_insert_all: bool = True,
720720
case_sensitive: bool = True,
721+
null_safe_eq: bool = False,
721722
branch: str | None = MAIN_BRANCH,
722723
snapshot_properties: dict[str, str] = EMPTY_DICT,
723724
) -> UpsertResult:
@@ -732,6 +733,7 @@ def upsert(
732733
when_not_matched_insert_all: Bool indicating new rows to be inserted that do not match any
733734
existing rows in the table
734735
case_sensitive: Bool indicating if the match should be case-sensitive
736+
null_safe_eq: Bool indicating if the equality operator should be null-safe (<=> instead of =)
735737
branch: Branch Reference to run the upsert operation
736738
snapshot_properties: Custom properties to be added to the snapshot summary
737739
@@ -824,7 +826,7 @@ def upsert(
824826
# values have actually changed. We don't want to do just a blanket overwrite for matched
825827
# rows if the actual non-key column data hasn't changed.
826828
# this extra step avoids unnecessary IO and writes
827-
rows_to_update = upsert_util.get_rows_to_update(df, rows, join_cols)
829+
rows_to_update = upsert_util.get_rows_to_update(df, rows, join_cols, null_safe_eq=null_safe_eq)
828830

829831
if len(rows_to_update) > 0:
830832
# build the match predicate filter
@@ -1320,6 +1322,7 @@ def upsert(
13201322
when_matched_update_all: bool = True,
13211323
when_not_matched_insert_all: bool = True,
13221324
case_sensitive: bool = True,
1325+
null_safe_eq: bool = False,
13231326
branch: str | None = MAIN_BRANCH,
13241327
snapshot_properties: dict[str, str] = EMPTY_DICT,
13251328
) -> UpsertResult:
@@ -1334,6 +1337,7 @@ def upsert(
13341337
when_not_matched_insert_all: Bool indicating new rows to be inserted that do not match any
13351338
existing rows in the table
13361339
case_sensitive: Bool indicating if the match should be case-sensitive
1340+
null_safe_eq: Bool indicating if the equality operator should be null-safe (<=> instead of =)
13371341
branch: Branch Reference to run the upsert operation
13381342
snapshot_properties: Custom properties to be added to the snapshot summary
13391343
@@ -1368,6 +1372,7 @@ def upsert(
13681372
when_matched_update_all=when_matched_update_all,
13691373
when_not_matched_insert_all=when_not_matched_insert_all,
13701374
case_sensitive=case_sensitive,
1375+
null_safe_eq=null_safe_eq,
13711376
branch=branch,
13721377
snapshot_properties=snapshot_properties,
13731378
)

pyiceberg/table/upsert_util.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def has_duplicate_rows(df: pyarrow_table, join_cols: list[str]) -> bool:
7676
return len(df.select(join_cols).group_by(join_cols).aggregate([([], "count_all")]).filter(pc.field("count_all") > 1)) > 0
7777

7878

79-
def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols: list[str]) -> pa.Table:
79+
def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols: list[str], null_safe_eq: bool) -> pa.Table:
8080
"""
8181
Return a table with rows that need to be updated in the target table based on the join columns.
8282
@@ -121,16 +121,20 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
121121
target_index = target_table.select(join_cols_set).append_column(TARGET_INDEX_COLUMN_NAME, pa.array(range(len(target_table))))
122122

123123
# Step 3: Perform an inner join to find which rows from source exist in target
124-
# PyArrow joins ignore null values, and we want null==null to hold, so we compute the join in Python.
125-
# This is equivalent to:
126-
# matching_indices = source_index.join(target_index, keys=list(join_cols_set), join_type="inner")
127-
source_indices = {tuple(row[col] for col in join_cols): row[SOURCE_INDEX_COLUMN_NAME] for row in source_index.to_pylist()}
128-
target_indices = {tuple(row[col] for col in join_cols): row[TARGET_INDEX_COLUMN_NAME] for row in target_index.to_pylist()}
129-
matching_indices = [(s, t) for key, s in source_indices.items() if (t := target_indices.get(key)) is not None]
124+
if null_safe_eq:
125+
# PyArrow joins ignore null values, and we want null==null to hold, so we compute the join in Python.
126+
source_indices = {tuple(row[col] for col in join_cols): row[SOURCE_INDEX_COLUMN_NAME] for row in source_index.to_pylist()}
127+
target_indices = {tuple(row[col] for col in join_cols): row[TARGET_INDEX_COLUMN_NAME] for row in target_index.to_pylist()}
128+
paired_indices = [(s, t) for key, s in source_indices.items() if (t := target_indices.get(key)) is not None]
129+
else:
130+
matching_indices = source_index.join(target_index, keys=list(join_cols_set), join_type="inner")
131+
source_indices = matching_indices[SOURCE_INDEX_COLUMN_NAME].to_pylist()
132+
target_indices = matching_indices[TARGET_INDEX_COLUMN_NAME].to_pylist()
133+
paired_indices = list(zip(source_indices, target_indices, strict=True))
130134

131135
# Step 4: Compare all rows using Python
132136
to_update_indices = []
133-
for source_idx, target_idx in matching_indices:
137+
for source_idx, target_idx in paired_indices:
134138
source_row = source_table.slice(source_idx, 1)
135139
target_row = target_table.slice(target_idx, 1)
136140

tests/table/test_upsert.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -828,7 +828,7 @@ def test_upsert_with_nulls_in_join_columns(catalog: Catalog) -> None:
828828
],
829829
schema=schema,
830830
)
831-
upd = table.upsert(data_with_null, join_cols=["foo", "bar"])
831+
upd = table.upsert(data_with_null, join_cols=["foo", "bar"], null_safe_eq=True)
832832
assert upd.rows_updated == 1
833833
assert upd.rows_inserted == 1
834834
assert table.scan().to_arrow() == pa.Table.from_pylist(

0 commit comments

Comments (0)