From e0be0ac5cc86665ed05bebb6383dbd431e05530a Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Mon, 25 May 2026 15:34:15 +0800 Subject: [PATCH 01/32] [python] [WIP] Initialize ray.merge_paimon connector Pythonic MERGE INTO on Ray Datasets, mirroring Spark/Flink merge-into. UPSERT-flavored clauses (matched-update, not-matched-insert, not-matched-by-source-update) supported; DELETE raises NotImplementedError pending KeyValueDataWriter row-kind work. API: from pypaimon.ray import merge_paimon merge_paimon(target, source, catalog_options, on=[...], when_matched_update={...}, when_not_matched_insert="*") Algorithm: read target -> tag _side -> union -> groupby(on).map_groups to classify matched/not-matched and apply SET; write back via write_paimon (PK upsert through _SEQUENCE_NUMBER). Known bugs to fix in follow-up: - _schema_type_map referenced but never defined (NameError on call) - for f in batch.schema iterates pa.Schema (TypeError on pyarrow >= 18) - type-mismatch fallback to pa.null() destroys join keys - test helper _make_pk_table_with_flag returns 1 value, test unpacks 2 --- paimon-python/pypaimon/ray/__init__.py | 3 +- paimon-python/pypaimon/ray/merge_into.py | 492 ++++++++++++++++++ paimon-python/pypaimon/ray/shuffle.py | 18 +- .../pypaimon/tests/ray_merge_into_test.py | 351 +++++++++++++ 4 files changed, 857 insertions(+), 7 deletions(-) create mode 100644 paimon-python/pypaimon/ray/merge_into.py create mode 100644 paimon-python/pypaimon/tests/ray_merge_into_test.py diff --git a/paimon-python/pypaimon/ray/__init__.py b/paimon-python/pypaimon/ray/__init__.py index f36eb0253dd8..a50a38bab7ce 100644 --- a/paimon-python/pypaimon/ray/__init__.py +++ b/paimon-python/pypaimon/ray/__init__.py @@ -16,5 +16,6 @@ # under the License. 
from pypaimon.ray.ray_paimon import read_paimon, write_paimon +from pypaimon.ray.merge_into import merge_paimon -__all__ = ["read_paimon", "write_paimon"] +__all__ = ["read_paimon", "write_paimon", "merge_paimon"] diff --git a/paimon-python/pypaimon/ray/merge_into.py b/paimon-python/pypaimon/ray/merge_into.py new file mode 100644 index 000000000000..775c2b25a9d0 --- /dev/null +++ b/paimon-python/pypaimon/ray/merge_into.py @@ -0,0 +1,492 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""MERGE INTO for Paimon primary-key tables, driven by Ray Datasets. + +Mirrors the high-level semantics of +``paimon-flink/.../action/MergeIntoAction.java`` and +``paimon-spark/.../commands/MergeIntoPaimonTable.scala`` but exposes a +Pythonic API instead of SQL. + +MVP scope: only upsert-flavored clauses (matched-update, +not-matched-insert, not-matched-by-source-update). 
DELETE clauses raise +``NotImplementedError`` because pypaimon's +``KeyValueDataWriter._add_system_fields`` still hardcodes ``_VALUE_KIND`` +to INSERT — see +``paimon-python/pypaimon/write/writer/key_value_data_writer.py:53``. +""" + +from dataclasses import dataclass, field +from functools import partial +from typing import ( + Any, + Callable, + Dict, + List, + Mapping, + Optional, + Sequence, + Tuple, + Union, +) + +import pyarrow as pa +import pyarrow.compute as pc + +from pypaimon.ray.shuffle import ( + _coerce_large_string_types, + _pick_collision_safe_col_name, +) + +SetSpec = Union[str, Dict[str, Any]] +Condition = Callable[[Mapping[str, Any]], bool] + +_SIDE_TARGET = "t" +_SIDE_SOURCE = "s" + + +@dataclass +class _MergeConfig: + on: Tuple[str, ...] + target_field_names: Tuple[str, ...] + side_col: str + matched_update: Optional[Dict[str, Any]] + matched_update_condition: Optional[Condition] + not_matched_insert: Optional[Dict[str, Any]] + not_matched_insert_condition: Optional[Condition] + not_matched_by_source_update: Optional[Dict[str, Any]] + not_matched_by_source_update_condition: Optional[Condition] + target_pa_schema: pa.Schema = field(repr=False) + + +def merge_paimon( + target: str, + source: Any, + catalog_options: Dict[str, str], + *, + on: Sequence[str], + when_matched_update: Optional[SetSpec] = None, + when_matched_update_condition: Optional[Condition] = None, + when_matched_delete_condition: Optional[Condition] = None, + when_not_matched_insert: Optional[SetSpec] = None, + when_not_matched_insert_condition: Optional[Condition] = None, + when_not_matched_by_source_update: Optional[Dict[str, Any]] = None, + when_not_matched_by_source_update_condition: Optional[Condition] = None, + when_not_matched_by_source_delete_condition: Optional[Condition] = None, + ray_remote_args: Optional[Dict[str, Any]] = None, + concurrency: Optional[int] = None, +) -> None: + """MERGE INTO ``target`` USING ``source`` for a Paimon primary-key table. 
+ + Args: + target: Full table identifier, e.g. ``"db.table"``. + source: Right-hand side. One of ``ray.data.Dataset``, a Paimon + table identifier string (read via :func:`read_paimon` with + the same ``catalog_options``), a ``pyarrow.Table``, or a + ``pandas.DataFrame``. + catalog_options: Forwarded to ``CatalogFactory.create`` and to + ``read_paimon`` / ``write_paimon``. + on: Join keys. Must be a subset of the target table's primary + keys. Rows are matched by equality on these columns. + when_matched_update: SET spec for matched rows. Use ``"*"`` to + copy all target columns from source (requires schema + compatibility), or a dict ``{target_col: expr}`` where + ``expr`` is ``"s.col"`` / ``"t.col"`` / a literal / a + callable taking the combined row. + when_matched_update_condition: Optional predicate over the + combined row (keys prefixed ``s.`` / ``t.``); rows not + satisfying it are left unchanged. + when_matched_delete_condition: Not yet supported; raises + ``NotImplementedError`` if non-None. + when_not_matched_insert: SET spec for source rows with no + matching target row. ``"*"`` copies all target-schema + columns from source. + when_not_matched_insert_condition: Optional predicate over the + source row (keys prefixed ``s.``). + when_not_matched_by_source_update: SET spec for target rows + with no matching source row. Same dict format as above; only + ``"t.col"`` / literal / callable values are meaningful since + no source row exists. + when_not_matched_by_source_update_condition: Optional predicate + over the target row (keys prefixed ``t.``). + when_not_matched_by_source_delete_condition: Not yet supported; + raises ``NotImplementedError`` if non-None. + ray_remote_args: Forwarded to ``write_paimon``. + concurrency: Forwarded to ``write_paimon``. + + Notes: + - The target table must be a primary-key table. + - For HASH_FIXED bucket mode, the existing + :func:`maybe_apply_repartition` is applied by ``write_paimon`` + before writing. 
+ - If the user passes any callable as a condition or as a SET + value, the target read falls back to the full schema (callables + are opaque to projection analysis). + - User-supplied callables must be picklable since Ray ships them + to workers. + - If ``on`` is a strict subset of the primary keys and the SET + expression rewrites the remaining PK columns, two PK rows may + collide downstream of the merge engine; do not do this. + """ + if when_matched_delete_condition is not None or when_not_matched_by_source_delete_condition is not None: + raise NotImplementedError( + "DELETE clauses are not supported yet: the pypaimon writer " + "hardcodes _VALUE_KIND to INSERT. See " + "paimon-python/pypaimon/write/writer/key_value_data_writer.py:53. " + "Once that TODO is resolved, DELETE can be wired through." + ) + + from pypaimon.catalog.catalog_factory import CatalogFactory + from pypaimon.schema.data_types import PyarrowFieldParser + + catalog = CatalogFactory.create(catalog_options) + table = catalog.get_table(target) + if not table.is_primary_key_table: + raise ValueError( + f"merge_paimon requires a primary-key table; " + f"'{target}' has no primary keys." + ) + + primary_keys = list(table.primary_keys) + if not set(on).issubset(set(primary_keys)): + raise ValueError( + f"'on' columns {list(on)} must be a subset of target primary " + f"keys {primary_keys}." + ) + + if ( + when_matched_update is None + and when_not_matched_insert is None + and when_not_matched_by_source_update is None + ): + raise ValueError( + "At least one of when_matched_update, when_not_matched_insert, " + "or when_not_matched_by_source_update must be provided." 
+ ) + + target_field_names = list(table.field_names) + matched_update = _normalize_set_spec( + when_matched_update, target_field_names, allow_star=True, star_side=_SIDE_SOURCE + ) + not_matched_insert = _normalize_set_spec( + when_not_matched_insert, target_field_names, allow_star=True, star_side=_SIDE_SOURCE + ) + not_matched_by_source_update = _normalize_set_spec( + when_not_matched_by_source_update, + target_field_names, + allow_star=False, + star_side=None, + ) + + target_pa_schema = PyarrowFieldParser.from_paimon_schema(table.table_schema.fields) + + source_ds = _normalize_source(source, catalog_options) + source_schema = source_ds.schema() + source_col_names = list(source_schema.names) if source_schema is not None else [] + if source_col_names: + for col in on: + if col not in source_col_names: + raise ValueError( + f"'on' column '{col}' is missing from source schema " + f"{source_col_names}." + ) + + target_projection = _compute_target_projection( + on=on, + target_field_names=target_field_names, + matched_update=matched_update, + not_matched_by_source_update=not_matched_by_source_update, + matched_update_condition=when_matched_update_condition, + not_matched_by_source_update_condition=when_not_matched_by_source_update_condition, + ) + + from pypaimon.ray.ray_paimon import read_paimon, write_paimon + + target_ds = read_paimon(target, catalog_options, projection=target_projection) + + side_col = _pick_collision_safe_col_name( + set(target_field_names) | set(source_col_names), "_paimon_side" + ) + aligned_target, aligned_source = _align_for_union( + target_ds, source_ds, side_col=side_col + ) + combined = aligned_target.union(aligned_source) + + cfg = _MergeConfig( + on=tuple(on), + target_field_names=tuple(target_field_names), + side_col=side_col, + matched_update=matched_update, + matched_update_condition=when_matched_update_condition, + not_matched_insert=not_matched_insert, + not_matched_insert_condition=when_not_matched_insert_condition, + 
not_matched_by_source_update=not_matched_by_source_update, + not_matched_by_source_update_condition=when_not_matched_by_source_update_condition, + target_pa_schema=target_pa_schema, + ) + + merged = ( + combined + .groupby(list(on)) + .map_groups(partial(_merge_groups, cfg=cfg), batch_format="pyarrow") + ) + + write_paimon( + merged, + target, + catalog_options, + ray_remote_args=ray_remote_args, + concurrency=concurrency, + ) + + +def _normalize_set_spec( + spec: Optional[SetSpec], + target_field_names: Sequence[str], + *, + allow_star: bool, + star_side: Optional[str], +) -> Optional[Dict[str, Any]]: + if spec is None: + return None + if isinstance(spec, str): + if not allow_star or spec != "*": + raise ValueError( + f"SET spec strings other than '*' are not supported here; got {spec!r}." + ) + return {col: f"{star_side}.{col}" for col in target_field_names} + if not isinstance(spec, dict): + raise ValueError( + f"SET spec must be '*' or a dict, got {type(spec).__name__}." + ) + target_set = set(target_field_names) + for col in spec: + if col not in target_set: + raise ValueError( + f"SET key '{col}' is not a column of the target table " + f"(columns: {list(target_field_names)})." + ) + return dict(spec) + + +def _normalize_source(source: Any, catalog_options: Dict[str, str]): + import ray.data + + if isinstance(source, ray.data.Dataset): + return source + if isinstance(source, str): + from pypaimon.ray.ray_paimon import read_paimon + return read_paimon(source, catalog_options) + if isinstance(source, pa.Table): + return ray.data.from_arrow(source) + try: + import pandas as pd + except ImportError: + pd = None + if pd is not None and isinstance(source, pd.DataFrame): + return ray.data.from_pandas(source) + raise TypeError( + "source must be a ray.data.Dataset, a Paimon table identifier " + "string, a pyarrow.Table, or a pandas.DataFrame; got " + f"{type(source).__name__}." 
+ ) + + +def _compute_target_projection( + *, + on: Sequence[str], + target_field_names: Sequence[str], + matched_update: Optional[Dict[str, Any]], + not_matched_by_source_update: Optional[Dict[str, Any]], + matched_update_condition: Optional[Condition], + not_matched_by_source_update_condition: Optional[Condition], +) -> Optional[List[str]]: + if matched_update_condition is not None or not_matched_by_source_update_condition is not None: + return None + needed = set(on) + for spec in (matched_update, not_matched_by_source_update): + if not spec: + continue + needed.update(spec.keys()) + for value in spec.values(): + if callable(value): + return None + if isinstance(value, str) and value.startswith("t."): + needed.add(value[2:]) + needed.update(_required_target_cols_for_passthrough(matched_update, target_field_names)) + needed.update(_required_target_cols_for_passthrough(not_matched_by_source_update, target_field_names)) + return [col for col in target_field_names if col in needed] + + +def _required_target_cols_for_passthrough( + spec: Optional[Dict[str, Any]], target_field_names: Sequence[str] +) -> List[str]: + if not spec: + return list(target_field_names) + return [col for col in target_field_names if col not in spec] + + +def _align_for_union(target_ds, source_ds, *, side_col: str): + target_schema = target_ds.schema() + source_schema = source_ds.schema() + target_type_map = _schema_type_map(target_schema) + source_type_map = _schema_type_map(source_schema) + + union_field_types: Dict[str, pa.DataType] = {} + for name, t in target_type_map.items(): + union_field_types[name] = t + for name, t in source_type_map.items(): + if name in union_field_types: + if union_field_types[name] != t: + union_field_types[name] = pa.null() + else: + union_field_types[name] = t + + aligned_target = target_ds.map_batches( + partial( + _align_batch, + union_field_types=union_field_types, + side_value=_SIDE_TARGET, + side_col=side_col, + ), + batch_format="pyarrow", + ) + 
aligned_source = source_ds.map_batches( + partial( + _align_batch, + union_field_types=union_field_types, + side_value=_SIDE_SOURCE, + side_col=side_col, + ), + batch_format="pyarrow", + ) + return aligned_target, aligned_source + + +def _align_batch( + batch: pa.Table, + *, + union_field_types: Dict[str, pa.DataType], + side_value: str, + side_col: str, +) -> pa.Table: + n = batch.num_rows + arrays = [] + fields = [] + present = {f.name: batch.column(f.name) for f in batch.schema} + for name, target_type in union_field_types.items(): + if name in present: + col = present[name] + if col.type != target_type: + col = col.cast(target_type, safe=False) if target_type != pa.null() else pa.nulls(n, type=pa.null()) + arrays.append(col) + else: + arrays.append(pa.nulls(n, type=target_type)) + fields.append(pa.field(name, target_type)) + arrays.append(pa.array([side_value] * n, type=pa.string())) + fields.append(pa.field(side_col, pa.string())) + return pa.Table.from_arrays(arrays, schema=pa.schema(fields)) + + +def _merge_groups(group: pa.Table, *, cfg: _MergeConfig) -> pa.Table: + if group.num_rows == 0: + return pa.Table.from_pylist([], schema=cfg.target_pa_schema) + side_mask = pc.equal(group.column(cfg.side_col), _SIDE_TARGET) + target_table = group.filter(side_mask).drop([cfg.side_col]) + source_table = group.filter(pc.invert(side_mask)).drop([cfg.side_col]) + target_rows = target_table.to_pylist() + source_rows = source_table.to_pylist() + output_rows: List[Dict[str, Any]] = [] + + if target_rows and source_rows: + if cfg.matched_update is not None: + for t_row in target_rows: + for s_row in source_rows: + combined = _prefixed(s_row, t_row) + if cfg.matched_update_condition is not None and not cfg.matched_update_condition(combined): + continue + output_rows.append(_apply_set(cfg.matched_update, s_row, t_row, cfg.target_field_names)) + elif source_rows and not target_rows: + if cfg.not_matched_insert is not None: + for s_row in source_rows: + combined = 
_prefixed(s_row, None) + if cfg.not_matched_insert_condition is not None and not cfg.not_matched_insert_condition(combined): + continue + output_rows.append(_apply_set(cfg.not_matched_insert, s_row, None, cfg.target_field_names)) + elif target_rows and not source_rows: + if cfg.not_matched_by_source_update is not None: + for t_row in target_rows: + combined = _prefixed(None, t_row) + if ( + cfg.not_matched_by_source_update_condition is not None + and not cfg.not_matched_by_source_update_condition(combined) + ): + continue + output_rows.append(_apply_set(cfg.not_matched_by_source_update, None, t_row, cfg.target_field_names)) + + if not output_rows: + return pa.Table.from_pylist([], schema=cfg.target_pa_schema) + aligned = [{name: row.get(name) for name in cfg.target_field_names} for row in output_rows] + arrow_table = pa.Table.from_pylist(aligned, schema=cfg.target_pa_schema) + return _coerce_large_string_types(arrow_table) + + +def _prefixed(s_row: Optional[Dict[str, Any]], t_row: Optional[Dict[str, Any]]) -> Dict[str, Any]: + out: Dict[str, Any] = {} + if s_row is not None: + for k, v in s_row.items(): + out[f"s.{k}"] = v + if t_row is not None: + for k, v in t_row.items(): + out[f"t.{k}"] = v + return out + + +def _apply_set( + spec: Dict[str, Any], + s_row: Optional[Dict[str, Any]], + t_row: Optional[Dict[str, Any]], + target_field_names: Sequence[str], +) -> Dict[str, Any]: + combined = _prefixed(s_row, t_row) + out: Dict[str, Any] = {} + base = t_row if t_row is not None else (s_row if s_row is not None else {}) + for col in target_field_names: + if col in spec: + out[col] = _eval_set_value(spec[col], combined, s_row, t_row) + elif col in base: + out[col] = base[col] + else: + out[col] = None + return out + + +def _eval_set_value( + value: Any, + combined: Mapping[str, Any], + s_row: Optional[Dict[str, Any]], + t_row: Optional[Dict[str, Any]], +) -> Any: + if callable(value): + return value(combined) + if isinstance(value, str): + if value.startswith("s.") and 
s_row is not None: + return s_row.get(value[2:]) + if value.startswith("t.") and t_row is not None: + return t_row.get(value[2:]) + return value diff --git a/paimon-python/pypaimon/ray/shuffle.py b/paimon-python/pypaimon/ray/shuffle.py index b17f7a7ab1c4..8f8f412d4729 100644 --- a/paimon-python/pypaimon/ray/shuffle.py +++ b/paimon-python/pypaimon/ray/shuffle.py @@ -53,17 +53,23 @@ BUCKET_KEY_COL = "__paimon_bucket__" -def _pick_bucket_col_name(existing_names) -> str: - """Return a bucket column name guaranteed not to collide with - ``existing_names``. Falls back to a UUID suffix on collision.""" - if BUCKET_KEY_COL not in existing_names: - return BUCKET_KEY_COL +def _pick_collision_safe_col_name(existing_names, base: str) -> str: + """Return a column name guaranteed not to collide with ``existing_names``. + + Prefer ``base`` itself; on collision, append a short uuid suffix. + """ + if base not in existing_names: + return base while True: - candidate = "__paimon_bucket_{}_".format(uuid.uuid4().hex[:8]) + candidate = "{}_{}_".format(base.rstrip("_"), uuid.uuid4().hex[:8]) if candidate not in existing_names: return candidate +def _pick_bucket_col_name(existing_names) -> str: + return _pick_collision_safe_col_name(existing_names, BUCKET_KEY_COL) + + def maybe_apply_repartition( dataset: "ray.data.Dataset", table: "Table", diff --git a/paimon-python/pypaimon/tests/ray_merge_into_test.py b/paimon-python/pypaimon/tests/ray_merge_into_test.py new file mode 100644 index 000000000000..ecbdd1b8e648 --- /dev/null +++ b/paimon-python/pypaimon/tests/ray_merge_into_test.py @@ -0,0 +1,351 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import shutil +import tempfile +import unittest + +import pyarrow as pa +import ray + +from pypaimon import CatalogFactory, Schema +from pypaimon.ray import merge_paimon + + +class RayMergeIntoTest(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.tempdir = tempfile.mkdtemp() + cls.warehouse = os.path.join(cls.tempdir, "warehouse") + cls.catalog_options = {"warehouse": cls.warehouse} + cls.catalog = CatalogFactory.create(cls.catalog_options) + cls.catalog.create_database("default", True) + if not ray.is_initialized(): + ray.init(ignore_reinit_error=True, num_cpus=2) + + @classmethod + def tearDownClass(cls): + try: + if ray.is_initialized(): + ray.shutdown() + except Exception: + pass + try: + shutil.rmtree(cls.tempdir) + except OSError: + pass + + def _make_pk_table(self, name: str, extra_options=None): + pa_schema = pa.schema([ + pa.field("id", pa.int32(), nullable=False), + ("name", pa.string()), + ("value", pa.int64()), + ]) + options = {"bucket": "2"} + if extra_options: + options.update(extra_options) + schema = Schema.from_pyarrow_schema( + pa_schema, primary_keys=["id"], options=options + ) + full = f"default.{name}" + self.catalog.create_table(full, schema, False) + return full, pa_schema + + def _write(self, full_name, data: pa.Table): + table = self.catalog.get_table(full_name) + 
write_builder = table.new_batch_write_builder() + writer = write_builder.new_write() + writer.write_arrow(data) + commit_messages = writer.prepare_commit() + write_builder.new_commit().commit(commit_messages) + writer.close() + + def _read_sorted(self, full_name): + table = self.catalog.get_table(full_name) + read_builder = table.new_read_builder() + splits = read_builder.new_scan().plan().splits() + result = read_builder.new_read().to_arrow(splits) + return result.sort_by("id").to_pydict() + + def test_basic_upsert_star_set(self): + full, pa_schema = self._make_pk_table("merge_basic") + self._write(full, pa.Table.from_pydict({ + "id": [1, 2, 3], + "name": ["Alice", "Bob", "Charlie"], + "value": [100, 200, 300], + }, schema=pa_schema)) + + source = pa.Table.from_pydict({ + "id": [2, 3, 4], + "name": ["Bob-Updated", "Charlie-Updated", "Dan"], + "value": [250, 350, 400], + }, schema=pa_schema) + + merge_paimon( + target=full, + source=source, + catalog_options=self.catalog_options, + on=["id"], + when_matched_update="*", + when_not_matched_insert="*", + ) + + out = self._read_sorted(full) + self.assertEqual(out["id"], [1, 2, 3, 4]) + self.assertEqual(out["name"], ["Alice", "Bob-Updated", "Charlie-Updated", "Dan"]) + self.assertEqual(out["value"], [100, 250, 350, 400]) + + def test_set_dict_with_literal_and_source_ref(self): + full, pa_schema = self._make_pk_table("merge_set_dict") + self._write(full, pa.Table.from_pydict({ + "id": [1, 2], + "name": ["Alice", "Bob"], + "value": [10, 20], + }, schema=pa_schema)) + + source = pa.Table.from_pydict({ + "id": pa.array([1, 2], type=pa.int32()), + "name": ["A-new", "B-new"], + "value": pa.array([99, 99], type=pa.int64()), + }) + + merge_paimon( + target=full, + source=source, + catalog_options=self.catalog_options, + on=["id"], + when_matched_update={"name": "s.name", "value": 777}, + ) + + out = self._read_sorted(full) + self.assertEqual(out["id"], [1, 2]) + self.assertEqual(out["name"], ["A-new", "B-new"]) + 
self.assertEqual(out["value"], [777, 777]) + + def test_matched_update_condition(self): + full, pa_schema = self._make_pk_table("merge_condition") + self._write(full, pa.Table.from_pydict({ + "id": [1, 2], + "name": ["Old1", "Old2"], + "value": [50, 50], + }, schema=pa_schema)) + + source = pa.Table.from_pydict({ + "id": pa.array([1, 2], type=pa.int32()), + "name": ["New1", "New2"], + "value": pa.array([100, 10], type=pa.int64()), + }) + + merge_paimon( + target=full, + source=source, + catalog_options=self.catalog_options, + on=["id"], + when_matched_update="*", + when_matched_update_condition=lambda r: r["s.value"] > r["t.value"], + ) + + out = self._read_sorted(full) + self.assertEqual(out["id"], [1, 2]) + self.assertEqual(out["name"], ["New1", "Old2"]) + self.assertEqual(out["value"], [100, 50]) + + def test_not_matched_by_source_update(self): + full, pa_schema = self._make_pk_table_with_flag("merge_soft_delete") + target = self.catalog.get_table(full) + write_builder = target.new_batch_write_builder() + writer = write_builder.new_write() + writer.write_arrow(pa.Table.from_pydict({ + "id": pa.array([1, 2, 3], type=pa.int32()), + "name": ["A", "B", "C"], + "deleted": [False, False, False], + })) + commits = writer.prepare_commit() + write_builder.new_commit().commit(commits) + writer.close() + + source = pa.Table.from_pydict({ + "id": pa.array([1], type=pa.int32()), + "name": ["A"], + "deleted": [False], + }) + + merge_paimon( + target=full, + source=source, + catalog_options=self.catalog_options, + on=["id"], + when_not_matched_by_source_update={"deleted": True}, + ) + + out = self._read_sorted(full) + self.assertEqual(out["id"], [1, 2, 3]) + self.assertEqual(out["name"], ["A", "B", "C"]) + self.assertEqual(out["deleted"], [False, True, True]) + + def _make_pk_table_with_flag(self, name: str): + pa_schema = pa.schema([ + pa.field("id", pa.int32(), nullable=False), + ("name", pa.string()), + ("deleted", pa.bool_()), + ]) + schema = Schema.from_pyarrow_schema( + 
pa_schema, primary_keys=["id"], options={"bucket": "2"} + ) + full = f"default.{name}" + self.catalog.create_table(full, schema, False) + return full + + def test_delete_raises_not_implemented(self): + full, pa_schema = self._make_pk_table("merge_delete_unsupported") + with self.assertRaises(NotImplementedError) as ctx: + merge_paimon( + target=full, + source=pa.Table.from_pydict({"id": pa.array([1], type=pa.int32())}), + catalog_options=self.catalog_options, + on=["id"], + when_matched_delete_condition=lambda r: True, + ) + self.assertIn("DELETE", str(ctx.exception)) + + with self.assertRaises(NotImplementedError): + merge_paimon( + target=full, + source=pa.Table.from_pydict({"id": pa.array([1], type=pa.int32())}), + catalog_options=self.catalog_options, + on=["id"], + when_not_matched_by_source_delete_condition=lambda r: True, + ) + + def test_validation_errors(self): + # append-only table (no PK) → ValueError + ao_schema = pa.schema([("id", pa.int32()), ("v", pa.int64())]) + ao = Schema.from_pyarrow_schema(ao_schema) + self.catalog.create_table("default.merge_append_only", ao, False) + with self.assertRaises(ValueError): + merge_paimon( + target="default.merge_append_only", + source=pa.Table.from_pydict({"id": [1], "v": [1]}), + catalog_options=self.catalog_options, + on=["id"], + when_matched_update="*", + ) + + full, pa_schema = self._make_pk_table("merge_validation") + + # on not subset of PKs → ValueError + with self.assertRaises(ValueError): + merge_paimon( + target=full, + source=pa.Table.from_pydict({"id": pa.array([1], type=pa.int32())}), + catalog_options=self.catalog_options, + on=["name"], + when_matched_update="*", + ) + + # no when_* clause → ValueError + with self.assertRaises(ValueError): + merge_paimon( + target=full, + source=pa.Table.from_pydict({"id": pa.array([1], type=pa.int32())}), + catalog_options=self.catalog_options, + on=["id"], + ) + + # source missing `on` column → ValueError + with self.assertRaises(ValueError): + merge_paimon( + 
target=full, + source=pa.Table.from_pydict({"name": ["x"], "value": pa.array([1], type=pa.int64())}), + catalog_options=self.catalog_options, + on=["id"], + when_matched_update="*", + ) + + # SET key not a target column → ValueError + with self.assertRaises(ValueError): + merge_paimon( + target=full, + source=pa.Table.from_pydict({"id": pa.array([1], type=pa.int32())}), + catalog_options=self.catalog_options, + on=["id"], + when_matched_update={"not_a_real_column": "x"}, + ) + + def test_source_type_normalization(self): + import pandas as pd + + # pyarrow.Table source + full_a, pa_schema = self._make_pk_table("merge_src_arrow") + self._write(full_a, pa.Table.from_pydict({ + "id": [1], "name": ["a"], "value": [1], + }, schema=pa_schema)) + merge_paimon( + target=full_a, + source=pa.Table.from_pydict({ + "id": pa.array([2], type=pa.int32()), + "name": ["b"], + "value": pa.array([2], type=pa.int64()), + }), + catalog_options=self.catalog_options, + on=["id"], + when_not_matched_insert="*", + ) + self.assertEqual(self._read_sorted(full_a)["id"], [1, 2]) + + # pandas.DataFrame source + full_b, pa_schema_b = self._make_pk_table("merge_src_pandas") + self._write(full_b, pa.Table.from_pydict({ + "id": [1], "name": ["a"], "value": [1], + }, schema=pa_schema_b)) + merge_paimon( + target=full_b, + source=pd.DataFrame({ + "id": pd.array([2], dtype="int32"), + "name": ["b"], + "value": pd.array([2], dtype="int64"), + }), + catalog_options=self.catalog_options, + on=["id"], + when_not_matched_insert="*", + ) + self.assertEqual(self._read_sorted(full_b)["id"], [1, 2]) + + # Paimon table identifier source + full_c, pa_schema_c = self._make_pk_table("merge_src_target") + src_name, _ = self._make_pk_table("merge_src_source") + self._write(full_c, pa.Table.from_pydict({ + "id": [1], "name": ["a"], "value": [1], + }, schema=pa_schema_c)) + self._write(src_name, pa.Table.from_pydict({ + "id": [2], "name": ["b"], "value": [2], + }, schema=pa_schema_c)) + merge_paimon( + target=full_c, + 
source=src_name, + catalog_options=self.catalog_options, + on=["id"], + when_not_matched_insert="*", + ) + self.assertEqual(self._read_sorted(full_c)["id"], [1, 2]) + + +if __name__ == "__main__": + unittest.main() From ef359af41ebb553e9a0d404a4f1f0d4c813e07c7 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Thu, 28 May 2026 23:21:37 +0800 Subject: [PATCH 02/32] [python] Fix ray merge_paimon NameError and schema alignment bugs - _schema_type_map called but undefined: NameError on any cross-schema merge. - for f in batch.schema raises TypeError on pyarrow >= 18. - type-mismatch fallback to pa.null() drops join key values. - _make_pk_table_with_flag returned 1 value but caller unpacks 2. --- paimon-python/pypaimon/ray/merge_into.py | 15 +++++++++------ .../pypaimon/tests/ray_merge_into_test.py | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/paimon-python/pypaimon/ray/merge_into.py b/paimon-python/pypaimon/ray/merge_into.py index 775c2b25a9d0..8009945193af 100644 --- a/paimon-python/pypaimon/ray/merge_into.py +++ b/paimon-python/pypaimon/ray/merge_into.py @@ -351,10 +351,7 @@ def _align_for_union(target_ds, source_ds, *, side_col: str): for name, t in target_type_map.items(): union_field_types[name] = t for name, t in source_type_map.items(): - if name in union_field_types: - if union_field_types[name] != t: - union_field_types[name] = pa.null() - else: + if name not in union_field_types: union_field_types[name] = t aligned_target = target_ds.map_batches( @@ -378,6 +375,12 @@ def _align_for_union(target_ds, source_ds, *, side_col: str): return aligned_target, aligned_source +def _schema_type_map(schema: Optional[pa.Schema]) -> Dict[str, pa.DataType]: + if schema is None: + return {} + return dict(zip(schema.names, schema.types)) + + def _align_batch( batch: pa.Table, *, @@ -388,12 +391,12 @@ def _align_batch( n = batch.num_rows arrays = [] fields = [] - present = {f.name: batch.column(f.name) for f in batch.schema} + present = {name: 
batch.column(name) for name in batch.schema.names} for name, target_type in union_field_types.items(): if name in present: col = present[name] if col.type != target_type: - col = col.cast(target_type, safe=False) if target_type != pa.null() else pa.nulls(n, type=pa.null()) + col = col.cast(target_type, safe=False) arrays.append(col) else: arrays.append(pa.nulls(n, type=target_type)) diff --git a/paimon-python/pypaimon/tests/ray_merge_into_test.py b/paimon-python/pypaimon/tests/ray_merge_into_test.py index ecbdd1b8e648..b0e3013f6793 100644 --- a/paimon-python/pypaimon/tests/ray_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_merge_into_test.py @@ -211,7 +211,7 @@ def _make_pk_table_with_flag(self, name: str): ) full = f"default.{name}" self.catalog.create_table(full, schema, False) - return full + return full, pa_schema def test_delete_raises_not_implemented(self): full, pa_schema = self._make_pk_table("merge_delete_unsupported") From 308311c7e0637a8dbf8e0252d4dc68eec7df182c Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Thu, 28 May 2026 23:24:47 +0800 Subject: [PATCH 03/32] [python] Fix ray merge_paimon multi-source semantics and deprecated drop API - pa.Table.drop deprecated in newer pyarrow; switch to drop_columns. - matched branch silently produced cartesian product on multiple source rows. - _required_target_cols_for_passthrough widened projection to all columns when its spec was None, defeating the projection optimization. 
--- paimon-python/pypaimon/ray/merge_into.py | 24 ++++++++++++------- .../pypaimon/tests/ray_merge_into_test.py | 24 +++++++++++++++++++ 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/paimon-python/pypaimon/ray/merge_into.py b/paimon-python/pypaimon/ray/merge_into.py index 8009945193af..d4ba358da3b7 100644 --- a/paimon-python/pypaimon/ray/merge_into.py +++ b/paimon-python/pypaimon/ray/merge_into.py @@ -336,8 +336,8 @@ def _compute_target_projection( def _required_target_cols_for_passthrough( spec: Optional[Dict[str, Any]], target_field_names: Sequence[str] ) -> List[str]: - if not spec: - return list(target_field_names) + if spec is None: + return [] return [col for col in target_field_names if col not in spec] @@ -410,20 +410,26 @@ def _merge_groups(group: pa.Table, *, cfg: _MergeConfig) -> pa.Table: if group.num_rows == 0: return pa.Table.from_pylist([], schema=cfg.target_pa_schema) side_mask = pc.equal(group.column(cfg.side_col), _SIDE_TARGET) - target_table = group.filter(side_mask).drop([cfg.side_col]) - source_table = group.filter(pc.invert(side_mask)).drop([cfg.side_col]) + target_table = group.filter(side_mask).drop_columns([cfg.side_col]) + source_table = group.filter(pc.invert(side_mask)).drop_columns([cfg.side_col]) target_rows = target_table.to_pylist() source_rows = source_table.to_pylist() output_rows: List[Dict[str, Any]] = [] if target_rows and source_rows: if cfg.matched_update is not None: + if len(source_rows) > 1: + raise ValueError( + f"MERGE INTO matched {len(source_rows)} source rows against " + f"the same target key on {list(cfg.on)}; source must be unique " + f"on the 'on' columns." 
+ ) + s_row = source_rows[0] for t_row in target_rows: - for s_row in source_rows: - combined = _prefixed(s_row, t_row) - if cfg.matched_update_condition is not None and not cfg.matched_update_condition(combined): - continue - output_rows.append(_apply_set(cfg.matched_update, s_row, t_row, cfg.target_field_names)) + combined = _prefixed(s_row, t_row) + if cfg.matched_update_condition is not None and not cfg.matched_update_condition(combined): + continue + output_rows.append(_apply_set(cfg.matched_update, s_row, t_row, cfg.target_field_names)) elif source_rows and not target_rows: if cfg.not_matched_insert is not None: for s_row in source_rows: diff --git a/paimon-python/pypaimon/tests/ray_merge_into_test.py b/paimon-python/pypaimon/tests/ray_merge_into_test.py index b0e3013f6793..41d49ac63f29 100644 --- a/paimon-python/pypaimon/tests/ray_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_merge_into_test.py @@ -234,6 +234,30 @@ def test_delete_raises_not_implemented(self): when_not_matched_by_source_delete_condition=lambda r: True, ) + def test_duplicate_source_rows_raise(self): + full, pa_schema = self._make_pk_table("merge_dup_source") + self._write(full, pa.Table.from_pydict({ + "id": [1], + "name": ["Old"], + "value": [10], + }, schema=pa_schema)) + + source = pa.Table.from_pydict({ + "id": [1, 1], + "name": ["A", "B"], + "value": [100, 200], + }, schema=pa_schema) + + with self.assertRaises(ValueError) as ctx: + merge_paimon( + target=full, + source=source, + catalog_options=self.catalog_options, + on=["id"], + when_matched_update="*", + ) + self.assertIn("source must be unique", str(ctx.exception)) + def test_validation_errors(self): # append-only table (no PK) → ValueError ao_schema = pa.schema([("id", pa.int32()), ("v", pa.int64())]) From 2a7a4fb304857bb7b9a0fb9ec12f61461ba7bc6e Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Thu, 28 May 2026 23:39:30 +0800 Subject: [PATCH 04/32] [python] refactor ray merge_into module --- 
paimon-python/pypaimon/ray/__init__.py | 4 +- .../pypaimon/ray/data_evolution_merge_into.py | 195 +++++++ paimon-python/pypaimon/ray/merge_into.py | 501 ------------------ .../ray_data_evolution_merge_into_test.py | 185 +++++++ .../pypaimon/tests/ray_merge_into_test.py | 375 ------------- 5 files changed, 382 insertions(+), 878 deletions(-) create mode 100644 paimon-python/pypaimon/ray/data_evolution_merge_into.py delete mode 100644 paimon-python/pypaimon/ray/merge_into.py create mode 100644 paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py delete mode 100644 paimon-python/pypaimon/tests/ray_merge_into_test.py diff --git a/paimon-python/pypaimon/ray/__init__.py b/paimon-python/pypaimon/ray/__init__.py index a50a38bab7ce..a1234f6142de 100644 --- a/paimon-python/pypaimon/ray/__init__.py +++ b/paimon-python/pypaimon/ray/__init__.py @@ -16,6 +16,6 @@ # under the License. from pypaimon.ray.ray_paimon import read_paimon, write_paimon -from pypaimon.ray.merge_into import merge_paimon +from pypaimon.ray.data_evolution_merge_into import merge_into -__all__ = ["read_paimon", "write_paimon", "merge_paimon"] +__all__ = ["read_paimon", "write_paimon", "merge_into"] diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py new file mode 100644 index 000000000000..619b15dd81cd --- /dev/null +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -0,0 +1,195 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""MERGE INTO ... USING ... for Paimon data-evolution tables via Ray Datasets.""" + +from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union + +import pyarrow as pa + +SetSpec = Union[str, Dict[str, Any]] +Condition = Callable[[Mapping[str, Any]], bool] + + +def merge_into( + target: str, + source: Any, + catalog_options: Dict[str, str], + *, + on: Sequence[str], + when_matched_update: Optional[SetSpec] = None, + when_matched_update_condition: Optional[Condition] = None, + when_matched_delete_condition: Optional[Condition] = None, + when_not_matched_insert: Optional[SetSpec] = None, + when_not_matched_insert_condition: Optional[Condition] = None, + when_not_matched_by_source_update: Optional[Dict[str, Any]] = None, + when_not_matched_by_source_update_condition: Optional[Condition] = None, + when_not_matched_by_source_delete_condition: Optional[Condition] = None, + ray_remote_args: Optional[Dict[str, Any]] = None, + concurrency: Optional[int] = None, +) -> None: + if when_matched_delete_condition is not None: + raise NotImplementedError("WHEN MATCHED THEN DELETE is not supported.") + if ( + when_not_matched_by_source_update is not None + or when_not_matched_by_source_update_condition is not None + or when_not_matched_by_source_delete_condition is not None + ): + raise NotImplementedError( + "WHEN NOT MATCHED BY SOURCE clauses are not supported." 
+ ) + if when_matched_update is None and when_not_matched_insert is None: + raise ValueError( + "At least one of when_matched_update or when_not_matched_insert " + "must be provided." + ) + + from pypaimon.catalog.catalog_factory import CatalogFactory + + catalog = CatalogFactory.create(catalog_options) + table = catalog.get_table(target) + if not table.options.data_evolution_enabled(): + raise ValueError( + f"merge_into requires 'data-evolution.enabled' = 'true' on '{target}'." + ) + if not table.options.row_tracking_enabled(): + raise ValueError( + f"merge_into requires 'row-tracking.enabled' = 'true' on '{target}'." + ) + + target_field_names = list(table.field_names) + matched_update = _normalize_set_spec(when_matched_update, target_field_names) + not_matched_insert = _normalize_set_spec( + when_not_matched_insert, target_field_names + ) + + source_ds = _normalize_source(source, catalog_options) + _validate_source_on_cols(source_ds, on) + + if not_matched_insert is not None: + raise NotImplementedError("not-matched INSERT path not yet implemented.") + if matched_update is not None: + raise NotImplementedError("matched UPDATE path not yet implemented.") + + +def _normalize_set_spec( + spec: Optional[SetSpec], + target_field_names: Sequence[str], +) -> Optional[Dict[str, Any]]: + if spec is None: + return None + if isinstance(spec, str): + if spec != "*": + raise ValueError( + f"SET spec strings other than '*' are not supported; got {spec!r}." + ) + return {col: f"s.{col}" for col in target_field_names} + if not isinstance(spec, dict): + raise ValueError( + f"SET spec must be '*' or a dict, got {type(spec).__name__}." + ) + target_set = set(target_field_names) + for col in spec: + if col not in target_set: + raise ValueError( + f"SET key '{col}' is not a column of the target table " + f"(columns: {list(target_field_names)})." 
+ ) + return dict(spec) + + +def _normalize_source(source: Any, catalog_options: Dict[str, str]): + import ray.data + + if isinstance(source, ray.data.Dataset): + return source + if isinstance(source, str): + from pypaimon.ray.ray_paimon import read_paimon + return read_paimon(source, catalog_options) + if isinstance(source, pa.Table): + return ray.data.from_arrow(source) + try: + import pandas as pd + except ImportError: + pd = None + if pd is not None and isinstance(source, pd.DataFrame): + return ray.data.from_pandas(source) + raise TypeError( + "source must be a ray.data.Dataset, a Paimon table identifier string, " + f"a pyarrow.Table, or a pandas.DataFrame; got {type(source).__name__}." + ) + + +def _validate_source_on_cols(source_ds, on: Sequence[str]) -> None: + schema = source_ds.schema() + if schema is None: + return + names = set(schema.names) + missing = [c for c in on if c not in names] + if missing: + raise ValueError( + f"'on' columns {missing} missing from source schema {list(names)}." 
+ ) + + +def _apply_set( + spec: Dict[str, Any], + s_row: Optional[Dict[str, Any]], + t_row: Optional[Dict[str, Any]], + target_field_names: Sequence[str], +) -> Dict[str, Any]: + combined = _prefixed(s_row, t_row) + base = t_row if t_row is not None else (s_row if s_row is not None else {}) + out: Dict[str, Any] = {} + for col in target_field_names: + if col in spec: + out[col] = _eval_set_value(spec[col], combined, s_row, t_row) + elif col in base: + out[col] = base[col] + else: + out[col] = None + return out + + +def _prefixed( + s_row: Optional[Dict[str, Any]], t_row: Optional[Dict[str, Any]] +) -> Dict[str, Any]: + out: Dict[str, Any] = {} + if s_row is not None: + for k, v in s_row.items(): + out[f"s.{k}"] = v + if t_row is not None: + for k, v in t_row.items(): + out[f"t.{k}"] = v + return out + + +def _eval_set_value( + value: Any, + combined: Mapping[str, Any], + s_row: Optional[Dict[str, Any]], + t_row: Optional[Dict[str, Any]], +) -> Any: + if callable(value): + return value(combined) + if isinstance(value, str): + if value.startswith("s.") and s_row is not None: + return s_row.get(value[2:]) + if value.startswith("t.") and t_row is not None: + return t_row.get(value[2:]) + return value diff --git a/paimon-python/pypaimon/ray/merge_into.py b/paimon-python/pypaimon/ray/merge_into.py deleted file mode 100644 index d4ba358da3b7..000000000000 --- a/paimon-python/pypaimon/ray/merge_into.py +++ /dev/null @@ -1,501 +0,0 @@ -################################################################################ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -"""MERGE INTO for Paimon primary-key tables, driven by Ray Datasets. - -Mirrors the high-level semantics of -``paimon-flink/.../action/MergeIntoAction.java`` and -``paimon-spark/.../commands/MergeIntoPaimonTable.scala`` but exposes a -Pythonic API instead of SQL. - -MVP scope: only upsert-flavored clauses (matched-update, -not-matched-insert, not-matched-by-source-update). DELETE clauses raise -``NotImplementedError`` because pypaimon's -``KeyValueDataWriter._add_system_fields`` still hardcodes ``_VALUE_KIND`` -to INSERT — see -``paimon-python/pypaimon/write/writer/key_value_data_writer.py:53``. -""" - -from dataclasses import dataclass, field -from functools import partial -from typing import ( - Any, - Callable, - Dict, - List, - Mapping, - Optional, - Sequence, - Tuple, - Union, -) - -import pyarrow as pa -import pyarrow.compute as pc - -from pypaimon.ray.shuffle import ( - _coerce_large_string_types, - _pick_collision_safe_col_name, -) - -SetSpec = Union[str, Dict[str, Any]] -Condition = Callable[[Mapping[str, Any]], bool] - -_SIDE_TARGET = "t" -_SIDE_SOURCE = "s" - - -@dataclass -class _MergeConfig: - on: Tuple[str, ...] - target_field_names: Tuple[str, ...] 
- side_col: str - matched_update: Optional[Dict[str, Any]] - matched_update_condition: Optional[Condition] - not_matched_insert: Optional[Dict[str, Any]] - not_matched_insert_condition: Optional[Condition] - not_matched_by_source_update: Optional[Dict[str, Any]] - not_matched_by_source_update_condition: Optional[Condition] - target_pa_schema: pa.Schema = field(repr=False) - - -def merge_paimon( - target: str, - source: Any, - catalog_options: Dict[str, str], - *, - on: Sequence[str], - when_matched_update: Optional[SetSpec] = None, - when_matched_update_condition: Optional[Condition] = None, - when_matched_delete_condition: Optional[Condition] = None, - when_not_matched_insert: Optional[SetSpec] = None, - when_not_matched_insert_condition: Optional[Condition] = None, - when_not_matched_by_source_update: Optional[Dict[str, Any]] = None, - when_not_matched_by_source_update_condition: Optional[Condition] = None, - when_not_matched_by_source_delete_condition: Optional[Condition] = None, - ray_remote_args: Optional[Dict[str, Any]] = None, - concurrency: Optional[int] = None, -) -> None: - """MERGE INTO ``target`` USING ``source`` for a Paimon primary-key table. - - Args: - target: Full table identifier, e.g. ``"db.table"``. - source: Right-hand side. One of ``ray.data.Dataset``, a Paimon - table identifier string (read via :func:`read_paimon` with - the same ``catalog_options``), a ``pyarrow.Table``, or a - ``pandas.DataFrame``. - catalog_options: Forwarded to ``CatalogFactory.create`` and to - ``read_paimon`` / ``write_paimon``. - on: Join keys. Must be a subset of the target table's primary - keys. Rows are matched by equality on these columns. - when_matched_update: SET spec for matched rows. Use ``"*"`` to - copy all target columns from source (requires schema - compatibility), or a dict ``{target_col: expr}`` where - ``expr`` is ``"s.col"`` / ``"t.col"`` / a literal / a - callable taking the combined row. 
- when_matched_update_condition: Optional predicate over the - combined row (keys prefixed ``s.`` / ``t.``); rows not - satisfying it are left unchanged. - when_matched_delete_condition: Not yet supported; raises - ``NotImplementedError`` if non-None. - when_not_matched_insert: SET spec for source rows with no - matching target row. ``"*"`` copies all target-schema - columns from source. - when_not_matched_insert_condition: Optional predicate over the - source row (keys prefixed ``s.``). - when_not_matched_by_source_update: SET spec for target rows - with no matching source row. Same dict format as above; only - ``"t.col"`` / literal / callable values are meaningful since - no source row exists. - when_not_matched_by_source_update_condition: Optional predicate - over the target row (keys prefixed ``t.``). - when_not_matched_by_source_delete_condition: Not yet supported; - raises ``NotImplementedError`` if non-None. - ray_remote_args: Forwarded to ``write_paimon``. - concurrency: Forwarded to ``write_paimon``. - - Notes: - - The target table must be a primary-key table. - - For HASH_FIXED bucket mode, the existing - :func:`maybe_apply_repartition` is applied by ``write_paimon`` - before writing. - - If the user passes any callable as a condition or as a SET - value, the target read falls back to the full schema (callables - are opaque to projection analysis). - - User-supplied callables must be picklable since Ray ships them - to workers. - - If ``on`` is a strict subset of the primary keys and the SET - expression rewrites the remaining PK columns, two PK rows may - collide downstream of the merge engine; do not do this. - """ - if when_matched_delete_condition is not None or when_not_matched_by_source_delete_condition is not None: - raise NotImplementedError( - "DELETE clauses are not supported yet: the pypaimon writer " - "hardcodes _VALUE_KIND to INSERT. See " - "paimon-python/pypaimon/write/writer/key_value_data_writer.py:53. 
" - "Once that TODO is resolved, DELETE can be wired through." - ) - - from pypaimon.catalog.catalog_factory import CatalogFactory - from pypaimon.schema.data_types import PyarrowFieldParser - - catalog = CatalogFactory.create(catalog_options) - table = catalog.get_table(target) - if not table.is_primary_key_table: - raise ValueError( - f"merge_paimon requires a primary-key table; " - f"'{target}' has no primary keys." - ) - - primary_keys = list(table.primary_keys) - if not set(on).issubset(set(primary_keys)): - raise ValueError( - f"'on' columns {list(on)} must be a subset of target primary " - f"keys {primary_keys}." - ) - - if ( - when_matched_update is None - and when_not_matched_insert is None - and when_not_matched_by_source_update is None - ): - raise ValueError( - "At least one of when_matched_update, when_not_matched_insert, " - "or when_not_matched_by_source_update must be provided." - ) - - target_field_names = list(table.field_names) - matched_update = _normalize_set_spec( - when_matched_update, target_field_names, allow_star=True, star_side=_SIDE_SOURCE - ) - not_matched_insert = _normalize_set_spec( - when_not_matched_insert, target_field_names, allow_star=True, star_side=_SIDE_SOURCE - ) - not_matched_by_source_update = _normalize_set_spec( - when_not_matched_by_source_update, - target_field_names, - allow_star=False, - star_side=None, - ) - - target_pa_schema = PyarrowFieldParser.from_paimon_schema(table.table_schema.fields) - - source_ds = _normalize_source(source, catalog_options) - source_schema = source_ds.schema() - source_col_names = list(source_schema.names) if source_schema is not None else [] - if source_col_names: - for col in on: - if col not in source_col_names: - raise ValueError( - f"'on' column '{col}' is missing from source schema " - f"{source_col_names}." 
- ) - - target_projection = _compute_target_projection( - on=on, - target_field_names=target_field_names, - matched_update=matched_update, - not_matched_by_source_update=not_matched_by_source_update, - matched_update_condition=when_matched_update_condition, - not_matched_by_source_update_condition=when_not_matched_by_source_update_condition, - ) - - from pypaimon.ray.ray_paimon import read_paimon, write_paimon - - target_ds = read_paimon(target, catalog_options, projection=target_projection) - - side_col = _pick_collision_safe_col_name( - set(target_field_names) | set(source_col_names), "_paimon_side" - ) - aligned_target, aligned_source = _align_for_union( - target_ds, source_ds, side_col=side_col - ) - combined = aligned_target.union(aligned_source) - - cfg = _MergeConfig( - on=tuple(on), - target_field_names=tuple(target_field_names), - side_col=side_col, - matched_update=matched_update, - matched_update_condition=when_matched_update_condition, - not_matched_insert=not_matched_insert, - not_matched_insert_condition=when_not_matched_insert_condition, - not_matched_by_source_update=not_matched_by_source_update, - not_matched_by_source_update_condition=when_not_matched_by_source_update_condition, - target_pa_schema=target_pa_schema, - ) - - merged = ( - combined - .groupby(list(on)) - .map_groups(partial(_merge_groups, cfg=cfg), batch_format="pyarrow") - ) - - write_paimon( - merged, - target, - catalog_options, - ray_remote_args=ray_remote_args, - concurrency=concurrency, - ) - - -def _normalize_set_spec( - spec: Optional[SetSpec], - target_field_names: Sequence[str], - *, - allow_star: bool, - star_side: Optional[str], -) -> Optional[Dict[str, Any]]: - if spec is None: - return None - if isinstance(spec, str): - if not allow_star or spec != "*": - raise ValueError( - f"SET spec strings other than '*' are not supported here; got {spec!r}." 
- ) - return {col: f"{star_side}.{col}" for col in target_field_names} - if not isinstance(spec, dict): - raise ValueError( - f"SET spec must be '*' or a dict, got {type(spec).__name__}." - ) - target_set = set(target_field_names) - for col in spec: - if col not in target_set: - raise ValueError( - f"SET key '{col}' is not a column of the target table " - f"(columns: {list(target_field_names)})." - ) - return dict(spec) - - -def _normalize_source(source: Any, catalog_options: Dict[str, str]): - import ray.data - - if isinstance(source, ray.data.Dataset): - return source - if isinstance(source, str): - from pypaimon.ray.ray_paimon import read_paimon - return read_paimon(source, catalog_options) - if isinstance(source, pa.Table): - return ray.data.from_arrow(source) - try: - import pandas as pd - except ImportError: - pd = None - if pd is not None and isinstance(source, pd.DataFrame): - return ray.data.from_pandas(source) - raise TypeError( - "source must be a ray.data.Dataset, a Paimon table identifier " - "string, a pyarrow.Table, or a pandas.DataFrame; got " - f"{type(source).__name__}." 
- ) - - -def _compute_target_projection( - *, - on: Sequence[str], - target_field_names: Sequence[str], - matched_update: Optional[Dict[str, Any]], - not_matched_by_source_update: Optional[Dict[str, Any]], - matched_update_condition: Optional[Condition], - not_matched_by_source_update_condition: Optional[Condition], -) -> Optional[List[str]]: - if matched_update_condition is not None or not_matched_by_source_update_condition is not None: - return None - needed = set(on) - for spec in (matched_update, not_matched_by_source_update): - if not spec: - continue - needed.update(spec.keys()) - for value in spec.values(): - if callable(value): - return None - if isinstance(value, str) and value.startswith("t."): - needed.add(value[2:]) - needed.update(_required_target_cols_for_passthrough(matched_update, target_field_names)) - needed.update(_required_target_cols_for_passthrough(not_matched_by_source_update, target_field_names)) - return [col for col in target_field_names if col in needed] - - -def _required_target_cols_for_passthrough( - spec: Optional[Dict[str, Any]], target_field_names: Sequence[str] -) -> List[str]: - if spec is None: - return [] - return [col for col in target_field_names if col not in spec] - - -def _align_for_union(target_ds, source_ds, *, side_col: str): - target_schema = target_ds.schema() - source_schema = source_ds.schema() - target_type_map = _schema_type_map(target_schema) - source_type_map = _schema_type_map(source_schema) - - union_field_types: Dict[str, pa.DataType] = {} - for name, t in target_type_map.items(): - union_field_types[name] = t - for name, t in source_type_map.items(): - if name not in union_field_types: - union_field_types[name] = t - - aligned_target = target_ds.map_batches( - partial( - _align_batch, - union_field_types=union_field_types, - side_value=_SIDE_TARGET, - side_col=side_col, - ), - batch_format="pyarrow", - ) - aligned_source = source_ds.map_batches( - partial( - _align_batch, - 
union_field_types=union_field_types, - side_value=_SIDE_SOURCE, - side_col=side_col, - ), - batch_format="pyarrow", - ) - return aligned_target, aligned_source - - -def _schema_type_map(schema: Optional[pa.Schema]) -> Dict[str, pa.DataType]: - if schema is None: - return {} - return dict(zip(schema.names, schema.types)) - - -def _align_batch( - batch: pa.Table, - *, - union_field_types: Dict[str, pa.DataType], - side_value: str, - side_col: str, -) -> pa.Table: - n = batch.num_rows - arrays = [] - fields = [] - present = {name: batch.column(name) for name in batch.schema.names} - for name, target_type in union_field_types.items(): - if name in present: - col = present[name] - if col.type != target_type: - col = col.cast(target_type, safe=False) - arrays.append(col) - else: - arrays.append(pa.nulls(n, type=target_type)) - fields.append(pa.field(name, target_type)) - arrays.append(pa.array([side_value] * n, type=pa.string())) - fields.append(pa.field(side_col, pa.string())) - return pa.Table.from_arrays(arrays, schema=pa.schema(fields)) - - -def _merge_groups(group: pa.Table, *, cfg: _MergeConfig) -> pa.Table: - if group.num_rows == 0: - return pa.Table.from_pylist([], schema=cfg.target_pa_schema) - side_mask = pc.equal(group.column(cfg.side_col), _SIDE_TARGET) - target_table = group.filter(side_mask).drop_columns([cfg.side_col]) - source_table = group.filter(pc.invert(side_mask)).drop_columns([cfg.side_col]) - target_rows = target_table.to_pylist() - source_rows = source_table.to_pylist() - output_rows: List[Dict[str, Any]] = [] - - if target_rows and source_rows: - if cfg.matched_update is not None: - if len(source_rows) > 1: - raise ValueError( - f"MERGE INTO matched {len(source_rows)} source rows against " - f"the same target key on {list(cfg.on)}; source must be unique " - f"on the 'on' columns." 
- ) - s_row = source_rows[0] - for t_row in target_rows: - combined = _prefixed(s_row, t_row) - if cfg.matched_update_condition is not None and not cfg.matched_update_condition(combined): - continue - output_rows.append(_apply_set(cfg.matched_update, s_row, t_row, cfg.target_field_names)) - elif source_rows and not target_rows: - if cfg.not_matched_insert is not None: - for s_row in source_rows: - combined = _prefixed(s_row, None) - if cfg.not_matched_insert_condition is not None and not cfg.not_matched_insert_condition(combined): - continue - output_rows.append(_apply_set(cfg.not_matched_insert, s_row, None, cfg.target_field_names)) - elif target_rows and not source_rows: - if cfg.not_matched_by_source_update is not None: - for t_row in target_rows: - combined = _prefixed(None, t_row) - if ( - cfg.not_matched_by_source_update_condition is not None - and not cfg.not_matched_by_source_update_condition(combined) - ): - continue - output_rows.append(_apply_set(cfg.not_matched_by_source_update, None, t_row, cfg.target_field_names)) - - if not output_rows: - return pa.Table.from_pylist([], schema=cfg.target_pa_schema) - aligned = [{name: row.get(name) for name in cfg.target_field_names} for row in output_rows] - arrow_table = pa.Table.from_pylist(aligned, schema=cfg.target_pa_schema) - return _coerce_large_string_types(arrow_table) - - -def _prefixed(s_row: Optional[Dict[str, Any]], t_row: Optional[Dict[str, Any]]) -> Dict[str, Any]: - out: Dict[str, Any] = {} - if s_row is not None: - for k, v in s_row.items(): - out[f"s.{k}"] = v - if t_row is not None: - for k, v in t_row.items(): - out[f"t.{k}"] = v - return out - - -def _apply_set( - spec: Dict[str, Any], - s_row: Optional[Dict[str, Any]], - t_row: Optional[Dict[str, Any]], - target_field_names: Sequence[str], -) -> Dict[str, Any]: - combined = _prefixed(s_row, t_row) - out: Dict[str, Any] = {} - base = t_row if t_row is not None else (s_row if s_row is not None else {}) - for col in target_field_names: - if col in 
spec: - out[col] = _eval_set_value(spec[col], combined, s_row, t_row) - elif col in base: - out[col] = base[col] - else: - out[col] = None - return out - - -def _eval_set_value( - value: Any, - combined: Mapping[str, Any], - s_row: Optional[Dict[str, Any]], - t_row: Optional[Dict[str, Any]], -) -> Any: - if callable(value): - return value(combined) - if isinstance(value, str): - if value.startswith("s.") and s_row is not None: - return s_row.get(value[2:]) - if value.startswith("t.") and t_row is not None: - return t_row.get(value[2:]) - return value diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py new file mode 100644 index 000000000000..5e8696c73617 --- /dev/null +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -0,0 +1,185 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import shutil +import tempfile +import unittest +import uuid + +import pyarrow as pa +import ray + +from pypaimon import CatalogFactory, Schema +from pypaimon.ray import merge_into + + +class RayDataEvolutionMergeIntoTest(unittest.TestCase): + + pa_schema = pa.schema([ + ('id', pa.int32()), + ('name', pa.string()), + ('age', pa.int32()), + ]) + + de_options = { + 'row-tracking.enabled': 'true', + 'data-evolution.enabled': 'true', + } + + @classmethod + def setUpClass(cls): + cls.tempdir = tempfile.mkdtemp() + cls.warehouse = os.path.join(cls.tempdir, 'warehouse') + cls.catalog_options = {'warehouse': cls.warehouse} + cls.catalog = CatalogFactory.create(cls.catalog_options) + cls.catalog.create_database('default', True) + if not ray.is_initialized(): + ray.init(ignore_reinit_error=True, num_cpus=2) + + @classmethod + def tearDownClass(cls): + try: + if ray.is_initialized(): + ray.shutdown() + except Exception: + pass + shutil.rmtree(cls.tempdir, ignore_errors=True) + + def _create_table(self, options=None): + opts = options if options is not None else self.de_options + name = f'default.tbl_{uuid.uuid4().hex[:8]}' + s = Schema.from_pyarrow_schema(self.pa_schema, options=opts) + self.catalog.create_table(name, s, False) + return name + + def _source(self, ids=(1,)): + return pa.Table.from_pydict( + { + 'id': pa.array(list(ids), type=pa.int32()), + 'name': ['x'] * len(ids), + 'age': [10] * len(ids), + }, + schema=self.pa_schema, + ) + + def test_delete_clause_rejected(self): + target = self._create_table() + with self.assertRaises(NotImplementedError) as ctx: + merge_into( + target=target, + source=self._source(), + catalog_options=self.catalog_options, + on=['id'], + when_matched_delete_condition=lambda r: True, + ) + self.assertIn('DELETE', str(ctx.exception)) + + def test_not_matched_by_source_clauses_rejected(self): + target = self._create_table() + for kwargs in ( + 
{'when_not_matched_by_source_update': {'age': 0}}, + {'when_not_matched_by_source_update_condition': lambda r: True}, + {'when_not_matched_by_source_delete_condition': lambda r: True}, + ): + with self.assertRaises(NotImplementedError): + merge_into( + target=target, + source=self._source(), + catalog_options=self.catalog_options, + on=['id'], + **kwargs, + ) + + def test_no_clause_raises(self): + target = self._create_table() + with self.assertRaises(ValueError): + merge_into( + target=target, + source=self._source(), + catalog_options=self.catalog_options, + on=['id'], + ) + + def test_non_de_table_rejected(self): + target = self._create_table(options={'row-tracking.enabled': 'true'}) + with self.assertRaises(ValueError) as ctx: + merge_into( + target=target, + source=self._source(), + catalog_options=self.catalog_options, + on=['id'], + when_matched_update='*', + ) + self.assertIn('data-evolution.enabled', str(ctx.exception)) + + def test_no_row_tracking_rejected(self): + target = self._create_table(options={'data-evolution.enabled': 'true'}) + with self.assertRaises(ValueError) as ctx: + merge_into( + target=target, + source=self._source(), + catalog_options=self.catalog_options, + on=['id'], + when_matched_update='*', + ) + self.assertIn('row-tracking.enabled', str(ctx.exception)) + + def test_source_missing_on_col_raises(self): + target = self._create_table() + bad_source = pa.Table.from_pydict( + {'name': ['x'], 'age': [10]}, + schema=pa.schema([('name', pa.string()), ('age', pa.int32())]), + ) + with self.assertRaises(ValueError) as ctx: + merge_into( + target=target, + source=bad_source, + catalog_options=self.catalog_options, + on=['id'], + when_matched_update='*', + ) + self.assertIn("'id'", str(ctx.exception)) + + def test_matched_update_stub_not_implemented(self): + target = self._create_table() + with self.assertRaises(NotImplementedError) as ctx: + merge_into( + target=target, + source=self._source(), + catalog_options=self.catalog_options, + 
on=['id'], + when_matched_update='*', + ) + self.assertIn('UPDATE', str(ctx.exception)) + + def test_not_matched_insert_stub_not_implemented(self): + target = self._create_table() + with self.assertRaises(NotImplementedError) as ctx: + merge_into( + target=target, + source=self._source(), + catalog_options=self.catalog_options, + on=['id'], + when_not_matched_insert='*', + ) + self.assertIn('INSERT', str(ctx.exception)) + + +if __name__ == '__main__': + unittest.main() diff --git a/paimon-python/pypaimon/tests/ray_merge_into_test.py b/paimon-python/pypaimon/tests/ray_merge_into_test.py deleted file mode 100644 index 41d49ac63f29..000000000000 --- a/paimon-python/pypaimon/tests/ray_merge_into_test.py +++ /dev/null @@ -1,375 +0,0 @@ -################################################################################ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import os -import shutil -import tempfile -import unittest - -import pyarrow as pa -import ray - -from pypaimon import CatalogFactory, Schema -from pypaimon.ray import merge_paimon - - -class RayMergeIntoTest(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.tempdir = tempfile.mkdtemp() - cls.warehouse = os.path.join(cls.tempdir, "warehouse") - cls.catalog_options = {"warehouse": cls.warehouse} - cls.catalog = CatalogFactory.create(cls.catalog_options) - cls.catalog.create_database("default", True) - if not ray.is_initialized(): - ray.init(ignore_reinit_error=True, num_cpus=2) - - @classmethod - def tearDownClass(cls): - try: - if ray.is_initialized(): - ray.shutdown() - except Exception: - pass - try: - shutil.rmtree(cls.tempdir) - except OSError: - pass - - def _make_pk_table(self, name: str, extra_options=None): - pa_schema = pa.schema([ - pa.field("id", pa.int32(), nullable=False), - ("name", pa.string()), - ("value", pa.int64()), - ]) - options = {"bucket": "2"} - if extra_options: - options.update(extra_options) - schema = Schema.from_pyarrow_schema( - pa_schema, primary_keys=["id"], options=options - ) - full = f"default.{name}" - self.catalog.create_table(full, schema, False) - return full, pa_schema - - def _write(self, full_name, data: pa.Table): - table = self.catalog.get_table(full_name) - write_builder = table.new_batch_write_builder() - writer = write_builder.new_write() - writer.write_arrow(data) - commit_messages = writer.prepare_commit() - write_builder.new_commit().commit(commit_messages) - writer.close() - - def _read_sorted(self, full_name): - table = self.catalog.get_table(full_name) - read_builder = table.new_read_builder() - splits = read_builder.new_scan().plan().splits() - result = read_builder.new_read().to_arrow(splits) - return result.sort_by("id").to_pydict() - - def test_basic_upsert_star_set(self): - full, pa_schema = 
self._make_pk_table("merge_basic") - self._write(full, pa.Table.from_pydict({ - "id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - "value": [100, 200, 300], - }, schema=pa_schema)) - - source = pa.Table.from_pydict({ - "id": [2, 3, 4], - "name": ["Bob-Updated", "Charlie-Updated", "Dan"], - "value": [250, 350, 400], - }, schema=pa_schema) - - merge_paimon( - target=full, - source=source, - catalog_options=self.catalog_options, - on=["id"], - when_matched_update="*", - when_not_matched_insert="*", - ) - - out = self._read_sorted(full) - self.assertEqual(out["id"], [1, 2, 3, 4]) - self.assertEqual(out["name"], ["Alice", "Bob-Updated", "Charlie-Updated", "Dan"]) - self.assertEqual(out["value"], [100, 250, 350, 400]) - - def test_set_dict_with_literal_and_source_ref(self): - full, pa_schema = self._make_pk_table("merge_set_dict") - self._write(full, pa.Table.from_pydict({ - "id": [1, 2], - "name": ["Alice", "Bob"], - "value": [10, 20], - }, schema=pa_schema)) - - source = pa.Table.from_pydict({ - "id": pa.array([1, 2], type=pa.int32()), - "name": ["A-new", "B-new"], - "value": pa.array([99, 99], type=pa.int64()), - }) - - merge_paimon( - target=full, - source=source, - catalog_options=self.catalog_options, - on=["id"], - when_matched_update={"name": "s.name", "value": 777}, - ) - - out = self._read_sorted(full) - self.assertEqual(out["id"], [1, 2]) - self.assertEqual(out["name"], ["A-new", "B-new"]) - self.assertEqual(out["value"], [777, 777]) - - def test_matched_update_condition(self): - full, pa_schema = self._make_pk_table("merge_condition") - self._write(full, pa.Table.from_pydict({ - "id": [1, 2], - "name": ["Old1", "Old2"], - "value": [50, 50], - }, schema=pa_schema)) - - source = pa.Table.from_pydict({ - "id": pa.array([1, 2], type=pa.int32()), - "name": ["New1", "New2"], - "value": pa.array([100, 10], type=pa.int64()), - }) - - merge_paimon( - target=full, - source=source, - catalog_options=self.catalog_options, - on=["id"], - when_matched_update="*", - 
when_matched_update_condition=lambda r: r["s.value"] > r["t.value"], - ) - - out = self._read_sorted(full) - self.assertEqual(out["id"], [1, 2]) - self.assertEqual(out["name"], ["New1", "Old2"]) - self.assertEqual(out["value"], [100, 50]) - - def test_not_matched_by_source_update(self): - full, pa_schema = self._make_pk_table_with_flag("merge_soft_delete") - target = self.catalog.get_table(full) - write_builder = target.new_batch_write_builder() - writer = write_builder.new_write() - writer.write_arrow(pa.Table.from_pydict({ - "id": pa.array([1, 2, 3], type=pa.int32()), - "name": ["A", "B", "C"], - "deleted": [False, False, False], - })) - commits = writer.prepare_commit() - write_builder.new_commit().commit(commits) - writer.close() - - source = pa.Table.from_pydict({ - "id": pa.array([1], type=pa.int32()), - "name": ["A"], - "deleted": [False], - }) - - merge_paimon( - target=full, - source=source, - catalog_options=self.catalog_options, - on=["id"], - when_not_matched_by_source_update={"deleted": True}, - ) - - out = self._read_sorted(full) - self.assertEqual(out["id"], [1, 2, 3]) - self.assertEqual(out["name"], ["A", "B", "C"]) - self.assertEqual(out["deleted"], [False, True, True]) - - def _make_pk_table_with_flag(self, name: str): - pa_schema = pa.schema([ - pa.field("id", pa.int32(), nullable=False), - ("name", pa.string()), - ("deleted", pa.bool_()), - ]) - schema = Schema.from_pyarrow_schema( - pa_schema, primary_keys=["id"], options={"bucket": "2"} - ) - full = f"default.{name}" - self.catalog.create_table(full, schema, False) - return full, pa_schema - - def test_delete_raises_not_implemented(self): - full, pa_schema = self._make_pk_table("merge_delete_unsupported") - with self.assertRaises(NotImplementedError) as ctx: - merge_paimon( - target=full, - source=pa.Table.from_pydict({"id": pa.array([1], type=pa.int32())}), - catalog_options=self.catalog_options, - on=["id"], - when_matched_delete_condition=lambda r: True, - ) - self.assertIn("DELETE", 
str(ctx.exception)) - - with self.assertRaises(NotImplementedError): - merge_paimon( - target=full, - source=pa.Table.from_pydict({"id": pa.array([1], type=pa.int32())}), - catalog_options=self.catalog_options, - on=["id"], - when_not_matched_by_source_delete_condition=lambda r: True, - ) - - def test_duplicate_source_rows_raise(self): - full, pa_schema = self._make_pk_table("merge_dup_source") - self._write(full, pa.Table.from_pydict({ - "id": [1], - "name": ["Old"], - "value": [10], - }, schema=pa_schema)) - - source = pa.Table.from_pydict({ - "id": [1, 1], - "name": ["A", "B"], - "value": [100, 200], - }, schema=pa_schema) - - with self.assertRaises(ValueError) as ctx: - merge_paimon( - target=full, - source=source, - catalog_options=self.catalog_options, - on=["id"], - when_matched_update="*", - ) - self.assertIn("source must be unique", str(ctx.exception)) - - def test_validation_errors(self): - # append-only table (no PK) → ValueError - ao_schema = pa.schema([("id", pa.int32()), ("v", pa.int64())]) - ao = Schema.from_pyarrow_schema(ao_schema) - self.catalog.create_table("default.merge_append_only", ao, False) - with self.assertRaises(ValueError): - merge_paimon( - target="default.merge_append_only", - source=pa.Table.from_pydict({"id": [1], "v": [1]}), - catalog_options=self.catalog_options, - on=["id"], - when_matched_update="*", - ) - - full, pa_schema = self._make_pk_table("merge_validation") - - # on not subset of PKs → ValueError - with self.assertRaises(ValueError): - merge_paimon( - target=full, - source=pa.Table.from_pydict({"id": pa.array([1], type=pa.int32())}), - catalog_options=self.catalog_options, - on=["name"], - when_matched_update="*", - ) - - # no when_* clause → ValueError - with self.assertRaises(ValueError): - merge_paimon( - target=full, - source=pa.Table.from_pydict({"id": pa.array([1], type=pa.int32())}), - catalog_options=self.catalog_options, - on=["id"], - ) - - # source missing `on` column → ValueError - with 
self.assertRaises(ValueError): - merge_paimon( - target=full, - source=pa.Table.from_pydict({"name": ["x"], "value": pa.array([1], type=pa.int64())}), - catalog_options=self.catalog_options, - on=["id"], - when_matched_update="*", - ) - - # SET key not a target column → ValueError - with self.assertRaises(ValueError): - merge_paimon( - target=full, - source=pa.Table.from_pydict({"id": pa.array([1], type=pa.int32())}), - catalog_options=self.catalog_options, - on=["id"], - when_matched_update={"not_a_real_column": "x"}, - ) - - def test_source_type_normalization(self): - import pandas as pd - - # pyarrow.Table source - full_a, pa_schema = self._make_pk_table("merge_src_arrow") - self._write(full_a, pa.Table.from_pydict({ - "id": [1], "name": ["a"], "value": [1], - }, schema=pa_schema)) - merge_paimon( - target=full_a, - source=pa.Table.from_pydict({ - "id": pa.array([2], type=pa.int32()), - "name": ["b"], - "value": pa.array([2], type=pa.int64()), - }), - catalog_options=self.catalog_options, - on=["id"], - when_not_matched_insert="*", - ) - self.assertEqual(self._read_sorted(full_a)["id"], [1, 2]) - - # pandas.DataFrame source - full_b, pa_schema_b = self._make_pk_table("merge_src_pandas") - self._write(full_b, pa.Table.from_pydict({ - "id": [1], "name": ["a"], "value": [1], - }, schema=pa_schema_b)) - merge_paimon( - target=full_b, - source=pd.DataFrame({ - "id": pd.array([2], dtype="int32"), - "name": ["b"], - "value": pd.array([2], dtype="int64"), - }), - catalog_options=self.catalog_options, - on=["id"], - when_not_matched_insert="*", - ) - self.assertEqual(self._read_sorted(full_b)["id"], [1, 2]) - - # Paimon table identifier source - full_c, pa_schema_c = self._make_pk_table("merge_src_target") - src_name, _ = self._make_pk_table("merge_src_source") - self._write(full_c, pa.Table.from_pydict({ - "id": [1], "name": ["a"], "value": [1], - }, schema=pa_schema_c)) - self._write(src_name, pa.Table.from_pydict({ - "id": [2], "name": ["b"], "value": [2], - }, 
schema=pa_schema_c)) - merge_paimon( - target=full_c, - source=src_name, - catalog_options=self.catalog_options, - on=["id"], - when_not_matched_insert="*", - ) - self.assertEqual(self._read_sorted(full_c)["id"], [1, 2]) - - -if __name__ == "__main__": - unittest.main() From 714bb29983414e411591651195632ff8a382c937 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Thu, 28 May 2026 23:57:09 +0800 Subject: [PATCH 05/32] [python] wire ray merge_into not-matched INSERT path --- .../pypaimon/ray/data_evolution_merge_into.py | 76 ++++++++++++++- .../ray_data_evolution_merge_into_test.py | 94 +++++++++++++++++-- 2 files changed, 158 insertions(+), 12 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 619b15dd81cd..a409744bf7b6 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -81,11 +81,83 @@ def merge_into( source_ds = _normalize_source(source, catalog_options) _validate_source_on_cols(source_ds, on) - if not_matched_insert is not None: - raise NotImplementedError("not-matched INSERT path not yet implemented.") if matched_update is not None: raise NotImplementedError("matched UPDATE path not yet implemented.") + if not_matched_insert is not None: + from pypaimon.schema.data_types import PyarrowFieldParser + + target_pa_schema = PyarrowFieldParser.from_paimon_schema( + table.table_schema.fields + ) + _do_not_matched_insert( + target_identifier=target, + source_ds=source_ds, + on=list(on), + target_field_names=target_field_names, + target_pa_schema=target_pa_schema, + spec=not_matched_insert, + condition=when_not_matched_insert_condition, + catalog_options=catalog_options, + ray_remote_args=ray_remote_args, + concurrency=concurrency, + ) + + +def _do_not_matched_insert( + *, + target_identifier: str, + source_ds, + on: Sequence[str], + target_field_names: Sequence[str], + target_pa_schema: 
pa.Schema, + spec: Dict[str, Any], + condition: Optional[Condition], + catalog_options: Dict[str, str], + ray_remote_args: Optional[Dict[str, Any]], + concurrency: Optional[int], +) -> None: + from pypaimon.ray.ray_paimon import read_paimon, write_paimon + from pypaimon.ray.shuffle import _coerce_large_string_types + + target_on_ds = read_paimon( + target_identifier, catalog_options, projection=list(on) + ) + target_keys = set() + for batch in target_on_ds.iter_batches(batch_format="pyarrow"): + cols = [batch.column(c).to_pylist() for c in on] + for tup in zip(*cols): + target_keys.add(tup) + + on_list = list(on) + field_names = list(target_field_names) + insert_spec = spec + insert_cond = condition + out_schema = target_pa_schema + + def _transform(batch: pa.Table) -> pa.Table: + rows = batch.to_pylist() + out = [] + for s_row in rows: + key = tuple(s_row.get(c) for c in on_list) + if key in target_keys: + continue + if insert_cond is not None and not insert_cond(_prefixed(s_row, None)): + continue + out.append(_apply_set(insert_spec, s_row, None, field_names)) + aligned = [{name: r.get(name) for name in field_names} for r in out] + result = pa.Table.from_pylist(aligned, schema=out_schema) + return _coerce_large_string_types(result) + + transformed = source_ds.map_batches(_transform, batch_format="pyarrow") + write_paimon( + transformed, + target_identifier, + catalog_options, + ray_remote_args=ray_remote_args, + concurrency=concurrency, + ) + def _normalize_set_spec( spec: Optional[SetSpec], diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index 5e8696c73617..b959cb689c14 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -78,6 +78,20 @@ def _source(self, ids=(1,)): schema=self.pa_schema, ) + def _write(self, target, data): + table = 
self.catalog.get_table(target) + wb = table.new_batch_write_builder() + writer = wb.new_write() + writer.write_arrow(data) + wb.new_commit().commit(writer.prepare_commit()) + writer.close() + + def _read_sorted(self, target): + table = self.catalog.get_table(target) + rb = table.new_read_builder() + splits = rb.new_scan().plan().splits() + return rb.new_read().to_arrow(splits).sort_by('id').to_pydict() + def test_delete_clause_rejected(self): target = self._create_table() with self.assertRaises(NotImplementedError) as ctx: @@ -168,17 +182,77 @@ def test_matched_update_stub_not_implemented(self): ) self.assertIn('UPDATE', str(ctx.exception)) - def test_not_matched_insert_stub_not_implemented(self): + def test_not_matched_insert_appends_unmatched(self): target = self._create_table() - with self.assertRaises(NotImplementedError) as ctx: - merge_into( - target=target, - source=self._source(), - catalog_options=self.catalog_options, - on=['id'], - when_not_matched_insert='*', - ) - self.assertIn('INSERT', str(ctx.exception)) + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': ['a', 'b', 'c'], + 'age': pa.array([10, 20, 30], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([2, 3, 4], type=pa.int32()), + 'name': ['b2', 'c2', 'd'], + 'age': pa.array([22, 33, 40], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_not_matched_insert='*', + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2, 3, 4]) + self.assertEqual(out['name'], ['a', 'b', 'c', 'd']) + self.assertEqual(out['age'], [10, 20, 30, 40]) + + def test_not_matched_insert_with_condition(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['a'], + 'age': pa.array([10], 
type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([2, 3, 4], type=pa.int32()), + 'name': ['b', 'c', 'd'], + 'age': pa.array([5, 50, 100], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_not_matched_insert='*', + when_not_matched_insert_condition=lambda r: r['s.age'] >= 50, + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 3, 4]) + self.assertEqual(out['age'], [10, 50, 100]) if __name__ == '__main__': From 99b88ddd73000fff89f92ea8c764e32a9f4bd556 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 00:02:03 +0800 Subject: [PATCH 06/32] [python] wire ray merge_into matched UPDATE path --- .../pypaimon/ray/data_evolution_merge_into.py | 106 +++++++++++- .../ray_data_evolution_merge_into_test.py | 153 ++++++++++++++++-- 2 files changed, 243 insertions(+), 16 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index a409744bf7b6..4f08c6980ce0 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -81,15 +81,26 @@ def merge_into( source_ds = _normalize_source(source, catalog_options) _validate_source_on_cols(source_ds, on) - if matched_update is not None: - raise NotImplementedError("matched UPDATE path not yet implemented.") + from pypaimon.schema.data_types import PyarrowFieldParser - if not_matched_insert is not None: - from pypaimon.schema.data_types import PyarrowFieldParser + target_pa_schema = PyarrowFieldParser.from_paimon_schema( + table.table_schema.fields + ) - target_pa_schema = PyarrowFieldParser.from_paimon_schema( - table.table_schema.fields + if matched_update is not None: + _do_matched_update( + target_table=table, + target_identifier=target, + source_ds=source_ds, + on=list(on), + 
target_field_names=target_field_names, + target_pa_schema=target_pa_schema, + spec=matched_update, + condition=when_matched_update_condition, + catalog_options=catalog_options, ) + + if not_matched_insert is not None: _do_not_matched_insert( target_identifier=target, source_ds=source_ds, @@ -159,6 +170,89 @@ def _transform(batch: pa.Table) -> pa.Table: ) +def _do_matched_update( + *, + target_table, + target_identifier: str, + source_ds, + on: Sequence[str], + target_field_names: Sequence[str], + target_pa_schema: pa.Schema, + spec: Dict[str, Any], + condition: Optional[Condition], + catalog_options: Dict[str, str], +) -> None: + from pypaimon.ray.ray_paimon import read_paimon + from pypaimon.table.special_fields import SpecialFields + + row_id_name = SpecialFields.ROW_ID.name + update_cols = list(spec.keys()) + needed_cols = _needed_target_cols(spec, on, update_cols, target_field_names, condition) + projection = [row_id_name] + [c for c in needed_cols if c != row_id_name] + + target_ds = read_paimon(target_identifier, catalog_options, projection=projection) + target_by_key: Dict[tuple, Dict[str, Any]] = {} + for batch in target_ds.iter_batches(batch_format="pyarrow"): + for row in batch.to_pylist(): + key = tuple(row.get(c) for c in on) + target_by_key[key] = row + + if not target_by_key: + return + + field_names = list(target_field_names) + output_row_ids: list = [] + output_cols: Dict[str, list] = {c: [] for c in update_cols} + + for batch in source_ds.iter_batches(batch_format="pyarrow"): + for s_row in batch.to_pylist(): + key = tuple(s_row.get(c) for c in on) + t_row = target_by_key.get(key) + if t_row is None: + continue + if condition is not None and not condition(_prefixed(s_row, t_row)): + continue + new_values = _apply_set(spec, s_row, t_row, field_names) + output_row_ids.append(t_row[row_id_name]) + for col in update_cols: + output_cols[col].append(new_values[col]) + + if not output_row_ids: + return + + pydict = {row_id_name: output_row_ids} + 
pydict.update(output_cols) + schema_fields = [pa.field(row_id_name, pa.int64(), nullable=False)] + for col in update_cols: + schema_fields.append(target_pa_schema.field(col)) + update_table = pa.Table.from_pydict(pydict, schema=pa.schema(schema_fields)) + + wb = target_table.new_batch_write_builder() + tu = wb.new_update().with_update_type(update_cols) + msgs = tu.update_by_arrow_with_row_id(update_table) + tc = wb.new_commit() + tc.commit(msgs) + tc.close() + + +def _needed_target_cols( + spec: Dict[str, Any], + on: Sequence[str], + update_cols: Sequence[str], + all_target_cols: Sequence[str], + condition: Optional[Condition], +) -> list: + if condition is not None: + return list(all_target_cols) + needed = set(on) | set(update_cols) + for value in spec.values(): + if callable(value): + return list(all_target_cols) + if isinstance(value, str) and value.startswith("t."): + needed.add(value[2:]) + return [c for c in all_target_cols if c in needed] + + def _normalize_set_spec( spec: Optional[SetSpec], target_field_names: Sequence[str], diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index b959cb689c14..6dde59beb0f0 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -170,17 +170,150 @@ def test_source_missing_on_col_raises(self): ) self.assertIn("'id'", str(ctx.exception)) - def test_matched_update_stub_not_implemented(self): + def test_matched_update_star(self): target = self._create_table() - with self.assertRaises(NotImplementedError) as ctx: - merge_into( - target=target, - source=self._source(), - catalog_options=self.catalog_options, - on=['id'], - when_matched_update='*', - ) - self.assertIn('UPDATE', str(ctx.exception)) + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': ['a', 'b', 'c'], + 'age': 
pa.array([10, 20, 30], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([2, 3, 4], type=pa.int32()), + 'name': ['b2', 'c2', 'd'], + 'age': pa.array([22, 33, 40], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched_update='*', + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2, 3]) + self.assertEqual(out['name'], ['a', 'b2', 'c2']) + self.assertEqual(out['age'], [10, 22, 33]) + + def test_matched_update_dict(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['a', 'b'], + 'age': pa.array([10, 20], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([2], type=pa.int32()), + 'name': ['ignored'], + 'age': pa.array([99], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched_update={'age': 's.age'}, + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2]) + self.assertEqual(out['name'], ['a', 'b']) + self.assertEqual(out['age'], [10, 99]) + + def test_matched_update_with_condition(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': ['a', 'b', 'c'], + 'age': pa.array([10, 20, 30], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': ['a', 'b', 'c'], + 'age': pa.array([5, 100, 50], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched_update={'age': 's.age'}, + 
when_matched_update_condition=lambda r: r['s.age'] > r['t.age'], + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2, 3]) + self.assertEqual(out['age'], [10, 100, 50]) + + def test_combined_update_and_insert(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['a', 'b'], + 'age': pa.array([10, 20], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([2, 3], type=pa.int32()), + 'name': ['b2', 'c'], + 'age': pa.array([22, 30], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched_update='*', + when_not_matched_insert='*', + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2, 3]) + self.assertEqual(out['name'], ['a', 'b2', 'c']) + self.assertEqual(out['age'], [10, 22, 30]) def test_not_matched_insert_appends_unmatched(self): target = self._create_table() From ebe771b26783a9bf841b5fdfe375e0442b1aa61c Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 00:05:50 +0800 Subject: [PATCH 07/32] [python] simplify ray merge_into clause surface --- .../pypaimon/ray/data_evolution_merge_into.py | 14 ---------- .../ray_data_evolution_merge_into_test.py | 28 ------------------- 2 files changed, 42 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 4f08c6980ce0..e058f56ed3df 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -34,25 +34,11 @@ def merge_into( on: Sequence[str], when_matched_update: Optional[SetSpec] = None, when_matched_update_condition: Optional[Condition] = None, - when_matched_delete_condition: Optional[Condition] = None, when_not_matched_insert: Optional[SetSpec] = 
None, when_not_matched_insert_condition: Optional[Condition] = None, - when_not_matched_by_source_update: Optional[Dict[str, Any]] = None, - when_not_matched_by_source_update_condition: Optional[Condition] = None, - when_not_matched_by_source_delete_condition: Optional[Condition] = None, ray_remote_args: Optional[Dict[str, Any]] = None, concurrency: Optional[int] = None, ) -> None: - if when_matched_delete_condition is not None: - raise NotImplementedError("WHEN MATCHED THEN DELETE is not supported.") - if ( - when_not_matched_by_source_update is not None - or when_not_matched_by_source_update_condition is not None - or when_not_matched_by_source_delete_condition is not None - ): - raise NotImplementedError( - "WHEN NOT MATCHED BY SOURCE clauses are not supported." - ) if when_matched_update is None and when_not_matched_insert is None: raise ValueError( "At least one of when_matched_update or when_not_matched_insert " diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index 6dde59beb0f0..46bee1489b08 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -92,34 +92,6 @@ def _read_sorted(self, target): splits = rb.new_scan().plan().splits() return rb.new_read().to_arrow(splits).sort_by('id').to_pydict() - def test_delete_clause_rejected(self): - target = self._create_table() - with self.assertRaises(NotImplementedError) as ctx: - merge_into( - target=target, - source=self._source(), - catalog_options=self.catalog_options, - on=['id'], - when_matched_delete_condition=lambda r: True, - ) - self.assertIn('DELETE', str(ctx.exception)) - - def test_not_matched_by_source_clauses_rejected(self): - target = self._create_table() - for kwargs in ( - {'when_not_matched_by_source_update': {'age': 0}}, - {'when_not_matched_by_source_update_condition': lambda r: True}, - 
{'when_not_matched_by_source_delete_condition': lambda r: True}, - ): - with self.assertRaises(NotImplementedError): - merge_into( - target=target, - source=self._source(), - catalog_options=self.catalog_options, - on=['id'], - **kwargs, - ) - def test_no_clause_raises(self): target = self._create_table() with self.assertRaises(ValueError): From 57b0a453814e35c4916c345ed821507f4223e995 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 00:12:19 +0800 Subject: [PATCH 08/32] [python] redesign ray merge_into clause API --- paimon-python/pypaimon/ray/__init__.py | 14 +- .../pypaimon/ray/data_evolution_merge_into.py | 205 ++++++++++++------ .../ray_data_evolution_merge_into_test.py | 173 +++++++++++++-- 3 files changed, 302 insertions(+), 90 deletions(-) diff --git a/paimon-python/pypaimon/ray/__init__.py b/paimon-python/pypaimon/ray/__init__.py index a1234f6142de..9161f3cbb3b7 100644 --- a/paimon-python/pypaimon/ray/__init__.py +++ b/paimon-python/pypaimon/ray/__init__.py @@ -16,6 +16,16 @@ # under the License. from pypaimon.ray.ray_paimon import read_paimon, write_paimon -from pypaimon.ray.data_evolution_merge_into import merge_into +from pypaimon.ray.data_evolution_merge_into import ( + WhenMatched, + WhenNotMatched, + merge_into, +) -__all__ = ["read_paimon", "write_paimon", "merge_into"] +__all__ = [ + "read_paimon", + "write_paimon", + "merge_into", + "WhenMatched", + "WhenNotMatched", +] diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index e058f56ed3df..518d69ee50bb 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -18,7 +18,8 @@ """MERGE INTO ... USING ... 
for Paimon data-evolution tables via Ray Datasets.""" -from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union import pyarrow as pa @@ -26,23 +27,35 @@ Condition = Callable[[Mapping[str, Any]], bool] +@dataclass +class WhenMatched: + update: SetSpec + condition: Optional[Condition] = None + + +@dataclass +class WhenNotMatched: + insert: SetSpec + condition: Optional[Condition] = None + + def merge_into( target: str, source: Any, catalog_options: Dict[str, str], *, on: Sequence[str], - when_matched_update: Optional[SetSpec] = None, - when_matched_update_condition: Optional[Condition] = None, - when_not_matched_insert: Optional[SetSpec] = None, - when_not_matched_insert_condition: Optional[Condition] = None, + merge_condition: Optional[Condition] = None, + when_matched: Sequence[WhenMatched] = (), + when_not_matched: Sequence[WhenNotMatched] = (), ray_remote_args: Optional[Dict[str, Any]] = None, concurrency: Optional[int] = None, ) -> None: - if when_matched_update is None and when_not_matched_insert is None: + when_matched = list(when_matched) + when_not_matched = list(when_not_matched) + if not when_matched and not when_not_matched: raise ValueError( - "At least one of when_matched_update or when_not_matched_insert " - "must be provided." + "At least one of when_matched or when_not_matched must be non-empty." 
) from pypaimon.catalog.catalog_factory import CatalogFactory @@ -59,10 +72,20 @@ def merge_into( ) target_field_names = list(table.field_names) - matched_update = _normalize_set_spec(when_matched_update, target_field_names) - not_matched_insert = _normalize_set_spec( - when_not_matched_insert, target_field_names - ) + matched_specs = [ + _NormalizedClause( + spec=_normalize_set_spec(c.update, target_field_names), + condition=c.condition, + ) + for c in when_matched + ] + not_matched_specs = [ + _NormalizedClause( + spec=_normalize_set_spec(c.insert, target_field_names), + condition=c.condition, + ) + for c in when_not_matched + ] source_ds = _normalize_source(source, catalog_options) _validate_source_on_cols(source_ds, on) @@ -73,43 +96,49 @@ def merge_into( table.table_schema.fields ) - if matched_update is not None: + if matched_specs: _do_matched_update( target_table=table, target_identifier=target, source_ds=source_ds, on=list(on), + merge_condition=merge_condition, + clauses=matched_specs, target_field_names=target_field_names, target_pa_schema=target_pa_schema, - spec=matched_update, - condition=when_matched_update_condition, catalog_options=catalog_options, ) - if not_matched_insert is not None: + if not_matched_specs: _do_not_matched_insert( target_identifier=target, source_ds=source_ds, on=list(on), + merge_condition=merge_condition, + clauses=not_matched_specs, target_field_names=target_field_names, target_pa_schema=target_pa_schema, - spec=not_matched_insert, - condition=when_not_matched_insert_condition, catalog_options=catalog_options, ray_remote_args=ray_remote_args, concurrency=concurrency, ) +@dataclass +class _NormalizedClause: + spec: Dict[str, Any] + condition: Optional[Condition] + + def _do_not_matched_insert( *, target_identifier: str, source_ds, on: Sequence[str], + merge_condition: Optional[Condition], + clauses: List[_NormalizedClause], target_field_names: Sequence[str], target_pa_schema: pa.Schema, - spec: Dict[str, Any], - condition: 
Optional[Condition], catalog_options: Dict[str, str], ray_remote_args: Optional[Dict[str, Any]], concurrency: Optional[int], @@ -117,31 +146,54 @@ def _do_not_matched_insert( from pypaimon.ray.ray_paimon import read_paimon, write_paimon from pypaimon.ray.shuffle import _coerce_large_string_types - target_on_ds = read_paimon( - target_identifier, catalog_options, projection=list(on) - ) - target_keys = set() - for batch in target_on_ds.iter_batches(batch_format="pyarrow"): - cols = [batch.column(c).to_pylist() for c in on] - for tup in zip(*cols): - target_keys.add(tup) + needs_full_target = merge_condition is not None + if needs_full_target: + target_ds = read_paimon(target_identifier, catalog_options) + target_by_key: Dict[tuple, List[Dict[str, Any]]] = {} + for batch in target_ds.iter_batches(batch_format="pyarrow"): + for row in batch.to_pylist(): + key = tuple(row.get(c) for c in on) + target_by_key.setdefault(key, []).append(row) + else: + target_on_ds = read_paimon( + target_identifier, catalog_options, projection=list(on) + ) + target_keys: set = set() + for batch in target_on_ds.iter_batches(batch_format="pyarrow"): + cols = [batch.column(c).to_pylist() for c in on] + for tup in zip(*cols): + target_keys.add(tup) on_list = list(on) field_names = list(target_field_names) - insert_spec = spec - insert_cond = condition out_schema = target_pa_schema + captured_clauses = clauses + captured_merge_cond = merge_condition + + def _is_matched(s_row: Dict[str, Any]) -> bool: + key = tuple(s_row.get(c) for c in on_list) + if needs_full_target: + t_rows = target_by_key.get(key) + if not t_rows: + return False + for t_row in t_rows: + if captured_merge_cond(_prefixed(s_row, t_row)): + return True + return False + return key in target_keys def _transform(batch: pa.Table) -> pa.Table: rows = batch.to_pylist() out = [] for s_row in rows: - key = tuple(s_row.get(c) for c in on_list) - if key in target_keys: - continue - if insert_cond is not None and not 
insert_cond(_prefixed(s_row, None)): + if _is_matched(s_row): continue - out.append(_apply_set(insert_spec, s_row, None, field_names)) + for clause in captured_clauses: + cond = clause.condition + if cond is not None and not cond(_prefixed(s_row, None)): + continue + out.append(_apply_set(clause.spec, s_row, None, field_names)) + break aligned = [{name: r.get(name) for name in field_names} for r in out] result = pa.Table.from_pylist(aligned, schema=out_schema) return _coerce_large_string_types(result) @@ -162,46 +214,60 @@ def _do_matched_update( target_identifier: str, source_ds, on: Sequence[str], + merge_condition: Optional[Condition], + clauses: List[_NormalizedClause], target_field_names: Sequence[str], target_pa_schema: pa.Schema, - spec: Dict[str, Any], - condition: Optional[Condition], catalog_options: Dict[str, str], ) -> None: from pypaimon.ray.ray_paimon import read_paimon from pypaimon.table.special_fields import SpecialFields row_id_name = SpecialFields.ROW_ID.name - update_cols = list(spec.keys()) - needed_cols = _needed_target_cols(spec, on, update_cols, target_field_names, condition) + update_cols_union = _union_update_cols(clauses) + needs_full_target = merge_condition is not None or any( + c.condition is not None for c in clauses + ) + if needs_full_target: + needed_cols = list(target_field_names) + else: + needed_cols = _needed_target_cols( + clauses, on, update_cols_union, target_field_names + ) projection = [row_id_name] + [c for c in needed_cols if c != row_id_name] target_ds = read_paimon(target_identifier, catalog_options, projection=projection) - target_by_key: Dict[tuple, Dict[str, Any]] = {} + target_by_key: Dict[tuple, List[Dict[str, Any]]] = {} for batch in target_ds.iter_batches(batch_format="pyarrow"): for row in batch.to_pylist(): key = tuple(row.get(c) for c in on) - target_by_key[key] = row + target_by_key.setdefault(key, []).append(row) if not target_by_key: return field_names = list(target_field_names) - output_row_ids: list = [] 
- output_cols: Dict[str, list] = {c: [] for c in update_cols} + output_row_ids: List[Any] = [] + output_cols: Dict[str, list] = {c: [] for c in update_cols_union} for batch in source_ds.iter_batches(batch_format="pyarrow"): for s_row in batch.to_pylist(): key = tuple(s_row.get(c) for c in on) - t_row = target_by_key.get(key) - if t_row is None: + t_rows = target_by_key.get(key) + if not t_rows: continue - if condition is not None and not condition(_prefixed(s_row, t_row)): - continue - new_values = _apply_set(spec, s_row, t_row, field_names) - output_row_ids.append(t_row[row_id_name]) - for col in update_cols: - output_cols[col].append(new_values[col]) + for t_row in t_rows: + combined = _prefixed(s_row, t_row) + if merge_condition is not None and not merge_condition(combined): + continue + for clause in clauses: + if clause.condition is not None and not clause.condition(combined): + continue + new_values = _apply_set(clause.spec, s_row, t_row, field_names) + output_row_ids.append(t_row[row_id_name]) + for col in update_cols_union: + output_cols[col].append(new_values.get(col, t_row.get(col))) + break if not output_row_ids: return @@ -209,42 +275,49 @@ def _do_matched_update( pydict = {row_id_name: output_row_ids} pydict.update(output_cols) schema_fields = [pa.field(row_id_name, pa.int64(), nullable=False)] - for col in update_cols: + for col in update_cols_union: schema_fields.append(target_pa_schema.field(col)) update_table = pa.Table.from_pydict(pydict, schema=pa.schema(schema_fields)) wb = target_table.new_batch_write_builder() - tu = wb.new_update().with_update_type(update_cols) + tu = wb.new_update().with_update_type(update_cols_union) msgs = tu.update_by_arrow_with_row_id(update_table) tc = wb.new_commit() tc.commit(msgs) tc.close() +def _union_update_cols(clauses: List[_NormalizedClause]) -> List[str]: + seen: List[str] = [] + seen_set: set = set() + for clause in clauses: + for col in clause.spec.keys(): + if col not in seen_set: + seen.append(col) + 
seen_set.add(col) + return seen + + def _needed_target_cols( - spec: Dict[str, Any], + clauses: List[_NormalizedClause], on: Sequence[str], update_cols: Sequence[str], all_target_cols: Sequence[str], - condition: Optional[Condition], ) -> list: - if condition is not None: - return list(all_target_cols) needed = set(on) | set(update_cols) - for value in spec.values(): - if callable(value): - return list(all_target_cols) - if isinstance(value, str) and value.startswith("t."): - needed.add(value[2:]) + for clause in clauses: + for value in clause.spec.values(): + if callable(value): + return list(all_target_cols) + if isinstance(value, str) and value.startswith("t."): + needed.add(value[2:]) return [c for c in all_target_cols if c in needed] def _normalize_set_spec( - spec: Optional[SetSpec], + spec: SetSpec, target_field_names: Sequence[str], -) -> Optional[Dict[str, Any]]: - if spec is None: - return None +) -> Dict[str, Any]: if isinstance(spec, str): if spec != "*": raise ValueError( diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index 46bee1489b08..0051d40e8b41 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -26,7 +26,7 @@ import ray from pypaimon import CatalogFactory, Schema -from pypaimon.ray import merge_into +from pypaimon.ray import WhenMatched, WhenNotMatched, merge_into class RayDataEvolutionMergeIntoTest(unittest.TestCase): @@ -110,7 +110,7 @@ def test_non_de_table_rejected(self): source=self._source(), catalog_options=self.catalog_options, on=['id'], - when_matched_update='*', + when_matched=[WhenMatched(update='*')], ) self.assertIn('data-evolution.enabled', str(ctx.exception)) @@ -122,7 +122,7 @@ def test_no_row_tracking_rejected(self): source=self._source(), catalog_options=self.catalog_options, on=['id'], - when_matched_update='*', + 
when_matched=[WhenMatched(update='*')], ) self.assertIn('row-tracking.enabled', str(ctx.exception)) @@ -138,7 +138,7 @@ def test_source_missing_on_col_raises(self): source=bad_source, catalog_options=self.catalog_options, on=['id'], - when_matched_update='*', + when_matched=[WhenMatched(update='*')], ) self.assertIn("'id'", str(ctx.exception)) @@ -170,7 +170,7 @@ def test_matched_update_star(self): source=source, catalog_options=self.catalog_options, on=['id'], - when_matched_update='*', + when_matched=[WhenMatched(update='*')], ) out = self._read_sorted(target) @@ -206,7 +206,7 @@ def test_matched_update_dict(self): source=source, catalog_options=self.catalog_options, on=['id'], - when_matched_update={'age': 's.age'}, + when_matched=[WhenMatched(update={'age': 's.age'})], ) out = self._read_sorted(target) @@ -242,23 +242,27 @@ def test_matched_update_with_condition(self): source=source, catalog_options=self.catalog_options, on=['id'], - when_matched_update={'age': 's.age'}, - when_matched_update_condition=lambda r: r['s.age'] > r['t.age'], + when_matched=[ + WhenMatched( + update={'age': 's.age'}, + condition=lambda r: r['s.age'] > r['t.age'], + ), + ], ) out = self._read_sorted(target) self.assertEqual(out['id'], [1, 2, 3]) self.assertEqual(out['age'], [10, 100, 50]) - def test_combined_update_and_insert(self): + def test_matched_multiple_clauses_first_match_wins(self): target = self._create_table() self._write( target, pa.Table.from_pydict( { - 'id': pa.array([1, 2], type=pa.int32()), - 'name': ['a', 'b'], - 'age': pa.array([10, 20], type=pa.int32()), + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': ['a', 'b', 'c'], + 'age': pa.array([10, 20, 30], type=pa.int32()), }, schema=self.pa_schema, ), @@ -266,9 +270,9 @@ def test_combined_update_and_insert(self): source = pa.Table.from_pydict( { - 'id': pa.array([2, 3], type=pa.int32()), - 'name': ['b2', 'c'], - 'age': pa.array([22, 30], type=pa.int32()), + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': 
['s1', 's2', 's3'], + 'age': pa.array([5, 25, 100], type=pa.int32()), }, schema=self.pa_schema, ) @@ -278,14 +282,18 @@ def test_combined_update_and_insert(self): source=source, catalog_options=self.catalog_options, on=['id'], - when_matched_update='*', - when_not_matched_insert='*', + when_matched=[ + WhenMatched( + update={'age': 1}, + condition=lambda r: r['s.age'] < r['t.age'], + ), + WhenMatched(update={'age': 999}), + ], ) out = self._read_sorted(target) self.assertEqual(out['id'], [1, 2, 3]) - self.assertEqual(out['name'], ['a', 'b2', 'c']) - self.assertEqual(out['age'], [10, 22, 30]) + self.assertEqual(out['age'], [1, 999, 999]) def test_not_matched_insert_appends_unmatched(self): target = self._create_table() @@ -315,7 +323,7 @@ def test_not_matched_insert_appends_unmatched(self): source=source, catalog_options=self.catalog_options, on=['id'], - when_not_matched_insert='*', + when_not_matched=[WhenNotMatched(insert='*')], ) out = self._read_sorted(target) @@ -351,14 +359,135 @@ def test_not_matched_insert_with_condition(self): source=source, catalog_options=self.catalog_options, on=['id'], - when_not_matched_insert='*', - when_not_matched_insert_condition=lambda r: r['s.age'] >= 50, + when_not_matched=[ + WhenNotMatched( + insert='*', + condition=lambda r: r['s.age'] >= 50, + ), + ], ) out = self._read_sorted(target) self.assertEqual(out['id'], [1, 3, 4]) self.assertEqual(out['age'], [10, 50, 100]) + def test_not_matched_multiple_clauses_first_match_wins(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['a'], + 'age': pa.array([10], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([2, 3], type=pa.int32()), + 'name': ['b', 'c'], + 'age': pa.array([5, 99], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + 
on=['id'], + when_not_matched=[ + WhenNotMatched( + insert={'id': 's.id', 'name': 'small', 'age': 1}, + condition=lambda r: r['s.age'] < 10, + ), + WhenNotMatched(insert={'id': 's.id', 'name': 'big', 'age': 2}), + ], + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2, 3]) + self.assertEqual(out['name'], ['a', 'small', 'big']) + self.assertEqual(out['age'], [10, 1, 2]) + + def test_merge_condition_residual_predicate(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['a', 'b'], + 'age': pa.array([10, 20], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['a2', 'b2'], + 'age': pa.array([100, 5], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + merge_condition=lambda r: r['s.age'] > r['t.age'], + when_matched=[WhenMatched(update={'name': 's.name'})], + when_not_matched=[WhenNotMatched(insert='*')], + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2]) + self.assertEqual(out['name'], ['a2', 'b2']) + self.assertEqual(out['age'], [10, 5]) + + def test_combined_update_and_insert(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['a', 'b'], + 'age': pa.array([10, 20], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([2, 3], type=pa.int32()), + 'name': ['b2', 'c'], + 'age': pa.array([22, 30], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched=[WhenMatched(update='*')], + when_not_matched=[WhenNotMatched(insert='*')], + ) + + out = self._read_sorted(target) 
+ self.assertEqual(out['id'], [1, 2, 3]) + self.assertEqual(out['name'], ['a', 'b2', 'c']) + self.assertEqual(out['age'], [10, 22, 30]) + if __name__ == '__main__': unittest.main() From ccd9c9d26d0453bc3ce40fd12342703119ad9db1 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 00:16:11 +0800 Subject: [PATCH 09/32] [python] revert unused shuffle helper extraction --- paimon-python/pypaimon/ray/shuffle.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/paimon-python/pypaimon/ray/shuffle.py b/paimon-python/pypaimon/ray/shuffle.py index 8f8f412d4729..b17f7a7ab1c4 100644 --- a/paimon-python/pypaimon/ray/shuffle.py +++ b/paimon-python/pypaimon/ray/shuffle.py @@ -53,23 +53,17 @@ BUCKET_KEY_COL = "__paimon_bucket__" -def _pick_collision_safe_col_name(existing_names, base: str) -> str: - """Return a column name guaranteed not to collide with ``existing_names``. - - Prefer ``base`` itself; on collision, append a short uuid suffix. - """ - if base not in existing_names: - return base +def _pick_bucket_col_name(existing_names) -> str: + """Return a bucket column name guaranteed not to collide with + ``existing_names``. 
Falls back to a UUID suffix on collision.""" + if BUCKET_KEY_COL not in existing_names: + return BUCKET_KEY_COL while True: - candidate = "{}_{}_".format(base.rstrip("_"), uuid.uuid4().hex[:8]) + candidate = "__paimon_bucket_{}_".format(uuid.uuid4().hex[:8]) if candidate not in existing_names: return candidate -def _pick_bucket_col_name(existing_names) -> str: - return _pick_collision_safe_col_name(existing_names, BUCKET_KEY_COL) - - def maybe_apply_repartition( dataset: "ray.data.Dataset", table: "Table", From 81438c5ce30c1c76824b5e54343b15c69a86dcf8 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 01:09:10 +0800 Subject: [PATCH 10/32] [python] refactor ray merge_into execution path --- paimon-python/dev/requirements-dev.txt | 5 +- .../pypaimon/ray/data_evolution_merge_into.py | 546 +++++++++++++----- .../ray_data_evolution_merge_into_test.py | 225 +++++++- paimon-python/setup.py | 2 +- 4 files changed, 636 insertions(+), 142 deletions(-) diff --git a/paimon-python/dev/requirements-dev.txt b/paimon-python/dev/requirements-dev.txt index d4e9a0645b17..9ef88817f726 100644 --- a/paimon-python/dev/requirements-dev.txt +++ b/paimon-python/dev/requirements-dev.txt @@ -21,8 +21,9 @@ duckdb==1.3.2 flake8==4.0.1 pytest~=7.0 -# Ray: 2.48+ has no wheel for Python 3.8; use 2.10.0 on 3.8, 2.48.0 on 3.9+ -ray>=2.10.0 +# merge_into needs Dataset.join (added in Ray 2.50). Python 3.8 has no 2.50 wheel. +ray>=2.10.0; python_version < "3.9" +ray>=2.50.0; python_version >= "3.9" requests parameterized # Vortex 0.71.0 regresses native predicate pushdown on single-row files. diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 518d69ee50bb..62d4f4ec32bb 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -18,13 +18,24 @@ """MERGE INTO ... USING ... 
for Paimon data-evolution tables via Ray Datasets.""" -from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union +from dataclasses import dataclass +from typing import ( + Any, + Callable, + Dict, + List, + Mapping, + Optional, + Sequence, + Tuple, + Union, +) import pyarrow as pa SetSpec = Union[str, Dict[str, Any]] Condition = Callable[[Mapping[str, Any]], bool] +OnSpec = Union[Sequence[str], Mapping[str, str]] @dataclass @@ -39,15 +50,22 @@ class WhenNotMatched: condition: Optional[Condition] = None +@dataclass +class _NormalizedClause: + spec: Dict[str, Any] + condition: Optional[Condition] + + def merge_into( target: str, source: Any, catalog_options: Dict[str, str], *, - on: Sequence[str], + on: OnSpec, merge_condition: Optional[Condition] = None, when_matched: Sequence[WhenMatched] = (), when_not_matched: Sequence[WhenNotMatched] = (), + num_partitions: int = 16, ray_remote_args: Optional[Dict[str, Any]] = None, concurrency: Optional[int] = None, ) -> None: @@ -58,6 +76,8 @@ def merge_into( "At least one of when_matched or when_not_matched must be non-empty." 
) + target_on_cols, source_on_cols = _normalize_on(on) + from pypaimon.catalog.catalog_factory import CatalogFactory catalog = CatalogFactory.create(catalog_options) @@ -88,203 +108,453 @@ def merge_into( ] source_ds = _normalize_source(source, catalog_options) - _validate_source_on_cols(source_ds, on) + _validate_source_on_cols(source_ds, source_on_cols) from pypaimon.schema.data_types import PyarrowFieldParser target_pa_schema = PyarrowFieldParser.from_paimon_schema( table.table_schema.fields ) + base_snapshot = table.snapshot_manager().get_latest_snapshot() + is_self_merge = isinstance(source, str) and source == target + update_arrow: Optional[pa.Table] = None + update_cols_union: List[str] = [] + write_update_cols: List[str] = [] if matched_specs: - _do_matched_update( - target_table=table, - target_identifier=target, - source_ds=source_ds, - on=list(on), - merge_condition=merge_condition, - clauses=matched_specs, - target_field_names=target_field_names, - target_pa_schema=target_pa_schema, - catalog_options=catalog_options, - ) + update_cols_union = _union_update_cols(matched_specs) + if base_snapshot is not None: + _check_global_index_collision(table, base_snapshot, update_cols_union) + write_update_cols = [c for c in update_cols_union if c not in target_on_cols] + if write_update_cols: + update_arrow = _compute_matched_update( + target_identifier=target, + source_ds=source_ds, + target_on=target_on_cols, + source_on=source_on_cols, + merge_condition=merge_condition, + clauses=matched_specs, + target_field_names=target_field_names, + target_pa_schema=target_pa_schema, + update_cols=write_update_cols, + catalog_options=catalog_options, + is_self_merge=is_self_merge, + num_partitions=num_partitions, + ) - if not_matched_specs: - _do_not_matched_insert( + insert_arrow: Optional[pa.Table] = None + if not_matched_specs and not is_self_merge: + matched_keys_override: Optional[set] = None + if merge_condition is not None: + matched_keys_override = 
_compute_matched_source_keys( + target_identifier=target, + source_ds=source_ds, + target_on=target_on_cols, + source_on=source_on_cols, + merge_condition=merge_condition, + catalog_options=catalog_options, + num_partitions=num_partitions, + ) + insert_arrow = _compute_not_matched_insert( target_identifier=target, source_ds=source_ds, - on=list(on), - merge_condition=merge_condition, + target_on=target_on_cols, + source_on=source_on_cols, clauses=not_matched_specs, target_field_names=target_field_names, target_pa_schema=target_pa_schema, catalog_options=catalog_options, - ray_remote_args=ray_remote_args, - concurrency=concurrency, + num_partitions=num_partitions, + matched_keys_override=matched_keys_override, ) - -@dataclass -class _NormalizedClause: - spec: Dict[str, Any] - condition: Optional[Condition] + wb = table.new_batch_write_builder() + all_msgs: list = [] + if update_arrow is not None and update_arrow.num_rows > 0: + tu = wb.new_update().with_update_type(write_update_cols) + all_msgs.extend(tu.update_by_arrow_with_row_id(update_arrow)) + if insert_arrow is not None and insert_arrow.num_rows > 0: + tw = wb.new_write() + tw.write_arrow(insert_arrow) + all_msgs.extend(tw.prepare_commit()) + tw.close() + if all_msgs: + tc = wb.new_commit() + tc.commit(all_msgs) + tc.close() + + +def _normalize_on(on: OnSpec) -> Tuple[List[str], List[str]]: + if isinstance(on, Mapping): + target_cols = list(on.keys()) + source_cols = list(on.values()) + else: + target_cols = list(on) + source_cols = list(on) + if not target_cols: + raise ValueError("'on' must be non-empty.") + return target_cols, source_cols -def _do_not_matched_insert( +def _compute_matched_update( *, target_identifier: str, source_ds, - on: Sequence[str], + target_on: Sequence[str], + source_on: Sequence[str], merge_condition: Optional[Condition], clauses: List[_NormalizedClause], target_field_names: Sequence[str], target_pa_schema: pa.Schema, + update_cols: Sequence[str], catalog_options: Dict[str, str], - 
ray_remote_args: Optional[Dict[str, Any]], - concurrency: Optional[int], -) -> None: - from pypaimon.ray.ray_paimon import read_paimon, write_paimon - from pypaimon.ray.shuffle import _coerce_large_string_types + is_self_merge: bool, + num_partitions: int, +) -> Optional[pa.Table]: + from pypaimon.ray.ray_paimon import read_paimon + from pypaimon.table.special_fields import SpecialFields - needs_full_target = merge_condition is not None - if needs_full_target: - target_ds = read_paimon(target_identifier, catalog_options) - target_by_key: Dict[tuple, List[Dict[str, Any]]] = {} - for batch in target_ds.iter_batches(batch_format="pyarrow"): - for row in batch.to_pylist(): - key = tuple(row.get(c) for c in on) - target_by_key.setdefault(key, []).append(row) + row_id_name = SpecialFields.ROW_ID.name + needs_full = merge_condition is not None or any( + c.condition is not None for c in clauses + ) + if needs_full: + needed_cols = list(target_field_names) else: - target_on_ds = read_paimon( - target_identifier, catalog_options, projection=list(on) + needed_cols = _needed_target_cols( + clauses, target_on, update_cols, target_field_names ) - target_keys: set = set() - for batch in target_on_ds.iter_batches(batch_format="pyarrow"): - cols = [batch.column(c).to_pylist() for c in on] - for tup in zip(*cols): - target_keys.add(tup) - - on_list = list(on) - field_names = list(target_field_names) - out_schema = target_pa_schema + projection = [row_id_name] + [c for c in needed_cols if c != row_id_name] + + target_ds = read_paimon(target_identifier, catalog_options, projection=projection) + + if is_self_merge: + return _materialize_self_merge_update( + target_ds=target_ds, + merge_condition=merge_condition, + clauses=clauses, + target_field_names=target_field_names, + target_pa_schema=target_pa_schema, + update_cols=update_cols, + row_id_name=row_id_name, + ) + + target_renamed = target_ds.rename_columns( + {c: f"t.{c}" for c in target_ds.schema().names} + ) + source_schema = 
source_ds.schema() + source_cols = list(source_schema.names) if source_schema is not None else list(source_on) + source_renamed = source_ds.rename_columns({c: f"s.{c}" for c in source_cols}) + + joined = target_renamed.join( + source_renamed, + join_type="inner", + num_partitions=num_partitions, + on=tuple(f"t.{c}" for c in target_on), + right_on=tuple(f"s.{c}" for c in source_on), + ) + captured_clauses = clauses captured_merge_cond = merge_condition - - def _is_matched(s_row: Dict[str, Any]) -> bool: - key = tuple(s_row.get(c) for c in on_list) - if needs_full_target: - t_rows = target_by_key.get(key) - if not t_rows: - return False - for t_row in t_rows: - if captured_merge_cond(_prefixed(s_row, t_row)): - return True - return False - return key in target_keys + captured_update_cols = list(update_cols) + captured_field_names = list(target_field_names) + captured_row_id_name = row_id_name + captured_on_pairs = list(zip(source_on, target_on)) def _transform(batch: pa.Table) -> pa.Table: rows = batch.to_pylist() - out = [] - for s_row in rows: - if _is_matched(s_row): + out_row_ids: list = [] + out_cols: Dict[str, list] = {c: [] for c in captured_update_cols} + for row in rows: + s_row = {k[2:]: v for k, v in row.items() if k.startswith("s.")} + t_row = {k[2:]: v for k, v in row.items() if k.startswith("t.")} + for s_key, t_key in captured_on_pairs: + if s_key not in s_row and t_key in t_row: + s_row[s_key] = t_row[t_key] + combined = _prefixed(s_row, t_row) + if captured_merge_cond is not None and not captured_merge_cond(combined): continue for clause in captured_clauses: - cond = clause.condition - if cond is not None and not cond(_prefixed(s_row, None)): + if clause.condition is not None and not clause.condition(combined): continue - out.append(_apply_set(clause.spec, s_row, None, field_names)) + new_values = _apply_set( + clause.spec, s_row, t_row, captured_field_names + ) + out_row_ids.append(t_row[captured_row_id_name]) + for col in captured_update_cols: + 
out_cols[col].append(new_values.get(col, t_row.get(col))) break - aligned = [{name: r.get(name) for name in field_names} for r in out] - result = pa.Table.from_pylist(aligned, schema=out_schema) - return _coerce_large_string_types(result) - - transformed = source_ds.map_batches(_transform, batch_format="pyarrow") - write_paimon( - transformed, - target_identifier, - catalog_options, - ray_remote_args=ray_remote_args, - concurrency=concurrency, + return pa.Table.from_pydict( + {captured_row_id_name: out_row_ids, **out_cols} + ) + + transformed = joined.map_batches(_transform, batch_format="pyarrow") + batches = [b for b in transformed.iter_batches(batch_format="pyarrow") if b.num_rows > 0] + if not batches: + return None + combined_table = pa.concat_tables(batches) + + _check_cardinality(combined_table, row_id_name) + return _cast_update_arrow( + combined_table, target_pa_schema, update_cols, row_id_name ) -def _do_matched_update( +def _materialize_self_merge_update( *, - target_table, - target_identifier: str, - source_ds, - on: Sequence[str], + target_ds, merge_condition: Optional[Condition], clauses: List[_NormalizedClause], target_field_names: Sequence[str], target_pa_schema: pa.Schema, + update_cols: Sequence[str], + row_id_name: str, +) -> Optional[pa.Table]: + captured_clauses = clauses + captured_merge_cond = merge_condition + captured_update_cols = list(update_cols) + captured_field_names = list(target_field_names) + + out_row_ids: list = [] + out_cols: Dict[str, list] = {c: [] for c in captured_update_cols} + for batch in target_ds.iter_batches(batch_format="pyarrow"): + for row in batch.to_pylist(): + s_row = dict(row) + t_row = dict(row) + combined = _prefixed(s_row, t_row) + if captured_merge_cond is not None and not captured_merge_cond(combined): + continue + for clause in captured_clauses: + if clause.condition is not None and not clause.condition(combined): + continue + new_values = _apply_set( + clause.spec, s_row, t_row, captured_field_names + ) + 
out_row_ids.append(t_row[row_id_name]) + for col in captured_update_cols: + out_cols[col].append(new_values.get(col, t_row.get(col))) + break + + if not out_row_ids: + return None + combined_table = pa.Table.from_pydict({row_id_name: out_row_ids, **out_cols}) + _check_cardinality(combined_table, row_id_name) + return _cast_update_arrow( + combined_table, target_pa_schema, update_cols, row_id_name + ) + + +def _compute_matched_source_keys( + *, + target_identifier: str, + source_ds, + target_on: Sequence[str], + source_on: Sequence[str], + merge_condition: Condition, catalog_options: Dict[str, str], -) -> None: + num_partitions: int, +) -> set: from pypaimon.ray.ray_paimon import read_paimon - from pypaimon.table.special_fields import SpecialFields - row_id_name = SpecialFields.ROW_ID.name - update_cols_union = _union_update_cols(clauses) - needs_full_target = merge_condition is not None or any( - c.condition is not None for c in clauses + target_ds = read_paimon(target_identifier, catalog_options) + target_renamed = target_ds.rename_columns( + {c: f"t.{c}" for c in target_ds.schema().names} + ) + source_schema = source_ds.schema() + source_cols = list(source_schema.names) if source_schema is not None else list(source_on) + source_renamed = source_ds.rename_columns({c: f"s.{c}" for c in source_cols}) + + joined = target_renamed.join( + source_renamed, + join_type="inner", + num_partitions=num_partitions, + on=tuple(f"t.{c}" for c in target_on), + right_on=tuple(f"s.{c}" for c in source_on), ) - if needs_full_target: - needed_cols = list(target_field_names) - else: - needed_cols = _needed_target_cols( - clauses, on, update_cols_union, target_field_names - ) - projection = [row_id_name] + [c for c in needed_cols if c != row_id_name] - target_ds = read_paimon(target_identifier, catalog_options, projection=projection) - target_by_key: Dict[tuple, List[Dict[str, Any]]] = {} - for batch in target_ds.iter_batches(batch_format="pyarrow"): + on_pairs = list(zip(source_on, 
target_on)) + captured_merge_cond = merge_condition + captured_source_on = list(source_on) + + def _emit_matched_keys(batch: pa.Table) -> pa.Table: + out_cols: Dict[str, list] = {c: [] for c in captured_source_on} for row in batch.to_pylist(): - key = tuple(row.get(c) for c in on) - target_by_key.setdefault(key, []).append(row) + s_row = {k[2:]: v for k, v in row.items() if k.startswith("s.")} + t_row = {k[2:]: v for k, v in row.items() if k.startswith("t.")} + for sk, tk in on_pairs: + if sk not in s_row and tk in t_row: + s_row[sk] = t_row[tk] + combined = _prefixed(s_row, t_row) + if captured_merge_cond(combined): + for c in captured_source_on: + out_cols[c].append(s_row.get(c)) + return pa.Table.from_pydict(out_cols) + + matched_ds = joined.map_batches(_emit_matched_keys, batch_format="pyarrow") + matched_keys: set = set() + for batch in matched_ds.iter_batches(batch_format="pyarrow"): + if batch.num_rows == 0: + continue + cols = [batch.column(c).to_pylist() for c in source_on] + for tup in zip(*cols): + matched_keys.add(tup) + return matched_keys + + +def _compute_not_matched_insert( + *, + target_identifier: str, + source_ds, + target_on: Sequence[str], + source_on: Sequence[str], + clauses: List[_NormalizedClause], + target_field_names: Sequence[str], + target_pa_schema: pa.Schema, + catalog_options: Dict[str, str], + num_partitions: int, + matched_keys_override: Optional[set] = None, +) -> Optional[pa.Table]: + from pypaimon.ray.ray_paimon import read_paimon + from pypaimon.ray.shuffle import _coerce_large_string_types - if not target_by_key: - return + captured_clauses = clauses + captured_field_names = list(target_field_names) + out_schema = target_pa_schema - field_names = list(target_field_names) - output_row_ids: List[Any] = [] - output_cols: Dict[str, list] = {c: [] for c in update_cols_union} + if matched_keys_override is not None: + captured_keys = matched_keys_override + captured_source_on = list(source_on) - for batch in 
source_ds.iter_batches(batch_format="pyarrow"): - for s_row in batch.to_pylist(): - key = tuple(s_row.get(c) for c in on) - t_rows = target_by_key.get(key) - if not t_rows: - continue - for t_row in t_rows: - combined = _prefixed(s_row, t_row) - if merge_condition is not None and not merge_condition(combined): + def _filter_and_apply(batch: pa.Table) -> pa.Table: + rows = batch.to_pylist() + out = [] + for s_row in rows: + key = tuple(s_row.get(c) for c in captured_source_on) + if key in captured_keys: continue - for clause in clauses: + combined = _prefixed(s_row, None) + for clause in captured_clauses: if clause.condition is not None and not clause.condition(combined): continue - new_values = _apply_set(clause.spec, s_row, t_row, field_names) - output_row_ids.append(t_row[row_id_name]) - for col in update_cols_union: - output_cols[col].append(new_values.get(col, t_row.get(col))) + out.append( + _apply_set( + clause.spec, + s_row, + None, + captured_field_names, + null_unspecified=True, + ) + ) break + aligned = [{name: r.get(name) for name in captured_field_names} for r in out] + return pa.Table.from_pylist(aligned, schema=out_schema) + + transformed = source_ds.map_batches(_filter_and_apply, batch_format="pyarrow") + batches = [b for b in transformed.iter_batches(batch_format="pyarrow") if b.num_rows > 0] + if not batches: + return None + return _coerce_large_string_types(pa.concat_tables(batches)) + + target_ds = read_paimon( + target_identifier, catalog_options, projection=list(target_on) + ) + target_renamed = target_ds.rename_columns( + {c: f"t.{c}" for c in target_on} + ) + source_schema = source_ds.schema() + source_cols = list(source_schema.names) if source_schema is not None else list(source_on) + source_renamed = source_ds.rename_columns({c: f"s.{c}" for c in source_cols}) + + unmatched = source_renamed.join( + target_renamed, + join_type="left_anti", + num_partitions=num_partitions, + on=tuple(f"s.{c}" for c in source_on), + right_on=tuple(f"t.{c}" for 
c in target_on), + ) + + def _transform(batch: pa.Table) -> pa.Table: + rows = batch.to_pylist() + out = [] + for row in rows: + s_row = {k[2:]: v for k, v in row.items() if k.startswith("s.")} + combined = _prefixed(s_row, None) + for clause in captured_clauses: + if clause.condition is not None and not clause.condition(combined): + continue + out.append( + _apply_set( + clause.spec, + s_row, + None, + captured_field_names, + null_unspecified=True, + ) + ) + break + aligned = [{name: r.get(name) for name in captured_field_names} for r in out] + return pa.Table.from_pylist(aligned, schema=out_schema) - if not output_row_ids: + transformed = unmatched.map_batches(_transform, batch_format="pyarrow") + batches = [b for b in transformed.iter_batches(batch_format="pyarrow") if b.num_rows > 0] + if not batches: + return None + return _coerce_large_string_types(pa.concat_tables(batches)) + + +def _check_cardinality(update_table: pa.Table, row_id_name: str) -> None: + row_ids = update_table.column(row_id_name).to_pylist() + if len(set(row_ids)) == len(row_ids): return + seen: set = set() + dupes: set = set() + for rid in row_ids: + if rid in seen: + dupes.add(rid) + seen.add(rid) + raise ValueError( + f"MERGE INTO matched the same target _ROW_IDs {sorted(dupes)[:5]} " + f"via multiple source rows; source must be unique on the join keys." 
+ ) + - pydict = {row_id_name: output_row_ids} - pydict.update(output_cols) +def _cast_update_arrow( + update_table: pa.Table, + target_pa_schema: pa.Schema, + update_cols: Sequence[str], + row_id_name: str, +) -> pa.Table: schema_fields = [pa.field(row_id_name, pa.int64(), nullable=False)] - for col in update_cols_union: + for col in update_cols: schema_fields.append(target_pa_schema.field(col)) - update_table = pa.Table.from_pydict(pydict, schema=pa.schema(schema_fields)) + return update_table.cast(pa.schema(schema_fields)) - wb = target_table.new_batch_write_builder() - tu = wb.new_update().with_update_type(update_cols_union) - msgs = tu.update_by_arrow_with_row_id(update_table) - tc = wb.new_commit() - tc.commit(msgs) - tc.close() + +def _check_global_index_collision( + table, snapshot, update_cols: Sequence[str] +) -> None: + from pypaimon.index.index_file_handler import IndexFileHandler + + handler = IndexFileHandler(table=table) + entries = handler.scan( + snapshot, lambda e: e.index_file.global_index_meta is not None + ) + if not entries: + return + field_by_id = {f.id: f.name for f in table.fields} + update_set = set(update_cols) + conflicted = sorted( + { + field_by_id.get(e.index_file.global_index_meta.index_field_id) + for e in entries + } + & update_set + ) + if conflicted: + raise NotImplementedError( + f"MERGE INTO would update columns {conflicted} that have a global " + f"index; not supported (refusing to leave the index stale)." 
+ ) def _union_update_cols(clauses: List[_NormalizedClause]) -> List[str]: @@ -377,9 +647,15 @@ def _apply_set( s_row: Optional[Dict[str, Any]], t_row: Optional[Dict[str, Any]], target_field_names: Sequence[str], + null_unspecified: bool = False, ) -> Dict[str, Any]: combined = _prefixed(s_row, t_row) - base = t_row if t_row is not None else (s_row if s_row is not None else {}) + if t_row is not None: + base = t_row + elif s_row is not None and not null_unspecified: + base = s_row + else: + base = {} out: Dict[str, Any] = {} for col in target_field_names: if col in spec: diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index 0051d40e8b41..d06957c7ea8b 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -92,6 +92,11 @@ def _read_sorted(self, target): splits = rb.new_scan().plan().splits() return rb.new_read().to_arrow(splits).sort_by('id').to_pydict() + def _snapshot_id(self, target): + table = self.catalog.get_table(target) + snap = table.snapshot_manager().get_latest_snapshot() + return snap.id if snap is not None else None + def test_no_clause_raises(self): target = self._create_table() with self.assertRaises(ValueError): @@ -413,7 +418,7 @@ def test_not_matched_multiple_clauses_first_match_wins(self): self.assertEqual(out['name'], ['a', 'small', 'big']) self.assertEqual(out['age'], [10, 1, 2]) - def test_merge_condition_residual_predicate(self): + def test_merge_condition_filters_matched_update(self): target = self._create_table() self._write( target, @@ -443,13 +448,50 @@ def test_merge_condition_residual_predicate(self): on=['id'], merge_condition=lambda r: r['s.age'] > r['t.age'], when_matched=[WhenMatched(update={'name': 's.name'})], - when_not_matched=[WhenNotMatched(insert='*')], ) out = self._read_sorted(target) self.assertEqual(out['id'], [1, 2]) - 
self.assertEqual(out['name'], ['a2', 'b2']) - self.assertEqual(out['age'], [10, 5]) + self.assertEqual(out['name'], ['a2', 'b']) + self.assertEqual(out['age'], [10, 20]) + + def test_merge_condition_failure_routes_to_insert(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['old'], + 'age': pa.array([20], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['new1', 'new2'], + 'age': pa.array([5, 30], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + merge_condition=lambda r: r['s.age'] > r['t.age'], + when_not_matched=[WhenNotMatched(insert='*')], + ) + + out = self._read_sorted(target) + ids_sorted = sorted(out['id']) + self.assertEqual(ids_sorted, [1, 1, 2]) + rows = sorted(zip(out['id'], out['name'], out['age'])) + self.assertEqual(rows, [(1, 'new1', 5), (1, 'old', 20), (2, 'new2', 30)]) def test_combined_update_and_insert(self): target = self._create_table() @@ -488,6 +530,181 @@ def test_combined_update_and_insert(self): self.assertEqual(out['name'], ['a', 'b2', 'c']) self.assertEqual(out['age'], [10, 22, 30]) + def test_on_with_renamed_columns(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['a', 'b'], + 'age': pa.array([10, 20], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source_schema = pa.schema([ + ('uid', pa.int32()), + ('name', pa.string()), + ('age', pa.int32()), + ]) + source = pa.Table.from_pydict( + { + 'uid': pa.array([2, 3], type=pa.int32()), + 'name': ['b2', 'c'], + 'age': pa.array([22, 30], type=pa.int32()), + }, + schema=source_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on={'id': 
'uid'}, + when_matched=[WhenMatched(update={'age': 's.age'})], + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2]) + self.assertEqual(out['age'], [10, 22]) + + def test_insert_dict_fills_unspecified_with_null(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['a'], + 'age': pa.array([10], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([2], type=pa.int32()), + 'name': ['source-name'], + 'age': pa.array([99], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_not_matched=[WhenNotMatched(insert={'id': 's.id', 'age': 99})], + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2]) + self.assertEqual(out['name'], ['a', None]) + self.assertEqual(out['age'], [10, 99]) + + def test_cardinality_violation_raises(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['a'], + 'age': pa.array([10], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([1, 1], type=pa.int32()), + 'name': ['x', 'y'], + 'age': pa.array([100, 200], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + with self.assertRaises(ValueError) as ctx: + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched=[WhenMatched(update='*')], + ) + self.assertIn('source must be unique', str(ctx.exception)) + + def test_combined_writes_single_snapshot(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['a', 'b'], + 'age': pa.array([10, 20], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + before = 
self._snapshot_id(target) + + source = pa.Table.from_pydict( + { + 'id': pa.array([2, 3], type=pa.int32()), + 'name': ['b2', 'c'], + 'age': pa.array([22, 30], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched=[WhenMatched(update='*')], + when_not_matched=[WhenNotMatched(insert='*')], + ) + + after = self._snapshot_id(target) + self.assertEqual(after, before + 1) + + def test_self_merge_skips_join(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': ['a', 'b', 'c'], + 'age': pa.array([10, 20, 30], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + merge_into( + target=target, + source=target, + catalog_options=self.catalog_options, + on=['id'], + when_matched=[ + WhenMatched( + update={'age': lambda r: r['t.age'] + 1}, + ), + ], + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2, 3]) + self.assertEqual(out['age'], [11, 21, 31]) + if __name__ == '__main__': unittest.main() diff --git a/paimon-python/setup.py b/paimon-python/setup.py index 1f464b1988b8..431f19db312c 100644 --- a/paimon-python/setup.py +++ b/paimon-python/setup.py @@ -152,7 +152,7 @@ def read_requirements(): }, extras_require={ 'ray': [ - 'ray>=2.10,<3; python_version>="3.7"', + 'ray>=2.50,<3; python_version>="3.7"', ], 'torch': [ 'torch', From 9384be0c17034d7765bbc15206f398ce6ac2d776 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 08:42:28 +0800 Subject: [PATCH 11/32] [python] parallelize ray merge_into insert path --- .../pypaimon/ray/data_evolution_merge_into.py | 75 +++++++++++++------ 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 62d4f4ec32bb..410ba39fe3d2 100644 --- 
a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -142,7 +142,7 @@ def merge_into( num_partitions=num_partitions, ) - insert_arrow: Optional[pa.Table] = None + insert_ds = None if not_matched_specs and not is_self_merge: matched_keys_override: Optional[set] = None if merge_condition is not None: @@ -155,7 +155,7 @@ def merge_into( catalog_options=catalog_options, num_partitions=num_partitions, ) - insert_arrow = _compute_not_matched_insert( + insert_ds = _build_not_matched_insert_ds( target_identifier=target, source_ds=source_ds, target_on=target_on_cols, @@ -173,11 +173,12 @@ def merge_into( if update_arrow is not None and update_arrow.num_rows > 0: tu = wb.new_update().with_update_type(write_update_cols) all_msgs.extend(tu.update_by_arrow_with_row_id(update_arrow)) - if insert_arrow is not None and insert_arrow.num_rows > 0: - tw = wb.new_write() - tw.write_arrow(insert_arrow) - all_msgs.extend(tw.prepare_commit()) - tw.close() + if insert_ds is not None: + all_msgs.extend( + _distributed_write_collect_msgs( + insert_ds, table, ray_remote_args=ray_remote_args, concurrency=concurrency + ) + ) if all_msgs: tc = wb.new_commit() tc.commit(all_msgs) @@ -401,7 +402,7 @@ def _emit_matched_keys(batch: pa.Table) -> pa.Table: return matched_keys -def _compute_not_matched_insert( +def _build_not_matched_insert_ds( *, target_identifier: str, source_ds, @@ -413,7 +414,7 @@ def _compute_not_matched_insert( catalog_options: Dict[str, str], num_partitions: int, matched_keys_override: Optional[set] = None, -) -> Optional[pa.Table]: +): from pypaimon.ray.ray_paimon import read_paimon from pypaimon.ray.shuffle import _coerce_large_string_types @@ -447,13 +448,9 @@ def _filter_and_apply(batch: pa.Table) -> pa.Table: ) break aligned = [{name: r.get(name) for name in captured_field_names} for r in out] - return pa.Table.from_pylist(aligned, schema=out_schema) + return 
_coerce_large_string_types(pa.Table.from_pylist(aligned, schema=out_schema)) - transformed = source_ds.map_batches(_filter_and_apply, batch_format="pyarrow") - batches = [b for b in transformed.iter_batches(batch_format="pyarrow") if b.num_rows > 0] - if not batches: - return None - return _coerce_large_string_types(pa.concat_tables(batches)) + return source_ds.map_batches(_filter_and_apply, batch_format="pyarrow") target_ds = read_paimon( target_identifier, catalog_options, projection=list(target_on) @@ -493,13 +490,49 @@ def _transform(batch: pa.Table) -> pa.Table: ) break aligned = [{name: r.get(name) for name in captured_field_names} for r in out] - return pa.Table.from_pylist(aligned, schema=out_schema) + return _coerce_large_string_types(pa.Table.from_pylist(aligned, schema=out_schema)) - transformed = unmatched.map_batches(_transform, batch_format="pyarrow") - batches = [b for b in transformed.iter_batches(batch_format="pyarrow") if b.num_rows > 0] - if not batches: - return None - return _coerce_large_string_types(pa.concat_tables(batches)) + return unmatched.map_batches(_transform, batch_format="pyarrow") + + +def _distributed_write_collect_msgs( + insert_ds, + table, + *, + ray_remote_args: Optional[Dict[str, Any]], + concurrency: Optional[int], +) -> list: + from pypaimon.write.ray_datasink import PaimonDatasink + + class _CollectingDatasink(PaimonDatasink): + def __init__(self, t): + super().__init__(t, overwrite=False) + self.collected: list = [] + + def on_write_complete(self, write_result): + if hasattr(write_result, "write_returns"): + write_returns = write_result.write_returns + elif isinstance(write_result, list): + write_returns = write_result + else: + raise TypeError( + f"Unexpected write_result type {type(write_result).__name__}" + ) + self.collected = [ + m + for batch in write_returns + for m in batch + if not m.is_empty() + ] + + sink = _CollectingDatasink(table) + write_kwargs: Dict[str, Any] = {} + if ray_remote_args is not None: + 
write_kwargs["ray_remote_args"] = ray_remote_args + if concurrency is not None: + write_kwargs["concurrency"] = concurrency + insert_ds.write_datasink(sink, **write_kwargs) + return sink.collected def _check_cardinality(update_table: pa.Table, row_id_name: str) -> None: From 41024dcb400eeb2951cfbd961eda97203abce63e Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 09:06:10 +0800 Subject: [PATCH 12/32] [python] parallelize ray merge_into self-merge path --- .../pypaimon/ray/data_evolution_merge_into.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 410ba39fe3d2..8892f7c8b1e3 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -315,11 +315,13 @@ def _materialize_self_merge_update( captured_merge_cond = merge_condition captured_update_cols = list(update_cols) captured_field_names = list(target_field_names) + captured_row_id_name = row_id_name - out_row_ids: list = [] - out_cols: Dict[str, list] = {c: [] for c in captured_update_cols} - for batch in target_ds.iter_batches(batch_format="pyarrow"): - for row in batch.to_pylist(): + def _transform(batch: pa.Table) -> pa.Table: + rows = batch.to_pylist() + out_row_ids: list = [] + out_cols: Dict[str, list] = {c: [] for c in captured_update_cols} + for row in rows: s_row = dict(row) t_row = dict(row) combined = _prefixed(s_row, t_row) @@ -331,14 +333,19 @@ def _materialize_self_merge_update( new_values = _apply_set( clause.spec, s_row, t_row, captured_field_names ) - out_row_ids.append(t_row[row_id_name]) + out_row_ids.append(t_row[captured_row_id_name]) for col in captured_update_cols: out_cols[col].append(new_values.get(col, t_row.get(col))) break + return pa.Table.from_pydict( + {captured_row_id_name: out_row_ids, **out_cols} + ) - if not out_row_ids: + transformed = 
target_ds.map_batches(_transform, batch_format="pyarrow") + batches = [b for b in transformed.iter_batches(batch_format="pyarrow") if b.num_rows > 0] + if not batches: return None - combined_table = pa.Table.from_pydict({row_id_name: out_row_ids, **out_cols}) + combined_table = pa.concat_tables(batches) _check_cardinality(combined_table, row_id_name) return _cast_update_arrow( combined_table, target_pa_schema, update_cols, row_id_name From 31438656960a9822b66876250df64c164a6eaf2d Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 09:15:16 +0800 Subject: [PATCH 13/32] [python] distribute ray merge_into update path end-to-end --- .../pypaimon/ray/data_evolution_merge_into.py | 186 ++++++++++++------ 1 file changed, 125 insertions(+), 61 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 8892f7c8b1e3..fcbaa0139583 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -118,7 +118,7 @@ def merge_into( base_snapshot = table.snapshot_manager().get_latest_snapshot() is_self_merge = isinstance(source, str) and source == target - update_arrow: Optional[pa.Table] = None + update_ds = None update_cols_union: List[str] = [] write_update_cols: List[str] = [] if matched_specs: @@ -127,7 +127,7 @@ def merge_into( _check_global_index_collision(table, base_snapshot, update_cols_union) write_update_cols = [c for c in update_cols_union if c not in target_on_cols] if write_update_cols: - update_arrow = _compute_matched_update( + update_ds = _build_matched_update_ds( target_identifier=target, source_ds=source_ds, target_on=target_on_cols, @@ -168,11 +168,16 @@ def merge_into( matched_keys_override=matched_keys_override, ) - wb = table.new_batch_write_builder() all_msgs: list = [] - if update_arrow is not None and update_arrow.num_rows > 0: - tu = wb.new_update().with_update_type(write_update_cols) - 
all_msgs.extend(tu.update_by_arrow_with_row_id(update_arrow)) + if update_ds is not None: + all_msgs.extend( + _distributed_update_apply( + update_ds, + table, + write_update_cols, + ray_remote_args=ray_remote_args, + ) + ) if insert_ds is not None: all_msgs.extend( _distributed_write_collect_msgs( @@ -180,6 +185,7 @@ def merge_into( ) ) if all_msgs: + wb = table.new_batch_write_builder() tc = wb.new_commit() tc.commit(all_msgs) tc.close() @@ -197,7 +203,7 @@ def _normalize_on(on: OnSpec) -> Tuple[List[str], List[str]]: return target_cols, source_cols -def _compute_matched_update( +def _build_matched_update_ds( *, target_identifier: str, source_ds, @@ -211,7 +217,7 @@ def _compute_matched_update( catalog_options: Dict[str, str], is_self_merge: bool, num_partitions: int, -) -> Optional[pa.Table]: +): from pypaimon.ray.ray_paimon import read_paimon from pypaimon.table.special_fields import SpecialFields @@ -228,16 +234,17 @@ def _compute_matched_update( projection = [row_id_name] + [c for c in needed_cols if c != row_id_name] target_ds = read_paimon(target_identifier, catalog_options, projection=projection) + update_schema = _build_update_schema(target_pa_schema, update_cols, row_id_name) if is_self_merge: - return _materialize_self_merge_update( + return _build_self_merge_update_ds( target_ds=target_ds, merge_condition=merge_condition, clauses=clauses, target_field_names=target_field_names, - target_pa_schema=target_pa_schema, update_cols=update_cols, row_id_name=row_id_name, + update_schema=update_schema, ) target_renamed = target_ds.rename_columns( @@ -261,6 +268,7 @@ def _compute_matched_update( captured_field_names = list(target_field_names) captured_row_id_name = row_id_name captured_on_pairs = list(zip(source_on, target_on)) + captured_schema = update_schema def _transform(batch: pa.Table) -> pa.Table: rows = batch.to_pylist() @@ -286,36 +294,29 @@ def _transform(batch: pa.Table) -> pa.Table: out_cols[col].append(new_values.get(col, t_row.get(col))) break 
return pa.Table.from_pydict( - {captured_row_id_name: out_row_ids, **out_cols} + {captured_row_id_name: out_row_ids, **out_cols}, + schema=captured_schema, ) - transformed = joined.map_batches(_transform, batch_format="pyarrow") - batches = [b for b in transformed.iter_batches(batch_format="pyarrow") if b.num_rows > 0] - if not batches: - return None - combined_table = pa.concat_tables(batches) - - _check_cardinality(combined_table, row_id_name) - return _cast_update_arrow( - combined_table, target_pa_schema, update_cols, row_id_name - ) + return joined.map_batches(_transform, batch_format="pyarrow") -def _materialize_self_merge_update( +def _build_self_merge_update_ds( *, target_ds, merge_condition: Optional[Condition], clauses: List[_NormalizedClause], target_field_names: Sequence[str], - target_pa_schema: pa.Schema, update_cols: Sequence[str], row_id_name: str, -) -> Optional[pa.Table]: + update_schema: pa.Schema, +): captured_clauses = clauses captured_merge_cond = merge_condition captured_update_cols = list(update_cols) captured_field_names = list(target_field_names) captured_row_id_name = row_id_name + captured_schema = update_schema def _transform(batch: pa.Table) -> pa.Table: rows = batch.to_pylist() @@ -338,20 +339,111 @@ def _transform(batch: pa.Table) -> pa.Table: out_cols[col].append(new_values.get(col, t_row.get(col))) break return pa.Table.from_pydict( - {captured_row_id_name: out_row_ids, **out_cols} + {captured_row_id_name: out_row_ids, **out_cols}, + schema=captured_schema, ) - transformed = target_ds.map_batches(_transform, batch_format="pyarrow") - batches = [b for b in transformed.iter_batches(batch_format="pyarrow") if b.num_rows > 0] - if not batches: - return None - combined_table = pa.concat_tables(batches) - _check_cardinality(combined_table, row_id_name) - return _cast_update_arrow( - combined_table, target_pa_schema, update_cols, row_id_name + return target_ds.map_batches(_transform, batch_format="pyarrow") + + +def _build_update_schema( 
+ target_pa_schema: pa.Schema, + update_cols: Sequence[str], + row_id_name: str, +) -> pa.Schema: + return pa.schema( + [pa.field(row_id_name, pa.int64(), nullable=False)] + + [target_pa_schema.field(col) for col in update_cols] ) +def _distributed_update_apply( + update_ds, + table, + write_update_cols: Sequence[str], + *, + ray_remote_args: Optional[Dict[str, Any]] = None, +) -> list: + import bisect + import pickle + import uuid + + from pypaimon.snapshot.snapshot import BATCH_COMMIT_IDENTIFIER + from pypaimon.table.special_fields import SpecialFields + from pypaimon.write.table_update_by_row_id import TableUpdateByRowId + + row_id_name = SpecialFields.ROW_ID.name + cols = list(write_update_cols) + + for col in cols: + if col not in table.field_names: + raise ValueError(f"Column '{col}' is not in target table schema.") + + planner = TableUpdateByRowId( + table, + "_merge_into_planner_" + uuid.uuid4().hex[:8], + BATCH_COMMIT_IDENTIFIER, + ) + sorted_first_row_ids = list(planner.first_row_ids) + if not sorted_first_row_ids: + return [] + + frid_col = "_FIRST_ROW_ID" + captured_sorted = sorted_first_row_ids + + def _assign_frid(batch: pa.Table) -> pa.Table: + if batch.num_rows == 0: + return batch.append_column(frid_col, pa.array([], type=pa.int64())) + row_ids = batch.column(row_id_name).to_pylist() + bisect_right = bisect.bisect_right + values = [ + captured_sorted[bisect_right(captured_sorted, rid) - 1] + for rid in row_ids + ] + return batch.append_column(frid_col, pa.array(values, type=pa.int64())) + + with_frid = update_ds.map_batches(_assign_frid, batch_format="pyarrow") + + captured_table = table + captured_cols = cols + + def _apply_group(group: pa.Table) -> pa.Table: + if group.num_rows == 0: + return pa.Table.from_pydict({"msgs_blob": pa.array([], type=pa.binary())}) + + group_row_ids = group.column(row_id_name).to_pylist() + if len(set(group_row_ids)) != len(group_row_ids): + seen: set = set() + dupes: set = set() + for rid in group_row_ids: + if rid in 
seen: + dupes.add(rid) + seen.add(rid) + raise ValueError( + f"MERGE INTO matched the same target _ROW_IDs {sorted(dupes)[:5]} " + f"via multiple source rows; source must be unique on the join keys." + ) + + for_update = group.drop_columns([frid_col]) + worker = TableUpdateByRowId( + captured_table, + "_merge_into_shard_" + uuid.uuid4().hex[:8], + BATCH_COMMIT_IDENTIFIER, + ) + msgs = worker.update_columns(for_update, list(captured_cols)) + return pa.Table.from_pydict({"msgs_blob": [pickle.dumps(msgs)]}) + + msgs_ds = with_frid.groupby(frid_col).map_groups( + _apply_group, batch_format="pyarrow" + ) + + all_msgs: list = [] + for batch in msgs_ds.iter_batches(batch_format="pyarrow"): + for blob in batch.column("msgs_blob").to_pylist(): + all_msgs.extend(pickle.loads(blob)) + return all_msgs + + def _compute_matched_source_keys( *, target_identifier: str, @@ -542,34 +634,6 @@ def on_write_complete(self, write_result): return sink.collected -def _check_cardinality(update_table: pa.Table, row_id_name: str) -> None: - row_ids = update_table.column(row_id_name).to_pylist() - if len(set(row_ids)) == len(row_ids): - return - seen: set = set() - dupes: set = set() - for rid in row_ids: - if rid in seen: - dupes.add(rid) - seen.add(rid) - raise ValueError( - f"MERGE INTO matched the same target _ROW_IDs {sorted(dupes)[:5]} " - f"via multiple source rows; source must be unique on the join keys." 
- ) - - -def _cast_update_arrow( - update_table: pa.Table, - target_pa_schema: pa.Schema, - update_cols: Sequence[str], - row_id_name: str, -) -> pa.Table: - schema_fields = [pa.field(row_id_name, pa.int64(), nullable=False)] - for col in update_cols: - schema_fields.append(target_pa_schema.field(col)) - return update_table.cast(pa.schema(schema_fields)) - - def _check_global_index_collision( table, snapshot, update_cols: Sequence[str] ) -> None: From 333eb31ccc66ad3ab5f9a544255d53dc6eefca65 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 09:41:39 +0800 Subject: [PATCH 14/32] [python] address ray merge_into review findings --- .../pypaimon/ray/data_evolution_merge_into.py | 226 +++++++++++------- .../ray_data_evolution_merge_into_test.py | 75 ++++++ .../pypaimon/write/file_store_commit.py | 20 ++ paimon-python/pypaimon/write/table_update.py | 2 - .../pypaimon/write/table_update_by_row_id.py | 25 +- 5 files changed, 247 insertions(+), 101 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index fcbaa0139583..d0dab03fc7d1 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -110,43 +110,55 @@ def merge_into( source_ds = _normalize_source(source, catalog_options) _validate_source_on_cols(source_ds, source_on_cols) + base_snapshot = table.snapshot_manager().get_latest_snapshot() + if base_snapshot is not None: + # Pin the snapshot so the final commit aborts if another writer + # commits between our read and our commit. + table = table.copy( + {"commit.strict-mode.last-safe-snapshot": str(base_snapshot.id)} + ) + is_self_merge = isinstance(source, str) and source == target + + # Row-precise routing needs a stable per-source-row id when merge_condition + # may differ between source rows sharing the same ON key. 
+ if when_not_matched and merge_condition is not None and not is_self_merge: + source_ds = _add_paimon_src_idx(source_ds) + from pypaimon.schema.data_types import PyarrowFieldParser target_pa_schema = PyarrowFieldParser.from_paimon_schema( table.table_schema.fields ) - base_snapshot = table.snapshot_manager().get_latest_snapshot() - is_self_merge = isinstance(source, str) and source == target + + if not_matched_specs and base_snapshot is not None: + _check_global_index_for_insert(table, base_snapshot) update_ds = None update_cols_union: List[str] = [] - write_update_cols: List[str] = [] if matched_specs: update_cols_union = _union_update_cols(matched_specs) if base_snapshot is not None: _check_global_index_collision(table, base_snapshot, update_cols_union) - write_update_cols = [c for c in update_cols_union if c not in target_on_cols] - if write_update_cols: - update_ds = _build_matched_update_ds( - target_identifier=target, - source_ds=source_ds, - target_on=target_on_cols, - source_on=source_on_cols, - merge_condition=merge_condition, - clauses=matched_specs, - target_field_names=target_field_names, - target_pa_schema=target_pa_schema, - update_cols=write_update_cols, - catalog_options=catalog_options, - is_self_merge=is_self_merge, - num_partitions=num_partitions, - ) + update_ds = _build_matched_update_ds( + target_identifier=target, + source_ds=source_ds, + target_on=target_on_cols, + source_on=source_on_cols, + merge_condition=merge_condition, + clauses=matched_specs, + target_field_names=target_field_names, + target_pa_schema=target_pa_schema, + update_cols=update_cols_union, + catalog_options=catalog_options, + is_self_merge=is_self_merge, + num_partitions=num_partitions, + ) insert_ds = None if not_matched_specs and not is_self_merge: - matched_keys_override: Optional[set] = None + matched_keys_ds = None if merge_condition is not None: - matched_keys_override = _compute_matched_source_keys( + matched_keys_ds = _compute_matched_source_idx_ds( 
target_identifier=target, source_ds=source_ds, target_on=target_on_cols, @@ -165,7 +177,7 @@ def merge_into( target_pa_schema=target_pa_schema, catalog_options=catalog_options, num_partitions=num_partitions, - matched_keys_override=matched_keys_override, + matched_idx_ds=matched_keys_ds, ) all_msgs: list = [] @@ -174,7 +186,7 @@ def merge_into( _distributed_update_apply( update_ds, table, - write_update_cols, + update_cols_union, ray_remote_args=ray_remote_args, ) ) @@ -388,8 +400,18 @@ def _distributed_update_apply( if not sorted_first_row_ids: return [] + # Broadcast the file-info snapshot to every worker so they skip the + # per-task manifest scan and observe a single consistent target view. + precomputed_info = ( + planner.snapshot_id, + planner.first_row_ids, + planner._first_row_id_index, + planner.total_row_count, + ) + frid_col = "_FIRST_ROW_ID" captured_sorted = sorted_first_row_ids + captured_precomputed = precomputed_info def _assign_frid(batch: pa.Table) -> pa.Table: if batch.num_rows == 0: @@ -429,6 +451,7 @@ def _apply_group(group: pa.Table) -> pa.Table: captured_table, "_merge_into_shard_" + uuid.uuid4().hex[:8], BATCH_COMMIT_IDENTIFIER, + precomputed_files_info=captured_precomputed, ) msgs = worker.update_columns(for_update, list(captured_cols)) return pa.Table.from_pydict({"msgs_blob": [pickle.dumps(msgs)]}) @@ -444,7 +467,28 @@ def _apply_group(group: pa.Table) -> pa.Table: return all_msgs -def _compute_matched_source_keys( +PAIMON_SRC_IDX_COL = "_paimon_src_idx" +MATCHED_SRC_IDX_MARKER = "_paimon_matched_src_idx" + + +def _add_paimon_src_idx(source_ds): + """Append a stable per-row hash to source so we can route INSERTs row-precisely + when merge_condition can differ between source rows sharing the same ON key.""" + import hashlib + + def _add_idx(batch: pa.Table) -> pa.Table: + hashes = [ + hashlib.md5(repr(sorted(r.items())).encode()).hexdigest() + for r in batch.to_pylist() + ] + return batch.append_column( + PAIMON_SRC_IDX_COL, 
pa.array(hashes, type=pa.string()) + ) + + return source_ds.map_batches(_add_idx, batch_format="pyarrow") + + +def _compute_matched_source_idx_ds( *, target_identifier: str, source_ds, @@ -453,7 +497,7 @@ def _compute_matched_source_keys( merge_condition: Condition, catalog_options: Dict[str, str], num_partitions: int, -) -> set: +): from pypaimon.ray.ray_paimon import read_paimon target_ds = read_paimon(target_identifier, catalog_options) @@ -472,33 +516,26 @@ def _compute_matched_source_keys( right_on=tuple(f"s.{c}" for c in source_on), ) - on_pairs = list(zip(source_on, target_on)) captured_merge_cond = merge_condition - captured_source_on = list(source_on) + captured_on_pairs = list(zip(source_on, target_on)) + out_schema = pa.schema([pa.field(MATCHED_SRC_IDX_MARKER, pa.string())]) - def _emit_matched_keys(batch: pa.Table) -> pa.Table: - out_cols: Dict[str, list] = {c: [] for c in captured_source_on} + def _emit_matched_idx(batch: pa.Table) -> pa.Table: + out_idx: list = [] for row in batch.to_pylist(): s_row = {k[2:]: v for k, v in row.items() if k.startswith("s.")} t_row = {k[2:]: v for k, v in row.items() if k.startswith("t.")} - for sk, tk in on_pairs: + for sk, tk in captured_on_pairs: if sk not in s_row and tk in t_row: s_row[sk] = t_row[tk] combined = _prefixed(s_row, t_row) if captured_merge_cond(combined): - for c in captured_source_on: - out_cols[c].append(s_row.get(c)) - return pa.Table.from_pydict(out_cols) + out_idx.append(s_row.get(PAIMON_SRC_IDX_COL)) + return pa.Table.from_pydict( + {MATCHED_SRC_IDX_MARKER: out_idx}, schema=out_schema + ) - matched_ds = joined.map_batches(_emit_matched_keys, batch_format="pyarrow") - matched_keys: set = set() - for batch in matched_ds.iter_batches(batch_format="pyarrow"): - if batch.num_rows == 0: - continue - cols = [batch.column(c).to_pylist() for c in source_on] - for tup in zip(*cols): - matched_keys.add(tup) - return matched_keys + return joined.map_batches(_emit_matched_idx, batch_format="pyarrow") def 
_build_not_matched_insert_ds( @@ -512,7 +549,7 @@ def _build_not_matched_insert_ds( target_pa_schema: pa.Schema, catalog_options: Dict[str, str], num_partitions: int, - matched_keys_override: Optional[set] = None, + matched_idx_ds=None, ): from pypaimon.ray.ray_paimon import read_paimon from pypaimon.ray.shuffle import _coerce_large_string_types @@ -521,59 +558,52 @@ def _build_not_matched_insert_ds( captured_field_names = list(target_field_names) out_schema = target_pa_schema - if matched_keys_override is not None: - captured_keys = matched_keys_override - captured_source_on = list(source_on) - - def _filter_and_apply(batch: pa.Table) -> pa.Table: - rows = batch.to_pylist() - out = [] - for s_row in rows: - key = tuple(s_row.get(c) for c in captured_source_on) - if key in captured_keys: - continue - combined = _prefixed(s_row, None) - for clause in captured_clauses: - if clause.condition is not None and not clause.condition(combined): - continue - out.append( - _apply_set( - clause.spec, - s_row, - None, - captured_field_names, - null_unspecified=True, - ) - ) - break - aligned = [{name: r.get(name) for name in captured_field_names} for r in out] - return _coerce_large_string_types(pa.Table.from_pylist(aligned, schema=out_schema)) - - return source_ds.map_batches(_filter_and_apply, batch_format="pyarrow") - - target_ds = read_paimon( - target_identifier, catalog_options, projection=list(target_on) - ) - target_renamed = target_ds.rename_columns( - {c: f"t.{c}" for c in target_on} - ) source_schema = source_ds.schema() source_cols = list(source_schema.names) if source_schema is not None else list(source_on) source_renamed = source_ds.rename_columns({c: f"s.{c}" for c in source_cols}) - unmatched = source_renamed.join( - target_renamed, - join_type="left_anti", - num_partitions=num_partitions, - on=tuple(f"s.{c}" for c in source_on), - right_on=tuple(f"t.{c}" for c in target_on), - ) + if matched_idx_ds is not None: + # Ray join hits a pyarrow projection bug when 
the right side is + # empty; collect matched-idx to a driver set instead. The set is + # bounded by # of matched source rows × ~32B per row-hash. + matched_idx_set: set = set() + for batch in matched_idx_ds.iter_batches(batch_format="pyarrow"): + if batch.num_rows == 0: + continue + matched_idx_set.update( + batch.column(MATCHED_SRC_IDX_MARKER).to_pylist() + ) + captured_idx_set = matched_idx_set + + def _filter_unmatched(batch: pa.Table) -> pa.Table: + idx_arr = batch.column(f"s.{PAIMON_SRC_IDX_COL}").to_pylist() + mask = [v not in captured_idx_set for v in idx_arr] + return batch.filter(pa.array(mask)) + + unmatched = source_renamed.map_batches( + _filter_unmatched, batch_format="pyarrow" + ) + else: + target_ds = read_paimon( + target_identifier, catalog_options, projection=list(target_on) + ) + target_renamed = target_ds.rename_columns( + {c: f"t.{c}" for c in target_on} + ) + unmatched = source_renamed.join( + target_renamed, + join_type="left_anti", + num_partitions=num_partitions, + on=tuple(f"s.{c}" for c in source_on), + right_on=tuple(f"t.{c}" for c in target_on), + ) def _transform(batch: pa.Table) -> pa.Table: rows = batch.to_pylist() out = [] for row in rows: s_row = {k[2:]: v for k, v in row.items() if k.startswith("s.")} + s_row.pop(PAIMON_SRC_IDX_COL, None) combined = _prefixed(s_row, None) for clause in captured_clauses: if clause.condition is not None and not clause.condition(combined): @@ -637,12 +667,7 @@ def on_write_complete(self, write_result): def _check_global_index_collision( table, snapshot, update_cols: Sequence[str] ) -> None: - from pypaimon.index.index_file_handler import IndexFileHandler - - handler = IndexFileHandler(table=table) - entries = handler.scan( - snapshot, lambda e: e.index_file.global_index_meta is not None - ) + entries = _scan_global_index_entries(table, snapshot) if not entries: return field_by_id = {f.id: f.name for f in table.fields} @@ -661,6 +686,23 @@ def _check_global_index_collision( ) +def 
_check_global_index_for_insert(table, snapshot) -> None: + if _scan_global_index_entries(table, snapshot): + raise NotImplementedError( + "MERGE INTO INSERT into a table with global index entries is not " + "supported (inserted rows would not appear in the index)." + ) + + +def _scan_global_index_entries(table, snapshot): + from pypaimon.index.index_file_handler import IndexFileHandler + + handler = IndexFileHandler(table=table) + return handler.scan( + snapshot, lambda e: e.index_file.global_index_meta is not None + ) + + def _union_update_cols(clauses: List[_NormalizedClause]) -> List[str]: seen: List[str] = [] seen_set: set = set() diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index d06957c7ea8b..a4ae1ad47d4b 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -705,6 +705,81 @@ def test_self_merge_skips_join(self): self.assertEqual(out['id'], [1, 2, 3]) self.assertEqual(out['age'], [11, 21, 31]) + def test_matched_update_can_change_on_column(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['x'], + 'age': pa.array([10], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['y'], + 'age': pa.array([20], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched=[WhenMatched(update={'id': 999, 'name': 'y'})], + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [999]) + self.assertEqual(out['name'], ['y']) + self.assertEqual(out['age'], [10]) + + def test_merge_condition_routes_per_source_row(self): + target = self._create_table() + self._write( + 
target, + pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['original'], + 'age': pa.array([100], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([1, 1], type=pa.int32()), + 'name': ['high', 'low'], + 'age': pa.array([200, 5], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + merge_condition=lambda r: r['s.age'] > r['t.age'], + when_not_matched=[WhenNotMatched(insert='*')], + ) + + out = self._read_sorted(target) + rows = sorted(zip(out['id'], out['name'], out['age'])) + self.assertEqual( + rows, + [(1, 'low', 5), (1, 'original', 100)], + ) + if __name__ == '__main__': unittest.main() diff --git a/paimon-python/pypaimon/write/file_store_commit.py b/paimon-python/pypaimon/write/file_store_commit.py index 486e28924014..92fd19bf0293 100644 --- a/paimon-python/pypaimon/write/file_store_commit.py +++ b/paimon-python/pypaimon/write/file_store_commit.py @@ -117,6 +117,8 @@ def commit(self, commit_messages: List[CommitMessage], commit_identifier: int): if not commit_messages: return + self._enforce_strict_mode_last_safe_snapshot() + # Extract the minimum check_from_snapshot from commit messages valid_snapshots = [msg.check_from_snapshot for msg in commit_messages if msg.check_from_snapshot != -1] @@ -161,6 +163,7 @@ def commit(self, commit_messages: List[CommitMessage], commit_identifier: int): def overwrite(self, overwrite_partition, commit_messages: List[CommitMessage], commit_identifier: int): """Commit the given commit messages in overwrite mode.""" + self._enforce_strict_mode_last_safe_snapshot() logger.info( "Ready to overwrite to table %s, number of commit messages: %d", self.table.identifier, @@ -233,6 +236,7 @@ def drop_partitions(self, partitions: List[Dict[str, str]], commit_identifier: i def truncate_table(self, commit_identifier: int) -> None: """Truncate the 
entire table, deleting all data.""" + self._enforce_strict_mode_last_safe_snapshot() self._try_commit( commit_kind="OVERWRITE", commit_identifier=commit_identifier, @@ -242,6 +246,22 @@ def truncate_table(self, commit_identifier: int) -> None: allow_rollback=False, ) + def _enforce_strict_mode_last_safe_snapshot(self) -> None: + raw = self.table.options.options.data.get( + "commit.strict-mode.last-safe-snapshot" + ) + if raw is None or raw == "": + return + safe_id = int(raw) + current = self.snapshot_manager.get_latest_snapshot() + current_id = current.id if current is not None else -1 + if current_id > safe_id: + raise RuntimeError( + f"Strict-mode commit aborted: latest snapshot {current_id} is " + f"newer than the recorded last-safe-snapshot {safe_id}; " + f"another writer has committed since this write was planned." + ) + def _try_commit(self, commit_kind, commit_identifier, commit_entries_plan, detect_conflicts=False, allow_rollback=False): diff --git a/paimon-python/pypaimon/write/table_update.py b/paimon-python/pypaimon/write/table_update.py index fe2fb9a64b79..55b755e2d4d5 100644 --- a/paimon-python/pypaimon/write/table_update.py +++ b/paimon-python/pypaimon/write/table_update.py @@ -109,8 +109,6 @@ def with_update_type(self, update_cols: List[str]): for col in update_cols: if col not in self.table.field_names: raise ValueError(f"Column {col} is not in table schema.") - if len(update_cols) == len(self.table.field_names): - update_cols = None self.update_cols = update_cols return self diff --git a/paimon-python/pypaimon/write/table_update_by_row_id.py b/paimon-python/pypaimon/write/table_update_by_row_id.py index ac9c68c3623b..ab61ed21505e 100644 --- a/paimon-python/pypaimon/write/table_update_by_row_id.py +++ b/paimon-python/pypaimon/write/table_update_by_row_id.py @@ -42,19 +42,30 @@ class TableUpdateByRowId: FIRST_ROW_ID_COLUMN = '_FIRST_ROW_ID' - def __init__(self, table, commit_user: str, commit_identifier: int): + def __init__( + self, table, 
commit_user: str, commit_identifier: int, + precomputed_files_info: Optional[Tuple[ + int, List[int], + Dict[int, Tuple[DataSplit, List[DataFileMeta]]], + int, + ]] = None, + ): from pypaimon.table.file_store_table import FileStoreTable self.table: FileStoreTable = table self.commit_user = commit_user self.commit_identifier = commit_identifier - # Snapshot the current state once: a single ``first_row_id -> (split, files)`` - # map is enough to drive every downstream lookup (partition, row-count, read). - (self.snapshot_id, - self.first_row_ids, - self._first_row_id_index, - self.total_row_count) = self._load_existing_files_info() + if precomputed_files_info is not None: + (self.snapshot_id, + self.first_row_ids, + self._first_row_id_index, + self.total_row_count) = precomputed_files_info + else: + (self.snapshot_id, + self.first_row_ids, + self._first_row_id_index, + self.total_row_count) = self._load_existing_files_info() self.commit_messages: List[CommitMessage] = [] From a1b8a93996b15d69414b283ff21316f7040ac114 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 10:31:54 +0800 Subject: [PATCH 15/32] [python] tighten ray merge_into edge cases and src-row identity --- .../pypaimon/ray/data_evolution_merge_into.py | 63 +++++++----- .../ray_data_evolution_merge_into_test.py | 99 +++++++++++++++++++ 2 files changed, 136 insertions(+), 26 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index d0dab03fc7d1..83d39491d973 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -135,10 +135,10 @@ def merge_into( update_ds = None update_cols_union: List[str] = [] - if matched_specs: + # Empty target → no rows can match; matched UPDATE is a no-op. 
+ if matched_specs and base_snapshot is not None: update_cols_union = _union_update_cols(matched_specs) - if base_snapshot is not None: - _check_global_index_collision(table, base_snapshot, update_cols_union) + _check_global_index_collision(table, base_snapshot, update_cols_union) update_ds = _build_matched_update_ds( target_identifier=target, source_ds=source_ds, @@ -412,16 +412,24 @@ def _distributed_update_apply( frid_col = "_FIRST_ROW_ID" captured_sorted = sorted_first_row_ids captured_precomputed = precomputed_info + total_row_count = planner.total_row_count def _assign_frid(batch: pa.Table) -> pa.Table: if batch.num_rows == 0: return batch.append_column(frid_col, pa.array([], type=pa.int64())) row_ids = batch.column(row_id_name).to_pylist() bisect_right = bisect.bisect_right - values = [ - captured_sorted[bisect_right(captured_sorted, rid) - 1] - for rid in row_ids - ] + values: list = [] + first = captured_sorted[0] + for rid in row_ids: + # Out-of-range _ROW_IDs would silently map via bisect wrap-around. + if rid is None or rid < first or rid >= total_row_count: + raise ValueError( + f"_ROW_ID {rid} is out of valid range " + f"[{first}, {total_row_count}); planner snapshot is stale " + f"or matched rows come from a different table." 
+ ) + values.append(captured_sorted[bisect_right(captured_sorted, rid) - 1]) return batch.append_column(frid_col, pa.array(values, type=pa.int64())) with_frid = update_ds.map_batches(_assign_frid, batch_format="pyarrow") @@ -472,20 +480,13 @@ def _apply_group(group: pa.Table) -> pa.Table: def _add_paimon_src_idx(source_ds): - """Append a stable per-row hash to source so we can route INSERTs row-precisely - when merge_condition can differ between source rows sharing the same ON key.""" - import hashlib - - def _add_idx(batch: pa.Table) -> pa.Table: - hashes = [ - hashlib.md5(repr(sorted(r.items())).encode()).hexdigest() - for r in batch.to_pylist() - ] - return batch.append_column( - PAIMON_SRC_IDX_COL, pa.array(hashes, type=pa.string()) - ) + """Append a unique per-row index so INSERTs are routed by row identity, + not by content. Duplicate identical rows must remain distinguishable.""" + import ray - return source_ds.map_batches(_add_idx, batch_format="pyarrow") + n = source_ds.count() + idx_ds = ray.data.range(n).rename_columns({"id": PAIMON_SRC_IDX_COL}) + return source_ds.zip(idx_ds) def _compute_matched_source_idx_ds( @@ -518,7 +519,7 @@ def _compute_matched_source_idx_ds( captured_merge_cond = merge_condition captured_on_pairs = list(zip(source_on, target_on)) - out_schema = pa.schema([pa.field(MATCHED_SRC_IDX_MARKER, pa.string())]) + out_schema = pa.schema([pa.field(MATCHED_SRC_IDX_MARKER, pa.int64())]) def _emit_matched_idx(batch: pa.Table) -> pa.Table: out_idx: list = [] @@ -687,11 +688,21 @@ def _check_global_index_collision( def _check_global_index_for_insert(table, snapshot) -> None: - if _scan_global_index_entries(table, snapshot): - raise NotImplementedError( - "MERGE INTO INSERT into a table with global index entries is not " - "supported (inserted rows would not appear in the index)." 
- ) + entries = _scan_global_index_entries(table, snapshot) + if not entries: + return + field_by_id = {f.id: f.name for f in table.fields} + indexed = sorted( + { + field_by_id.get(e.index_file.global_index_meta.index_field_id) + for e in entries + } + ) + raise NotImplementedError( + f"MERGE INTO INSERT is not supported on tables with global-index " + f"columns {indexed} (btree/lumina/tantivy). Inserted rows would not " + f"appear in the index. Drop the global index or omit when_not_matched." + ) def _scan_global_index_entries(table, snapshot): diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index a4ae1ad47d4b..9eea0b93cd09 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -741,6 +741,105 @@ def test_matched_update_can_change_on_column(self): self.assertEqual(out['name'], ['y']) self.assertEqual(out['age'], [10]) + def test_empty_target_matched_update_is_noop(self): + target = self._create_table() + before = self._snapshot_id(target) + + source = pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['a', 'b'], + 'age': pa.array([10, 20], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched=[WhenMatched(update='*')], + ) + + self.assertEqual(self._snapshot_id(target), before) + + def test_duplicate_identical_source_rows_route_separately(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['t1'], + 'age': pa.array([100], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([2, 2], type=pa.int32()), + 'name': ['dup', 'dup'], + 'age': pa.array([5, 5], type=pa.int32()), + }, + 
schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + merge_condition=lambda r: r['s.age'] > r['t.age'], + when_not_matched=[WhenNotMatched(insert='*')], + ) + + out = self._read_sorted(target) + rows = sorted(zip(out['id'], out['name'], out['age'])) + self.assertEqual( + rows, + [(1, 't1', 100), (2, 'dup', 5), (2, 'dup', 5)], + ) + + def test_strict_mode_rejects_when_snapshot_advances(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['x'], + 'age': pa.array([1], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + current_id = self._snapshot_id(target) + + table = self.catalog.get_table(target).copy( + {"commit.strict-mode.last-safe-snapshot": str(current_id - 1)} + ) + wb = table.new_batch_write_builder() + tw = wb.new_write() + tw.write_arrow( + pa.Table.from_pydict( + { + 'id': pa.array([2], type=pa.int32()), + 'name': ['y'], + 'age': pa.array([2], type=pa.int32()), + }, + schema=self.pa_schema, + ) + ) + msgs = tw.prepare_commit() + tw.close() + + with self.assertRaises(RuntimeError) as ctx: + wb.new_commit().commit(msgs) + self.assertIn("strict-mode", str(ctx.exception).lower()) + def test_merge_condition_routes_per_source_row(self): target = self._create_table() self._write( From c4af0650148c12aa2a33f468a7f0f31392979125 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 10:55:46 +0800 Subject: [PATCH 16/32] [python] optimize ray merge_into hot path with vectorized fallback --- .../pypaimon/ray/data_evolution_merge_into.py | 166 +++++++++++++++++- 1 file changed, 162 insertions(+), 4 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 83d39491d973..78feb532586c 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ 
b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -65,10 +65,11 @@ def merge_into( merge_condition: Optional[Condition] = None, when_matched: Sequence[WhenMatched] = (), when_not_matched: Sequence[WhenNotMatched] = (), - num_partitions: int = 16, + num_partitions: Optional[int] = None, ray_remote_args: Optional[Dict[str, Any]] = None, concurrency: Optional[int] = None, ) -> None: + num_partitions = _resolve_num_partitions(num_partitions) when_matched = list(when_matched) when_not_matched = list(when_not_matched) if not when_matched and not when_not_matched: @@ -282,6 +283,21 @@ def _build_matched_update_ds( captured_on_pairs = list(zip(source_on, target_on)) captured_schema = update_schema + if _clauses_use_vector_fast_path(clauses, merge_condition): + first_spec = clauses[0].spec + + def _fast(batch: pa.Table) -> pa.Table: + return _vectorized_matched_transform( + batch, + first_spec, + captured_on_pairs, + captured_update_cols, + captured_row_id_name, + captured_schema, + ) + + return joined.map_batches(_fast, batch_format="pyarrow") + def _transform(batch: pa.Table) -> pa.Table: rows = batch.to_pylist() out_row_ids: list = [] @@ -330,6 +346,20 @@ def _build_self_merge_update_ds( captured_row_id_name = row_id_name captured_schema = update_schema + if _clauses_use_vector_fast_path(clauses, merge_condition): + first_spec = clauses[0].spec + + def _fast(batch: pa.Table) -> pa.Table: + return _vectorized_self_merge_transform( + batch, + first_spec, + captured_update_cols, + captured_row_id_name, + captured_schema, + ) + + return target_ds.map_batches(_fast, batch_format="pyarrow") + def _transform(batch: pa.Table) -> pa.Table: rows = batch.to_pylist() out_row_ids: list = [] @@ -481,12 +511,128 @@ def _apply_group(group: pa.Table) -> pa.Table: def _add_paimon_src_idx(source_ds): """Append a unique per-row index so INSERTs are routed by row identity, - not by content. Duplicate identical rows must remain distinguishable.""" + not by content. 
Materialize once so count() + zip don't re-run source.""" import ray - n = source_ds.count() + materialized = source_ds.materialize() + n = materialized.count() idx_ds = ray.data.range(n).rename_columns({"id": PAIMON_SRC_IDX_COL}) - return source_ds.zip(idx_ds) + return materialized.zip(idx_ds) + + +def _resolve_num_partitions(num_partitions: Optional[int]) -> int: + if num_partitions is not None: + return num_partitions + try: + import ray + + cpus = ray.cluster_resources().get("CPU", 16) + return max(16, int(cpus) * 2) + except Exception: + return 16 + + +def _clauses_use_vector_fast_path( + clauses: List[_NormalizedClause], + merge_condition: Optional[Condition], +) -> bool: + if not clauses: + return False + if merge_condition is not None: + return False + for c in clauses: + if c.condition is not None: + return False + for v in c.spec.values(): + if callable(v): + return False + return True + + +def _vectorized_matched_transform( + batch: pa.Table, + spec: Dict[str, Any], + on_pairs: Sequence[Tuple[str, str]], + update_cols: Sequence[str], + row_id_name: str, + update_schema: pa.Schema, +) -> pa.Table: + available = set(batch.schema.names) + arrays: list = [batch.column(f"t.{row_id_name}")] + for col in update_cols: + out_type = update_schema.field(col).type + if col in spec: + arrays.append(_resolve_spec_array(spec[col], batch, available, on_pairs, out_type)) + else: + arrays.append(batch.column(f"t.{col}")) + return pa.Table.from_arrays(arrays, schema=update_schema) + + +def _vectorized_self_merge_transform( + batch: pa.Table, + spec: Dict[str, Any], + update_cols: Sequence[str], + row_id_name: str, + update_schema: pa.Schema, +) -> pa.Table: + # In self-merge, s.X and t.X are the same row; the batch columns are + # unprefixed (it's the raw target read). 
+ arrays: list = [batch.column(row_id_name)] + for col in update_cols: + out_type = update_schema.field(col).type + if col in spec: + val = spec[col] + if isinstance(val, str) and (val.startswith("s.") or val.startswith("t.")): + ref = val[2:] + arrays.append(batch.column(ref) if ref in batch.schema.names + else pa.nulls(batch.num_rows, type=out_type)) + else: + arrays.append(pa.array([val] * batch.num_rows, type=out_type)) + else: + arrays.append(batch.column(col)) + return pa.Table.from_arrays(arrays, schema=update_schema) + + +def _vectorized_insert_transform( + batch: pa.Table, + spec: Dict[str, Any], + target_field_names: Sequence[str], + target_pa_schema: pa.Schema, +) -> pa.Table: + available = set(batch.schema.names) + arrays: list = [] + for col in target_field_names: + out_type = target_pa_schema.field(col).type + if col in spec: + arrays.append(_resolve_spec_array(spec[col], batch, available, (), out_type)) + else: + arrays.append(pa.nulls(batch.num_rows, type=out_type)) + return pa.Table.from_arrays(arrays, schema=target_pa_schema) + + +def _resolve_spec_array( + val: Any, + batch: pa.Table, + available: set, + on_pairs: Sequence[Tuple[str, str]], + out_type: pa.DataType, +): + if isinstance(val, str) and val.startswith("s."): + ref = val[2:] + if f"s.{ref}" in available: + return batch.column(f"s.{ref}") + # Equi-join drops the right-side join key; fall back to target's value. 
+ for sk, tk in on_pairs: + if sk == ref and f"t.{tk}" in available: + return batch.column(f"t.{tk}") + return pa.nulls(batch.num_rows, type=out_type) + if isinstance(val, str) and val.startswith("t."): + ref = val[2:] + col_name = f"t.{ref}" + return batch.column(col_name) if col_name in available else pa.nulls( + batch.num_rows, type=out_type + ) + return pa.array([val] * batch.num_rows, type=out_type) def _compute_matched_source_idx_ds( @@ -599,6 +745,18 @@ def _filter_unmatched(batch: pa.Table) -> pa.Table: right_on=tuple(f"t.{c}" for c in target_on), ) + if _clauses_use_vector_fast_path(clauses, None): + first_spec = clauses[0].spec + + def _fast(batch: pa.Table) -> pa.Table: + return _coerce_large_string_types( + _vectorized_insert_transform( + batch, first_spec, captured_field_names, out_schema + ) + ) + + return unmatched.map_batches(_fast, batch_format="pyarrow") + def _transform(batch: pa.Table) -> pa.Table: rows = batch.to_pylist() out = [] From 5fc8a7e55faaeef3ba546aebe3021fe7ce17aefc Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 11:14:45 +0800 Subject: [PATCH 17/32] [python] align ray merge_into cardinality and self-merge with spark --- .../pypaimon/ray/data_evolution_merge_into.py | 119 ++---------------- .../ray_data_evolution_merge_into_test.py | 28 +++-- 2 files changed, 27 insertions(+), 120 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 78feb532586c..af9921f92dbf 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -118,11 +118,10 @@ def merge_into( table = table.copy( {"commit.strict-mode.last-safe-snapshot": str(base_snapshot.id)} ) - is_self_merge = isinstance(source, str) and source == target # Row-precise routing needs a stable per-source-row id when merge_condition # may differ between source rows sharing the same ON key. 
- if when_not_matched and merge_condition is not None and not is_self_merge: + if when_not_matched and merge_condition is not None: source_ds = _add_paimon_src_idx(source_ds) from pypaimon.schema.data_types import PyarrowFieldParser @@ -151,12 +150,11 @@ def merge_into( target_pa_schema=target_pa_schema, update_cols=update_cols_union, catalog_options=catalog_options, - is_self_merge=is_self_merge, num_partitions=num_partitions, ) insert_ds = None - if not_matched_specs and not is_self_merge: + if not_matched_specs: matched_keys_ds = None if merge_condition is not None: matched_keys_ds = _compute_matched_source_idx_ds( @@ -228,7 +226,6 @@ def _build_matched_update_ds( target_pa_schema: pa.Schema, update_cols: Sequence[str], catalog_options: Dict[str, str], - is_self_merge: bool, num_partitions: int, ): from pypaimon.ray.ray_paimon import read_paimon @@ -249,17 +246,6 @@ def _build_matched_update_ds( target_ds = read_paimon(target_identifier, catalog_options, projection=projection) update_schema = _build_update_schema(target_pa_schema, update_cols, row_id_name) - if is_self_merge: - return _build_self_merge_update_ds( - target_ds=target_ds, - merge_condition=merge_condition, - clauses=clauses, - target_field_names=target_field_names, - update_cols=update_cols, - row_id_name=row_id_name, - update_schema=update_schema, - ) - target_renamed = target_ds.rename_columns( {c: f"t.{c}" for c in target_ds.schema().names} ) @@ -329,65 +315,6 @@ def _transform(batch: pa.Table) -> pa.Table: return joined.map_batches(_transform, batch_format="pyarrow") -def _build_self_merge_update_ds( - *, - target_ds, - merge_condition: Optional[Condition], - clauses: List[_NormalizedClause], - target_field_names: Sequence[str], - update_cols: Sequence[str], - row_id_name: str, - update_schema: pa.Schema, -): - captured_clauses = clauses - captured_merge_cond = merge_condition - captured_update_cols = list(update_cols) - captured_field_names = list(target_field_names) - captured_row_id_name = 
row_id_name - captured_schema = update_schema - - if _clauses_use_vector_fast_path(clauses, merge_condition): - first_spec = clauses[0].spec - - def _fast(batch: pa.Table) -> pa.Table: - return _vectorized_self_merge_transform( - batch, - first_spec, - captured_update_cols, - captured_row_id_name, - captured_schema, - ) - - return target_ds.map_batches(_fast, batch_format="pyarrow") - - def _transform(batch: pa.Table) -> pa.Table: - rows = batch.to_pylist() - out_row_ids: list = [] - out_cols: Dict[str, list] = {c: [] for c in captured_update_cols} - for row in rows: - s_row = dict(row) - t_row = dict(row) - combined = _prefixed(s_row, t_row) - if captured_merge_cond is not None and not captured_merge_cond(combined): - continue - for clause in captured_clauses: - if clause.condition is not None and not clause.condition(combined): - continue - new_values = _apply_set( - clause.spec, s_row, t_row, captured_field_names - ) - out_row_ids.append(t_row[captured_row_id_name]) - for col in captured_update_cols: - out_cols[col].append(new_values.get(col, t_row.get(col))) - break - return pa.Table.from_pydict( - {captured_row_id_name: out_row_ids, **out_cols}, - schema=captured_schema, - ) - - return target_ds.map_batches(_transform, batch_format="pyarrow") - - def _build_update_schema( target_pa_schema: pa.Schema, update_cols: Sequence[str], @@ -471,18 +398,17 @@ def _apply_group(group: pa.Table) -> pa.Table: if group.num_rows == 0: return pa.Table.from_pydict({"msgs_blob": pa.array([], type=pa.binary())}) + # Match Spark DE (checkCardinality=false): silently dedupe _ROW_ID, + # keep first occurrence per target row. 
group_row_ids = group.column(row_id_name).to_pylist() if len(set(group_row_ids)) != len(group_row_ids): seen: set = set() - dupes: set = set() - for rid in group_row_ids: - if rid in seen: - dupes.add(rid) - seen.add(rid) - raise ValueError( - f"MERGE INTO matched the same target _ROW_IDs {sorted(dupes)[:5]} " - f"via multiple source rows; source must be unique on the join keys." - ) + keep_indices: list = [] + for i, rid in enumerate(group_row_ids): + if rid not in seen: + seen.add(rid) + keep_indices.append(i) + group = group.take(pa.array(keep_indices, type=pa.int64())) for_update = group.drop_columns([frid_col]) worker = TableUpdateByRowId( @@ -568,31 +494,6 @@ def _vectorized_matched_transform( return pa.Table.from_arrays(arrays, schema=update_schema) -def _vectorized_self_merge_transform( - batch: pa.Table, - spec: Dict[str, Any], - update_cols: Sequence[str], - row_id_name: str, - update_schema: pa.Schema, -) -> pa.Table: - # In self-merge, s.X and t.X are the same row; the batch columns are - # unprefixed (it's the raw target read). 
- arrays: list = [batch.column(row_id_name)] - for col in update_cols: - out_type = update_schema.field(col).type - if col in spec: - val = spec[col] - if isinstance(val, str) and (val.startswith("s.") or val.startswith("t.")): - ref = val[2:] - arrays.append(batch.column(ref) if ref in batch.schema.names - else pa.nulls(batch.num_rows, type=out_type)) - else: - arrays.append(pa.array([val] * batch.num_rows, type=out_type)) - else: - arrays.append(batch.column(col)) - return pa.Table.from_arrays(arrays, schema=update_schema) - - def _vectorized_insert_transform( batch: pa.Table, spec: Dict[str, Any], diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index 9eea0b93cd09..076c07cef39c 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -606,7 +606,9 @@ def test_insert_dict_fills_unspecified_with_null(self): self.assertEqual(out['name'], ['a', None]) self.assertEqual(out['age'], [10, 99]) - def test_cardinality_violation_raises(self): + def test_multi_source_match_silently_picks_first(self): + # Spark DE sets checkCardinality=false: silently dedupe target _ROW_IDs + # rather than raising when source has multiple rows for the same key. 
target = self._create_table() self._write( target, @@ -629,15 +631,19 @@ def test_cardinality_violation_raises(self): schema=self.pa_schema, ) - with self.assertRaises(ValueError) as ctx: - merge_into( - target=target, - source=source, - catalog_options=self.catalog_options, - on=['id'], - when_matched=[WhenMatched(update='*')], - ) - self.assertIn('source must be unique', str(ctx.exception)) + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched=[WhenMatched(update='*')], + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1]) + # Exactly one of the source rows wins; we don't pin which. + self.assertIn(out['name'][0], ['x', 'y']) + self.assertIn(out['age'][0], [100, 200]) def test_combined_writes_single_snapshot(self): target = self._create_table() @@ -675,7 +681,7 @@ def test_combined_writes_single_snapshot(self): after = self._snapshot_id(target) self.assertEqual(after, before + 1) - def test_self_merge_skips_join(self): + def test_self_merge_via_normal_join(self): target = self._create_table() self._write( target, From 0c0465021d27898b51fc6d32462d82a45af43a6c Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 11:51:34 +0800 Subject: [PATCH 18/32] [python] distribute ray merge_into not-matched insert via anti-join Replace the driver-side matched-id set collection in the not-matched INSERT path with a distributed left_anti join on the per-row id, matching Spark's single LeftAnti predicate. Partition count is sized to the matched row count to keep hash partitions dense, since ray's join fails on empty partitions. 
Co-Authored-By: Claude Opus 4.7 --- .../pypaimon/ray/data_evolution_merge_into.py | 40 ++++++++++--------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index af9921f92dbf..6fa535b7fa2e 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -433,6 +433,9 @@ def _apply_group(group: pa.Table) -> pa.Table: PAIMON_SRC_IDX_COL = "_paimon_src_idx" MATCHED_SRC_IDX_MARKER = "_paimon_matched_src_idx" +# Min rows per hash partition for the anti-join; keeps partitions non-empty +# (ray's join crashes on empty hash partitions). +_ANTI_JOIN_ROWS_PER_PARTITION = 8 def _add_paimon_src_idx(source_ds): @@ -611,26 +614,25 @@ def _build_not_matched_insert_ds( source_renamed = source_ds.rename_columns({c: f"s.{c}" for c in source_cols}) if matched_idx_ds is not None: - # Ray join hits a pyarrow projection bug when the right side is - # empty; collect matched-idx to a driver set instead. The set is - # bounded by # of matched source rows × ~32B per row-hash. - matched_idx_set: set = set() - for batch in matched_idx_ds.iter_batches(batch_format="pyarrow"): - if batch.num_rows == 0: - continue - matched_idx_set.update( - batch.column(MATCHED_SRC_IDX_MARKER).to_pylist() + # ray's join is equi-only, so anti-join source against the matched + # per-row ids (Spark folds this into one LeftAnti predicate). Size + # partitions to the matched count: ray's join crashes on empty hash + # partitions, so keep them dense. 
+ matched_idx_ds = matched_idx_ds.materialize() + matched_count = matched_idx_ds.count() + if matched_count == 0: + unmatched = source_renamed + else: + anti_np = max( + 1, min(num_partitions, matched_count // _ANTI_JOIN_ROWS_PER_PARTITION) + ) + unmatched = source_renamed.join( + matched_idx_ds, + join_type="left_anti", + num_partitions=anti_np, + on=(f"s.{PAIMON_SRC_IDX_COL}",), + right_on=(MATCHED_SRC_IDX_MARKER,), ) - captured_idx_set = matched_idx_set - - def _filter_unmatched(batch: pa.Table) -> pa.Table: - idx_arr = batch.column(f"s.{PAIMON_SRC_IDX_COL}").to_pylist() - mask = [v not in captured_idx_set for v in idx_arr] - return batch.filter(pa.array(mask)) - - unmatched = source_renamed.map_batches( - _filter_unmatched, batch_format="pyarrow" - ) else: target_ds = read_paimon( target_identifier, catalog_options, projection=list(target_on) From 51d1bb53ad4d9491e46523175e3890c8f863c740 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 12:43:53 +0800 Subject: [PATCH 19/32] [python] guard ray merge_into correctness and dependency edges - Reject multi-source cardinality by default; add allow_multiple_matches opt-in for deterministic keep-first. - Refuse blob-column writes loudly instead of emitting wrong-format files. - Check Dataset.join (ray>=2.50) at call time and restore the ray extra floor to 2.10, so read/sink users on older ray are unaffected. 
Co-Authored-By: Claude Opus 4.8 --- .../pypaimon/ray/data_evolution_merge_into.py | 48 +++++++++++++- .../ray_data_evolution_merge_into_test.py | 64 +++++++++++++++++-- paimon-python/setup.py | 2 +- 3 files changed, 107 insertions(+), 7 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 6fa535b7fa2e..8b21b3b1d560 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -68,7 +68,9 @@ def merge_into( num_partitions: Optional[int] = None, ray_remote_args: Optional[Dict[str, Any]] = None, concurrency: Optional[int] = None, + allow_multiple_matches: bool = False, ) -> None: + _require_ray_join() num_partitions = _resolve_num_partitions(num_partitions) when_matched = list(when_matched) when_not_matched = list(when_not_matched) @@ -108,6 +110,11 @@ def merge_into( for c in when_not_matched ] + written_cols: set = set() + for clause in matched_specs + not_matched_specs: + written_cols.update(clause.spec.keys()) + _reject_blob_writes(table, written_cols) + source_ds = _normalize_source(source, catalog_options) _validate_source_on_cols(source_ds, source_on_cols) @@ -187,6 +194,7 @@ def merge_into( table, update_cols_union, ray_remote_args=ray_remote_args, + allow_multiple_matches=allow_multiple_matches, ) ) if insert_ds is not None: @@ -332,6 +340,7 @@ def _distributed_update_apply( write_update_cols: Sequence[str], *, ray_remote_args: Optional[Dict[str, Any]] = None, + allow_multiple_matches: bool = False, ) -> list: import bisect import pickle @@ -398,10 +407,17 @@ def _apply_group(group: pa.Table) -> pa.Table: if group.num_rows == 0: return pa.Table.from_pydict({"msgs_blob": pa.array([], type=pa.binary())}) - # Match Spark DE (checkCardinality=false): silently dedupe _ROW_ID, - # keep first occurrence per target row. + # One target _ROW_ID matched by several source rows. 
Default: refuse + # (the winning value is otherwise undefined, as in Spark DE's + # checkCardinality=false). Opt-in keeps the first match deterministically. group_row_ids = group.column(row_id_name).to_pylist() if len(set(group_row_ids)) != len(group_row_ids): + if not allow_multiple_matches: + raise ValueError( + "MERGE matched multiple source rows to the same target " + "_ROW_ID. Deduplicate the source, or pass " + "allow_multiple_matches=True to keep the first match." + ) seen: set = set() keep_indices: list = [] for i, rid in enumerate(group_row_ids): @@ -775,6 +791,34 @@ def _scan_global_index_entries(table, snapshot): ) +def _require_ray_join() -> None: + """merge_into relies on ``Dataset.join`` (ray>=2.50). Read/sink users on + older ray are unaffected unless they call this, so check only here.""" + import ray + from ray.data import Dataset + + if not hasattr(Dataset, "join"): + raise RuntimeError( + f"merge_into requires ray>=2.50 (Dataset.join); " + f"installed ray is {ray.__version__}." + ) + + +def _reject_blob_writes(table, written_cols: set) -> None: + """Blob columns live in a separate .blob format we cannot produce here; + refuse loudly instead of emitting wrong-format files.""" + blob_cols = [ + f.name + for f in table.table_schema.fields + if f.name in written_cols and getattr(f.type, "type", None) == "BLOB" + ] + if blob_cols: + raise NotImplementedError( + f"merge_into cannot write blob columns {blob_cols}; " + f"updating or inserting blob columns is not supported." 
+ ) + + def _union_update_cols(clauses: List[_NormalizedClause]) -> List[str]: seen: List[str] = [] seen_set: set = set() diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index 076c07cef39c..9a124966252c 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -606,9 +606,43 @@ def test_insert_dict_fills_unspecified_with_null(self): self.assertEqual(out['name'], ['a', None]) self.assertEqual(out['age'], [10, 99]) - def test_multi_source_match_silently_picks_first(self): - # Spark DE sets checkCardinality=false: silently dedupe target _ROW_IDs - # rather than raising when source has multiple rows for the same key. + def test_multi_source_match_raises_by_default(self): + # One target row matched by several source rows: the winning value is + # undefined (Spark DE's checkCardinality=false), so we refuse by default. + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['a'], + 'age': pa.array([10], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([1, 1], type=pa.int32()), + 'name': ['x', 'y'], + 'age': pa.array([100, 200], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + with self.assertRaises(Exception) as ctx: + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched=[WhenMatched(update='*')], + ) + self.assertIn("multiple source rows", str(ctx.exception)) + + def test_multi_source_match_allow_keeps_first(self): + # Opt-in: allow_multiple_matches keeps the first match deterministically. 
target = self._create_table() self._write( target, @@ -637,14 +671,36 @@ def test_multi_source_match_silently_picks_first(self): catalog_options=self.catalog_options, on=['id'], when_matched=[WhenMatched(update='*')], + allow_multiple_matches=True, ) out = self._read_sorted(target) self.assertEqual(out['id'], [1]) - # Exactly one of the source rows wins; we don't pin which. + # One source row wins; we don't pin which. self.assertIn(out['name'][0], ['x', 'y']) self.assertIn(out['age'][0], [100, 200]) + def test_blob_write_is_rejected(self): + # Updating/inserting a blob column is unsupported and must fail loudly + # rather than emit wrong-format files. + import types + + from pypaimon.ray.data_evolution_merge_into import _reject_blob_writes + from pypaimon.schema.data_types import AtomicType, DataField + + fake_table = types.SimpleNamespace( + table_schema=types.SimpleNamespace( + fields=[ + DataField(0, 'id', AtomicType('INT')), + DataField(1, 'payload', AtomicType('BLOB')), + ] + ) + ) + with self.assertRaises(NotImplementedError): + _reject_blob_writes(fake_table, {'payload'}) + # Writing only non-blob columns is allowed even when blob fields exist. 
+ _reject_blob_writes(fake_table, {'id'}) + def test_combined_writes_single_snapshot(self): target = self._create_table() self._write( diff --git a/paimon-python/setup.py b/paimon-python/setup.py index 431f19db312c..1f464b1988b8 100644 --- a/paimon-python/setup.py +++ b/paimon-python/setup.py @@ -152,7 +152,7 @@ def read_requirements(): }, extras_require={ 'ray': [ - 'ray>=2.50,<3; python_version>="3.7"', + 'ray>=2.10,<3; python_version>="3.7"', ], 'torch': [ 'torch', From bacd07e606fecc17732ce7abb7f256ae982f0f8e Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 14:04:37 +0800 Subject: [PATCH 20/32] [python] narrow ray merge_into blob guard to update path --- .../pypaimon/ray/data_evolution_merge_into.py | 18 ++++++++---------- .../ray_data_evolution_merge_into_test.py | 11 ++++------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 8b21b3b1d560..6567ca6d9f4e 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -110,10 +110,10 @@ def merge_into( for c in when_not_matched ] - written_cols: set = set() - for clause in matched_specs + not_matched_specs: - written_cols.update(clause.spec.keys()) - _reject_blob_writes(table, written_cols) + update_cols: set = set() + for clause in matched_specs: + update_cols.update(clause.spec.keys()) + _reject_blob_updates(table, update_cols) source_ds = _normalize_source(source, catalog_options) _validate_source_on_cols(source_ds, source_on_cols) @@ -804,18 +804,16 @@ def _require_ray_join() -> None: ) -def _reject_blob_writes(table, written_cols: set) -> None: - """Blob columns live in a separate .blob format we cannot produce here; - refuse loudly instead of emitting wrong-format files.""" +def _reject_blob_updates(table, update_cols: set) -> None: blob_cols = [ f.name for f in table.table_schema.fields 
- if f.name in written_cols and getattr(f.type, "type", None) == "BLOB" + if f.name in update_cols and getattr(f.type, "type", None) == "BLOB" ] if blob_cols: raise NotImplementedError( - f"merge_into cannot write blob columns {blob_cols}; " - f"updating or inserting blob columns is not supported." + f"merge_into cannot update blob columns {blob_cols}; " + f"the row-id rewrite path skips .blob files." ) diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index 9a124966252c..e2019b84b7c3 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -680,12 +680,10 @@ def test_multi_source_match_allow_keeps_first(self): self.assertIn(out['name'][0], ['x', 'y']) self.assertIn(out['age'][0], [100, 200]) - def test_blob_write_is_rejected(self): - # Updating/inserting a blob column is unsupported and must fail loudly - # rather than emit wrong-format files. + def test_blob_update_is_rejected(self): import types - from pypaimon.ray.data_evolution_merge_into import _reject_blob_writes + from pypaimon.ray.data_evolution_merge_into import _reject_blob_updates from pypaimon.schema.data_types import AtomicType, DataField fake_table = types.SimpleNamespace( @@ -697,9 +695,8 @@ def test_blob_write_is_rejected(self): ) ) with self.assertRaises(NotImplementedError): - _reject_blob_writes(fake_table, {'payload'}) - # Writing only non-blob columns is allowed even when blob fields exist. 
- _reject_blob_writes(fake_table, {'id'}) + _reject_blob_updates(fake_table, {'payload'}) + _reject_blob_updates(fake_table, {'id'}) def test_combined_writes_single_snapshot(self): target = self._create_table() From 2b76ad29233899e50eb769220e8af5c78d728566 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 15:29:20 +0800 Subject: [PATCH 21/32] [python] size ray merge_into update shuffle to actual group count The update path grouped by _FIRST_ROW_ID with ray's default 200 hash partitions, spawning hundreds of empty reduce tasks on small and medium merges. Cap the groupby partitions at the distinct group count (one per target data file), bounded by 200 so large tables keep today's behavior. Verified on a 2-node ray cluster: an 18000-row merge drops the shuffle from num_partitions=200 to 4 with no correctness change. --- .../pypaimon/ray/data_evolution_merge_into.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 6567ca6d9f4e..3f25a5577f13 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -436,7 +436,11 @@ def _apply_group(group: pa.Table) -> pa.Table: msgs = worker.update_columns(for_update, list(captured_cols)) return pa.Table.from_pydict({"msgs_blob": [pickle.dumps(msgs)]}) - msgs_ds = with_frid.groupby(frid_col).map_groups( + # One group per target data file (distinct _FIRST_ROW_ID). Size the shuffle + # to the real group count instead of ray's default 200, which otherwise + # spawns hundreds of empty reduce tasks on small/medium merges. 
+ group_partitions = max(1, min(len(captured_sorted), _MAX_GROUP_PARTITIONS)) + msgs_ds = with_frid.groupby(frid_col, num_partitions=group_partitions).map_groups( _apply_group, batch_format="pyarrow" ) @@ -453,6 +457,10 @@ def _apply_group(group: pa.Table) -> pa.Table: # (ray's join crashes on empty hash partitions). _ANTI_JOIN_ROWS_PER_PARTITION = 8 +# Upper bound on the update groupby shuffle, matching ray's default hash-shuffle +# parallelism so large tables keep today's behavior while small ones shrink. +_MAX_GROUP_PARTITIONS = 200 + def _add_paimon_src_idx(source_ds): """Append a unique per-row index so INSERTs are routed by row identity, From ad70c4d838636f56dbd1830b010681009fc88100 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 15:34:08 +0800 Subject: [PATCH 22/32] [python] vectorize _assign_frid row-id bucketing _assign_frid mapped each matched _ROW_ID to its file's first-row-id with a per-row Python bisect over to_pylist(), a CPU hot spot when many rows match. Replace it with a single numpy searchsorted over the matched batch, keeping the null and out-of-range guards. Verified: 26 ray merge_into unit tests pass; an 18000-row merge on a 2-node ray cluster stays correct. 
--- .../pypaimon/ray/data_evolution_merge_into.py | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 3f25a5577f13..53938dc5b11e 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -342,7 +342,7 @@ def _distributed_update_apply( ray_remote_args: Optional[Dict[str, Any]] = None, allow_multiple_matches: bool = False, ) -> list: - import bisect + import numpy as np import pickle import uuid @@ -377,26 +377,33 @@ def _distributed_update_apply( frid_col = "_FIRST_ROW_ID" captured_sorted = sorted_first_row_ids + captured_sorted_arr = np.asarray(captured_sorted, dtype=np.int64) + first = captured_sorted_arr[0] captured_precomputed = precomputed_info total_row_count = planner.total_row_count def _assign_frid(batch: pa.Table) -> pa.Table: if batch.num_rows == 0: return batch.append_column(frid_col, pa.array([], type=pa.int64())) - row_ids = batch.column(row_id_name).to_pylist() - bisect_right = bisect.bisect_right - values: list = [] - first = captured_sorted[0] - for rid in row_ids: - # Out-of-range _ROW_IDs would silently map via bisect wrap-around. - if rid is None or rid < first or rid >= total_row_count: - raise ValueError( - f"_ROW_ID {rid} is out of valid range " - f"[{first}, {total_row_count}); planner snapshot is stale " - f"or matched rows come from a different table." - ) - values.append(captured_sorted[bisect_right(captured_sorted, rid) - 1]) - return batch.append_column(frid_col, pa.array(values, type=pa.int64())) + rid_col = batch.column(row_id_name) + if rid_col.null_count: + raise ValueError( + "_ROW_ID is null; planner snapshot is stale " + "or matched rows come from a different table." + ) + rids = rid_col.to_numpy(zero_copy_only=False) + # Out-of-range _ROW_IDs would silently map via searchsorted wrap-around. 
+ out_of_range = (rids < first) | (rids >= total_row_count) + if out_of_range.any(): + bad = rids[out_of_range][0] + raise ValueError( + f"_ROW_ID {bad} is out of valid range " + f"[{first}, {total_row_count}); planner snapshot is stale " + f"or matched rows come from a different table." + ) + idx = np.searchsorted(captured_sorted_arr, rids, side="right") - 1 + frids = captured_sorted_arr[idx] + return batch.append_column(frid_col, pa.array(frids, type=pa.int64())) with_frid = update_ds.map_batches(_assign_frid, batch_format="pyarrow") From e9fb3a609de8b5d02171d0199a4ce0e5ab89d802 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 16:00:37 +0800 Subject: [PATCH 23/32] [python] fuse ray merge_into matched/not-matched into one outer join When both when_matched and when_not_matched run on a non-empty target, build the UPDATE and INSERT datasets from one materialized LEFT_OUTER join instead of reading and shuffling the target table twice. The join shuffle dominates cost at scale, so routing matched (non-null target) and not-matched (null target) rows from a single join halves it. --- .../pypaimon/ray/data_evolution_merge_into.py | 315 ++++++++++++++---- 1 file changed, 248 insertions(+), 67 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 53938dc5b11e..2cf0148addae 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -141,50 +141,72 @@ def merge_into( _check_global_index_for_insert(table, base_snapshot) update_ds = None + insert_ds = None update_cols_union: List[str] = [] - # Empty target → no rows can match; matched UPDATE is a no-op. - if matched_specs and base_snapshot is not None: + + # With both clauses on a non-empty target, matched and not-matched routing + # share the same source/target equi-join. 
Build them from one materialized + # LEFT_OUTER join instead of reading and shuffling the target table twice. + if matched_specs and not_matched_specs and base_snapshot is not None: update_cols_union = _union_update_cols(matched_specs) _check_global_index_collision(table, base_snapshot, update_cols_union) - update_ds = _build_matched_update_ds( + update_ds, insert_ds = _build_unified_both( target_identifier=target, source_ds=source_ds, target_on=target_on_cols, source_on=source_on_cols, merge_condition=merge_condition, - clauses=matched_specs, + matched_clauses=matched_specs, + not_matched_clauses=not_matched_specs, target_field_names=target_field_names, target_pa_schema=target_pa_schema, update_cols=update_cols_union, catalog_options=catalog_options, num_partitions=num_partitions, ) - - insert_ds = None - if not_matched_specs: - matched_keys_ds = None - if merge_condition is not None: - matched_keys_ds = _compute_matched_source_idx_ds( + else: + # Empty target → no rows can match; matched UPDATE is a no-op. 
+ if matched_specs and base_snapshot is not None: + update_cols_union = _union_update_cols(matched_specs) + _check_global_index_collision(table, base_snapshot, update_cols_union) + update_ds = _build_matched_update_ds( target_identifier=target, source_ds=source_ds, target_on=target_on_cols, source_on=source_on_cols, merge_condition=merge_condition, + clauses=matched_specs, + target_field_names=target_field_names, + target_pa_schema=target_pa_schema, + update_cols=update_cols_union, catalog_options=catalog_options, num_partitions=num_partitions, ) - insert_ds = _build_not_matched_insert_ds( - target_identifier=target, - source_ds=source_ds, - target_on=target_on_cols, - source_on=source_on_cols, - clauses=not_matched_specs, - target_field_names=target_field_names, - target_pa_schema=target_pa_schema, - catalog_options=catalog_options, - num_partitions=num_partitions, - matched_idx_ds=matched_keys_ds, - ) + + if not_matched_specs: + matched_keys_ds = None + if merge_condition is not None: + matched_keys_ds = _compute_matched_source_idx_ds( + target_identifier=target, + source_ds=source_ds, + target_on=target_on_cols, + source_on=source_on_cols, + merge_condition=merge_condition, + catalog_options=catalog_options, + num_partitions=num_partitions, + ) + insert_ds = _build_not_matched_insert_ds( + target_identifier=target, + source_ds=source_ds, + target_on=target_on_cols, + source_on=source_on_cols, + clauses=not_matched_specs, + target_field_names=target_field_names, + target_pa_schema=target_pa_schema, + catalog_options=catalog_options, + num_partitions=num_partitions, + matched_idx_ds=matched_keys_ds, + ) all_msgs: list = [] if update_ds is not None: @@ -293,36 +315,56 @@ def _fast(batch: pa.Table) -> pa.Table: return joined.map_batches(_fast, batch_format="pyarrow") def _transform(batch: pa.Table) -> pa.Table: - rows = batch.to_pylist() - out_row_ids: list = [] - out_cols: Dict[str, list] = {c: [] for c in captured_update_cols} - for row in rows: - s_row = {k[2:]: 
v for k, v in row.items() if k.startswith("s.")} - t_row = {k[2:]: v for k, v in row.items() if k.startswith("t.")} - for s_key, t_key in captured_on_pairs: - if s_key not in s_row and t_key in t_row: - s_row[s_key] = t_row[t_key] - combined = _prefixed(s_row, t_row) - if captured_merge_cond is not None and not captured_merge_cond(combined): - continue - for clause in captured_clauses: - if clause.condition is not None and not clause.condition(combined): - continue - new_values = _apply_set( - clause.spec, s_row, t_row, captured_field_names - ) - out_row_ids.append(t_row[captured_row_id_name]) - for col in captured_update_cols: - out_cols[col].append(new_values.get(col, t_row.get(col))) - break - return pa.Table.from_pydict( - {captured_row_id_name: out_row_ids, **out_cols}, - schema=captured_schema, + return _apply_matched_transform( + batch, + captured_clauses, + captured_merge_cond, + captured_on_pairs, + captured_update_cols, + captured_field_names, + captured_row_id_name, + captured_schema, ) return joined.map_batches(_transform, batch_format="pyarrow") +def _apply_matched_transform( + batch: pa.Table, + clauses: List[_NormalizedClause], + merge_condition: Optional[Condition], + on_pairs: Sequence[Tuple[str, str]], + update_cols: Sequence[str], + field_names: Sequence[str], + row_id_name: str, + update_schema: pa.Schema, +) -> pa.Table: + rows = batch.to_pylist() + out_row_ids: list = [] + out_cols: Dict[str, list] = {c: [] for c in update_cols} + for row in rows: + s_row = {k[2:]: v for k, v in row.items() if k.startswith("s.")} + t_row = {k[2:]: v for k, v in row.items() if k.startswith("t.")} + for s_key, t_key in on_pairs: + if s_key not in s_row and t_key in t_row: + s_row[s_key] = t_row[t_key] + combined = _prefixed(s_row, t_row) + if merge_condition is not None and not merge_condition(combined): + continue + for clause in clauses: + if clause.condition is not None and not clause.condition(combined): + continue + new_values = _apply_set(clause.spec, 
s_row, t_row, field_names) + out_row_ids.append(t_row[row_id_name]) + for col in update_cols: + out_cols[col].append(new_values.get(col, t_row.get(col))) + break + return pa.Table.from_pydict( + {row_id_name: out_row_ids, **out_cols}, + schema=update_schema, + ) + + def _build_update_schema( target_pa_schema: pa.Schema, update_cols: Sequence[str], @@ -692,29 +734,168 @@ def _fast(batch: pa.Table) -> pa.Table: return unmatched.map_batches(_fast, batch_format="pyarrow") def _transform(batch: pa.Table) -> pa.Table: - rows = batch.to_pylist() - out = [] - for row in rows: - s_row = {k[2:]: v for k, v in row.items() if k.startswith("s.")} - s_row.pop(PAIMON_SRC_IDX_COL, None) - combined = _prefixed(s_row, None) - for clause in captured_clauses: - if clause.condition is not None and not clause.condition(combined): - continue - out.append( - _apply_set( - clause.spec, - s_row, - None, - captured_field_names, - null_unspecified=True, + return _apply_insert_transform( + batch, captured_clauses, captured_field_names, out_schema + ) + + return unmatched.map_batches(_transform, batch_format="pyarrow") + + +def _apply_insert_transform( + batch: pa.Table, + clauses: List[_NormalizedClause], + field_names: Sequence[str], + out_schema: pa.Schema, +) -> pa.Table: + from pypaimon.ray.shuffle import _coerce_large_string_types + + rows = batch.to_pylist() + out = [] + for row in rows: + s_row = {k[2:]: v for k, v in row.items() if k.startswith("s.")} + s_row.pop(PAIMON_SRC_IDX_COL, None) + combined = _prefixed(s_row, None) + for clause in clauses: + if clause.condition is not None and not clause.condition(combined): + continue + out.append( + _apply_set( + clause.spec, s_row, None, field_names, null_unspecified=True + ) + ) + break + aligned = [{name: r.get(name) for name in field_names} for r in out] + return _coerce_large_string_types(pa.Table.from_pylist(aligned, schema=out_schema)) + + +def _build_unified_both( + *, + target_identifier: str, + source_ds, + target_on: 
Sequence[str], + source_on: Sequence[str], + merge_condition: Optional[Condition], + matched_clauses: List[_NormalizedClause], + not_matched_clauses: List[_NormalizedClause], + target_field_names: Sequence[str], + target_pa_schema: pa.Schema, + update_cols: Sequence[str], + catalog_options: Dict[str, str], + num_partitions: int, +): + import pyarrow.compute as pc + + from pypaimon.ray.ray_paimon import read_paimon + from pypaimon.ray.shuffle import _coerce_large_string_types + from pypaimon.table.special_fields import SpecialFields + + row_id_name = SpecialFields.ROW_ID.name + + needs_full = merge_condition is not None or any( + c.condition is not None for c in matched_clauses + ) + if needs_full: + needed_cols = list(target_field_names) + else: + needed_cols = _needed_target_cols( + matched_clauses, target_on, update_cols, target_field_names + ) + projection = [row_id_name] + [c for c in needed_cols if c != row_id_name] + target_ds = read_paimon(target_identifier, catalog_options, projection=projection) + target_renamed = target_ds.rename_columns( + {c: f"t.{c}" for c in target_ds.schema().names} + ) + source_schema = source_ds.schema() + source_cols = list(source_schema.names) if source_schema is not None else list(source_on) + source_renamed = source_ds.rename_columns({c: f"s.{c}" for c in source_cols}) + + # One LEFT_OUTER join feeds both branches: rows with a non-null target side + # are matched (UPDATE), null target side means no key match (INSERT). The + # join shuffle is the dominant cost, so materialize once and route both ways + # instead of reading and shuffling the target table twice. 
+ joined = source_renamed.join( + target_renamed, + join_type="left_outer", + num_partitions=num_partitions, + on=tuple(f"s.{c}" for c in source_on), + right_on=tuple(f"t.{c}" for c in target_on), + ).materialize() + + t_row_id_col = f"t.{row_id_name}" + on_pairs = list(zip(source_on, target_on)) + update_schema = _build_update_schema(target_pa_schema, update_cols, row_id_name) + + use_fast_matched = _clauses_use_vector_fast_path(matched_clauses, merge_condition) + first_matched_spec = matched_clauses[0].spec if use_fast_matched else None + m_update_cols = list(update_cols) + m_field_names = list(target_field_names) + + def _matched_batch(batch: pa.Table) -> pa.Table: + sub = batch.filter(pc.is_valid(batch.column(t_row_id_col))) + if use_fast_matched: + return _vectorized_matched_transform( + sub, first_matched_spec, on_pairs, m_update_cols, + row_id_name, update_schema, + ) + return _apply_matched_transform( + sub, matched_clauses, merge_condition, on_pairs, m_update_cols, + m_field_names, row_id_name, update_schema, + ) + + update_ds = joined.map_batches(_matched_batch, batch_format="pyarrow") + + i_field_names = list(target_field_names) + if merge_condition is None: + use_fast_insert = _clauses_use_vector_fast_path(not_matched_clauses, None) + first_insert_spec = not_matched_clauses[0].spec if use_fast_insert else None + + def _insert_batch(batch: pa.Table) -> pa.Table: + sub = batch.filter(pc.is_null(batch.column(t_row_id_col))) + if use_fast_insert: + return _coerce_large_string_types( + _vectorized_insert_transform( + sub, first_insert_spec, i_field_names, target_pa_schema ) ) - break - aligned = [{name: r.get(name) for name in captured_field_names} for r in out] - return _coerce_large_string_types(pa.Table.from_pylist(aligned, schema=out_schema)) + return _apply_insert_transform( + sub, not_matched_clauses, i_field_names, target_pa_schema + ) - return unmatched.map_batches(_transform, batch_format="pyarrow") + insert_ds = joined.map_batches(_insert_batch, 
batch_format="pyarrow") + else: + idx_schema = pa.schema([pa.field(MATCHED_SRC_IDX_MARKER, pa.int64())]) + + def _emit_matched_idx(batch: pa.Table) -> pa.Table: + sub = batch.filter(pc.is_valid(batch.column(t_row_id_col))) + out_idx: list = [] + for row in sub.to_pylist(): + s_row = {k[2:]: v for k, v in row.items() if k.startswith("s.")} + t_row = {k[2:]: v for k, v in row.items() if k.startswith("t.")} + for sk, tk in on_pairs: + if sk not in s_row and tk in t_row: + s_row[sk] = t_row[tk] + combined = _prefixed(s_row, t_row) + if merge_condition(combined): + out_idx.append(s_row.get(PAIMON_SRC_IDX_COL)) + return pa.Table.from_pydict( + {MATCHED_SRC_IDX_MARKER: out_idx}, schema=idx_schema + ) + + matched_idx_ds = joined.map_batches(_emit_matched_idx, batch_format="pyarrow") + insert_ds = _build_not_matched_insert_ds( + target_identifier=target_identifier, + source_ds=source_ds, + target_on=target_on, + source_on=source_on, + clauses=not_matched_clauses, + target_field_names=target_field_names, + target_pa_schema=target_pa_schema, + catalog_options=catalog_options, + num_partitions=num_partitions, + matched_idx_ds=matched_idx_ds, + ) + + return update_ds, insert_ds def _distributed_write_collect_msgs( From 74a56a56bd40785e08648bff1c1ac43d5e55b37a Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 16:21:22 +0800 Subject: [PATCH 24/32] [python] cover ray merge_into unified path with both clauses Add two cases for the single-outer-join path: combined update+insert under a merge_condition, and a matched clause-level condition with no merge_condition (the full-target-read branch). 
--- .../ray_data_evolution_merge_into_test.py | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index e2019b84b7c3..6060fe2227f5 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -530,6 +530,89 @@ def test_combined_update_and_insert(self): self.assertEqual(out['name'], ['a', 'b2', 'c']) self.assertEqual(out['age'], [10, 22, 30]) + def test_combined_update_and_insert_with_merge_condition(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['a', 'b'], + 'age': pa.array([10, 20], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': ['n1', 'n2', 'n3'], + 'age': pa.array([100, 5, 30], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + merge_condition=lambda r: r['s.age'] > r['t.age'], + when_matched=[WhenMatched(update='*')], + when_not_matched=[WhenNotMatched(insert='*')], + ) + + out = self._read_sorted(target) + self.assertEqual(sorted(out['id']), [1, 2, 2, 3]) + rows = sorted(zip(out['id'], out['name'], out['age'])) + self.assertEqual( + rows, + [(1, 'n1', 100), (2, 'b', 20), (2, 'n2', 5), (3, 'n3', 30)], + ) + + def test_combined_matched_clause_condition_no_merge_condition(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['a', 'b'], + 'age': pa.array([10, 20], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source = pa.Table.from_pydict( + { + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': ['n1', 
'n2', 'n3'], + 'age': pa.array([100, 5, 30], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched=[ + WhenMatched( + update={'name': 's.name'}, + condition=lambda r: r['s.age'] > 50, + ) + ], + when_not_matched=[WhenNotMatched(insert='*')], + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2, 3]) + rows = sorted(zip(out['id'], out['name'], out['age'])) + self.assertEqual(rows, [(1, 'n1', 10), (2, 'b', 20), (3, 'n3', 30)]) + def test_on_with_renamed_columns(self): target = self._create_table() self._write( From e2ce4ada31a6e7fe3e8e87b7bbc360308c502d99 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Fri, 29 May 2026 23:17:02 +0800 Subject: [PATCH 25/32] [python] mint ray merge_into src idx per-block, fix '*' rename and empty-target insert - '*' SET spec resolves a renamed ON key via the source column, preventing NULL writes into the key column. - Inserting into an empty target skips all joins, avoiding ray's empty hash-partition crash. - Replace the zip(range) src-index with a deterministic per-block running offset: no realignment shuffle, no extra full copy, and no shuffle-materialize barrier (which deadlocked ray at <=2 CPUs). Handles both pandas- and arrow-backed source blocks. 
--- .../pypaimon/ray/data_evolution_merge_into.py | 47 +++++-- .../ray_data_evolution_merge_into_test.py | 115 ++++++++++++++++++ 2 files changed, 153 insertions(+), 9 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 2cf0148addae..fd28e2744fc5 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -95,16 +95,17 @@ def merge_into( ) target_field_names = list(table.field_names) + on_map = dict(zip(target_on_cols, source_on_cols)) matched_specs = [ _NormalizedClause( - spec=_normalize_set_spec(c.update, target_field_names), + spec=_normalize_set_spec(c.update, target_field_names, on_map), condition=c.condition, ) for c in when_matched ] not_matched_specs = [ _NormalizedClause( - spec=_normalize_set_spec(c.insert, target_field_names), + spec=_normalize_set_spec(c.insert, target_field_names, on_map), condition=c.condition, ) for c in when_not_matched @@ -184,8 +185,10 @@ def merge_into( ) if not_matched_specs: + # Empty target: nothing can match, so every source row inserts. + # Skip all joins (ray's hash join crashes on empty partitions). 
matched_keys_ds = None - if merge_condition is not None: + if base_snapshot is not None and merge_condition is not None: matched_keys_ds = _compute_matched_source_idx_ds( target_identifier=target, source_ds=source_ds, @@ -206,6 +209,7 @@ def merge_into( catalog_options=catalog_options, num_partitions=num_partitions, matched_idx_ds=matched_keys_ds, + target_empty=base_snapshot is None, ) all_msgs: list = [] @@ -511,15 +515,34 @@ def _apply_group(group: pa.Table) -> pa.Table: _MAX_GROUP_PARTITIONS = 200 +def _assign_src_idx_block(block, start): + import numpy as np + import pyarrow as pa + + if not isinstance(block, pa.Table): + block = pa.Table.from_pandas(block, preserve_index=False) + idx = pa.array(np.arange(start, start + block.num_rows, dtype=np.int64)) + return block.append_column(PAIMON_SRC_IDX_COL, idx) + + def _add_paimon_src_idx(source_ds): """Append a unique per-row index so INSERTs are routed by row identity, - not by content. Materialize once so count() + zip don't re-run source.""" + not by content. Mint ids per block from a running offset (offset derived + from block metadata, so it is deterministic under re-execution). 
Avoids + zip's realignment shuffle and the extra full copy it forces.""" import ray materialized = source_ds.materialize() - n = materialized.count() - idx_ds = ray.data.range(n).rename_columns({"id": PAIMON_SRC_IDX_COL}) - return materialized.zip(idx_ds) + assign = ray.remote(_assign_src_idx_block) + offset = 0 + refs = [] + for bundle in materialized.iter_internal_ref_bundles(): + for block_ref, meta in bundle.blocks: + refs.append(assign.remote(block_ref, offset)) + offset += meta.num_rows + if not refs: + return materialized + return ray.data.from_arrow_refs(refs) def _resolve_num_partitions(num_partitions: Optional[int]) -> int: @@ -674,6 +697,7 @@ def _build_not_matched_insert_ds( catalog_options: Dict[str, str], num_partitions: int, matched_idx_ds=None, + target_empty: bool = False, ): from pypaimon.ray.ray_paimon import read_paimon from pypaimon.ray.shuffle import _coerce_large_string_types @@ -686,7 +710,9 @@ def _build_not_matched_insert_ds( source_cols = list(source_schema.names) if source_schema is not None else list(source_on) source_renamed = source_ds.rename_columns({c: f"s.{c}" for c in source_cols}) - if matched_idx_ds is not None: + if target_empty: + unmatched = source_renamed + elif matched_idx_ds is not None: # ray's join is equi-only, so anti-join source against the matched # per-row ids (Spark folds this into one LeftAnti predicate). Size # partitions to the matched count: ray's join crashes on empty hash @@ -1043,13 +1069,16 @@ def _needed_target_cols( def _normalize_set_spec( spec: SetSpec, target_field_names: Sequence[str], + on_map: Optional[Mapping[str, str]] = None, ) -> Dict[str, Any]: + on_map = on_map or {} if isinstance(spec, str): if spec != "*": raise ValueError( f"SET spec strings other than '*' are not supported; got {spec!r}." ) - return {col: f"s.{col}" for col in target_field_names} + # A renamed ON key resolves via the source's ON column, not its own name. 
+ return {col: f"s.{on_map.get(col, col)}" for col in target_field_names} if not isinstance(spec, dict): raise ValueError( f"SET spec must be '*' or a dict, got {type(spec).__name__}." diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index 6060fe2227f5..df9f97a0d76a 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -653,6 +653,100 @@ def test_on_with_renamed_columns(self): self.assertEqual(out['id'], [1, 2]) self.assertEqual(out['age'], [10, 22]) + def test_on_with_renamed_columns_star(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['a', 'b'], + 'age': pa.array([10, 20], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + + source_schema = pa.schema([ + ('uid', pa.int32()), + ('name', pa.string()), + ('age', pa.int32()), + ]) + source = pa.Table.from_pydict( + { + 'uid': pa.array([2, 3], type=pa.int32()), + 'name': ['b2', 'c'], + 'age': pa.array([22, 30], type=pa.int32()), + }, + schema=source_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on={'id': 'uid'}, + when_matched=[WhenMatched(update='*')], + when_not_matched=[WhenNotMatched(insert='*')], + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2, 3]) + self.assertEqual(out['name'], ['a', 'b2', 'c']) + self.assertEqual(out['age'], [10, 22, 30]) + + def test_insert_into_empty_target(self): + target = self._create_table() + + source = pa.Table.from_pydict( + { + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': ['a', 'b', 'c'], + 'age': pa.array([10, 20, 30], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + 
when_not_matched=[WhenNotMatched(insert='*')], + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2, 3]) + self.assertEqual(out['name'], ['a', 'b', 'c']) + self.assertEqual(out['age'], [10, 20, 30]) + + def test_insert_into_empty_target_with_merge_condition(self): + target = self._create_table() + + source = pa.Table.from_pydict( + { + 'id': pa.array([1, 2], type=pa.int32()), + 'name': ['a', 'b'], + 'age': pa.array([10, 20], type=pa.int32()), + }, + schema=self.pa_schema, + ) + + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + merge_condition=lambda r: r['s.age'] > 0, + when_matched=[WhenMatched(update='*')], + when_not_matched=[WhenNotMatched(insert='*')], + ) + + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2]) + self.assertEqual(out['name'], ['a', 'b']) + self.assertEqual(out['age'], [10, 20]) + def test_insert_dict_fills_unspecified_with_null(self): target = self._create_table() self._write( @@ -781,6 +875,27 @@ def test_blob_update_is_rejected(self): _reject_blob_updates(fake_table, {'payload'}) _reject_blob_updates(fake_table, {'id'}) + def test_add_paimon_src_idx_pandas_blocks(self): + import pandas as pd + + from pypaimon.ray.data_evolution_merge_into import ( + PAIMON_SRC_IDX_COL, + _add_paimon_src_idx, + ) + + pdf = pd.DataFrame( + { + 'id': pd.array(list(range(20)), dtype='int32'), + 'name': ['x'] * 20, + 'age': pd.array(list(range(20)), dtype='int64'), + } + ) + ds = ray.data.from_pandas(pdf).repartition(4) + out = _add_paimon_src_idx(ds).to_pandas() + ids = sorted(out[PAIMON_SRC_IDX_COL].tolist()) + self.assertEqual(len(out), 20) + self.assertEqual(ids, list(range(20))) + def test_combined_writes_single_snapshot(self): target = self._create_table() self._write( From 1cb429a32b6397fe23755f9c047e3f7d33474583 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Sat, 30 May 2026 12:03:44 +0800 Subject: [PATCH 26/32] [python] align ray merge_into global-index 
handling with Spark --- .../pypaimon/common/options/core_options.py | 13 ++ .../pypaimon/manifest/index_manifest_entry.py | 19 --- .../pypaimon/manifest/index_manifest_file.py | 122 +++++++++++++++++- .../pypaimon/ray/data_evolution_merge_into.py | 116 ++++++++++------- .../tests/index_manifest_write_test.py | 116 +++++++++++++++++ .../ray_data_evolution_merge_into_test.py | 108 ++++++++++++++++ .../pypaimon/write/commit_message.py | 10 +- .../pypaimon/write/file_store_commit.py | 35 ++++- 8 files changed, 464 insertions(+), 75 deletions(-) create mode 100644 paimon-python/pypaimon/tests/index_manifest_write_test.py diff --git a/paimon-python/pypaimon/common/options/core_options.py b/paimon-python/pypaimon/common/options/core_options.py index 2d140b9539b6..06b9b7e86967 100644 --- a/paimon-python/pypaimon/common/options/core_options.py +++ b/paimon-python/pypaimon/common/options/core_options.py @@ -398,6 +398,16 @@ class CoreOptions: ) ) + GLOBAL_INDEX_COLUMN_UPDATE_ACTION: ConfigOption[str] = ( + ConfigOptions.key("global-index.column-update-action") + .string_type() + .default_value("THROW_ERROR") + .with_description( + "Defines the action to take when an update modifies columns that " + "are covered by a global index. THROW_ERROR or DROP_PARTITION_INDEX." 
+ ) + ) + LOCAL_CACHE_ENABLED: ConfigOption[bool] = ( ConfigOptions.key("local-cache.enabled") .boolean_type() @@ -652,6 +662,9 @@ def row_tracking_enabled(self, default=None): def data_evolution_enabled(self, default=None): return self.options.get(CoreOptions.DATA_EVOLUTION_ENABLED, default) + def global_index_column_update_action(self, default=None): + return self.options.get(CoreOptions.GLOBAL_INDEX_COLUMN_UPDATE_ACTION, default) + def deletion_vectors_enabled(self, default=None): return self.options.get(CoreOptions.DELETION_VECTORS_ENABLED, default) diff --git a/paimon-python/pypaimon/manifest/index_manifest_entry.py b/paimon-python/pypaimon/manifest/index_manifest_entry.py index 7a5e7d1a4f53..9ec5f103dba5 100644 --- a/paimon-python/pypaimon/manifest/index_manifest_entry.py +++ b/paimon-python/pypaimon/manifest/index_manifest_entry.py @@ -41,22 +41,3 @@ def __eq__(self, other): def __hash__(self): return hash((self.kind, tuple(self.partition.values), self.bucket, self.index_file)) - - -INDEX_MANIFEST_ENTRY = { - "type": "record", - "name": "IndexManifestEntry", - "fields": [ - {"name": "_VERSION", "type": "int"}, - {"name": "_KIND", "type": "byte"}, - {"name": "_PARTITION", "type": "bytes"}, - {"name": "_BUCKET", "type": "int"}, - {"name": "_INDEX_TYPE", "type": "string"}, - {"name": "_FILE_NAME", "type": "string"}, - {"name": "_FILE_SIZE", "type": "long"}, - {"name": "_ROW_COUNT", "type": "long"}, - {"name": "_DELETIONS_VECTORS_RANGES", "type": {"type": "array", "elementType": "DeletionVectorMeta"}}, - {"name": "_EXTERNAL_PATH", "type": ["null", "string"]}, - {"name": "_GLOBAL_INDEX", "type": "GlobalIndexMeta"} - ] -} diff --git a/paimon-python/pypaimon/manifest/index_manifest_file.py b/paimon-python/pypaimon/manifest/index_manifest_file.py index 4e65e95e0cb1..5312b0975255 100644 --- a/paimon-python/pypaimon/manifest/index_manifest_file.py +++ b/paimon-python/pypaimon/manifest/index_manifest_file.py @@ -15,6 +15,7 @@ # specific language governing permissions 
and limitations # under the License. +import uuid from io import BytesIO from typing import List, Optional @@ -24,11 +25,60 @@ from pypaimon.index.deletion_vector_meta import DeletionVectorMeta from pypaimon.index.index_file_meta import IndexFileMeta from pypaimon.manifest.index_manifest_entry import IndexManifestEntry -from pypaimon.table.row.generic_row import GenericRowDeserializer +from pypaimon.table.row.generic_row import (GenericRowDeserializer, + GenericRowSerializer) +from pypaimon.utils.file_store_path_factory import FileStorePathFactory + +_DELETION_VECTOR_META_SCHEMA = { + "type": "record", + "name": "DeletionVectorMeta", + "fields": [ + {"name": "f0", "type": "string"}, + {"name": "f1", "type": "long"}, + {"name": "f2", "type": "int"}, + {"name": "_CARDINALITY", "type": ["null", "long"], "default": None}, + ], +} + +_GLOBAL_INDEX_META_SCHEMA = { + "type": "record", + "name": "GlobalIndexMeta", + "fields": [ + {"name": "_ROW_RANGE_START", "type": "long"}, + {"name": "_ROW_RANGE_END", "type": "long"}, + {"name": "_INDEX_FIELD_ID", "type": "int"}, + {"name": "_EXTRA_FIELD_IDS", + "type": ["null", {"type": "array", "items": "int"}], "default": None}, + {"name": "_INDEX_META", "type": ["null", "bytes"], "default": None}, + ], +} + +INDEX_MANIFEST_ENTRY_SCHEMA = { + "type": "record", + "name": "IndexManifestEntry", + "fields": [ + {"name": "_VERSION", "type": "int"}, + {"name": "_KIND", "type": "int"}, + {"name": "_PARTITION", "type": "bytes"}, + {"name": "_BUCKET", "type": "int"}, + {"name": "_INDEX_TYPE", "type": "string"}, + {"name": "_FILE_NAME", "type": "string"}, + {"name": "_FILE_SIZE", "type": "long"}, + {"name": "_ROW_COUNT", "type": "long"}, + {"name": "_DELETIONS_VECTORS_RANGES", + "type": ["null", {"type": "array", "items": _DELETION_VECTOR_META_SCHEMA}], + "default": None}, + {"name": "_EXTERNAL_PATH", "type": ["null", "string"], "default": None}, + {"name": "_GLOBAL_INDEX", + "type": ["null", _GLOBAL_INDEX_META_SCHEMA], "default": None}, + ], 
+} + +_INDEX_ENTRY_VERSION = 1 class IndexManifestFile: - """Index manifest file reader for reading index manifest entries.""" + """Index manifest file reader/writer for index manifest entries.""" DELETION_VECTORS_INDEX = "DELETION_VECTORS" @@ -172,5 +222,73 @@ def _parse_global_index_meta(self, global_index_record) -> Optional[GlobalIndexM row_range_start=global_index_record.get('_ROW_RANGE_START', 0), row_range_end=global_index_record.get('_ROW_RANGE_END', 0), index_field_id=global_index_record.get('_INDEX_FIELD_ID', 0), + extra_field_ids=global_index_record.get('_EXTRA_FIELD_IDS'), index_meta=global_index_record.get('_INDEX_META') ) + + def combine( + self, + previous_name: Optional[str], + deletes: List[IndexManifestEntry], + ) -> Optional[str]: + """Apply DELETE entries to the previous index manifest and write a new one. + + Mirrors Java GlobalIndexCombiner: the stored manifest only holds ADD + entries; deleting means dropping the entries whose index file name + appears in *deletes*. Returns the new manifest file name, or + *previous_name* unchanged when there is nothing to delete. 
+ """ + if not deletes: + return previous_name + previous = self.read(previous_name) if previous_name else [] + delete_names = {e.index_file.file_name for e in deletes} + survivors = [e for e in previous if e.index_file.file_name not in delete_names] + return self.write(survivors) + + def write(self, entries: List[IndexManifestEntry]) -> str: + """Serialize *entries* to a new Avro index manifest, return its name.""" + file_name = f"{FileStorePathFactory.INDEX_MANIFEST_PREFIX}{uuid.uuid4()}" + path = f"{self.manifest_path}/{file_name}" + records = [self._to_avro_record(e) for e in entries] + try: + buffer = BytesIO() + fastavro.writer(buffer, INDEX_MANIFEST_ENTRY_SCHEMA, records) + with self.file_io.new_output_stream(path) as output_stream: + output_stream.write(buffer.getvalue()) + except Exception as e: + self.file_io.delete_quietly(path) + raise RuntimeError(f"Failed to write index manifest file: {e}") from e + return file_name + + def _to_avro_record(self, entry: IndexManifestEntry) -> dict: + index_file = entry.index_file + dv_ranges = None + if index_file.dv_ranges: + dv_ranges = [ + {"f0": dv.data_file_name, "f1": dv.offset, "f2": dv.length, + "_CARDINALITY": dv.cardinality} + for dv in index_file.dv_ranges.values() + ] + global_index = None + if index_file.global_index_meta is not None: + gim = index_file.global_index_meta + global_index = { + "_ROW_RANGE_START": gim.row_range_start, + "_ROW_RANGE_END": gim.row_range_end, + "_INDEX_FIELD_ID": gim.index_field_id, + "_EXTRA_FIELD_IDS": gim.extra_field_ids, + "_INDEX_META": gim.index_meta, + } + return { + "_VERSION": _INDEX_ENTRY_VERSION, + "_KIND": entry.kind, + "_PARTITION": GenericRowSerializer.to_bytes(entry.partition), + "_BUCKET": entry.bucket, + "_INDEX_TYPE": index_file.index_type, + "_FILE_NAME": index_file.file_name, + "_FILE_SIZE": index_file.file_size, + "_ROW_COUNT": index_file.row_count, + "_DELETIONS_VECTORS_RANGES": dv_ranges, + "_EXTERNAL_PATH": index_file.external_path, + "_GLOBAL_INDEX": 
global_index, + } diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index fd28e2744fc5..0afe6e53912e 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -127,6 +127,11 @@ def merge_into( {"commit.strict-mode.last-safe-snapshot": str(base_snapshot.id)} ) + global_index_action = ( + table.options.global_index_column_update_action() + or GLOBAL_INDEX_ACTION_THROW_ERROR + ) + # Row-precise routing needs a stable per-source-row id when merge_condition # may differ between source rows sharing the same ON key. if when_not_matched and merge_condition is not None: @@ -138,9 +143,6 @@ def merge_into( table.table_schema.fields ) - if not_matched_specs and base_snapshot is not None: - _check_global_index_for_insert(table, base_snapshot) - update_ds = None insert_ds = None update_cols_union: List[str] = [] @@ -150,7 +152,6 @@ def merge_into( # LEFT_OUTER join instead of reading and shuffling the target table twice. if matched_specs and not_matched_specs and base_snapshot is not None: update_cols_union = _union_update_cols(matched_specs) - _check_global_index_collision(table, base_snapshot, update_cols_union) update_ds, insert_ds = _build_unified_both( target_identifier=target, source_ds=source_ds, @@ -169,7 +170,6 @@ def merge_into( # Empty target → no rows can match; matched UPDATE is a no-op. 
if matched_specs and base_snapshot is not None: update_cols_union = _union_update_cols(matched_specs) - _check_global_index_collision(table, base_snapshot, update_cols_union) update_ds = _build_matched_update_ds( target_identifier=target, source_ds=source_ds, @@ -212,23 +212,30 @@ def merge_into( target_empty=base_snapshot is None, ) - all_msgs: list = [] + update_msgs: list = [] if update_ds is not None: - all_msgs.extend( - _distributed_update_apply( - update_ds, - table, - update_cols_union, - ray_remote_args=ray_remote_args, - allow_multiple_matches=allow_multiple_matches, - ) + update_msgs = _distributed_update_apply( + update_ds, + table, + update_cols_union, + ray_remote_args=ray_remote_args, + allow_multiple_matches=allow_multiple_matches, ) + + all_msgs: list = list(update_msgs) if insert_ds is not None: all_msgs.extend( _distributed_write_collect_msgs( insert_ds, table, ray_remote_args=ray_remote_args, concurrency=concurrency ) ) + # Mirror Spark's checkUpdateResult: scope the global-index action to the + # partitions the update actually wrote and the updated indexed columns. + all_msgs.extend( + _apply_global_index_update_action( + table, base_snapshot, update_cols_union, update_msgs, global_index_action + ) + ) if all_msgs: wb = table.new_batch_write_builder() tc = wb.new_commit() @@ -506,6 +513,8 @@ def _apply_group(group: pa.Table) -> pa.Table: PAIMON_SRC_IDX_COL = "_paimon_src_idx" MATCHED_SRC_IDX_MARKER = "_paimon_matched_src_idx" +GLOBAL_INDEX_ACTION_THROW_ERROR = "THROW_ERROR" +GLOBAL_INDEX_ACTION_DROP_PARTITION_INDEX = "DROP_PARTITION_INDEX" # Min rows per hash partition for the anti-join; keeps partitions non-empty # (ray's join crashes on empty hash partitions). 
_ANTI_JOIN_ROWS_PER_PARTITION = 8 @@ -964,46 +973,65 @@ def on_write_complete(self, write_result): return sink.collected -def _check_global_index_collision( - table, snapshot, update_cols: Sequence[str] -) -> None: +def _apply_global_index_update_action( + table, snapshot, update_cols: Sequence[str], update_msgs, action: str +) -> list: + """Handle updates touching globally-indexed columns, mirroring Spark's + ``checkUpdateResult``. + + Scoped exactly like Spark: only index entries whose partition was written + by the update *and* whose indexed column is among the updated columns are + affected. THROW_ERROR (default) raises; DROP_PARTITION_INDEX drops those + entries (returned as index-delete commit messages, rebuild afterwards). + Like Spark, the INSERT path is left untouched. + """ + if snapshot is None or not update_cols or not update_msgs: + return [] entries = _scan_global_index_entries(table, snapshot) if not entries: - return + return [] field_by_id = {f.id: f.name for f in table.fields} update_set = set(update_cols) + affected_partitions = {tuple(m.partition) for m in update_msgs} + affected = [ + e for e in entries + if field_by_id.get(e.index_file.global_index_meta.index_field_id) in update_set + and tuple(e.partition.values) in affected_partitions + ] + if not affected: + return [] + if action == GLOBAL_INDEX_ACTION_DROP_PARTITION_INDEX: + return _build_index_delete_msgs(affected) conflicted = sorted( - { - field_by_id.get(e.index_file.global_index_meta.index_field_id) - for e in entries - } - & update_set - ) - if conflicted: - raise NotImplementedError( - f"MERGE INTO would update columns {conflicted} that have a global " - f"index; not supported (refusing to leave the index stale)." 
- ) - - -def _check_global_index_for_insert(table, snapshot) -> None: - entries = _scan_global_index_entries(table, snapshot) - if not entries: - return - field_by_id = {f.id: f.name for f in table.fields} - indexed = sorted( - { - field_by_id.get(e.index_file.global_index_meta.index_field_id) - for e in entries - } + {field_by_id.get(e.index_file.global_index_meta.index_field_id) for e in affected} ) raise NotImplementedError( - f"MERGE INTO INSERT is not supported on tables with global-index " - f"columns {indexed} (btree/lumina/tantivy). Inserted rows would not " - f"appear in the index. Drop the global index or omit when_not_matched." + f"MERGE INTO would update columns {conflicted} that have a global " + f"index; not supported (refusing to leave the index stale). Set " + f"'global-index.column-update-action' = 'DROP_PARTITION_INDEX' to drop " + f"the affected index instead." ) +def _build_index_delete_msgs(entries) -> list: + """Group scanned index entries by partition into index-delete messages.""" + from pypaimon.manifest.index_manifest_entry import IndexManifestEntry + from pypaimon.write.commit_message import CommitMessage + + by_partition: Dict[tuple, list] = {} + for e in entries: + key = tuple(e.partition.values) + by_partition.setdefault(key, []).append( + IndexManifestEntry( + kind=1, partition=e.partition, bucket=e.bucket, index_file=e.index_file + ) + ) + return [ + CommitMessage(partition=key, bucket=0, new_files=[], index_files=dels) + for key, dels in by_partition.items() + ] + + def _scan_global_index_entries(table, snapshot): from pypaimon.index.index_file_handler import IndexFileHandler diff --git a/paimon-python/pypaimon/tests/index_manifest_write_test.py b/paimon-python/pypaimon/tests/index_manifest_write_test.py new file mode 100644 index 000000000000..7107fe2fa59e --- /dev/null +++ b/paimon-python/pypaimon/tests/index_manifest_write_test.py @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more 
contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import shutil +import tempfile +import unittest +import uuid + +import pyarrow as pa + +from pypaimon import CatalogFactory, Schema +from pypaimon.globalindex.global_index_meta import GlobalIndexMeta +from pypaimon.index.index_file_meta import IndexFileMeta +from pypaimon.manifest.index_manifest_entry import IndexManifestEntry +from pypaimon.manifest.index_manifest_file import IndexManifestFile +from pypaimon.table.row.generic_row import GenericRow + + +class IndexManifestWriteTest(unittest.TestCase): + + pa_schema = pa.schema([ + ('id', pa.int32()), + ('vec', pa.string()), + ]) + + @classmethod + def setUpClass(cls): + cls.tempdir = tempfile.mkdtemp() + cls.warehouse = os.path.join(cls.tempdir, 'warehouse') + cls.catalog = CatalogFactory.create({'warehouse': cls.warehouse}) + cls.catalog.create_database('default', True) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tempdir, ignore_errors=True) + + def _table(self): + name = f'default.idx_{uuid.uuid4().hex[:8]}' + s = Schema.from_pyarrow_schema(self.pa_schema) + self.catalog.create_table(name, s, False) + return self.catalog.get_table(name) + + def _entry(self, file_name, field_id, meta=b'm'): + partition = GenericRow([], []) + index_file = IndexFileMeta( + index_type='BTREE', 
+ file_name=file_name, + file_size=123, + row_count=10, + global_index_meta=GlobalIndexMeta( + row_range_start=0, + row_range_end=10, + index_field_id=field_id, + extra_field_ids=[field_id + 1], + index_meta=meta, + ), + ) + return IndexManifestEntry(kind=0, partition=partition, bucket=0, index_file=index_file) + + def test_write_read_roundtrip(self): + imf = IndexManifestFile(self._table()) + name = imf.write([self._entry('idx-a', 1), self._entry('idx-b', 2)]) + out = imf.read(name) + self.assertEqual(2, len(out)) + by_name = {e.index_file.file_name: e for e in out} + a = by_name['idx-a'] + self.assertEqual('BTREE', a.index_file.index_type) + self.assertEqual(123, a.index_file.file_size) + self.assertEqual(10, a.index_file.row_count) + self.assertEqual(0, a.kind) + gim = a.index_file.global_index_meta + self.assertEqual(1, gim.index_field_id) + self.assertEqual(0, gim.row_range_start) + self.assertEqual(10, gim.row_range_end) + self.assertEqual([2], gim.extra_field_ids) + self.assertEqual(b'm', bytes(gim.index_meta)) + + def test_combine_drops_named_files(self): + imf = IndexManifestFile(self._table()) + previous = imf.write([self._entry('idx-a', 1), self._entry('idx-b', 2)]) + deletes = [self._entry('idx-a', 1)] + new_name = imf.combine(previous, deletes) + self.assertNotEqual(previous, new_name) + survivors = {e.index_file.file_name for e in imf.read(new_name)} + self.assertEqual({'idx-b'}, survivors) + + def test_combine_unknown_delete_is_noop_on_content(self): + imf = IndexManifestFile(self._table()) + previous = imf.write([self._entry('idx-a', 1)]) + new_name = imf.combine(previous, [self._entry('idx-zzz', 9)]) + survivors = {e.index_file.file_name for e in imf.read(new_name)} + self.assertEqual({'idx-a'}, survivors) + + def test_combine_empty_deletes_returns_previous(self): + imf = IndexManifestFile(self._table()) + previous = imf.write([self._entry('idx-a', 1)]) + self.assertEqual(previous, imf.combine(previous, [])) + + +if __name__ == '__main__': + 
unittest.main() diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index df9f97a0d76a..d54667e24bb7 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -1137,5 +1137,113 @@ def test_merge_condition_routes_per_source_row(self): ) +class RayMergeIntoGlobalIndexGateTest(unittest.TestCase): + + pa_schema = pa.schema([ + ('id', pa.int32()), + ('name', pa.string()), + ('age', pa.int32()), + ]) + + de_options = { + 'row-tracking.enabled': 'true', + 'data-evolution.enabled': 'true', + } + + @classmethod + def setUpClass(cls): + cls.tempdir = tempfile.mkdtemp() + cls.warehouse = os.path.join(cls.tempdir, 'warehouse') + cls.catalog = CatalogFactory.create({'warehouse': cls.warehouse}) + cls.catalog.create_database('default', True) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tempdir, ignore_errors=True) + + def _table(self): + name = f'default.gidx_{uuid.uuid4().hex[:8]}' + s = Schema.from_pyarrow_schema(self.pa_schema, options=self.de_options) + self.catalog.create_table(name, s, False) + return self.catalog.get_table(name) + + def _entry(self, table, column, partition_values=()): + from pypaimon.globalindex.global_index_meta import GlobalIndexMeta + from pypaimon.index.index_file_meta import IndexFileMeta + from pypaimon.manifest.index_manifest_entry import IndexManifestEntry + from pypaimon.table.row.generic_row import GenericRow + + field_id = next(f.id for f in table.fields if f.name == column) + index_file = IndexFileMeta( + index_type='BTREE', file_name=f'idx-{column}', file_size=1, row_count=1, + global_index_meta=GlobalIndexMeta( + row_range_start=0, row_range_end=1, index_field_id=field_id, + ), + ) + return IndexManifestEntry( + kind=0, partition=GenericRow(list(partition_values), []), + bucket=0, index_file=index_file, + ) + + def 
_update_msg(self, partition=()): + from pypaimon.write.commit_message import CommitMessage + return CommitMessage(partition=partition, bucket=0, new_files=[]) + + def test_update_throw_error_raises(self): + from unittest.mock import patch + from pypaimon.ray import data_evolution_merge_into as m + + table = self._table() + with patch.object(m, '_scan_global_index_entries', + return_value=[self._entry(table, 'age')]): + with self.assertRaises(NotImplementedError): + m._apply_global_index_update_action( + table, object(), ['age'], [self._update_msg()], + m.GLOBAL_INDEX_ACTION_THROW_ERROR, + ) + + def test_update_drop_returns_delete_msgs(self): + from unittest.mock import patch + from pypaimon.ray import data_evolution_merge_into as m + + table = self._table() + with patch.object(m, '_scan_global_index_entries', + return_value=[self._entry(table, 'age')]): + msgs = m._apply_global_index_update_action( + table, object(), ['age'], [self._update_msg()], + m.GLOBAL_INDEX_ACTION_DROP_PARTITION_INDEX, + ) + self.assertEqual(1, len(msgs)) + self.assertFalse(msgs[0].is_empty()) + self.assertEqual('idx-age', msgs[0].index_files[0].index_file.file_name) + self.assertEqual(1, msgs[0].index_files[0].kind) + + def test_update_unaffected_column_is_noop(self): + from unittest.mock import patch + from pypaimon.ray import data_evolution_merge_into as m + + table = self._table() + with patch.object(m, '_scan_global_index_entries', + return_value=[self._entry(table, 'age')]): + msgs = m._apply_global_index_update_action( + table, object(), ['name'], [self._update_msg()], + m.GLOBAL_INDEX_ACTION_DROP_PARTITION_INDEX, + ) + self.assertEqual([], msgs) + + def test_update_untouched_partition_is_noop(self): + from unittest.mock import patch + from pypaimon.ray import data_evolution_merge_into as m + + table = self._table() + entry = self._entry(table, 'age', partition_values=('EU',)) + with patch.object(m, '_scan_global_index_entries', return_value=[entry]): + msgs = 
m._apply_global_index_update_action( + table, object(), ['age'], [self._update_msg(partition=('US',))], + m.GLOBAL_INDEX_ACTION_DROP_PARTITION_INDEX, + ) + self.assertEqual([], msgs) + + if __name__ == '__main__': unittest.main() diff --git a/paimon-python/pypaimon/write/commit_message.py b/paimon-python/pypaimon/write/commit_message.py index 7bce06d8ab13..db6d20ff1fae 100644 --- a/paimon-python/pypaimon/write/commit_message.py +++ b/paimon-python/pypaimon/write/commit_message.py @@ -15,11 +15,14 @@ # specific language governing permissions and limitations # under the License. -from dataclasses import dataclass -from typing import List, Tuple, Optional +from dataclasses import dataclass, field +from typing import List, Tuple, Optional, TYPE_CHECKING from pypaimon.manifest.schema.data_file_meta import DataFileMeta +if TYPE_CHECKING: + from pypaimon.manifest.index_manifest_entry import IndexManifestEntry + @dataclass class CommitMessage: @@ -27,6 +30,7 @@ class CommitMessage: bucket: int new_files: List[DataFileMeta] check_from_snapshot: Optional[int] = -1 + index_files: List['IndexManifestEntry'] = field(default_factory=list) def is_empty(self): - return not self.new_files + return not self.new_files and not self.index_files diff --git a/paimon-python/pypaimon/write/file_store_commit.py b/paimon-python/pypaimon/write/file_store_commit.py index 92fd19bf0293..656c80801c2e 100644 --- a/paimon-python/pypaimon/write/file_store_commit.py +++ b/paimon-python/pypaimon/write/file_store_commit.py @@ -144,6 +144,10 @@ def commit(self, commit_messages: List[CommitMessage], commit_identifier: int): logger.info("Finished collecting changes, including: %d entries", len(commit_entries)) + index_deletes = [] + for msg in commit_messages: + index_deletes.extend(msg.index_files) + commit_kind = "APPEND" detect_conflicts = False allow_rollback = False @@ -159,7 +163,8 @@ def commit(self, commit_messages: List[CommitMessage], commit_identifier: int): commit_identifier=commit_identifier, 
commit_entries_plan=lambda snapshot: commit_entries, detect_conflicts=detect_conflicts, - allow_rollback=allow_rollback) + allow_rollback=allow_rollback, + index_deletes=index_deletes) def overwrite(self, overwrite_partition, commit_messages: List[CommitMessage], commit_identifier: int): """Commit the given commit messages in overwrite mode.""" @@ -263,7 +268,7 @@ def _enforce_strict_mode_last_safe_snapshot(self) -> None: ) def _try_commit(self, commit_kind, commit_identifier, commit_entries_plan, - detect_conflicts=False, allow_rollback=False): + detect_conflicts=False, allow_rollback=False, index_deletes=None): retry_count = 0 retry_result = None @@ -274,7 +279,7 @@ def _try_commit(self, commit_kind, commit_identifier, commit_entries_plan, # No entries to commit (e.g. drop_partitions with no matching data): skip commit # to avoid creating manifest/snapshot with empty partition_stats (causes read errors). - if not commit_entries: + if not commit_entries and not index_deletes: break result = self._try_commit_once( @@ -285,6 +290,7 @@ def _try_commit(self, commit_kind, commit_identifier, commit_entries_plan, latest_snapshot=latest_snapshot, detect_conflicts=detect_conflicts, allow_rollback=allow_rollback, + index_deletes=index_deletes, ) if result.is_success(): @@ -336,7 +342,8 @@ def _try_commit_once(self, retry_result: Optional[RetryResult], commit_kind: str commit_entries: List[ManifestEntry], commit_identifier: int, latest_snapshot: Optional[Snapshot], detect_conflicts: bool = False, - allow_rollback: bool = False) -> CommitResult: + allow_rollback: bool = False, + index_deletes=None) -> CommitResult: start_millis = int(time.time() * 1000) if self._is_duplicate_commit(retry_result, latest_snapshot, commit_identifier, commit_kind): return SuccessResult() @@ -347,6 +354,7 @@ def _try_commit_once(self, retry_result: Optional[RetryResult], commit_kind: str # process new_manifest new_manifest_file = f"manifest-{str(uuid.uuid4())}-0" + new_index_manifest = None # 
process snapshot new_snapshot_id = latest_snapshot.id + 1 if latest_snapshot else 1 @@ -404,6 +412,13 @@ def _try_commit_once(self, retry_result: Optional[RetryResult], commit_kind: str index_manifest = None if latest_snapshot and commit_kind == "APPEND": index_manifest = latest_snapshot.index_manifest + if index_deletes: + from pypaimon.manifest.index_manifest_file import IndexManifestFile + previous_index_manifest = index_manifest + index_manifest = IndexManifestFile(self.table).combine( + previous_index_manifest, index_deletes) + if index_manifest != previous_index_manifest: + new_index_manifest = index_manifest snapshot_data = Snapshot( version=3, @@ -423,7 +438,8 @@ def _try_commit_once(self, retry_result: Optional[RetryResult], commit_kind: str # Generate partition statistics for the commit statistics = self._generate_partition_statistics(commit_entries) except Exception as e: - self._cleanup_preparation_failure(delta_manifest_list, base_manifest_list) + self._cleanup_preparation_failure(delta_manifest_list, base_manifest_list, + new_index_manifest) logger.warning(f"Exception occurs when preparing snapshot: {e}", exc_info=True) raise RuntimeError(f"Failed to prepare snapshot: {e}") @@ -443,7 +459,8 @@ def _try_commit_once(self, retry_result: Optional[RetryResult], commit_kind: str commit_kind, commit_time_s, ) - self._cleanup_preparation_failure(delta_manifest_list, base_manifest_list) + self._cleanup_preparation_failure(delta_manifest_list, base_manifest_list, + new_index_manifest) return RetryResult(latest_snapshot, None) except Exception as e: # Commit exception, not sure about the situation and should not clean up the files @@ -624,10 +641,14 @@ def _commit_retry_wait(self, retry_count: int): def _cleanup_preparation_failure(self, delta_manifest_list: Optional[str], - base_manifest_list: Optional[str]): + base_manifest_list: Optional[str], + index_manifest: Optional[str] = None): try: manifest_path = self.manifest_list_manager.manifest_path + if 
index_manifest: + self.table.file_io.delete_quietly(f"{manifest_path}/{index_manifest}") + if delta_manifest_list: manifest_files = self.manifest_list_manager.read(delta_manifest_list) for manifest_meta in manifest_files: From d67579b12e378b9fd141aa3d40c961480b23ceec Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Sat, 30 May 2026 14:37:58 +0800 Subject: [PATCH 27/32] [python] drop redundant merge_into strict-mode guard, rely on row-id conflict checks --- .../pypaimon/ray/data_evolution_merge_into.py | 6 --- .../ray_data_evolution_merge_into_test.py | 37 ------------------- .../pypaimon/write/file_store_commit.py | 20 ---------- 3 files changed, 63 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 0afe6e53912e..2dd47c7dd616 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -120,12 +120,6 @@ def merge_into( _validate_source_on_cols(source_ds, source_on_cols) base_snapshot = table.snapshot_manager().get_latest_snapshot() - if base_snapshot is not None: - # Pin the snapshot so the final commit aborts if another writer - # commits between our read and our commit. 
- table = table.copy( - {"commit.strict-mode.last-safe-snapshot": str(base_snapshot.id)} - ) global_index_action = ( table.options.global_index_column_update_action() diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index d54667e24bb7..3d0667046171 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -1060,43 +1060,6 @@ def test_duplicate_identical_source_rows_route_separately(self): [(1, 't1', 100), (2, 'dup', 5), (2, 'dup', 5)], ) - def test_strict_mode_rejects_when_snapshot_advances(self): - target = self._create_table() - self._write( - target, - pa.Table.from_pydict( - { - 'id': pa.array([1], type=pa.int32()), - 'name': ['x'], - 'age': pa.array([1], type=pa.int32()), - }, - schema=self.pa_schema, - ), - ) - current_id = self._snapshot_id(target) - - table = self.catalog.get_table(target).copy( - {"commit.strict-mode.last-safe-snapshot": str(current_id - 1)} - ) - wb = table.new_batch_write_builder() - tw = wb.new_write() - tw.write_arrow( - pa.Table.from_pydict( - { - 'id': pa.array([2], type=pa.int32()), - 'name': ['y'], - 'age': pa.array([2], type=pa.int32()), - }, - schema=self.pa_schema, - ) - ) - msgs = tw.prepare_commit() - tw.close() - - with self.assertRaises(RuntimeError) as ctx: - wb.new_commit().commit(msgs) - self.assertIn("strict-mode", str(ctx.exception).lower()) - def test_merge_condition_routes_per_source_row(self): target = self._create_table() self._write( diff --git a/paimon-python/pypaimon/write/file_store_commit.py b/paimon-python/pypaimon/write/file_store_commit.py index 656c80801c2e..93f0ec82a592 100644 --- a/paimon-python/pypaimon/write/file_store_commit.py +++ b/paimon-python/pypaimon/write/file_store_commit.py @@ -117,8 +117,6 @@ def commit(self, commit_messages: List[CommitMessage], commit_identifier: int): if not commit_messages: 
return - self._enforce_strict_mode_last_safe_snapshot() - # Extract the minimum check_from_snapshot from commit messages valid_snapshots = [msg.check_from_snapshot for msg in commit_messages if msg.check_from_snapshot != -1] @@ -168,7 +166,6 @@ def commit(self, commit_messages: List[CommitMessage], commit_identifier: int): def overwrite(self, overwrite_partition, commit_messages: List[CommitMessage], commit_identifier: int): """Commit the given commit messages in overwrite mode.""" - self._enforce_strict_mode_last_safe_snapshot() logger.info( "Ready to overwrite to table %s, number of commit messages: %d", self.table.identifier, @@ -241,7 +238,6 @@ def drop_partitions(self, partitions: List[Dict[str, str]], commit_identifier: i def truncate_table(self, commit_identifier: int) -> None: """Truncate the entire table, deleting all data.""" - self._enforce_strict_mode_last_safe_snapshot() self._try_commit( commit_kind="OVERWRITE", commit_identifier=commit_identifier, @@ -251,22 +247,6 @@ def truncate_table(self, commit_identifier: int) -> None: allow_rollback=False, ) - def _enforce_strict_mode_last_safe_snapshot(self) -> None: - raw = self.table.options.options.data.get( - "commit.strict-mode.last-safe-snapshot" - ) - if raw is None or raw == "": - return - safe_id = int(raw) - current = self.snapshot_manager.get_latest_snapshot() - current_id = current.id if current is not None else -1 - if current_id > safe_id: - raise RuntimeError( - f"Strict-mode commit aborted: latest snapshot {current_id} is " - f"newer than the recorded last-safe-snapshot {safe_id}; " - f"another writer has committed since this write was planned." 
- ) - def _try_commit(self, commit_kind, commit_identifier, commit_entries_plan, detect_conflicts=False, allow_rollback=False, index_deletes=None): From fc44a9def39762c0f7580ce78093a74108a26ce6 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Sat, 30 May 2026 14:56:41 +0800 Subject: [PATCH 28/32] [python] drive merge_into update-write parallelism by num_partitions, not a fixed 200 cap --- .../pypaimon/ray/data_evolution_merge_into.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 2dd47c7dd616..7363b96d19cf 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -212,6 +212,7 @@ def merge_into( update_ds, table, update_cols_union, + num_partitions=num_partitions, ray_remote_args=ray_remote_args, allow_multiple_matches=allow_multiple_matches, ) @@ -386,6 +387,7 @@ def _distributed_update_apply( table, write_update_cols: Sequence[str], *, + num_partitions: int, ray_remote_args: Optional[Dict[str, Any]] = None, allow_multiple_matches: bool = False, ) -> list: @@ -490,10 +492,11 @@ def _apply_group(group: pa.Table) -> pa.Table: msgs = worker.update_columns(for_update, list(captured_cols)) return pa.Table.from_pydict({"msgs_blob": [pickle.dumps(msgs)]}) - # One group per target data file (distinct _FIRST_ROW_ID). Size the shuffle - # to the real group count instead of ray's default 200, which otherwise - # spawns hundreds of empty reduce tasks on small/medium merges. - group_partitions = max(1, min(len(captured_sorted), _MAX_GROUP_PARTITIONS)) + # One group per target data file (distinct _FIRST_ROW_ID). Drive the write + # shuffle with the same num_partitions knob as the join (Spark's single + # shuffle.partitions), bounded by the file count so small merges don't spawn + # empty reduce tasks and large ones scale past a fixed cap. 
+ group_partitions = max(1, min(len(captured_sorted), num_partitions)) msgs_ds = with_frid.groupby(frid_col, num_partitions=group_partitions).map_groups( _apply_group, batch_format="pyarrow" ) @@ -513,10 +516,6 @@ def _apply_group(group: pa.Table) -> pa.Table: # (ray's join crashes on empty hash partitions). _ANTI_JOIN_ROWS_PER_PARTITION = 8 -# Upper bound on the update groupby shuffle, matching ray's default hash-shuffle -# parallelism so large tables keep today's behavior while small ones shrink. -_MAX_GROUP_PARTITIONS = 200 - def _assign_src_idx_block(block, start): import numpy as np From d8385677fdd75ae81115047cddb0014a671a41d6 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Sat, 30 May 2026 15:06:00 +0800 Subject: [PATCH 29/32] [python] project only needed target columns into merge_into update join --- .../pypaimon/ray/data_evolution_merge_into.py | 7 +++- .../ray_data_evolution_merge_into_test.py | 35 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 7363b96d19cf..4390a8e8e237 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -1077,13 +1077,18 @@ def _needed_target_cols( update_cols: Sequence[str], all_target_cols: Sequence[str], ) -> list: - needed = set(on) | set(update_cols) + # Target needs only: join keys, t.col refs, and cols that may fall back + # (not set by every clause). Cols all clauses set from source aren't read. 
+ needed = set(on) + set_by_all = set(update_cols) for clause in clauses: for value in clause.spec.values(): if callable(value): return list(all_target_cols) if isinstance(value, str) and value.startswith("t."): needed.add(value[2:]) + set_by_all &= set(clause.spec.keys()) + needed |= set(update_cols) - set_by_all return [c for c in all_target_cols if c in needed] diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index 3d0667046171..be2519eb3718 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -300,6 +300,41 @@ def test_matched_multiple_clauses_first_match_wins(self): self.assertEqual(out['id'], [1, 2, 3]) self.assertEqual(out['age'], [1, 999, 999]) + def test_matched_partial_clause_falls_back_to_target(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['old'], + 'age': pa.array([42], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + source = pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['new'], + 'age': pa.array([99], type=pa.int32()), + }, + schema=self.pa_schema, + ) + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched=[ + WhenMatched(update={'name': 's.name'}), + WhenMatched(update={'age': 's.age'}), + ], + ) + out = self._read_sorted(target) + self.assertEqual(out['name'], ['new']) + self.assertEqual(out['age'], [42]) + def test_not_matched_insert_appends_unmatched(self): target = self._create_table() self._write( From bdf004947b6fad34033462edd7e465d20665e658 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Sat, 30 May 2026 15:44:38 +0800 Subject: [PATCH 30/32] [python] add merge_into condition_cols for precise target projection, skip blob by default --- 
.../pypaimon/ray/data_evolution_merge_into.py | 79 ++++++++++--- .../ray_data_evolution_merge_into_test.py | 109 ++++++++++++++++++ 2 files changed, 170 insertions(+), 18 deletions(-) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 4390a8e8e237..8c6c9738b1f0 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -69,6 +69,7 @@ def merge_into( ray_remote_args: Optional[Dict[str, Any]] = None, concurrency: Optional[int] = None, allow_multiple_matches: bool = False, + condition_cols: Optional[List[str]] = None, ) -> None: _require_ray_join() num_partitions = _resolve_num_partitions(num_partitions) @@ -95,6 +96,7 @@ def merge_into( ) target_field_names = list(table.field_names) + blob_cols = _blob_cols(table) on_map = dict(zip(target_on_cols, source_on_cols)) matched_specs = [ _NormalizedClause( @@ -159,6 +161,8 @@ def merge_into( update_cols=update_cols_union, catalog_options=catalog_options, num_partitions=num_partitions, + condition_cols=condition_cols, + blob_cols=blob_cols, ) else: # Empty target → no rows can match; matched UPDATE is a no-op. 
@@ -176,6 +180,8 @@ def merge_into( update_cols=update_cols_union, catalog_options=catalog_options, num_partitions=num_partitions, + condition_cols=condition_cols, + blob_cols=blob_cols, ) if not_matched_specs: @@ -263,20 +269,17 @@ def _build_matched_update_ds( update_cols: Sequence[str], catalog_options: Dict[str, str], num_partitions: int, + condition_cols: Optional[Sequence[str]], + blob_cols: set, ): from pypaimon.ray.ray_paimon import read_paimon from pypaimon.table.special_fields import SpecialFields row_id_name = SpecialFields.ROW_ID.name - needs_full = merge_condition is not None or any( - c.condition is not None for c in clauses + needed_cols = _resolve_target_projection( + clauses, merge_condition, target_on, update_cols, + target_field_names, condition_cols, blob_cols, ) - if needs_full: - needed_cols = list(target_field_names) - else: - needed_cols = _needed_target_cols( - clauses, target_on, update_cols, target_field_names - ) projection = [row_id_name] + [c for c in needed_cols if c != row_id_name] target_ds = read_paimon(target_identifier, catalog_options, projection=projection) @@ -335,6 +338,16 @@ def _transform(batch: pa.Table) -> pa.Table: return joined.map_batches(_transform, batch_format="pyarrow") +def _eval_cond(cond, combined): + try: + return cond(combined) + except KeyError as e: + raise ValueError( + f"merge condition referenced column {e} that was not read; " + f"add it to condition_cols." 
+ ) from e + + def _apply_matched_transform( batch: pa.Table, clauses: List[_NormalizedClause], @@ -355,10 +368,10 @@ def _apply_matched_transform( if s_key not in s_row and t_key in t_row: s_row[s_key] = t_row[t_key] combined = _prefixed(s_row, t_row) - if merge_condition is not None and not merge_condition(combined): + if merge_condition is not None and not _eval_cond(merge_condition, combined): continue for clause in clauses: - if clause.condition is not None and not clause.condition(combined): + if clause.condition is not None and not _eval_cond(clause.condition, combined): continue new_values = _apply_set(clause.spec, s_row, t_row, field_names) out_row_ids.append(t_row[row_id_name]) @@ -810,6 +823,8 @@ def _build_unified_both( update_cols: Sequence[str], catalog_options: Dict[str, str], num_partitions: int, + condition_cols: Optional[Sequence[str]], + blob_cols: set, ): import pyarrow.compute as pc @@ -819,15 +834,10 @@ def _build_unified_both( row_id_name = SpecialFields.ROW_ID.name - needs_full = merge_condition is not None or any( - c.condition is not None for c in matched_clauses + needed_cols = _resolve_target_projection( + matched_clauses, merge_condition, target_on, update_cols, + target_field_names, condition_cols, blob_cols, ) - if needs_full: - needed_cols = list(target_field_names) - else: - needed_cols = _needed_target_cols( - matched_clauses, target_on, update_cols, target_field_names - ) projection = [row_id_name] + [c for c in needed_cols if c != row_id_name] target_ds = read_paimon(target_identifier, catalog_options, projection=projection) target_renamed = target_ds.rename_columns( @@ -1092,6 +1102,39 @@ def _needed_target_cols( return [c for c in all_target_cols if c in needed] +def _blob_cols(table) -> set: + return { + f.name + for f in table.table_schema.fields + if getattr(f.type, "type", None) == "BLOB" + } + + +def _resolve_target_projection( + clauses: List[_NormalizedClause], + merge_condition: Optional[Condition], + target_on: 
Sequence[str], + update_cols: Sequence[str], + target_field_names: Sequence[str], + condition_cols: Optional[Sequence[str]], + blob_cols: set, +) -> list: + # When the caller declares condition_cols we read exactly what SET and the + # conditions need (Spark-like precision). Otherwise a condition forces the + # conservative all-columns read, but blob is never read: it can't be an + # update target nor a meaningful predicate input. + has_condition = merge_condition is not None or any( + c.condition is not None for c in clauses + ) + if condition_cols is not None: + needed = set(_needed_target_cols(clauses, target_on, update_cols, target_field_names)) + needed |= {c for c in condition_cols if c in target_field_names} + return [c for c in target_field_names if c in needed] + if has_condition: + return [c for c in target_field_names if c not in blob_cols] + return _needed_target_cols(clauses, target_on, update_cols, target_field_names) + + def _normalize_set_spec( spec: SetSpec, target_field_names: Sequence[str], diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index be2519eb3718..d515c5277a15 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -259,6 +259,81 @@ def test_matched_update_with_condition(self): self.assertEqual(out['id'], [1, 2, 3]) self.assertEqual(out['age'], [10, 100, 50]) + def test_condition_cols_declared_precise(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': ['a', 'b', 'c'], + 'age': pa.array([10, 20, 30], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + source = pa.Table.from_pydict( + { + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': ['a', 'b', 'c'], + 'age': pa.array([5, 100, 50], type=pa.int32()), + }, + schema=self.pa_schema, + 
) + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched=[ + WhenMatched( + update={'age': 's.age'}, + condition=lambda r: r['s.age'] > r['t.age'], + ), + ], + condition_cols=['age'], + ) + out = self._read_sorted(target) + self.assertEqual(out['id'], [1, 2, 3]) + self.assertEqual(out['age'], [10, 100, 50]) + + def test_condition_cols_underdeclared_raises(self): + target = self._create_table() + self._write( + target, + pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['a'], + 'age': pa.array([10], type=pa.int32()), + }, + schema=self.pa_schema, + ), + ) + source = pa.Table.from_pydict( + { + 'id': pa.array([1], type=pa.int32()), + 'name': ['a'], + 'age': pa.array([99], type=pa.int32()), + }, + schema=self.pa_schema, + ) + with self.assertRaises(Exception) as ctx: + merge_into( + target=target, + source=source, + catalog_options=self.catalog_options, + on=['id'], + when_matched=[ + WhenMatched( + update={'name': 's.name'}, + condition=lambda r: r['s.age'] > r['t.age'], + ), + ], + condition_cols=[], + ) + self.assertIn('condition_cols', str(ctx.exception)) + def test_matched_multiple_clauses_first_match_wins(self): target = self._create_table() self._write( @@ -1243,5 +1318,39 @@ def test_update_untouched_partition_is_noop(self): self.assertEqual([], msgs) +class TargetProjectionTest(unittest.TestCase): + + def _clause(self, spec, condition=None): + from pypaimon.ray import data_evolution_merge_into as m + return m._NormalizedClause(spec=spec, condition=condition) + + def test_unconditional_set_excludes_target_update_col(self): + from pypaimon.ray import data_evolution_merge_into as m + cols = m._resolve_target_projection( + [self._clause({'feature': 's.feature'})], + None, ['id'], ['feature'], + ['id', 'feature', 'image'], None, {'image'}, + ) + self.assertEqual(['id'], cols) + + def test_condition_without_decl_excludes_blob_only(self): + from pypaimon.ray import 
data_evolution_merge_into as m + cols = m._resolve_target_projection( + [self._clause({'feature': 's.feature'})], + lambda r: True, ['id'], ['feature'], + ['id', 'age', 'feature', 'image'], None, {'image'}, + ) + self.assertEqual(['id', 'age', 'feature'], cols) + + def test_condition_cols_declared_is_precise(self): + from pypaimon.ray import data_evolution_merge_into as m + cols = m._resolve_target_projection( + [self._clause({'feature': 's.feature'})], + lambda r: True, ['id'], ['feature'], + ['id', 'age', 'feature', 'image'], ['age'], {'image'}, + ) + self.assertEqual(['id', 'age'], cols) + + if __name__ == '__main__': unittest.main() From 92b4590e5229f34bea395678831a55160ec4cb96 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Sat, 30 May 2026 16:48:10 +0800 Subject: [PATCH 31/32] [python] merge_into: string-expression conditions, drop merge_condition, return metrics --- paimon-python/pypaimon/ray/condition_expr.py | 136 +++++++ .../pypaimon/ray/data_evolution_merge_into.py | 314 +++------------- .../ray_data_evolution_merge_into_test.py | 355 ++---------------- 3 files changed, 217 insertions(+), 588 deletions(-) create mode 100644 paimon-python/pypaimon/ray/condition_expr.py diff --git a/paimon-python/pypaimon/ray/condition_expr.py b/paimon-python/pypaimon/ray/condition_expr.py new file mode 100644 index 000000000000..c20c3c111fb8 --- /dev/null +++ b/paimon-python/pypaimon/ray/condition_expr.py @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import ast +import operator +from typing import Mapping, Set + +_PREFIXES = ("s", "t") + +_COMPARE_OPS = { + ast.Lt: operator.lt, + ast.LtE: operator.le, + ast.Gt: operator.gt, + ast.GtE: operator.ge, +} + +_ALLOWED_NODES = ( + ast.BoolOp, ast.And, ast.Or, + ast.UnaryOp, ast.Not, ast.USub, ast.UAdd, + ast.Compare, ast.Eq, ast.NotEq, ast.Lt, ast.LtE, ast.Gt, ast.GtE, + ast.Constant, ast.Attribute, ast.Name, ast.Load, +) + + +class ConditionExpr: + """A parsed merge condition over the joined row. + + Columns are referenced as ``s.col`` / ``t.col``; the evaluator reads them + from a combined ``{"s.col": ..., "t.col": ...}`` mapping. Only comparisons, + boolean and/or/not, and literals are supported, so the expression is safe to + evaluate (no ``eval``) and its referenced columns can be extracted statically. 
+ """ + + def __init__(self, source: str, body: ast.AST): + self.source = source + self._body = body + + def eval(self, combined: Mapping) -> bool: + return bool(_eval(self._body, combined)) + + def target_columns(self) -> Set[str]: + return self._columns("t") + + def source_columns(self) -> Set[str]: + return self._columns("s") + + def _columns(self, prefix: str) -> Set[str]: + cols = set() + for node in ast.walk(self._body): + if (isinstance(node, ast.Attribute) + and isinstance(node.value, ast.Name) + and node.value.id == prefix): + cols.add(node.attr) + return cols + + +def parse(source: str) -> ConditionExpr: + try: + tree = ast.parse(source, mode="eval") + except SyntaxError as e: + raise ValueError(f"Invalid merge condition {source!r}: {e}") + for node in ast.walk(tree): + if isinstance(node, ast.Expression): + continue + if not isinstance(node, _ALLOWED_NODES): + raise ValueError( + f"Unsupported syntax in merge condition {source!r}: " + f"{type(node).__name__}. Only comparisons of s./t. columns and " + f"literals combined with and/or/not are allowed." + ) + if isinstance(node, ast.Attribute): + if not (isinstance(node.value, ast.Name) and node.value.id in _PREFIXES): + raise ValueError( + f"Column reference in merge condition {source!r} must be " + f"'s.' or 't.'." + ) + if isinstance(node, ast.Name) and node.id not in _PREFIXES: + raise ValueError( + f"Unknown name {node.id!r} in merge condition {source!r}; " + f"only 's' and 't' are allowed." 
+ ) + return ConditionExpr(source, tree.body) + + +def _eval(node, combined): + if isinstance(node, ast.BoolOp): + values = (_eval(v, combined) for v in node.values) + if isinstance(node.op, ast.And): + return all(values) + return any(values) + if isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.Not): + return not _eval(node.operand, combined) + if isinstance(node, ast.Compare): + left = _operand(node.left, combined) + ok = True + for op, comparator in zip(node.ops, node.comparators): + right = _operand(comparator, combined) + ok = ok and _apply(op, left, right) + left = right + return ok + return _operand(node, combined) + + +def _operand(node, combined): + if isinstance(node, ast.Constant): + return node.value + if isinstance(node, ast.Attribute): + return combined.get(f"{node.value.id}.{node.attr}") + if isinstance(node, ast.UnaryOp): + value = _operand(node.operand, combined) + return -value if isinstance(node.op, ast.USub) else value + return _eval(node, combined) + + +def _apply(op, left, right): + if isinstance(op, ast.Eq): + return left == right + if isinstance(op, ast.NotEq): + return left != right + if left is None or right is None: + return False + return _COMPARE_OPS[type(op)](left, right) diff --git a/paimon-python/pypaimon/ray/data_evolution_merge_into.py b/paimon-python/pypaimon/ray/data_evolution_merge_into.py index 8c6c9738b1f0..2876a0a5c838 100644 --- a/paimon-python/pypaimon/ray/data_evolution_merge_into.py +++ b/paimon-python/pypaimon/ray/data_evolution_merge_into.py @@ -21,7 +21,6 @@ from dataclasses import dataclass from typing import ( Any, - Callable, Dict, List, Mapping, @@ -33,8 +32,11 @@ import pyarrow as pa +from pypaimon.ray.condition_expr import ConditionExpr +from pypaimon.ray.condition_expr import parse as parse_condition + SetSpec = Union[str, Dict[str, Any]] -Condition = Callable[[Mapping[str, Any]], bool] +Condition = str OnSpec = Union[Sequence[str], Mapping[str, str]] @@ -53,7 +55,7 @@ class WhenNotMatched: @dataclass 
class _NormalizedClause: spec: Dict[str, Any] - condition: Optional[Condition] + condition: Optional[ConditionExpr] def merge_into( @@ -62,15 +64,13 @@ def merge_into( catalog_options: Dict[str, str], *, on: OnSpec, - merge_condition: Optional[Condition] = None, when_matched: Sequence[WhenMatched] = (), when_not_matched: Sequence[WhenNotMatched] = (), num_partitions: Optional[int] = None, ray_remote_args: Optional[Dict[str, Any]] = None, concurrency: Optional[int] = None, allow_multiple_matches: bool = False, - condition_cols: Optional[List[str]] = None, -) -> None: +) -> Dict[str, int]: _require_ray_join() num_partitions = _resolve_num_partitions(num_partitions) when_matched = list(when_matched) @@ -96,19 +96,18 @@ def merge_into( ) target_field_names = list(table.field_names) - blob_cols = _blob_cols(table) on_map = dict(zip(target_on_cols, source_on_cols)) matched_specs = [ _NormalizedClause( spec=_normalize_set_spec(c.update, target_field_names, on_map), - condition=c.condition, + condition=parse_condition(c.condition) if c.condition else None, ) for c in when_matched ] not_matched_specs = [ _NormalizedClause( spec=_normalize_set_spec(c.insert, target_field_names, on_map), - condition=c.condition, + condition=parse_condition(c.condition) if c.condition else None, ) for c in when_not_matched ] @@ -128,11 +127,6 @@ def merge_into( or GLOBAL_INDEX_ACTION_THROW_ERROR ) - # Row-precise routing needs a stable per-source-row id when merge_condition - # may differ between source rows sharing the same ON key. 
- if when_not_matched and merge_condition is not None: - source_ds = _add_paimon_src_idx(source_ds) - from pypaimon.schema.data_types import PyarrowFieldParser target_pa_schema = PyarrowFieldParser.from_paimon_schema( @@ -153,7 +147,6 @@ def merge_into( source_ds=source_ds, target_on=target_on_cols, source_on=source_on_cols, - merge_condition=merge_condition, matched_clauses=matched_specs, not_matched_clauses=not_matched_specs, target_field_names=target_field_names, @@ -161,8 +154,6 @@ def merge_into( update_cols=update_cols_union, catalog_options=catalog_options, num_partitions=num_partitions, - condition_cols=condition_cols, - blob_cols=blob_cols, ) else: # Empty target → no rows can match; matched UPDATE is a no-op. @@ -173,31 +164,17 @@ def merge_into( source_ds=source_ds, target_on=target_on_cols, source_on=source_on_cols, - merge_condition=merge_condition, clauses=matched_specs, target_field_names=target_field_names, target_pa_schema=target_pa_schema, update_cols=update_cols_union, catalog_options=catalog_options, num_partitions=num_partitions, - condition_cols=condition_cols, - blob_cols=blob_cols, ) if not_matched_specs: # Empty target: nothing can match, so every source row inserts. # Skip all joins (ray's hash join crashes on empty partitions). 
- matched_keys_ds = None - if base_snapshot is not None and merge_condition is not None: - matched_keys_ds = _compute_matched_source_idx_ds( - target_identifier=target, - source_ds=source_ds, - target_on=target_on_cols, - source_on=source_on_cols, - merge_condition=merge_condition, - catalog_options=catalog_options, - num_partitions=num_partitions, - ) insert_ds = _build_not_matched_insert_ds( target_identifier=target, source_ds=source_ds, @@ -208,13 +185,13 @@ def merge_into( target_pa_schema=target_pa_schema, catalog_options=catalog_options, num_partitions=num_partitions, - matched_idx_ds=matched_keys_ds, target_empty=base_snapshot is None, ) update_msgs: list = [] + num_updated = 0 if update_ds is not None: - update_msgs = _distributed_update_apply( + update_msgs, num_updated = _distributed_update_apply( update_ds, table, update_cols_union, @@ -224,12 +201,13 @@ def merge_into( ) all_msgs: list = list(update_msgs) + num_inserted = 0 if insert_ds is not None: - all_msgs.extend( - _distributed_write_collect_msgs( - insert_ds, table, ray_remote_args=ray_remote_args, concurrency=concurrency - ) + insert_msgs = _distributed_write_collect_msgs( + insert_ds, table, ray_remote_args=ray_remote_args, concurrency=concurrency ) + num_inserted = sum(f.row_count for m in insert_msgs for f in m.new_files) + all_msgs.extend(insert_msgs) # Mirror Spark's checkUpdateResult: scope the global-index action to the # partitions the update actually wrote and the updated indexed columns. 
all_msgs.extend( @@ -243,6 +221,8 @@ def merge_into( tc.commit(all_msgs) tc.close() + return {"num_updated": num_updated, "num_inserted": num_inserted} + def _normalize_on(on: OnSpec) -> Tuple[List[str], List[str]]: if isinstance(on, Mapping): @@ -262,23 +242,19 @@ def _build_matched_update_ds( source_ds, target_on: Sequence[str], source_on: Sequence[str], - merge_condition: Optional[Condition], clauses: List[_NormalizedClause], target_field_names: Sequence[str], target_pa_schema: pa.Schema, update_cols: Sequence[str], catalog_options: Dict[str, str], num_partitions: int, - condition_cols: Optional[Sequence[str]], - blob_cols: set, ): from pypaimon.ray.ray_paimon import read_paimon from pypaimon.table.special_fields import SpecialFields row_id_name = SpecialFields.ROW_ID.name needed_cols = _resolve_target_projection( - clauses, merge_condition, target_on, update_cols, - target_field_names, condition_cols, blob_cols, + clauses, target_on, update_cols, target_field_names, ) projection = [row_id_name] + [c for c in needed_cols if c != row_id_name] @@ -301,14 +277,13 @@ def _build_matched_update_ds( ) captured_clauses = clauses - captured_merge_cond = merge_condition captured_update_cols = list(update_cols) captured_field_names = list(target_field_names) captured_row_id_name = row_id_name captured_on_pairs = list(zip(source_on, target_on)) captured_schema = update_schema - if _clauses_use_vector_fast_path(clauses, merge_condition): + if _clauses_use_vector_fast_path(clauses): first_spec = clauses[0].spec def _fast(batch: pa.Table) -> pa.Table: @@ -327,7 +302,6 @@ def _transform(batch: pa.Table) -> pa.Table: return _apply_matched_transform( batch, captured_clauses, - captured_merge_cond, captured_on_pairs, captured_update_cols, captured_field_names, @@ -338,20 +312,9 @@ def _transform(batch: pa.Table) -> pa.Table: return joined.map_batches(_transform, batch_format="pyarrow") -def _eval_cond(cond, combined): - try: - return cond(combined) - except KeyError as e: - raise 
ValueError( - f"merge condition referenced column {e} that was not read; " - f"add it to condition_cols." - ) from e - - def _apply_matched_transform( batch: pa.Table, clauses: List[_NormalizedClause], - merge_condition: Optional[Condition], on_pairs: Sequence[Tuple[str, str]], update_cols: Sequence[str], field_names: Sequence[str], @@ -368,10 +331,8 @@ def _apply_matched_transform( if s_key not in s_row and t_key in t_row: s_row[s_key] = t_row[t_key] combined = _prefixed(s_row, t_row) - if merge_condition is not None and not _eval_cond(merge_condition, combined): - continue for clause in clauses: - if clause.condition is not None and not _eval_cond(clause.condition, combined): + if clause.condition is not None and not clause.condition.eval(combined): continue new_values = _apply_set(clause.spec, s_row, t_row, field_names) out_row_ids.append(t_row[row_id_name]) @@ -403,7 +364,7 @@ def _distributed_update_apply( num_partitions: int, ray_remote_args: Optional[Dict[str, Any]] = None, allow_multiple_matches: bool = False, -) -> list: +) -> Tuple[list, int]: import numpy as np import pickle import uuid @@ -426,7 +387,7 @@ def _distributed_update_apply( ) sorted_first_row_ids = list(planner.first_row_ids) if not sorted_first_row_ids: - return [] + return [], 0 # Broadcast the file-info snapshot to every worker so they skip the # per-task manifest scan and observe a single consistent target view. @@ -474,7 +435,10 @@ def _assign_frid(batch: pa.Table) -> pa.Table: def _apply_group(group: pa.Table) -> pa.Table: if group.num_rows == 0: - return pa.Table.from_pydict({"msgs_blob": pa.array([], type=pa.binary())}) + return pa.Table.from_pydict({ + "msgs_blob": pa.array([], type=pa.binary()), + "n_updated": pa.array([], type=pa.int64()), + }) # One target _ROW_ID matched by several source rows. 
Default: refuse # (the winning value is otherwise undefined, as in Spark DE's @@ -503,7 +467,10 @@ def _apply_group(group: pa.Table) -> pa.Table: precomputed_files_info=captured_precomputed, ) msgs = worker.update_columns(for_update, list(captured_cols)) - return pa.Table.from_pydict({"msgs_blob": [pickle.dumps(msgs)]}) + return pa.Table.from_pydict({ + "msgs_blob": [pickle.dumps(msgs)], + "n_updated": pa.array([for_update.num_rows], type=pa.int64()), + }) # One group per target data file (distinct _FIRST_ROW_ID). Drive the write # shuffle with the same num_partitions knob as the join (Spark's single @@ -515,49 +482,17 @@ def _apply_group(group: pa.Table) -> pa.Table: ) all_msgs: list = [] + num_updated = 0 for batch in msgs_ds.iter_batches(batch_format="pyarrow"): for blob in batch.column("msgs_blob").to_pylist(): all_msgs.extend(pickle.loads(blob)) - return all_msgs + for n in batch.column("n_updated").to_pylist(): + num_updated += n + return all_msgs, num_updated -PAIMON_SRC_IDX_COL = "_paimon_src_idx" -MATCHED_SRC_IDX_MARKER = "_paimon_matched_src_idx" GLOBAL_INDEX_ACTION_THROW_ERROR = "THROW_ERROR" GLOBAL_INDEX_ACTION_DROP_PARTITION_INDEX = "DROP_PARTITION_INDEX" -# Min rows per hash partition for the anti-join; keeps partitions non-empty -# (ray's join crashes on empty hash partitions). -_ANTI_JOIN_ROWS_PER_PARTITION = 8 - - -def _assign_src_idx_block(block, start): - import numpy as np - import pyarrow as pa - - if not isinstance(block, pa.Table): - block = pa.Table.from_pandas(block, preserve_index=False) - idx = pa.array(np.arange(start, start + block.num_rows, dtype=np.int64)) - return block.append_column(PAIMON_SRC_IDX_COL, idx) - - -def _add_paimon_src_idx(source_ds): - """Append a unique per-row index so INSERTs are routed by row identity, - not by content. Mint ids per block from a running offset (offset derived - from block metadata, so it is deterministic under re-execution). 
Avoids - zip's realignment shuffle and the extra full copy it forces.""" - import ray - - materialized = source_ds.materialize() - assign = ray.remote(_assign_src_idx_block) - offset = 0 - refs = [] - for bundle in materialized.iter_internal_ref_bundles(): - for block_ref, meta in bundle.blocks: - refs.append(assign.remote(block_ref, offset)) - offset += meta.num_rows - if not refs: - return materialized - return ray.data.from_arrow_refs(refs) def _resolve_num_partitions(num_partitions: Optional[int]) -> int: @@ -574,12 +509,9 @@ def _resolve_num_partitions(num_partitions: Optional[int]) -> int: def _clauses_use_vector_fast_path( clauses: List[_NormalizedClause], - merge_condition: Optional[Condition], ) -> bool: if not clauses: return False - if merge_condition is not None: - return False for c in clauses: if c.condition is not None: return False @@ -650,56 +582,6 @@ def _resolve_spec_array( return pa.array([val] * batch.num_rows, type=out_type) -def _compute_matched_source_idx_ds( - *, - target_identifier: str, - source_ds, - target_on: Sequence[str], - source_on: Sequence[str], - merge_condition: Condition, - catalog_options: Dict[str, str], - num_partitions: int, -): - from pypaimon.ray.ray_paimon import read_paimon - - target_ds = read_paimon(target_identifier, catalog_options) - target_renamed = target_ds.rename_columns( - {c: f"t.{c}" for c in target_ds.schema().names} - ) - source_schema = source_ds.schema() - source_cols = list(source_schema.names) if source_schema is not None else list(source_on) - source_renamed = source_ds.rename_columns({c: f"s.{c}" for c in source_cols}) - - joined = target_renamed.join( - source_renamed, - join_type="inner", - num_partitions=num_partitions, - on=tuple(f"t.{c}" for c in target_on), - right_on=tuple(f"s.{c}" for c in source_on), - ) - - captured_merge_cond = merge_condition - captured_on_pairs = list(zip(source_on, target_on)) - out_schema = pa.schema([pa.field(MATCHED_SRC_IDX_MARKER, pa.int64())]) - - def 
_emit_matched_idx(batch: pa.Table) -> pa.Table: - out_idx: list = [] - for row in batch.to_pylist(): - s_row = {k[2:]: v for k, v in row.items() if k.startswith("s.")} - t_row = {k[2:]: v for k, v in row.items() if k.startswith("t.")} - for sk, tk in captured_on_pairs: - if sk not in s_row and tk in t_row: - s_row[sk] = t_row[tk] - combined = _prefixed(s_row, t_row) - if captured_merge_cond(combined): - out_idx.append(s_row.get(PAIMON_SRC_IDX_COL)) - return pa.Table.from_pydict( - {MATCHED_SRC_IDX_MARKER: out_idx}, schema=out_schema - ) - - return joined.map_batches(_emit_matched_idx, batch_format="pyarrow") - - def _build_not_matched_insert_ds( *, target_identifier: str, @@ -711,7 +593,6 @@ def _build_not_matched_insert_ds( target_pa_schema: pa.Schema, catalog_options: Dict[str, str], num_partitions: int, - matched_idx_ds=None, target_empty: bool = False, ): from pypaimon.ray.ray_paimon import read_paimon @@ -727,26 +608,6 @@ def _build_not_matched_insert_ds( if target_empty: unmatched = source_renamed - elif matched_idx_ds is not None: - # ray's join is equi-only, so anti-join source against the matched - # per-row ids (Spark folds this into one LeftAnti predicate). Size - # partitions to the matched count: ray's join crashes on empty hash - # partitions, so keep them dense. 
- matched_idx_ds = matched_idx_ds.materialize() - matched_count = matched_idx_ds.count() - if matched_count == 0: - unmatched = source_renamed - else: - anti_np = max( - 1, min(num_partitions, matched_count // _ANTI_JOIN_ROWS_PER_PARTITION) - ) - unmatched = source_renamed.join( - matched_idx_ds, - join_type="left_anti", - num_partitions=anti_np, - on=(f"s.{PAIMON_SRC_IDX_COL}",), - right_on=(MATCHED_SRC_IDX_MARKER,), - ) else: target_ds = read_paimon( target_identifier, catalog_options, projection=list(target_on) @@ -762,7 +623,7 @@ def _build_not_matched_insert_ds( right_on=tuple(f"t.{c}" for c in target_on), ) - if _clauses_use_vector_fast_path(clauses, None): + if _clauses_use_vector_fast_path(clauses): first_spec = clauses[0].spec def _fast(batch: pa.Table) -> pa.Table: @@ -794,10 +655,9 @@ def _apply_insert_transform( out = [] for row in rows: s_row = {k[2:]: v for k, v in row.items() if k.startswith("s.")} - s_row.pop(PAIMON_SRC_IDX_COL, None) combined = _prefixed(s_row, None) for clause in clauses: - if clause.condition is not None and not clause.condition(combined): + if clause.condition is not None and not clause.condition.eval(combined): continue out.append( _apply_set( @@ -815,7 +675,6 @@ def _build_unified_both( source_ds, target_on: Sequence[str], source_on: Sequence[str], - merge_condition: Optional[Condition], matched_clauses: List[_NormalizedClause], not_matched_clauses: List[_NormalizedClause], target_field_names: Sequence[str], @@ -823,8 +682,6 @@ def _build_unified_both( update_cols: Sequence[str], catalog_options: Dict[str, str], num_partitions: int, - condition_cols: Optional[Sequence[str]], - blob_cols: set, ): import pyarrow.compute as pc @@ -835,8 +692,7 @@ def _build_unified_both( row_id_name = SpecialFields.ROW_ID.name needed_cols = _resolve_target_projection( - matched_clauses, merge_condition, target_on, update_cols, - target_field_names, condition_cols, blob_cols, + matched_clauses, target_on, update_cols, target_field_names, ) 
projection = [row_id_name] + [c for c in needed_cols if c != row_id_name] target_ds = read_paimon(target_identifier, catalog_options, projection=projection) @@ -863,7 +719,7 @@ def _build_unified_both( on_pairs = list(zip(source_on, target_on)) update_schema = _build_update_schema(target_pa_schema, update_cols, row_id_name) - use_fast_matched = _clauses_use_vector_fast_path(matched_clauses, merge_condition) + use_fast_matched = _clauses_use_vector_fast_path(matched_clauses) first_matched_spec = matched_clauses[0].spec if use_fast_matched else None m_update_cols = list(update_cols) m_field_names = list(target_field_names) @@ -876,63 +732,30 @@ def _matched_batch(batch: pa.Table) -> pa.Table: row_id_name, update_schema, ) return _apply_matched_transform( - sub, matched_clauses, merge_condition, on_pairs, m_update_cols, + sub, matched_clauses, on_pairs, m_update_cols, m_field_names, row_id_name, update_schema, ) update_ds = joined.map_batches(_matched_batch, batch_format="pyarrow") i_field_names = list(target_field_names) - if merge_condition is None: - use_fast_insert = _clauses_use_vector_fast_path(not_matched_clauses, None) - first_insert_spec = not_matched_clauses[0].spec if use_fast_insert else None - - def _insert_batch(batch: pa.Table) -> pa.Table: - sub = batch.filter(pc.is_null(batch.column(t_row_id_col))) - if use_fast_insert: - return _coerce_large_string_types( - _vectorized_insert_transform( - sub, first_insert_spec, i_field_names, target_pa_schema - ) - ) - return _apply_insert_transform( - sub, not_matched_clauses, i_field_names, target_pa_schema - ) + use_fast_insert = _clauses_use_vector_fast_path(not_matched_clauses) + first_insert_spec = not_matched_clauses[0].spec if use_fast_insert else None - insert_ds = joined.map_batches(_insert_batch, batch_format="pyarrow") - else: - idx_schema = pa.schema([pa.field(MATCHED_SRC_IDX_MARKER, pa.int64())]) - - def _emit_matched_idx(batch: pa.Table) -> pa.Table: - sub = 
batch.filter(pc.is_valid(batch.column(t_row_id_col))) - out_idx: list = [] - for row in sub.to_pylist(): - s_row = {k[2:]: v for k, v in row.items() if k.startswith("s.")} - t_row = {k[2:]: v for k, v in row.items() if k.startswith("t.")} - for sk, tk in on_pairs: - if sk not in s_row and tk in t_row: - s_row[sk] = t_row[tk] - combined = _prefixed(s_row, t_row) - if merge_condition(combined): - out_idx.append(s_row.get(PAIMON_SRC_IDX_COL)) - return pa.Table.from_pydict( - {MATCHED_SRC_IDX_MARKER: out_idx}, schema=idx_schema + def _insert_batch(batch: pa.Table) -> pa.Table: + sub = batch.filter(pc.is_null(batch.column(t_row_id_col))) + if use_fast_insert: + return _coerce_large_string_types( + _vectorized_insert_transform( + sub, first_insert_spec, i_field_names, target_pa_schema + ) ) - - matched_idx_ds = joined.map_batches(_emit_matched_idx, batch_format="pyarrow") - insert_ds = _build_not_matched_insert_ds( - target_identifier=target_identifier, - source_ds=source_ds, - target_on=target_on, - source_on=source_on, - clauses=not_matched_clauses, - target_field_names=target_field_names, - target_pa_schema=target_pa_schema, - catalog_options=catalog_options, - num_partitions=num_partitions, - matched_idx_ds=matched_idx_ds, + return _apply_insert_transform( + sub, not_matched_clauses, i_field_names, target_pa_schema ) + insert_ds = joined.map_batches(_insert_batch, batch_format="pyarrow") + return update_ds, insert_ds @@ -1102,37 +925,20 @@ def _needed_target_cols( return [c for c in all_target_cols if c in needed] -def _blob_cols(table) -> set: - return { - f.name - for f in table.table_schema.fields - if getattr(f.type, "type", None) == "BLOB" - } - - def _resolve_target_projection( clauses: List[_NormalizedClause], - merge_condition: Optional[Condition], target_on: Sequence[str], update_cols: Sequence[str], target_field_names: Sequence[str], - condition_cols: Optional[Sequence[str]], - blob_cols: set, ) -> list: - # When the caller declares condition_cols we read 
exactly what SET and the - # conditions need (Spark-like precision). Otherwise a condition forces the - # conservative all-columns read, but blob is never read: it can't be an - # update target nor a meaningful predicate input. - has_condition = merge_condition is not None or any( - c.condition is not None for c in clauses - ) - if condition_cols is not None: - needed = set(_needed_target_cols(clauses, target_on, update_cols, target_field_names)) - needed |= {c for c in condition_cols if c in target_field_names} - return [c for c in target_field_names if c in needed] - if has_condition: - return [c for c in target_field_names if c not in blob_cols] - return _needed_target_cols(clauses, target_on, update_cols, target_field_names) + # Precise: SET-side needs plus the target columns each parsed condition + # references. Anything not referenced (e.g. blob) is never read. + needed = set(_needed_target_cols(clauses, target_on, update_cols, target_field_names)) + target_set = set(target_field_names) + for clause in clauses: + if clause.condition is not None: + needed |= clause.condition.target_columns() & target_set + return [c for c in target_field_names if c in needed] def _normalize_set_spec( diff --git a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py index d515c5277a15..6479c921056f 100644 --- a/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py +++ b/paimon-python/pypaimon/tests/ray_data_evolution_merge_into_test.py @@ -250,7 +250,7 @@ def test_matched_update_with_condition(self): when_matched=[ WhenMatched( update={'age': 's.age'}, - condition=lambda r: r['s.age'] > r['t.age'], + condition="s.age > t.age", ), ], ) @@ -259,80 +259,16 @@ def test_matched_update_with_condition(self): self.assertEqual(out['id'], [1, 2, 3]) self.assertEqual(out['age'], [10, 100, 50]) - def test_condition_cols_declared_precise(self): - target = self._create_table() - self._write( - 
target, - pa.Table.from_pydict( - { - 'id': pa.array([1, 2, 3], type=pa.int32()), - 'name': ['a', 'b', 'c'], - 'age': pa.array([10, 20, 30], type=pa.int32()), - }, - schema=self.pa_schema, - ), - ) - source = pa.Table.from_pydict( - { - 'id': pa.array([1, 2, 3], type=pa.int32()), - 'name': ['a', 'b', 'c'], - 'age': pa.array([5, 100, 50], type=pa.int32()), - }, - schema=self.pa_schema, - ) - merge_into( - target=target, - source=source, - catalog_options=self.catalog_options, - on=['id'], - when_matched=[ - WhenMatched( - update={'age': 's.age'}, - condition=lambda r: r['s.age'] > r['t.age'], - ), - ], - condition_cols=['age'], - ) - out = self._read_sorted(target) - self.assertEqual(out['id'], [1, 2, 3]) - self.assertEqual(out['age'], [10, 100, 50]) - - def test_condition_cols_underdeclared_raises(self): - target = self._create_table() - self._write( - target, - pa.Table.from_pydict( - { - 'id': pa.array([1], type=pa.int32()), - 'name': ['a'], - 'age': pa.array([10], type=pa.int32()), - }, - schema=self.pa_schema, - ), - ) - source = pa.Table.from_pydict( - { - 'id': pa.array([1], type=pa.int32()), - 'name': ['a'], - 'age': pa.array([99], type=pa.int32()), - }, - schema=self.pa_schema, - ) - with self.assertRaises(Exception) as ctx: + def test_invalid_condition_expression_raises(self): + with self.assertRaises(ValueError): merge_into( - target=target, - source=source, + target=self._create_table(), + source=self._source(), catalog_options=self.catalog_options, on=['id'], - when_matched=[ - WhenMatched( - update={'name': 's.name'}, - condition=lambda r: r['s.age'] > r['t.age'], - ), - ], - condition_cols=[], + when_matched=[WhenMatched(update={'age': 's.age'}, + condition="evil(1)")], ) - self.assertIn('condition_cols', str(ctx.exception)) def test_matched_multiple_clauses_first_match_wins(self): target = self._create_table() @@ -365,7 +301,7 @@ def test_matched_multiple_clauses_first_match_wins(self): when_matched=[ WhenMatched( update={'age': 1}, - 
condition=lambda r: r['s.age'] < r['t.age'], + condition="s.age < t.age", ), WhenMatched(update={'age': 999}), ], @@ -477,7 +413,7 @@ def test_not_matched_insert_with_condition(self): when_not_matched=[ WhenNotMatched( insert='*', - condition=lambda r: r['s.age'] >= 50, + condition="s.age >= 50", ), ], ) @@ -517,7 +453,7 @@ def test_not_matched_multiple_clauses_first_match_wins(self): when_not_matched=[ WhenNotMatched( insert={'id': 's.id', 'name': 'small', 'age': 1}, - condition=lambda r: r['s.age'] < 10, + condition="s.age < 10", ), WhenNotMatched(insert={'id': 's.id', 'name': 'big', 'age': 2}), ], @@ -528,81 +464,6 @@ def test_not_matched_multiple_clauses_first_match_wins(self): self.assertEqual(out['name'], ['a', 'small', 'big']) self.assertEqual(out['age'], [10, 1, 2]) - def test_merge_condition_filters_matched_update(self): - target = self._create_table() - self._write( - target, - pa.Table.from_pydict( - { - 'id': pa.array([1, 2], type=pa.int32()), - 'name': ['a', 'b'], - 'age': pa.array([10, 20], type=pa.int32()), - }, - schema=self.pa_schema, - ), - ) - - source = pa.Table.from_pydict( - { - 'id': pa.array([1, 2], type=pa.int32()), - 'name': ['a2', 'b2'], - 'age': pa.array([100, 5], type=pa.int32()), - }, - schema=self.pa_schema, - ) - - merge_into( - target=target, - source=source, - catalog_options=self.catalog_options, - on=['id'], - merge_condition=lambda r: r['s.age'] > r['t.age'], - when_matched=[WhenMatched(update={'name': 's.name'})], - ) - - out = self._read_sorted(target) - self.assertEqual(out['id'], [1, 2]) - self.assertEqual(out['name'], ['a2', 'b']) - self.assertEqual(out['age'], [10, 20]) - - def test_merge_condition_failure_routes_to_insert(self): - target = self._create_table() - self._write( - target, - pa.Table.from_pydict( - { - 'id': pa.array([1], type=pa.int32()), - 'name': ['old'], - 'age': pa.array([20], type=pa.int32()), - }, - schema=self.pa_schema, - ), - ) - - source = pa.Table.from_pydict( - { - 'id': pa.array([1, 2], 
type=pa.int32()), - 'name': ['new1', 'new2'], - 'age': pa.array([5, 30], type=pa.int32()), - }, - schema=self.pa_schema, - ) - - merge_into( - target=target, - source=source, - catalog_options=self.catalog_options, - on=['id'], - merge_condition=lambda r: r['s.age'] > r['t.age'], - when_not_matched=[WhenNotMatched(insert='*')], - ) - - out = self._read_sorted(target) - ids_sorted = sorted(out['id']) - self.assertEqual(ids_sorted, [1, 1, 2]) - rows = sorted(zip(out['id'], out['name'], out['age'])) - self.assertEqual(rows, [(1, 'new1', 5), (1, 'old', 20), (2, 'new2', 30)]) - def test_combined_update_and_insert(self): target = self._create_table() self._write( @@ -626,7 +487,7 @@ def test_combined_update_and_insert(self): schema=self.pa_schema, ) - merge_into( + metrics = merge_into( target=target, source=source, catalog_options=self.catalog_options, @@ -639,47 +500,7 @@ def test_combined_update_and_insert(self): self.assertEqual(out['id'], [1, 2, 3]) self.assertEqual(out['name'], ['a', 'b2', 'c']) self.assertEqual(out['age'], [10, 22, 30]) - - def test_combined_update_and_insert_with_merge_condition(self): - target = self._create_table() - self._write( - target, - pa.Table.from_pydict( - { - 'id': pa.array([1, 2], type=pa.int32()), - 'name': ['a', 'b'], - 'age': pa.array([10, 20], type=pa.int32()), - }, - schema=self.pa_schema, - ), - ) - - source = pa.Table.from_pydict( - { - 'id': pa.array([1, 2, 3], type=pa.int32()), - 'name': ['n1', 'n2', 'n3'], - 'age': pa.array([100, 5, 30], type=pa.int32()), - }, - schema=self.pa_schema, - ) - - merge_into( - target=target, - source=source, - catalog_options=self.catalog_options, - on=['id'], - merge_condition=lambda r: r['s.age'] > r['t.age'], - when_matched=[WhenMatched(update='*')], - when_not_matched=[WhenNotMatched(insert='*')], - ) - - out = self._read_sorted(target) - self.assertEqual(sorted(out['id']), [1, 2, 2, 3]) - rows = sorted(zip(out['id'], out['name'], out['age'])) - self.assertEqual( - rows, - [(1, 'n1', 100), 
(2, 'b', 20), (2, 'n2', 5), (3, 'n3', 30)], - ) + self.assertEqual(metrics, {'num_updated': 1, 'num_inserted': 1}) def test_combined_matched_clause_condition_no_merge_condition(self): target = self._create_table() @@ -712,7 +533,7 @@ def test_combined_matched_clause_condition_no_merge_condition(self): when_matched=[ WhenMatched( update={'name': 's.name'}, - condition=lambda r: r['s.age'] > 50, + condition="s.age > 50", ) ], when_not_matched=[WhenNotMatched(insert='*')], @@ -830,33 +651,6 @@ def test_insert_into_empty_target(self): self.assertEqual(out['name'], ['a', 'b', 'c']) self.assertEqual(out['age'], [10, 20, 30]) - def test_insert_into_empty_target_with_merge_condition(self): - target = self._create_table() - - source = pa.Table.from_pydict( - { - 'id': pa.array([1, 2], type=pa.int32()), - 'name': ['a', 'b'], - 'age': pa.array([10, 20], type=pa.int32()), - }, - schema=self.pa_schema, - ) - - merge_into( - target=target, - source=source, - catalog_options=self.catalog_options, - on=['id'], - merge_condition=lambda r: r['s.age'] > 0, - when_matched=[WhenMatched(update='*')], - when_not_matched=[WhenNotMatched(insert='*')], - ) - - out = self._read_sorted(target) - self.assertEqual(out['id'], [1, 2]) - self.assertEqual(out['name'], ['a', 'b']) - self.assertEqual(out['age'], [10, 20]) - def test_insert_dict_fills_unspecified_with_null(self): target = self._create_table() self._write( @@ -985,27 +779,6 @@ def test_blob_update_is_rejected(self): _reject_blob_updates(fake_table, {'payload'}) _reject_blob_updates(fake_table, {'id'}) - def test_add_paimon_src_idx_pandas_blocks(self): - import pandas as pd - - from pypaimon.ray.data_evolution_merge_into import ( - PAIMON_SRC_IDX_COL, - _add_paimon_src_idx, - ) - - pdf = pd.DataFrame( - { - 'id': pd.array(list(range(20)), dtype='int32'), - 'name': ['x'] * 20, - 'age': pd.array(list(range(20)), dtype='int64'), - } - ) - ds = ray.data.from_pandas(pdf).repartition(4) - out = _add_paimon_src_idx(ds).to_pandas() - ids = 
sorted(out[PAIMON_SRC_IDX_COL].tolist()) - self.assertEqual(len(out), 20) - self.assertEqual(ids, list(range(20))) - def test_combined_writes_single_snapshot(self): target = self._create_table() self._write( @@ -1131,84 +904,6 @@ def test_empty_target_matched_update_is_noop(self): self.assertEqual(self._snapshot_id(target), before) - def test_duplicate_identical_source_rows_route_separately(self): - target = self._create_table() - self._write( - target, - pa.Table.from_pydict( - { - 'id': pa.array([1], type=pa.int32()), - 'name': ['t1'], - 'age': pa.array([100], type=pa.int32()), - }, - schema=self.pa_schema, - ), - ) - - source = pa.Table.from_pydict( - { - 'id': pa.array([2, 2], type=pa.int32()), - 'name': ['dup', 'dup'], - 'age': pa.array([5, 5], type=pa.int32()), - }, - schema=self.pa_schema, - ) - - merge_into( - target=target, - source=source, - catalog_options=self.catalog_options, - on=['id'], - merge_condition=lambda r: r['s.age'] > r['t.age'], - when_not_matched=[WhenNotMatched(insert='*')], - ) - - out = self._read_sorted(target) - rows = sorted(zip(out['id'], out['name'], out['age'])) - self.assertEqual( - rows, - [(1, 't1', 100), (2, 'dup', 5), (2, 'dup', 5)], - ) - - def test_merge_condition_routes_per_source_row(self): - target = self._create_table() - self._write( - target, - pa.Table.from_pydict( - { - 'id': pa.array([1], type=pa.int32()), - 'name': ['original'], - 'age': pa.array([100], type=pa.int32()), - }, - schema=self.pa_schema, - ), - ) - - source = pa.Table.from_pydict( - { - 'id': pa.array([1, 1], type=pa.int32()), - 'name': ['high', 'low'], - 'age': pa.array([200, 5], type=pa.int32()), - }, - schema=self.pa_schema, - ) - - merge_into( - target=target, - source=source, - catalog_options=self.catalog_options, - on=['id'], - merge_condition=lambda r: r['s.age'] > r['t.age'], - when_not_matched=[WhenNotMatched(insert='*')], - ) - - out = self._read_sorted(target) - rows = sorted(zip(out['id'], out['name'], out['age'])) - self.assertEqual( - 
rows, - [(1, 'low', 5), (1, 'original', 100)], - ) - class RayMergeIntoGlobalIndexGateTest(unittest.TestCase): @@ -1322,32 +1017,24 @@ class TargetProjectionTest(unittest.TestCase): def _clause(self, spec, condition=None): from pypaimon.ray import data_evolution_merge_into as m - return m._NormalizedClause(spec=spec, condition=condition) + from pypaimon.ray.condition_expr import parse + return m._NormalizedClause( + spec=spec, condition=parse(condition) if condition else None + ) def test_unconditional_set_excludes_target_update_col(self): from pypaimon.ray import data_evolution_merge_into as m cols = m._resolve_target_projection( [self._clause({'feature': 's.feature'})], - None, ['id'], ['feature'], - ['id', 'feature', 'image'], None, {'image'}, + ['id'], ['feature'], ['id', 'feature', 'image'], ) self.assertEqual(['id'], cols) - def test_condition_without_decl_excludes_blob_only(self): + def test_condition_columns_are_projected(self): from pypaimon.ray import data_evolution_merge_into as m cols = m._resolve_target_projection( - [self._clause({'feature': 's.feature'})], - lambda r: True, ['id'], ['feature'], - ['id', 'age', 'feature', 'image'], None, {'image'}, - ) - self.assertEqual(['id', 'age', 'feature'], cols) - - def test_condition_cols_declared_is_precise(self): - from pypaimon.ray import data_evolution_merge_into as m - cols = m._resolve_target_projection( - [self._clause({'feature': 's.feature'})], - lambda r: True, ['id'], ['feature'], - ['id', 'age', 'feature', 'image'], ['age'], {'image'}, + [self._clause({'feature': 's.feature'}, condition="t.age > 0")], + ['id'], ['feature'], ['id', 'age', 'feature', 'image'], ) self.assertEqual(['id', 'age'], cols) From 68152a1f1cc372ec43854daa2e78f4851e607d2b Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Sat, 30 May 2026 16:59:03 +0800 Subject: [PATCH 32/32] [python] doc: document Ray merge_into in ray-data.md --- docs/docs/pypaimon/ray-data.md | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 
insertions(+) diff --git a/docs/docs/pypaimon/ray-data.md b/docs/docs/pypaimon/ray-data.md index 3ee4db328979..074cf768ce6f 100644 --- a/docs/docs/pypaimon/ray-data.md +++ b/docs/docs/pypaimon/ray-data.md @@ -277,3 +277,48 @@ write_builder = table.new_batch_write_builder().overwrite() # overwrite partition 'dt=2024-01-01' write_builder = table.new_batch_write_builder().overwrite({'dt': '2024-01-01'}) ``` + +## Merge Into + +`merge_into` updates (and optionally inserts) rows of a **data-evolution** table +from a source, like SQL `MERGE INTO`. Matched rows are updated in place by +`_ROW_ID`; only the touched columns are rewritten. Requires `ray >= 2.50` and a +target table with `'data-evolution.enabled'` and `'row-tracking.enabled'` set. + +```python +from pypaimon.ray import merge_into, WhenMatched, WhenNotMatched + +metrics = merge_into( + target="database_name.table_name", + source=ray_dataset, # ray.data.Dataset / pa.Table / pandas / table-name str + catalog_options={"warehouse": "/path/to/warehouse"}, + on=["id"], # or {"target_col": "source_col"} for renamed keys + when_matched=[WhenMatched(update={"score": "s.score"})], # or update="*" + when_not_matched=[WhenNotMatched(insert="*")], # optional +) +print(metrics) # {"num_updated": 3, "num_inserted": 2} +``` + +- `update` / `insert`: `"*"` (all columns from source), or a dict mapping target + columns to `"s.<col>"`, `"t.<col>"`, or a literal. +- `condition` (optional): a string expression over `s.<col>` / `t.<col>` using + `> < >= <= == != and or not`; only referenced columns are read. Example: + `WhenMatched(update={"score": "s.score"}, condition="s.version > t.version")`. + +**Parameters:** +- `on`: key columns, or `{target_col: source_col}` for renamed keys. +- `num_partitions`: shuffle parallelism for the join and the write; defaults to + `max(16, cluster_cpus * 2)`; raise it for large merges. +- `ray_remote_args`, `concurrency`: scheduling for the insert path.
+- `allow_multiple_matches`: if `False` (default), a target row matched by + multiple source rows raises an error; `True` keeps the first match. + +**Returns:** a dict of row counts with keys `"num_updated"` and `"num_inserted"`. + +**Notes:** +- Blob columns cannot be updated and are never read into the join. +- Updating a globally-indexed column raises an error by default; set + `'global-index.column-update-action' = 'DROP_PARTITION_INDEX'` to drop the + affected index instead (rebuild afterwards). +- Cost scales with how many data files the updated rows touch; scattered updates + over a large table rewrite the updated column of many files.