Add deduplicate option to DataFrame.join to drop duplicate join columns

kosiew · kosiew · commit ab224a64b631 · 2025-07-08T21:37:05.000+08:00
- Added a `deduplicate` boolean parameter to `DataFrame.join` that,
  when True, drops duplicate join columns from the right DataFrame after join.
- Implemented helper methods `_resolve_join_keys` and `_prepare_deduplicate`
  to normalize join key arguments and handle column renaming and dropping.
- Updated the join logic to rename the join columns in the right DataFrame,
  join on the renamed columns, and drop the renamed duplicates after the join.
- Added tests `test_join_deduplicate` and `test_join_deduplicate_multi` covering
  deduplication of single and multiple join columns.
- Extended documentation with example usage of `deduplicate` for disambiguating columns.

Also added Copilot and agent instructions files describing Python and Rust style guidelines,
pre-commit usage, testing, and code organization conventions for the DataFusion Python project.
diff --git a/docs/source/user-guide/common-operations/joins.rst b/docs/source/user-guide/common-operations/joins.rst
@@ -101,4 +101,34 @@ the right table.
 
 .. ipython:: python
 
-    left.join(right, left_on="customer_id", right_on="id", how="anti")
+    left.join(right, left_on="customer_id", right_on="id", how="anti")
+
+Disambiguating Columns
+----------------------
+
+When the join key exists in both DataFrames under the same name, the result contains two columns with that name. Assign a name to each DataFrame to use as a prefix and avoid ambiguity.
+
+.. ipython:: python
+
+    from datafusion import col
+    left = ctx.from_pydict({"id": [1, 2]}, name="l")
+    right = ctx.from_pydict({"id": [2, 3]}, name="r")
+    joined = left.join(right, on="id")
+    joined.select(col("l.id"), col("r.id"))
+
+You can remove the duplicate column after joining.
+
+.. ipython:: python
+
+    joined.drop("r.id")
+
+Automatic Deduplication
+-----------------------
+
+Use the ``deduplicate`` argument of :py:meth:`~datafusion.dataframe.DataFrame.join` to automatically
+drop the duplicate join column from the right DataFrame. It only takes effect when joining with ``on``.
+
+.. ipython:: python
+
+    left.join(right, on="id", deduplicate=True)
+
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -678,6 +678,7 @@ def join(
         left_on: str | Sequence[str] | None = None,
         right_on: str | Sequence[str] | None = None,
         join_keys: tuple[list[str], list[str]] | None = None,
+        deduplicate: bool = False,
     ) -> DataFrame:
         """Join this :py:class:`DataFrame` with another :py:class:`DataFrame`.
 
@@ -691,20 +692,39 @@ def join(
             left_on: Join column of the left dataframe.
             right_on: Join column of the right dataframe.
             join_keys: Tuple of two lists of column names to join on. [Deprecated]
+            deduplicate: If ``True``, drop duplicate join columns from the
+                right DataFrame similar to PySpark's ``on`` behavior.
 
         Returns:
             DataFrame after join.
         """
-        # This check is to prevent breaking API changes where users prior to
-        # DF 43.0.0 would  pass the join_keys as a positional argument instead
-        # of a keyword argument.
+        on, left_on, right_on = self._resolve_join_keys(
+            on, left_on, right_on, join_keys
+        )
+
+        drop_cols: list[str] | None = None
+        if deduplicate and on is not None:
+            right, drop_cols, left_on, right_on = self._prepare_deduplicate(right, on)
+
+        result = DataFrame(self.df.join(right.df, how, left_on, right_on))
+        if drop_cols:
+            result = result.drop(*drop_cols)
+        return result
+
+    def _resolve_join_keys(
+        self,
+        on: str | Sequence[str] | tuple[list[str], list[str]] | None,
+        left_on: str | Sequence[str] | None,
+        right_on: str | Sequence[str] | None,
+        join_keys: tuple[list[str], list[str]] | None,
+    ) -> tuple[str | Sequence[str] | None, list[str], list[str]]:
+        """Normalize join key arguments and validate them."""
         if (
             isinstance(on, tuple)
             and len(on) == 2
             and isinstance(on[0], list)
             and isinstance(on[1], list)
         ):
-            # We know this is safe because we've checked the types
             join_keys = on  # type: ignore[assignment]
             on = None
 
@@ -730,12 +750,25 @@ def join(
         else:
             error_msg = "either `on` or `left_on` and `right_on` should be provided."
             raise ValueError(error_msg)
-        if isinstance(left_on, str):
-            left_on = [left_on]
-        if isinstance(right_on, str):
-            right_on = [right_on]
 
-        return DataFrame(self.df.join(right.df, how, left_on, right_on))
+        left_names = [left_on] if isinstance(left_on, str) else list(left_on)
+        right_names = [right_on] if isinstance(right_on, str) else list(right_on)
+
+        return on, left_names, right_names
+
+    def _prepare_deduplicate(
+        self, right: DataFrame, on: str | Sequence[str]
+    ) -> tuple[DataFrame, list[str], list[str], list[str]]:
+        """Rename join columns to drop them after joining."""
+        drop_cols: list[str] = []
+        right_aliases: list[str] = []
+        on_cols = [on] if isinstance(on, str) else list(on)
+        for col_name in on_cols:
+            alias = f"__right_{col_name}"
+            right = right.with_column_renamed(col_name, alias)
+            right_aliases.append(alias)
+            drop_cols.append(alias)
+        return right, drop_cols, on_cols, right_aliases
 
     def join_on(
         self,
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -519,6 +519,52 @@ def test_join_on():
     assert table.to_pydict() == expected
 
 
+def test_join_deduplicate():
+    ctx = SessionContext()
+
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2]), pa.array(["l1", "l2"])],
+        names=["id", "left_val"],
+    )
+    left = ctx.create_dataframe([[batch]], "l")
+
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2]), pa.array(["r1", "r2"])],
+        names=["id", "right_val"],
+    )
+    right = ctx.create_dataframe([[batch]], "r")
+
+    joined = left.join(right, on="id", deduplicate=True)
+    joined = joined.sort(column("id"))
+    table = pa.Table.from_batches(joined.collect())
+
+    expected = {"id": [1, 2], "right_val": ["r1", "r2"], "left_val": ["l1", "l2"]}
+    assert table.to_pydict() == expected
+
+
+def test_join_deduplicate_multi():
+    ctx = SessionContext()
+
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2]), pa.array([3, 4]), pa.array(["x", "y"])],
+        names=["a", "b", "l"],
+    )
+    left = ctx.create_dataframe([[batch]], "l")
+
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2]), pa.array([3, 4]), pa.array(["u", "v"])],
+        names=["a", "b", "r"],
+    )
+    right = ctx.create_dataframe([[batch]], "r")
+
+    joined = left.join(right, on=["a", "b"], deduplicate=True)
+    joined = joined.sort(column("a"))
+    table = pa.Table.from_batches(joined.collect())
+
+    expected = {"a": [1, 2], "b": [3, 4], "r": ["u", "v"], "l": ["x", "y"]}
+    assert table.to_pydict() == expected
+
+
 def test_distinct():
     ctx = SessionContext()