Skip to content

Commit a5304f5

Browse files
committed
Revert "UNPICK"
This reverts commit 44b9d1a.
1 parent 44b9d1a commit a5304f5

File tree

6 files changed

+357
-87
lines changed

6 files changed

+357
-87
lines changed

docs/source/user-guide/dataframe/index.rst

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,51 @@ DataFusion's DataFrame API offers a wide range of operations:
126126
# Drop columns
127127
df = df.drop("temporary_column")
128128
129+
String Columns and Expressions
130+
------------------------------
131+
132+
Some ``DataFrame`` methods accept plain strings when an argument refers to an
133+
existing column. These include:
134+
135+
* :py:meth:`~datafusion.DataFrame.select`
136+
* :py:meth:`~datafusion.DataFrame.sort`
137+
* :py:meth:`~datafusion.DataFrame.drop`
138+
* :py:meth:`~datafusion.DataFrame.join` (``on`` argument)
139+
* :py:meth:`~datafusion.DataFrame.aggregate` (grouping columns)
140+
141+
For such methods, you can pass column names directly:
142+
143+
.. code-block:: python
144+
145+
from datafusion import col, functions as f
146+
147+
df.sort('id')
148+
df.aggregate('id', [f.count(col('value'))])
149+
150+
The same operation can also be written with explicit column expressions, using either ``col()`` or ``column()``:
151+
152+
.. code-block:: python
153+
154+
from datafusion import col, column, functions as f
155+
156+
df.sort(col('id'))
157+
df.aggregate(column('id'), [f.count(col('value'))])
158+
159+
Note that ``column()`` is an alias of ``col()``, so either name may be used interchangeably; the example above demonstrates both.
160+
161+
Whenever an argument represents an expression—such as in
162+
:py:meth:`~datafusion.DataFrame.filter` or
163+
:py:meth:`~datafusion.DataFrame.with_column`—use ``col()`` to reference columns
164+
and wrap constant values with ``lit()`` (also available as ``literal()``):
165+
166+
.. code-block:: python
167+
168+
from datafusion import col, lit
169+
df.filter(col('age') > lit(21))
170+
171+
Without ``lit()``, DataFusion would treat ``21`` as a column name rather than a
172+
constant value.
173+
129174
Terminal Operations
130175
-------------------
131176

python/datafusion/context.py

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
from datafusion.catalog import Catalog, CatalogProvider, Table
3333
from datafusion.dataframe import DataFrame
34-
from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list
34+
from datafusion.expr import SortKey, sort_list_to_raw_sort_list
3535
from datafusion.record_batch import RecordBatchStream
3636
from datafusion.user_defined import AggregateUDF, ScalarUDF, TableFunction, WindowUDF
3737

@@ -553,7 +553,7 @@ def register_listing_table(
553553
table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
554554
file_extension: str = ".parquet",
555555
schema: pa.Schema | None = None,
556-
file_sort_order: list[list[Expr | SortExpr]] | None = None,
556+
file_sort_order: list[list[SortKey]] | None = None,
557557
) -> None:
558558
"""Register multiple files as a single table.
559559
@@ -567,23 +567,20 @@ def register_listing_table(
567567
table_partition_cols: Partition columns.
568568
file_extension: File extension of the provided table.
569569
schema: The data source schema.
570-
file_sort_order: Sort order for the file.
570+
file_sort_order: Sort order for the file. Each sort key can be
571+
specified as a column name (``str``), an expression
572+
(``Expr``), or a ``SortExpr``.
571573
"""
572574
if table_partition_cols is None:
573575
table_partition_cols = []
574576
table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
575-
file_sort_order_raw = (
576-
[sort_list_to_raw_sort_list(f) for f in file_sort_order]
577-
if file_sort_order is not None
578-
else None
579-
)
580577
self.ctx.register_listing_table(
581578
name,
582579
str(path),
583580
table_partition_cols,
584581
file_extension,
585582
schema,
586-
file_sort_order_raw,
583+
self._convert_file_sort_order(file_sort_order),
587584
)
588585

589586
def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame:
@@ -808,7 +805,7 @@ def register_parquet(
808805
file_extension: str = ".parquet",
809806
skip_metadata: bool = True,
810807
schema: pa.Schema | None = None,
811-
file_sort_order: list[list[SortExpr]] | None = None,
808+
file_sort_order: list[list[SortKey]] | None = None,
812809
) -> None:
813810
"""Register a Parquet file as a table.
814811
@@ -827,7 +824,9 @@ def register_parquet(
827824
that may be in the file schema. This can help avoid schema
828825
conflicts due to metadata.
829826
schema: The data source schema.
830-
file_sort_order: Sort order for the file.
827+
file_sort_order: Sort order for the file. Each sort key can be
828+
specified as a column name (``str``), an expression
829+
(``Expr``), or a ``SortExpr``.
831830
"""
832831
if table_partition_cols is None:
833832
table_partition_cols = []
@@ -840,9 +839,7 @@ def register_parquet(
840839
file_extension,
841840
skip_metadata,
842841
schema,
843-
[sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order]
844-
if file_sort_order is not None
845-
else None,
842+
self._convert_file_sort_order(file_sort_order),
846843
)
847844

848845
def register_csv(
@@ -1099,7 +1096,7 @@ def read_parquet(
10991096
file_extension: str = ".parquet",
11001097
skip_metadata: bool = True,
11011098
schema: pa.Schema | None = None,
1102-
file_sort_order: list[list[Expr | SortExpr]] | None = None,
1099+
file_sort_order: list[list[SortKey]] | None = None,
11031100
) -> DataFrame:
11041101
"""Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`.
11051102
@@ -1116,19 +1113,17 @@ def read_parquet(
11161113
schema: An optional schema representing the parquet files. If None,
11171114
the parquet reader will try to infer it based on data in the
11181115
file.
1119-
file_sort_order: Sort order for the file.
1116+
file_sort_order: Sort order for the file. Each sort key can be
1117+
specified as a column name (``str``), an expression
1118+
(``Expr``), or a ``SortExpr``.
11201119
11211120
Returns:
11221121
DataFrame representation of the read Parquet files
11231122
"""
11241123
if table_partition_cols is None:
11251124
table_partition_cols = []
11261125
table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
1127-
file_sort_order = (
1128-
[sort_list_to_raw_sort_list(f) for f in file_sort_order]
1129-
if file_sort_order is not None
1130-
else None
1131-
)
1126+
file_sort_order = self._convert_file_sort_order(file_sort_order)
11321127
return DataFrame(
11331128
self.ctx.read_parquet(
11341129
str(path),
@@ -1179,6 +1174,24 @@ def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream:
11791174
"""Execute the ``plan`` and return the results."""
11801175
return RecordBatchStream(self.ctx.execute(plan._raw_plan, partitions))
11811176

1177+
@staticmethod
1178+
def _convert_file_sort_order(
1179+
file_sort_order: list[list[SortKey]] | None,
1180+
) -> list[list[Any]] | None:
1181+
"""Convert nested ``SortKey`` lists into raw sort representations.
1182+
1183+
Each ``SortKey`` can be a column name string, an ``Expr``, or a
1184+
``SortExpr`` and will be converted using
1185+
:func:`datafusion.expr.sort_list_to_raw_sort_list`.
1186+
"""
1187+
# Convert each ``SortKey`` in the provided sort order to the low-level
1188+
# representation expected by the Rust bindings.
1189+
return (
1190+
[sort_list_to_raw_sort_list(f) for f in file_sort_order]
1191+
if file_sort_order is not None
1192+
else None
1193+
)
1194+
11821195
@staticmethod
11831196
def _convert_table_partition_cols(
11841197
table_partition_cols: list[tuple[str, str | pa.DataType]],

python/datafusion/dataframe.py

Lines changed: 62 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,13 @@
4040
from datafusion._internal import DataFrame as DataFrameInternal
4141
from datafusion._internal import ParquetColumnOptions as ParquetColumnOptionsInternal
4242
from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal
43-
from datafusion.expr import Expr, SortExpr, sort_or_default
43+
from datafusion.expr import (
44+
EXPR_TYPE_ERROR,
45+
Expr,
46+
SortKey,
47+
expr_list_to_raw_expr_list,
48+
sort_list_to_raw_sort_list,
49+
)
4450
from datafusion.plan import ExecutionPlan, LogicalPlan
4551
from datafusion.record_batch import RecordBatchStream
4652

@@ -286,6 +292,23 @@ def __init__(
286292
self.bloom_filter_ndv = bloom_filter_ndv
287293

288294

295+
def _ensure_expr(value: Expr) -> expr_internal.Expr:
296+
"""Return the internal expression or raise ``TypeError`` if invalid.
297+
298+
Args:
299+
value: Candidate expression.
300+
301+
Returns:
302+
The internal expression representation.
303+
304+
Raises:
305+
TypeError: If ``value`` is not an instance of :class:`Expr`.
306+
"""
307+
if not isinstance(value, Expr):
308+
raise TypeError(EXPR_TYPE_ERROR)
309+
return value.expr
310+
311+
289312
class DataFrame:
290313
"""Two dimensional table representation of data.
291314
@@ -394,9 +417,7 @@ def select(self, *exprs: Expr | str) -> DataFrame:
394417
df = df.select("a", col("b"), col("a").alias("alternate_a"))
395418
396419
"""
397-
exprs_internal = [
398-
Expr.column(arg).expr if isinstance(arg, str) else arg.expr for arg in exprs
399-
]
420+
exprs_internal = expr_list_to_raw_expr_list(exprs)
400421
return DataFrame(self.df.select(*exprs_internal))
401422

402423
def drop(self, *columns: str) -> DataFrame:
@@ -426,7 +447,7 @@ def filter(self, *predicates: Expr) -> DataFrame:
426447
"""
427448
df = self.df
428449
for p in predicates:
429-
df = df.filter(p.expr)
450+
df = df.filter(_ensure_expr(p))
430451
return DataFrame(df)
431452

432453
def with_column(self, name: str, expr: Expr) -> DataFrame:
@@ -439,7 +460,7 @@ def with_column(self, name: str, expr: Expr) -> DataFrame:
439460
Returns:
440461
DataFrame with the new column.
441462
"""
442-
return DataFrame(self.df.with_column(name, expr.expr))
463+
return DataFrame(self.df.with_column(name, _ensure_expr(expr)))
443464

444465
def with_columns(
445466
self, *exprs: Expr | Iterable[Expr], **named_exprs: Expr
@@ -468,17 +489,24 @@ def with_columns(
468489
def _simplify_expression(
469490
*exprs: Expr | Iterable[Expr], **named_exprs: Expr
470491
) -> list[expr_internal.Expr]:
471-
expr_list = []
492+
expr_list: list[expr_internal.Expr] = []
472493
for expr in exprs:
473-
if isinstance(expr, Expr):
474-
expr_list.append(expr.expr)
475-
elif isinstance(expr, Iterable):
476-
expr_list.extend(inner_expr.expr for inner_expr in expr)
494+
if isinstance(expr, str):
495+
raise TypeError(EXPR_TYPE_ERROR)
496+
if isinstance(expr, Iterable) and not isinstance(expr, Expr):
497+
expr_value = list(expr)
498+
if any(isinstance(inner, str) for inner in expr_value):
499+
raise TypeError(EXPR_TYPE_ERROR)
477500
else:
478-
raise NotImplementedError
479-
if named_exprs:
480-
for alias, expr in named_exprs.items():
481-
expr_list.append(expr.alias(alias).expr)
501+
expr_value = expr
502+
try:
503+
expr_list.extend(expr_list_to_raw_expr_list(expr_value))
504+
except TypeError as err:
505+
raise TypeError(EXPR_TYPE_ERROR) from err
506+
for alias, expr in named_exprs.items():
507+
if not isinstance(expr, Expr):
508+
raise TypeError(EXPR_TYPE_ERROR)
509+
expr_list.append(expr.alias(alias).expr)
482510
return expr_list
483511

484512
expressions = _simplify_expression(*exprs, **named_exprs)
@@ -503,37 +531,43 @@ def with_column_renamed(self, old_name: str, new_name: str) -> DataFrame:
503531
return DataFrame(self.df.with_column_renamed(old_name, new_name))
504532

505533
def aggregate(
506-
self, group_by: list[Expr] | Expr, aggs: list[Expr] | Expr
534+
self,
535+
group_by: list[Expr | str] | Expr | str,
536+
aggs: list[Expr] | Expr,
507537
) -> DataFrame:
508538
"""Aggregates the rows of the current DataFrame.
509539
510540
Args:
511-
group_by: List of expressions to group by.
541+
group_by: List of expressions or column names to group by.
512542
aggs: List of expressions to aggregate.
513543
514544
Returns:
515545
DataFrame after aggregation.
516546
"""
517-
group_by = group_by if isinstance(group_by, list) else [group_by]
518-
aggs = aggs if isinstance(aggs, list) else [aggs]
547+
group_by_list = group_by if isinstance(group_by, list) else [group_by]
548+
aggs_list = aggs if isinstance(aggs, list) else [aggs]
519549

520-
group_by = [e.expr for e in group_by]
521-
aggs = [e.expr for e in aggs]
522-
return DataFrame(self.df.aggregate(group_by, aggs))
550+
group_by_exprs = expr_list_to_raw_expr_list(group_by_list)
551+
aggs_exprs = []
552+
for agg in aggs_list:
553+
if not isinstance(agg, Expr):
554+
raise TypeError(EXPR_TYPE_ERROR)
555+
aggs_exprs.append(agg.expr)
556+
return DataFrame(self.df.aggregate(group_by_exprs, aggs_exprs))
523557

524-
def sort(self, *exprs: Expr | SortExpr) -> DataFrame:
525-
"""Sort the DataFrame by the specified sorting expressions.
558+
def sort(self, *exprs: SortKey) -> DataFrame:
559+
"""Sort the DataFrame by the specified sorting expressions or column names.
526560
527561
Note that any expression can be turned into a sort expression by
528-
calling its` ``sort`` method.
562+
calling its ``sort`` method.
529563
530564
Args:
531-
exprs: Sort expressions, applied in order.
565+
exprs: Sort expressions or column names, applied in order.
532566
533567
Returns:
534568
DataFrame after sorting.
535569
"""
536-
exprs_raw = [sort_or_default(expr) for expr in exprs]
570+
exprs_raw = sort_list_to_raw_sort_list(list(exprs))
537571
return DataFrame(self.df.sort(*exprs_raw))
538572

539573
def cast(self, mapping: dict[str, pa.DataType[Any]]) -> DataFrame:
@@ -757,7 +791,7 @@ def join_on(
757791
Returns:
758792
DataFrame after join.
759793
"""
760-
exprs = [expr.expr for expr in on_exprs]
794+
exprs = [_ensure_expr(expr) for expr in on_exprs]
761795
return DataFrame(self.df.join_on(right.df, exprs, how))
762796

763797
def explain(self, verbose: bool = False, analyze: bool = False) -> None:

0 commit comments

Comments
 (0)