apache · mbutrovich · May 21, 2026 · May 8, 2026 · May 8, 2026 · May 8, 2026
diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
@@ -302,6 +302,7 @@ jobs:
               org.apache.comet.CometFuzzAggregateSuite
               org.apache.comet.CometFuzzIcebergSuite
               org.apache.comet.CometFuzzMathSuite
+              org.apache.comet.CometCodegenFuzzSuite
               org.apache.comet.DataGeneratorSuite
           - name: "shuffle"
             value: |
@@ -380,6 +381,9 @@ jobs:
               org.apache.comet.expressions.conditional.CometIfSuite
               org.apache.comet.expressions.conditional.CometCoalesceSuite
               org.apache.comet.expressions.conditional.CometCaseWhenSuite
+              org.apache.comet.CometCodegenSuite
+              org.apache.comet.CometCodegenSourceSuite
+              org.apache.comet.CometCodegenHOFSuite
           - name: "sql"
             value: |
               org.apache.spark.sql.CometToPrettyStringSuite

diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
@@ -155,6 +155,7 @@ jobs:
               org.apache.comet.CometFuzzAggregateSuite
               org.apache.comet.CometFuzzIcebergSuite
               org.apache.comet.CometFuzzMathSuite
+              org.apache.comet.CometCodegenFuzzSuite
               org.apache.comet.DataGeneratorSuite
           - name: "shuffle"
             value: |
@@ -232,6 +233,9 @@ jobs:
               org.apache.comet.expressions.conditional.CometIfSuite
               org.apache.comet.expressions.conditional.CometCoalesceSuite
               org.apache.comet.expressions.conditional.CometCaseWhenSuite
+              org.apache.comet.CometCodegenSuite
+              org.apache.comet.CometCodegenSourceSuite
+              org.apache.comet.CometCodegenHOFSuite
           - name: "sql"
             value: |
               org.apache.spark.sql.CometToPrettyStringSuite

diff --git a/docs/source/user-guide/latest/iceberg.md b/docs/source/user-guide/latest/iceberg.md
@@ -146,6 +146,24 @@ The following scenarios will fall back to Spark's native Iceberg reader:
 - Dynamic Partition Pruning under Adaptive Query Execution (non-AQE DPP is supported);
   see [#3510](https://github.com/apache/datafusion-comet/issues/3510)
 
+### Iceberg UDFs
+
+Iceberg ships several `ScalaUDF`s that surface in user queries and maintenance actions:
+
+- `IcebergSpark.registerBucketUDF` and `registerTruncateUDF` register `bucket(N, col)` and
+  `truncate(W, col)` for use in `SELECT` / `JOIN` / `WHERE` predicates that align with hidden
+  partitioning.
+- `RewriteDataFiles` with `sort-strategy=zorder` builds a tree of per-type ordered-bytes UDFs
+  (`INT_ORDERED_BYTES`, `LONG_ORDERED_BYTES`, ..., `INTERLEAVE_BYTES`) over the sort key columns
+  during compaction.
+
+By default these UDFs cause the enclosing operator to fall back to Spark, which forces a
+columnar-to-row roundtrip and demotes the surrounding shuffle from `CometExchange` to
+`CometColumnarExchange`. Enabling the experimental
+[Scala UDF and Java UDF Support](scala_java_udfs.md) feature
+(`spark.comet.exec.scalaUDF.codegen.enabled=true`) routes these UDFs through native execution so
+the project, exchange, and sort operators around them stay on the Comet path end-to-end.
+
 ### Task input metrics
 
 The native Iceberg reader populates Spark's task-level `inputMetrics.bytesRead` (visible in the Spark UI Stages tab) using the `bytes_read` counter from iceberg-rust's `ScanMetrics`. This counter includes bytes read from both data files and delete files.

diff --git a/docs/source/user-guide/latest/index.rst b/docs/source/user-guide/latest/index.rst
@@ -43,6 +43,7 @@ to read more.
    Supported Data Types <datatypes>
    Supported Operators <operators>
    Supported Expressions <expressions>
+   Scala UDF and Java UDF Support <scala_java_udfs>
    Configuration Settings <configs>
    Compatibility Guide <compatibility/index>
    Understanding Comet Plans <understanding-comet-plans>

diff --git a/docs/source/user-guide/latest/scala_java_udfs.md b/docs/source/user-guide/latest/scala_java_udfs.md
@@ -0,0 +1,61 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Scala UDF and Java UDF Support
+
+When this feature is enabled, Comet executes Spark's Scala and Java [scalar user-defined functions (UDFs)](https://spark.apache.org/docs/latest/sql-ref-functions-udf-scalar.html) on the native Comet path. The presence of a UDF then no longer forces the enclosing operator off the native path; surrounding native operators stay native.
+
+This page covers Spark's `ScalaUDF` (Scala `udf(...)`, `spark.udf.register(...)` over Scala or Java functional interfaces, and SQL `CREATE FUNCTION ... AS 'com.example.MyUDF'`). Other UDF kinds (Python / Pandas, Hive, aggregate) are out of scope and continue to fall back to Spark.
+
+This feature is experimental and disabled by default.
+
+## Configuration
+
+| Key                                         | Default | Description                                                                                                        |
+| ------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------ |
+| `spark.comet.exec.scalaUDF.codegen.enabled` | `false` | When `true`, eligible `ScalaUDF`s run on the Comet path. When `false`, the enclosing operator falls back to Spark. |
+
+## Supported
+
+- User functions registered via `udf(...)`, `spark.udf.register(...)` (Scala or Java functional interfaces), or SQL `CREATE FUNCTION ... AS 'com.example.MyUDF'`.
+- Scalar input/output types: `Boolean`, `Byte`, `Short`, `Int`, `Long`, `Float`, `Double`, `Decimal`, `String`, `Binary`, `Date`, `Timestamp`, `TimestampNTZ`.
+- Complex input/output types with arbitrary nesting: `ArrayType`, `StructType`, `MapType`.
+- Composition with other Catalyst expressions inside the argument tree (e.g. `myUdf(upper(s))` runs as one native unit).
+- Higher-order functions (`transform`, `filter`, `exists`, `aggregate`, `zip_with`, `map_filter`, `map_zip_with`, etc.) inside the argument tree.
+
+## Not supported
+
+- Aggregate UDFs (`ScalaAggregator`, `TypedImperativeAggregate`, the legacy `UserDefinedAggregateFunction`).
+- Table UDFs and generators.
+- Python `@udf` and Pandas `@pandas_udf`.
+- Hive `GenericUDF` and `SimpleUDF`.
+- `CalendarIntervalType`, `NullType`, and `UserDefinedType` arguments and return types. UDT-typed columns fall back to Spark; for native execution, store and read the underlying representation directly (e.g. write MLlib `Vector` outputs as `Struct<type: Byte, size: Int, indices: Array<Int>, values: Array<Double>>` rather than `VectorUDT`).
+- Trees whose total nested-field count (output plus all input columns the UDF tree references) exceeds `spark.sql.codegen.maxFields` (default 100). Comet refuses these at plan time and the operator falls back to Spark.
+
+When a UDF is rejected, the reason surfaces through Comet's standard fallback diagnostics; the query still runs on Spark.
+
+## Behavior
+
+- Non-deterministic expressions referenced from the argument tree (`rand`, `uuid`, `monotonically_increasing_id`) produce per-partition sequences consistent with Spark.
+- `TaskContext.get()` inside the user function returns the driving Spark task's context.
+- The user function must be closure-serializable; a function that already runs on Spark's executors works here unchanged.
+
+## Known limitations
+
+- Each query containing a ScalaUDF pays a one-time codegen cost on its first batch and reuses the compiled kernel for subsequent batches, matching Spark's whole-stage codegen behavior. Bytecode is deduplicated JVM-wide through the same `CodeGenerator` cache that Spark's codegen uses, so structurally identical queries across a session share the compiled class.
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
@@ -211,9 +211,9 @@ impl PhysicalPlanner {
         self
     }
 
-    /// Attach the Spark `TaskContext` global reference captured at `createPlan` time. Cloned
-    /// into every `JvmScalarUdfExpr` the planner builds so the JNI bridge can install it as
-    /// the thread-local on the Tokio worker driving the UDF.
+    /// Attach a propagated Spark `TaskContext` global reference. Called by the JNI `executePlan`
+    /// entry with whatever was captured at `createPlan` time. The planner clones this `Option`
+    /// into every `JvmScalarUdfExpr` it builds.
     pub fn with_task_context(
         mut self,
         task_context: Option<Arc<Global<JObject<'static>>>>,
@@ -742,6 +742,13 @@ impl PhysicalPlanner {
                     to_arrow_datatype(udf.return_type.as_ref().ok_or_else(|| {
                         GeneralError("JvmScalarUdf missing return_type".to_string())
                     })?);
+                // Invariant: task_context is propagated for every JvmScalarUdfExpr built during
+                // normal execution. The TEST_EXEC_CONTEXT_ID path is the only context in which
+                // task_context may legitimately be None (unit tests, direct native driver runs).
+                debug_assert!(
+                    self.task_context.is_some() || self.exec_context_id == TEST_EXEC_CONTEXT_ID,
+                    "task_context must be set for non-test execution"
+                );
                 Ok(Arc::new(JvmScalarUdfExpr::new(
                     udf.class_name.clone(),
                     args,

diff --git a/native/spark-expr/src/jvm_udf/mod.rs b/native/spark-expr/src/jvm_udf/mod.rs
@@ -59,6 +59,10 @@ impl JvmScalarUdfExpr {
         return_nullable: bool,
         task_context: Option<Arc<Global<JObject<'static>>>>,
     ) -> Self {
+        debug_assert!(
+            !class_name.is_empty(),
+            "JvmScalarUdfExpr requires a non-empty class name"
+        );
         Self {
             class_name,
             args,
@@ -120,10 +124,10 @@ impl PhysicalExpr for JvmScalarUdfExpr {
     }
 
     fn evaluate(&self, batch: &RecordBatch) -> DFResult<ColumnarValue> {
-        // Step 1: evaluate child expressions to get Arrow arrays. Scalar children
-        // (e.g. literal patterns) are sent as length-1 vectors rather than expanded
-        // to batch-row count, so the JVM bridge does not pay an O(rows) copy for
-        // values that never vary across the batch.
+        // Scalar children (e.g. literal patterns) are sent as length-1 vectors rather than
+        // expanded to batch-row count, so the JVM bridge does not pay an O(rows) copy for
+        // values that never vary across the batch. The JVM side gets `numRows` directly via
+        // the bridge so it doesn't need the scalar to carry batch length.
         let arrays: Vec<ArrayRef> = self
             .args
             .iter()
@@ -133,7 +137,6 @@ impl PhysicalExpr for JvmScalarUdfExpr {
             })
             .collect::<DFResult<_>>()?;
 
-        // Step 2: allocate FFI structs on the Rust heap and collect their raw pointers.
         // The JVM writes into the out_array/out_schema slots and reads from the in_ slots.
         let in_ffi_arrays: Vec<Box<FFI_ArrowArray>> = arrays
             .iter()
@@ -157,7 +160,13 @@ impl PhysicalExpr for JvmScalarUdfExpr {
             .map(|b| b.as_ref() as *const FFI_ArrowSchema as i64)
             .collect();
 
-        // Allocate output FFI slots.
+        debug_assert!(!self.class_name.is_empty(), "class_name must not be empty");
+        debug_assert_eq!(
+            in_arr_ptrs.len(),
+            in_sch_ptrs.len(),
+            "input array and schema pointer counts must match"
+        );
+
         let mut out_array = Box::new(FFI_ArrowArray::empty());
         let mut out_schema = Box::new(FFI_ArrowSchema::empty());
         let out_arr_ptr = out_array.as_mut() as *mut FFI_ArrowArray as i64;
@@ -166,7 +175,6 @@ impl PhysicalExpr for JvmScalarUdfExpr {
         let class_name = self.class_name.clone();
         let n_args = arrays.len();
 
-        // Step 3: attach a JNI env for this thread and call the static bridge method.
         JVMClasses::with_env(|env| {
             let bridge = JVMClasses::get().comet_udf_bridge.as_ref().ok_or_else(|| {
                 CometError::from(ExecutionError::GeneralError(
@@ -176,12 +184,10 @@ impl PhysicalExpr for JvmScalarUdfExpr {
                 ))
             })?;
 
-            // Build the JVM String for the class name.
             let jclass_name = env
                 .new_string(&class_name)
                 .map_err(|e| CometError::JNI { source: e })?;
 
-            // Build the long[] arrays for input pointers.
             let in_arr_java = env
                 .new_long_array(n_args)
                 .map_err(|e| CometError::JNI { source: e })?;
@@ -196,9 +202,10 @@ impl PhysicalExpr for JvmScalarUdfExpr {
                 .set_region(env, 0, &in_sch_ptrs)
                 .map_err(|e| CometError::JNI { source: e })?;
 
-            // Pass a null jobject when no TaskContext was propagated so the bridge's null-guard
-            // leaves the worker thread's current TaskContext.get() in place. The borrow must
-            // outlive `call_static_method_unchecked`.
+            // Resolve the TaskContext reference once before building the arg array so the
+            // borrow lives until `call_static_method_unchecked` returns. When no TaskContext
+            // was propagated, pass a null object so the bridge's null-guard leaves the thread-
+            // local alone.
             let null_task_context = JObject::null();
             let task_context_ref: &JObject = match &self.task_context {
                 Some(gref) => gref.as_obj(),
@@ -229,7 +236,6 @@ impl PhysicalExpr for JvmScalarUdfExpr {
             Ok(())
         })?;
 
-        // Step 4: import the result from the FFI slots filled by the JVM.
         // SAFETY: `*out_array` moves the FFI_ArrowArray out of the Box (the heap
         // allocation is freed by the move), and `from_ffi` wraps it in an Arc that
         // keeps the JVM-installed release callback alive until the resulting

diff --git a/spark/src/main/java/org/apache/comet/codegen/CometBatchKernel.java b/spark/src/main/java/org/apache/comet/codegen/CometBatchKernel.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.codegen;
+
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.ValueVector;
+
+/**
+ * Abstract base extended by the Janino-compiled batch kernel emitted by {@code
+ * CometBatchKernelCodegen}. The generated subclass extends {@code CometInternalRow} (so Spark's
+ * {@code BoundReference.genCode} can call {@code this.getUTF8String(ord)} directly) and carries
+ * typed input fields baked at codegen time, one per input column. Expression evaluation plus Arrow
+ * read/write fuse into one method per expression tree.
+ */
+public abstract class CometBatchKernel extends CometInternalRow {
+
+  protected final Object[] references;
+
+  protected CometBatchKernel(Object[] references) {
+    this.references = references;
+  }
+
+  /**
+   * Run partition-dependent initialization. The generated subclass overrides this to execute
+   * statements collected via {@code CodegenContext.addPartitionInitializationStatement}, e.g.
+   * reseeding {@code Rand}'s {@code XORShiftRandom} from {@code seed + partitionIndex}.
+   * Deterministic expressions leave this as a no-op.
+   *
+   * <p>The caller invokes this before the first {@code process} call of each partition. The
+   * generated subclass is not thread-safe across concurrent {@code process} calls. The dispatcher
+   * allocates one per partition and serializes calls.
+   */
+  public void init(int partitionIndex) {}
+
+  /**
+   * Process one batch.
+   *
+   * @param inputs Arrow input vectors. Length and concrete classes match the schema the kernel was
+   *     compiled against.
+   * @param output Arrow output vector. Caller allocates to the expression's {@code dataType}.
+   * @param numRows number of rows in this batch
+   */
+  public abstract void process(ValueVector[] inputs, FieldVector output, int numRows);
+}
diff --git a/spark/src/main/scala/org/apache/comet/CometConf.scala b/spark/src/main/scala/org/apache/comet/CometConf.scala
@@ -362,6 +362,17 @@ object CometConf extends ShimCometConf {
       .booleanConf
       .createWithDefault(false)
 
+  val COMET_SCALA_UDF_CODEGEN_ENABLED: ConfigEntry[Boolean] =
+    conf("spark.comet.exec.scalaUDF.codegen.enabled")
+      .category(CATEGORY_EXEC)
+      .doc("Experimental. Whether to route Spark `ScalaUDF` expressions through Comet's " +
+        "Arrow-direct codegen dispatcher. When enabled, a supported ScalaUDF is compiled into " +
+        "a per-batch kernel that reads and writes Arrow vectors directly from native " +
+        "execution. When disabled, plans containing a ScalaUDF fall back to Spark for the " +
+        "enclosing operator.")
+      .booleanConf
+      .createWithDefault(false)
+
   val COMET_EXEC_SHUFFLE_WITH_HASH_PARTITIONING_ENABLED: ConfigEntry[Boolean] =
     conf("spark.comet.native.shuffle.partitioning.hash.enabled")
       .category(CATEGORY_SHUFFLE)