From 9af4184357a436b0a9ab00900d6267dd1d6de1d8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 2 Jan 2026 12:20:51 -0700 Subject: [PATCH 1/8] skip some CI workflows for benchmark changes --- .github/workflows/pr_benchmark_check.yml | 85 ++++++++++++++++++++++++ .github/workflows/pr_build_linux.yml | 6 ++ .github/workflows/pr_build_macos.yml | 6 ++ .github/workflows/spark_sql_test.yml | 6 ++ 4 files changed, 103 insertions(+) create mode 100644 .github/workflows/pr_benchmark_check.yml diff --git a/.github/workflows/pr_benchmark_check.yml b/.github/workflows/pr_benchmark_check.yml new file mode 100644 index 0000000000..b7475b9076 --- /dev/null +++ b/.github/workflows/pr_benchmark_check.yml @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Lightweight CI for benchmark-only changes - verifies compilation and linting +# without running full test suites + +name: PR Benchmark Check + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +on: + push: + paths: + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" + pull_request: + paths: + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" + workflow_dispatch: + +env: + RUST_VERSION: stable + +jobs: + benchmark-check: + name: Benchmark Compile & Lint Check + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v6 + + - name: Setup Rust & Java toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: ${{ env.RUST_VERSION }} + jdk-version: 17 + + - name: Check Cargo fmt + run: | + cd native + cargo fmt --all -- --check --color=never + + - name: Check Cargo clippy + run: | + cd native + cargo clippy --color=never --all-targets --workspace -- -D warnings + + - name: Check benchmark compilation + run: | + cd native + cargo check --benches + + - name: Cache Maven dependencies + uses: actions/cache@v4 + with: + path: | + ~/.m2/repository + /root/.m2/repository + key: ${{ runner.os }}-benchmark-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-benchmark-maven- + + - name: Check Scala compilation and linting + run: | + ./mvnw -B compile test-compile scalafix:scalafix -Dscalafix.mode=CHECK -Psemanticdb -DskipTests diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml index e3b0e40566..beb5f9dcf7 100644 --- a/.github/workflows/pr_build_linux.yml +++ b/.github/workflows/pr_build_linux.yml @@ -27,11 +27,17 @@ on: - "doc/**" - "docs/**" - "**.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" 
pull_request: paths-ignore: - "doc/**" - "docs/**" - "**.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" # manual trigger # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow workflow_dispatch: diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml index 0ad40c1932..9a45fe022d 100644 --- a/.github/workflows/pr_build_macos.yml +++ b/.github/workflows/pr_build_macos.yml @@ -27,11 +27,17 @@ on: - "doc/**" - "docs/**" - "**.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" pull_request: paths-ignore: - "doc/**" - "docs/**" - "**.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" # manual trigger # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow workflow_dispatch: diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml index d143ef83a0..1ff6fa952c 100644 --- a/.github/workflows/spark_sql_test.yml +++ b/.github/workflows/spark_sql_test.yml @@ -27,11 +27,17 @@ on: - "doc/**" - "docs/**" - "**.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" pull_request: paths-ignore: - "doc/**" - "docs/**" - "**.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" # manual trigger # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow workflow_dispatch: From 93ab38fbaa9ff787f9aa95a0e31f0c5df128e406 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 2 Jan 2026 12:26:58 -0700 Subject: [PATCH 2/8] implement more microbenchmarks for casts --- .../benchmark/CometCastBooleanBenchmark.scala | 122 +++++++++++++++ .../CometCastNumericToNumericBenchmark.scala | 142 ++++++++++++++++++ .../CometCastNumericToStringBenchmark.scala | 98 ++++++++++++ .../CometCastNumericToTemporalBenchmark.scala | 102 +++++++++++++ .../CometCastTemporalToNumericBenchmark.scala | 102 +++++++++++++ .../CometCastTemporalToStringBenchmark.scala | 96 ++++++++++++ 6 files changed, 662 insertions(+) create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala new file mode 100644 index 0000000000..085e7388c7 --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +case class CastBooleanConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +// spotless:off +/** + * Benchmark to measure performance of Comet cast operations involving Boolean type. To run this + * benchmark: + * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBooleanBenchmark` + * Results will be written to "spark/benchmarks/CometCastBooleanBenchmark-**results.txt". + */ +// spotless:on +object CometCastBooleanBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + + // Boolean to String + private val boolToStringConfigs = for { + castFunc <- castFunctions + } yield CastBooleanConfig( + s"$castFunc Boolean to String", + s"SELECT $castFunc(c_bool AS STRING) FROM parquetV1Table") + + // Boolean to numeric types + private val boolToNumericTypes = + Seq("BYTE", "SHORT", "INT", "LONG", "FLOAT", "DOUBLE", "DECIMAL(10,2)") + private val boolToNumericConfigs = for { + castFunc <- castFunctions + targetType <- boolToNumericTypes + } yield CastBooleanConfig( + s"$castFunc Boolean to $targetType", + s"SELECT $castFunc(c_bool AS $targetType) FROM parquetV1Table") + + // Numeric to Boolean + private val numericTypes = Seq( + ("BYTE", "c_byte"), + ("SHORT", "c_short"), + ("INT", "c_int"), + ("LONG", "c_long"), + ("FLOAT", "c_float"), + ("DOUBLE", "c_double"), + ("DECIMAL(10,2)", "c_decimal")) + + private val numericToBoolConfigs = for { + castFunc <- castFunctions + (sourceType, colName) <- numericTypes + } yield CastBooleanConfig( + s"$castFunc $sourceType to Boolean", + s"SELECT $castFunc($colName AS BOOLEAN) FROM parquetV1Table") + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + + // Generate boolean data for boolean-to-other casts + runBenchmarkWithTable("Boolean to other types casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE (value % 2 = 0) + END AS c_bool + FROM $tbl + """)) + + (boolToStringConfigs ++ boolToNumericConfigs).foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + + // Generate numeric data for numeric-to-boolean casts + runBenchmarkWithTable("Numeric to Boolean casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT + CASE WHEN value % 100 = 0 THEN NULL ELSE CAST((value % 3) - 1 AS BYTE) END AS c_byte, + CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 3) - 1 AS SHORT) END AS c_short, + CASE WHEN value % 100 = 2 THEN NULL ELSE CAST((value % 3) - 1 AS INT) END 
AS c_int, + CASE WHEN value % 100 = 3 THEN NULL ELSE CAST((value % 3) - 1 AS LONG) END AS c_long, + CASE WHEN value % 100 = 4 THEN NULL ELSE CAST((value % 3) - 1 AS FLOAT) END AS c_float, + CASE WHEN value % 100 = 5 THEN NULL ELSE CAST((value % 3) - 1 AS DOUBLE) END AS c_double, + CASE WHEN value % 100 = 6 THEN NULL ELSE CAST((value % 3) - 1 AS DECIMAL(10,2)) END AS c_decimal + FROM $tbl + """)) + + numericToBoolConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala new file mode 100644 index 0000000000..5137e50e18 --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +case class CastNumericToNumericConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +// spotless:off +/** + * Benchmark to measure performance of Comet cast between numeric types. To run this + * benchmark: + * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToNumericBenchmark` + * Results will be written to "spark/benchmarks/CometCastNumericToNumericBenchmark-**results.txt". 
+ */ +// spotless:on +object CometCastNumericToNumericBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + + // Integer widening conversions + private val integerWideningPairs = Seq( + ("BYTE", "c_byte", "SHORT"), + ("BYTE", "c_byte", "INT"), + ("BYTE", "c_byte", "LONG"), + ("SHORT", "c_short", "INT"), + ("SHORT", "c_short", "LONG"), + ("INT", "c_int", "LONG")) + + // Integer narrowing conversions + private val integerNarrowingPairs = Seq( + ("LONG", "c_long", "INT"), + ("LONG", "c_long", "SHORT"), + ("LONG", "c_long", "BYTE"), + ("INT", "c_int", "SHORT"), + ("INT", "c_int", "BYTE"), + ("SHORT", "c_short", "BYTE")) + + // Floating point conversions + private val floatPairs = Seq(("FLOAT", "c_float", "DOUBLE"), ("DOUBLE", "c_double", "FLOAT")) + + // Integer to floating point conversions + private val intToFloatPairs = Seq( + ("BYTE", "c_byte", "FLOAT"), + ("SHORT", "c_short", "FLOAT"), + ("INT", "c_int", "FLOAT"), + ("LONG", "c_long", "FLOAT"), + ("INT", "c_int", "DOUBLE"), + ("LONG", "c_long", "DOUBLE")) + + // Floating point to integer conversions + private val floatToIntPairs = Seq( + ("FLOAT", "c_float", "INT"), + ("FLOAT", "c_float", "LONG"), + ("DOUBLE", "c_double", "INT"), + ("DOUBLE", "c_double", "LONG")) + + // Decimal conversions + private val decimalPairs = Seq( + ("INT", "c_int", "DECIMAL(10,2)"), + ("LONG", "c_long", "DECIMAL(20,4)"), + ("DOUBLE", "c_double", "DECIMAL(15,5)"), + ("DECIMAL(10,2)", "c_decimal", "INT"), + ("DECIMAL(10,2)", "c_decimal", "LONG"), + ("DECIMAL(10,2)", "c_decimal", "DOUBLE")) + + private def generateConfigs( + pairs: Seq[(String, String, String)]): Seq[CastNumericToNumericConfig] = { + for { + castFunc <- castFunctions + (sourceType, colName, targetType) <- pairs + } yield CastNumericToNumericConfig( + s"$castFunc $sourceType to $targetType", + s"SELECT $castFunc($colName AS $targetType) FROM parquetV1Table") + } + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + + // Generate input data once with all numeric types + runBenchmarkWithTable("Numeric to Numeric casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + // Generate varied numeric data including edge cases + prepareTable( + dir, + spark.sql(s""" + SELECT + CASE WHEN value % 100 = 0 THEN NULL ELSE CAST((value % 128) - 64 AS BYTE) END AS c_byte, + CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 32768) - 16384 AS SHORT) END AS c_short, + CASE WHEN value % 100 = 2 THEN NULL ELSE CAST(value - 2500000 AS INT) END AS c_int, + CASE WHEN value % 100 = 3 THEN NULL ELSE CAST(value * 1000 AS LONG) END AS c_long, + CASE + WHEN value % 100 = 4 THEN NULL + WHEN value % 100 = 5 THEN CAST('NaN' AS FLOAT) + WHEN value % 100 = 6 THEN CAST('Infinity' AS FLOAT) + WHEN value % 100 = 7 THEN CAST('-Infinity' AS FLOAT) + ELSE CAST((value - 2500000) / 100.0 AS FLOAT) + END AS c_float, + CASE + WHEN value % 100 = 8 THEN NULL + WHEN value % 100 = 9 THEN CAST('NaN' AS DOUBLE) + WHEN value % 100 = 10 THEN CAST('Infinity' AS DOUBLE) + WHEN value % 100 = 11 THEN CAST('-Infinity' AS DOUBLE) + ELSE CAST((value - 2500000) / 100.0 AS DOUBLE) + END AS c_double, + CASE WHEN value % 100 = 12 THEN NULL ELSE CAST((value - 2500000) / 100.0 AS DECIMAL(10,2)) END AS c_decimal + FROM $tbl + """)) + + // Run all benchmark categories + (generateConfigs(integerWideningPairs) ++ + generateConfigs(integerNarrowingPairs) ++ + generateConfigs(floatPairs) ++ + 
generateConfigs(intToFloatPairs) ++ + generateConfigs(floatToIntPairs) ++ + generateConfigs(decimalPairs)).foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala new file mode 100644 index 0000000000..1459ab941f --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +case class CastNumericToStringConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +// spotless:off +/** + * Benchmark to measure performance of Comet cast from numeric types to String. To run this + * benchmark: + * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToStringBenchmark` + * Results will be written to "spark/benchmarks/CometCastNumericToStringBenchmark-**results.txt". 
+ */ +// spotless:on +object CometCastNumericToStringBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + private val sourceTypes = + Seq( + ("BOOLEAN", "c_bool"), + ("BYTE", "c_byte"), + ("SHORT", "c_short"), + ("INT", "c_int"), + ("LONG", "c_long"), + ("FLOAT", "c_float"), + ("DOUBLE", "c_double"), + ("DECIMAL(10,2)", "c_decimal")) + + private val castConfigs = for { + castFunc <- castFunctions + (sourceType, colName) <- sourceTypes + } yield CastNumericToStringConfig( + s"$castFunc $sourceType to String", + s"SELECT $castFunc($colName AS STRING) FROM parquetV1Table") + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + + // Generate input data once with all numeric types + runBenchmarkWithTable("Numeric to String casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + // Generate varied numeric data including edge cases + prepareTable( + dir, + spark.sql(s""" + SELECT + CASE WHEN value % 100 = 0 THEN NULL ELSE (value % 2 = 0) END AS c_bool, + CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 128) - 64 AS BYTE) END AS c_byte, + CASE WHEN value % 100 = 2 THEN NULL ELSE CAST((value % 32768) - 16384 AS SHORT) END AS c_short, + CASE WHEN value % 100 = 3 THEN NULL ELSE CAST(value - 2500000 AS INT) END AS c_int, + CASE WHEN value % 100 = 4 THEN NULL ELSE CAST(value * 1000000 AS LONG) END AS c_long, + CASE + WHEN value % 100 = 5 THEN NULL + WHEN value % 100 = 6 THEN CAST('NaN' AS FLOAT) + WHEN value % 100 = 7 THEN CAST('Infinity' AS FLOAT) + WHEN value % 100 = 8 THEN CAST('-Infinity' AS FLOAT) + ELSE CAST((value - 2500000) / 1000.0 AS FLOAT) + END AS c_float, + CASE + WHEN value % 100 = 9 THEN NULL + WHEN value % 100 = 10 THEN CAST('NaN' AS DOUBLE) + WHEN value % 100 = 11 THEN CAST('Infinity' AS DOUBLE) + WHEN value % 100 = 12 THEN CAST('-Infinity' AS DOUBLE) + ELSE CAST((value - 2500000) / 100.0 AS DOUBLE) + END AS c_double, + CASE WHEN value % 100 = 13 THEN NULL ELSE CAST((value - 2500000) / 100.0 AS DECIMAL(10,2)) END AS c_decimal + FROM $tbl + """)) + + castConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala new file mode 100644 index 0000000000..a8e81a3dff --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.benchmark + +case class CastNumericToTemporalConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +// spotless:off +/** + * Benchmark to measure performance of Comet cast from numeric types to temporal types. To run + * this benchmark: + * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToTemporalBenchmark` + * Results will be written to "spark/benchmarks/CometCastNumericToTemporalBenchmark-**results.txt". + */ +// spotless:on +object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + + // INT to DATE (days since epoch) + private val intToDateConfigs = for { + castFunc <- castFunctions + } yield CastNumericToTemporalConfig( + s"$castFunc Int to Date", + s"SELECT $castFunc(c_int AS DATE) FROM parquetV1Table") + + // LONG to TIMESTAMP (microseconds since epoch) + private val longToTimestampConfigs = for { + castFunc <- castFunctions + } yield CastNumericToTemporalConfig( + s"$castFunc Long to Timestamp", + s"SELECT $castFunc(c_long AS TIMESTAMP) FROM parquetV1Table") + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + + // Generate data once for INT to DATE conversions + runBenchmarkWithTable("Int to Date casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + // Generate INT values representing days since epoch (1970-01-01) + // Range: ~-18000 to +18000 days (roughly 1920 to 2020) + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE CAST((value % 36500) - 18000 AS INT) + END AS c_int + FROM $tbl + """)) + + intToDateConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + + // Generate data once for LONG to TIMESTAMP conversions + runBenchmarkWithTable("Long to Timestamp casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + // Generate LONG values representing microseconds since epoch + // Range: 2020-2021 timestamps + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE 1577836800000000 + (value % 31536000000000) + END AS c_long + FROM $tbl + """)) + + longToTimestampConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala new file mode 100644 index 0000000000..08850b6a12 --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +case class CastTemporalToNumericConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +// spotless:off +/** + * Benchmark to measure performance of Comet cast from temporal types to numeric types. To run + * this benchmark: + * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToNumericBenchmark` + * Results will be written to "spark/benchmarks/CometCastTemporalToNumericBenchmark-**results.txt". + */ +// spotless:on +object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + + // DATE to numeric types + private val dateToNumericTypes = Seq("BYTE", "SHORT", "INT", "LONG") + private val dateToNumericConfigs = for { + castFunc <- castFunctions + targetType <- dateToNumericTypes + } yield CastTemporalToNumericConfig( + s"$castFunc Date to $targetType", + s"SELECT $castFunc(c_date AS $targetType) FROM parquetV1Table") + + // TIMESTAMP to numeric types + private val timestampToNumericTypes = Seq("BYTE", "SHORT", "INT", "LONG") + private val timestampToNumericConfigs = for { + castFunc <- castFunctions + targetType <- timestampToNumericTypes + } yield CastTemporalToNumericConfig( + s"$castFunc Timestamp to $targetType", + s"SELECT $castFunc(c_timestamp AS $targetType) FROM parquetV1Table") + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + + // Generate DATE data once for all date-to-numeric benchmarks + runBenchmarkWithTable("Date to Numeric casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE DATE_ADD('2020-01-01', CAST(value % 3650 AS INT)) + END AS c_date + FROM $tbl + """)) + + dateToNumericConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + + // Generate TIMESTAMP data once for all timestamp-to-numeric benchmarks + runBenchmarkWithTable("Timestamp to Numeric casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE TIMESTAMP_MICROS(1577836800000000 + value % 31536000000000) + END AS c_timestamp + FROM $tbl + """)) + + timestampToNumericConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala new file mode 100644 index 0000000000..5e2316a142 --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +case class CastTemporalToStringConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +// spotless:off +/** + * Benchmark to measure performance of Comet cast from temporal types to String. To run this + * benchmark: + * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToStringBenchmark` + * Results will be written to "spark/benchmarks/CometCastTemporalToStringBenchmark-**results.txt". + */ +// spotless:on +object CometCastTemporalToStringBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + + private val dateCastConfigs = for { + castFunc <- castFunctions + } yield CastTemporalToStringConfig( + s"$castFunc Date to String", + s"SELECT $castFunc(c_date AS STRING) FROM parquetV1Table") + + private val timestampCastConfigs = for { + castFunc <- castFunctions + } yield CastTemporalToStringConfig( + s"$castFunc Timestamp to String", + s"SELECT $castFunc(c_timestamp AS STRING) FROM parquetV1Table") + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + + // Generate temporal data once for date benchmarks + runBenchmarkWithTable("Date to String casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE DATE_ADD('2020-01-01', CAST(value % 3650 AS INT)) + END AS c_date + FROM $tbl + """)) + + dateCastConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + + // Generate temporal data once for timestamp benchmarks + runBenchmarkWithTable("Timestamp to String casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE TIMESTAMP_MICROS(1577836800000000 + value % 31536000000000) + END AS c_timestamp + FROM $tbl + """)) + + timestampCastConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} From a3d45af0c75b76a55b3ca0f2046720ff14c121d9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 2 Jan 2026 12:30:33 -0700 Subject: [PATCH 3/8] fix docs --- .../spark/sql/benchmark/CometCastBooleanBenchmark.scala | 6 +++--- .../benchmark/CometCastNumericToNumericBenchmark.scala | 9 ++++----- .../benchmark/CometCastNumericToStringBenchmark.scala | 6 +++--- .../benchmark/CometCastNumericToTemporalBenchmark.scala | 9 +++++---- .../benchmark/CometCastStringToTemporalBenchmark.scala | 6 +++--- .../benchmark/CometCastTemporalToNumericBenchmark.scala | 9 +++++---- 
.../benchmark/CometCastTemporalToStringBenchmark.scala | 6 +++--- 7 files changed, 26 insertions(+), 25 deletions(-) diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala index 085e7388c7..c3043e8c90 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala @@ -24,14 +24,14 @@ case class CastBooleanConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** * Benchmark to measure performance of Comet cast operations involving Boolean type. To run this * benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBooleanBenchmark` + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBooleanBenchmark + * }}} * Results will be written to "spark/benchmarks/CometCastBooleanBenchmark-**results.txt". */ -// spotless:on object CometCastBooleanBenchmark extends CometBenchmarkBase { private val castFunctions = Seq("CAST", "TRY_CAST") diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala index 5137e50e18..a4264a5b88 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala @@ -24,14 +24,13 @@ case class CastNumericToNumericConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** - * Benchmark to measure performance of Comet cast between numeric types. To run this - * benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToNumericBenchmark` + * Benchmark to measure performance of Comet cast between numeric types. To run this benchmark: + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToNumericBenchmark + * }}} * Results will be written to "spark/benchmarks/CometCastNumericToNumericBenchmark-**results.txt". */ -// spotless:on object CometCastNumericToNumericBenchmark extends CometBenchmarkBase { private val castFunctions = Seq("CAST", "TRY_CAST") diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala index 1459ab941f..202f336b86 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala @@ -24,14 +24,14 @@ case class CastNumericToStringConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** * Benchmark to measure performance of Comet cast from numeric types to String. To run this * benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToStringBenchmark` + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToStringBenchmark + * }}} * Results will be written to "spark/benchmarks/CometCastNumericToStringBenchmark-**results.txt". 
*/ -// spotless:on object CometCastNumericToStringBenchmark extends CometBenchmarkBase { private val castFunctions = Seq("CAST", "TRY_CAST") diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala index a8e81a3dff..bc1cacd803 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala @@ -24,14 +24,15 @@ case class CastNumericToTemporalConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** * Benchmark to measure performance of Comet cast from numeric types to temporal types. To run * this benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToTemporalBenchmark` - * Results will be written to "spark/benchmarks/CometCastNumericToTemporalBenchmark-**results.txt". + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToTemporalBenchmark + * }}} + * Results will be written to + * "spark/benchmarks/CometCastNumericToTemporalBenchmark-**results.txt". */ -// spotless:on object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase { private val castFunctions = Seq("CAST", "TRY_CAST") diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala index 39337be5c8..6cf4cd4cc4 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala @@ -24,14 +24,14 @@ case class CastStringToTemporalConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** * Benchmark to measure performance of Comet cast from String to temporal types. To run this * benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark` + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark + * }}} * Results will be written to "spark/benchmarks/CometCastStringToTemporalBenchmark-**results.txt". */ -// spotless:on object CometCastStringToTemporalBenchmark extends CometBenchmarkBase { // Configuration for String to temporal cast benchmarks diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala index 08850b6a12..52402fc0db 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala @@ -24,14 +24,15 @@ case class CastTemporalToNumericConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** * Benchmark to measure performance of Comet cast from temporal types to numeric types. To run * this benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToNumericBenchmark` - * Results will be written to "spark/benchmarks/CometCastTemporalToNumericBenchmark-**results.txt". 
+ * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToNumericBenchmark + * }}} + * Results will be written to + * "spark/benchmarks/CometCastTemporalToNumericBenchmark-**results.txt". */ -// spotless:on object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase { private val castFunctions = Seq("CAST", "TRY_CAST") diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala index 5e2316a142..04db279659 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala @@ -24,14 +24,14 @@ case class CastTemporalToStringConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** * Benchmark to measure performance of Comet cast from temporal types to String. To run this * benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToStringBenchmark` + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToStringBenchmark + * }}} * Results will be written to "spark/benchmarks/CometCastTemporalToStringBenchmark-**results.txt". */ -// spotless:on object CometCastTemporalToStringBenchmark extends CometBenchmarkBase { private val castFunctions = Seq("CAST", "TRY_CAST") From 691d7b57eada9c425066077be1c1ed697c25ec21 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 2 Jan 2026 12:41:37 -0700 Subject: [PATCH 4/8] Add temporal casts --- .../benchmark/CometCastBooleanBenchmark.scala | 2 +- .../CometCastNumericToNumericBenchmark.scala | 2 +- .../CometCastNumericToStringBenchmark.scala | 2 +- .../CometCastNumericToTemporalBenchmark.scala | 2 +- .../CometCastTemporalToNumericBenchmark.scala | 2 +- .../CometCastTemporalToStringBenchmark.scala | 2 +- ...CometCastTemporalToTemporalBenchmark.scala | 98 +++++++++++++++++++ 7 files changed, 104 insertions(+), 6 deletions(-) create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala index c3043e8c90..d8e0419867 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala @@ -71,7 +71,7 @@ object CometCastBooleanBenchmark extends CometBenchmarkBase { s"SELECT $castFunc($colName AS BOOLEAN) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + val values = 1024 * 1024 * 5 // 5M rows // Generate boolean data for boolean-to-other casts runBenchmarkWithTable("Boolean to other types casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala index a4264a5b88..fba9fe0135 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala +++ 
b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala @@ -92,7 +92,7 @@ object CometCastNumericToNumericBenchmark extends CometBenchmarkBase { } override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + val values = 1024 * 1024 * 5 // 5M rows // Generate input data once with all numeric types runBenchmarkWithTable("Numeric to Numeric casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala index 202f336b86..c7574f6e8d 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala @@ -54,7 +54,7 @@ object CometCastNumericToStringBenchmark extends CometBenchmarkBase { s"SELECT $castFunc($colName AS STRING) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + val values = 1024 * 1024 * 5 // 5M rows // Generate input data once with all numeric types runBenchmarkWithTable("Numeric to String casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala index bc1cacd803..da0de5b429 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala @@ -52,7 +52,7 @@ object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_long AS TIMESTAMP) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + val values = 1024 * 1024 * 5 // 5M rows // Generate data once for INT to DATE conversions runBenchmarkWithTable("Int to Date casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala index 52402fc0db..022dfee7f5 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala @@ -56,7 +56,7 @@ object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_timestamp AS $targetType) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + val values = 1024 * 1024 * 5 // 5M rows // Generate DATE data once for all date-to-numeric benchmarks runBenchmarkWithTable("Date to Numeric casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala index 04db279659..2430024487 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala +++ 
b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala @@ -49,7 +49,7 @@ object CometCastTemporalToStringBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_timestamp AS STRING) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + val values = 1024 * 1024 * 5 // 5M rows // Generate temporal data once for date benchmarks runBenchmarkWithTable("Date to String casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala new file mode 100644 index 0000000000..2dfc8e8012 --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +case class CastTemporalToTemporalConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +/** + * Benchmark to measure performance of Comet cast between temporal types. To run this benchmark: + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToTemporalBenchmark + * }}} + * Results will be written to + * "spark/benchmarks/CometCastTemporalToTemporalBenchmark-**results.txt". 
+ */ +object CometCastTemporalToTemporalBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + + // Date to Timestamp + private val dateToTimestampConfigs = for { + castFunc <- castFunctions + } yield CastTemporalToTemporalConfig( + s"$castFunc Date to Timestamp", + s"SELECT $castFunc(c_date AS TIMESTAMP) FROM parquetV1Table") + + // Timestamp to Date + private val timestampToDateConfigs = for { + castFunc <- castFunctions + } yield CastTemporalToTemporalConfig( + s"$castFunc Timestamp to Date", + s"SELECT $castFunc(c_timestamp AS DATE) FROM parquetV1Table") + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = 1024 * 1024 * 5 // 5M rows + + // Generate DATE data for Date -> Timestamp benchmarks + runBenchmarkWithTable("Date to Timestamp casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE DATE_ADD('2020-01-01', CAST(value % 3650 AS INT)) + END AS c_date + FROM $tbl + """)) + + dateToTimestampConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + + // Generate TIMESTAMP data for Timestamp -> Date benchmarks + runBenchmarkWithTable("Timestamp to Date casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE TIMESTAMP_MICROS(1577836800000000 + value % 31536000000000) + END AS c_timestamp + FROM $tbl + """)) + + timestampToDateConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} From 199cc4385f21a0bd1bf189697a5354d66eeed25b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 2 Jan 2026 12:47:33 -0700 Subject: [PATCH 5/8] use consistent row count --- .../org/apache/spark/sql/benchmark/CometCastBenchmark.scala | 2 +- .../apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala | 2 +- .../sql/benchmark/CometCastNumericToNumericBenchmark.scala | 2 +- .../spark/sql/benchmark/CometCastNumericToStringBenchmark.scala | 2 +- .../sql/benchmark/CometCastNumericToTemporalBenchmark.scala | 2 +- .../sql/benchmark/CometCastStringToTemporalBenchmark.scala | 2 +- .../sql/benchmark/CometCastTemporalToNumericBenchmark.scala | 2 +- .../sql/benchmark/CometCastTemporalToStringBenchmark.scala | 2 +- .../sql/benchmark/CometCastTemporalToTemporalBenchmark.scala | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala index 975abd632f..08ab2c344b 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala @@ -61,7 +61,7 @@ object CometCastBenchmark extends CometBenchmarkBase { case Compatible(notes) => runBenchmarkWithTable( s"Running benchmark cast operation from : $LongType to : $toDataType", - 1024 * 1024 * 10) { v => + 1024 * 1024) { v => // 1M rows castBenchmark(v, LongType, toDataType, isAnsiMode = ansiMode) } case Incompatible(notes) => None diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala index d8e0419867..6a81cce20e 100644 --- 
a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala @@ -71,7 +71,7 @@ object CometCastBooleanBenchmark extends CometBenchmarkBase { s"SELECT $castFunc($colName AS BOOLEAN) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate boolean data for boolean-to-other casts runBenchmarkWithTable("Boolean to other types casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala index fba9fe0135..e7adf370cb 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala @@ -92,7 +92,7 @@ object CometCastNumericToNumericBenchmark extends CometBenchmarkBase { } override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate input data once with all numeric types runBenchmarkWithTable("Numeric to Numeric casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala index c7574f6e8d..00fba11392 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala @@ -54,7 +54,7 @@ object CometCastNumericToStringBenchmark extends CometBenchmarkBase { s"SELECT $castFunc($colName AS STRING) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate input data once with all numeric types runBenchmarkWithTable("Numeric to String casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala index da0de5b429..0545ab147c 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala @@ -52,7 +52,7 @@ object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_long AS TIMESTAMP) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate data once for INT to DATE conversions runBenchmarkWithTable("Int to Date casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala index 6cf4cd4cc4..79856f0f47 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala @@ -52,7 +52,7 @@ object CometCastStringToTemporalBenchmark 
extends CometBenchmarkBase { "SELECT TRY_CAST(c1 AS TIMESTAMP) FROM parquetV1Table")) override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 10 // 10M rows + val values = 1024 * 1024 // 1M rows // Generate date data once with ~10% invalid values runBenchmarkWithTable("date data generation", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala index 022dfee7f5..d858c76a26 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala @@ -56,7 +56,7 @@ object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_timestamp AS $targetType) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate DATE data once for all date-to-numeric benchmarks runBenchmarkWithTable("Date to Numeric casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala index 2430024487..5d21bbffef 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala @@ -49,7 +49,7 @@ object CometCastTemporalToStringBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_timestamp AS STRING) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate temporal data once for date benchmarks runBenchmarkWithTable("Date to String casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala index 2dfc8e8012..df675f6cb8 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala @@ -51,7 +51,7 @@ object CometCastTemporalToTemporalBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_timestamp AS DATE) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate DATE data for Date -> Timestamp benchmarks runBenchmarkWithTable("Date to Timestamp casts", values) { v => From f28b98444999633a8fb6ce8357350df473e279ad Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 2 Jan 2026 12:49:14 -0700 Subject: [PATCH 6/8] remove legacy benchmark --- .../sql/benchmark/CometCastBenchmark.scala | 96 ------------------- 1 file changed, 96 deletions(-) delete mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala deleted file mode 100644 index 08ab2c344b..0000000000 --- 
From f28b98444999633a8fb6ce8357350df473e279ad Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 2 Jan 2026 12:49:14 -0700
Subject: [PATCH 6/8] remove legacy benchmark

---
 .../sql/benchmark/CometCastBenchmark.scala | 96 ------------------
 1 file changed, 96 deletions(-)
 delete mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala

diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala
deleted file mode 100644
index 08ab2c344b..0000000000
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.benchmark
-
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{DataType, LongType}
-
-import org.apache.comet.expressions.{CometCast, CometEvalMode}
-import org.apache.comet.serde.{Compatible, Incompatible, Unsupported}
-
-/**
- * Benchmark to measure Comet execution performance. To run this benchmark:
- * {{{
- * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBenchmark
- * }}}
- *
- * Results will be written to "spark/benchmarks/CometCastBenchmark-**results.txt".
- */
-
-object CometCastBenchmark extends CometBenchmarkBase {
-
-  override def getSparkSession: SparkSession = {
-    val session = super.getSparkSession
-    session.conf.set("parquet.enable.dictionary", "false")
-    session.conf.set("spark.sql.shuffle.partitions", "2")
-    session
-  }
-
-  def castExprSQL(toDataType: DataType, input: String): String = {
-    s"CAST ($input AS ${toDataType.sql})"
-  }
-
-  override def runCometBenchmark(args: Array[String]): Unit = {
-
-    // TODO : Create all possible input datatypes. We only have Long inputs for now
-    CometCast.supportedTypes.foreach { toDataType =>
-      Seq(false, true).foreach { ansiMode =>
-        CometCast.isSupported(
-          LongType,
-          toDataType,
-          None,
-          if (ansiMode) CometEvalMode.ANSI else CometEvalMode.LEGACY) match {
-          case Compatible(notes) =>
-            runBenchmarkWithTable(
-              s"Running benchmark cast operation from : $LongType to : $toDataType",
-              1024 * 1024) { v => // 1M rows
-              castBenchmark(v, LongType, toDataType, isAnsiMode = ansiMode)
-            }
-          case Incompatible(notes) => None
-          case Unsupported(notes) => None
-        }
-      }
-    }
-  }
-
-  def castBenchmark(
-      values: Int,
-      fromDataType: DataType,
-      toDataType: DataType,
-      isAnsiMode: Boolean): Unit = {
-
-    withTempPath { dir =>
-      withTempTable("parquetV1Table") {
-        prepareTable(dir, spark.sql(s"SELECT value FROM $tbl"))
-
-        val functionSQL = castExprSQL(toDataType, "value")
-        val query = s"SELECT $functionSQL FROM parquetV1Table"
-        val name =
-          s"Cast function to : ${toDataType} , ansi mode enabled : ${isAnsiMode}"
-
-        val extraConfigs = Map(SQLConf.ANSI_ENABLED.key -> isAnsiMode.toString)
-
-        runExpressionBenchmark(name, values, query, extraConfigs)
-      }
-    }
-  }
-
-}
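Unlike the new per-category suites, the removed benchmark derived its coverage from Comet's cast compatibility matrix instead of hard-coding cast pairs. For reference, its dispatch reduces to the following sketch (the API names and case-class arities are copied from the deleted code above; the filter form and the SupportedCastsFromLong wrapper are an illustrative rewrite, not code from the repository):

    package org.apache.spark.sql.benchmark

    import org.apache.spark.sql.types.LongType

    import org.apache.comet.expressions.{CometCast, CometEvalMode}
    import org.apache.comet.serde.{Compatible, Incompatible, Unsupported}

    object SupportedCastsFromLong {
      // All target types Comet reports as compatible when casting a LONG
      // column in LEGACY eval mode; mirrors the match in the deleted file.
      val targets = CometCast.supportedTypes.filter { toDataType =>
        CometCast.isSupported(LongType, toDataType, None, CometEvalMode.LEGACY) match {
          case Compatible(_) => true
          case Incompatible(_) | Unsupported(_) => false
        }
      }
    }

Driving coverage from the matrix guaranteed every supported cast was exercised, at the cost of only ever benchmarking LONG inputs; the replacement suites trade that generality for realistic per-type input data.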
From 6869f79709dff276c22154a4ba0201700db5ba19 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 2 Jan 2026 15:14:21 -0700
Subject: [PATCH 7/8] skip failing suite

---
 .github/workflows/spark_sql_test.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml
index 1ff6fa952c..2fe5fefe1a 100644
--- a/.github/workflows/spark_sql_test.yml
+++ b/.github/workflows/spark_sql_test.yml
@@ -65,6 +65,10 @@
           - {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
           - {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
           - {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
+        # Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
+        exclude:
+          - spark-version: {short: '4.0', full: '4.0.1', java: 17}
+            module: {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
       fail-fast: false
     name: spark-sql-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.spark-version.java }}
     runs-on: ${{ matrix.os }}
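The final patch below replaces the loose "varied data" comments with precise distribution notes. The generation queries themselves are truncated in this excerpt (each hunk ends at spark.sql(s""")), so purely as an illustration of how such a distribution can be produced — the SQL here is hypothetical, not the repository's actual query — a "1% NULL, 50/50 true/false" boolean column could be built from the base table's value column like this, inside a CometBenchmarkBase suite where dir, spark, and tbl are in scope:

    // Hypothetical sketch only; the real generation SQL is elided in the hunks below.
    prepareTable(
      dir,
      spark.sql(s"""
        SELECT
          CASE
            WHEN rand() < 0.01 THEN CAST(NULL AS BOOLEAN) -- ~1% NULL
            ELSE value % 2 = 0                            -- ~50/50 true/false
          END AS c_bool
        FROM $tbl"""))

Documenting the distribution next to the generation query matters for benchmarks: NULL density and special values such as NaN or Infinity change which code paths a cast kernel takes, so results are only comparable across runs when the input shape is pinned down.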
From fc8b30df5235ebac74565876371e455caad946f6 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Tue, 6 Jan 2026 11:42:26 -0700
Subject: [PATCH 8/8] address feedback

---
 .../spark/sql/benchmark/CometCastBooleanBenchmark.scala   | 2 ++
 .../benchmark/CometCastNumericToNumericBenchmark.scala    | 8 +++++++-
 .../sql/benchmark/CometCastNumericToStringBenchmark.scala | 8 +++++++-
 .../benchmark/CometCastNumericToTemporalBenchmark.scala   | 6 ++----
 .../sql/benchmark/CometCastStringToNumericBenchmark.scala | 7 +++++--
 .../benchmark/CometCastStringToTemporalBenchmark.scala    | 2 ++
 .../benchmark/CometCastTemporalToNumericBenchmark.scala   | 2 ++
 .../benchmark/CometCastTemporalToStringBenchmark.scala    | 2 ++
 .../benchmark/CometCastTemporalToTemporalBenchmark.scala  | 2 ++
 9 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala
index 6a81cce20e..57b8e88a7b 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala
@@ -77,6 +77,7 @@ object CometCastBooleanBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Boolean to other types casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, 50/50 true/false
           prepareTable(
             dir,
             spark.sql(s"""
@@ -98,6 +99,7 @@ object CometCastBooleanBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Numeric to Boolean casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL per column, values in {-1, 0, 1} (~33% each)
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala
index e7adf370cb..a9ea19a0e9 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala
@@ -98,7 +98,13 @@ object CometCastNumericToNumericBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Numeric to Numeric casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
-          // Generate varied numeric data including edge cases
+          // Data distribution: 1% NULL per column
+          // - c_byte: full range -64 to 63
+          // - c_short: full range -16384 to 16383
+          // - c_int: centered around 0 (-2.5M to +2.5M)
+          // - c_long: large positive values (0 to ~5 billion)
+          // - c_float/c_double: 4% special values (NaN/Infinity), rest centered around 0
+          // - c_decimal: values from -25000.00 to +25000.00
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala
index 00fba11392..1fd2138c58 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala
@@ -60,7 +60,13 @@ object CometCastNumericToStringBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Numeric to String casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
-          // Generate varied numeric data including edge cases
+          // Data distribution: 1% NULL per column
+          // - c_bool: 50/50 true/false
+          // - c_byte: full range -64 to 63
+          // - c_short: full range -16384 to 16383
+          // - c_int/c_long: large values centered around 0
+          // - c_float/c_double: 3% special values (NaN/Infinity), rest centered around 0
+          // - c_decimal: values from -25000.00 to +25000.00
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala
index 0545ab147c..ec2d9ab12f 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala
@@ -58,8 +58,7 @@ object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Int to Date casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
-          // Generate INT values representing days since epoch (1970-01-01)
-          // Range: ~-18000 to +18000 days (roughly 1920 to 2020)
+          // Data distribution: 1% NULL, days since epoch spanning ~100 years (1920-2020)
           prepareTable(
             dir,
             spark.sql(s"""
@@ -81,8 +80,7 @@ object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Long to Timestamp casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
-          // Generate LONG values representing microseconds since epoch
-          // Range: 2020-2021 timestamps
+          // Data distribution: 1% NULL, microseconds since epoch spanning ~1 year from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala
index 7f210fc730..c71eadad8c 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala
@@ -68,8 +68,11 @@ object CometCastStringToNumericBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("String to numeric casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
-          // Generate numeric strings with both integer and decimal values
-          // Also include some special values: nulls (~2%), NaN (~2%), Infinity (~2%)
+          // Data distribution:
+          // - 2% NULL, 2% 'NaN', 2% 'Infinity', 2% '-Infinity'
+          // - 12% small integers (0-98)
+          // - 40% medium integers (0-999,998)
+          // - 40% decimals centered around 0 (approx -5000.00 to +5000.00)
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
index 79856f0f47..77cc009ae1 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
@@ -58,6 +58,7 @@ object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("date data generation", values) { v =>
       withTempPath { dateDir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 10% invalid strings, 90% valid date strings spanning ~10 years
           prepareTable(
             dateDir,
             spark.sql(s"""
@@ -80,6 +81,7 @@ object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("timestamp data generation", values) { v =>
       withTempPath { timestampDir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 10% invalid strings, 90% valid timestamp strings (1970 epoch range)
           prepareTable(
             timestampDir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala
index d858c76a26..1468cbe086 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala
@@ -62,6 +62,7 @@ object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Date to Numeric casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""
@@ -83,6 +84,7 @@ object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Timestamp to Numeric casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala
index 5d21bbffef..1ef3e7711d 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala
@@ -55,6 +55,7 @@ object CometCastTemporalToStringBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Date to String casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""
@@ -76,6 +77,7 @@ object CometCastTemporalToStringBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Timestamp to String casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala
index df675f6cb8..f2e2572487 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala
@@ -57,6 +57,7 @@ object CometCastTemporalToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Date to Timestamp casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""
@@ -78,6 +79,7 @@ object CometCastTemporalToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Timestamp to Date casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""