From 48cc37b3ddf1755e772369510cbaebca2014e7f4 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 2 Jan 2026 12:07:44 -0700
Subject: [PATCH 1/5] add microbenchmark for hash expressions

---
 .../CometHashExpressionBenchmark.scala        | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala

diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala
new file mode 100644
index 0000000000..e1304dbe62
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class HashExprConfig(
+    name: String,
+    query: String,
+    extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Comprehensive benchmark for Comet hash expressions. To run this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometHashExpressionBenchmark
+ * }}}
+ * Results will be written to "spark/benchmarks/CometHashExpressionBenchmark-**results.txt".
+ */
+object CometHashExpressionBenchmark extends CometBenchmarkBase {
+
+  private val hashExpressions = List(
+    HashExprConfig("xxhash64_single", "SELECT xxhash64(c_str) FROM parquetV1Table"),
+    HashExprConfig("xxhash64_multi", "SELECT xxhash64(c_str, c_int, c_long) FROM parquetV1Table"),
+    HashExprConfig("murmur3_hash_single", "SELECT hash(c_str) FROM parquetV1Table"),
+    HashExprConfig("murmur3_hash_multi", "SELECT hash(c_str, c_int, c_long) FROM parquetV1Table"),
+    HashExprConfig("sha1", "SELECT sha1(c_str) FROM parquetV1Table"),
+    HashExprConfig("sha2_224", "SELECT sha2(c_str, 224) FROM parquetV1Table"),
+    HashExprConfig("sha2_256", "SELECT sha2(c_str, 256) FROM parquetV1Table"),
+    HashExprConfig("sha2_384", "SELECT sha2(c_str, 384) FROM parquetV1Table"),
+    HashExprConfig("sha2_512", "SELECT sha2(c_str, 512) FROM parquetV1Table"))
+
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    val values = 1024 * 1024
+
+    runBenchmarkWithTable("Hash expression benchmarks", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT
+                CASE WHEN value % 100 = 0 THEN NULL ELSE CONCAT('string_', CAST(value AS STRING)) END AS c_str,
+                CASE WHEN value % 100 = 1 THEN NULL ELSE CAST(value % 1000000 AS INT) END AS c_int,
+                CASE WHEN value % 100 = 2 THEN NULL ELSE CAST(value * 1000 AS LONG) END AS c_long
+              FROM $tbl
+            """))
+
+          hashExpressions.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+  }
+}

From 9af4184357a436b0a9ab00900d6267dd1d6de1d8 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 2 Jan 2026 12:20:51 -0700
Subject: [PATCH 2/5] skip some CI workflows for benchmark changes

---
 .github/workflows/pr_benchmark_check.yml | 85 ++++++++++++++++++++++++
 .github/workflows/pr_build_linux.yml     |  6 ++
 .github/workflows/pr_build_macos.yml     |  6 ++
 .github/workflows/spark_sql_test.yml     |  6 ++
 4 files changed, 103 insertions(+)
 create mode 100644 .github/workflows/pr_benchmark_check.yml

diff --git a/.github/workflows/pr_benchmark_check.yml b/.github/workflows/pr_benchmark_check.yml
new file mode 100644
index 0000000000..b7475b9076
--- /dev/null
+++ b/.github/workflows/pr_benchmark_check.yml
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Lightweight CI for benchmark-only changes - verifies compilation and linting
+# without running full test suites
+
+name: PR Benchmark Check
+
+concurrency:
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
+  cancel-in-progress: true
+
+on:
+  push:
+    paths:
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
+  pull_request:
+    paths:
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
+  workflow_dispatch:
+
+env:
+  RUST_VERSION: stable
+
+jobs:
+  benchmark-check:
+    name: Benchmark Compile & Lint Check
+    runs-on: ubuntu-latest
+    container:
+      image: amd64/rust
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: ${{ env.RUST_VERSION }}
+          jdk-version: 17
+
+      - name: Check Cargo fmt
+        run: |
+          cd native
+          cargo fmt --all -- --check --color=never
+
+      - name: Check Cargo clippy
+        run: |
+          cd native
+          cargo clippy --color=never --all-targets --workspace -- -D warnings
+
+      - name: Check benchmark compilation
+        run: |
+          cd native
+          cargo check --benches
+
+      - name: Cache Maven dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.m2/repository
+            /root/.m2/repository
+          key: ${{ runner.os }}-benchmark-maven-${{ hashFiles('**/pom.xml') }}
+          restore-keys: |
+            ${{ runner.os }}-benchmark-maven-
+
+      - name: Check Scala compilation and linting
+        run: |
+          ./mvnw -B compile test-compile scalafix:scalafix -Dscalafix.mode=CHECK -Psemanticdb -DskipTests
diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
index e3b0e40566..beb5f9dcf7 100644
--- a/.github/workflows/pr_build_linux.yml
+++ b/.github/workflows/pr_build_linux.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
index 0ad40c1932..9a45fe022d 100644
--- a/.github/workflows/pr_build_macos.yml
+++ b/.github/workflows/pr_build_macos.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml
index d143ef83a0..1ff6fa952c 100644
--- a/.github/workflows/spark_sql_test.yml
+++ b/.github/workflows/spark_sql_test.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:

From 6869f79709dff276c22154a4ba0201700db5ba19 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 2 Jan 2026 15:14:21 -0700
Subject: [PATCH 3/5] skip failing suite

---
 .github/workflows/spark_sql_test.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml
index 1ff6fa952c..2fe5fefe1a 100644
--- a/.github/workflows/spark_sql_test.yml
+++ b/.github/workflows/spark_sql_test.yml
@@ -65,6 +65,10 @@ jobs:
           - {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
           - {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
           - {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
+        # Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
+        exclude:
+          - spark-version: {short: '4.0', full: '4.0.1', java: 17}
+            module: {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
       fail-fast: false
     name: spark-sql-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.spark-version.java }}
     runs-on: ${{ matrix.os }}

From c2139124b9c181bd695500118e972bd21b0f3326 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 2 Jan 2026 17:40:47 -0700
Subject: [PATCH 4/5] skip more workflows on benchmark PRs

---
 .github/workflows/benchmark-tpcds.yml    | 6 ++++++
 .github/workflows/benchmark-tpch.yml     | 6 ++++++
 .github/workflows/iceberg_spark_test.yml | 6 ++++++
 .github/workflows/miri.yml               | 6 ++++++
 4 files changed, 24 insertions(+)

diff --git a/.github/workflows/benchmark-tpcds.yml b/.github/workflows/benchmark-tpcds.yml
index 9930ea4f20..db1fce0192 100644
--- a/.github/workflows/benchmark-tpcds.yml
+++ b/.github/workflows/benchmark-tpcds.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
diff --git a/.github/workflows/benchmark-tpch.yml b/.github/workflows/benchmark-tpch.yml
index 435eaaa5c3..124b0d0c78 100644
--- a/.github/workflows/benchmark-tpch.yml
+++ b/.github/workflows/benchmark-tpch.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
diff --git a/.github/workflows/iceberg_spark_test.yml b/.github/workflows/iceberg_spark_test.yml
index cb497daee5..74badcda5f 100644
--- a/.github/workflows/iceberg_spark_test.yml
+++ b/.github/workflows/iceberg_spark_test.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
diff --git a/.github/workflows/miri.yml b/.github/workflows/miri.yml
index efac9cdd77..77b4d29a9c 100644
--- a/.github/workflows/miri.yml
+++ b/.github/workflows/miri.yml
@@ -23,11 +23,17 @@ on:
       - "doc/**"
       - "docs/**"
      - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:

From 828a455787e58ea3a9c2356ca804aba2a1cd254d Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Tue, 6 Jan 2026 11:45:01 -0700
Subject: [PATCH 5/5] address feedback

---
 .../spark/sql/benchmark/CometHashExpressionBenchmark.scala | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala
index e1304dbe62..c230e44c4e 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala
@@ -50,6 +50,10 @@ object CometHashExpressionBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Hash expression benchmarks", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL per column
+          // - c_str: unique strings "string_0" through "string_N"
+          // - c_int: integers 0-999,999 (cycling)
+          // - c_long: large values 0 to ~1 billion
           prepareTable(
             dir,
             spark.sql(s"""
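
Reviewer note: the benchmark in PATCH 1 delegates the actual measurement to runExpressionBenchmark, which is inherited from CometBenchmarkBase and is not touched by this series. For context only, a minimal sketch of what such a helper might look like, built on Spark's Benchmark test utility; the method body, the Comet config keys, and the output field are assumptions inferred from the call site, not the actual Comet implementation.

// Hypothetical sketch only -- the real helper lives in CometBenchmarkBase.
// Assumes the base class provides `spark`, `withSQLConf`, and an
// `output: Option[OutputStream]` that Benchmark writes its results to.
def runExpressionBenchmark(
    name: String,
    values: Int,
    query: String,
    extraCometConfigs: Map[String, String] = Map.empty): Unit = {
  val benchmark = new org.apache.spark.benchmark.Benchmark(name, values, output = output)

  benchmark.addCase(s"$name - Spark") { _ =>
    // Baseline with Comet disabled; the noop sink forces full evaluation of
    // the query without collecting results to the driver.
    withSQLConf("spark.comet.enabled" -> "false") {
      spark.sql(query).write.format("noop").mode("overwrite").save()
    }
  }

  benchmark.addCase(s"$name - Comet") { _ =>
    // Comet enabled, plus any per-expression overrides from the config.
    val confs = Map(
      "spark.comet.enabled" -> "true",
      "spark.comet.exec.enabled" -> "true") ++ extraCometConfigs
    withSQLConf(confs.toSeq: _*) {
      spark.sql(query).write.format("noop").mode("overwrite").save()
    }
  }

  benchmark.run()
}

With a helper along these lines, each entry in hashExpressions yields one Spark-vs-Comet timing comparison per run, and extraCometConfigs allows the same query to be benchmarked under different Comet settings.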