From 48cc37b3ddf1755e772369510cbaebca2014e7f4 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 2 Jan 2026 12:07:44 -0700
Subject: [PATCH 1/5] add microbenchmark for hash expressions

---
 .../CometHashExpressionBenchmark.scala        | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala

diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala
new file mode 100644
index 0000000000..e1304dbe62
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class HashExprConfig(
+    name: String,
+    query: String,
+    extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Comprehensive benchmark for Comet hash expressions. To run this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometHashExpressionBenchmark
+ * }}}
+ * Results will be written to "spark/benchmarks/CometHashExpressionBenchmark-**results.txt".
+ */
+object CometHashExpressionBenchmark extends CometBenchmarkBase {
+
+  private val hashExpressions = List(
+    HashExprConfig("xxhash64_single", "SELECT xxhash64(c_str) FROM parquetV1Table"),
+    HashExprConfig("xxhash64_multi", "SELECT xxhash64(c_str, c_int, c_long) FROM parquetV1Table"),
+    HashExprConfig("murmur3_hash_single", "SELECT hash(c_str) FROM parquetV1Table"),
+    HashExprConfig("murmur3_hash_multi", "SELECT hash(c_str, c_int, c_long) FROM parquetV1Table"),
+    HashExprConfig("sha1", "SELECT sha1(c_str) FROM parquetV1Table"),
+    HashExprConfig("sha2_224", "SELECT sha2(c_str, 224) FROM parquetV1Table"),
+    HashExprConfig("sha2_256", "SELECT sha2(c_str, 256) FROM parquetV1Table"),
+    HashExprConfig("sha2_384", "SELECT sha2(c_str, 384) FROM parquetV1Table"),
+    HashExprConfig("sha2_512", "SELECT sha2(c_str, 512) FROM parquetV1Table"))
+
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    val values = 1024 * 1024
+
+    runBenchmarkWithTable("Hash expression benchmarks", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT
+                CASE WHEN value % 100 = 0 THEN NULL ELSE CONCAT('string_', CAST(value AS STRING)) END AS c_str,
+                CASE WHEN value % 100 = 1 THEN NULL ELSE CAST(value % 1000000 AS INT) END AS c_int,
+                CASE WHEN value % 100 = 2 THEN NULL ELSE CAST(value * 1000 AS LONG) END AS c_long
+              FROM $tbl
+            """))
+
+          hashExpressions.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+  }
+}

From 9af4184357a436b0a9ab00900d6267dd1d6de1d8 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 2 Jan 2026 12:20:51 -0700
Subject: [PATCH 2/5] skip some CI workflows for benchmark changes

---
 .github/workflows/pr_benchmark_check.yml | 85 ++++++++++++++++++++++++
 .github/workflows/pr_build_linux.yml     |  6 ++
 .github/workflows/pr_build_macos.yml     |  6 ++
 .github/workflows/spark_sql_test.yml     |  6 ++
 4 files changed, 103 insertions(+)
 create mode 100644 .github/workflows/pr_benchmark_check.yml

diff --git a/.github/workflows/pr_benchmark_check.yml b/.github/workflows/pr_benchmark_check.yml
new file mode 100644
index 0000000000..b7475b9076
--- /dev/null
+++ b/.github/workflows/pr_benchmark_check.yml
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Lightweight CI for benchmark-only changes - verifies compilation and linting
+# without running full test suites
+
+name: PR Benchmark Check
+
+concurrency:
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
+  cancel-in-progress: true
+
+on:
+  push:
+    paths:
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
+  pull_request:
+    paths:
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
+  workflow_dispatch:
+
+env:
+  RUST_VERSION: stable
+
+jobs:
+  benchmark-check:
+    name: Benchmark Compile & Lint Check
+    runs-on: ubuntu-latest
+    container:
+      image: amd64/rust
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: ${{ env.RUST_VERSION }}
+          jdk-version: 17
+
+      - name: Check Cargo fmt
+        run: |
+          cd native
+          cargo fmt --all -- --check --color=never
+
+      - name: Check Cargo clippy
+        run: |
+          cd native
+          cargo clippy --color=never --all-targets --workspace -- -D warnings
+
+      - name: Check benchmark compilation
+        run: |
+          cd native
+          cargo check --benches
+
+      - name: Cache Maven dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.m2/repository
+            /root/.m2/repository
+          key: ${{ runner.os }}-benchmark-maven-${{ hashFiles('**/pom.xml') }}
+          restore-keys: |
+            ${{ runner.os }}-benchmark-maven-
+
+      - name: Check Scala compilation and linting
+        run: |
+          ./mvnw -B compile test-compile scalafix:scalafix -Dscalafix.mode=CHECK -Psemanticdb -DskipTests
diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
index e3b0e40566..beb5f9dcf7 100644
--- a/.github/workflows/pr_build_linux.yml
+++ b/.github/workflows/pr_build_linux.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
index 0ad40c1932..9a45fe022d 100644
--- a/.github/workflows/pr_build_macos.yml
+++ b/.github/workflows/pr_build_macos.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml
index d143ef83a0..1ff6fa952c 100644
--- a/.github/workflows/spark_sql_test.yml
+++ b/.github/workflows/spark_sql_test.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:

From 6869f79709dff276c22154a4ba0201700db5ba19 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 2 Jan 2026 15:14:21 -0700
Subject: [PATCH 3/5] skip failing suite

---
 .github/workflows/spark_sql_test.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml
index 1ff6fa952c..2fe5fefe1a 100644
--- a/.github/workflows/spark_sql_test.yml
+++ b/.github/workflows/spark_sql_test.yml
@@ -65,6 +65,10 @@ jobs:
           - {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
           - {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
           - {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
+        # Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
+        exclude:
+          - spark-version: {short: '4.0', full: '4.0.1', java: 17}
+            module: {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
       fail-fast: false
     name: spark-sql-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.spark-version.java }}
     runs-on: ${{ matrix.os }}

From c2139124b9c181bd695500118e972bd21b0f3326 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 2 Jan 2026 17:40:47 -0700
Subject: [PATCH 4/5] skip more workflows on benchmark PRs

---
 .github/workflows/benchmark-tpcds.yml    | 6 ++++++
 .github/workflows/benchmark-tpch.yml     | 6 ++++++
 .github/workflows/iceberg_spark_test.yml | 6 ++++++
 .github/workflows/miri.yml               | 6 ++++++
 4 files changed, 24 insertions(+)

diff --git a/.github/workflows/benchmark-tpcds.yml b/.github/workflows/benchmark-tpcds.yml
index 9930ea4f20..db1fce0192 100644
--- a/.github/workflows/benchmark-tpcds.yml
+++ b/.github/workflows/benchmark-tpcds.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
diff --git a/.github/workflows/benchmark-tpch.yml b/.github/workflows/benchmark-tpch.yml
index 435eaaa5c3..124b0d0c78 100644
--- a/.github/workflows/benchmark-tpch.yml
+++ b/.github/workflows/benchmark-tpch.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
diff --git a/.github/workflows/iceberg_spark_test.yml b/.github/workflows/iceberg_spark_test.yml
index cb497daee5..74badcda5f 100644
--- a/.github/workflows/iceberg_spark_test.yml
+++ b/.github/workflows/iceberg_spark_test.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
diff --git a/.github/workflows/miri.yml b/.github/workflows/miri.yml
index efac9cdd77..77b4d29a9c 100644
--- a/.github/workflows/miri.yml
+++ b/.github/workflows/miri.yml
@@ -23,11 +23,17 @@ on:
       - "doc/**"
       - "docs/**"
      - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:

From 828a455787e58ea3a9c2356ca804aba2a1cd254d Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Tue, 6 Jan 2026 11:45:01 -0700
Subject: [PATCH 5/5] address feedback

---
 .../spark/sql/benchmark/CometHashExpressionBenchmark.scala | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala
index e1304dbe62..c230e44c4e 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometHashExpressionBenchmark.scala
@@ -50,6 +50,10 @@ object CometHashExpressionBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Hash expression benchmarks", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL per column
+          // - c_str: unique strings "string_0" through "string_N"
+          // - c_int: integers 0-999,999 (cycling)
+          // - c_long: large values 0 to ~1 billion
           prepareTable(
             dir,
             spark.sql(s"""
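
Reviewer note: the benchmark in PATCH 1 delegates the actual measurement to runExpressionBenchmark, which is inherited from CometBenchmarkBase and is not touched by this series. For context only, a minimal sketch of what such a helper might look like, built on Spark's Benchmark test utility; the method body, the Comet config keys, and the output field are assumptions inferred from the call site, not the actual Comet implementation.

// Hypothetical sketch only -- the real helper lives in CometBenchmarkBase.
// Assumes the base class provides `spark`, `withSQLConf`, and an
// `output: Option[OutputStream]` that Benchmark writes its results to.
def runExpressionBenchmark(
    name: String,
    values: Int,
    query: String,
    extraCometConfigs: Map[String, String] = Map.empty): Unit = {
  val benchmark = new org.apache.spark.benchmark.Benchmark(name, values, output = output)

  benchmark.addCase(s"$name - Spark") { _ =>
    // Baseline with Comet disabled; the noop sink forces full evaluation of
    // the query without collecting results to the driver.
    withSQLConf("spark.comet.enabled" -> "false") {
      spark.sql(query).write.format("noop").mode("overwrite").save()
    }
  }

  benchmark.addCase(s"$name - Comet") { _ =>
    // Comet enabled, plus any per-expression overrides from the config.
    val confs = Map(
      "spark.comet.enabled" -> "true",
      "spark.comet.exec.enabled" -> "true") ++ extraCometConfigs
    withSQLConf(confs.toSeq: _*) {
      spark.sql(query).write.format("noop").mode("overwrite").save()
    }
  }

  benchmark.run()
}

With a helper along these lines, each entry in hashExpressions yields one Spark-vs-Comet timing comparison per run, and extraCometConfigs allows the same query to be benchmarked under different Comet settings.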