From 9af4184357a436b0a9ab00900d6267dd1d6de1d8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 2 Jan 2026 12:20:51 -0700 Subject: [PATCH 1/8] skip some CI workflows for benchmark changes --- .github/workflows/pr_benchmark_check.yml | 85 ++++++++++++++++++++++++ .github/workflows/pr_build_linux.yml | 6 ++ .github/workflows/pr_build_macos.yml | 6 ++ .github/workflows/spark_sql_test.yml | 6 ++ 4 files changed, 103 insertions(+) create mode 100644 .github/workflows/pr_benchmark_check.yml diff --git a/.github/workflows/pr_benchmark_check.yml b/.github/workflows/pr_benchmark_check.yml new file mode 100644 index 0000000000..b7475b9076 --- /dev/null +++ b/.github/workflows/pr_benchmark_check.yml @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Lightweight CI for benchmark-only changes - verifies compilation and linting +# without running full test suites + +name: PR Benchmark Check + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +on: + push: + paths: + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" + pull_request: + paths: + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" + workflow_dispatch: + +env: + RUST_VERSION: stable + +jobs: + benchmark-check: + name: Benchmark Compile & Lint Check + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v6 + + - name: Setup Rust & Java toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: ${{ env.RUST_VERSION }} + jdk-version: 17 + + - name: Check Cargo fmt + run: | + cd native + cargo fmt --all -- --check --color=never + + - name: Check Cargo clippy + run: | + cd native + cargo clippy --color=never --all-targets --workspace -- -D warnings + + - name: Check benchmark compilation + run: | + cd native + cargo check --benches + + - name: Cache Maven dependencies + uses: actions/cache@v4 + with: + path: | + ~/.m2/repository + /root/.m2/repository + key: ${{ runner.os }}-benchmark-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-benchmark-maven- + + - name: Check Scala compilation and linting + run: | + ./mvnw -B compile test-compile scalafix:scalafix -Dscalafix.mode=CHECK -Psemanticdb -DskipTests diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml index e3b0e40566..beb5f9dcf7 100644 --- a/.github/workflows/pr_build_linux.yml +++ b/.github/workflows/pr_build_linux.yml @@ -27,11 +27,17 @@ on: - "doc/**" - "docs/**" - "**.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" 
pull_request: paths-ignore: - "doc/**" - "docs/**" - "**.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" # manual trigger # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow workflow_dispatch: diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml index 0ad40c1932..9a45fe022d 100644 --- a/.github/workflows/pr_build_macos.yml +++ b/.github/workflows/pr_build_macos.yml @@ -27,11 +27,17 @@ on: - "doc/**" - "docs/**" - "**.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" pull_request: paths-ignore: - "doc/**" - "docs/**" - "**.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" # manual trigger # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow workflow_dispatch: diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml index d143ef83a0..1ff6fa952c 100644 --- a/.github/workflows/spark_sql_test.yml +++ b/.github/workflows/spark_sql_test.yml @@ -27,11 +27,17 @@ on: - "doc/**" - "docs/**" - "**.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" pull_request: paths-ignore: - "doc/**" - "docs/**" - "**.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" # manual trigger # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow workflow_dispatch: From 93ab38fbaa9ff787f9aa95a0e31f0c5df128e406 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 2 Jan 2026 12:26:58 -0700 Subject: [PATCH 2/8] implement more microbenchmarks for casts --- .../benchmark/CometCastBooleanBenchmark.scala | 122 +++++++++++++++ .../CometCastNumericToNumericBenchmark.scala | 142 ++++++++++++++++++ .../CometCastNumericToStringBenchmark.scala | 98 ++++++++++++ .../CometCastNumericToTemporalBenchmark.scala | 102 +++++++++++++ .../CometCastTemporalToNumericBenchmark.scala | 102 +++++++++++++ .../CometCastTemporalToStringBenchmark.scala | 96 ++++++++++++ 6 files changed, 662 insertions(+) create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala new file mode 100644 index 0000000000..085e7388c7 --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +case class CastBooleanConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +// spotless:off +/** + * Benchmark to measure performance of Comet cast operations involving Boolean type. To run this + * benchmark: + * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBooleanBenchmark` + * Results will be written to "spark/benchmarks/CometCastBooleanBenchmark-**results.txt". + */ +// spotless:on +object CometCastBooleanBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + + // Boolean to String + private val boolToStringConfigs = for { + castFunc <- castFunctions + } yield CastBooleanConfig( + s"$castFunc Boolean to String", + s"SELECT $castFunc(c_bool AS STRING) FROM parquetV1Table") + + // Boolean to numeric types + private val boolToNumericTypes = + Seq("BYTE", "SHORT", "INT", "LONG", "FLOAT", "DOUBLE", "DECIMAL(10,2)") + private val boolToNumericConfigs = for { + castFunc <- castFunctions + targetType <- boolToNumericTypes + } yield CastBooleanConfig( + s"$castFunc Boolean to $targetType", + s"SELECT $castFunc(c_bool AS $targetType) FROM parquetV1Table") + + // Numeric to Boolean + private val numericTypes = Seq( + ("BYTE", "c_byte"), + ("SHORT", "c_short"), + ("INT", "c_int"), + ("LONG", "c_long"), + ("FLOAT", "c_float"), + ("DOUBLE", "c_double"), + ("DECIMAL(10,2)", "c_decimal")) + + private val numericToBoolConfigs = for { + castFunc <- castFunctions + (sourceType, colName) <- numericTypes + } yield CastBooleanConfig( + s"$castFunc $sourceType to Boolean", + s"SELECT $castFunc($colName AS BOOLEAN) FROM parquetV1Table") + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + + // Generate boolean data for boolean-to-other casts + runBenchmarkWithTable("Boolean to other types casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE (value % 2 = 0) + END AS c_bool + FROM $tbl + """)) + + (boolToStringConfigs ++ boolToNumericConfigs).foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + + // Generate numeric data for numeric-to-boolean casts + runBenchmarkWithTable("Numeric to Boolean casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT + CASE WHEN value % 100 = 0 THEN NULL ELSE CAST((value % 3) - 1 AS BYTE) END AS c_byte, + CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 3) - 1 AS SHORT) END AS c_short, + CASE WHEN value % 100 = 2 THEN NULL ELSE CAST((value % 3) - 1 AS INT) END 
AS c_int, + CASE WHEN value % 100 = 3 THEN NULL ELSE CAST((value % 3) - 1 AS LONG) END AS c_long, + CASE WHEN value % 100 = 4 THEN NULL ELSE CAST((value % 3) - 1 AS FLOAT) END AS c_float, + CASE WHEN value % 100 = 5 THEN NULL ELSE CAST((value % 3) - 1 AS DOUBLE) END AS c_double, + CASE WHEN value % 100 = 6 THEN NULL ELSE CAST((value % 3) - 1 AS DECIMAL(10,2)) END AS c_decimal + FROM $tbl + """)) + + numericToBoolConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala new file mode 100644 index 0000000000..5137e50e18 --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +case class CastNumericToNumericConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +// spotless:off +/** + * Benchmark to measure performance of Comet cast between numeric types. To run this + * benchmark: + * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToNumericBenchmark` + * Results will be written to "spark/benchmarks/CometCastNumericToNumericBenchmark-**results.txt". 
+ */ +// spotless:on +object CometCastNumericToNumericBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + + // Integer widening conversions + private val integerWideningPairs = Seq( + ("BYTE", "c_byte", "SHORT"), + ("BYTE", "c_byte", "INT"), + ("BYTE", "c_byte", "LONG"), + ("SHORT", "c_short", "INT"), + ("SHORT", "c_short", "LONG"), + ("INT", "c_int", "LONG")) + + // Integer narrowing conversions + private val integerNarrowingPairs = Seq( + ("LONG", "c_long", "INT"), + ("LONG", "c_long", "SHORT"), + ("LONG", "c_long", "BYTE"), + ("INT", "c_int", "SHORT"), + ("INT", "c_int", "BYTE"), + ("SHORT", "c_short", "BYTE")) + + // Floating point conversions + private val floatPairs = Seq(("FLOAT", "c_float", "DOUBLE"), ("DOUBLE", "c_double", "FLOAT")) + + // Integer to floating point conversions + private val intToFloatPairs = Seq( + ("BYTE", "c_byte", "FLOAT"), + ("SHORT", "c_short", "FLOAT"), + ("INT", "c_int", "FLOAT"), + ("LONG", "c_long", "FLOAT"), + ("INT", "c_int", "DOUBLE"), + ("LONG", "c_long", "DOUBLE")) + + // Floating point to integer conversions + private val floatToIntPairs = Seq( + ("FLOAT", "c_float", "INT"), + ("FLOAT", "c_float", "LONG"), + ("DOUBLE", "c_double", "INT"), + ("DOUBLE", "c_double", "LONG")) + + // Decimal conversions + private val decimalPairs = Seq( + ("INT", "c_int", "DECIMAL(10,2)"), + ("LONG", "c_long", "DECIMAL(20,4)"), + ("DOUBLE", "c_double", "DECIMAL(15,5)"), + ("DECIMAL(10,2)", "c_decimal", "INT"), + ("DECIMAL(10,2)", "c_decimal", "LONG"), + ("DECIMAL(10,2)", "c_decimal", "DOUBLE")) + + private def generateConfigs( + pairs: Seq[(String, String, String)]): Seq[CastNumericToNumericConfig] = { + for { + castFunc <- castFunctions + (sourceType, colName, targetType) <- pairs + } yield CastNumericToNumericConfig( + s"$castFunc $sourceType to $targetType", + s"SELECT $castFunc($colName AS $targetType) FROM parquetV1Table") + } + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + + // Generate input data once with all numeric types + runBenchmarkWithTable("Numeric to Numeric casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + // Generate varied numeric data including edge cases + prepareTable( + dir, + spark.sql(s""" + SELECT + CASE WHEN value % 100 = 0 THEN NULL ELSE CAST((value % 128) - 64 AS BYTE) END AS c_byte, + CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 32768) - 16384 AS SHORT) END AS c_short, + CASE WHEN value % 100 = 2 THEN NULL ELSE CAST(value - 2500000 AS INT) END AS c_int, + CASE WHEN value % 100 = 3 THEN NULL ELSE CAST(value * 1000 AS LONG) END AS c_long, + CASE + WHEN value % 100 = 4 THEN NULL + WHEN value % 100 = 5 THEN CAST('NaN' AS FLOAT) + WHEN value % 100 = 6 THEN CAST('Infinity' AS FLOAT) + WHEN value % 100 = 7 THEN CAST('-Infinity' AS FLOAT) + ELSE CAST((value - 2500000) / 100.0 AS FLOAT) + END AS c_float, + CASE + WHEN value % 100 = 8 THEN NULL + WHEN value % 100 = 9 THEN CAST('NaN' AS DOUBLE) + WHEN value % 100 = 10 THEN CAST('Infinity' AS DOUBLE) + WHEN value % 100 = 11 THEN CAST('-Infinity' AS DOUBLE) + ELSE CAST((value - 2500000) / 100.0 AS DOUBLE) + END AS c_double, + CASE WHEN value % 100 = 12 THEN NULL ELSE CAST((value - 2500000) / 100.0 AS DECIMAL(10,2)) END AS c_decimal + FROM $tbl + """)) + + // Run all benchmark categories + (generateConfigs(integerWideningPairs) ++ + generateConfigs(integerNarrowingPairs) ++ + generateConfigs(floatPairs) ++ + 
generateConfigs(intToFloatPairs) ++ + generateConfigs(floatToIntPairs) ++ + generateConfigs(decimalPairs)).foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala new file mode 100644 index 0000000000..1459ab941f --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +case class CastNumericToStringConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +// spotless:off +/** + * Benchmark to measure performance of Comet cast from numeric types to String. To run this + * benchmark: + * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToStringBenchmark` + * Results will be written to "spark/benchmarks/CometCastNumericToStringBenchmark-**results.txt". 
+ */ +// spotless:on +object CometCastNumericToStringBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + private val sourceTypes = + Seq( + ("BOOLEAN", "c_bool"), + ("BYTE", "c_byte"), + ("SHORT", "c_short"), + ("INT", "c_int"), + ("LONG", "c_long"), + ("FLOAT", "c_float"), + ("DOUBLE", "c_double"), + ("DECIMAL(10,2)", "c_decimal")) + + private val castConfigs = for { + castFunc <- castFunctions + (sourceType, colName) <- sourceTypes + } yield CastNumericToStringConfig( + s"$castFunc $sourceType to String", + s"SELECT $castFunc($colName AS STRING) FROM parquetV1Table") + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + + // Generate input data once with all numeric types + runBenchmarkWithTable("Numeric to String casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + // Generate varied numeric data including edge cases + prepareTable( + dir, + spark.sql(s""" + SELECT + CASE WHEN value % 100 = 0 THEN NULL ELSE (value % 2 = 0) END AS c_bool, + CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 128) - 64 AS BYTE) END AS c_byte, + CASE WHEN value % 100 = 2 THEN NULL ELSE CAST((value % 32768) - 16384 AS SHORT) END AS c_short, + CASE WHEN value % 100 = 3 THEN NULL ELSE CAST(value - 2500000 AS INT) END AS c_int, + CASE WHEN value % 100 = 4 THEN NULL ELSE CAST(value * 1000000 AS LONG) END AS c_long, + CASE + WHEN value % 100 = 5 THEN NULL + WHEN value % 100 = 6 THEN CAST('NaN' AS FLOAT) + WHEN value % 100 = 7 THEN CAST('Infinity' AS FLOAT) + WHEN value % 100 = 8 THEN CAST('-Infinity' AS FLOAT) + ELSE CAST((value - 2500000) / 1000.0 AS FLOAT) + END AS c_float, + CASE + WHEN value % 100 = 9 THEN NULL + WHEN value % 100 = 10 THEN CAST('NaN' AS DOUBLE) + WHEN value % 100 = 11 THEN CAST('Infinity' AS DOUBLE) + WHEN value % 100 = 12 THEN CAST('-Infinity' AS DOUBLE) + ELSE CAST((value - 2500000) / 100.0 AS DOUBLE) + END AS c_double, + CASE WHEN value % 100 = 13 THEN NULL ELSE CAST((value - 2500000) / 100.0 AS DECIMAL(10,2)) END AS c_decimal + FROM $tbl + """)) + + castConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala new file mode 100644 index 0000000000..a8e81a3dff --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.benchmark + +case class CastNumericToTemporalConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +// spotless:off +/** + * Benchmark to measure performance of Comet cast from numeric types to temporal types. To run + * this benchmark: + * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToTemporalBenchmark` + * Results will be written to "spark/benchmarks/CometCastNumericToTemporalBenchmark-**results.txt". + */ +// spotless:on +object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + + // INT to DATE (days since epoch) + private val intToDateConfigs = for { + castFunc <- castFunctions + } yield CastNumericToTemporalConfig( + s"$castFunc Int to Date", + s"SELECT $castFunc(c_int AS DATE) FROM parquetV1Table") + + // LONG to TIMESTAMP (microseconds since epoch) + private val longToTimestampConfigs = for { + castFunc <- castFunctions + } yield CastNumericToTemporalConfig( + s"$castFunc Long to Timestamp", + s"SELECT $castFunc(c_long AS TIMESTAMP) FROM parquetV1Table") + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + + // Generate data once for INT to DATE conversions + runBenchmarkWithTable("Int to Date casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + // Generate INT values representing days since epoch (1970-01-01) + // Range: ~-18000 to +18000 days (roughly 1920 to 2020) + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE CAST((value % 36500) - 18000 AS INT) + END AS c_int + FROM $tbl + """)) + + intToDateConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + + // Generate data once for LONG to TIMESTAMP conversions + runBenchmarkWithTable("Long to Timestamp casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + // Generate LONG values representing microseconds since epoch + // Range: 2020-2021 timestamps + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE 1577836800000000 + (value % 31536000000000) + END AS c_long + FROM $tbl + """)) + + longToTimestampConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala new file mode 100644 index 0000000000..08850b6a12 --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +case class CastTemporalToNumericConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +// spotless:off +/** + * Benchmark to measure performance of Comet cast from temporal types to numeric types. To run + * this benchmark: + * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToNumericBenchmark` + * Results will be written to "spark/benchmarks/CometCastTemporalToNumericBenchmark-**results.txt". + */ +// spotless:on +object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + + // DATE to numeric types + private val dateToNumericTypes = Seq("BYTE", "SHORT", "INT", "LONG") + private val dateToNumericConfigs = for { + castFunc <- castFunctions + targetType <- dateToNumericTypes + } yield CastTemporalToNumericConfig( + s"$castFunc Date to $targetType", + s"SELECT $castFunc(c_date AS $targetType) FROM parquetV1Table") + + // TIMESTAMP to numeric types + private val timestampToNumericTypes = Seq("BYTE", "SHORT", "INT", "LONG") + private val timestampToNumericConfigs = for { + castFunc <- castFunctions + targetType <- timestampToNumericTypes + } yield CastTemporalToNumericConfig( + s"$castFunc Timestamp to $targetType", + s"SELECT $castFunc(c_timestamp AS $targetType) FROM parquetV1Table") + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + + // Generate DATE data once for all date-to-numeric benchmarks + runBenchmarkWithTable("Date to Numeric casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE DATE_ADD('2020-01-01', CAST(value % 3650 AS INT)) + END AS c_date + FROM $tbl + """)) + + dateToNumericConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + + // Generate TIMESTAMP data once for all timestamp-to-numeric benchmarks + runBenchmarkWithTable("Timestamp to Numeric casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE TIMESTAMP_MICROS(1577836800000000 + value % 31536000000000) + END AS c_timestamp + FROM $tbl + """)) + + timestampToNumericConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala new file mode 100644 index 0000000000..5e2316a142 --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +case class CastTemporalToStringConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +// spotless:off +/** + * Benchmark to measure performance of Comet cast from temporal types to String. To run this + * benchmark: + * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToStringBenchmark` + * Results will be written to "spark/benchmarks/CometCastTemporalToStringBenchmark-**results.txt". + */ +// spotless:on +object CometCastTemporalToStringBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + + private val dateCastConfigs = for { + castFunc <- castFunctions + } yield CastTemporalToStringConfig( + s"$castFunc Date to String", + s"SELECT $castFunc(c_date AS STRING) FROM parquetV1Table") + + private val timestampCastConfigs = for { + castFunc <- castFunctions + } yield CastTemporalToStringConfig( + s"$castFunc Timestamp to String", + s"SELECT $castFunc(c_timestamp AS STRING) FROM parquetV1Table") + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + + // Generate temporal data once for date benchmarks + runBenchmarkWithTable("Date to String casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE DATE_ADD('2020-01-01', CAST(value % 3650 AS INT)) + END AS c_date + FROM $tbl + """)) + + dateCastConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + + // Generate temporal data once for timestamp benchmarks + runBenchmarkWithTable("Timestamp to String casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE TIMESTAMP_MICROS(1577836800000000 + value % 31536000000000) + END AS c_timestamp + FROM $tbl + """)) + + timestampCastConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} From a3d45af0c75b76a55b3ca0f2046720ff14c121d9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 2 Jan 2026 12:30:33 -0700 Subject: [PATCH 3/8] fix docs --- .../spark/sql/benchmark/CometCastBooleanBenchmark.scala | 6 +++--- .../benchmark/CometCastNumericToNumericBenchmark.scala | 9 ++++----- .../benchmark/CometCastNumericToStringBenchmark.scala | 6 +++--- .../benchmark/CometCastNumericToTemporalBenchmark.scala | 9 +++++---- .../benchmark/CometCastStringToTemporalBenchmark.scala | 6 +++--- .../benchmark/CometCastTemporalToNumericBenchmark.scala | 9 +++++---- 
.../benchmark/CometCastTemporalToStringBenchmark.scala | 6 +++--- 7 files changed, 26 insertions(+), 25 deletions(-) diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala index 085e7388c7..c3043e8c90 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala @@ -24,14 +24,14 @@ case class CastBooleanConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** * Benchmark to measure performance of Comet cast operations involving Boolean type. To run this * benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBooleanBenchmark` + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBooleanBenchmark + * }}} * Results will be written to "spark/benchmarks/CometCastBooleanBenchmark-**results.txt". */ -// spotless:on object CometCastBooleanBenchmark extends CometBenchmarkBase { private val castFunctions = Seq("CAST", "TRY_CAST") diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala index 5137e50e18..a4264a5b88 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala @@ -24,14 +24,13 @@ case class CastNumericToNumericConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** - * Benchmark to measure performance of Comet cast between numeric types. To run this - * benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToNumericBenchmark` + * Benchmark to measure performance of Comet cast between numeric types. To run this benchmark: + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToNumericBenchmark + * }}} * Results will be written to "spark/benchmarks/CometCastNumericToNumericBenchmark-**results.txt". */ -// spotless:on object CometCastNumericToNumericBenchmark extends CometBenchmarkBase { private val castFunctions = Seq("CAST", "TRY_CAST") diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala index 1459ab941f..202f336b86 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala @@ -24,14 +24,14 @@ case class CastNumericToStringConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** * Benchmark to measure performance of Comet cast from numeric types to String. To run this * benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToStringBenchmark` + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToStringBenchmark + * }}} * Results will be written to "spark/benchmarks/CometCastNumericToStringBenchmark-**results.txt". 
*/ -// spotless:on object CometCastNumericToStringBenchmark extends CometBenchmarkBase { private val castFunctions = Seq("CAST", "TRY_CAST") diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala index a8e81a3dff..bc1cacd803 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala @@ -24,14 +24,15 @@ case class CastNumericToTemporalConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** * Benchmark to measure performance of Comet cast from numeric types to temporal types. To run * this benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToTemporalBenchmark` - * Results will be written to "spark/benchmarks/CometCastNumericToTemporalBenchmark-**results.txt". + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToTemporalBenchmark + * }}} + * Results will be written to + * "spark/benchmarks/CometCastNumericToTemporalBenchmark-**results.txt". */ -// spotless:on object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase { private val castFunctions = Seq("CAST", "TRY_CAST") diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala index 39337be5c8..6cf4cd4cc4 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala @@ -24,14 +24,14 @@ case class CastStringToTemporalConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** * Benchmark to measure performance of Comet cast from String to temporal types. To run this * benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark` + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark + * }}} * Results will be written to "spark/benchmarks/CometCastStringToTemporalBenchmark-**results.txt". */ -// spotless:on object CometCastStringToTemporalBenchmark extends CometBenchmarkBase { // Configuration for String to temporal cast benchmarks diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala index 08850b6a12..52402fc0db 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala @@ -24,14 +24,15 @@ case class CastTemporalToNumericConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** * Benchmark to measure performance of Comet cast from temporal types to numeric types. To run * this benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToNumericBenchmark` - * Results will be written to "spark/benchmarks/CometCastTemporalToNumericBenchmark-**results.txt". 
+ * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToNumericBenchmark + * }}} + * Results will be written to + * "spark/benchmarks/CometCastTemporalToNumericBenchmark-**results.txt". */ -// spotless:on object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase { private val castFunctions = Seq("CAST", "TRY_CAST") diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala index 5e2316a142..04db279659 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala @@ -24,14 +24,14 @@ case class CastTemporalToStringConfig( query: String, extraCometConfigs: Map[String, String] = Map.empty) -// spotless:off /** * Benchmark to measure performance of Comet cast from temporal types to String. To run this * benchmark: - * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToStringBenchmark` + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToStringBenchmark + * }}} * Results will be written to "spark/benchmarks/CometCastTemporalToStringBenchmark-**results.txt". */ -// spotless:on object CometCastTemporalToStringBenchmark extends CometBenchmarkBase { private val castFunctions = Seq("CAST", "TRY_CAST") From 691d7b57eada9c425066077be1c1ed697c25ec21 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 2 Jan 2026 12:41:37 -0700 Subject: [PATCH 4/8] Add temporal casts --- .../benchmark/CometCastBooleanBenchmark.scala | 2 +- .../CometCastNumericToNumericBenchmark.scala | 2 +- .../CometCastNumericToStringBenchmark.scala | 2 +- .../CometCastNumericToTemporalBenchmark.scala | 2 +- .../CometCastTemporalToNumericBenchmark.scala | 2 +- .../CometCastTemporalToStringBenchmark.scala | 2 +- ...CometCastTemporalToTemporalBenchmark.scala | 98 +++++++++++++++++++ 7 files changed, 104 insertions(+), 6 deletions(-) create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala index c3043e8c90..d8e0419867 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala @@ -71,7 +71,7 @@ object CometCastBooleanBenchmark extends CometBenchmarkBase { s"SELECT $castFunc($colName AS BOOLEAN) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + val values = 1024 * 1024 * 5 // 5M rows // Generate boolean data for boolean-to-other casts runBenchmarkWithTable("Boolean to other types casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala index a4264a5b88..fba9fe0135 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala +++ 
b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala @@ -92,7 +92,7 @@ object CometCastNumericToNumericBenchmark extends CometBenchmarkBase { } override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + val values = 1024 * 1024 * 5 // 5M rows // Generate input data once with all numeric types runBenchmarkWithTable("Numeric to Numeric casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala index 202f336b86..c7574f6e8d 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala @@ -54,7 +54,7 @@ object CometCastNumericToStringBenchmark extends CometBenchmarkBase { s"SELECT $castFunc($colName AS STRING) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + val values = 1024 * 1024 * 5 // 5M rows // Generate input data once with all numeric types runBenchmarkWithTable("Numeric to String casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala index bc1cacd803..da0de5b429 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala @@ -52,7 +52,7 @@ object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_long AS TIMESTAMP) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + val values = 1024 * 1024 * 5 // 5M rows // Generate data once for INT to DATE conversions runBenchmarkWithTable("Int to Date casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala index 52402fc0db..022dfee7f5 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala @@ -56,7 +56,7 @@ object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_timestamp AS $targetType) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + val values = 1024 * 1024 * 5 // 5M rows // Generate DATE data once for all date-to-numeric benchmarks runBenchmarkWithTable("Date to Numeric casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala index 04db279659..2430024487 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala +++ 
b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala @@ -49,7 +49,7 @@ object CometCastTemporalToStringBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_timestamp AS STRING) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = getBenchmarkRows(1024 * 1024 * 5) // 5M rows default + val values = 1024 * 1024 * 5 // 5M rows // Generate temporal data once for date benchmarks runBenchmarkWithTable("Date to String casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala new file mode 100644 index 0000000000..2dfc8e8012 --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +case class CastTemporalToTemporalConfig( + name: String, + query: String, + extraCometConfigs: Map[String, String] = Map.empty) + +/** + * Benchmark to measure performance of Comet cast between temporal types. To run this benchmark: + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToTemporalBenchmark + * }}} + * Results will be written to + * "spark/benchmarks/CometCastTemporalToTemporalBenchmark-**results.txt". 
+ */ +object CometCastTemporalToTemporalBenchmark extends CometBenchmarkBase { + + private val castFunctions = Seq("CAST", "TRY_CAST") + + // Date to Timestamp + private val dateToTimestampConfigs = for { + castFunc <- castFunctions + } yield CastTemporalToTemporalConfig( + s"$castFunc Date to Timestamp", + s"SELECT $castFunc(c_date AS TIMESTAMP) FROM parquetV1Table") + + // Timestamp to Date + private val timestampToDateConfigs = for { + castFunc <- castFunctions + } yield CastTemporalToTemporalConfig( + s"$castFunc Timestamp to Date", + s"SELECT $castFunc(c_timestamp AS DATE) FROM parquetV1Table") + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val values = 1024 * 1024 * 5 // 5M rows + + // Generate DATE data for Date -> Timestamp benchmarks + runBenchmarkWithTable("Date to Timestamp casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE DATE_ADD('2020-01-01', CAST(value % 3650 AS INT)) + END AS c_date + FROM $tbl + """)) + + dateToTimestampConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + + // Generate TIMESTAMP data for Timestamp -> Date benchmarks + runBenchmarkWithTable("Timestamp to Date casts", values) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + prepareTable( + dir, + spark.sql(s""" + SELECT CASE + WHEN value % 100 = 0 THEN NULL + ELSE TIMESTAMP_MICROS(1577836800000000 + value % 31536000000000) + END AS c_timestamp + FROM $tbl + """)) + + timestampToDateConfigs.foreach { config => + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } +} From 199cc4385f21a0bd1bf189697a5354d66eeed25b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 2 Jan 2026 12:47:33 -0700 Subject: [PATCH 5/8] use consistent row count --- .../org/apache/spark/sql/benchmark/CometCastBenchmark.scala | 2 +- .../apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala | 2 +- .../sql/benchmark/CometCastNumericToNumericBenchmark.scala | 2 +- .../spark/sql/benchmark/CometCastNumericToStringBenchmark.scala | 2 +- .../sql/benchmark/CometCastNumericToTemporalBenchmark.scala | 2 +- .../sql/benchmark/CometCastStringToTemporalBenchmark.scala | 2 +- .../sql/benchmark/CometCastTemporalToNumericBenchmark.scala | 2 +- .../sql/benchmark/CometCastTemporalToStringBenchmark.scala | 2 +- .../sql/benchmark/CometCastTemporalToTemporalBenchmark.scala | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala index 975abd632f..08ab2c344b 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala @@ -61,7 +61,7 @@ object CometCastBenchmark extends CometBenchmarkBase { case Compatible(notes) => runBenchmarkWithTable( s"Running benchmark cast operation from : $LongType to : $toDataType", - 1024 * 1024 * 10) { v => + 1024 * 1024) { v => // 1M rows castBenchmark(v, LongType, toDataType, isAnsiMode = ansiMode) } case Incompatible(notes) => None diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala index d8e0419867..6a81cce20e 100644 --- 
a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala @@ -71,7 +71,7 @@ object CometCastBooleanBenchmark extends CometBenchmarkBase { s"SELECT $castFunc($colName AS BOOLEAN) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate boolean data for boolean-to-other casts runBenchmarkWithTable("Boolean to other types casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala index fba9fe0135..e7adf370cb 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala @@ -92,7 +92,7 @@ object CometCastNumericToNumericBenchmark extends CometBenchmarkBase { } override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate input data once with all numeric types runBenchmarkWithTable("Numeric to Numeric casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala index c7574f6e8d..00fba11392 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala @@ -54,7 +54,7 @@ object CometCastNumericToStringBenchmark extends CometBenchmarkBase { s"SELECT $castFunc($colName AS STRING) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate input data once with all numeric types runBenchmarkWithTable("Numeric to String casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala index da0de5b429..0545ab147c 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala @@ -52,7 +52,7 @@ object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_long AS TIMESTAMP) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate data once for INT to DATE conversions runBenchmarkWithTable("Int to Date casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala index 6cf4cd4cc4..79856f0f47 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala @@ -52,7 +52,7 @@ object CometCastStringToTemporalBenchmark 
extends CometBenchmarkBase { "SELECT TRY_CAST(c1 AS TIMESTAMP) FROM parquetV1Table")) override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 10 // 10M rows + val values = 1024 * 1024 // 1M rows // Generate date data once with ~10% invalid values runBenchmarkWithTable("date data generation", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala index 022dfee7f5..d858c76a26 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala @@ -56,7 +56,7 @@ object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_timestamp AS $targetType) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate DATE data once for all date-to-numeric benchmarks runBenchmarkWithTable("Date to Numeric casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala index 2430024487..5d21bbffef 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala @@ -49,7 +49,7 @@ object CometCastTemporalToStringBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_timestamp AS STRING) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate temporal data once for date benchmarks runBenchmarkWithTable("Date to String casts", values) { v => diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala index 2dfc8e8012..df675f6cb8 100644 --- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala @@ -51,7 +51,7 @@ object CometCastTemporalToTemporalBenchmark extends CometBenchmarkBase { s"SELECT $castFunc(c_timestamp AS DATE) FROM parquetV1Table") override def runCometBenchmark(mainArgs: Array[String]): Unit = { - val values = 1024 * 1024 * 5 // 5M rows + val values = 1024 * 1024 // 1M rows // Generate DATE data for Date -> Timestamp benchmarks runBenchmarkWithTable("Date to Timestamp casts", values) { v => From f28b98444999633a8fb6ce8357350df473e279ad Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 2 Jan 2026 12:49:14 -0700 Subject: [PATCH 6/8] remove legacy benchmark --- .../sql/benchmark/CometCastBenchmark.scala | 96 ------------------- 1 file changed, 96 deletions(-) delete mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala deleted file mode 100644 index 08ab2c344b..0000000000 --- 
From f28b98444999633a8fb6ce8357350df473e279ad Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 2 Jan 2026 12:49:14 -0700
Subject: [PATCH 6/8] remove legacy benchmark

---
 .../sql/benchmark/CometCastBenchmark.scala | 96 ------------------
 1 file changed, 96 deletions(-)
 delete mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala

diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala
deleted file mode 100644
index 08ab2c344b..0000000000
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.benchmark
-
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{DataType, LongType}
-
-import org.apache.comet.expressions.{CometCast, CometEvalMode}
-import org.apache.comet.serde.{Compatible, Incompatible, Unsupported}
-
-/**
- * Benchmark to measure Comet execution performance. To run this benchmark:
- * {{{
- * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBenchmark
- * }}}
- *
- * Results will be written to "spark/benchmarks/CometCastBenchmark-**results.txt".
- */
-
-object CometCastBenchmark extends CometBenchmarkBase {
-
-  override def getSparkSession: SparkSession = {
-    val session = super.getSparkSession
-    session.conf.set("parquet.enable.dictionary", "false")
-    session.conf.set("spark.sql.shuffle.partitions", "2")
-    session
-  }
-
-  def castExprSQL(toDataType: DataType, input: String): String = {
-    s"CAST ($input AS ${toDataType.sql})"
-  }
-
-  override def runCometBenchmark(args: Array[String]): Unit = {
-
-    // TODO : Create all possible input datatypes. We only have Long inputs for now
-    CometCast.supportedTypes.foreach { toDataType =>
-      Seq(false, true).foreach { ansiMode =>
-        CometCast.isSupported(
-          LongType,
-          toDataType,
-          None,
-          if (ansiMode) CometEvalMode.ANSI else CometEvalMode.LEGACY) match {
-          case Compatible(notes) =>
-            runBenchmarkWithTable(
-              s"Running benchmark cast operation from : $LongType to : $toDataType",
-              1024 * 1024) { v => // 1M rows
-              castBenchmark(v, LongType, toDataType, isAnsiMode = ansiMode)
-            }
-          case Incompatible(notes) => None
-          case Unsupported(notes) => None
-        }
-      }
-    }
-  }
-
-  def castBenchmark(
-      values: Int,
-      fromDataType: DataType,
-      toDataType: DataType,
-      isAnsiMode: Boolean): Unit = {
-
-    withTempPath { dir =>
-      withTempTable("parquetV1Table") {
-        prepareTable(dir, spark.sql(s"SELECT value FROM $tbl"))
-
-        val functionSQL = castExprSQL(toDataType, "value")
-        val query = s"SELECT $functionSQL FROM parquetV1Table"
-        val name =
-          s"Cast function to : ${toDataType} , ansi mode enabled : ${isAnsiMode}"
-
-        val extraConfigs = Map(SQLConf.ANSI_ENABLED.key -> isAnsiMode.toString)
-
-        runExpressionBenchmark(name, values, query, extraConfigs)
-      }
-    }
-  }
-
-}
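Unlike the new per-category suites, the removed benchmark derived its coverage from Comet's cast compatibility matrix instead of hard-coding cast pairs. For reference, its dispatch reduces to the following sketch (the API names and case-class arities are copied from the deleted code above; the filter form and the SupportedCastsFromLong wrapper are an illustrative rewrite, not code from the repository):

    package org.apache.spark.sql.benchmark

    import org.apache.spark.sql.types.LongType

    import org.apache.comet.expressions.{CometCast, CometEvalMode}
    import org.apache.comet.serde.{Compatible, Incompatible, Unsupported}

    object SupportedCastsFromLong {
      // All target types Comet reports as compatible when casting a LONG
      // column in LEGACY eval mode; mirrors the match in the deleted file.
      val targets = CometCast.supportedTypes.filter { toDataType =>
        CometCast.isSupported(LongType, toDataType, None, CometEvalMode.LEGACY) match {
          case Compatible(_) => true
          case Incompatible(_) | Unsupported(_) => false
        }
      }
    }

Driving coverage from the matrix guaranteed every supported cast was exercised, at the cost of only ever benchmarking LONG inputs; the replacement suites trade that generality for realistic per-type input data.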
From 6869f79709dff276c22154a4ba0201700db5ba19 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 2 Jan 2026 15:14:21 -0700
Subject: [PATCH 7/8] skip failing suite

---
 .github/workflows/spark_sql_test.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml
index 1ff6fa952c..2fe5fefe1a 100644
--- a/.github/workflows/spark_sql_test.yml
+++ b/.github/workflows/spark_sql_test.yml
@@ -65,6 +65,10 @@
           - {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
           - {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
           - {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
+        # Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
+        exclude:
+          - spark-version: {short: '4.0', full: '4.0.1', java: 17}
+            module: {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
       fail-fast: false
     name: spark-sql-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.spark-version.java }}
     runs-on: ${{ matrix.os }}
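The final patch below replaces the loose "varied data" comments with precise distribution notes. The generation queries themselves are truncated in this excerpt (each hunk ends at spark.sql(s""")), so purely as an illustration of how such a distribution can be produced — the SQL here is hypothetical, not the repository's actual query — a "1% NULL, 50/50 true/false" boolean column could be built from the base table's value column like this, inside a CometBenchmarkBase suite where dir, spark, and tbl are in scope:

    // Hypothetical sketch only; the real generation SQL is elided in the hunks below.
    prepareTable(
      dir,
      spark.sql(s"""
        SELECT
          CASE
            WHEN rand() < 0.01 THEN CAST(NULL AS BOOLEAN) -- ~1% NULL
            ELSE value % 2 = 0                            -- ~50/50 true/false
          END AS c_bool
        FROM $tbl"""))

Documenting the distribution next to the generation query matters for benchmarks: NULL density and special values such as NaN or Infinity change which code paths a cast kernel takes, so results are only comparable across runs when the input shape is pinned down.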
From fc8b30df5235ebac74565876371e455caad946f6 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Tue, 6 Jan 2026 11:42:26 -0700
Subject: [PATCH 8/8] address feedback

---
 .../spark/sql/benchmark/CometCastBooleanBenchmark.scala   | 2 ++
 .../benchmark/CometCastNumericToNumericBenchmark.scala    | 8 +++++++-
 .../sql/benchmark/CometCastNumericToStringBenchmark.scala | 8 +++++++-
 .../benchmark/CometCastNumericToTemporalBenchmark.scala   | 6 ++----
 .../sql/benchmark/CometCastStringToNumericBenchmark.scala | 7 +++++--
 .../benchmark/CometCastStringToTemporalBenchmark.scala    | 2 ++
 .../benchmark/CometCastTemporalToNumericBenchmark.scala   | 2 ++
 .../benchmark/CometCastTemporalToStringBenchmark.scala    | 2 ++
 .../benchmark/CometCastTemporalToTemporalBenchmark.scala  | 2 ++
 9 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala
index 6a81cce20e..57b8e88a7b 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala
@@ -77,6 +77,7 @@ object CometCastBooleanBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Boolean to other types casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, 50/50 true/false
           prepareTable(
             dir,
             spark.sql(s"""
@@ -98,6 +99,7 @@ object CometCastBooleanBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Numeric to Boolean casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL per column, values in {-1, 0, 1} (~33% each)
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala
index e7adf370cb..a9ea19a0e9 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala
@@ -98,7 +98,13 @@ object CometCastNumericToNumericBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Numeric to Numeric casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
-          // Generate varied numeric data including edge cases
+          // Data distribution: 1% NULL per column
+          // - c_byte: full range -64 to 63
+          // - c_short: full range -16384 to 16383
+          // - c_int: centered around 0 (-2.5M to +2.5M)
+          // - c_long: large positive values (0 to ~5 billion)
+          // - c_float/c_double: 4% special values (NaN/Infinity), rest centered around 0
+          // - c_decimal: values from -25000.00 to +25000.00
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala
index 00fba11392..1fd2138c58 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala
@@ -60,7 +60,13 @@ object CometCastNumericToStringBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Numeric to String casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
-          // Generate varied numeric data including edge cases
+          // Data distribution: 1% NULL per column
+          // - c_bool: 50/50 true/false
+          // - c_byte: full range -64 to 63
+          // - c_short: full range -16384 to 16383
+          // - c_int/c_long: large values centered around 0
+          // - c_float/c_double: 3% special values (NaN/Infinity), rest centered around 0
+          // - c_decimal: values from -25000.00 to +25000.00
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala
index 0545ab147c..ec2d9ab12f 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala
@@ -58,8 +58,7 @@ object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Int to Date casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
-          // Generate INT values representing days since epoch (1970-01-01)
-          // Range: ~-18000 to +18000 days (roughly 1920 to 2020)
+          // Data distribution: 1% NULL, days since epoch spanning ~100 years (1920-2020)
           prepareTable(
             dir,
             spark.sql(s"""
@@ -81,8 +80,7 @@ object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Long to Timestamp casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
-          // Generate LONG values representing microseconds since epoch
-          // Range: 2020-2021 timestamps
+          // Data distribution: 1% NULL, microseconds since epoch spanning ~1 year from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala
index 7f210fc730..c71eadad8c 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala
@@ -68,8 +68,11 @@ object CometCastStringToNumericBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("String to numeric casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
-          // Generate numeric strings with both integer and decimal values
-          // Also include some special values: nulls (~2%), NaN (~2%), Infinity (~2%)
+          // Data distribution:
+          // - 2% NULL, 2% 'NaN', 2% 'Infinity', 2% '-Infinity'
+          // - 12% small integers (0-98)
+          // - 40% medium integers (0-999,998)
+          // - 40% decimals centered around 0 (approx -5000.00 to +5000.00)
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
index 79856f0f47..77cc009ae1 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
@@ -58,6 +58,7 @@ object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("date data generation", values) { v =>
       withTempPath { dateDir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 10% invalid strings, 90% valid date strings spanning ~10 years
           prepareTable(
             dateDir,
             spark.sql(s"""
@@ -80,6 +81,7 @@ object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("timestamp data generation", values) { v =>
       withTempPath { timestampDir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 10% invalid strings, 90% valid timestamp strings (1970 epoch range)
           prepareTable(
             timestampDir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala
index d858c76a26..1468cbe086 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala
@@ -62,6 +62,7 @@ object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Date to Numeric casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""
@@ -83,6 +84,7 @@ object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Timestamp to Numeric casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala
index 5d21bbffef..1ef3e7711d 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala
@@ -55,6 +55,7 @@ object CometCastTemporalToStringBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Date to String casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""
@@ -76,6 +77,7 @@ object CometCastTemporalToStringBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Timestamp to String casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala
index df675f6cb8..f2e2572487 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala
@@ -57,6 +57,7 @@ object CometCastTemporalToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Date to Timestamp casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""
@@ -78,6 +79,7 @@ object CometCastTemporalToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("Timestamp to Date casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
           prepareTable(
             dir,
             spark.sql(s"""