From 9e018bf5e4882e31563d74f56faa6639047af876 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 30 Jan 2026 05:25:33 +0000 Subject: [PATCH 1/4] Merge pull request #95476 from scanhex12/enable_prewhere_for_iceberg enable prewhere for iceberg --- .../DataLakes/DataLakeConfiguration.h | 9 +++++ .../ObjectStorage/StorageObjectStorage.cpp | 2 +- .../StorageObjectStorageConfiguration.h | 5 +++ .../StorageObjectStorageSource.cpp | 2 +- .../test_minmax_pruning.py | 36 +++++++++++-------- 5 files changed, 37 insertions(+), 17 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index b53ff9d8dcf8..9ba5d0562469 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -427,6 +427,15 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl return res; } + bool supportsPrewhere() const override + { +#if USE_AVRO + return std::is_same_v; +#else + return false; +#endif + } + private: const DataLakeStorageSettingsPtr settings; ObjectStoragePtr ready_object_storage; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 7fe3ce25debe..b7a3260bc791 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -240,7 +240,7 @@ StorageObjectStorage::StorageObjectStorage( /// There's probably no reason for this, and it should just copy those fields like the others. /// * If the table contains files in different formats, with only some of them supporting /// prewhere, things break. - supports_prewhere = !configuration->isDataLakeConfiguration() && format_supports_prewhere; + supports_prewhere = configuration->supportsPrewhere() && format_supports_prewhere; supports_tuple_elements = format_supports_prewhere; StorageInMemoryMetadata metadata; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h index 935171504c47..3f3129a9d3c4 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h @@ -306,6 +306,11 @@ class StorageObjectStorageConfiguration return false; } + virtual bool supportsPrewhere() const + { + return true; + } + virtual void drop(ContextPtr) {} virtual bool isClusterSupported() const { return true; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index d3ff0c5a6906..b07deb3a472b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -853,7 +853,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade auto mapper = configuration->getColumnMapperForObject(object_info); if (!mapper) return format_filter_info; - return std::make_shared(format_filter_info->filter_actions_dag, format_filter_info->context.lock(), mapper, nullptr, nullptr); + return std::make_shared(format_filter_info->filter_actions_dag, format_filter_info->context.lock(), mapper, format_filter_info->row_level_filter, format_filter_info->prewhere_info); }(); LOG_DEBUG( diff --git a/tests/integration/test_storage_iceberg_with_spark/test_minmax_pruning.py b/tests/integration/test_storage_iceberg_with_spark/test_minmax_pruning.py index 859a31ffa1eb..c62e6029b851 100644 --- a/tests/integration/test_storage_iceberg_with_spark/test_minmax_pruning.py +++ b/tests/integration/test_storage_iceberg_with_spark/test_minmax_pruning.py @@ -10,7 +10,8 @@ @pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) @pytest.mark.parametrize("is_table_function", [False, True]) -def test_minmax_pruning(started_cluster_iceberg_with_spark, storage_type, is_table_function): +@pytest.mark.parametrize("use_prewhere", [False, True]) +def test_minmax_pruning(started_cluster_iceberg_with_spark, storage_type, is_table_function, use_prewhere): instance = started_cluster_iceberg_with_spark.instances["node1"] spark = started_cluster_iceberg_with_spark.spark_session TABLE_NAME = "test_minmax_pruning_" + storage_type + "_" + get_uuid_str() @@ -95,6 +96,11 @@ def check_validity_and_get_prunned_files(select_expression): instance, TABLE_NAME, settings1, settings2, 'IcebergMinMaxIndexPrunedFiles', select_expression ) + + where_condition = "WHERE" + if use_prewhere and not is_table_function: + where_condition = "PREWHERE" + assert ( check_validity_and_get_prunned_files( f"SELECT * FROM {creation_expression} ORDER BY ALL" @@ -103,55 +109,55 @@ def check_validity_and_get_prunned_files(select_expression): ) assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE date <= '2024-01-25' ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} date <= '2024-01-25' ORDER BY ALL" ) == 3 ) assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE ts <= timestamp('2024-03-20 14:00:00.000000') ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} ts <= timestamp('2024-03-20 14:00:00.000000') ORDER BY ALL" ) == 3 ) assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE tag == 1 ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} tag == 1 ORDER BY ALL" ) == 3 ) assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE tag <= 1 ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} tag <= 1 ORDER BY ALL" ) == 3 ) assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE name == 'vasilisa' ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} name == 'vasilisa' ORDER BY ALL" ) == 3 ) assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE name < 'kek' ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} name < 'kek' ORDER BY ALL" ) == 2 ) assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE number == 8 ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} number == 8 ORDER BY ALL" ) == 3 ) assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE number <= 5 ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} number <= 5 ORDER BY ALL" ) == 3 ) @@ -163,7 +169,7 @@ def check_validity_and_get_prunned_files(select_expression): assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE date3 <= '2024-01-25' ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} date3 <= '2024-01-25' ORDER BY ALL" ) == 3 ) @@ -172,14 +178,14 @@ def check_validity_and_get_prunned_files(select_expression): assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE tag <= 1 ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} tag <= 1 ORDER BY ALL" ) == 3 ) assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE time_struct.a <= '2024-02-01' ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} time_struct.a <= '2024-02-01' ORDER BY ALL" ) == 3 ) @@ -190,7 +196,7 @@ def check_validity_and_get_prunned_files(select_expression): assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE time_struct.a <= '2024-02-01' ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} time_struct.a <= '2024-02-01' ORDER BY ALL" ) == 4 ) @@ -211,7 +217,7 @@ def check_validity_and_get_prunned_files(select_expression): assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE ddd >= 100 ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} ddd >= 100 ORDER BY ALL" ) == 2 ) @@ -219,7 +225,7 @@ def check_validity_and_get_prunned_files(select_expression): # Please check the code where we parse lower bounds and upper bounds assert ( check_validity_and_get_prunned_files( - f"SELECT * FROM {creation_expression} WHERE ddd >= toDecimal64('17.21', 3) ORDER BY ALL" + f"SELECT * FROM {creation_expression} {where_condition} ddd >= toDecimal64('17.21', 3) ORDER BY ALL" ) == 1 ) \ No newline at end of file From 33cd9a00dde18ff69bdb123c967043545a2680c2 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Thu, 26 Mar 2026 09:27:44 +0100 Subject: [PATCH 2/4] DataLakeConfiguration: delegate supportsPrewhere() as well --- src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index 9ba5d0562469..afbe64d1a325 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -608,6 +608,7 @@ class StorageIcebergConfiguration : public StorageObjectStorageConfiguration, pu bool supportsFileIterator() const override { return getImpl().supportsFileIterator(); } bool supportsParallelInsert() const override { return getImpl().supportsParallelInsert(); } bool supportsWrites() const override { return getImpl().supportsWrites(); } + bool supportsPrewhere() const override { return getImpl().supportsPrewhere(); } bool supportsPartialPathPrefix() const override { return getImpl().supportsPartialPathPrefix(); } From 0bb4fa3ba5f85280b60906fc34fe8c99c11fcd20 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 1 Mar 2026 18:20:30 +0000 Subject: [PATCH 3/4] Merge pull request #98360 from ClickHouse/fix-parquet-prewhere-external-columns Fix exception in Parquet PREWHERE when column is not in file --- .../Formats/Impl/Parquet/Reader.cpp | 5 ++-- ..._parquet_prewhere_missing_column.reference | 10 ++++++++ .../04001_parquet_prewhere_missing_column.sql | 25 +++++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/04001_parquet_prewhere_missing_column.reference create mode 100644 tests/queries/0_stateless/04001_parquet_prewhere_missing_column.sql diff --git a/src/Processors/Formats/Impl/Parquet/Reader.cpp b/src/Processors/Formats/Impl/Parquet/Reader.cpp index 938b3ba077fc..ff9ddf1cda98 100644 --- a/src/Processors/Formats/Impl/Parquet/Reader.cpp +++ b/src/Processors/Formats/Impl/Parquet/Reader.cpp @@ -307,7 +307,7 @@ void Reader::prefilterAndInitRowGroups(const std::optionalhas(node->result_name)) + if (node->type != ActionsDAG::ActionType::INPUT && sample_block->has(node->result_name)) schemer.external_columns.push_back(node->result_name); }; if (row_level_filter) @@ -688,7 +688,8 @@ void Reader::preparePrewhere() else { if (!prewhere_output_column_idxs.contains(idx_in_output_block)) - throw Exception(ErrorCodes::LOGICAL_ERROR, "PREWHERE appears to use its own output as input"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "PREWHERE appears to use its own output as input: column '{}' (idx {})", + col.name, idx_in_output_block); } step.input_idxs.push_back(idx_in_output_block); } diff --git a/tests/queries/0_stateless/04001_parquet_prewhere_missing_column.reference b/tests/queries/0_stateless/04001_parquet_prewhere_missing_column.reference new file mode 100644 index 000000000000..7f1bc308d222 --- /dev/null +++ b/tests/queries/0_stateless/04001_parquet_prewhere_missing_column.reference @@ -0,0 +1,10 @@ +0 0 +1 0 +2 0 +3 0 +4 0 +5 0 +6 0 +7 0 +8 0 +9 0 diff --git a/tests/queries/0_stateless/04001_parquet_prewhere_missing_column.sql b/tests/queries/0_stateless/04001_parquet_prewhere_missing_column.sql new file mode 100644 index 000000000000..140f5c774c56 --- /dev/null +++ b/tests/queries/0_stateless/04001_parquet_prewhere_missing_column.sql @@ -0,0 +1,25 @@ +-- Tags: no-fasttest + +-- Regression test: PREWHERE on a Parquet file with a column declared in the schema +-- but not present in the actual file. When the missing column appears both in the +-- PREWHERE condition and in the SELECT list, it stays in the format header as a +-- pass-through INPUT node in the prewhere DAG. The `add_prewhere_outputs` lambda +-- incorrectly added such INPUT nodes to `external_columns`, which prevented the +-- SchemaConverter from creating a missing-column entry. This caused "KeyCondition +-- uses PREWHERE output" or "PREWHERE appears to use its own output as input" +-- exceptions (LOGICAL_ERROR). + +SET input_format_parquet_use_native_reader_v3 = 1; +SET input_format_parquet_allow_missing_columns = 1; +SET engine_file_truncate_on_insert = 1; + +INSERT INTO FUNCTION file(currentDatabase() || '_04001.parquet') + SELECT number FROM numbers(10); + +-- `extra` is not in the Parquet file; it defaults to 0 with allow_missing_columns. +-- The SELECT uses both `number` and `extra`, so `extra` remains in format_header as +-- a pass-through. Without the fix, `add_prewhere_outputs` incorrectly added `extra` +-- to `external_columns`, preventing SchemaConverter from creating a missing column. +SELECT number, extra FROM file(currentDatabase() || '_04001.parquet', Parquet, 'number UInt64, extra UInt64') + PREWHERE extra = 0 + ORDER BY number; From 33a1a86c964be62474fdff4695ba93dc23a06375 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 29 Mar 2026 18:01:37 +0200 Subject: [PATCH 4/4] Merge pull request #100361 from ClickHouse/fix-prewhere-info-assertion Fix exception in `updateFormatPrewhereInfo` when only row-level filter is set --- src/Storages/prepareReadingFromFormat.cpp | 10 ++-- .../04053_row_policy_object_storage.reference | 7 +++ .../04053_row_policy_object_storage.sh | 46 +++++++++++++++++++ 3 files changed, 60 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/04053_row_policy_object_storage.reference create mode 100755 tests/queries/0_stateless/04053_row_policy_object_storage.sh diff --git a/src/Storages/prepareReadingFromFormat.cpp b/src/Storages/prepareReadingFromFormat.cpp index 5d6cc8b5afb5..8e860110a7c5 100644 --- a/src/Storages/prepareReadingFromFormat.cpp +++ b/src/Storages/prepareReadingFromFormat.cpp @@ -258,19 +258,23 @@ Names filterTupleColumnsToRead(NamesAndTypesList & requested_columns) ReadFromFormatInfo updateFormatPrewhereInfo(const ReadFromFormatInfo & info, const FilterDAGInfoPtr & row_level_filter, const PrewhereInfoPtr & prewhere_info) { - chassert(prewhere_info); + chassert(prewhere_info || row_level_filter); - if (info.prewhere_info || info.row_level_filter) + if (info.prewhere_info) throw Exception(ErrorCodes::LOGICAL_ERROR, "updateFormatPrewhereInfo called more than once"); ReadFromFormatInfo new_info; new_info.prewhere_info = prewhere_info; + new_info.row_level_filter = row_level_filter; /// Removes columns that are only used as prewhere input. /// Adds prewhere outputs (the actual prewhere filter column is only added if /// !remove_prewhere_column; but there may also be subexpressions computed by prewhere /// expression and preserved for use further down the query pipeline). - new_info.format_header = SourceStepWithFilter::applyPrewhereActions(info.format_header, row_level_filter, prewhere_info); + /// If row_level_filter was already applied in a previous call, don't re-apply it; + /// only apply the new prewhere_info on top. + new_info.format_header = SourceStepWithFilter::applyPrewhereActions( + info.format_header, info.row_level_filter ? nullptr : row_level_filter, prewhere_info); /// We assume that any format that supports prewhere also supports subset of subcolumns, so we /// don't need to replace subcolumns with their nested columns etc. diff --git a/tests/queries/0_stateless/04053_row_policy_object_storage.reference b/tests/queries/0_stateless/04053_row_policy_object_storage.reference new file mode 100644 index 000000000000..7f4f9bef5bd9 --- /dev/null +++ b/tests/queries/0_stateless/04053_row_policy_object_storage.reference @@ -0,0 +1,7 @@ +--- Row policy filters URL Parquet table --- +1 a +2 b +--- Row policy with WHERE on URL Parquet table --- +1 a +--- Row policy count on URL Parquet table --- +2 diff --git a/tests/queries/0_stateless/04053_row_policy_object_storage.sh b/tests/queries/0_stateless/04053_row_policy_object_storage.sh new file mode 100755 index 000000000000..e66ff60e8ad6 --- /dev/null +++ b/tests/queries/0_stateless/04053_row_policy_object_storage.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# Tags: no-replicated-database, no-fasttest + +# Regression test: row policy on a URL table with Parquet format caused +# "Logical error: 'prewhere_info'" because updateFormatPrewhereInfo asserted +# prewhere_info was non-null, but only row_level_filter was set. + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +user="user04053_${CLICKHOUSE_DATABASE}_$RANDOM" +db=${CLICKHOUSE_DATABASE} + +${CLICKHOUSE_CLIENT} <