From aad8b491bbff14f31cba5eca9d8ed7d29dd3b55d Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Wed, 26 Nov 2025 09:36:15 +0800 Subject: [PATCH 1/3] feat: add manifest evaluator --- src/iceberg/CMakeLists.txt | 1 + src/iceberg/expression/manifest_evaluator.cc | 391 ++++++++++++++++ src/iceberg/expression/manifest_evaluator.h | 82 ++++ src/iceberg/expression/meson.build | 1 + src/iceberg/expression/term.h | 2 + src/iceberg/meson.build | 1 + src/iceberg/row/struct_like.cc | 2 +- src/iceberg/row/struct_like.h | 4 + src/iceberg/test/CMakeLists.txt | 1 + src/iceberg/test/manifest_evaluator_test.cc | 462 +++++++++++++++++++ src/iceberg/test/meson.build | 1 + 11 files changed, 947 insertions(+), 1 deletion(-) create mode 100644 src/iceberg/expression/manifest_evaluator.cc create mode 100644 src/iceberg/expression/manifest_evaluator.h create mode 100644 src/iceberg/test/manifest_evaluator_test.cc diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 275d71fce..f62362efc 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -27,6 +27,7 @@ set(ICEBERG_SOURCES expression/expressions.cc expression/inclusive_metrics_evaluator.cc expression/literal.cc + expression/manifest_evaluator.cc expression/predicate.cc expression/rewrite_not.cc expression/strict_metrics_evaluator.cc diff --git a/src/iceberg/expression/manifest_evaluator.cc b/src/iceberg/expression/manifest_evaluator.cc new file mode 100644 index 000000000..d4c177821 --- /dev/null +++ b/src/iceberg/expression/manifest_evaluator.cc @@ -0,0 +1,391 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/expression/manifest_evaluator.h" + +#include "iceberg/expression/binder.h" +#include "iceberg/expression/expression_visitor.h" +#include "iceberg/expression/rewrite_not.h" +#include "iceberg/manifest/manifest_list.h" +#include "iceberg/row/struct_like.h" +#include "iceberg/schema.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +namespace { +constexpr bool kRowsMightMatch = true; +constexpr bool kRowCannotMatch = false; +constexpr int32_t kInPredicateLimit = 200; +} // namespace + +class ManifestEvalVisitor : public BoundVisitor { + public: + explicit ManifestEvalVisitor(const ManifestFile& manifest) + : stats_(manifest.partitions) {} + + Result AlwaysTrue() override { return true; } + + Result AlwaysFalse() override { return false; } + + Result Not(bool child_result) override { return !child_result; } + + Result And(bool left_result, bool right_result) override { + return left_result && right_result; + } + + Result Or(bool left_result, bool right_result) override { + return left_result || right_result; + } + + Result IsNull(const std::shared_ptr& expr) override { + // no need to check whether the field is required because binding evaluates that case + // if the column has no null values, the expression cannot match + ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); + if (!stats_.at(pos).contains_null) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result NotNull(const std::shared_ptr& expr) override { + ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); + if (AllValuesAreNull(stats_.at(pos), ref->type()->type_id())) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result IsNaN(const std::shared_ptr& expr) override { + ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); + if (stats_.at(pos).contains_nan.has_value() && !stats_.at(pos).contains_nan.value()) { + return kRowCannotMatch; + } + if (AllValuesAreNull(stats_.at(pos), ref->type()->type_id())) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result NotNaN(const std::shared_ptr& expr) override { + ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); + const auto& summary = stats_.at(pos); + // if containsNaN is true, containsNull is false and lowerBound is null, all values + // are NaN + if (summary.contains_nan.has_value() && summary.contains_nan.value() && + !summary.contains_null && !summary.lower_bound.has_value()) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result Lt(const std::shared_ptr& expr, const Literal& lit) override { + ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); + const auto& summary = stats_.at(pos); + if (!summary.lower_bound.has_value()) { + return kRowCannotMatch; // values are all null + } + ICEBERG_ASSIGN_OR_RAISE( + auto lower, DeserializeBoundLiteral(summary.lower_bound.value(), ref->type())); + if (lower >= lit) { + return kRowCannotMatch; + } + return kRowsMightMatch; + } + + Result LtEq(const std::shared_ptr& expr, const Literal& lit) override { + ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); + const auto& summary = stats_.at(pos); + if (!summary.lower_bound.has_value()) { + return kRowCannotMatch; // values are all null + } + ICEBERG_ASSIGN_OR_RAISE( + auto lower, DeserializeBoundLiteral(summary.lower_bound.value(), ref->type())); + if (lower > lit) { + return kRowCannotMatch; + } + return kRowsMightMatch; + } + + Result Gt(const std::shared_ptr& expr, const Literal& lit) override { + ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); + const auto& summary = stats_.at(pos); + if (!summary.upper_bound.has_value()) { + return kRowCannotMatch; // values are all null + } + ICEBERG_ASSIGN_OR_RAISE( + auto upper, DeserializeBoundLiteral(summary.upper_bound.value(), ref->type())); + if (upper <= lit) { + return kRowCannotMatch; + } + return kRowsMightMatch; + } + + Result GtEq(const std::shared_ptr& expr, const Literal& lit) override { + ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); + const auto& summary = stats_.at(pos); + if (!summary.upper_bound.has_value()) { + return kRowCannotMatch; // values are all null + } + ICEBERG_ASSIGN_OR_RAISE( + auto upper, DeserializeBoundLiteral(summary.upper_bound.value(), ref->type())); + if (upper < lit) { + return kRowCannotMatch; + } + return kRowsMightMatch; + } + + Result Eq(const std::shared_ptr& expr, const Literal& lit) override { + ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); + const auto& summary = stats_.at(pos); + if (!summary.lower_bound.has_value() || !summary.upper_bound.has_value()) { + return kRowCannotMatch; // values are all null and literal cannot contain null + } + ICEBERG_ASSIGN_OR_RAISE( + auto lower, DeserializeBoundLiteral(summary.lower_bound.value(), ref->type())); + if (lower > lit) { + return kRowCannotMatch; + } + + ICEBERG_ASSIGN_OR_RAISE( + auto upper, DeserializeBoundLiteral(summary.upper_bound.value(), ref->type())); + if (upper < lit) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result NotEq(const std::shared_ptr& expr, const Literal& lit) override { + // because the bounds are not necessarily a min or max value, this cannot be answered + // using them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col. + return kRowsMightMatch; + } + + Result In(const std::shared_ptr& expr, + const BoundSetPredicate::LiteralSet& literal_set) override { + ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); + const auto& summary = stats_.at(pos); + if (!summary.lower_bound.has_value() || !summary.upper_bound.has_value()) { + // values are all null and literalSet cannot contain null. + return kRowCannotMatch; + } + if (literal_set.size() > kInPredicateLimit) { + // skip evaluating the predicate if the number of values is too big + return kRowsMightMatch; + } + + ICEBERG_ASSIGN_OR_RAISE( + auto lower, DeserializeBoundLiteral(summary.lower_bound.value(), ref->type())); + ICEBERG_ASSIGN_OR_RAISE( + auto upper, DeserializeBoundLiteral(summary.upper_bound.value(), ref->type())); + auto literals_view = literal_set | std::views::filter([&](const Literal& lit) { + return lower <= lit && lit <= upper; + }); + + if (literals_view.empty()) { + // if all values are less than lower bound, rows cannot match. + // if all values are greater than upper bound, rows cannot match. + return kRowCannotMatch; + } + return kRowsMightMatch; + } + + Result NotIn(const std::shared_ptr& expr, + const BoundSetPredicate::LiteralSet& literal_set) override { + // because the bounds are not necessarily a min or max value, this cannot be answered + // using them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in + // col. + return kRowsMightMatch; + } + + Result StartsWith(const std::shared_ptr& expr, + const Literal& lit) override { + ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); + const auto& summary = stats_.at(pos); + if (!summary.lower_bound.has_value() || !summary.upper_bound.has_value()) { + return kRowCannotMatch; + } + if (lit.type()->type_id() != TypeId::kString) { + return Invalid("Invalid literal: not a string, cannot use StartsWith"); + } + const auto& prefix = std::get(lit.value()); + ICEBERG_ASSIGN_OR_RAISE( + auto lower, DeserializeBoundLiteral(summary.lower_bound.value(), ref->type())); + ICEBERG_ASSIGN_OR_RAISE( + auto upper, DeserializeBoundLiteral(summary.upper_bound.value(), ref->type())); + const auto& lower_bound = std::get(lower.value()); + const auto& upper_bound = std::get(upper.value()); + // truncate lower bound so that its length in bytes is not greater than the length of + // prefix + size_t length = std::min(prefix.size(), lower_bound.size()); + if (lower_bound.substr(0, length) > prefix) { + return kRowCannotMatch; + } + length = std::min(prefix.size(), upper_bound.size()); + if (upper_bound.substr(0, length) < prefix) { + return kRowCannotMatch; + } + return kRowsMightMatch; + } + + Result NotStartsWith(const std::shared_ptr& expr, + const Literal& lit) override { + ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); + const auto& summary = stats_.at(pos); + if (summary.contains_null || !summary.lower_bound.has_value() || + !summary.upper_bound.has_value()) { + return kRowsMightMatch; + } + if (lit.type()->type_id() != TypeId::kString) { + return Invalid("Invalid literal: not a string, cannot use notStartsWith"); + } + // notStartsWith will match unless all values must start with the prefix. This happens + // when the lower and upper bounds both start with the prefix. + const auto& prefix = std::get(lit.value()); + ICEBERG_ASSIGN_OR_RAISE( + auto lower, DeserializeBoundLiteral(summary.lower_bound.value(), ref->type())); + ICEBERG_ASSIGN_OR_RAISE( + auto upper, DeserializeBoundLiteral(summary.upper_bound.value(), ref->type())); + const auto& lower_bound = std::get(lower.value()); + const auto& upper_bound = std::get(upper.value()); + + // if lower is shorter than the prefix, it can't start with the prefix + if (lower_bound.size() < prefix.size()) { + return kRowsMightMatch; + } + if (lower_bound.starts_with(prefix)) { + // the lower bound starts with the prefix; check the upper bound + // if upper is shorter than the prefix, it can't start with the prefix + if (upper_bound.size() < prefix.size()) { + return kRowsMightMatch; + } + // truncate upper bound so that its length in bytes is not greater than the length + // of prefix + if (upper_bound.starts_with(prefix)) { + return kRowCannotMatch; + } + } + return kRowsMightMatch; + } + + private: + Result ParseBoundReference(const std::shared_ptr& expr) const { + auto ref = dynamic_cast(expr.get()); + if (ref == nullptr) { + return Invalid("Invalid expression: not a BoundReference"); + } + return ref; + } + + Result GetPosition(const BoundReference& ref) const { + const auto& accessor = ref.accessor(); + const auto& position_path = accessor.position_path(); + if (position_path.empty()) { + return Invalid("Invalid accessor: empty position path."); + } + // nested accessors are not supported for partition fields + if (position_path.size() > 1) { + return Invalid("Cannot convert nested accessor to position"); + } + auto pos = position_path.at(0); + if (pos >= stats_.size()) { + return Invalid("Invalid position: out of partition filed range."); + } + return pos; + } + + bool AllValuesAreNull(const PartitionFieldSummary& summary, TypeId typeId) { + // containsNull encodes whether at least one partition value is null, + // lowerBound is null if all partition values are null + bool allNull = summary.contains_null && !summary.lower_bound.has_value(); + + if (allNull && (typeId == TypeId::kDouble || typeId == TypeId::kFloat)) { + // floating point types may include NaN values, which we check separately. + // In case bounds don't include NaN value, containsNaN needs to be checked against. + allNull = summary.contains_nan.has_value() && !summary.contains_nan.value(); + } + return allNull; + } + + Result DeserializeBoundLiteral(const std::vector& bound, + const std::shared_ptr& type) const { + if (!type->is_primitive()) { + return NotSupported("Bounds of non-primitive partition fields are not supported."); + } + auto primitive_type = internal::checked_pointer_cast(type); + return Literal::Deserialize(bound, primitive_type); + } + + private: + const std::vector& stats_; +}; + +ManifestEvaluator::ManifestEvaluator(std::shared_ptr expr) + : expr_(std::move(expr)) {} + +ManifestEvaluator::~ManifestEvaluator() = default; + +Result> ManifestEvaluator::MakeRowFilter( + [[maybe_unused]] std::shared_ptr expr, + [[maybe_unused]] const std::shared_ptr& spec, + [[maybe_unused]] const Schema& schema, [[maybe_unused]] bool case_sensitive) { + // TODO(xiao.dong) we need a projection util to project row filter to the partition col + return NotImplemented("ManifestEvaluator::MakeRowFilter"); +} + +Result> ManifestEvaluator::MakePartitionFilter( + std::shared_ptr expr, const std::shared_ptr& spec, + const Schema& schema, bool case_sensitive) { + ICEBERG_ASSIGN_OR_RAISE(auto partition_type, spec->PartitionType(schema)); + auto field_span = partition_type->fields(); + std::vector fields(field_span.begin(), field_span.end()); + auto partition_schema = std::make_shared(fields); + ICEBERG_ASSIGN_OR_RAISE(auto rewrite_expr, RewriteNot::Visit(std::move(expr))); + ICEBERG_ASSIGN_OR_RAISE(auto partition_expr, + Binder::Bind(*partition_schema, rewrite_expr, case_sensitive)); + return std::unique_ptr( + new ManifestEvaluator(std::move(partition_expr))); +} + +Result ManifestEvaluator::Evaluate(const ManifestFile& manifest) const { + if (manifest.partitions.empty()) { + return kRowsMightMatch; + } + ManifestEvalVisitor visitor(manifest); + return Visit(expr_, visitor); +} + +} // namespace iceberg diff --git a/src/iceberg/expression/manifest_evaluator.h b/src/iceberg/expression/manifest_evaluator.h new file mode 100644 index 000000000..ddc743dcd --- /dev/null +++ b/src/iceberg/expression/manifest_evaluator.h @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/expression/manifest_evaluator.h +/// +/// Evaluates an Expression on a ManifestFile to test whether the file contains +/// matching partitions. +/// +/// For row expressions, evaluation is inclusive: it returns true if a file +/// may match and false if it cannot match. +/// +/// Files are passed to #eval(ManifestFile), which returns true if the manifest may +/// contain data files that match the partition expression. Manifest files may be +/// skipped if and only if the return value of eval is false. +/// + +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief Evaluates an Expression against manifest. +/// \note: The evaluator is thread-safe. +class ICEBERG_EXPORT ManifestEvaluator { + public: + /// \brief Make a manifest evaluator for RowFilter + /// + /// \param expr The expression to evaluate + /// \param spec The partition spec + /// \param schema The schema of the table + /// \param case_sensitive Whether field name matching is case-sensitive + static Result> MakeRowFilter( + std::shared_ptr expr, const std::shared_ptr& spec, + const Schema& schema, bool case_sensitive = true); + + /// \brief Make a manifest evaluator for PartitionFilter + /// + /// \param expr The expression to evaluate + /// \param spec The partition spec + /// \param schema The schema of the table + /// \param case_sensitive Whether field name matching is case-sensitive + static Result> MakePartitionFilter( + std::shared_ptr expr, const std::shared_ptr& spec, + const Schema& schema, bool case_sensitive = true); + + ~ManifestEvaluator(); + + /// \brief Evaluate the expression against a manifest. + /// + /// \param manifest The manifest to evaluate + /// \return true if the row matches the expression, false otherwise, or error + Result Evaluate(const ManifestFile& manifest) const; + + private: + explicit ManifestEvaluator(std::shared_ptr expr); + + private: + std::shared_ptr expr_; +}; + +} // namespace iceberg diff --git a/src/iceberg/expression/meson.build b/src/iceberg/expression/meson.build index 8e312791b..b4e4ab02d 100644 --- a/src/iceberg/expression/meson.build +++ b/src/iceberg/expression/meson.build @@ -25,6 +25,7 @@ install_headers( 'expressions.h', 'inclusive_metrics_evaluator.h', 'literal.h', + 'manifest_evaluator.h', 'predicate.h', 'rewrite_not.h', 'strict_metrics_evaluator.h', diff --git a/src/iceberg/expression/term.h b/src/iceberg/expression/term.h index 616f11da6..5e834af51 100644 --- a/src/iceberg/expression/term.h +++ b/src/iceberg/expression/term.h @@ -157,6 +157,8 @@ class ICEBERG_EXPORT BoundReference Kind kind() const override { return Kind::kReference; } + const StructLikeAccessor& accessor() const { return *accessor_; } + private: BoundReference(SchemaField field, std::unique_ptr accessor); diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index c139c66b5..04aab6534 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -49,6 +49,7 @@ iceberg_sources = files( 'expression/expressions.cc', 'expression/inclusive_metrics_evaluator.cc', 'expression/literal.cc', + 'expression/manifest_evaluator.cc', 'expression/predicate.cc', 'expression/rewrite_not.cc', 'expression/strict_metrics_evaluator.cc', diff --git a/src/iceberg/row/struct_like.cc b/src/iceberg/row/struct_like.cc index 85bde1a69..24e61644f 100644 --- a/src/iceberg/row/struct_like.cc +++ b/src/iceberg/row/struct_like.cc @@ -70,7 +70,7 @@ Result LiteralToScalar(const Literal& literal) { StructLikeAccessor::StructLikeAccessor(std::shared_ptr type, std::span position_path) - : type_(std::move(type)) { + : type_(std::move(type)), position_path_(position_path.begin(), position_path.end()) { if (position_path.size() == 1) { accessor_ = [pos = position_path[0]](const StructLike& struct_like) -> Result { diff --git a/src/iceberg/row/struct_like.h b/src/iceberg/row/struct_like.h index 36ff5d86b..7b4bdc3be 100644 --- a/src/iceberg/row/struct_like.h +++ b/src/iceberg/row/struct_like.h @@ -121,9 +121,13 @@ class ICEBERG_EXPORT StructLikeAccessor { /// \brief Get the type of the value that this accessor is bound to. const Type& type() const { return *type_; } + /// \brief Get the position path of the value that this accessor bounded to. + const std::vector& position_path() const { return position_path_; } + private: std::shared_ptr type_; std::function(const StructLike&)> accessor_; + std::vector position_path_; }; } // namespace iceberg diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 9892e3d4f..e8da2e223 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -85,6 +85,7 @@ add_iceberg_test(expression_test expression_test.cc expression_visitor_test.cc literal_test.cc + manifest_evaluator_test.cc inclusive_metrics_evaluator_test.cc inclusive_metrics_evaluator_with_transform_test.cc predicate_test.cc diff --git a/src/iceberg/test/manifest_evaluator_test.cc b/src/iceberg/test/manifest_evaluator_test.cc new file mode 100644 index 000000000..4562e74ec --- /dev/null +++ b/src/iceberg/test/manifest_evaluator_test.cc @@ -0,0 +1,462 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/expression/manifest_evaluator.h" + +#include +#include +#include + +#include + +#include "iceberg/expression/expressions.h" +#include "iceberg/manifest/manifest_list.h" +#include "iceberg/partition_spec.h" +#include "iceberg/schema.h" +#include "iceberg/test/matchers.h" +#include "iceberg/transform.h" +#include "iceberg/type.h" + +namespace iceberg { + +class ManifestEvaluatorTest : public ::testing::Test { + protected: + static constexpr int32_t kIntMinValue = 30; + static constexpr int32_t kIntMaxValue = 79; + + void SetUp() override { + schema_ = std::make_shared( + std::vector{ + SchemaField::MakeRequired(1, "id", int32()), + SchemaField::MakeOptional(4, "all_nulls_missing_nan", string()), + SchemaField::MakeOptional(5, "some_nulls", string()), + SchemaField::MakeOptional(6, "no_nulls", string()), + SchemaField::MakeOptional(7, "float", float32()), + SchemaField::MakeOptional(8, "all_nulls_double", float64()), + SchemaField::MakeOptional(9, "all_nulls_no_nans", float32()), + SchemaField::MakeOptional(10, "all_nans", float64()), + SchemaField::MakeOptional(11, "both_nan_and_null", float32()), + SchemaField::MakeOptional(12, "no_nan_or_null", float64()), + SchemaField::MakeOptional(13, "all_nulls_missing_nan_float", float32()), + SchemaField::MakeOptional(14, "all_same_value_or_null", string()), + SchemaField::MakeOptional(15, "no_nulls_same_value_a", string()), + }, + 0); + + ICEBERG_UNWRAP_OR_FAIL(auto spec_result, + PartitionSpec::Make(/*spec_id=*/0, BuildIdentityFields())); + spec_ = std::shared_ptr(std::move(spec_result)); + + file_ = BuildManifestFile(); + no_stats_.manifest_path = "no-stats.avro"; + no_stats_.partition_spec_id = 0; + } + + std::vector BuildIdentityFields() { + std::vector fields; + int32_t partition_field_id = PartitionSpec::kLegacyPartitionDataIdStart; + auto add_field = [&](int32_t source_id, std::string name) { + fields.emplace_back(source_id, partition_field_id++, std::move(name), + Transform::Identity()); + }; + add_field(1, "id"); + add_field(4, "all_nulls_missing_nan"); + add_field(5, "some_nulls"); + add_field(6, "no_nulls"); + add_field(7, "float"); + add_field(8, "all_nulls_double"); + add_field(9, "all_nulls_no_nans"); + add_field(10, "all_nans"); + add_field(11, "both_nan_and_null"); + add_field(12, "no_nan_or_null"); + add_field(13, "all_nulls_missing_nan_float"); + add_field(14, "all_same_value_or_null"); + add_field(15, "no_nulls_same_value_a"); + return fields; + } + + PartitionFieldSummary MakeSummary(bool contains_null, std::optional contains_nan, + std::optional lower = std::nullopt, + std::optional upper = std::nullopt) { + PartitionFieldSummary summary; + summary.contains_null = contains_null; + summary.contains_nan = contains_nan; + if (lower.has_value()) { + summary.lower_bound = lower->Serialize().value(); + } + if (upper.has_value()) { + summary.upper_bound = upper->Serialize().value(); + } + return summary; + } + + ManifestFile BuildManifestFile() { + ManifestFile manifest; + manifest.manifest_path = "manifest-list.avro"; + manifest.manifest_length = 1024; + manifest.partition_spec_id = 0; + manifest.partitions = { + MakeSummary(/*contains_null=*/false, std::nullopt, Literal::Int(kIntMinValue), + Literal::Int(kIntMaxValue)), + MakeSummary(/*contains_null=*/true, std::nullopt, std::nullopt, std::nullopt), + MakeSummary(/*contains_null=*/true, std::nullopt, Literal::String("a"), + Literal::String("z")), + MakeSummary(/*contains_null=*/false, std::nullopt, Literal::String("a"), + Literal::String("z")), + MakeSummary(/*contains_null=*/false, std::nullopt, Literal::Float(0.0F), + Literal::Float(20.0F)), + MakeSummary(/*contains_null=*/true, std::nullopt, std::nullopt, std::nullopt), + MakeSummary(/*contains_null=*/true, /*contains_nan=*/false, std::nullopt, + std::nullopt), + MakeSummary(/*contains_null=*/false, /*contains_nan=*/true, std::nullopt, + std::nullopt), + MakeSummary(/*contains_null=*/true, /*contains_nan=*/true, std::nullopt, + std::nullopt), + MakeSummary(/*contains_null=*/false, /*contains_nan=*/false, Literal::Double(0.0), + Literal::Double(20.0)), + MakeSummary(/*contains_null=*/true, std::nullopt, std::nullopt, std::nullopt), + MakeSummary(/*contains_null=*/true, std::nullopt, Literal::String("a"), + Literal::String("a")), + MakeSummary(/*contains_null=*/false, std::nullopt, Literal::String("a"), + Literal::String("a")), + }; + return manifest; + } + + void ExpectEval(const std::shared_ptr& expr, bool expected, + const ManifestFile& manifest, bool case_sensitive = true) { + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, ManifestEvaluator::MakePartitionFilter( + expr, spec_, *schema_, case_sensitive)); + auto result = evaluator->Evaluate(manifest); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(expected, result.value()) << expr->ToString(); + } + + void ExpectEval(const std::shared_ptr& expr, bool expected, + bool case_sensitive = true) { + ExpectEval(expr, expected, file_, case_sensitive); + } + + std::shared_ptr schema_; + std::shared_ptr spec_; + ManifestFile file_; + ManifestFile no_stats_; +}; + +TEST_F(ManifestEvaluatorTest, AllNulls) { + ExpectEval(Expressions::NotNull("all_nulls_missing_nan"), false); + ExpectEval(Expressions::NotNull("all_nulls_missing_nan_float"), true); + ExpectEval(Expressions::NotNull("some_nulls"), true); + ExpectEval(Expressions::NotNull("no_nulls"), true); + ExpectEval(Expressions::StartsWith("all_nulls_missing_nan", "asad"), false); + ExpectEval(Expressions::NotStartsWith("all_nulls_missing_nan", "asad"), true); +} + +TEST_F(ManifestEvaluatorTest, NoNulls) { + ExpectEval(Expressions::IsNull("all_nulls_missing_nan"), true); + ExpectEval(Expressions::IsNull("some_nulls"), true); + ExpectEval(Expressions::IsNull("no_nulls"), false); + ExpectEval(Expressions::IsNull("both_nan_and_null"), true); +} + +TEST_F(ManifestEvaluatorTest, IsNaN) { + ExpectEval(Expressions::IsNaN("float"), true); + ExpectEval(Expressions::IsNaN("all_nulls_double"), true); + ExpectEval(Expressions::IsNaN("all_nulls_missing_nan_float"), true); + ExpectEval(Expressions::IsNaN("all_nulls_no_nans"), false); + ExpectEval(Expressions::IsNaN("all_nans"), true); + ExpectEval(Expressions::IsNaN("both_nan_and_null"), true); + ExpectEval(Expressions::IsNaN("no_nan_or_null"), false); +} + +TEST_F(ManifestEvaluatorTest, NotNaN) { + ExpectEval(Expressions::NotNaN("float"), true); + ExpectEval(Expressions::NotNaN("all_nulls_double"), true); + ExpectEval(Expressions::NotNaN("all_nulls_no_nans"), true); + ExpectEval(Expressions::NotNaN("all_nans"), false); + ExpectEval(Expressions::NotNaN("both_nan_and_null"), true); + ExpectEval(Expressions::NotNaN("no_nan_or_null"), true); +} + +TEST_F(ManifestEvaluatorTest, MissingColumn) { + auto expr = Expressions::LessThan("missing", Literal::Int(5)); + auto evaluator = ManifestEvaluator::MakePartitionFilter(expr, spec_, *schema_, true); + ASSERT_FALSE(evaluator.has_value()); + ASSERT_TRUE(evaluator.error().message.contains("Cannot find field 'missing'")) + << evaluator.error().message; +} + +TEST_F(ManifestEvaluatorTest, MissingStats) { + std::vector> expressions = { + Expressions::LessThan("id", Literal::Int(5)), + Expressions::LessThanOrEqual("id", Literal::Int(30)), + Expressions::Equal("id", Literal::Int(70)), + Expressions::GreaterThan("id", Literal::Int(78)), + Expressions::GreaterThanOrEqual("id", Literal::Int(90)), + Expressions::NotEqual("id", Literal::Int(101)), + Expressions::IsNull("id"), + Expressions::NotNull("id"), + Expressions::StartsWith("all_nulls_missing_nan", "a"), + Expressions::IsNaN("float"), + Expressions::NotNaN("float"), + Expressions::NotStartsWith("all_nulls_missing_nan", "a"), + }; + + for (const auto& expr : expressions) { + ExpectEval(expr, true, no_stats_); + } +} + +TEST_F(ManifestEvaluatorTest, Not) { + ExpectEval( + Expressions::Not(Expressions::LessThan("id", Literal::Int(kIntMinValue - 25))), + true); + ExpectEval( + Expressions::Not(Expressions::GreaterThan("id", Literal::Int(kIntMinValue - 25))), + false); +} + +TEST_F(ManifestEvaluatorTest, And) { + ExpectEval(Expressions::And( + Expressions::LessThan("id", Literal::Int(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual("id", Literal::Int(kIntMinValue - 30))), + false); + ExpectEval(Expressions::And( + Expressions::LessThan("id", Literal::Int(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual("id", Literal::Int(kIntMaxValue + 1))), + false); + ExpectEval( + Expressions::And(Expressions::GreaterThan("id", Literal::Int(kIntMinValue - 25)), + Expressions::LessThanOrEqual("id", Literal::Int(kIntMinValue))), + true); +} + +TEST_F(ManifestEvaluatorTest, Or) { + ExpectEval(Expressions::Or( + Expressions::LessThan("id", Literal::Int(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual("id", Literal::Int(kIntMaxValue + 1))), + false); + ExpectEval(Expressions::Or( + Expressions::LessThan("id", Literal::Int(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual("id", Literal::Int(kIntMaxValue - 19))), + true); +} + +TEST_F(ManifestEvaluatorTest, IntegerLt) { + ExpectEval(Expressions::LessThan("id", Literal::Int(kIntMinValue - 25)), false); + ExpectEval(Expressions::LessThan("id", Literal::Int(kIntMinValue)), false); + ExpectEval(Expressions::LessThan("id", Literal::Int(kIntMinValue + 1)), true); + ExpectEval(Expressions::LessThan("id", Literal::Int(kIntMaxValue)), true); +} + +TEST_F(ManifestEvaluatorTest, IntegerLtEq) { + ExpectEval(Expressions::LessThanOrEqual("id", Literal::Int(kIntMinValue - 25)), false); + ExpectEval(Expressions::LessThanOrEqual("id", Literal::Int(kIntMinValue - 1)), false); + ExpectEval(Expressions::LessThanOrEqual("id", Literal::Int(kIntMinValue)), true); + ExpectEval(Expressions::LessThanOrEqual("id", Literal::Int(kIntMaxValue)), true); +} + +TEST_F(ManifestEvaluatorTest, IntegerGt) { + ExpectEval(Expressions::GreaterThan("id", Literal::Int(kIntMaxValue + 6)), false); + ExpectEval(Expressions::GreaterThan("id", Literal::Int(kIntMaxValue)), false); + ExpectEval(Expressions::GreaterThan("id", Literal::Int(kIntMaxValue - 1)), true); + ExpectEval(Expressions::GreaterThan("id", Literal::Int(kIntMaxValue - 4)), true); +} + +TEST_F(ManifestEvaluatorTest, IntegerGtEq) { + ExpectEval(Expressions::GreaterThanOrEqual("id", Literal::Int(kIntMaxValue + 6)), + false); + ExpectEval(Expressions::GreaterThanOrEqual("id", Literal::Int(kIntMaxValue + 1)), + false); + ExpectEval(Expressions::GreaterThanOrEqual("id", Literal::Int(kIntMaxValue)), true); + ExpectEval(Expressions::GreaterThanOrEqual("id", Literal::Int(kIntMaxValue - 4)), true); +} + +TEST_F(ManifestEvaluatorTest, IntegerEq) { + ExpectEval(Expressions::Equal("id", Literal::Int(kIntMinValue - 25)), false); + ExpectEval(Expressions::Equal("id", Literal::Int(kIntMinValue - 1)), false); + ExpectEval(Expressions::Equal("id", Literal::Int(kIntMinValue)), true); + ExpectEval(Expressions::Equal("id", Literal::Int(kIntMaxValue - 4)), true); + ExpectEval(Expressions::Equal("id", Literal::Int(kIntMaxValue)), true); + ExpectEval(Expressions::Equal("id", Literal::Int(kIntMaxValue + 1)), false); + ExpectEval(Expressions::Equal("id", Literal::Int(kIntMaxValue + 6)), false); +} + +TEST_F(ManifestEvaluatorTest, IntegerNotEq) { + ExpectEval(Expressions::NotEqual("id", Literal::Int(kIntMinValue - 25)), true); + ExpectEval(Expressions::NotEqual("id", Literal::Int(kIntMinValue - 1)), true); + ExpectEval(Expressions::NotEqual("id", Literal::Int(kIntMinValue)), true); + ExpectEval(Expressions::NotEqual("id", Literal::Int(kIntMaxValue - 4)), true); + ExpectEval(Expressions::NotEqual("id", Literal::Int(kIntMaxValue)), true); + ExpectEval(Expressions::NotEqual("id", Literal::Int(kIntMaxValue + 1)), true); + ExpectEval(Expressions::NotEqual("id", Literal::Int(kIntMaxValue + 6)), true); +} + +TEST_F(ManifestEvaluatorTest, IntegerNotEqRewritten) { + ExpectEval(Expressions::Not(Expressions::Equal("id", Literal::Int(kIntMinValue - 25))), + true); + ExpectEval(Expressions::Not(Expressions::Equal("id", Literal::Int(kIntMinValue - 1))), + true); + ExpectEval(Expressions::Not(Expressions::Equal("id", Literal::Int(kIntMinValue))), + true); + ExpectEval(Expressions::Not(Expressions::Equal("id", Literal::Int(kIntMaxValue - 4))), + true); + ExpectEval(Expressions::Not(Expressions::Equal("id", Literal::Int(kIntMaxValue))), + true); + ExpectEval(Expressions::Not(Expressions::Equal("id", Literal::Int(kIntMaxValue + 1))), + true); + ExpectEval(Expressions::Not(Expressions::Equal("id", Literal::Int(kIntMaxValue + 6))), + true); +} + +TEST_F(ManifestEvaluatorTest, CaseInsensitiveIntegerNotEqRewritten) { + ExpectEval(Expressions::Not(Expressions::Equal("ID", Literal::Int(kIntMinValue - 25))), + true, + /*case_sensitive=*/false); + ExpectEval(Expressions::Not(Expressions::Equal("ID", Literal::Int(kIntMinValue - 1))), + true, + /*case_sensitive=*/false); + ExpectEval(Expressions::Not(Expressions::Equal("ID", Literal::Int(kIntMinValue))), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::Not(Expressions::Equal("ID", Literal::Int(kIntMaxValue - 4))), + true, + /*case_sensitive=*/false); + ExpectEval(Expressions::Not(Expressions::Equal("ID", Literal::Int(kIntMaxValue))), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::Not(Expressions::Equal("ID", Literal::Int(kIntMaxValue + 1))), + true, + /*case_sensitive=*/false); + ExpectEval(Expressions::Not(Expressions::Equal("ID", Literal::Int(kIntMaxValue + 6))), + true, + /*case_sensitive=*/false); +} + +TEST_F(ManifestEvaluatorTest, CaseSensitiveIntegerNotEqRewritten) { + auto expr = Expressions::Not(Expressions::Equal("ID", Literal::Int(kIntMinValue - 25))); + auto evaluator = ManifestEvaluator::MakePartitionFilter(expr, spec_, *schema_, true); + ASSERT_FALSE(evaluator.has_value()); + ASSERT_TRUE(evaluator.error().message.contains("Cannot find field 'ID'")) + << evaluator.error().message; +} + +TEST_F(ManifestEvaluatorTest, StringStartsWith) { + ExpectEval(Expressions::StartsWith("some_nulls", "a"), true, /*case_sensitive=*/false); + ExpectEval(Expressions::StartsWith("some_nulls", "aa"), true, /*case_sensitive=*/false); + ExpectEval(Expressions::StartsWith("some_nulls", "dddd"), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::StartsWith("some_nulls", "z"), true, /*case_sensitive=*/false); + ExpectEval(Expressions::StartsWith("no_nulls", "a"), true, /*case_sensitive=*/false); + ExpectEval(Expressions::StartsWith("some_nulls", "zzzz"), false, + /*case_sensitive=*/false); + ExpectEval(Expressions::StartsWith("some_nulls", "1"), false, /*case_sensitive=*/false); +} + +TEST_F(ManifestEvaluatorTest, StringNotStartsWith) { + ExpectEval(Expressions::NotStartsWith("some_nulls", "a"), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::NotStartsWith("some_nulls", "aa"), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::NotStartsWith("some_nulls", "dddd"), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::NotStartsWith("some_nulls", "z"), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::NotStartsWith("no_nulls", "a"), true, /*case_sensitive=*/false); + ExpectEval(Expressions::NotStartsWith("some_nulls", "zzzz"), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::NotStartsWith("some_nulls", "1"), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::NotStartsWith("all_same_value_or_null", "a"), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::NotStartsWith("all_same_value_or_null", "aa"), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::NotStartsWith("all_same_value_or_null", "A"), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::NotStartsWith("all_nulls_missing_nan", "A"), true, + /*case_sensitive=*/false); + ExpectEval(Expressions::NotStartsWith("no_nulls_same_value_a", "a"), false, + /*case_sensitive=*/false); +} + +TEST_F(ManifestEvaluatorTest, IntegerIn) { + ExpectEval(Expressions::In("id", {Literal::Int(kIntMinValue - 25), + Literal::Int(kIntMinValue - 24)}), + false); + ExpectEval(Expressions::In( + "id", {Literal::Int(kIntMinValue - 2), Literal::Int(kIntMinValue - 1)}), + false); + ExpectEval( + Expressions::In("id", {Literal::Int(kIntMinValue - 1), Literal::Int(kIntMinValue)}), + true); + ExpectEval(Expressions::In( + "id", {Literal::Int(kIntMaxValue - 4), Literal::Int(kIntMaxValue - 3)}), + true); + ExpectEval( + Expressions::In("id", {Literal::Int(kIntMaxValue), Literal::Int(kIntMaxValue + 1)}), + true); + ExpectEval(Expressions::In( + "id", {Literal::Int(kIntMaxValue + 1), Literal::Int(kIntMaxValue + 2)}), + false); + ExpectEval(Expressions::In( + "id", {Literal::Int(kIntMaxValue + 6), Literal::Int(kIntMaxValue + 7)}), + false); + ExpectEval(Expressions::In("all_nulls_missing_nan", + {Literal::String("abc"), Literal::String("def")}), + false); + ExpectEval( + Expressions::In("some_nulls", {Literal::String("abc"), Literal::String("def")}), + true); + ExpectEval( + Expressions::In("no_nulls", {Literal::String("abc"), Literal::String("def")}), + true); +} + +TEST_F(ManifestEvaluatorTest, IntegerNotIn) { + ExpectEval(Expressions::NotIn("id", {Literal::Int(kIntMinValue - 25), + Literal::Int(kIntMinValue - 24)}), + true); + ExpectEval(Expressions::NotIn( + "id", {Literal::Int(kIntMinValue - 2), Literal::Int(kIntMinValue - 1)}), + true); + ExpectEval(Expressions::NotIn( + "id", {Literal::Int(kIntMinValue - 1), Literal::Int(kIntMinValue)}), + true); + ExpectEval(Expressions::NotIn( + "id", {Literal::Int(kIntMaxValue - 4), Literal::Int(kIntMaxValue - 3)}), + true); + ExpectEval(Expressions::NotIn( + "id", {Literal::Int(kIntMaxValue), Literal::Int(kIntMaxValue + 1)}), + true); + ExpectEval(Expressions::NotIn( + "id", {Literal::Int(kIntMaxValue + 1), Literal::Int(kIntMaxValue + 2)}), + true); + ExpectEval(Expressions::NotIn( + "id", {Literal::Int(kIntMaxValue + 6), Literal::Int(kIntMaxValue + 7)}), + true); + ExpectEval(Expressions::NotIn("all_nulls_missing_nan", + {Literal::String("abc"), Literal::String("def")}), + true); + ExpectEval( + Expressions::NotIn("some_nulls", {Literal::String("abc"), Literal::String("def")}), + true); + ExpectEval( + Expressions::NotIn("no_nulls", {Literal::String("abc"), Literal::String("def")}), + true); +} + +} // namespace iceberg diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index c73abe188..e531b1159 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -64,6 +64,7 @@ iceberg_tests = { 'inclusive_metrics_evaluator_test.cc', 'inclusive_metrics_evaluator_with_transform_test.cc', 'literal_test.cc', + 'manifest_evaluator_test.cc', 'predicate_test.cc', 'strict_metrics_evaluator_test.cc', ), From dcf9d5413339ae34aa023c72741d88196cb771b8 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Mon, 8 Dec 2025 17:49:54 +0800 Subject: [PATCH 2/3] fix comments --- src/iceberg/expression/manifest_evaluator.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/iceberg/expression/manifest_evaluator.cc b/src/iceberg/expression/manifest_evaluator.cc index d4c177821..3f8558d3b 100644 --- a/src/iceberg/expression/manifest_evaluator.cc +++ b/src/iceberg/expression/manifest_evaluator.cc @@ -209,13 +209,12 @@ class ManifestEvalVisitor : public BoundVisitor { auto lower, DeserializeBoundLiteral(summary.lower_bound.value(), ref->type())); ICEBERG_ASSIGN_OR_RAISE( auto upper, DeserializeBoundLiteral(summary.upper_bound.value(), ref->type())); - auto literals_view = literal_set | std::views::filter([&](const Literal& lit) { - return lower <= lit && lit <= upper; - }); - if (literals_view.empty()) { - // if all values are less than lower bound, rows cannot match. - // if all values are greater than upper bound, rows cannot match. + if (std::ranges::all_of(literal_set, [&](const Literal& lit) { + return lit < lower || lit > upper; + })) { + // if all values are less than lower bound or greater than upper bound, + // rows cannot match. return kRowCannotMatch; } return kRowsMightMatch; From 0dd74985c867855036590c79fc33c9507ab84299 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Thu, 11 Dec 2025 09:42:20 +0800 Subject: [PATCH 3/3] fix comments --- src/iceberg/expression/manifest_evaluator.cc | 54 +++++++++----------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/src/iceberg/expression/manifest_evaluator.cc b/src/iceberg/expression/manifest_evaluator.cc index 3f8558d3b..845ecb6ca 100644 --- a/src/iceberg/expression/manifest_evaluator.cc +++ b/src/iceberg/expression/manifest_evaluator.cc @@ -40,9 +40,9 @@ class ManifestEvalVisitor : public BoundVisitor { explicit ManifestEvalVisitor(const ManifestFile& manifest) : stats_(manifest.partitions) {} - Result AlwaysTrue() override { return true; } + Result AlwaysTrue() override { return kRowsMightMatch; } - Result AlwaysFalse() override { return false; } + Result AlwaysFalse() override { return kRowCannotMatch; } Result Not(bool child_result) override { return !child_result; } @@ -57,7 +57,7 @@ class ManifestEvalVisitor : public BoundVisitor { Result IsNull(const std::shared_ptr& expr) override { // no need to check whether the field is required because binding evaluates that case // if the column has no null values, the expression cannot match - ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + const auto& ref = expr->reference(); ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); if (!stats_.at(pos).contains_null) { return kRowCannotMatch; @@ -67,7 +67,7 @@ class ManifestEvalVisitor : public BoundVisitor { } Result NotNull(const std::shared_ptr& expr) override { - ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + const auto& ref = expr->reference(); ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); if (AllValuesAreNull(stats_.at(pos), ref->type()->type_id())) { return kRowCannotMatch; @@ -77,7 +77,7 @@ class ManifestEvalVisitor : public BoundVisitor { } Result IsNaN(const std::shared_ptr& expr) override { - ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + const auto& ref = expr->reference(); ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); if (stats_.at(pos).contains_nan.has_value() && !stats_.at(pos).contains_nan.value()) { return kRowCannotMatch; @@ -90,7 +90,7 @@ class ManifestEvalVisitor : public BoundVisitor { } Result NotNaN(const std::shared_ptr& expr) override { - ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + const auto& ref = expr->reference(); ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); const auto& summary = stats_.at(pos); // if containsNaN is true, containsNull is false and lowerBound is null, all values @@ -104,7 +104,7 @@ class ManifestEvalVisitor : public BoundVisitor { } Result Lt(const std::shared_ptr& expr, const Literal& lit) override { - ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + const auto& ref = expr->reference(); ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); const auto& summary = stats_.at(pos); if (!summary.lower_bound.has_value()) { @@ -119,7 +119,7 @@ class ManifestEvalVisitor : public BoundVisitor { } Result LtEq(const std::shared_ptr& expr, const Literal& lit) override { - ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + const auto& ref = expr->reference(); ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); const auto& summary = stats_.at(pos); if (!summary.lower_bound.has_value()) { @@ -134,7 +134,7 @@ class ManifestEvalVisitor : public BoundVisitor { } Result Gt(const std::shared_ptr& expr, const Literal& lit) override { - ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + const auto& ref = expr->reference(); ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); const auto& summary = stats_.at(pos); if (!summary.upper_bound.has_value()) { @@ -149,14 +149,15 @@ class ManifestEvalVisitor : public BoundVisitor { } Result GtEq(const std::shared_ptr& expr, const Literal& lit) override { - ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + const auto& ref = expr->reference(); ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); const auto& summary = stats_.at(pos); if (!summary.upper_bound.has_value()) { return kRowCannotMatch; // values are all null } ICEBERG_ASSIGN_OR_RAISE( - auto upper, DeserializeBoundLiteral(summary.upper_bound.value(), ref->type())); + auto upper, + DeserializeBoundLiteral(summary.upper_bound.value(), expr->reference()->type())); if (upper < lit) { return kRowCannotMatch; } @@ -164,7 +165,7 @@ class ManifestEvalVisitor : public BoundVisitor { } Result Eq(const std::shared_ptr& expr, const Literal& lit) override { - ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + const auto& ref = expr->reference(); ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); const auto& summary = stats_.at(pos); if (!summary.lower_bound.has_value() || !summary.upper_bound.has_value()) { @@ -193,7 +194,7 @@ class ManifestEvalVisitor : public BoundVisitor { Result In(const std::shared_ptr& expr, const BoundSetPredicate::LiteralSet& literal_set) override { - ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + const auto& ref = expr->reference(); ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); const auto& summary = stats_.at(pos); if (!summary.lower_bound.has_value() || !summary.upper_bound.has_value()) { @@ -230,14 +231,14 @@ class ManifestEvalVisitor : public BoundVisitor { Result StartsWith(const std::shared_ptr& expr, const Literal& lit) override { - ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + const auto& ref = expr->reference(); ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); const auto& summary = stats_.at(pos); if (!summary.lower_bound.has_value() || !summary.upper_bound.has_value()) { return kRowCannotMatch; } if (lit.type()->type_id() != TypeId::kString) { - return Invalid("Invalid literal: not a string, cannot use StartsWith"); + return InvalidExpression("Invalid literal: not a string, cannot use StartsWith"); } const auto& prefix = std::get(lit.value()); ICEBERG_ASSIGN_OR_RAISE( @@ -261,7 +262,7 @@ class ManifestEvalVisitor : public BoundVisitor { Result NotStartsWith(const std::shared_ptr& expr, const Literal& lit) override { - ICEBERG_ASSIGN_OR_RAISE(auto ref, ParseBoundReference(expr)); + const auto& ref = expr->reference(); ICEBERG_ASSIGN_OR_RAISE(auto pos, GetPosition(*ref)); const auto& summary = stats_.at(pos); if (summary.contains_null || !summary.lower_bound.has_value() || @@ -269,7 +270,7 @@ class ManifestEvalVisitor : public BoundVisitor { return kRowsMightMatch; } if (lit.type()->type_id() != TypeId::kString) { - return Invalid("Invalid literal: not a string, cannot use notStartsWith"); + return InvalidExpression("Invalid literal: not a string, cannot use notStartsWith"); } // notStartsWith will match unless all values must start with the prefix. This happens // when the lower and upper bounds both start with the prefix. @@ -301,27 +302,20 @@ class ManifestEvalVisitor : public BoundVisitor { } private: - Result ParseBoundReference(const std::shared_ptr& expr) const { - auto ref = dynamic_cast(expr.get()); - if (ref == nullptr) { - return Invalid("Invalid expression: not a BoundReference"); - } - return ref; - } - Result GetPosition(const BoundReference& ref) const { const auto& accessor = ref.accessor(); const auto& position_path = accessor.position_path(); if (position_path.empty()) { - return Invalid("Invalid accessor: empty position path."); + return InvalidArgument("Invalid accessor: empty position path."); } // nested accessors are not supported for partition fields if (position_path.size() > 1) { - return Invalid("Cannot convert nested accessor to position"); + return InvalidArgument("Cannot convert nested accessor to position"); } auto pos = position_path.at(0); if (pos >= stats_.size()) { - return Invalid("Invalid position: out of partition filed range."); + return InvalidArgument("Position {} is out of partition field range {}", pos, + stats_.size()); } return pos; } @@ -344,8 +338,8 @@ class ManifestEvalVisitor : public BoundVisitor { if (!type->is_primitive()) { return NotSupported("Bounds of non-primitive partition fields are not supported."); } - auto primitive_type = internal::checked_pointer_cast(type); - return Literal::Deserialize(bound, primitive_type); + return Literal::Deserialize( + bound, std::move(internal::checked_pointer_cast(type))); } private: