From 1bee83d00e211ef7df71cef01f59f4461fb5d48d Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sun, 7 Dec 2025 15:42:46 +0800 Subject: [PATCH 1/7] feat: add residual evaluator --- src/iceberg/CMakeLists.txt | 1 + src/iceberg/expression/expression_visitor.h | 4 +- src/iceberg/expression/meson.build | 1 + src/iceberg/expression/residual_evaluator.cc | 560 +++++++++++++++++ src/iceberg/expression/residual_evaluator.h | 93 +++ src/iceberg/meson.build | 1 + src/iceberg/test/CMakeLists.txt | 1 + src/iceberg/test/meson.build | 1 + src/iceberg/test/residual_evaluator_test.cc | 612 +++++++++++++++++++ 9 files changed, 1271 insertions(+), 3 deletions(-) create mode 100644 src/iceberg/expression/residual_evaluator.cc create mode 100644 src/iceberg/expression/residual_evaluator.h create mode 100644 src/iceberg/test/residual_evaluator_test.cc diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 275d71fce..7218d76e0 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -28,6 +28,7 @@ set(ICEBERG_SOURCES expression/inclusive_metrics_evaluator.cc expression/literal.cc expression/predicate.cc + expression/residual_evaluator.cc expression/rewrite_not.cc expression/strict_metrics_evaluator.cc expression/term.cc diff --git a/src/iceberg/expression/expression_visitor.h b/src/iceberg/expression/expression_visitor.h index d66382453..27cfb99c1 100644 --- a/src/iceberg/expression/expression_visitor.h +++ b/src/iceberg/expression/expression_visitor.h @@ -260,10 +260,8 @@ class ICEBERG_EXPORT BoundVisitor : public ExpressionVisitor { /// \brief Visit an unbound predicate. /// - /// Bound visitors do not support unbound predicates. - /// /// \param pred The unbound predicate - Result Predicate(const std::shared_ptr& pred) final { + Result Predicate(const std::shared_ptr& pred) override { ICEBERG_DCHECK(pred != nullptr, "UnboundPredicate cannot be null"); return NotSupported("Not a bound predicate: {}", pred->ToString()); } diff --git a/src/iceberg/expression/meson.build b/src/iceberg/expression/meson.build index 8e312791b..f3b748482 100644 --- a/src/iceberg/expression/meson.build +++ b/src/iceberg/expression/meson.build @@ -26,6 +26,7 @@ install_headers( 'inclusive_metrics_evaluator.h', 'literal.h', 'predicate.h', + 'residual_evaluator.h', 'rewrite_not.h', 'strict_metrics_evaluator.h', 'term.h', diff --git a/src/iceberg/expression/residual_evaluator.cc b/src/iceberg/expression/residual_evaluator.cc new file mode 100644 index 000000000..f86f2658a --- /dev/null +++ b/src/iceberg/expression/residual_evaluator.cc @@ -0,0 +1,560 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/expression/residual_evaluator.h" + +#include + +#include "iceberg/expression/binder.h" +#include "iceberg/expression/expression.h" +#include "iceberg/expression/expression_visitor.h" +#include "iceberg/expression/expressions.h" +#include "iceberg/expression/predicate.h" +#include "iceberg/partition_spec.h" +#include "iceberg/row/struct_like.h" +#include "iceberg/schema.h" +#include "iceberg/schema_internal.h" +#include "iceberg/transform.h" +#include "iceberg/util/checked_cast.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +class ResidualVisitor : public BoundVisitor> { + public: + ResidualVisitor(const std::shared_ptr& spec, + const std::shared_ptr& schema, const StructLike& partition_data, + bool case_sensitive) + : spec_(spec), + schema_(schema), + partition_data_(partition_data), + case_sensitive_(case_sensitive) { + ICEBERG_ASSIGN_OR_THROW(auto partition_type_, spec_->PartitionType(*schema)); + partition_schema_ = FromStructType(std::move(*partition_type_), std::nullopt); + } + + Result> AlwaysTrue() override { + return Expressions::AlwaysTrue(); + } + + Result> AlwaysFalse() override { + return Expressions::AlwaysFalse(); + } + + Result> Not( + const std::shared_ptr& child_result) override { + return Expressions::Not(child_result); + } + + Result> And( + const std::shared_ptr& left_result, + const std::shared_ptr& right_result) override { + return Expressions::And(left_result, right_result); + } + + Result> Or( + const std::shared_ptr& left_result, + const std::shared_ptr& right_result) override { + return Expressions::Or(left_result, right_result); + } + + Result> IsNull( + const std::shared_ptr& expr) override { + return IsNullImpl(expr); + } + + Result> NotNull( + const std::shared_ptr& expr) override { + return NotNullImpl(expr); + } + + Result> IsNaN(const std::shared_ptr& expr) override { + return IsNaNImpl(expr); + } + + Result> NotNaN( + const std::shared_ptr& expr) override { + return NotNaNImpl(expr); + } + + Result> Lt(const std::shared_ptr& expr, + const Literal& lit) override { + return LtImpl(expr, lit); + } + + Result> LtEq(const std::shared_ptr& expr, + const Literal& lit) override { + return LtEqImpl(expr, lit); + } + + Result> Gt(const std::shared_ptr& expr, + const Literal& lit) override { + return GtImpl(expr, lit); + } + + Result> GtEq(const std::shared_ptr& expr, + const Literal& lit) override { + return GtEqImpl(expr, lit); + } + + Result> Eq(const std::shared_ptr& expr, + const Literal& lit) override { + return EqImpl(expr, lit); + } + + Result> NotEq(const std::shared_ptr& expr, + const Literal& lit) override { + return NotEqImpl(expr, lit); + } + + Result> StartsWith(const std::shared_ptr& expr, + const Literal& lit) override { + return StartsWithImpl(expr, lit); + } + + Result> NotStartsWith(const std::shared_ptr& expr, + const Literal& lit) override { + return NotStartsWithImpl(expr, lit); + } + + Result> In( + const std::shared_ptr& expr, + const BoundSetPredicate::LiteralSet& literal_set) override { + return InImpl(expr, literal_set); + } + + Result> NotIn( + const std::shared_ptr& expr, + const BoundSetPredicate::LiteralSet& literal_set) override { + return NotInImpl(expr, literal_set); + } + + Result> Predicate( + const std::shared_ptr& pred) override; + + Result> Predicate( + const std::shared_ptr& pred) override { + ICEBERG_ASSIGN_OR_RAISE(auto bound_predicate, pred->Bind(*schema_, case_sensitive_)); + if (bound_predicate->is_bound_predicate()) { + ICEBERG_ASSIGN_OR_RAISE( + auto residual, + Predicate(std::dynamic_pointer_cast(bound_predicate))); + if (residual->is_bound_predicate()) { + // replace inclusive original unbound predicate + return pred; + } + return residual; + } + // if binding didn't result in a Predicate, return the expression + return bound_predicate; + } + + private: + // Helper methods for bound predicates + Result> EvaluateBoundPredicate( + const std::shared_ptr& pred); + + Result> IsNullImpl(const std::shared_ptr& expr); + Result> NotNullImpl(const std::shared_ptr& expr); + Result> IsNaNImpl(const std::shared_ptr& expr); + Result> NotNaNImpl(const std::shared_ptr& expr); + Result> LtImpl(const std::shared_ptr& expr, + const Literal& lit); + Result> LtEqImpl(const std::shared_ptr& expr, + const Literal& lit); + Result> GtImpl(const std::shared_ptr& expr, + const Literal& lit); + Result> GtEqImpl(const std::shared_ptr& expr, + const Literal& lit); + Result> EqImpl(const std::shared_ptr& expr, + const Literal& lit); + Result> NotEqImpl(const std::shared_ptr& expr, + const Literal& lit); + Result> InImpl( + const std::shared_ptr& expr, + const BoundSetPredicate::LiteralSet& literal_set); + Result> NotInImpl( + const std::shared_ptr& expr, + const BoundSetPredicate::LiteralSet& literal_set); + Result> StartsWithImpl(const std::shared_ptr& expr, + const Literal& lit); + Result> NotStartsWithImpl( + const std::shared_ptr& expr, const Literal& lit); + + std::shared_ptr spec_; + std::shared_ptr schema_; + std::shared_ptr partition_schema_; + const StructLike& partition_data_; + bool case_sensitive_; +}; + +Result> ResidualVisitor::IsNullImpl( + const std::shared_ptr& expr) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + if (value.IsNull()) { + return Expressions::AlwaysTrue(); + } + return Expressions::AlwaysFalse(); +} + +Result> ResidualVisitor::NotNullImpl( + const std::shared_ptr& expr) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + if (value.IsNull()) { + return Expressions::AlwaysFalse(); + } + return Expressions::AlwaysTrue(); +} + +Result> ResidualVisitor::IsNaNImpl( + const std::shared_ptr& expr) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + if (value.IsNaN()) { + return Expressions::AlwaysTrue(); + } + return Expressions::AlwaysFalse(); +} + +Result> ResidualVisitor::NotNaNImpl( + const std::shared_ptr& expr) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + if (value.IsNaN()) { + return Expressions::AlwaysFalse(); + } + return Expressions::AlwaysTrue(); +} + +Result> ResidualVisitor::LtImpl( + const std::shared_ptr& expr, const Literal& lit) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + if (value < lit) { + return Expressions::AlwaysTrue(); + } + return Expressions::AlwaysFalse(); +} + +Result> ResidualVisitor::LtEqImpl( + const std::shared_ptr& expr, const Literal& lit) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + if (value <= lit) { + return Expressions::AlwaysTrue(); + } + return Expressions::AlwaysFalse(); +} + +Result> ResidualVisitor::GtImpl( + const std::shared_ptr& expr, const Literal& lit) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + if (value > lit) { + return Expressions::AlwaysTrue(); + } + return Expressions::AlwaysFalse(); +} + +Result> ResidualVisitor::GtEqImpl( + const std::shared_ptr& expr, const Literal& lit) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + if (value >= lit) { + return Expressions::AlwaysTrue(); + } + return Expressions::AlwaysFalse(); +} + +Result> ResidualVisitor::EqImpl( + const std::shared_ptr& expr, const Literal& lit) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + if (value == lit) { + return Expressions::AlwaysTrue(); + } + return Expressions::AlwaysFalse(); +} + +Result> ResidualVisitor::NotEqImpl( + const std::shared_ptr& expr, const Literal& lit) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + if (value != lit) { + return Expressions::AlwaysTrue(); + } + return Expressions::AlwaysFalse(); +} + +Result> ResidualVisitor::InImpl( + const std::shared_ptr& expr, + const BoundSetPredicate::LiteralSet& literal_set) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + if (literal_set.contains(value)) { + return Expressions::AlwaysTrue(); + } + return Expressions::AlwaysFalse(); +} + +Result> ResidualVisitor::NotInImpl( + const std::shared_ptr& expr, + const BoundSetPredicate::LiteralSet& literal_set) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + if (literal_set.contains(value)) { + return Expressions::AlwaysFalse(); + } + return Expressions::AlwaysTrue(); +} + +Result> ResidualVisitor::StartsWithImpl( + const std::shared_ptr& expr, const Literal& lit) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + + // Both value and literal should be strings + if (!std::holds_alternative(value.value()) || + !std::holds_alternative(lit.value())) { + return Expressions::AlwaysFalse(); + } + + const auto& str_value = std::get(value.value()); + const auto& str_prefix = std::get(lit.value()); + if (str_value.starts_with(str_prefix)) { + return Expressions::AlwaysTrue(); + } + return Expressions::AlwaysFalse(); +} + +Result> ResidualVisitor::NotStartsWithImpl( + const std::shared_ptr& expr, const Literal& lit) { + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + + // Both value and literal should be strings + if (!std::holds_alternative(value.value()) || + !std::holds_alternative(lit.value())) { + return Expressions::AlwaysTrue(); + } + + const auto& str_value = std::get(value.value()); + const auto& str_prefix = std::get(lit.value()); + if (str_value.starts_with(str_prefix)) { + return Expressions::AlwaysFalse(); + } + return Expressions::AlwaysTrue(); +} + +Result> ResidualVisitor::EvaluateBoundPredicate( + const std::shared_ptr& pred) { + ICEBERG_DCHECK(pred != nullptr, "BoundPredicate cannot be null"); + + switch (pred->kind()) { + case BoundPredicate::Kind::kUnary: { + switch (pred->op()) { + case Expression::Operation::kIsNull: + return IsNullImpl(pred->term()); + case Expression::Operation::kNotNull: + return NotNullImpl(pred->term()); + case Expression::Operation::kIsNan: + return IsNaNImpl(pred->term()); + case Expression::Operation::kNotNan: + return NotNaNImpl(pred->term()); + default: + return InvalidExpression("Invalid operation for BoundUnaryPredicate: {}", + ToString(pred->op())); + } + } + case BoundPredicate::Kind::kLiteral: { + const auto& literal_pred = + internal::checked_cast(*pred); + switch (pred->op()) { + case Expression::Operation::kLt: + return LtImpl(pred->term(), literal_pred.literal()); + case Expression::Operation::kLtEq: + return LtEqImpl(pred->term(), literal_pred.literal()); + case Expression::Operation::kGt: + return GtImpl(pred->term(), literal_pred.literal()); + case Expression::Operation::kGtEq: + return GtEqImpl(pred->term(), literal_pred.literal()); + case Expression::Operation::kEq: + return EqImpl(pred->term(), literal_pred.literal()); + case Expression::Operation::kNotEq: + return NotEqImpl(pred->term(), literal_pred.literal()); + case Expression::Operation::kStartsWith: + return StartsWithImpl(pred->term(), literal_pred.literal()); + case Expression::Operation::kNotStartsWith: + return NotStartsWithImpl(pred->term(), literal_pred.literal()); + default: + return InvalidExpression("Invalid operation for BoundLiteralPredicate: {}", + ToString(pred->op())); + } + } + case BoundPredicate::Kind::kSet: { + const auto& set_pred = internal::checked_cast(*pred); + switch (pred->op()) { + case Expression::Operation::kIn: + return InImpl(pred->term(), set_pred.literal_set()); + case Expression::Operation::kNotIn: + return NotInImpl(pred->term(), set_pred.literal_set()); + default: + return InvalidExpression("Invalid operation for BoundSetPredicate: {}", + ToString(pred->op())); + } + } + } + + return InvalidExpression("Unsupported bound predicate: {}", pred->ToString()); +} + +Result> ResidualVisitor::Predicate( + const std::shared_ptr& pred) { + // Get the strict projection and inclusive projection of this predicate in partition + // data, then use them to determine whether to return the original predicate. The + // strict projection returns true iff the original predicate would have returned true, + // so the predicate can be eliminated if the strict projection evaluates to true. + // Similarly the inclusive projection returns false iff the original predicate would + // have returned false, so the predicate can also be eliminated if the inclusive + // projection evaluates to false. + + // Get the field ID from the predicate's reference + const auto& ref = pred->reference(); + int32_t field_id = ref->field().field_id(); + + // Find partition fields that match this source field ID + std::vector matching_fields; + for (const auto& field : spec_->fields()) { + if (field.source_id() == field_id) { + matching_fields.push_back(&field); + } + } + + if (matching_fields.empty()) { + // Not associated with a partition field, can't be evaluated + return pred; + } + + for (const auto* part : matching_fields) { + // Check the strict projection + ICEBERG_ASSIGN_OR_RAISE(auto strict_projection, + part->transform()->ProjectStrict(part->name(), pred)); + std::shared_ptr strict_result = nullptr; + + if (strict_projection != nullptr) { + // Bind the projected predicate to partition type + ICEBERG_ASSIGN_OR_RAISE( + auto bound_strict, + Binder::Bind(*partition_schema_, + std::shared_ptr(std::move(strict_projection)), + case_sensitive_)); + + if (bound_strict->is_bound_predicate()) { + // Evaluate the bound predicate against partition data + ICEBERG_ASSIGN_OR_RAISE( + strict_result, EvaluateBoundPredicate( + std::dynamic_pointer_cast(bound_strict))); + } else { + // If the result is not a predicate, then it must be a constant like alwaysTrue + // or alwaysFalse + strict_result = bound_strict; + } + } + + if (strict_result != nullptr && strict_result->op() == Expression::Operation::kTrue) { + // If strict is true, returning true + return Expressions::AlwaysTrue(); + } + + // Check the inclusive projection + ICEBERG_ASSIGN_OR_RAISE(auto inclusive_projection, + part->transform()->Project(part->name(), pred)); + std::shared_ptr inclusive_result = nullptr; + + if (inclusive_projection != nullptr) { + // Bind the projected predicate to partition type + ICEBERG_ASSIGN_OR_RAISE( + auto bound_inclusive, + Binder::Bind(*partition_schema_, + std::shared_ptr(std::move(inclusive_projection)), + case_sensitive_)); + + if (bound_inclusive->is_bound_predicate()) { + // Evaluate the bound predicate against partition data + ICEBERG_ASSIGN_OR_RAISE( + inclusive_result, + EvaluateBoundPredicate( + std::dynamic_pointer_cast(bound_inclusive))); + } else { + // If the result is not a predicate, then it must be a constant like alwaysTrue + // or alwaysFalse + inclusive_result = bound_inclusive; + } + } + + if (inclusive_result != nullptr && + inclusive_result->op() == Expression::Operation::kFalse) { + // If inclusive is false, returning false + return Expressions::AlwaysFalse(); + } + } + + // Neither strict nor inclusive predicate was conclusive, returning the original pred + return pred; +} + +ResidualEvaluator::ResidualEvaluator(std::shared_ptr expr, + const std::shared_ptr& spec, + const std::shared_ptr& schema, + bool case_sensitive) + : expr_(std::move(expr)), + spec_(spec), + schema_(schema), + case_sensitive_(case_sensitive) {} + +ResidualEvaluator::~ResidualEvaluator() = default; + +namespace { + +// Unpartitioned residual evaluator that always returns the original expression +class UnpartitionedResidualEvaluator : public ResidualEvaluator { + public: + explicit UnpartitionedResidualEvaluator(std::shared_ptr expr) + : ResidualEvaluator(std::move(expr), PartitionSpec::Unpartitioned(), nullptr, + true) {} + + Result> ResidualFor( + const StructLike& /*partition_data*/) const override { + return expr_; + } +}; + +} // namespace + +Result> ResidualEvaluator::Unpartitioned( + std::shared_ptr expr) { + return std::unique_ptr( + new UnpartitionedResidualEvaluator(std::move(expr))); +} + +Result> ResidualEvaluator::Make( + const std::shared_ptr& spec, const std::shared_ptr& schema, + std::shared_ptr expr, bool case_sensitive) { + if (spec->fields().empty()) { + return Unpartitioned(std::move(expr)); + } + return std::unique_ptr( + new ResidualEvaluator(std::move(expr), spec, schema, case_sensitive)); +} + +Result> ResidualEvaluator::ResidualFor( + const StructLike& partition_data) const { + ResidualVisitor visitor(spec_, schema_, partition_data, case_sensitive_); + return Visit, ResidualVisitor>(expr_, visitor); +} + +} // namespace iceberg diff --git a/src/iceberg/expression/residual_evaluator.h b/src/iceberg/expression/residual_evaluator.h new file mode 100644 index 000000000..77e4af160 --- /dev/null +++ b/src/iceberg/expression/residual_evaluator.h @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/expression/residual_evaluator.h +/// Residual evaluator for finding residual expressions after partition evaluation. + +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/partition_spec.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +class Expression; +class StructLike; + +/// \brief Finds the residuals for an Expression using the partitions in the given +/// PartitionSpec. +/// +/// A residual expression is made by partially evaluating an expression using partition +/// values. For example, if a table is partitioned by day(utc_timestamp) and is read +/// with a filter expression utc_timestamp >= a and utc_timestamp <= b, then there are +/// 4 possible residual expressions for the partition data, d: +/// +/// - If d > day(a) and d < day(b), the residual is always true +/// - If d == day(a) and d != day(b), the residual is utc_timestamp >= a +/// - If d == day(b) and d != day(a), the residual is utc_timestamp <= b +/// - If d == day(a) == day(b), the residual is utc_timestamp >= a and utc_timestamp <= b +/// +/// Partition data is passed using StructLike. Residuals are returned by ResidualFor(). +class ICEBERG_EXPORT ResidualEvaluator { + public: + /// \brief Return a residual evaluator for an unpartitioned PartitionSpec. + /// + /// \param expr An expression + /// \return A residual evaluator that always returns the expression + static Result> Unpartitioned( + std::shared_ptr expr); + + /// \brief Return a residual evaluator for a PartitionSpec and Expression. + /// + /// \param spec A partition spec + /// \param schema The schema to bind expressions against + /// \param expr An expression + /// \param case_sensitive Whether field name matching is case-sensitive + /// \return A residual evaluator for the expression + static Result> Make( + const std::shared_ptr& spec, const std::shared_ptr& schema, + std::shared_ptr expr, bool case_sensitive = true); + + ~ResidualEvaluator(); + + /// \brief Returns a residual expression for the given partition values. + /// + /// \param partition_data Partition data values + /// \return The residual of this evaluator's expression from the partition values + virtual Result> ResidualFor( + const StructLike& partition_data) const; + + protected: + ResidualEvaluator(std::shared_ptr expr, + const std::shared_ptr& spec, + const std::shared_ptr& schema, bool case_sensitive); + + std::shared_ptr expr_; + + private: + const std::shared_ptr& spec_; + const std::shared_ptr& schema_; + bool case_sensitive_; +}; + +} // namespace iceberg diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index c139c66b5..c10c5a827 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -50,6 +50,7 @@ iceberg_sources = files( 'expression/inclusive_metrics_evaluator.cc', 'expression/literal.cc', 'expression/predicate.cc', + 'expression/residual_evaluator.cc', 'expression/rewrite_not.cc', 'expression/strict_metrics_evaluator.cc', 'expression/term.cc', diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 9892e3d4f..f9cfb8485 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -88,6 +88,7 @@ add_iceberg_test(expression_test inclusive_metrics_evaluator_test.cc inclusive_metrics_evaluator_with_transform_test.cc predicate_test.cc + residual_evaluator_test.cc strict_metrics_evaluator_test.cc) add_iceberg_test(json_serde_test diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index c73abe188..0f8b9291b 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -65,6 +65,7 @@ iceberg_tests = { 'inclusive_metrics_evaluator_with_transform_test.cc', 'literal_test.cc', 'predicate_test.cc', + 'residual_evaluator_test.cc', 'strict_metrics_evaluator_test.cc', ), }, diff --git a/src/iceberg/test/residual_evaluator_test.cc b/src/iceberg/test/residual_evaluator_test.cc new file mode 100644 index 000000000..9224f295d --- /dev/null +++ b/src/iceberg/test/residual_evaluator_test.cc @@ -0,0 +1,612 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/expression/residual_evaluator.h" + +#include +#include +#include + +#include + +#include "iceberg/expression/expressions.h" +#include "iceberg/expression/literal.h" +#include "iceberg/expression/predicate.h" +#include "iceberg/partition_field.h" +#include "iceberg/partition_spec.h" +#include "iceberg/row/partition_values.h" +#include "iceberg/schema.h" +#include "iceberg/test/matchers.h" +#include "iceberg/transform.h" +#include "iceberg/type.h" + +namespace iceberg { + +class ResidualEvaluatorTest : public ::testing::Test { + protected: + void SetUp() override {} + + // Helper function to assert residual operation + void AssertResidualOp(const std::shared_ptr& spec, + const std::shared_ptr& schema, + const std::shared_ptr& pred, + const Literal& partition_value, + Expression::Operation expected_op) { + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + ResidualEvaluator::Make(spec, schema, pred, true)); + PartitionValues partition_data(partition_value); + ICEBERG_UNWRAP_OR_FAIL(auto residual, evaluator->ResidualFor(partition_data)); + EXPECT_EQ(residual->op(), expected_op); + } + + // Helper function to assert residual is the same as original predicate + void AssertResidualPredicate(const std::shared_ptr& spec, + const std::shared_ptr& schema, + const std::shared_ptr& pred, + const Literal& partition_value) { + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + ResidualEvaluator::Make(spec, schema, pred, true)); + PartitionValues partition_data(partition_value); + ICEBERG_UNWRAP_OR_FAIL(auto residual, evaluator->ResidualFor(partition_data)); + ASSERT_TRUE(residual->is_unbound_predicate()); + auto unbound_residual = std::dynamic_pointer_cast(residual); + ASSERT_NE(unbound_residual, nullptr); + auto unbound_original = std::dynamic_pointer_cast(pred); + ASSERT_NE(unbound_original, nullptr); + EXPECT_EQ(unbound_residual->op(), unbound_original->op()); + EXPECT_EQ(unbound_residual->reference()->name(), + unbound_original->reference()->name()); + // Check literal value + auto residual_impl = + std::dynamic_pointer_cast>(unbound_residual); + auto original_impl = + std::dynamic_pointer_cast>(unbound_original); + ASSERT_NE(residual_impl, nullptr); + ASSERT_NE(original_impl, nullptr); + ASSERT_EQ(residual_impl->literals().size(), original_impl->literals().size()); + if (!residual_impl->literals().empty()) { + EXPECT_EQ(residual_impl->literals()[0].value(), + original_impl->literals()[0].value()); + } + } +}; + +TEST_F(ResidualEvaluatorTest, IdentityTransformResiduals) { + auto schema = std::make_shared( + std::vector{SchemaField::MakeOptional(50, "dateint", int32()), + SchemaField::MakeOptional(51, "hour", int32())}, + std::nullopt); + + auto identity_transform = Transform::Identity(); + PartitionField pt_field(50, 1000, "dateint", identity_transform); + ICEBERG_UNWRAP_OR_FAIL(auto spec_unique, + PartitionSpec::Make(*schema, 0, {pt_field}, false)); + auto spec = std::shared_ptr(spec_unique.release()); + + // Create expression: (dateint < 20170815 AND dateint > 20170801) OR + // (dateint == 20170815 AND hour < 12) OR + // (dateint == 20170801 AND hour > 11) + auto expr = Expressions::Or( + Expressions::Or( + Expressions::And(Expressions::LessThan("dateint", Literal::Int(20170815)), + Expressions::GreaterThan("dateint", Literal::Int(20170801))), + Expressions::And(Expressions::Equal("dateint", Literal::Int(20170815)), + Expressions::LessThan("hour", Literal::Int(12)))), + Expressions::And(Expressions::Equal("dateint", Literal::Int(20170801)), + Expressions::GreaterThan("hour", Literal::Int(11)))); + + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + ResidualEvaluator::Make(spec, schema, expr, true)); + + // Equal to the upper date bound + PartitionValues partition_data1(Literal::Int(20170815)); + ICEBERG_UNWRAP_OR_FAIL(auto residual1, evaluator->ResidualFor(partition_data1)); + ASSERT_TRUE(residual1->is_unbound_predicate()); + auto unbound1 = std::dynamic_pointer_cast(residual1); + ASSERT_NE(unbound1, nullptr); + EXPECT_EQ(unbound1->op(), Expression::Operation::kLt); + EXPECT_EQ(unbound1->reference()->name(), "hour"); + // Access literal through literals() span + auto unbound1_impl = + std::dynamic_pointer_cast>(unbound1); + ASSERT_NE(unbound1_impl, nullptr); + ASSERT_EQ(unbound1_impl->literals().size(), 1); + EXPECT_EQ(unbound1_impl->literals()[0].value(), Literal::Int(12).value()); + + // Equal to the lower date bound + PartitionValues partition_data2(Literal::Int(20170801)); + ICEBERG_UNWRAP_OR_FAIL(auto residual2, evaluator->ResidualFor(partition_data2)); + ASSERT_TRUE(residual2->is_unbound_predicate()); + auto unbound2 = std::dynamic_pointer_cast(residual2); + ASSERT_NE(unbound2, nullptr); + EXPECT_EQ(unbound2->op(), Expression::Operation::kGt); + EXPECT_EQ(unbound2->reference()->name(), "hour"); + // Access literal through literals() span + auto unbound2_impl = + std::dynamic_pointer_cast>(unbound2); + ASSERT_NE(unbound2_impl, nullptr); + ASSERT_EQ(unbound2_impl->literals().size(), 1); + EXPECT_EQ(unbound2_impl->literals()[0].value(), Literal::Int(11).value()); + + // Inside the date range + PartitionValues partition_data3(Literal::Int(20170812)); + ICEBERG_UNWRAP_OR_FAIL(auto residual3, evaluator->ResidualFor(partition_data3)); + EXPECT_EQ(residual3->op(), Expression::Operation::kTrue); + + // Outside the date range + PartitionValues partition_data4(Literal::Int(20170817)); + ICEBERG_UNWRAP_OR_FAIL(auto residual4, evaluator->ResidualFor(partition_data4)); + EXPECT_EQ(residual4->op(), Expression::Operation::kFalse); +} + +TEST_F(ResidualEvaluatorTest, CaseInsensitiveIdentityTransformResiduals) { + auto schema = std::make_shared( + std::vector{SchemaField::MakeOptional(50, "dateint", int32()), + SchemaField::MakeOptional(51, "hour", int32())}, + std::nullopt); + + auto identity_transform = Transform::Identity(); + PartitionField pt_field(50, 1000, "dateint", identity_transform); + ICEBERG_UNWRAP_OR_FAIL(auto spec_unique, + PartitionSpec::Make(*schema, 0, {pt_field}, false)); + auto spec = std::shared_ptr(spec_unique.release()); + + // Create expression with mixed case field names + auto expr = Expressions::Or( + Expressions::Or( + Expressions::And(Expressions::LessThan("DATEINT", Literal::Int(20170815)), + Expressions::GreaterThan("dateint", Literal::Int(20170801))), + Expressions::And(Expressions::Equal("dateint", Literal::Int(20170815)), + Expressions::LessThan("HOUR", Literal::Int(12)))), + Expressions::And(Expressions::Equal("DateInt", Literal::Int(20170801)), + Expressions::GreaterThan("hOUr", Literal::Int(11)))); + + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + ResidualEvaluator::Make(spec, schema, expr, false)); + + // Equal to the upper date bound + PartitionValues partition_data1(Literal::Int(20170815)); + ICEBERG_UNWRAP_OR_FAIL(auto residual1, evaluator->ResidualFor(partition_data1)); + ASSERT_TRUE(residual1->is_unbound_predicate()); + auto unbound1 = std::dynamic_pointer_cast(residual1); + ASSERT_NE(unbound1, nullptr); + EXPECT_EQ(unbound1->op(), Expression::Operation::kLt); + EXPECT_EQ(unbound1->reference()->name(), "HOUR"); + // Access literal through literals() span + auto unbound1_impl = + std::dynamic_pointer_cast>(unbound1); + ASSERT_NE(unbound1_impl, nullptr); + ASSERT_EQ(unbound1_impl->literals().size(), 1); + EXPECT_EQ(unbound1_impl->literals()[0].value(), Literal::Int(12).value()); + + // Equal to the lower date bound + PartitionValues partition_data2(Literal::Int(20170801)); + ICEBERG_UNWRAP_OR_FAIL(auto residual2, evaluator->ResidualFor(partition_data2)); + ASSERT_TRUE(residual2->is_unbound_predicate()); + auto unbound2 = std::dynamic_pointer_cast(residual2); + ASSERT_NE(unbound2, nullptr); + EXPECT_EQ(unbound2->op(), Expression::Operation::kGt); + EXPECT_EQ(unbound2->reference()->name(), "hOUr"); + // Access literal through literals() span + auto unbound2_impl = + std::dynamic_pointer_cast>(unbound2); + ASSERT_NE(unbound2_impl, nullptr); + ASSERT_EQ(unbound2_impl->literals().size(), 1); + EXPECT_EQ(unbound2_impl->literals()[0].value(), Literal::Int(11).value()); + + // Inside the date range + PartitionValues partition_data3(Literal::Int(20170812)); + ICEBERG_UNWRAP_OR_FAIL(auto residual3, evaluator->ResidualFor(partition_data3)); + EXPECT_EQ(residual3->op(), Expression::Operation::kTrue); + + // Outside the date range + PartitionValues partition_data4(Literal::Int(20170817)); + ICEBERG_UNWRAP_OR_FAIL(auto residual4, evaluator->ResidualFor(partition_data4)); + EXPECT_EQ(residual4->op(), Expression::Operation::kFalse); +} + +TEST_F(ResidualEvaluatorTest, UnpartitionedResiduals) { + std::vector> expressions = { + Expressions::AlwaysTrue(), + Expressions::AlwaysFalse(), + Expressions::LessThan("a", Literal::Int(5)), + Expressions::GreaterThanOrEqual("b", Literal::Int(16)), + Expressions::NotNull("c"), + Expressions::IsNull("d"), + Expressions::In("e", {Literal::Int(1), Literal::Int(2), Literal::Int(3)}), + Expressions::NotIn("f", {Literal::Int(1), Literal::Int(2), Literal::Int(3)}), + Expressions::NotNaN("g"), + Expressions::IsNaN("h"), + Expressions::StartsWith("data", "abcd"), + Expressions::NotStartsWith("data", "abcd")}; + + PartitionValues empty_partition; + + for (const auto& expr : expressions) { + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, ResidualEvaluator::Unpartitioned(expr)); + ICEBERG_UNWRAP_OR_FAIL(auto residual, evaluator->ResidualFor(empty_partition)); + // For unpartitioned tables, residual should be the original expression + EXPECT_EQ(residual->op(), expr->op()); + } +} + +TEST_F(ResidualEvaluatorTest, In) { + auto schema = std::make_shared( + std::vector{SchemaField::MakeOptional(50, "dateint", int32()), + SchemaField::MakeOptional(51, "hour", int32())}, + std::nullopt); + + auto identity_transform = Transform::Identity(); + PartitionField pt_field(50, 1000, "dateint", identity_transform); + ICEBERG_UNWRAP_OR_FAIL(auto spec_unique, + PartitionSpec::Make(*schema, 0, {pt_field}, false)); + auto spec = std::shared_ptr(spec_unique.release()); + + auto expr = Expressions::In("dateint", {Literal::Int(20170815), Literal::Int(20170816), + Literal::Int(20170817)}); + + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + ResidualEvaluator::Make(spec, schema, expr, true)); + + PartitionValues partition_data1(Literal::Int(20170815)); + ICEBERG_UNWRAP_OR_FAIL(auto residual1, evaluator->ResidualFor(partition_data1)); + EXPECT_EQ(residual1->op(), Expression::Operation::kTrue); + + PartitionValues partition_data2(Literal::Int(20180815)); + ICEBERG_UNWRAP_OR_FAIL(auto residual2, evaluator->ResidualFor(partition_data2)); + EXPECT_EQ(residual2->op(), Expression::Operation::kFalse); +} + +TEST_F(ResidualEvaluatorTest, NotIn) { + auto schema = std::make_shared( + std::vector{SchemaField::MakeOptional(50, "dateint", int32()), + SchemaField::MakeOptional(51, "hour", int32())}, + std::nullopt); + + auto identity_transform = Transform::Identity(); + PartitionField pt_field(50, 1000, "dateint", identity_transform); + ICEBERG_UNWRAP_OR_FAIL(auto spec_unique, + PartitionSpec::Make(*schema, 0, {pt_field}, false)); + auto spec = std::shared_ptr(spec_unique.release()); + + auto expr = Expressions::NotIn( + "dateint", + {Literal::Int(20170815), Literal::Int(20170816), Literal::Int(20170817)}); + + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + ResidualEvaluator::Make(spec, schema, expr, true)); + + PartitionValues partition_data1(Literal::Int(20180815)); + ICEBERG_UNWRAP_OR_FAIL(auto residual1, evaluator->ResidualFor(partition_data1)); + EXPECT_EQ(residual1->op(), Expression::Operation::kTrue); + + PartitionValues partition_data2(Literal::Int(20170815)); + ICEBERG_UNWRAP_OR_FAIL(auto residual2, evaluator->ResidualFor(partition_data2)); + EXPECT_EQ(residual2->op(), Expression::Operation::kFalse); +} + +TEST_F(ResidualEvaluatorTest, IsNaN) { + auto schema = std::make_shared( + std::vector{SchemaField::MakeOptional(50, "double", float64()), + SchemaField::MakeOptional(51, "float", float32())}, + std::nullopt); + + // Test double field + auto identity_transform = Transform::Identity(); + PartitionField pt_field_double(50, 1000, "double", identity_transform); + ICEBERG_UNWRAP_OR_FAIL(auto spec_double_unique, + PartitionSpec::Make(*schema, 0, {pt_field_double}, false)); + auto spec_double = std::shared_ptr(spec_double_unique.release()); + + auto expr_double = Expressions::IsNaN("double"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator_double, + ResidualEvaluator::Make(spec_double, schema, expr_double, true)); + + PartitionValues partition_data_nan_double(Literal::Double(std::nan(""))); + ICEBERG_UNWRAP_OR_FAIL(auto residual_nan_double, + evaluator_double->ResidualFor(partition_data_nan_double)); + EXPECT_EQ(residual_nan_double->op(), Expression::Operation::kTrue); + + PartitionValues partition_data_double(Literal::Double(2.0)); + ICEBERG_UNWRAP_OR_FAIL(auto residual_double, + evaluator_double->ResidualFor(partition_data_double)); + EXPECT_EQ(residual_double->op(), Expression::Operation::kFalse); + + // Test float field + PartitionField pt_field_float(51, 1001, "float", identity_transform); + ICEBERG_UNWRAP_OR_FAIL(auto spec_float_unique, + PartitionSpec::Make(*schema, 0, {pt_field_float}, false)); + auto spec_float = std::shared_ptr(spec_float_unique.release()); + + auto expr_float = Expressions::IsNaN("float"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator_float, + ResidualEvaluator::Make(spec_float, schema, expr_float, true)); + + PartitionValues partition_data_nan_float(Literal::Float(std::nanf(""))); + ICEBERG_UNWRAP_OR_FAIL(auto residual_nan_float, + evaluator_float->ResidualFor(partition_data_nan_float)); + EXPECT_EQ(residual_nan_float->op(), Expression::Operation::kTrue); + + PartitionValues partition_data_float(Literal::Float(3.0f)); + ICEBERG_UNWRAP_OR_FAIL(auto residual_float, + evaluator_float->ResidualFor(partition_data_float)); + EXPECT_EQ(residual_float->op(), Expression::Operation::kFalse); +} + +TEST_F(ResidualEvaluatorTest, NotNaN) { + auto schema = std::make_shared( + std::vector{SchemaField::MakeOptional(50, "double", float64()), + SchemaField::MakeOptional(51, "float", float32())}, + std::nullopt); + + // Test double field + auto identity_transform = Transform::Identity(); + PartitionField pt_field_double(50, 1000, "double", identity_transform); + ICEBERG_UNWRAP_OR_FAIL(auto spec_double_unique, + PartitionSpec::Make(*schema, 0, {pt_field_double}, false)); + auto spec_double = std::shared_ptr(spec_double_unique.release()); + + auto expr_double = Expressions::NotNaN("double"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator_double, + ResidualEvaluator::Make(spec_double, schema, expr_double, true)); + + PartitionValues partition_data_nan_double(Literal::Double(std::nan(""))); + ICEBERG_UNWRAP_OR_FAIL(auto residual_nan_double, + evaluator_double->ResidualFor(partition_data_nan_double)); + EXPECT_EQ(residual_nan_double->op(), Expression::Operation::kFalse); + + PartitionValues partition_data_double(Literal::Double(2.0)); + ICEBERG_UNWRAP_OR_FAIL(auto residual_double, + evaluator_double->ResidualFor(partition_data_double)); + EXPECT_EQ(residual_double->op(), Expression::Operation::kTrue); + + // Test float field + PartitionField pt_field_float(51, 1001, "float", identity_transform); + ICEBERG_UNWRAP_OR_FAIL(auto spec_float_unique, + PartitionSpec::Make(*schema, 0, {pt_field_float}, false)); + auto spec_float = std::shared_ptr(spec_float_unique.release()); + + auto expr_float = Expressions::NotNaN("float"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator_float, + ResidualEvaluator::Make(spec_float, schema, expr_float, true)); + + PartitionValues partition_data_nan_float(Literal::Float(std::nanf(""))); + ICEBERG_UNWRAP_OR_FAIL(auto residual_nan_float, + evaluator_float->ResidualFor(partition_data_nan_float)); + EXPECT_EQ(residual_nan_float->op(), Expression::Operation::kFalse); + + PartitionValues partition_data_float(Literal::Float(3.0f)); + ICEBERG_UNWRAP_OR_FAIL(auto residual_float, + evaluator_float->ResidualFor(partition_data_float)); + EXPECT_EQ(residual_float->op(), Expression::Operation::kTrue); +} + +TEST_F(ResidualEvaluatorTest, IntegerTruncateTransformResiduals) { + auto schema = std::make_shared( + std::vector{SchemaField::MakeOptional(50, "value", int32())}, + std::nullopt); + + // Valid partitions would be 0, 10, 20...90, 100 etc. + auto truncate_transform = Transform::Truncate(10); + PartitionField pt_field(50, 1000, "value", truncate_transform); + ICEBERG_UNWRAP_OR_FAIL(auto spec_unique, + PartitionSpec::Make(*schema, 0, {pt_field}, false)); + auto spec = std::shared_ptr(spec_unique.release()); + + // Less than lower bound + AssertResidualOp(spec, schema, Expressions::LessThan("value", Literal::Int(100)), + Literal::Int(110), Expression::Operation::kFalse); + AssertResidualOp(spec, schema, Expressions::LessThan("value", Literal::Int(100)), + Literal::Int(100), Expression::Operation::kFalse); + AssertResidualOp(spec, schema, Expressions::LessThan("value", Literal::Int(100)), + Literal::Int(90), Expression::Operation::kTrue); + + // Less than upper bound + AssertResidualOp(spec, schema, Expressions::LessThan("value", Literal::Int(99)), + Literal::Int(100), Expression::Operation::kFalse); + AssertResidualPredicate(spec, schema, Expressions::LessThan("value", Literal::Int(99)), + Literal::Int(90)); + AssertResidualOp(spec, schema, Expressions::LessThan("value", Literal::Int(99)), + Literal::Int(80), Expression::Operation::kTrue); + + // Less than equals lower bound + AssertResidualOp(spec, schema, Expressions::LessThanOrEqual("value", Literal::Int(100)), + Literal::Int(110), Expression::Operation::kFalse); + AssertResidualPredicate(spec, schema, + Expressions::LessThanOrEqual("value", Literal::Int(100)), + Literal::Int(100)); + AssertResidualOp(spec, schema, Expressions::LessThanOrEqual("value", Literal::Int(100)), + Literal::Int(90), Expression::Operation::kTrue); + + // Less than equals upper bound + AssertResidualOp(spec, schema, Expressions::LessThanOrEqual("value", Literal::Int(99)), + Literal::Int(100), Expression::Operation::kFalse); + AssertResidualOp(spec, schema, Expressions::LessThanOrEqual("value", Literal::Int(99)), + Literal::Int(90), Expression::Operation::kTrue); + AssertResidualOp(spec, schema, Expressions::LessThanOrEqual("value", Literal::Int(99)), + Literal::Int(80), Expression::Operation::kTrue); + + // Greater than lower bound + AssertResidualOp(spec, schema, Expressions::GreaterThan("value", Literal::Int(100)), + Literal::Int(110), Expression::Operation::kTrue); + AssertResidualPredicate(spec, schema, + Expressions::GreaterThan("value", Literal::Int(100)), + Literal::Int(100)); + AssertResidualOp(spec, schema, Expressions::GreaterThan("value", Literal::Int(100)), + Literal::Int(90), Expression::Operation::kFalse); + + // Greater than upper bound + AssertResidualOp(spec, schema, Expressions::GreaterThan("value", Literal::Int(99)), + Literal::Int(100), Expression::Operation::kTrue); + AssertResidualOp(spec, schema, Expressions::GreaterThan("value", Literal::Int(99)), + Literal::Int(90), Expression::Operation::kFalse); + AssertResidualOp(spec, schema, Expressions::GreaterThan("value", Literal::Int(99)), + Literal::Int(80), Expression::Operation::kFalse); + + // Greater than equals lower bound + AssertResidualOp(spec, schema, + Expressions::GreaterThanOrEqual("value", Literal::Int(100)), + Literal::Int(110), Expression::Operation::kTrue); + AssertResidualOp(spec, schema, + Expressions::GreaterThanOrEqual("value", Literal::Int(100)), + Literal::Int(100), Expression::Operation::kTrue); + AssertResidualOp(spec, schema, + Expressions::GreaterThanOrEqual("value", Literal::Int(100)), + Literal::Int(90), Expression::Operation::kFalse); + + // Greater than equals upper bound + AssertResidualOp(spec, schema, + Expressions::GreaterThanOrEqual("value", Literal::Int(99)), + Literal::Int(100), Expression::Operation::kTrue); + AssertResidualPredicate(spec, schema, + Expressions::GreaterThanOrEqual("value", Literal::Int(99)), + Literal::Int(90)); + AssertResidualOp(spec, schema, + Expressions::GreaterThanOrEqual("value", Literal::Int(99)), + Literal::Int(80), Expression::Operation::kFalse); + + // Equal lower bound + AssertResidualOp(spec, schema, Expressions::Equal("value", Literal::Int(100)), + Literal::Int(110), Expression::Operation::kFalse); + AssertResidualPredicate(spec, schema, Expressions::Equal("value", Literal::Int(100)), + Literal::Int(100)); + AssertResidualOp(spec, schema, Expressions::Equal("value", Literal::Int(100)), + Literal::Int(90), Expression::Operation::kFalse); + + // Equal upper bound + AssertResidualOp(spec, schema, Expressions::Equal("value", Literal::Int(99)), + Literal::Int(100), Expression::Operation::kFalse); + AssertResidualPredicate(spec, schema, Expressions::Equal("value", Literal::Int(99)), + Literal::Int(90)); + AssertResidualOp(spec, schema, Expressions::Equal("value", Literal::Int(99)), + Literal::Int(80), Expression::Operation::kFalse); + + // Not equal lower bound + AssertResidualOp(spec, schema, Expressions::NotEqual("value", Literal::Int(100)), + Literal::Int(110), Expression::Operation::kTrue); + AssertResidualPredicate(spec, schema, Expressions::NotEqual("value", Literal::Int(100)), + Literal::Int(100)); + AssertResidualOp(spec, schema, Expressions::NotEqual("value", Literal::Int(100)), + Literal::Int(90), Expression::Operation::kTrue); + + // Not equal upper bound + AssertResidualOp(spec, schema, Expressions::NotEqual("value", Literal::Int(99)), + Literal::Int(100), Expression::Operation::kTrue); + AssertResidualPredicate(spec, schema, Expressions::NotEqual("value", Literal::Int(99)), + Literal::Int(90)); + AssertResidualOp(spec, schema, Expressions::NotEqual("value", Literal::Int(99)), + Literal::Int(80), Expression::Operation::kTrue); +} + +TEST_F(ResidualEvaluatorTest, StringTruncateTransformResiduals) { + auto schema = std::make_shared( + std::vector{SchemaField::MakeOptional(50, "value", string())}, + std::nullopt); + + // Valid partitions would be two letter strings for eg: ab, bc etc + auto truncate_transform = Transform::Truncate(2); + PartitionField pt_field(50, 1000, "value", truncate_transform); + ICEBERG_UNWRAP_OR_FAIL(auto spec_unique, + PartitionSpec::Make(*schema, 0, {pt_field}, false)); + auto spec = std::shared_ptr(spec_unique.release()); + + // Less than + AssertResidualOp(spec, schema, Expressions::LessThan("value", Literal::String("bcd")), + Literal::String("ab"), Expression::Operation::kTrue); + AssertResidualPredicate(spec, schema, + Expressions::LessThan("value", Literal::String("bcd")), + Literal::String("bc")); + AssertResidualOp(spec, schema, Expressions::LessThan("value", Literal::String("bcd")), + Literal::String("cd"), Expression::Operation::kFalse); + + // Less than equals + AssertResidualOp(spec, schema, + Expressions::LessThanOrEqual("value", Literal::String("bcd")), + Literal::String("ab"), Expression::Operation::kTrue); + AssertResidualPredicate(spec, schema, + Expressions::LessThanOrEqual("value", Literal::String("bcd")), + Literal::String("bc")); + AssertResidualOp(spec, schema, + Expressions::LessThanOrEqual("value", Literal::String("bcd")), + Literal::String("cd"), Expression::Operation::kFalse); + + // Greater than + AssertResidualOp(spec, schema, + Expressions::GreaterThan("value", Literal::String("bcd")), + Literal::String("ab"), Expression::Operation::kFalse); + AssertResidualPredicate(spec, schema, + Expressions::GreaterThan("value", Literal::String("bcd")), + Literal::String("bc")); + AssertResidualOp(spec, schema, + Expressions::GreaterThan("value", Literal::String("bcd")), + Literal::String("cd"), Expression::Operation::kTrue); + + // Greater than equals + AssertResidualOp(spec, schema, + Expressions::GreaterThanOrEqual("value", Literal::String("bcd")), + Literal::String("ab"), Expression::Operation::kFalse); + AssertResidualPredicate( + spec, schema, Expressions::GreaterThanOrEqual("value", Literal::String("bcd")), + Literal::String("bc")); + AssertResidualOp(spec, schema, + Expressions::GreaterThanOrEqual("value", Literal::String("bcd")), + Literal::String("cd"), Expression::Operation::kTrue); + + // Equal + AssertResidualOp(spec, schema, Expressions::Equal("value", Literal::String("bcd")), + Literal::String("ab"), Expression::Operation::kFalse); + AssertResidualPredicate(spec, schema, + Expressions::Equal("value", Literal::String("bcd")), + Literal::String("bc")); + AssertResidualOp(spec, schema, Expressions::Equal("value", Literal::String("bcd")), + Literal::String("cd"), Expression::Operation::kFalse); + + // Not equal + AssertResidualOp(spec, schema, Expressions::NotEqual("value", Literal::String("bcd")), + Literal::String("ab"), Expression::Operation::kTrue); + AssertResidualPredicate(spec, schema, + Expressions::NotEqual("value", Literal::String("bcd")), + Literal::String("bc")); + AssertResidualOp(spec, schema, Expressions::NotEqual("value", Literal::String("bcd")), + Literal::String("cd"), Expression::Operation::kTrue); + + // Starts with + AssertResidualOp(spec, schema, Expressions::StartsWith("value", "bcd"), + Literal::String("ab"), Expression::Operation::kFalse); + AssertResidualPredicate(spec, schema, Expressions::StartsWith("value", "bcd"), + Literal::String("bc")); + AssertResidualOp(spec, schema, Expressions::StartsWith("value", "bcd"), + Literal::String("cd"), Expression::Operation::kFalse); + AssertResidualPredicate(spec, schema, Expressions::StartsWith("value", "bcd"), + Literal::String("bcdd")); + + // Not starts with + AssertResidualOp(spec, schema, Expressions::NotStartsWith("value", "bcd"), + Literal::String("ab"), Expression::Operation::kTrue); + AssertResidualPredicate(spec, schema, Expressions::NotStartsWith("value", "bcd"), + Literal::String("bc")); + AssertResidualOp(spec, schema, Expressions::NotStartsWith("value", "bcd"), + Literal::String("cd"), Expression::Operation::kTrue); + AssertResidualPredicate(spec, schema, Expressions::NotStartsWith("value", "bcd"), + Literal::String("bcd")); + AssertResidualPredicate(spec, schema, Expressions::NotStartsWith("value", "bcd"), + Literal::String("bcdd")); +} + +} // namespace iceberg From 617a4cdf42947977899ed1d77be5566da212cb2e Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sun, 7 Dec 2025 21:54:26 +0800 Subject: [PATCH 2/7] fix: cpp-linter --- src/iceberg/expression/residual_evaluator.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/iceberg/expression/residual_evaluator.cc b/src/iceberg/expression/residual_evaluator.cc index f86f2658a..4001f4548 100644 --- a/src/iceberg/expression/residual_evaluator.cc +++ b/src/iceberg/expression/residual_evaluator.cc @@ -524,15 +524,22 @@ namespace { class UnpartitionedResidualEvaluator : public ResidualEvaluator { public: explicit UnpartitionedResidualEvaluator(std::shared_ptr expr) - : ResidualEvaluator(std::move(expr), PartitionSpec::Unpartitioned(), nullptr, + : ResidualEvaluator(std::move(expr), PartitionSpec::Unpartitioned(), empty_schema_, true) {} Result> ResidualFor( const StructLike& /*partition_data*/) const override { return expr_; } + + private: + // Store an empty schema to avoid dangling reference when passing to base class + static const std::shared_ptr empty_schema_; }; +// Static member definition +const std::shared_ptr UnpartitionedResidualEvaluator::empty_schema_; + } // namespace Result> ResidualEvaluator::Unpartitioned( From 6f06961b653aac72e31a66ef1b65d2842b859d8c Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Mon, 8 Dec 2025 22:05:19 +0800 Subject: [PATCH 3/7] fix: review comment --- src/iceberg/expression/residual_evaluator.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/iceberg/expression/residual_evaluator.cc b/src/iceberg/expression/residual_evaluator.cc index 4001f4548..10e83f06e 100644 --- a/src/iceberg/expression/residual_evaluator.cc +++ b/src/iceberg/expression/residual_evaluator.cc @@ -19,8 +19,6 @@ #include "iceberg/expression/residual_evaluator.h" -#include - #include "iceberg/expression/binder.h" #include "iceberg/expression/expression.h" #include "iceberg/expression/expression_visitor.h" @@ -461,7 +459,7 @@ Result> ResidualVisitor::Predicate( } else { // If the result is not a predicate, then it must be a constant like alwaysTrue // or alwaysFalse - strict_result = bound_strict; + strict_result = std::move(bound_strict); } } @@ -492,7 +490,7 @@ Result> ResidualVisitor::Predicate( } else { // If the result is not a predicate, then it must be a constant like alwaysTrue // or alwaysFalse - inclusive_result = bound_inclusive; + inclusive_result = std::move(bound_inclusive); } } From 0e2180b6f6df579e3a004b9f3105dcd567c42add Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 10 Dec 2025 21:35:35 +0800 Subject: [PATCH 4/7] fix: review comments --- src/iceberg/expression/residual_evaluator.cc | 406 +++++-------------- src/iceberg/expression/residual_evaluator.h | 22 +- src/iceberg/partition_spec.cc | 22 +- src/iceberg/partition_spec.h | 17 +- src/iceberg/test/residual_evaluator_test.cc | 26 +- 5 files changed, 166 insertions(+), 327 deletions(-) diff --git a/src/iceberg/expression/residual_evaluator.cc b/src/iceberg/expression/residual_evaluator.cc index 10e83f06e..eb156ecfd 100644 --- a/src/iceberg/expression/residual_evaluator.cc +++ b/src/iceberg/expression/residual_evaluator.cc @@ -22,125 +22,164 @@ #include "iceberg/expression/binder.h" #include "iceberg/expression/expression.h" #include "iceberg/expression/expression_visitor.h" -#include "iceberg/expression/expressions.h" #include "iceberg/expression/predicate.h" #include "iceberg/partition_spec.h" #include "iceberg/row/struct_like.h" #include "iceberg/schema.h" #include "iceberg/schema_internal.h" #include "iceberg/transform.h" -#include "iceberg/util/checked_cast.h" #include "iceberg/util/macros.h" namespace iceberg { +namespace { + +std::shared_ptr always_true() { return True::Instance(); } +std::shared_ptr always_false() { return False::Instance(); } + class ResidualVisitor : public BoundVisitor> { public: - ResidualVisitor(const std::shared_ptr& spec, - const std::shared_ptr& schema, const StructLike& partition_data, - bool case_sensitive) - : spec_(spec), - schema_(schema), - partition_data_(partition_data), - case_sensitive_(case_sensitive) { - ICEBERG_ASSIGN_OR_THROW(auto partition_type_, spec_->PartitionType(*schema)); - partition_schema_ = FromStructType(std::move(*partition_type_), std::nullopt); + static Result Make(const PartitionSpec& spec, const Schema& schema, + const StructLike& partition_data, + bool case_sensitive) { + ICEBERG_ASSIGN_OR_RAISE(auto partition_type, spec.PartitionType(schema)); + auto partition_schema = FromStructType(std::move(*partition_type), std::nullopt); + return ResidualVisitor(spec, schema, std::move(partition_schema), partition_data, + case_sensitive); } - Result> AlwaysTrue() override { - return Expressions::AlwaysTrue(); - } + Result> AlwaysTrue() override { return always_true(); } - Result> AlwaysFalse() override { - return Expressions::AlwaysFalse(); - } + Result> AlwaysFalse() override { return always_false(); } Result> Not( const std::shared_ptr& child_result) override { - return Expressions::Not(child_result); + return Not::MakeFolded(child_result); } Result> And( const std::shared_ptr& left_result, const std::shared_ptr& right_result) override { - return Expressions::And(left_result, right_result); + return And::MakeFolded(left_result, right_result); } Result> Or( const std::shared_ptr& left_result, const std::shared_ptr& right_result) override { - return Expressions::Or(left_result, right_result); + return Or::MakeFolded(left_result, right_result); } Result> IsNull( const std::shared_ptr& expr) override { - return IsNullImpl(expr); + return expr->Evaluate(partition_data_).transform([](const auto& value) { + return value.IsNull() ? always_true() : always_false(); + }); } Result> NotNull( const std::shared_ptr& expr) override { - return NotNullImpl(expr); + return expr->Evaluate(partition_data_).transform([](const auto& value) { + return value.IsNull() ? always_false() : always_true(); + }); } Result> IsNaN(const std::shared_ptr& expr) override { - return IsNaNImpl(expr); + return expr->Evaluate(partition_data_).transform([](const auto& value) { + return value.IsNaN() ? always_true() : always_false(); + }); } Result> NotNaN( const std::shared_ptr& expr) override { - return NotNaNImpl(expr); + return expr->Evaluate(partition_data_).transform([](const auto& value) { + return value.IsNaN() ? always_false() : always_true(); + }); } Result> Lt(const std::shared_ptr& expr, const Literal& lit) override { - return LtImpl(expr, lit); + return expr->Evaluate(partition_data_).transform([&lit](const auto& value) { + return value < lit ? always_true() : always_false(); + }); } Result> LtEq(const std::shared_ptr& expr, const Literal& lit) override { - return LtEqImpl(expr, lit); + return expr->Evaluate(partition_data_).transform([&lit](const auto& value) { + return value <= lit ? always_true() : always_false(); + }); } Result> Gt(const std::shared_ptr& expr, const Literal& lit) override { - return GtImpl(expr, lit); + return expr->Evaluate(partition_data_).transform([&lit](const auto& value) { + return value > lit ? always_true() : always_false(); + }); } Result> GtEq(const std::shared_ptr& expr, const Literal& lit) override { - return GtEqImpl(expr, lit); + return expr->Evaluate(partition_data_).transform([&lit](const auto& value) { + return value >= lit ? always_true() : always_false(); + }); } Result> Eq(const std::shared_ptr& expr, const Literal& lit) override { - return EqImpl(expr, lit); + return expr->Evaluate(partition_data_).transform([&lit](const auto& value) { + return value == lit ? always_true() : always_false(); + }); } Result> NotEq(const std::shared_ptr& expr, const Literal& lit) override { - return NotEqImpl(expr, lit); + return expr->Evaluate(partition_data_).transform([&lit](const auto& value) { + return value != lit ? always_true() : always_false(); + }); } Result> StartsWith(const std::shared_ptr& expr, const Literal& lit) override { - return StartsWithImpl(expr, lit); + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + + if (!std::holds_alternative(value.value()) || + !std::holds_alternative(lit.value())) { + return InvalidExpression("Both value and literal should be strings"); + } + + const auto& str_value = std::get(value.value()); + const auto& str_prefix = std::get(lit.value()); + return str_value.starts_with(str_prefix) ? always_true() : always_false(); } Result> NotStartsWith(const std::shared_ptr& expr, const Literal& lit) override { - return NotStartsWithImpl(expr, lit); + ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); + + if (!std::holds_alternative(value.value()) || + !std::holds_alternative(lit.value())) { + return InvalidExpression("Both value and literal should be strings"); + } + + const auto& str_value = std::get(value.value()); + const auto& str_prefix = std::get(lit.value()); + return str_value.starts_with(str_prefix) ? always_false() : always_true(); } Result> In( const std::shared_ptr& expr, const BoundSetPredicate::LiteralSet& literal_set) override { - return InImpl(expr, literal_set); + return expr->Evaluate(partition_data_).transform([&literal_set](const auto& value) { + return literal_set.contains(value) ? always_true() : always_false(); + }); } Result> NotIn( const std::shared_ptr& expr, const BoundSetPredicate::LiteralSet& literal_set) override { - return NotInImpl(expr, literal_set); + return expr->Evaluate(partition_data_).transform([&literal_set](const auto& value) { + return literal_set.contains(value) ? always_false() : always_true(); + }); } Result> Predicate( @@ -148,7 +187,7 @@ class ResidualVisitor : public BoundVisitor> { Result> Predicate( const std::shared_ptr& pred) override { - ICEBERG_ASSIGN_OR_RAISE(auto bound_predicate, pred->Bind(*schema_, case_sensitive_)); + ICEBERG_ASSIGN_OR_RAISE(auto bound_predicate, pred->Bind(schema_, case_sensitive_)); if (bound_predicate->is_bound_predicate()) { ICEBERG_ASSIGN_OR_RAISE( auto residual, @@ -164,251 +203,22 @@ class ResidualVisitor : public BoundVisitor> { } private: - // Helper methods for bound predicates - Result> EvaluateBoundPredicate( - const std::shared_ptr& pred); - - Result> IsNullImpl(const std::shared_ptr& expr); - Result> NotNullImpl(const std::shared_ptr& expr); - Result> IsNaNImpl(const std::shared_ptr& expr); - Result> NotNaNImpl(const std::shared_ptr& expr); - Result> LtImpl(const std::shared_ptr& expr, - const Literal& lit); - Result> LtEqImpl(const std::shared_ptr& expr, - const Literal& lit); - Result> GtImpl(const std::shared_ptr& expr, - const Literal& lit); - Result> GtEqImpl(const std::shared_ptr& expr, - const Literal& lit); - Result> EqImpl(const std::shared_ptr& expr, - const Literal& lit); - Result> NotEqImpl(const std::shared_ptr& expr, - const Literal& lit); - Result> InImpl( - const std::shared_ptr& expr, - const BoundSetPredicate::LiteralSet& literal_set); - Result> NotInImpl( - const std::shared_ptr& expr, - const BoundSetPredicate::LiteralSet& literal_set); - Result> StartsWithImpl(const std::shared_ptr& expr, - const Literal& lit); - Result> NotStartsWithImpl( - const std::shared_ptr& expr, const Literal& lit); - - std::shared_ptr spec_; - std::shared_ptr schema_; - std::shared_ptr partition_schema_; + ResidualVisitor(const PartitionSpec& spec, const Schema& schema, + std::unique_ptr partition_schema, + const StructLike& partition_data, bool case_sensitive) + : spec_(spec), + schema_(schema), + partition_schema_(std::move(partition_schema)), + partition_data_(partition_data), + case_sensitive_(case_sensitive) {} + + const PartitionSpec& spec_; + const Schema& schema_; + std::unique_ptr partition_schema_; const StructLike& partition_data_; bool case_sensitive_; }; - -Result> ResidualVisitor::IsNullImpl( - const std::shared_ptr& expr) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - if (value.IsNull()) { - return Expressions::AlwaysTrue(); - } - return Expressions::AlwaysFalse(); -} - -Result> ResidualVisitor::NotNullImpl( - const std::shared_ptr& expr) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - if (value.IsNull()) { - return Expressions::AlwaysFalse(); - } - return Expressions::AlwaysTrue(); -} - -Result> ResidualVisitor::IsNaNImpl( - const std::shared_ptr& expr) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - if (value.IsNaN()) { - return Expressions::AlwaysTrue(); - } - return Expressions::AlwaysFalse(); -} - -Result> ResidualVisitor::NotNaNImpl( - const std::shared_ptr& expr) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - if (value.IsNaN()) { - return Expressions::AlwaysFalse(); - } - return Expressions::AlwaysTrue(); -} - -Result> ResidualVisitor::LtImpl( - const std::shared_ptr& expr, const Literal& lit) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - if (value < lit) { - return Expressions::AlwaysTrue(); - } - return Expressions::AlwaysFalse(); -} - -Result> ResidualVisitor::LtEqImpl( - const std::shared_ptr& expr, const Literal& lit) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - if (value <= lit) { - return Expressions::AlwaysTrue(); - } - return Expressions::AlwaysFalse(); -} - -Result> ResidualVisitor::GtImpl( - const std::shared_ptr& expr, const Literal& lit) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - if (value > lit) { - return Expressions::AlwaysTrue(); - } - return Expressions::AlwaysFalse(); -} - -Result> ResidualVisitor::GtEqImpl( - const std::shared_ptr& expr, const Literal& lit) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - if (value >= lit) { - return Expressions::AlwaysTrue(); - } - return Expressions::AlwaysFalse(); -} - -Result> ResidualVisitor::EqImpl( - const std::shared_ptr& expr, const Literal& lit) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - if (value == lit) { - return Expressions::AlwaysTrue(); - } - return Expressions::AlwaysFalse(); -} - -Result> ResidualVisitor::NotEqImpl( - const std::shared_ptr& expr, const Literal& lit) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - if (value != lit) { - return Expressions::AlwaysTrue(); - } - return Expressions::AlwaysFalse(); -} - -Result> ResidualVisitor::InImpl( - const std::shared_ptr& expr, - const BoundSetPredicate::LiteralSet& literal_set) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - if (literal_set.contains(value)) { - return Expressions::AlwaysTrue(); - } - return Expressions::AlwaysFalse(); -} - -Result> ResidualVisitor::NotInImpl( - const std::shared_ptr& expr, - const BoundSetPredicate::LiteralSet& literal_set) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - if (literal_set.contains(value)) { - return Expressions::AlwaysFalse(); - } - return Expressions::AlwaysTrue(); -} - -Result> ResidualVisitor::StartsWithImpl( - const std::shared_ptr& expr, const Literal& lit) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - - // Both value and literal should be strings - if (!std::holds_alternative(value.value()) || - !std::holds_alternative(lit.value())) { - return Expressions::AlwaysFalse(); - } - - const auto& str_value = std::get(value.value()); - const auto& str_prefix = std::get(lit.value()); - if (str_value.starts_with(str_prefix)) { - return Expressions::AlwaysTrue(); - } - return Expressions::AlwaysFalse(); -} - -Result> ResidualVisitor::NotStartsWithImpl( - const std::shared_ptr& expr, const Literal& lit) { - ICEBERG_ASSIGN_OR_RAISE(auto value, expr->Evaluate(partition_data_)); - - // Both value and literal should be strings - if (!std::holds_alternative(value.value()) || - !std::holds_alternative(lit.value())) { - return Expressions::AlwaysTrue(); - } - - const auto& str_value = std::get(value.value()); - const auto& str_prefix = std::get(lit.value()); - if (str_value.starts_with(str_prefix)) { - return Expressions::AlwaysFalse(); - } - return Expressions::AlwaysTrue(); -} - -Result> ResidualVisitor::EvaluateBoundPredicate( - const std::shared_ptr& pred) { - ICEBERG_DCHECK(pred != nullptr, "BoundPredicate cannot be null"); - - switch (pred->kind()) { - case BoundPredicate::Kind::kUnary: { - switch (pred->op()) { - case Expression::Operation::kIsNull: - return IsNullImpl(pred->term()); - case Expression::Operation::kNotNull: - return NotNullImpl(pred->term()); - case Expression::Operation::kIsNan: - return IsNaNImpl(pred->term()); - case Expression::Operation::kNotNan: - return NotNaNImpl(pred->term()); - default: - return InvalidExpression("Invalid operation for BoundUnaryPredicate: {}", - ToString(pred->op())); - } - } - case BoundPredicate::Kind::kLiteral: { - const auto& literal_pred = - internal::checked_cast(*pred); - switch (pred->op()) { - case Expression::Operation::kLt: - return LtImpl(pred->term(), literal_pred.literal()); - case Expression::Operation::kLtEq: - return LtEqImpl(pred->term(), literal_pred.literal()); - case Expression::Operation::kGt: - return GtImpl(pred->term(), literal_pred.literal()); - case Expression::Operation::kGtEq: - return GtEqImpl(pred->term(), literal_pred.literal()); - case Expression::Operation::kEq: - return EqImpl(pred->term(), literal_pred.literal()); - case Expression::Operation::kNotEq: - return NotEqImpl(pred->term(), literal_pred.literal()); - case Expression::Operation::kStartsWith: - return StartsWithImpl(pred->term(), literal_pred.literal()); - case Expression::Operation::kNotStartsWith: - return NotStartsWithImpl(pred->term(), literal_pred.literal()); - default: - return InvalidExpression("Invalid operation for BoundLiteralPredicate: {}", - ToString(pred->op())); - } - } - case BoundPredicate::Kind::kSet: { - const auto& set_pred = internal::checked_cast(*pred); - switch (pred->op()) { - case Expression::Operation::kIn: - return InImpl(pred->term(), set_pred.literal_set()); - case Expression::Operation::kNotIn: - return NotInImpl(pred->term(), set_pred.literal_set()); - default: - return InvalidExpression("Invalid operation for BoundSetPredicate: {}", - ToString(pred->op())); - } - } - } - - return InvalidExpression("Unsupported bound predicate: {}", pred->ToString()); -} +} // namespace Result> ResidualVisitor::Predicate( const std::shared_ptr& pred) { @@ -425,22 +235,17 @@ Result> ResidualVisitor::Predicate( int32_t field_id = ref->field().field_id(); // Find partition fields that match this source field ID - std::vector matching_fields; - for (const auto& field : spec_->fields()) { - if (field.source_id() == field_id) { - matching_fields.push_back(&field); - } - } + auto matching_fields = spec_.GetFieldsBySourceId(field_id); - if (matching_fields.empty()) { + if (!matching_fields.has_value()) { // Not associated with a partition field, can't be evaluated return pred; } - for (const auto* part : matching_fields) { + for (const auto& part : matching_fields.value()) { // Check the strict projection - ICEBERG_ASSIGN_OR_RAISE(auto strict_projection, - part->transform()->ProjectStrict(part->name(), pred)); + ICEBERG_ASSIGN_OR_RAISE(auto strict_projection, part.get().transform()->ProjectStrict( + part.get().name(), pred)); std::shared_ptr strict_result = nullptr; if (strict_projection != nullptr) { @@ -454,7 +259,7 @@ Result> ResidualVisitor::Predicate( if (bound_strict->is_bound_predicate()) { // Evaluate the bound predicate against partition data ICEBERG_ASSIGN_OR_RAISE( - strict_result, EvaluateBoundPredicate( + strict_result, BoundVisitor::Predicate( std::dynamic_pointer_cast(bound_strict))); } else { // If the result is not a predicate, then it must be a constant like alwaysTrue @@ -465,12 +270,12 @@ Result> ResidualVisitor::Predicate( if (strict_result != nullptr && strict_result->op() == Expression::Operation::kTrue) { // If strict is true, returning true - return Expressions::AlwaysTrue(); + return always_true(); } // Check the inclusive projection ICEBERG_ASSIGN_OR_RAISE(auto inclusive_projection, - part->transform()->Project(part->name(), pred)); + part.get().transform()->Project(part.get().name(), pred)); std::shared_ptr inclusive_result = nullptr; if (inclusive_projection != nullptr) { @@ -485,7 +290,7 @@ Result> ResidualVisitor::Predicate( // Evaluate the bound predicate against partition data ICEBERG_ASSIGN_OR_RAISE( inclusive_result, - EvaluateBoundPredicate( + BoundVisitor::Predicate( std::dynamic_pointer_cast(bound_inclusive))); } else { // If the result is not a predicate, then it must be a constant like alwaysTrue @@ -497,7 +302,7 @@ Result> ResidualVisitor::Predicate( if (inclusive_result != nullptr && inclusive_result->op() == Expression::Operation::kFalse) { // If inclusive is false, returning false - return Expressions::AlwaysFalse(); + return always_false(); } } @@ -506,8 +311,7 @@ Result> ResidualVisitor::Predicate( } ResidualEvaluator::ResidualEvaluator(std::shared_ptr expr, - const std::shared_ptr& spec, - const std::shared_ptr& schema, + const PartitionSpec& spec, const Schema& schema, bool case_sensitive) : expr_(std::move(expr)), spec_(spec), @@ -522,8 +326,8 @@ namespace { class UnpartitionedResidualEvaluator : public ResidualEvaluator { public: explicit UnpartitionedResidualEvaluator(std::shared_ptr expr) - : ResidualEvaluator(std::move(expr), PartitionSpec::Unpartitioned(), empty_schema_, - true) {} + : ResidualEvaluator(std::move(expr), *PartitionSpec::Unpartitioned(), + *empty_schema_, true) {} Result> ResidualFor( const StructLike& /*partition_data*/) const override { @@ -547,9 +351,9 @@ Result> ResidualEvaluator::Unpartitioned( } Result> ResidualEvaluator::Make( - const std::shared_ptr& spec, const std::shared_ptr& schema, - std::shared_ptr expr, bool case_sensitive) { - if (spec->fields().empty()) { + std::shared_ptr expr, const PartitionSpec& spec, const Schema& schema, + bool case_sensitive) { + if (spec.fields().empty()) { return Unpartitioned(std::move(expr)); } return std::unique_ptr( @@ -558,7 +362,9 @@ Result> ResidualEvaluator::Make( Result> ResidualEvaluator::ResidualFor( const StructLike& partition_data) const { - ResidualVisitor visitor(spec_, schema_, partition_data, case_sensitive_); + ICEBERG_ASSIGN_OR_RAISE( + auto visitor, + ResidualVisitor::Make(spec_, schema_, partition_data, case_sensitive_)); return Visit, ResidualVisitor>(expr_, visitor); } diff --git a/src/iceberg/expression/residual_evaluator.h b/src/iceberg/expression/residual_evaluator.h index 77e4af160..60bf67f25 100644 --- a/src/iceberg/expression/residual_evaluator.h +++ b/src/iceberg/expression/residual_evaluator.h @@ -25,15 +25,11 @@ #include #include "iceberg/iceberg_export.h" -#include "iceberg/partition_spec.h" #include "iceberg/result.h" #include "iceberg/type_fwd.h" namespace iceberg { -class Expression; -class StructLike; - /// \brief Finds the residuals for an Expression using the partitions in the given /// PartitionSpec. /// @@ -59,14 +55,15 @@ class ICEBERG_EXPORT ResidualEvaluator { /// \brief Return a residual evaluator for a PartitionSpec and Expression. /// + /// \param expr An expression /// \param spec A partition spec /// \param schema The schema to bind expressions against - /// \param expr An expression /// \param case_sensitive Whether field name matching is case-sensitive /// \return A residual evaluator for the expression - static Result> Make( - const std::shared_ptr& spec, const std::shared_ptr& schema, - std::shared_ptr expr, bool case_sensitive = true); + static Result> Make(std::shared_ptr expr, + const PartitionSpec& spec, + const Schema& schema, + bool case_sensitive = true); ~ResidualEvaluator(); @@ -78,15 +75,14 @@ class ICEBERG_EXPORT ResidualEvaluator { const StructLike& partition_data) const; protected: - ResidualEvaluator(std::shared_ptr expr, - const std::shared_ptr& spec, - const std::shared_ptr& schema, bool case_sensitive); + ResidualEvaluator(std::shared_ptr expr, const PartitionSpec& spec, + const Schema& schema, bool case_sensitive); std::shared_ptr expr_; private: - const std::shared_ptr& spec_; - const std::shared_ptr& schema_; + const PartitionSpec& spec_; + const Schema& schema_; bool case_sensitive_; }; diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc index 0c2dda124..f199c37b0 100644 --- a/src/iceberg/partition_spec.cc +++ b/src/iceberg/partition_spec.cc @@ -60,7 +60,8 @@ int32_t PartitionSpec::spec_id() const { return spec_id_; } std::span PartitionSpec::fields() const { return fields_; } -Result> PartitionSpec::PartitionType(const Schema& schema) { +Result> PartitionSpec::PartitionType( + const Schema& schema) const { if (fields_.empty()) { return std::make_unique(std::vector{}); } @@ -154,6 +155,25 @@ Status PartitionSpec::Validate(const Schema& schema, bool allow_missing_fields) return {}; } +Result>> +PartitionSpec::GetFieldsBySourceId(int32_t source_id) const { + ICEBERG_ASSIGN_OR_RAISE(auto source_id_to_fields, source_id_to_fields_.Get(*this)); + if (auto it = source_id_to_fields.get().find(source_id); + it != source_id_to_fields.get().cend()) { + return it->second; + } + return NotFound("Cannot find partition fields for source id: {}", source_id); +} + +Result PartitionSpec::InitSourceIdToFieldsMap( + const PartitionSpec& self) { + SourceIdToFieldsMap source_id_to_fields; + for (const auto& field : self.fields_) { + source_id_to_fields[field.source_id()].emplace_back(std::cref(field)); + } + return source_id_to_fields; +} + Result> PartitionSpec::Make( const Schema& schema, int32_t spec_id, std::vector fields, bool allow_missing_fields, std::optional last_assigned_field_id) { diff --git a/src/iceberg/partition_spec.h b/src/iceberg/partition_spec.h index 12beb9c97..f851ce7fb 100644 --- a/src/iceberg/partition_spec.h +++ b/src/iceberg/partition_spec.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "iceberg/iceberg_export.h" @@ -34,6 +35,7 @@ #include "iceberg/result.h" #include "iceberg/type_fwd.h" #include "iceberg/util/formattable.h" +#include "iceberg/util/lazy.h" namespace iceberg { @@ -60,7 +62,7 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { std::span fields() const; /// \brief Get the partition type binding to the input schema. - Result> PartitionType(const Schema&); + Result> PartitionType(const Schema&) const; std::string ToString() const override; @@ -77,6 +79,13 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { /// \return Error status if the partition spec is invalid. Status Validate(const Schema& schema, bool allow_missing_fields) const; + /// \brief Get the partition fields by source ID. + /// \param source_id The id of the source field. + /// \return The partition fields by source ID, or NotFound if the source field is not + /// found. + Result>> GetFieldsBySourceId( + int32_t source_id) const; + /// \brief Create a PartitionSpec binding to a schema. /// \param schema The schema to bind the partition spec to. /// \param spec_id The spec ID. @@ -116,9 +125,15 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { /// \brief Compare two partition specs for equality. bool Equals(const PartitionSpec& other) const; + using SourceIdToFieldsMap = + std::unordered_map>>; + static Result InitSourceIdToFieldsMap(const PartitionSpec&); + const int32_t spec_id_; std::vector fields_; int32_t last_assigned_field_id_; + Lazy source_id_to_fields_; }; } // namespace iceberg diff --git a/src/iceberg/test/residual_evaluator_test.cc b/src/iceberg/test/residual_evaluator_test.cc index 9224f295d..bef17d2bc 100644 --- a/src/iceberg/test/residual_evaluator_test.cc +++ b/src/iceberg/test/residual_evaluator_test.cc @@ -49,7 +49,7 @@ class ResidualEvaluatorTest : public ::testing::Test { const Literal& partition_value, Expression::Operation expected_op) { ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - ResidualEvaluator::Make(spec, schema, pred, true)); + ResidualEvaluator::Make(pred, *spec, *schema, true)); PartitionValues partition_data(partition_value); ICEBERG_UNWRAP_OR_FAIL(auto residual, evaluator->ResidualFor(partition_data)); EXPECT_EQ(residual->op(), expected_op); @@ -61,7 +61,7 @@ class ResidualEvaluatorTest : public ::testing::Test { const std::shared_ptr& pred, const Literal& partition_value) { ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - ResidualEvaluator::Make(spec, schema, pred, true)); + ResidualEvaluator::Make(pred, *spec, *schema, true)); PartitionValues partition_data(partition_value); ICEBERG_UNWRAP_OR_FAIL(auto residual, evaluator->ResidualFor(partition_data)); ASSERT_TRUE(residual->is_unbound_predicate()); @@ -112,7 +112,7 @@ TEST_F(ResidualEvaluatorTest, IdentityTransformResiduals) { Expressions::GreaterThan("hour", Literal::Int(11)))); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - ResidualEvaluator::Make(spec, schema, expr, true)); + ResidualEvaluator::Make(expr, *spec, *schema, true)); // Equal to the upper date bound PartitionValues partition_data1(Literal::Int(20170815)); @@ -178,7 +178,7 @@ TEST_F(ResidualEvaluatorTest, CaseInsensitiveIdentityTransformResiduals) { Expressions::GreaterThan("hOUr", Literal::Int(11)))); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - ResidualEvaluator::Make(spec, schema, expr, false)); + ResidualEvaluator::Make(expr, *spec, *schema, false)); // Equal to the upper date bound PartitionValues partition_data1(Literal::Int(20170815)); @@ -262,7 +262,7 @@ TEST_F(ResidualEvaluatorTest, In) { Literal::Int(20170817)}); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - ResidualEvaluator::Make(spec, schema, expr, true)); + ResidualEvaluator::Make(expr, *spec, *schema, true)); PartitionValues partition_data1(Literal::Int(20170815)); ICEBERG_UNWRAP_OR_FAIL(auto residual1, evaluator->ResidualFor(partition_data1)); @@ -290,7 +290,7 @@ TEST_F(ResidualEvaluatorTest, NotIn) { {Literal::Int(20170815), Literal::Int(20170816), Literal::Int(20170817)}); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - ResidualEvaluator::Make(spec, schema, expr, true)); + ResidualEvaluator::Make(expr, *spec, *schema, true)); PartitionValues partition_data1(Literal::Int(20180815)); ICEBERG_UNWRAP_OR_FAIL(auto residual1, evaluator->ResidualFor(partition_data1)); @@ -315,8 +315,9 @@ TEST_F(ResidualEvaluatorTest, IsNaN) { auto spec_double = std::shared_ptr(spec_double_unique.release()); auto expr_double = Expressions::IsNaN("double"); - ICEBERG_UNWRAP_OR_FAIL(auto evaluator_double, - ResidualEvaluator::Make(spec_double, schema, expr_double, true)); + ICEBERG_UNWRAP_OR_FAIL( + auto evaluator_double, + ResidualEvaluator::Make(expr_double, *spec_double, *schema, true)); PartitionValues partition_data_nan_double(Literal::Double(std::nan(""))); ICEBERG_UNWRAP_OR_FAIL(auto residual_nan_double, @@ -336,7 +337,7 @@ TEST_F(ResidualEvaluatorTest, IsNaN) { auto expr_float = Expressions::IsNaN("float"); ICEBERG_UNWRAP_OR_FAIL(auto evaluator_float, - ResidualEvaluator::Make(spec_float, schema, expr_float, true)); + ResidualEvaluator::Make(expr_float, *spec_float, *schema, true)); PartitionValues partition_data_nan_float(Literal::Float(std::nanf(""))); ICEBERG_UNWRAP_OR_FAIL(auto residual_nan_float, @@ -363,8 +364,9 @@ TEST_F(ResidualEvaluatorTest, NotNaN) { auto spec_double = std::shared_ptr(spec_double_unique.release()); auto expr_double = Expressions::NotNaN("double"); - ICEBERG_UNWRAP_OR_FAIL(auto evaluator_double, - ResidualEvaluator::Make(spec_double, schema, expr_double, true)); + ICEBERG_UNWRAP_OR_FAIL( + auto evaluator_double, + ResidualEvaluator::Make(expr_double, *spec_double, *schema, true)); PartitionValues partition_data_nan_double(Literal::Double(std::nan(""))); ICEBERG_UNWRAP_OR_FAIL(auto residual_nan_double, @@ -384,7 +386,7 @@ TEST_F(ResidualEvaluatorTest, NotNaN) { auto expr_float = Expressions::NotNaN("float"); ICEBERG_UNWRAP_OR_FAIL(auto evaluator_float, - ResidualEvaluator::Make(spec_float, schema, expr_float, true)); + ResidualEvaluator::Make(expr_float, *spec_float, *schema, true)); PartitionValues partition_data_nan_float(Literal::Float(std::nanf(""))); ICEBERG_UNWRAP_OR_FAIL(auto residual_nan_float, From c62108581d3ffe849c60bff6dc9bd1f83d7f0cab Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Thu, 11 Dec 2025 00:07:41 +0800 Subject: [PATCH 5/7] fix: reference binding to null pointer --- src/iceberg/expression/residual_evaluator.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/iceberg/expression/residual_evaluator.cc b/src/iceberg/expression/residual_evaluator.cc index eb156ecfd..d33206bfc 100644 --- a/src/iceberg/expression/residual_evaluator.cc +++ b/src/iceberg/expression/residual_evaluator.cc @@ -340,7 +340,8 @@ class UnpartitionedResidualEvaluator : public ResidualEvaluator { }; // Static member definition -const std::shared_ptr UnpartitionedResidualEvaluator::empty_schema_; +const std::shared_ptr UnpartitionedResidualEvaluator::empty_schema_ = + std::make_shared(std::vector{}, std::nullopt); } // namespace From ca7d59f1acfe298b444e6c2fd89eb56ef21b81aa Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 11 Dec 2025 11:30:58 +0800 Subject: [PATCH 6/7] simplify a little bit --- src/iceberg/expression/residual_evaluator.cc | 76 ++++++++------------ src/iceberg/partition_spec.cc | 3 +- src/iceberg/partition_spec.h | 10 ++- 3 files changed, 35 insertions(+), 54 deletions(-) diff --git a/src/iceberg/expression/residual_evaluator.cc b/src/iceberg/expression/residual_evaluator.cc index d33206bfc..96eaf832a 100644 --- a/src/iceberg/expression/residual_evaluator.cc +++ b/src/iceberg/expression/residual_evaluator.cc @@ -19,7 +19,6 @@ #include "iceberg/expression/residual_evaluator.h" -#include "iceberg/expression/binder.h" #include "iceberg/expression/expression.h" #include "iceberg/expression/expression_visitor.h" #include "iceberg/expression/predicate.h" @@ -187,11 +186,11 @@ class ResidualVisitor : public BoundVisitor> { Result> Predicate( const std::shared_ptr& pred) override { - ICEBERG_ASSIGN_OR_RAISE(auto bound_predicate, pred->Bind(schema_, case_sensitive_)); - if (bound_predicate->is_bound_predicate()) { + ICEBERG_ASSIGN_OR_RAISE(auto bound, pred->Bind(schema_, case_sensitive_)); + if (bound->is_bound_predicate()) { ICEBERG_ASSIGN_OR_RAISE( auto residual, - Predicate(std::dynamic_pointer_cast(bound_predicate))); + Predicate(internal::checked_pointer_cast(bound))); if (residual->is_bound_predicate()) { // replace inclusive original unbound predicate return pred; @@ -199,7 +198,7 @@ class ResidualVisitor : public BoundVisitor> { return residual; } // if binding didn't result in a Predicate, return the expression - return bound_predicate; + return bound; } private: @@ -218,7 +217,6 @@ class ResidualVisitor : public BoundVisitor> { const StructLike& partition_data_; bool case_sensitive_; }; -} // namespace Result> ResidualVisitor::Predicate( const std::shared_ptr& pred) { @@ -230,37 +228,30 @@ Result> ResidualVisitor::Predicate( // have returned false, so the predicate can also be eliminated if the inclusive // projection evaluates to false. - // Get the field ID from the predicate's reference - const auto& ref = pred->reference(); - int32_t field_id = ref->field().field_id(); - - // Find partition fields that match this source field ID - auto matching_fields = spec_.GetFieldsBySourceId(field_id); - - if (!matching_fields.has_value()) { + // If there is no strict projection or if it evaluates to false, then return the + // predicate. + ICEBERG_ASSIGN_OR_RAISE( + auto parts, spec_.GetFieldsBySourceId(pred->reference()->field().field_id())); + if (parts.empty()) { // Not associated with a partition field, can't be evaluated return pred; } - for (const auto& part : matching_fields.value()) { + for (const auto& part : parts) { // Check the strict projection ICEBERG_ASSIGN_OR_RAISE(auto strict_projection, part.get().transform()->ProjectStrict( part.get().name(), pred)); std::shared_ptr strict_result = nullptr; if (strict_projection != nullptr) { - // Bind the projected predicate to partition type ICEBERG_ASSIGN_OR_RAISE( auto bound_strict, - Binder::Bind(*partition_schema_, - std::shared_ptr(std::move(strict_projection)), - case_sensitive_)); - + strict_projection->Bind(*partition_schema_, case_sensitive_)); if (bound_strict->is_bound_predicate()) { - // Evaluate the bound predicate against partition data ICEBERG_ASSIGN_OR_RAISE( - strict_result, BoundVisitor::Predicate( - std::dynamic_pointer_cast(bound_strict))); + strict_result, + BoundVisitor::Predicate( + internal::checked_pointer_cast(bound_strict))); } else { // If the result is not a predicate, then it must be a constant like alwaysTrue // or alwaysFalse @@ -279,19 +270,15 @@ Result> ResidualVisitor::Predicate( std::shared_ptr inclusive_result = nullptr; if (inclusive_projection != nullptr) { - // Bind the projected predicate to partition type ICEBERG_ASSIGN_OR_RAISE( auto bound_inclusive, - Binder::Bind(*partition_schema_, - std::shared_ptr(std::move(inclusive_projection)), - case_sensitive_)); + inclusive_projection->Bind(*partition_schema_, case_sensitive_)); if (bound_inclusive->is_bound_predicate()) { - // Evaluate the bound predicate against partition data ICEBERG_ASSIGN_OR_RAISE( inclusive_result, BoundVisitor::Predicate( - std::dynamic_pointer_cast(bound_inclusive))); + internal::checked_pointer_cast(bound_inclusive))); } else { // If the result is not a predicate, then it must be a constant like alwaysTrue // or alwaysFalse @@ -310,24 +297,12 @@ Result> ResidualVisitor::Predicate( return pred; } -ResidualEvaluator::ResidualEvaluator(std::shared_ptr expr, - const PartitionSpec& spec, const Schema& schema, - bool case_sensitive) - : expr_(std::move(expr)), - spec_(spec), - schema_(schema), - case_sensitive_(case_sensitive) {} - -ResidualEvaluator::~ResidualEvaluator() = default; - -namespace { - // Unpartitioned residual evaluator that always returns the original expression class UnpartitionedResidualEvaluator : public ResidualEvaluator { public: explicit UnpartitionedResidualEvaluator(std::shared_ptr expr) : ResidualEvaluator(std::move(expr), *PartitionSpec::Unpartitioned(), - *empty_schema_, true) {} + *kEmptySchema_, true) {} Result> ResidualFor( const StructLike& /*partition_data*/) const override { @@ -336,15 +311,22 @@ class UnpartitionedResidualEvaluator : public ResidualEvaluator { private: // Store an empty schema to avoid dangling reference when passing to base class - static const std::shared_ptr empty_schema_; + inline static const std::shared_ptr kEmptySchema_ = + std::make_shared(std::vector{}, std::nullopt); }; -// Static member definition -const std::shared_ptr UnpartitionedResidualEvaluator::empty_schema_ = - std::make_shared(std::vector{}, std::nullopt); - } // namespace +ResidualEvaluator::ResidualEvaluator(std::shared_ptr expr, + const PartitionSpec& spec, const Schema& schema, + bool case_sensitive) + : expr_(std::move(expr)), + spec_(spec), + schema_(schema), + case_sensitive_(case_sensitive) {} + +ResidualEvaluator::~ResidualEvaluator() = default; + Result> ResidualEvaluator::Unpartitioned( std::shared_ptr expr) { return std::unique_ptr( diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc index f199c37b0..b0f1144c1 100644 --- a/src/iceberg/partition_spec.cc +++ b/src/iceberg/partition_spec.cc @@ -162,7 +162,8 @@ PartitionSpec::GetFieldsBySourceId(int32_t source_id) const { it != source_id_to_fields.get().cend()) { return it->second; } - return NotFound("Cannot find partition fields for source id: {}", source_id); + // Note that it is not an error to not find any partition fields for a source id. + return std::vector{}; } Result PartitionSpec::InitSourceIdToFieldsMap( diff --git a/src/iceberg/partition_spec.h b/src/iceberg/partition_spec.h index f851ce7fb..7f8f67822 100644 --- a/src/iceberg/partition_spec.h +++ b/src/iceberg/partition_spec.h @@ -62,7 +62,7 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { std::span fields() const; /// \brief Get the partition type binding to the input schema. - Result> PartitionType(const Schema&) const; + Result> PartitionType(const Schema& schema) const; std::string ToString() const override; @@ -83,8 +83,8 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { /// \param source_id The id of the source field. /// \return The partition fields by source ID, or NotFound if the source field is not /// found. - Result>> GetFieldsBySourceId( - int32_t source_id) const; + using PartitionFieldRef = std::reference_wrapper; + Result> GetFieldsBySourceId(int32_t source_id) const; /// \brief Create a PartitionSpec binding to a schema. /// \param schema The schema to bind the partition spec to. @@ -125,9 +125,7 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { /// \brief Compare two partition specs for equality. bool Equals(const PartitionSpec& other) const; - using SourceIdToFieldsMap = - std::unordered_map>>; + using SourceIdToFieldsMap = std::unordered_map>; static Result InitSourceIdToFieldsMap(const PartitionSpec&); const int32_t spec_id_; From f91bc6a701fa71997328c4bd6786e44958079b8d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 11 Dec 2025 12:00:39 +0800 Subject: [PATCH 7/7] fix win ci --- src/iceberg/expression/residual_evaluator.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/iceberg/expression/residual_evaluator.cc b/src/iceberg/expression/residual_evaluator.cc index 96eaf832a..e818199ed 100644 --- a/src/iceberg/expression/residual_evaluator.cc +++ b/src/iceberg/expression/residual_evaluator.cc @@ -189,8 +189,7 @@ class ResidualVisitor : public BoundVisitor> { ICEBERG_ASSIGN_OR_RAISE(auto bound, pred->Bind(schema_, case_sensitive_)); if (bound->is_bound_predicate()) { ICEBERG_ASSIGN_OR_RAISE( - auto residual, - Predicate(internal::checked_pointer_cast(bound))); + auto residual, Predicate(std::dynamic_pointer_cast(bound))); if (residual->is_bound_predicate()) { // replace inclusive original unbound predicate return pred; @@ -249,9 +248,8 @@ Result> ResidualVisitor::Predicate( strict_projection->Bind(*partition_schema_, case_sensitive_)); if (bound_strict->is_bound_predicate()) { ICEBERG_ASSIGN_OR_RAISE( - strict_result, - BoundVisitor::Predicate( - internal::checked_pointer_cast(bound_strict))); + strict_result, BoundVisitor::Predicate( + std::dynamic_pointer_cast(bound_strict))); } else { // If the result is not a predicate, then it must be a constant like alwaysTrue // or alwaysFalse @@ -278,7 +276,7 @@ Result> ResidualVisitor::Predicate( ICEBERG_ASSIGN_OR_RAISE( inclusive_result, BoundVisitor::Predicate( - internal::checked_pointer_cast(bound_inclusive))); + std::dynamic_pointer_cast(bound_inclusive))); } else { // If the result is not a predicate, then it must be a constant like alwaysTrue // or alwaysFalse