diff --git a/src/iceberg/sort_order.cc b/src/iceberg/sort_order.cc index 48a306611..9e42b15dd 100644 --- a/src/iceberg/sort_order.cc +++ b/src/iceberg/sort_order.cc @@ -20,9 +20,15 @@ #include "iceberg/sort_order.h" #include +#include #include +#include "iceberg/result.h" +#include "iceberg/schema.h" +#include "iceberg/sort_field.h" +#include "iceberg/transform.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep +#include "iceberg/util/macros.h" namespace iceberg { @@ -31,7 +37,7 @@ SortOrder::SortOrder(int32_t order_id, std::vector fields) const std::shared_ptr& SortOrder::Unsorted() { static const std::shared_ptr unsorted = - std::make_shared(/*order_id=*/0, std::vector{}); + std::make_shared(kUnsortedOrderId, std::vector{}); return unsorted; } @@ -80,4 +86,60 @@ bool SortOrder::Equals(const SortOrder& other) const { return order_id_ == other.order_id_ && fields_ == other.fields_; } +Status SortOrder::Validate(const Schema& schema) const { + for (const auto& field : fields_) { + auto schema_field = schema.FindFieldById(field.source_id()); + if (!schema_field.has_value() || schema_field.value() == std::nullopt) { + return InvalidArgument("Cannot find source column for sort field: {}", field); + } + + const auto& source_type = schema_field.value().value().get().type(); + + if (!field.transform()->CanTransform(*source_type)) { + return InvalidArgument("Invalid source type {} for transform {}", + source_type->ToString(), field.transform()->ToString()); + } + } + return {}; +} + +Result> SortOrder::Make(const Schema& schema, int32_t sort_id, + std::vector fields) { + if (!fields.empty() && sort_id == kUnsortedOrderId) [[unlikely]] { + return InvalidArgument("{} is reserved for unsorted sort order", kUnsortedOrderId); + } + + if (fields.empty() && sort_id != kUnsortedOrderId) [[unlikely]] { + return InvalidArgument("Sort order must have at least one sort field"); + } + + for (const auto& field : fields) { + ICEBERG_ASSIGN_OR_RAISE(auto schema_field, schema.FindFieldById(field.source_id())); + if (schema_field == std::nullopt) { + return InvalidArgument("Cannot find source column for sort field: {}", field); + } + + const auto& source_type = schema_field.value().get().type(); + if (field.transform()->CanTransform(*source_type) == false) { + return InvalidArgument("Invalid source type {} for transform {}", + source_type->ToString(), field.transform()->ToString()); + } + } + + return std::make_unique(sort_id, std::move(fields)); +} + +Result> SortOrder::Make(int32_t sort_id, + std::vector fields) { + if (!fields.empty() && sort_id == kUnsortedOrderId) [[unlikely]] { + return InvalidArgument("{} is reserved for unsorted sort order", kUnsortedOrderId); + } + + if (fields.empty() && sort_id != kUnsortedOrderId) [[unlikely]] { + return InvalidArgument("Sort order must have at least one sort field"); + } + + return std::make_unique(sort_id, std::move(fields)); +} + } // namespace iceberg diff --git a/src/iceberg/sort_order.h b/src/iceberg/sort_order.h index e245a74a2..4a0b11a92 100644 --- a/src/iceberg/sort_order.h +++ b/src/iceberg/sort_order.h @@ -20,11 +20,13 @@ #pragma once #include +#include #include #include #include "iceberg/iceberg_export.h" #include "iceberg/sort_field.h" +#include "iceberg/type_fwd.h" #include "iceberg/util/formattable.h" namespace iceberg { @@ -36,6 +38,7 @@ namespace iceberg { /// applied to the data. class ICEBERG_EXPORT SortOrder : public util::Formattable { public: + static constexpr int32_t kUnsortedOrderId = 0; static constexpr int32_t kInitialSortOrderId = 1; SortOrder(int32_t order_id, std::vector fields); @@ -69,6 +72,28 @@ class ICEBERG_EXPORT SortOrder : public util::Formattable { return lhs.Equals(rhs); } + /// \brief Validates the sort order against a schema. + /// \param schema The schema to validate against. + /// \return Error status if the sort order has any invalid transform. + Status Validate(const Schema& schema) const; + + /// \brief Create a SortOrder. + /// \param schema The schema to bind the sort order to. + /// \param sort_id The sort order id. + /// \param fields The sort fields. + /// \return A Result containing the SortOrder or an error. + static Result> Make(const Schema& schema, int32_t sort_id, + std::vector fields); + + /// \brief Create a SortOrder without binding to a schema. + /// \param sort_id The sort order id. + /// \param fields The sort fields. + /// \return A Result containing the SortOrder or an error. + /// \note This method does not check whether the sort fields are valid for any schema. + /// Use IsBoundToSchema to check if the sort order is valid for a given schema. + static Result> Make(int32_t sort_id, + std::vector fields); + private: /// \brief Compare two sort orders for equality. bool Equals(const SortOrder& other) const; diff --git a/src/iceberg/test/schema_field_test.cc b/src/iceberg/test/schema_field_test.cc index bc0dbbdf3..1078aadf2 100644 --- a/src/iceberg/test/schema_field_test.cc +++ b/src/iceberg/test/schema_field_test.cc @@ -63,7 +63,7 @@ TEST(SchemaFieldTest, Equality) { iceberg::SchemaField field1(1, "foo", iceberg::int32(), false); iceberg::SchemaField field2(2, "foo", iceberg::int32(), false); iceberg::SchemaField field3(1, "bar", iceberg::int32(), false); - iceberg::SchemaField field4(1, "foo", std::make_shared(), false); + iceberg::SchemaField field4(1, "foo", iceberg::int64(), false); iceberg::SchemaField field5(1, "foo", iceberg::int32(), true); iceberg::SchemaField field6(1, "foo", iceberg::int32(), false); diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index d99e9b3c3..89a8d54b5 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -25,7 +25,6 @@ #include #include -#include "gtest/gtest.h" #include "iceberg/result.h" #include "iceberg/schema_field.h" #include "iceberg/test/matchers.h" diff --git a/src/iceberg/test/sort_order_test.cc b/src/iceberg/test/sort_order_test.cc index bb407afae..86bd52567 100644 --- a/src/iceberg/test/sort_order_test.cc +++ b/src/iceberg/test/sort_order_test.cc @@ -26,11 +26,40 @@ #include "iceberg/schema.h" #include "iceberg/sort_field.h" +#include "iceberg/test/matchers.h" #include "iceberg/transform.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep namespace iceberg { +class SortOrderMakeTest : public ::testing::Test { + protected: + void SetUp() override { + field1_ = std::make_unique(1, "x", int32(), true); + field2_ = std::make_unique(2, "y", string(), true); + field3_ = std::make_unique(3, "time", timestamp(), true); + + schema_ = std::make_unique( + std::vector{*field1_, *field2_, *field3_}, 1); + + sort_field1_ = std::make_unique( + 1, Transform::Identity(), SortDirection::kAscending, NullOrder::kFirst); + sort_field2_ = std::make_unique( + 2, Transform::Bucket(10), SortDirection::kDescending, NullOrder::kLast); + sort_field3_ = std::make_unique( + 3, Transform::Day(), SortDirection::kAscending, NullOrder::kFirst); + } + + std::unique_ptr schema_; + std::unique_ptr field1_; + std::unique_ptr field2_; + std::unique_ptr field3_; + + std::unique_ptr sort_field1_; + std::unique_ptr sort_field2_; + std::unique_ptr sort_field3_; +}; + TEST(SortOrderTest, Basics) { { SchemaField field1(5, "ts", iceberg::timestamp(), true); @@ -148,4 +177,84 @@ TEST(SortOrderTest, Satisfies) { EXPECT_FALSE(sort_order2.Satisfies(sort_order4)); } +TEST_F(SortOrderMakeTest, MakeValidSortOrder) { + ICEBERG_UNWRAP_OR_FAIL( + auto sort_order, + SortOrder::Make(*schema_, 1, std::vector{*sort_field1_, *sort_field2_})); + ASSERT_NE(sort_order, nullptr); + + EXPECT_TRUE(sort_order->is_sorted()); + ASSERT_EQ(sort_order->fields().size(), 2); + EXPECT_EQ(sort_order->fields()[0], *sort_field1_); + EXPECT_EQ(sort_order->fields()[1], *sort_field2_); +} + +TEST_F(SortOrderMakeTest, MakeInvalidSortOrderEmptyFields) { + auto sort_order = SortOrder::Make(*schema_, 1, std::vector{}); + EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(sort_order, + HasErrorMessage("Sort order must have at least one sort field")); +} + +TEST_F(SortOrderMakeTest, MakeInvalidSortOrderUnsortedId) { + auto sort_order = SortOrder::Make(*schema_, SortOrder::kUnsortedOrderId, + std::vector{*sort_field1_}); + EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(sort_order, + HasErrorMessage(std::format("{} is reserved for unsorted sort order", + SortOrder::kUnsortedOrderId))); +} + +TEST_F(SortOrderMakeTest, MakeValidUnsortedSortOrder) { + ICEBERG_UNWRAP_OR_FAIL(auto sort_order, SortOrder::Make(SortOrder::kUnsortedOrderId, + std::vector{})); + ASSERT_NE(sort_order, nullptr); + + EXPECT_TRUE(sort_order->is_unsorted()); + EXPECT_EQ(sort_order->fields().size(), 0); +} + +TEST_F(SortOrderMakeTest, MakeInvalidSortOrderNonPrimitiveField) { + auto struct_field = std::make_unique( + 4, "struct_field", + std::make_shared(std::vector{ + SchemaField::MakeRequired(41, "inner_field", iceberg::int32()), + }), + true); + + Schema schema_with_struct( + std::vector{*field1_, *field2_, *field3_, *struct_field}, 1); + + SortField sort_field_invalid(4, Transform::Identity(), SortDirection::kAscending, + NullOrder::kFirst); + + auto sort_order = SortOrder::Make( + schema_with_struct, 1, std::vector{*sort_field1_, sort_field_invalid}); + EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(sort_order, HasErrorMessage("Invalid source type")); +} + +TEST_F(SortOrderMakeTest, MakeInvalidSortOrderFieldNotInSchema) { + SortField sort_field_invalid(999, Transform::Identity(), SortDirection::kAscending, + NullOrder::kFirst); + + auto sort_order = SortOrder::Make( + *schema_, 1, std::vector{*sort_field1_, sort_field_invalid}); + EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(sort_order, HasErrorMessage("Cannot find source column for sort field")); +} + +TEST_F(SortOrderMakeTest, MakeUnboundSortOrder) { + SortField sort_field_invalid(999, Transform::Identity(), SortDirection::kAscending, + NullOrder::kFirst); + + auto sort_order = + SortOrder::Make(1, std::vector{*sort_field1_, sort_field_invalid}); + ASSERT_THAT(sort_order, IsOk()); + auto validate_status = sort_order.value()->Validate(*schema_); + EXPECT_THAT(validate_status, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(validate_status, + HasErrorMessage("Cannot find source column for sort field")); +} + } // namespace iceberg diff --git a/src/iceberg/test/transform_test.cc b/src/iceberg/test/transform_test.cc index e4352af8a..529297b27 100644 --- a/src/iceberg/test/transform_test.cc +++ b/src/iceberg/test/transform_test.cc @@ -27,6 +27,7 @@ #include #include "iceberg/expression/literal.h" +#include "iceberg/schema_field.h" #include "iceberg/test/matchers.h" #include "iceberg/test/temporal_test_helper.h" #include "iceberg/type.h" @@ -841,4 +842,76 @@ TEST(TransformSatisfiesOrderOfTest, SatisfiesOrderOf) { } } +TEST(TransformCanTransformTest, CanTransform) { + struct Case { + std::string transform_str; + std::shared_ptr source_type; + bool expected; + }; + + const std::vector cases = { + // Identity can transform all primitive types + {.transform_str = "identity", .source_type = int32(), .expected = true}, + {.transform_str = "identity", .source_type = string(), .expected = true}, + {.transform_str = "identity", .source_type = boolean(), .expected = true}, + {.transform_str = "identity", + .source_type = list(SchemaField(123, "element", int32(), false)), + .expected = false}, + + // Void can transform any type + {.transform_str = "void", .source_type = iceberg::int32(), .expected = true}, + {.transform_str = "void", + .source_type = iceberg::map(SchemaField(123, "key", iceberg::string(), false), + SchemaField(124, "value", iceberg::int32(), true)), + .expected = true}, + + // Bucket can transform specific types + {.transform_str = "bucket[16]", .source_type = iceberg::int32(), .expected = true}, + {.transform_str = "bucket[16]", .source_type = iceberg::string(), .expected = true}, + {.transform_str = "bucket[16]", + .source_type = iceberg::float32(), + .expected = false}, + + // Truncate can transform specific types + {.transform_str = "truncate[32]", + .source_type = iceberg::int32(), + .expected = true}, + {.transform_str = "truncate[32]", + .source_type = iceberg::string(), + .expected = true}, + {.transform_str = "truncate[32]", + .source_type = iceberg::boolean(), + .expected = false}, + + // Year can transform date and timestamp types + {.transform_str = "year", .source_type = iceberg::date(), .expected = true}, + {.transform_str = "year", .source_type = iceberg::timestamp(), .expected = true}, + {.transform_str = "year", .source_type = iceberg::string(), .expected = false}, + + // Month can transform date and timestamp types + {.transform_str = "month", .source_type = iceberg::date(), .expected = true}, + {.transform_str = "month", .source_type = iceberg::timestamp(), .expected = true}, + {.transform_str = "month", .source_type = iceberg::binary(), .expected = false}, + + // Day can transform date and timestamp types + {.transform_str = "day", .source_type = iceberg::date(), .expected = true}, + {.transform_str = "day", .source_type = iceberg::timestamp(), .expected = true}, + {.transform_str = "day", .source_type = iceberg::uuid(), .expected = false}, + + // Hour can transform timestamp types + {.transform_str = "hour", .source_type = iceberg::timestamp(), .expected = true}, + {.transform_str = "hour", .source_type = iceberg::timestamp_tz(), .expected = true}, + {.transform_str = "hour", .source_type = iceberg::int32(), .expected = false}, + }; + + for (const auto& c : cases) { + auto transform = TransformFromString(c.transform_str); + ASSERT_TRUE(transform.has_value()) << "Failed to parse: " << c.transform_str; + + EXPECT_EQ(transform.value()->CanTransform(*c.source_type), c.expected) + << "Unexpected result for transform: " << c.transform_str + << " and source type: " << c.source_type->ToString(); + } +} + } // namespace iceberg diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc index 4724cc18e..2e39cb06d 100644 --- a/src/iceberg/transform.cc +++ b/src/iceberg/transform.cc @@ -125,6 +125,75 @@ Result> Transform::Bind( } } +bool Transform::CanTransform(const Type& source_type) const { + switch (transform_type_) { + case TransformType::kIdentity: + if (!source_type.is_primitive()) [[unlikely]] { + return false; + } + return true; + case TransformType::kVoid: + case TransformType::kUnknown: + return true; + case TransformType::kBucket: + switch (source_type.type_id()) { + case TypeId::kInt: + case TypeId::kLong: + case TypeId::kDecimal: + case TypeId::kDate: + case TypeId::kTime: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: + case TypeId::kString: + case TypeId::kUuid: + case TypeId::kFixed: + case TypeId::kBinary: + return true; + default: + return false; + } + case TransformType::kTruncate: + switch (source_type.type_id()) { + case TypeId::kInt: + case TypeId::kLong: + case TypeId::kString: + case TypeId::kBinary: + case TypeId::kDecimal: + return true; + default: + return false; + } + case TransformType::kYear: + case TransformType::kMonth: + switch (source_type.type_id()) { + case TypeId::kDate: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: + return true; + default: + return false; + } + case TransformType::kDay: + switch (source_type.type_id()) { + case TypeId::kDate: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: + return true; + default: + return false; + } + case TransformType::kHour: + switch (source_type.type_id()) { + case TypeId::kTimestamp: + case TypeId::kTimestampTz: + return true; + default: + return false; + } + } + std::unreachable(); +} + bool Transform::PreservesOrder() const { switch (transform_type_) { case TransformType::kUnknown: diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h index 21483e0c4..4f608c467 100644 --- a/src/iceberg/transform.h +++ b/src/iceberg/transform.h @@ -150,6 +150,11 @@ class ICEBERG_EXPORT Transform : public util::Formattable { Result> Bind( const std::shared_ptr& source_type) const; + /// \brief Checks whether this function can be applied to the given Type. + /// \param source_type The source type to check. + /// \return true if this transform can be applied to the type, false otherwise + bool CanTransform(const Type& source_type) const; + /// \brief Whether the transform preserves the order of values (is monotonic). bool PreservesOrder() const;