From d22ef196bf52c1b1d970e04081d16b7537ada485 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sun, 27 Jul 2025 23:57:58 +0800 Subject: [PATCH 01/15] feat: implement Literal Transform --- src/iceberg/expression/literal.cc | 26 +++ src/iceberg/expression/literal.h | 6 + src/iceberg/manifest_reader_internal.cc | 5 - src/iceberg/transform.h | 4 + src/iceberg/transform_function.cc | 226 ++++++++++++++++++ src/iceberg/transform_function.h | 24 ++ src/iceberg/type_fwd.h | 16 +- src/iceberg/util/macros.h | 5 + test/transform_test.cc | 292 ++++++++++++++++++++++++ 9 files changed, 591 insertions(+), 13 deletions(-) diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index d053f305f..cbee663bf 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -130,8 +130,16 @@ Literal Literal::Boolean(bool value) { return {Value{value}, iceberg::boolean()} Literal Literal::Int(int32_t value) { return {Value{value}, iceberg::int32()}; } +Literal Literal::Date(int32_t value) { return {Value{value}, iceberg::date()}; } + Literal Literal::Long(int64_t value) { return {Value{value}, iceberg::int64()}; } +Literal Literal::Timestamp(int64_t value) { return {Value{value}, iceberg::timestamp()}; } + +Literal Literal::TimestampTz(int64_t value) { + return {Value{value}, iceberg::timestamp_tz()}; +} + Literal Literal::Float(float value) { return {Value{value}, iceberg::float32()}; } Literal Literal::Double(double value) { return {Value{value}, iceberg::float64()}; } @@ -208,12 +216,30 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const { return this_val <=> other_val; } + case TypeId::kDate: { + auto this_val = std::get(value_); + auto other_val = std::get(other.value_); + return this_val <=> other_val; + } + case TypeId::kLong: { auto this_val = std::get(value_); auto other_val = std::get(other.value_); return this_val <=> other_val; } + case TypeId::kTimestamp: { + auto this_val = std::get(value_); + auto other_val = std::get(other.value_); + return this_val <=> other_val; + } + + case TypeId::kTimestampTz: { + auto this_val = std::get(value_); + auto other_val = std::get(other.value_); + return this_val <=> other_val; + } + case TypeId::kFloat: { auto this_val = std::get(value_); auto other_val = std::get(other.value_); diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h index 739e20fc2..7d25e310f 100644 --- a/src/iceberg/expression/literal.h +++ b/src/iceberg/expression/literal.h @@ -63,7 +63,10 @@ class ICEBERG_EXPORT Literal { /// \brief Factory methods for primitive types static Literal Boolean(bool value); static Literal Int(int32_t value); + static Literal Date(int32_t value); static Literal Long(int64_t value); + static Literal Timestamp(int64_t value); + static Literal TimestampTz(int64_t value); static Literal Float(float value); static Literal Double(double value); static Literal String(std::string value); @@ -85,6 +88,9 @@ class ICEBERG_EXPORT Literal { /// \brief Get the literal type. const std::shared_ptr& type() const; + /// \brief Get the literal value. + const Value& value() const { return value_; } + /// \brief Converts this literal to a literal of the given type. /// /// When a predicate is bound to a concrete data column, literals are converted to match diff --git a/src/iceberg/manifest_reader_internal.cc b/src/iceberg/manifest_reader_internal.cc index 251e0d71d..d01717427 100644 --- a/src/iceberg/manifest_reader_internal.cc +++ b/src/iceberg/manifest_reader_internal.cc @@ -33,11 +33,6 @@ namespace iceberg { -#define NANOARROW_RETURN_IF_NOT_OK(status, error) \ - if (status != NANOARROW_OK) [[unlikely]] { \ - return InvalidArrowData("Nanoarrow error: {}", error.message); \ - } - #define PARSE_PRIMITIVE_FIELD(item, array_view, type) \ for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h index f09f15bba..8043c15d7 100644 --- a/src/iceberg/transform.h +++ b/src/iceberg/transform.h @@ -23,9 +23,11 @@ #include #include +#include #include #include "iceberg/arrow_c_data.h" +#include "iceberg/expression/literal.h" #include "iceberg/iceberg_export.h" #include "iceberg/result.h" #include "iceberg/type_fwd.h" @@ -172,6 +174,8 @@ class ICEBERG_EXPORT TransformFunction { TransformFunction(TransformType transform_type, std::shared_ptr source_type); /// \brief Transform an input array to a new array virtual Result Transform(const ArrowArray& data) = 0; + /// \brief Transform an input Literal to a new Literal + virtual Result> Transform(const Literal& literal) = 0; /// \brief Get the transform type TransformType transform_type() const; /// \brief Get the source type of transform function diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index 9ddf6e9f7..b802fea91 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -19,7 +19,11 @@ #include "iceberg/transform_function.h" +#include +#include + #include "iceberg/type.h" +#include "iceberg/util/murmurhash3_internal.h" namespace iceberg { @@ -30,6 +34,10 @@ Result IdentityTransform::Transform(const ArrowArray& input) { return NotImplemented("IdentityTransform::Transform"); } +Result> IdentityTransform::Transform(const Literal& literal) { + return literal; +} + Result> IdentityTransform::ResultType() const { return source_type(); } @@ -51,6 +59,57 @@ Result BucketTransform::Transform(const ArrowArray& input) { return NotImplemented("BucketTransform::Transform"); } +Result> BucketTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply bucket transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + int32_t hash_value = 0; + switch (source_type()->type_id()) { + case TypeId::kInt: + case TypeId::kDate: { + auto value = std::get(literal.value()); + MurmurHash3_x86_32(&value, sizeof(int32_t), 0, &hash_value); + break; + } + case TypeId::kLong: + case TypeId::kTime: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + auto value = std::get(literal.value()); + MurmurHash3_x86_32(&value, sizeof(int64_t), 0, &hash_value); + break; + } + case TypeId::kDecimal: + case TypeId::kUuid: { + auto value = std::get>(literal.value()); + MurmurHash3_x86_32(value.data(), sizeof(uint8_t) * 16, 0, &hash_value); + break; + } + case TypeId::kString: { + auto value = std::get(literal.value()); + MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value); + break; + } + case TypeId::kFixed: + case TypeId::kBinary: { + auto value = std::get>(literal.value()); + MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value); + break; + } + default: + std::unreachable(); + } + + // Calculate the bucket index + int32_t bucket_index = + (hash_value & std::numeric_limits::max()) % num_buckets_; + + return Literal::Int(bucket_index); +} + Result> BucketTransform::ResultType() const { return iceberg::int32(); } @@ -91,6 +150,46 @@ Result TruncateTransform::Transform(const ArrowArray& input) { return NotImplemented("TruncateTransform::Transform"); } +Result> TruncateTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply truncate transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + + switch (source_type()->type_id()) { + case TypeId::kInt: { + auto value = std::get(literal.value()); + return Literal::Int(value % width_); + } + case TypeId::kLong: { + auto value = std::get(literal.value()); + return Literal::Long(value % width_); + } + case TypeId::kDecimal: { + // TODO(zhjwpku): Handle decimal truncation logic here + return NotImplemented("Truncate for Decimal is not implemented yet"); + } + case TypeId::kString: { + auto value = std::get(literal.value()); + if (value.size() > static_cast(width_)) { + value.resize(width_); + } + return Literal::String(value); + } + case TypeId::kBinary: { + auto value = std::get>(literal.value()); + if (value.size() > static_cast(width_)) { + value.resize(width_); + } + return Literal::Binary(value); + } + default: + std::unreachable(); + } +} + Result> TruncateTransform::ResultType() const { return source_type(); } @@ -124,6 +223,34 @@ Result YearTransform::Transform(const ArrowArray& input) { return NotImplemented("YearTransform::Transform"); } +Result> YearTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply year transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + + using namespace std::chrono; + switch (source_type()->type_id()) { + case TypeId::kDate: { + auto value = std::get(literal.value()); + auto epoch = sys_days(year{1970} / January / 1); + auto ymd = year_month_day(epoch + days{value}); + return Literal::Int(static_cast(ymd.year())); + } + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + auto value = std::get(literal.value()); + // Convert milliseconds-since-epoch into a `year_month_day` object + auto ymd = year_month_day(floor(sys_time(milliseconds{value}))); + return Literal::Int(static_cast(ymd.year())); + } + default: + std::unreachable(); + } +} + Result> YearTransform::ResultType() const { return iceberg::int32(); } @@ -152,6 +279,46 @@ Result MonthTransform::Transform(const ArrowArray& input) { return NotImplemented("MonthTransform::Transform"); } +Result> MonthTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply month transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + + using namespace std::chrono; + switch (source_type()->type_id()) { + case TypeId::kDate: { + auto value = std::get(literal.value()); + auto epoch = sys_days(year{1970} / January / 1); + auto ymd = year_month_day(epoch + days{value}); + auto epoch_ymd = year_month_day(epoch); + auto delta = ymd.year() - epoch_ymd.year(); + // Calculate the month as months from 1970-01 + // Note: January is month 1, so we subtract 1 to get zero-based + // month count. + return Literal::Int(static_cast(delta.count() * 12 + + static_cast(ymd.month()) - 1)); + } + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + auto value = std::get(literal.value()); + // Convert milliseconds-since-epoch into a `year_month_day` object + auto ymd = year_month_day(floor(sys_time(milliseconds{value}))); + auto epoch_ymd = year_month_day(year{1970} / January / 1); + auto delta = ymd.year() - epoch_ymd.year(); + // Calculate the month as months from 1970-01 + // Note: January is month 1, so we subtract 1 to get zero-based + // month count. + return Literal::Int(static_cast(delta.count() * 12 + + static_cast(ymd.month()) - 1)); + } + default: + std::unreachable(); + } +} + Result> MonthTransform::ResultType() const { return iceberg::int32(); } @@ -180,6 +347,35 @@ Result DayTransform::Transform(const ArrowArray& input) { return NotImplemented("DayTransform::Transform"); } +Result> DayTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply day transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + + using namespace std::chrono; + switch (source_type()->type_id()) { + case TypeId::kDate: { + // Day is the same as the date value + return literal; + } + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + auto value = std::get(literal.value()); + // Convert milliseconds to `sys_days` (chronological days since epoch) + auto timestamp = sys_time(milliseconds{value}); + auto days_since_epoch = floor(timestamp); + + return Literal::Date( + static_cast(days_since_epoch.time_since_epoch().count())); + } + default: + std::unreachable(); + } +} + Result> DayTransform::ResultType() const { return iceberg::date(); } Result> DayTransform::Make( @@ -206,6 +402,32 @@ Result HourTransform::Transform(const ArrowArray& input) { return NotImplemented("HourTransform::Transform"); } +Result> HourTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply hour transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + + using namespace std::chrono; + switch (source_type()->type_id()) { + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + auto value = std::get(literal.value()); + // Create a `sys_time` object from the milliseconds value + auto timestamp = sys_time(milliseconds{value}); + + // Convert the time since epoch directly into hours + auto hours_since_epoch = duration_cast(timestamp.time_since_epoch()).count(); + + return Literal::Int(static_cast(hours_since_epoch)); + } + default: + std::unreachable(); + } +} + Result> HourTransform::ResultType() const { return iceberg::int32(); } @@ -233,6 +455,10 @@ Result VoidTransform::Transform(const ArrowArray& input) { return NotImplemented("VoidTransform::Transform"); } +Result> VoidTransform::Transform(const Literal& literal) { + return std::nullopt; +} + Result> VoidTransform::ResultType() const { return source_type(); } Result> VoidTransform::Make( diff --git a/src/iceberg/transform_function.h b/src/iceberg/transform_function.h index 7fffd61f0..262f47124 100644 --- a/src/iceberg/transform_function.h +++ b/src/iceberg/transform_function.h @@ -33,6 +33,9 @@ class IdentityTransform : public TransformFunction { /// \brief Returns the input array without modification. Result Transform(const ArrowArray& input) override; + /// \brief Returns the same Literal as the input. + Result> Transform(const Literal& literal) override; + /// \brief Returns the same type as the source type if it is valid. Result> ResultType() const override; @@ -53,6 +56,9 @@ class BucketTransform : public TransformFunction { /// \brief Applies the bucket hash function to the input array. Result Transform(const ArrowArray& input) override; + /// \brief Applies the bucket hash function to the input Literal. + Result> Transform(const Literal& literal) override; + /// \brief Returns INT32 as the output type. Result> ResultType() const override; @@ -77,6 +83,9 @@ class TruncateTransform : public TransformFunction { /// \brief Truncates values in the input array to the specified width. Result Transform(const ArrowArray& input) override; + /// \brief Truncates the input Literal to the specified width. + Result> Transform(const Literal& literal) override; + /// \brief Returns the same type as source_type. Result> ResultType() const override; @@ -100,6 +109,9 @@ class YearTransform : public TransformFunction { /// \brief Extracts the year from each timestamp in the input array. Result Transform(const ArrowArray& input) override; + /// \brief Extracts the year from a timestamp literal. + Result> Transform(const Literal& literal) override; + /// \brief Returns INT32 as the output type. Result> ResultType() const override; @@ -119,6 +131,9 @@ class MonthTransform : public TransformFunction { /// \brief Extracts the month (1-12) from each timestamp in the input array. Result Transform(const ArrowArray& input) override; + /// \brief Extracts the month (1-12) from a timestamp literal. + Result> Transform(const Literal& literal) override; + /// \brief Returns INT32 as the output type. Result> ResultType() const override; @@ -138,6 +153,9 @@ class DayTransform : public TransformFunction { /// \brief Extracts the day (1-31) from each timestamp in the input array. Result Transform(const ArrowArray& input) override; + /// \brief Extracts the day (1-31) from a timestamp literal. + Result> Transform(const Literal& literal) override; + /// \brief Returns INT32 as the output type. Result> ResultType() const override; @@ -157,6 +175,9 @@ class HourTransform : public TransformFunction { /// \brief Extracts the hour (0-23) from each timestamp in the input array. Result Transform(const ArrowArray& input) override; + /// \brief Extracts the hour (0-23) from a timestamp literal. + Result> Transform(const Literal& literal) override; + /// \brief Returns INT32 as the output type. Result> ResultType() const override; @@ -176,6 +197,9 @@ class VoidTransform : public TransformFunction { /// \brief Returns an all-null array of the same length as the input. Result Transform(const ArrowArray& input) override; + /// \brief Returns a null literal. + Result> Transform(const Literal& literal) override; + /// \brief Returns null type or a sentinel type indicating void. Result> ResultType() const override; diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 83574eec0..0135422c1 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -120,6 +120,14 @@ class FileScanTask; class TableScan; class TableScanBuilder; +struct DataFile; +struct ManifestEntry; +struct ManifestFile; +struct ManifestList; + +class ManifestReader; +class ManifestListReader; + /// ---------------------------------------------------------------------------- /// TODO: Forward declarations below are not added yet. /// ---------------------------------------------------------------------------- @@ -131,12 +139,4 @@ class UpdateRequirement; class AppendFiles; -struct DataFile; -struct ManifestEntry; -struct ManifestFile; -struct ManifestList; - -class ManifestReader; -class ManifestListReader; - } // namespace iceberg diff --git a/src/iceberg/util/macros.h b/src/iceberg/util/macros.h index 4d687bf51..5c8770fee 100644 --- a/src/iceberg/util/macros.h +++ b/src/iceberg/util/macros.h @@ -36,3 +36,8 @@ #define ICEBERG_ASSIGN_OR_RAISE(lhs, rexpr) \ ICEBERG_ASSIGN_OR_RAISE_IMPL(ICEBERG_ASSIGN_OR_RAISE_NAME(result_, __COUNTER__), lhs, \ rexpr) + +#define NANOARROW_RETURN_IF_NOT_OK(status, error) \ + if (status != NANOARROW_OK) [[unlikely]] { \ + return InvalidArrowData("Nanoarrow error: {}", error.message); \ + } diff --git a/test/transform_test.cc b/test/transform_test.cc index 33149d14d..3ee178f8a 100644 --- a/test/transform_test.cc +++ b/test/transform_test.cc @@ -193,4 +193,296 @@ TEST(TransformResultTypeTest, NegativeCases) { } } +TEST(TransformFunctionTransformTest, IdentityTransform) { + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::boolean(), + .source = Literal::Boolean(true), + .expected = Literal::Boolean(true)}, + {.source_type = iceberg::int32(), + .source = Literal::Int(42), + .expected = Literal::Int(42)}, + {.source_type = iceberg::int32(), + .source = Literal::Date(30000), + .expected = Literal::Date(30000)}, + {.source_type = iceberg::int64(), + .source = Literal::Long(1234567890), + .expected = Literal::Long(1234567890)}, + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000), + .expected = Literal::Timestamp(1622547800000)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000), + .expected = Literal::TimestampTz(1622547800000)}, + {.source_type = iceberg::float32(), + .source = Literal::Float(3.14), + .expected = Literal::Float(3.14)}, + {.source_type = iceberg::float64(), + .source = Literal::Double(2.71828), + .expected = Literal::Double(2.71828)}, + {.source_type = iceberg::string(), + .source = Literal::String("Hello, World!"), + .expected = Literal::String("Hello, World!")}, + {.source_type = iceberg::binary(), + .source = Literal::Binary({0x01, 0x02, 0x03}), + .expected = Literal::Binary({0x01, 0x02, 0x03})}, + }; + + for (const auto& c : cases) { + auto transform = Transform::Identity(); + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind identity transform"; + + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformFunctionTransformTest, BucketTransform) { + constexpr int32_t num_buckets = 4; + auto transform = Transform::Bucket(num_buckets); + + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::int32(), + .source = Literal::Int(42), + .expected = Literal::Int(3)}, + {.source_type = iceberg::date(), + .source = Literal::Date(30000), + .expected = Literal::Int(2)}, + {.source_type = iceberg::int64(), + .source = Literal::Long(1234567890), + .expected = Literal::Int(3)}, + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000), + .expected = Literal::Int(1)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000), + .expected = Literal::Int(1)}, + {.source_type = iceberg::string(), + .source = Literal::String("test"), + .expected = Literal::Int(3)}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind bucket transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformFunctionTransformTest, TruncateTransform) { + constexpr int32_t width = 5; + auto transform = Transform::Truncate(width); + + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::int32(), + .source = Literal::Int(123456), + .expected = Literal::Int(1)}, + {.source_type = iceberg::string(), + .source = Literal::String("Hello, World!"), + .expected = Literal::String("Hello")}, + {.source_type = iceberg::binary(), + .source = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05, 0x06}), + .expected = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05})}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind truncate transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformFunctionTransformTest, YearTransform) { + auto transform = Transform::Year(); + + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000), + .expected = Literal::Int(2021)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000), + .expected = Literal::Int(2021)}, + {.source_type = iceberg::date(), + .source = Literal::Date(30000), + .expected = Literal::Int(2052)}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind year transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformFunctionTransformTest, MonthTransform) { + auto transform = Transform::Month(); + + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000), + .expected = Literal::Int(617)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000), + .expected = Literal::Int(617)}, + {.source_type = iceberg::date(), + .source = Literal::Date(30000), + .expected = Literal::Int(985)}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind month transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformFunctionTransformTest, DayTransform) { + auto transform = Transform::Day(); + + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000), + .expected = Literal::Date(18779)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000), + .expected = Literal::Date(18779)}, + {.source_type = iceberg::date(), + .source = Literal::Date(30000), + .expected = Literal::Date(30000)}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind day transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformFunctionTransformTest, HourTransform) { + auto transform = Transform::Hour(); + + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000), + .expected = Literal::Int(450707)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000), + .expected = Literal::Int(450707)}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind hour transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformFunctionTransformTest, VoidTransform) { + auto transform = Transform::Void(); + + struct Case { + std::shared_ptr source_type; + Literal source; + }; + + const std::vector cases = { + {.source_type = iceberg::boolean(), .source = Literal::Boolean(true)}, + {.source_type = iceberg::int32(), .source = Literal::Int(42)}, + {.source_type = iceberg::int32(), .source = Literal::Date(30000)}, + {.source_type = iceberg::int64(), .source = Literal::Long(1234567890)}, + {.source_type = iceberg::timestamp(), .source = Literal::Timestamp(1622547800000)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000)}, + {.source_type = iceberg::float32(), .source = Literal::Float(3.14)}, + {.source_type = iceberg::float64(), .source = Literal::Double(2.71828)}, + {.source_type = iceberg::string(), .source = Literal::String("Hello, World!")}, + {.source_type = iceberg::binary(), .source = Literal::Binary({0x01, 0x02, 0x03})}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind void transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result == std::nullopt) + << "Expected void transform to return no result for source: " + << c.source.ToString(); + } +} + } // namespace iceberg From 27493c8ae632f53afdaa2f6bf8e0c3207a3cdc0d Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Mon, 28 Jul 2025 00:07:00 +0800 Subject: [PATCH 02/15] fix: inaccurate comments --- src/iceberg/transform_function.cc | 1 + src/iceberg/transform_function.h | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index b802fea91..e30ca5336 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -21,6 +21,7 @@ #include #include +#include #include "iceberg/type.h" #include "iceberg/util/murmurhash3_internal.h" diff --git a/src/iceberg/transform_function.h b/src/iceberg/transform_function.h index 262f47124..c3e7e282b 100644 --- a/src/iceberg/transform_function.h +++ b/src/iceberg/transform_function.h @@ -106,10 +106,11 @@ class YearTransform : public TransformFunction { /// \param source_type Must be a timestamp type. explicit YearTransform(std::shared_ptr const& source_type); - /// \brief Extracts the year from each timestamp in the input array. + /// \brief Extracts the year from each date timestamp in the input array, as years from + /// 1970. Result Transform(const ArrowArray& input) override; - /// \brief Extracts the year from a timestamp literal. + /// \brief Extract a date or timestamp year, as years from 1970. Result> Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. @@ -128,10 +129,11 @@ class MonthTransform : public TransformFunction { /// \param source_type Must be a timestamp type. explicit MonthTransform(std::shared_ptr const& source_type); - /// \brief Extracts the month (1-12) from each timestamp in the input array. + /// \brief Extracts the month from each date or timestamp in the input array, as months + /// from 1970-01-01. Result Transform(const ArrowArray& input) override; - /// \brief Extracts the month (1-12) from a timestamp literal. + /// \brief Extract a date or timestamp month, as months from 1970-01-01. Result> Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. @@ -150,10 +152,11 @@ class DayTransform : public TransformFunction { /// \param source_type Must be a timestamp type. explicit DayTransform(std::shared_ptr const& source_type); - /// \brief Extracts the day (1-31) from each timestamp in the input array. + /// \brief Extracts the day from each date or timestamp in the input array, as days from + /// 1970-01-01. Result Transform(const ArrowArray& input) override; - /// \brief Extracts the day (1-31) from a timestamp literal. + /// \brief Extract a date or timestamp day, as days from 1970-01-01. Result> Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. @@ -172,10 +175,11 @@ class HourTransform : public TransformFunction { /// \param source_type Must be a timestamp type. explicit HourTransform(std::shared_ptr const& source_type); - /// \brief Extracts the hour (0-23) from each timestamp in the input array. + /// \brief Extracts the hour from each timestamp in the input array, as hours from + /// 1970-01-01 00:00:00. Result Transform(const ArrowArray& input) override; - /// \brief Extracts the hour (0-23) from a timestamp literal. + /// \brief Extract a timestamp hour, as hours from 1970-01-01 00:00:00. Result> Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. From 935dcf415beee50216c0e5226049da2f0928cc3a Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Mon, 28 Jul 2025 23:06:01 +0800 Subject: [PATCH 03/15] fix: timestamps should be in microseconds also fix some lint warnings --- src/iceberg/transform_function.cc | 24 +++++++++--------- test/transform_test.cc | 41 ++++++++++++++++--------------- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index e30ca5336..aee98b9cf 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -232,7 +232,7 @@ Result> YearTransform::Transform(const Literal& literal) literal.ToString(), source_type()->ToString()); } - using namespace std::chrono; + using namespace std::chrono; // NOLINT switch (source_type()->type_id()) { case TypeId::kDate: { auto value = std::get(literal.value()); @@ -243,8 +243,8 @@ Result> YearTransform::Transform(const Literal& literal) case TypeId::kTimestamp: case TypeId::kTimestampTz: { auto value = std::get(literal.value()); - // Convert milliseconds-since-epoch into a `year_month_day` object - auto ymd = year_month_day(floor(sys_time(milliseconds{value}))); + // Convert microseconds-since-epoch into a `year_month_day` object + auto ymd = year_month_day(floor(sys_time(microseconds{value}))); return Literal::Int(static_cast(ymd.year())); } default: @@ -288,7 +288,7 @@ Result> MonthTransform::Transform(const Literal& literal) literal.ToString(), source_type()->ToString()); } - using namespace std::chrono; + using namespace std::chrono; // NOLINT switch (source_type()->type_id()) { case TypeId::kDate: { auto value = std::get(literal.value()); @@ -305,8 +305,8 @@ Result> MonthTransform::Transform(const Literal& literal) case TypeId::kTimestamp: case TypeId::kTimestampTz: { auto value = std::get(literal.value()); - // Convert milliseconds-since-epoch into a `year_month_day` object - auto ymd = year_month_day(floor(sys_time(milliseconds{value}))); + // Convert microseconds-since-epoch into a `year_month_day` object + auto ymd = year_month_day(floor(sys_time(microseconds{value}))); auto epoch_ymd = year_month_day(year{1970} / January / 1); auto delta = ymd.year() - epoch_ymd.year(); // Calculate the month as months from 1970-01 @@ -356,7 +356,7 @@ Result> DayTransform::Transform(const Literal& literal) { literal.ToString(), source_type()->ToString()); } - using namespace std::chrono; + using namespace std::chrono; // NOLINT switch (source_type()->type_id()) { case TypeId::kDate: { // Day is the same as the date value @@ -365,8 +365,8 @@ Result> DayTransform::Transform(const Literal& literal) { case TypeId::kTimestamp: case TypeId::kTimestampTz: { auto value = std::get(literal.value()); - // Convert milliseconds to `sys_days` (chronological days since epoch) - auto timestamp = sys_time(milliseconds{value}); + // Convert microseconds to `sys_days` (chronological days since epoch) + auto timestamp = sys_time(microseconds{value}); auto days_since_epoch = floor(timestamp); return Literal::Date( @@ -411,13 +411,13 @@ Result> HourTransform::Transform(const Literal& literal) literal.ToString(), source_type()->ToString()); } - using namespace std::chrono; + using namespace std::chrono; // NOLINT switch (source_type()->type_id()) { case TypeId::kTimestamp: case TypeId::kTimestampTz: { auto value = std::get(literal.value()); - // Create a `sys_time` object from the milliseconds value - auto timestamp = sys_time(milliseconds{value}); + // Create a `sys_time` object from the microseconds value + auto timestamp = sys_time(microseconds{value}); // Convert the time since epoch directly into hours auto hours_since_epoch = duration_cast(timestamp.time_since_epoch()).count(); diff --git a/test/transform_test.cc b/test/transform_test.cc index 3ee178f8a..21de2d408 100644 --- a/test/transform_test.cc +++ b/test/transform_test.cc @@ -214,17 +214,17 @@ TEST(TransformFunctionTransformTest, IdentityTransform) { .source = Literal::Long(1234567890), .expected = Literal::Long(1234567890)}, {.source_type = iceberg::timestamp(), - .source = Literal::Timestamp(1622547800000), - .expected = Literal::Timestamp(1622547800000)}, + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Timestamp(1622547800000000)}, {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000), - .expected = Literal::TimestampTz(1622547800000)}, + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::TimestampTz(1622547800000000)}, {.source_type = iceberg::float32(), .source = Literal::Float(3.14), .expected = Literal::Float(3.14)}, {.source_type = iceberg::float64(), - .source = Literal::Double(2.71828), - .expected = Literal::Double(2.71828)}, + .source = Literal::Double(1.23e-5), + .expected = Literal::Double(1.23e-5)}, {.source_type = iceberg::string(), .source = Literal::String("Hello, World!"), .expected = Literal::String("Hello, World!")}, @@ -268,10 +268,10 @@ TEST(TransformFunctionTransformTest, BucketTransform) { .source = Literal::Long(1234567890), .expected = Literal::Int(3)}, {.source_type = iceberg::timestamp(), - .source = Literal::Timestamp(1622547800000), + .source = Literal::Timestamp(1622547800000000), .expected = Literal::Int(1)}, {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000), + .source = Literal::TimestampTz(1622547800000000), .expected = Literal::Int(1)}, {.source_type = iceberg::string(), .source = Literal::String("test"), @@ -335,10 +335,10 @@ TEST(TransformFunctionTransformTest, YearTransform) { const std::vector cases = { {.source_type = iceberg::timestamp(), - .source = Literal::Timestamp(1622547800000), + .source = Literal::Timestamp(1622547800000000), .expected = Literal::Int(2021)}, {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000), + .source = Literal::TimestampTz(1622547800000000), .expected = Literal::Int(2021)}, {.source_type = iceberg::date(), .source = Literal::Date(30000), @@ -368,10 +368,10 @@ TEST(TransformFunctionTransformTest, MonthTransform) { const std::vector cases = { {.source_type = iceberg::timestamp(), - .source = Literal::Timestamp(1622547800000), + .source = Literal::Timestamp(1622547800000000), .expected = Literal::Int(617)}, {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000), + .source = Literal::TimestampTz(1622547800000000), .expected = Literal::Int(617)}, {.source_type = iceberg::date(), .source = Literal::Date(30000), @@ -401,10 +401,10 @@ TEST(TransformFunctionTransformTest, DayTransform) { const std::vector cases = { {.source_type = iceberg::timestamp(), - .source = Literal::Timestamp(1622547800000), + .source = Literal::Timestamp(1622547800000000), .expected = Literal::Date(18779)}, {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000), + .source = Literal::TimestampTz(1622547800000000), .expected = Literal::Date(18779)}, {.source_type = iceberg::date(), .source = Literal::Date(30000), @@ -434,10 +434,10 @@ TEST(TransformFunctionTransformTest, HourTransform) { const std::vector cases = { {.source_type = iceberg::timestamp(), - .source = Literal::Timestamp(1622547800000), + .source = Literal::Timestamp(1622547800000000), .expected = Literal::Int(450707)}, {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000), + .source = Literal::TimestampTz(1622547800000000), .expected = Literal::Int(450707)}, }; @@ -466,11 +466,12 @@ TEST(TransformFunctionTransformTest, VoidTransform) { {.source_type = iceberg::int32(), .source = Literal::Int(42)}, {.source_type = iceberg::int32(), .source = Literal::Date(30000)}, {.source_type = iceberg::int64(), .source = Literal::Long(1234567890)}, - {.source_type = iceberg::timestamp(), .source = Literal::Timestamp(1622547800000)}, + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000)}, {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000)}, + .source = Literal::TimestampTz(1622547800000000)}, {.source_type = iceberg::float32(), .source = Literal::Float(3.14)}, - {.source_type = iceberg::float64(), .source = Literal::Double(2.71828)}, + {.source_type = iceberg::float64(), .source = Literal::Double(1.23e-5)}, {.source_type = iceberg::string(), .source = Literal::String("Hello, World!")}, {.source_type = iceberg::binary(), .source = Literal::Binary({0x01, 0x02, 0x03})}, }; @@ -479,7 +480,7 @@ TEST(TransformFunctionTransformTest, VoidTransform) { auto transformPtr = transform->Bind(c.source_type); ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind void transform"; auto result = transformPtr.value()->Transform(c.source); - ASSERT_TRUE(result == std::nullopt) + ASSERT_EQ(std::nullopt, result) << "Expected void transform to return no result for source: " << c.source.ToString(); } From 0499ec6612820cbfef13ac454fdbca4bbe84cf9b Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sun, 3 Aug 2025 18:41:37 +0800 Subject: [PATCH 04/15] fix: review comments --- src/iceberg/expression/literal.cc | 23 +++++------------------ src/iceberg/expression/literal.h | 3 ++- src/iceberg/manifest_reader_internal.cc | 5 +++++ src/iceberg/util/macros.h | 5 ----- 4 files changed, 12 insertions(+), 24 deletions(-) diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index cbee663bf..eb2a4a3fc 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -134,6 +134,8 @@ Literal Literal::Date(int32_t value) { return {Value{value}, iceberg::date()}; } Literal Literal::Long(int64_t value) { return {Value{value}, iceberg::int64()}; } +Literal Literal::Time(int64_t value) { return {Value{value}, iceberg::time()}; } + Literal Literal::Timestamp(int64_t value) { return {Value{value}, iceberg::timestamp()}; } Literal Literal::TimestampTz(int64_t value) { @@ -210,30 +212,15 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const { return this_val ? std::partial_ordering::greater : std::partial_ordering::less; } - case TypeId::kInt: { - auto this_val = std::get(value_); - auto other_val = std::get(other.value_); - return this_val <=> other_val; - } - + case TypeId::kInt: case TypeId::kDate: { auto this_val = std::get(value_); auto other_val = std::get(other.value_); return this_val <=> other_val; } - case TypeId::kLong: { - auto this_val = std::get(value_); - auto other_val = std::get(other.value_); - return this_val <=> other_val; - } - - case TypeId::kTimestamp: { - auto this_val = std::get(value_); - auto other_val = std::get(other.value_); - return this_val <=> other_val; - } - + case TypeId::kLong: + case TypeId::kTimestamp: case TypeId::kTimestampTz: { auto this_val = std::get(value_); auto other_val = std::get(other.value_); diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h index 7d25e310f..6145992ab 100644 --- a/src/iceberg/expression/literal.h +++ b/src/iceberg/expression/literal.h @@ -49,6 +49,7 @@ class ICEBERG_EXPORT Literal { std::strong_ordering operator<=>(const AboveMax&) const = default; }; + public: using Value = std::variant, // for uuid and decimal BelowMin, AboveMax>; - public: /// \brief Factory methods for primitive types static Literal Boolean(bool value); static Literal Int(int32_t value); static Literal Date(int32_t value); static Literal Long(int64_t value); + static Literal Time(int64_t value); static Literal Timestamp(int64_t value); static Literal TimestampTz(int64_t value); static Literal Float(float value); diff --git a/src/iceberg/manifest_reader_internal.cc b/src/iceberg/manifest_reader_internal.cc index d01717427..251e0d71d 100644 --- a/src/iceberg/manifest_reader_internal.cc +++ b/src/iceberg/manifest_reader_internal.cc @@ -33,6 +33,11 @@ namespace iceberg { +#define NANOARROW_RETURN_IF_NOT_OK(status, error) \ + if (status != NANOARROW_OK) [[unlikely]] { \ + return InvalidArrowData("Nanoarrow error: {}", error.message); \ + } + #define PARSE_PRIMITIVE_FIELD(item, array_view, type) \ for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ diff --git a/src/iceberg/util/macros.h b/src/iceberg/util/macros.h index 5c8770fee..4d687bf51 100644 --- a/src/iceberg/util/macros.h +++ b/src/iceberg/util/macros.h @@ -36,8 +36,3 @@ #define ICEBERG_ASSIGN_OR_RAISE(lhs, rexpr) \ ICEBERG_ASSIGN_OR_RAISE_IMPL(ICEBERG_ASSIGN_OR_RAISE_NAME(result_, __COUNTER__), lhs, \ rexpr) - -#define NANOARROW_RETURN_IF_NOT_OK(status, error) \ - if (status != NANOARROW_OK) [[unlikely]] { \ - return InvalidArrowData("Nanoarrow error: {}", error.message); \ - } From 2f380fcfacf1bb60c4cc1059242e47f3dbf90d37 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sun, 3 Aug 2025 19:37:39 +0800 Subject: [PATCH 05/15] use std::visit + static_assert --- src/iceberg/expression/literal.h | 4 +-- src/iceberg/transform_function.cc | 59 +++++++++++++------------------ 2 files changed, 25 insertions(+), 38 deletions(-) diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h index 6145992ab..6161ad4d7 100644 --- a/src/iceberg/expression/literal.h +++ b/src/iceberg/expression/literal.h @@ -32,7 +32,7 @@ namespace iceberg { /// \brief Literal is a literal value that is associated with a primitive type. class ICEBERG_EXPORT Literal { - private: + public: /// \brief Sentinel value to indicate that the literal value is below the valid range /// of a specific primitive type. It can happen when casting a literal to a narrower /// primitive type. @@ -48,8 +48,6 @@ class ICEBERG_EXPORT Literal { bool operator==(const AboveMax&) const = default; std::strong_ordering operator<=>(const AboveMax&) const = default; }; - - public: using Value = std::variant #include +#include #include #include "iceberg/type.h" @@ -68,41 +69,29 @@ Result> BucketTransform::Transform(const Literal& literal literal.ToString(), source_type()->ToString()); } int32_t hash_value = 0; - switch (source_type()->type_id()) { - case TypeId::kInt: - case TypeId::kDate: { - auto value = std::get(literal.value()); - MurmurHash3_x86_32(&value, sizeof(int32_t), 0, &hash_value); - break; - } - case TypeId::kLong: - case TypeId::kTime: - case TypeId::kTimestamp: - case TypeId::kTimestampTz: { - auto value = std::get(literal.value()); - MurmurHash3_x86_32(&value, sizeof(int64_t), 0, &hash_value); - break; - } - case TypeId::kDecimal: - case TypeId::kUuid: { - auto value = std::get>(literal.value()); - MurmurHash3_x86_32(value.data(), sizeof(uint8_t) * 16, 0, &hash_value); - break; - } - case TypeId::kString: { - auto value = std::get(literal.value()); - MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value); - break; - } - case TypeId::kFixed: - case TypeId::kBinary: { - auto value = std::get>(literal.value()); - MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value); - break; - } - default: - std::unreachable(); - } + std::visit( + [&](auto&& value) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + MurmurHash3_x86_32(&value, sizeof(int32_t), 0, &hash_value); + } else if constexpr (std::is_same_v) { + MurmurHash3_x86_32(&value, sizeof(int64_t), 0, &hash_value); + } else if constexpr (std::is_same_v>) { + MurmurHash3_x86_32(value.data(), sizeof(uint8_t) * 16, 0, &hash_value); + } else if constexpr (std::is_same_v) { + MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value); + } else if constexpr (std::is_same_v>) { + MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value); + } else if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + std::unreachable(); + } else { + static_assert(false, "Unhandled type in BucketTransform::Transform"); + } + }, + literal.value()); // Calculate the bucket index int32_t bucket_index = From a6d0086b89dcc142080a1bf0dcb9671a00411369 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Mon, 4 Aug 2025 23:30:49 +0800 Subject: [PATCH 06/15] fix: comment reviews --- src/iceberg/transform.h | 2 -- src/iceberg/transform_function.cc | 45 +++++++---------------------- src/iceberg/transform_function.h | 28 ------------------ test/transform_test.cc | 47 ++++++++++++++++++++----------- 4 files changed, 40 insertions(+), 82 deletions(-) diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h index 8043c15d7..3b5a725af 100644 --- a/src/iceberg/transform.h +++ b/src/iceberg/transform.h @@ -172,8 +172,6 @@ class ICEBERG_EXPORT TransformFunction { public: virtual ~TransformFunction() = default; TransformFunction(TransformType transform_type, std::shared_ptr source_type); - /// \brief Transform an input array to a new array - virtual Result Transform(const ArrowArray& data) = 0; /// \brief Transform an input Literal to a new Literal virtual Result> Transform(const Literal& literal) = 0; /// \brief Get the transform type diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index f97eaa735..780408593 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -32,10 +32,6 @@ namespace iceberg { IdentityTransform::IdentityTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kIdentity, source_type) {} -Result IdentityTransform::Transform(const ArrowArray& input) { - return NotImplemented("IdentityTransform::Transform"); -} - Result> IdentityTransform::Transform(const Literal& literal) { return literal; } @@ -57,10 +53,6 @@ BucketTransform::BucketTransform(std::shared_ptr const& source_type, int32_t num_buckets) : TransformFunction(TransformType::kBucket, source_type), num_buckets_(num_buckets) {} -Result BucketTransform::Transform(const ArrowArray& input) { - return NotImplemented("BucketTransform::Transform"); -} - Result> BucketTransform::Transform(const Literal& literal) { assert(literal.type() == source_type()); if (literal.IsBelowMin() || literal.IsAboveMax()) { @@ -136,10 +128,6 @@ TruncateTransform::TruncateTransform(std::shared_ptr const& source_type, int32_t width) : TransformFunction(TransformType::kTruncate, source_type), width_(width) {} -Result TruncateTransform::Transform(const ArrowArray& input) { - return NotImplemented("TruncateTransform::Transform"); -} - Result> TruncateTransform::Transform(const Literal& literal) { assert(literal.type() == source_type()); if (literal.IsBelowMin() || literal.IsAboveMax()) { @@ -151,11 +139,11 @@ Result> TruncateTransform::Transform(const Literal& liter switch (source_type()->type_id()) { case TypeId::kInt: { auto value = std::get(literal.value()); - return Literal::Int(value % width_); + return Literal::Int(value - (((value % width_) + width_) % width_)); } case TypeId::kLong: { auto value = std::get(literal.value()); - return Literal::Long(value % width_); + return Literal::Long(value - (((value % width_) + width_) % width_)); } case TypeId::kDecimal: { // TODO(zhjwpku): Handle decimal truncation logic here @@ -164,8 +152,15 @@ Result> TruncateTransform::Transform(const Literal& liter case TypeId::kString: { auto value = std::get(literal.value()); if (value.size() > static_cast(width_)) { - value.resize(width_); + size_t safe_point = width_; + while (safe_point > 0 && (value[safe_point] & 0xC0) == 0x80) { + // Find the last valid UTF-8 character boundary before or at width_ + safe_point--; + } + // Resize the string to the safe point + value.resize(safe_point); } + return Literal::String(value); } case TypeId::kBinary: { @@ -209,10 +204,6 @@ Result> TruncateTransform::Make( YearTransform::YearTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kTruncate, source_type) {} -Result YearTransform::Transform(const ArrowArray& input) { - return NotImplemented("YearTransform::Transform"); -} - Result> YearTransform::Transform(const Literal& literal) { assert(literal.type() == source_type()); if (literal.IsBelowMin() || literal.IsAboveMax()) { @@ -265,10 +256,6 @@ Result> YearTransform::Make( MonthTransform::MonthTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kMonth, source_type) {} -Result MonthTransform::Transform(const ArrowArray& input) { - return NotImplemented("MonthTransform::Transform"); -} - Result> MonthTransform::Transform(const Literal& literal) { assert(literal.type() == source_type()); if (literal.IsBelowMin() || literal.IsAboveMax()) { @@ -333,10 +320,6 @@ Result> MonthTransform::Make( DayTransform::DayTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kDay, source_type) {} -Result DayTransform::Transform(const ArrowArray& input) { - return NotImplemented("DayTransform::Transform"); -} - Result> DayTransform::Transform(const Literal& literal) { assert(literal.type() == source_type()); if (literal.IsBelowMin() || literal.IsAboveMax()) { @@ -388,10 +371,6 @@ Result> DayTransform::Make( HourTransform::HourTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kHour, source_type) {} -Result HourTransform::Transform(const ArrowArray& input) { - return NotImplemented("HourTransform::Transform"); -} - Result> HourTransform::Transform(const Literal& literal) { assert(literal.type() == source_type()); if (literal.IsBelowMin() || literal.IsAboveMax()) { @@ -441,10 +420,6 @@ Result> HourTransform::Make( VoidTransform::VoidTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kVoid, source_type) {} -Result VoidTransform::Transform(const ArrowArray& input) { - return NotImplemented("VoidTransform::Transform"); -} - Result> VoidTransform::Transform(const Literal& literal) { return std::nullopt; } diff --git a/src/iceberg/transform_function.h b/src/iceberg/transform_function.h index c3e7e282b..8567083fa 100644 --- a/src/iceberg/transform_function.h +++ b/src/iceberg/transform_function.h @@ -30,9 +30,6 @@ class IdentityTransform : public TransformFunction { /// \param source_type Type of the input data. explicit IdentityTransform(std::shared_ptr const& source_type); - /// \brief Returns the input array without modification. - Result Transform(const ArrowArray& input) override; - /// \brief Returns the same Literal as the input. Result> Transform(const Literal& literal) override; @@ -53,9 +50,6 @@ class BucketTransform : public TransformFunction { /// \param num_buckets Number of buckets to hash into. BucketTransform(std::shared_ptr const& source_type, int32_t num_buckets); - /// \brief Applies the bucket hash function to the input array. - Result Transform(const ArrowArray& input) override; - /// \brief Applies the bucket hash function to the input Literal. Result> Transform(const Literal& literal) override; @@ -80,9 +74,6 @@ class TruncateTransform : public TransformFunction { /// \param width The width to truncate to (e.g., for strings or numbers). TruncateTransform(std::shared_ptr const& source_type, int32_t width); - /// \brief Truncates values in the input array to the specified width. - Result Transform(const ArrowArray& input) override; - /// \brief Truncates the input Literal to the specified width. Result> Transform(const Literal& literal) override; @@ -106,10 +97,6 @@ class YearTransform : public TransformFunction { /// \param source_type Must be a timestamp type. explicit YearTransform(std::shared_ptr const& source_type); - /// \brief Extracts the year from each date timestamp in the input array, as years from - /// 1970. - Result Transform(const ArrowArray& input) override; - /// \brief Extract a date or timestamp year, as years from 1970. Result> Transform(const Literal& literal) override; @@ -129,10 +116,6 @@ class MonthTransform : public TransformFunction { /// \param source_type Must be a timestamp type. explicit MonthTransform(std::shared_ptr const& source_type); - /// \brief Extracts the month from each date or timestamp in the input array, as months - /// from 1970-01-01. - Result Transform(const ArrowArray& input) override; - /// \brief Extract a date or timestamp month, as months from 1970-01-01. Result> Transform(const Literal& literal) override; @@ -152,10 +135,6 @@ class DayTransform : public TransformFunction { /// \param source_type Must be a timestamp type. explicit DayTransform(std::shared_ptr const& source_type); - /// \brief Extracts the day from each date or timestamp in the input array, as days from - /// 1970-01-01. - Result Transform(const ArrowArray& input) override; - /// \brief Extract a date or timestamp day, as days from 1970-01-01. Result> Transform(const Literal& literal) override; @@ -175,10 +154,6 @@ class HourTransform : public TransformFunction { /// \param source_type Must be a timestamp type. explicit HourTransform(std::shared_ptr const& source_type); - /// \brief Extracts the hour from each timestamp in the input array, as hours from - /// 1970-01-01 00:00:00. - Result Transform(const ArrowArray& input) override; - /// \brief Extract a timestamp hour, as hours from 1970-01-01 00:00:00. Result> Transform(const Literal& literal) override; @@ -198,9 +173,6 @@ class VoidTransform : public TransformFunction { /// \param source_type Input type (ignored). explicit VoidTransform(std::shared_ptr const& source_type); - /// \brief Returns an all-null array of the same length as the input. - Result Transform(const ArrowArray& input) override; - /// \brief Returns a null literal. Result> Transform(const Literal& literal) override; diff --git a/test/transform_test.cc b/test/transform_test.cc index 21de2d408..4477fe35c 100644 --- a/test/transform_test.cc +++ b/test/transform_test.cc @@ -40,12 +40,6 @@ TEST(TransformTest, Transform) { auto source_type = iceberg::string(); auto identity_transform = transform->Bind(source_type); ASSERT_TRUE(identity_transform); - - ArrowArray arrow_array; - auto result = identity_transform.value()->Transform(arrow_array); - ASSERT_FALSE(result); - EXPECT_EQ(ErrorKind::kNotImplemented, result.error().kind); - EXPECT_EQ("IdentityTransform::Transform", result.error().message); } TEST(TransformFunctionTest, CreateBucketTransform) { @@ -193,7 +187,7 @@ TEST(TransformResultTypeTest, NegativeCases) { } } -TEST(TransformFunctionTransformTest, IdentityTransform) { +TEST(TransformLiteralTest, IdentityTransform) { struct Case { std::shared_ptr source_type; Literal source; @@ -247,7 +241,7 @@ TEST(TransformFunctionTransformTest, IdentityTransform) { } } -TEST(TransformFunctionTransformTest, BucketTransform) { +TEST(TransformLiteralTest, BucketTransform) { constexpr int32_t num_buckets = 4; auto transform = Transform::Bucket(num_buckets); @@ -290,29 +284,47 @@ TEST(TransformFunctionTransformTest, BucketTransform) { } } -TEST(TransformFunctionTransformTest, TruncateTransform) { - constexpr int32_t width = 5; - auto transform = Transform::Truncate(width); - +TEST(TransformLiteralTest, TruncateTransform) { struct Case { std::shared_ptr source_type; + int32_t width; Literal source; Literal expected; }; const std::vector cases = { {.source_type = iceberg::int32(), + .width = 5, .source = Literal::Int(123456), - .expected = Literal::Int(1)}, + .expected = Literal::Int(123455)}, {.source_type = iceberg::string(), + .width = 5, .source = Literal::String("Hello, World!"), .expected = Literal::String("Hello")}, + {.source_type = iceberg::string(), + .width = 5, + .source = Literal::String("😜🧐🤔🤪🥳"), + // Truncate to 5 bytes, the safe point should be four bytes which fits the first + // emoji + .expected = Literal::String("😜")}, + {.source_type = iceberg::string(), + .width = 8, + .source = Literal::String("😜🧐🤔🤪🥳"), + // Truncate to 8 bytes, fits the first two emojis + .expected = Literal::String("😜🧐")}, + {.source_type = iceberg::string(), + .width = 3, + .source = Literal::String("😜🧐🤔🤪🥳"), + // Truncate to 3 bytes, the saft point will be 0 bytes + .expected = Literal::String("")}, {.source_type = iceberg::binary(), + .width = 5, .source = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05, 0x06}), .expected = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05})}, }; for (const auto& c : cases) { + auto transform = Transform::Truncate(c.width); auto transformPtr = transform->Bind(c.source_type); ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind truncate transform"; auto result = transformPtr.value()->Transform(c.source); @@ -324,7 +336,7 @@ TEST(TransformFunctionTransformTest, TruncateTransform) { } } -TEST(TransformFunctionTransformTest, YearTransform) { +TEST(TransformLiteralTest, YearTransform) { auto transform = Transform::Year(); struct Case { @@ -335,6 +347,7 @@ TEST(TransformFunctionTransformTest, YearTransform) { const std::vector cases = { {.source_type = iceberg::timestamp(), + // 2021-06-01T11:43:20Z .source = Literal::Timestamp(1622547800000000), .expected = Literal::Int(2021)}, {.source_type = iceberg::timestamp_tz(), @@ -357,7 +370,7 @@ TEST(TransformFunctionTransformTest, YearTransform) { } } -TEST(TransformFunctionTransformTest, MonthTransform) { +TEST(TransformLiteralTest, MonthTransform) { auto transform = Transform::Month(); struct Case { @@ -423,7 +436,7 @@ TEST(TransformFunctionTransformTest, DayTransform) { } } -TEST(TransformFunctionTransformTest, HourTransform) { +TEST(TransformLiteralTest, HourTransform) { auto transform = Transform::Hour(); struct Case { @@ -453,7 +466,7 @@ TEST(TransformFunctionTransformTest, HourTransform) { } } -TEST(TransformFunctionTransformTest, VoidTransform) { +TEST(TransformLiteralTest, VoidTransform) { auto transform = Transform::Void(); struct Case { From 47da2368a220d331d756a26f31083c7ab9302403 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 6 Aug 2025 00:22:09 +0800 Subject: [PATCH 07/15] feat: add NullType and fix string truncate --- src/iceberg/expression/literal.cc | 5 ++++ src/iceberg/expression/literal.h | 14 ++++++---- src/iceberg/json_internal.cc | 3 +- src/iceberg/schema_internal.cc | 3 ++ src/iceberg/transform.h | 3 +- src/iceberg/transform_function.cc | 46 ++++++++++++++++++------------- src/iceberg/transform_function.h | 16 +++++------ src/iceberg/type.cc | 5 ++++ src/iceberg/type.h | 19 +++++++++++++ src/iceberg/type_fwd.h | 3 ++ test/transform_test.cc | 22 ++++++--------- test/type_test.cc | 9 +++++- 12 files changed, 97 insertions(+), 51 deletions(-) diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index eb2a4a3fc..98421bcba 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -126,6 +126,8 @@ Literal::Literal(Value value, std::shared_ptr type) : value_(std::move(value)), type_(std::move(type)) {} // Factory methods +Literal Literal::Null() { return {Value{std::monostate{}}, iceberg::null()}; } + Literal Literal::Boolean(bool value) { return {Value{value}, iceberg::boolean()}; } Literal Literal::Int(int32_t value) { return {Value{value}, iceberg::int32()}; } @@ -205,6 +207,9 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const { // Same type comparison for normal values switch (type_->type_id()) { + case TypeId::kNull: + // Nulls are equivalent + return std::partial_ordering::equivalent; case TypeId::kBoolean: { auto this_val = std::get(value_); auto other_val = std::get(other.value_); diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h index 6161ad4d7..7cfcd7074 100644 --- a/src/iceberg/expression/literal.h +++ b/src/iceberg/expression/literal.h @@ -48,17 +48,19 @@ class ICEBERG_EXPORT Literal { bool operator==(const AboveMax&) const = default; std::strong_ordering operator<=>(const AboveMax&) const = default; }; - using Value = std::variant, // for binary, fixed std::array, // for uuid and decimal BelowMin, AboveMax>; /// \brief Factory methods for primitive types + static Literal Null(); static Literal Boolean(bool value); static Literal Int(int32_t value); static Literal Date(int32_t value); diff --git a/src/iceberg/json_internal.cc b/src/iceberg/json_internal.cc index 4484ad963..724751a9e 100644 --- a/src/iceberg/json_internal.cc +++ b/src/iceberg/json_internal.cc @@ -477,8 +477,9 @@ nlohmann::json ToJson(const Type& type) { } case TypeId::kUuid: return "uuid"; + default: + std::unreachable(); } - std::unreachable(); } nlohmann::json ToJson(const Schema& schema) { diff --git a/src/iceberg/schema_internal.cc b/src/iceberg/schema_internal.cc index fca6f01e3..97443a662 100644 --- a/src/iceberg/schema_internal.cc +++ b/src/iceberg/schema_internal.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include "iceberg/schema.h" #include "iceberg/type.h" @@ -139,6 +140,8 @@ ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view n ArrowMetadataBuilderAppend(&metadata_buffer, ArrowCharView(kArrowExtensionName), ArrowCharView(kArrowUuidExtensionName))); } break; + default: + std::unreachable(); } if (!name.empty()) { diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h index 3b5a725af..9606948cb 100644 --- a/src/iceberg/transform.h +++ b/src/iceberg/transform.h @@ -26,7 +26,6 @@ #include #include -#include "iceberg/arrow_c_data.h" #include "iceberg/expression/literal.h" #include "iceberg/iceberg_export.h" #include "iceberg/result.h" @@ -173,7 +172,7 @@ class ICEBERG_EXPORT TransformFunction { virtual ~TransformFunction() = default; TransformFunction(TransformType transform_type, std::shared_ptr source_type); /// \brief Transform an input Literal to a new Literal - virtual Result> Transform(const Literal& literal) = 0; + virtual Result Transform(const Literal& literal) = 0; /// \brief Get the transform type TransformType transform_type() const; /// \brief Get the source type of transform function diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index 780408593..423fa0b8f 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include "iceberg/type.h" #include "iceberg/util/murmurhash3_internal.h" @@ -32,9 +33,7 @@ namespace iceberg { IdentityTransform::IdentityTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kIdentity, source_type) {} -Result> IdentityTransform::Transform(const Literal& literal) { - return literal; -} +Result IdentityTransform::Transform(const Literal& literal) { return literal; } Result> IdentityTransform::ResultType() const { return source_type(); @@ -53,7 +52,7 @@ BucketTransform::BucketTransform(std::shared_ptr const& source_type, int32_t num_buckets) : TransformFunction(TransformType::kBucket, source_type), num_buckets_(num_buckets) {} -Result> BucketTransform::Transform(const Literal& literal) { +Result BucketTransform::Transform(const Literal& literal) { assert(literal.type() == source_type()); if (literal.IsBelowMin() || literal.IsAboveMax()) { return InvalidArgument( @@ -74,7 +73,8 @@ Result> BucketTransform::Transform(const Literal& literal MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value); } else if constexpr (std::is_same_v>) { MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value); - } else if constexpr (std::is_same_v || std::is_same_v || + } else if constexpr (std::is_same_v || + std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v) { @@ -128,7 +128,7 @@ TruncateTransform::TruncateTransform(std::shared_ptr const& source_type, int32_t width) : TransformFunction(TransformType::kTruncate, source_type), width_(width) {} -Result> TruncateTransform::Transform(const Literal& literal) { +Result TruncateTransform::Transform(const Literal& literal) { assert(literal.type() == source_type()); if (literal.IsBelowMin() || literal.IsAboveMax()) { return InvalidArgument( @@ -150,17 +150,25 @@ Result> TruncateTransform::Transform(const Literal& liter return NotImplemented("Truncate for Decimal is not implemented yet"); } case TypeId::kString: { + // Strings are truncated to a valid UTF-8 string with no more than L code points. auto value = std::get(literal.value()); - if (value.size() > static_cast(width_)) { - size_t safe_point = width_; - while (safe_point > 0 && (value[safe_point] & 0xC0) == 0x80) { - // Find the last valid UTF-8 character boundary before or at width_ - safe_point--; + size_t code_point_count = 0; + size_t safe_point = 0; + + for (size_t i = 0; i < value.size(); ++i) { + // Start of a new UTF-8 code point + if ((value[i] & 0xC0) != 0x80) { + code_point_count++; + if (code_point_count > static_cast(width_)) { + safe_point = i; + break; + } } - // Resize the string to the safe point - value.resize(safe_point); } + if (safe_point != 0) { + value.resize(safe_point); // Resize the string to the safe point + } return Literal::String(value); } case TypeId::kBinary: { @@ -204,7 +212,7 @@ Result> TruncateTransform::Make( YearTransform::YearTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kTruncate, source_type) {} -Result> YearTransform::Transform(const Literal& literal) { +Result YearTransform::Transform(const Literal& literal) { assert(literal.type() == source_type()); if (literal.IsBelowMin() || literal.IsAboveMax()) { return InvalidArgument( @@ -256,7 +264,7 @@ Result> YearTransform::Make( MonthTransform::MonthTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kMonth, source_type) {} -Result> MonthTransform::Transform(const Literal& literal) { +Result MonthTransform::Transform(const Literal& literal) { assert(literal.type() == source_type()); if (literal.IsBelowMin() || literal.IsAboveMax()) { return InvalidArgument( @@ -320,7 +328,7 @@ Result> MonthTransform::Make( DayTransform::DayTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kDay, source_type) {} -Result> DayTransform::Transform(const Literal& literal) { +Result DayTransform::Transform(const Literal& literal) { assert(literal.type() == source_type()); if (literal.IsBelowMin() || literal.IsAboveMax()) { return InvalidArgument( @@ -371,7 +379,7 @@ Result> DayTransform::Make( HourTransform::HourTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kHour, source_type) {} -Result> HourTransform::Transform(const Literal& literal) { +Result HourTransform::Transform(const Literal& literal) { assert(literal.type() == source_type()); if (literal.IsBelowMin() || literal.IsAboveMax()) { return InvalidArgument( @@ -420,8 +428,8 @@ Result> HourTransform::Make( VoidTransform::VoidTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kVoid, source_type) {} -Result> VoidTransform::Transform(const Literal& literal) { - return std::nullopt; +Result VoidTransform::Transform(const Literal& literal) { + return Literal::Null(); } Result> VoidTransform::ResultType() const { return source_type(); } diff --git a/src/iceberg/transform_function.h b/src/iceberg/transform_function.h index 8567083fa..af97a50cb 100644 --- a/src/iceberg/transform_function.h +++ b/src/iceberg/transform_function.h @@ -31,7 +31,7 @@ class IdentityTransform : public TransformFunction { explicit IdentityTransform(std::shared_ptr const& source_type); /// \brief Returns the same Literal as the input. - Result> Transform(const Literal& literal) override; + Result Transform(const Literal& literal) override; /// \brief Returns the same type as the source type if it is valid. Result> ResultType() const override; @@ -51,7 +51,7 @@ class BucketTransform : public TransformFunction { BucketTransform(std::shared_ptr const& source_type, int32_t num_buckets); /// \brief Applies the bucket hash function to the input Literal. - Result> Transform(const Literal& literal) override; + Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. Result> ResultType() const override; @@ -75,7 +75,7 @@ class TruncateTransform : public TransformFunction { TruncateTransform(std::shared_ptr const& source_type, int32_t width); /// \brief Truncates the input Literal to the specified width. - Result> Transform(const Literal& literal) override; + Result Transform(const Literal& literal) override; /// \brief Returns the same type as source_type. Result> ResultType() const override; @@ -98,7 +98,7 @@ class YearTransform : public TransformFunction { explicit YearTransform(std::shared_ptr const& source_type); /// \brief Extract a date or timestamp year, as years from 1970. - Result> Transform(const Literal& literal) override; + Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. Result> ResultType() const override; @@ -117,7 +117,7 @@ class MonthTransform : public TransformFunction { explicit MonthTransform(std::shared_ptr const& source_type); /// \brief Extract a date or timestamp month, as months from 1970-01-01. - Result> Transform(const Literal& literal) override; + Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. Result> ResultType() const override; @@ -136,7 +136,7 @@ class DayTransform : public TransformFunction { explicit DayTransform(std::shared_ptr const& source_type); /// \brief Extract a date or timestamp day, as days from 1970-01-01. - Result> Transform(const Literal& literal) override; + Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. Result> ResultType() const override; @@ -155,7 +155,7 @@ class HourTransform : public TransformFunction { explicit HourTransform(std::shared_ptr const& source_type); /// \brief Extract a timestamp hour, as hours from 1970-01-01 00:00:00. - Result> Transform(const Literal& literal) override; + Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. Result> ResultType() const override; @@ -174,7 +174,7 @@ class VoidTransform : public TransformFunction { explicit VoidTransform(std::shared_ptr const& source_type); /// \brief Returns a null literal. - Result> Transform(const Literal& literal) override; + Result Transform(const Literal& literal) override; /// \brief Returns null type or a sentinel type indicating void. Result> ResultType() const override; diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index e66f96daf..d0b61b8ac 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -194,6 +194,10 @@ bool MapType::Equals(const Type& other) const { return fields_ == map.fields_; } +TypeId NullType::type_id() const { return TypeId::kNull; } +std::string NullType::ToString() const { return "null"; } +bool NullType::Equals(const Type& other) const { return other.type_id() == kTypeId; } + TypeId BooleanType::type_id() const { return kTypeId; } std::string BooleanType::ToString() const { return "boolean"; } bool BooleanType::Equals(const Type& other) const { return other.type_id() == kTypeId; } @@ -296,6 +300,7 @@ bool BinaryType::Equals(const Type& other) const { return other.type_id() == kTy return result; \ } +TYPE_FACTORY(null, NullType) TYPE_FACTORY(boolean, BooleanType) TYPE_FACTORY(int32, IntType) TYPE_FACTORY(int64, LongType) diff --git a/src/iceberg/type.h b/src/iceberg/type.h index 78c0141b1..f96decdc6 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -191,6 +191,23 @@ class ICEBERG_EXPORT MapType : public NestedType { /// Primitive types do not have nested fields. /// @{ +/// \brief A data type that has no physical storage. +/// Technically, this is a primitive type, we treat it as a primitive type for +/// convenience. +class ICEBERG_EXPORT NullType : public PrimitiveType { + public: + constexpr static const TypeId kTypeId = TypeId::kNull; + + NullType() = default; + ~NullType() override = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + /// \brief A data type representing a boolean (true or false). class ICEBERG_EXPORT BooleanType : public PrimitiveType { public: @@ -451,6 +468,8 @@ class ICEBERG_EXPORT UuidType : public PrimitiveType { /// Factory functions for creating primitive data types /// @{ +/// \brief Return a NullType instance. +ICEBERG_EXPORT const std::shared_ptr& null(); /// \brief Return a BooleanType instance. ICEBERG_EXPORT const std::shared_ptr& boolean(); /// \brief Return an IntType instance. diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 0135422c1..6e76f86ea 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -36,6 +36,8 @@ enum class TypeId { kStruct, kList, kMap, + kNull, // Note: A type having no physical storage. This is not an iceberg type, we add + // it to simplify the code logic. kBoolean, kInt, kLong, @@ -69,6 +71,7 @@ class LongType; class ListType; class MapType; class NestedType; +class NullType; class PartitionField; class PartitionSpec; class PrimitiveType; diff --git a/test/transform_test.cc b/test/transform_test.cc index 4477fe35c..6c114cdf3 100644 --- a/test/transform_test.cc +++ b/test/transform_test.cc @@ -24,6 +24,7 @@ #include #include +#include #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep @@ -303,20 +304,13 @@ TEST(TransformLiteralTest, TruncateTransform) { .expected = Literal::String("Hello")}, {.source_type = iceberg::string(), .width = 5, - .source = Literal::String("😜🧐🤔🤪🥳"), - // Truncate to 5 bytes, the safe point should be four bytes which fits the first - // emoji - .expected = Literal::String("😜")}, + .source = Literal::String("😜🧐🤔🤪🥳😵‍💫😂"), + // Truncate to 5 utf-8 code points + .expected = Literal::String("😜🧐🤔🤪🥳")}, {.source_type = iceberg::string(), .width = 8, - .source = Literal::String("😜🧐🤔🤪🥳"), - // Truncate to 8 bytes, fits the first two emojis - .expected = Literal::String("😜🧐")}, - {.source_type = iceberg::string(), - .width = 3, - .source = Literal::String("😜🧐🤔🤪🥳"), - // Truncate to 3 bytes, the saft point will be 0 bytes - .expected = Literal::String("")}, + .source = Literal::String("a😜b🧐c🤔d🤪e🥳"), + .expected = Literal::String("a😜b🧐c🤔d🤪")}, {.source_type = iceberg::binary(), .width = 5, .source = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05, 0x06}), @@ -493,8 +487,8 @@ TEST(TransformLiteralTest, VoidTransform) { auto transformPtr = transform->Bind(c.source_type); ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind void transform"; auto result = transformPtr.value()->Transform(c.source); - ASSERT_EQ(std::nullopt, result) - << "Expected void transform to return no result for source: " + EXPECT_EQ(result.value(), Literal::Null()) + << "Expected void transform to return null type for source: " << c.source.ToString(); } } diff --git a/test/type_test.cc b/test/type_test.cc index fca886928..df4aa241c 100644 --- a/test/type_test.cc +++ b/test/type_test.cc @@ -87,7 +87,14 @@ TEST_P(TypeTest, StdFormat) { ASSERT_EQ(test_case.repr, std::format("{}", *test_case.type)); } -const static std::array kPrimitiveTypes = {{ +const static std::array kPrimitiveTypes = {{ + { + .name = "null", + .type = iceberg::null(), + .type_id = iceberg::TypeId::kNull, + .primitive = true, + .repr = "null", + }, { .name = "boolean", .type = iceberg::boolean(), From 8c685e8fbaa118e6dac5e047a9d6a1a1f59ded00 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 6 Aug 2025 23:12:52 +0800 Subject: [PATCH 08/15] fix: get rid of NullType --- src/iceberg/expression/literal.cc | 17 ++++--- src/iceberg/expression/literal.h | 10 ++++- src/iceberg/json_internal.cc | 3 +- src/iceberg/schema_internal.cc | 3 -- src/iceberg/transform.h | 5 ++- src/iceberg/transform_function.cc | 51 ++++++++++++--------- src/iceberg/transform_function.h | 20 ++++----- src/iceberg/type.cc | 5 --- src/iceberg/type.h | 19 -------- src/iceberg/type_fwd.h | 3 -- test/transform_test.cc | 75 ++++++++++++++++++++++++++++--- test/type_test.cc | 9 +--- 12 files changed, 133 insertions(+), 87 deletions(-) diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index 98421bcba..be6d34ddc 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -126,7 +126,6 @@ Literal::Literal(Value value, std::shared_ptr type) : value_(std::move(value)), type_(std::move(type)) {} // Factory methods -Literal Literal::Null() { return {Value{std::monostate{}}, iceberg::null()}; } Literal Literal::Boolean(bool value) { return {Value{value}, iceberg::boolean()}; } @@ -200,16 +199,14 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const { return std::partial_ordering::unordered; } - // If either value is AboveMax or BelowMin, comparison is unordered - if (IsAboveMax() || IsBelowMin() || other.IsAboveMax() || other.IsBelowMin()) { + // If either value is AboveMax, BelowMin or null, comparison is unordered + if (IsAboveMax() || IsBelowMin() || other.IsAboveMax() || other.IsBelowMin() || + IsNull() || other.IsNull()) { return std::partial_ordering::unordered; } // Same type comparison for normal values switch (type_->type_id()) { - case TypeId::kNull: - // Nulls are equivalent - return std::partial_ordering::equivalent; case TypeId::kBoolean: { auto this_val = std::get(value_); auto other_val = std::get(other.value_); @@ -271,6 +268,9 @@ std::string Literal::ToString() const { if (std::holds_alternative(value_)) { return "aboveMax"; } + if (std::holds_alternative(value_)) { + return "null"; + } switch (type_->type_id()) { case TypeId::kBoolean: { @@ -319,6 +319,8 @@ bool Literal::IsBelowMin() const { return std::holds_alternative(value bool Literal::IsAboveMax() const { return std::holds_alternative(value_); } +bool Literal::IsNull() const { return std::holds_alternative(value_); } + // LiteralCaster implementation Result LiteralCaster::CastTo(const Literal& literal, @@ -330,7 +332,8 @@ Result LiteralCaster::CastTo(const Literal& literal, // Handle special values if (std::holds_alternative(literal.value_) || - std::holds_alternative(literal.value_)) { + std::holds_alternative(literal.value_) || + std::holds_alternative(literal.value_)) { // Cannot cast type for special values return NotSupported("Cannot cast type for {}", literal.ToString()); } diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h index 7cfcd7074..4c880ef3e 100644 --- a/src/iceberg/expression/literal.h +++ b/src/iceberg/expression/literal.h @@ -60,7 +60,6 @@ class ICEBERG_EXPORT Literal { BelowMin, AboveMax>; /// \brief Factory methods for primitive types - static Literal Null(); static Literal Boolean(bool value); static Literal Int(int32_t value); static Literal Date(int32_t value); @@ -73,6 +72,11 @@ class ICEBERG_EXPORT Literal { static Literal String(std::string value); static Literal Binary(std::vector value); + /// \brief Create a literal representing a null value. + static Literal Null(std::shared_ptr type) { + return {Value{std::monostate{}}, std::move(type)}; + } + /// \brief Restore a literal from single-value serialization. /// /// See [this spec](https://iceberg.apache.org/spec/#binary-single-value-serialization) @@ -130,6 +134,10 @@ class ICEBERG_EXPORT Literal { /// \return true if this literal represents a BelowMin value, false otherwise bool IsBelowMin() const; + /// Check if this literal is null. + /// \return true if this literal is null, false otherwise + bool IsNull() const; + std::string ToString() const; private: diff --git a/src/iceberg/json_internal.cc b/src/iceberg/json_internal.cc index 724751a9e..4484ad963 100644 --- a/src/iceberg/json_internal.cc +++ b/src/iceberg/json_internal.cc @@ -477,9 +477,8 @@ nlohmann::json ToJson(const Type& type) { } case TypeId::kUuid: return "uuid"; - default: - std::unreachable(); } + std::unreachable(); } nlohmann::json ToJson(const Schema& schema) { diff --git a/src/iceberg/schema_internal.cc b/src/iceberg/schema_internal.cc index 97443a662..fca6f01e3 100644 --- a/src/iceberg/schema_internal.cc +++ b/src/iceberg/schema_internal.cc @@ -22,7 +22,6 @@ #include #include #include -#include #include "iceberg/schema.h" #include "iceberg/type.h" @@ -140,8 +139,6 @@ ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view n ArrowMetadataBuilderAppend(&metadata_buffer, ArrowCharView(kArrowExtensionName), ArrowCharView(kArrowUuidExtensionName))); } break; - default: - std::unreachable(); } if (!name.empty()) { diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h index 9606948cb..3e6709fa1 100644 --- a/src/iceberg/transform.h +++ b/src/iceberg/transform.h @@ -23,7 +23,6 @@ #include #include -#include #include #include "iceberg/expression/literal.h" @@ -172,13 +171,15 @@ class ICEBERG_EXPORT TransformFunction { virtual ~TransformFunction() = default; TransformFunction(TransformType transform_type, std::shared_ptr source_type); /// \brief Transform an input Literal to a new Literal + /// + /// All transforms must return null for a null input value. virtual Result Transform(const Literal& literal) = 0; /// \brief Get the transform type TransformType transform_type() const; /// \brief Get the source type of transform function const std::shared_ptr& source_type() const; /// \brief Get the result type of transform function - virtual Result> ResultType() const = 0; + virtual std::shared_ptr ResultType() const = 0; friend bool operator==(const TransformFunction& lhs, const TransformFunction& rhs) { return lhs.Equals(rhs); diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index 423fa0b8f..b8bf94f87 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -35,9 +35,7 @@ IdentityTransform::IdentityTransform(std::shared_ptr const& source_type) Result IdentityTransform::Transform(const Literal& literal) { return literal; } -Result> IdentityTransform::ResultType() const { - return source_type(); -} +std::shared_ptr IdentityTransform::ResultType() const { return source_type(); } Result> IdentityTransform::Make( std::shared_ptr const& source_type) { @@ -59,6 +57,10 @@ Result BucketTransform::Transform(const Literal& literal) { "Cannot apply bucket transform to literal with value {} of type {}", literal.ToString(), source_type()->ToString()); } + if (literal.IsNull()) { + return Literal::Null(iceberg::int32()); + } + int32_t hash_value = 0; std::visit( [&](auto&& value) { @@ -92,9 +94,7 @@ Result BucketTransform::Transform(const Literal& literal) { return Literal::Int(bucket_index); } -Result> BucketTransform::ResultType() const { - return iceberg::int32(); -} +std::shared_ptr BucketTransform::ResultType() const { return iceberg::int32(); } Result> BucketTransform::Make( std::shared_ptr const& source_type, int32_t num_buckets) { @@ -135,6 +135,10 @@ Result TruncateTransform::Transform(const Literal& literal) { "Cannot apply truncate transform to literal with value {} of type {}", literal.ToString(), source_type()->ToString()); } + if (literal.IsNull()) { + // Return null as is + return literal; + } switch (source_type()->type_id()) { case TypeId::kInt: { @@ -183,9 +187,7 @@ Result TruncateTransform::Transform(const Literal& literal) { } } -Result> TruncateTransform::ResultType() const { - return source_type(); -} +std::shared_ptr TruncateTransform::ResultType() const { return source_type(); } Result> TruncateTransform::Make( std::shared_ptr const& source_type, int32_t width) { @@ -219,6 +221,9 @@ Result YearTransform::Transform(const Literal& literal) { "Cannot apply year transform to literal with value {} of type {}", literal.ToString(), source_type()->ToString()); } + if (literal.IsNull()) { + return Literal::Null(iceberg::int32()); + } using namespace std::chrono; // NOLINT switch (source_type()->type_id()) { @@ -240,9 +245,7 @@ Result YearTransform::Transform(const Literal& literal) { } } -Result> YearTransform::ResultType() const { - return iceberg::int32(); -} +std::shared_ptr YearTransform::ResultType() const { return iceberg::int32(); } Result> YearTransform::Make( std::shared_ptr const& source_type) { @@ -271,6 +274,9 @@ Result MonthTransform::Transform(const Literal& literal) { "Cannot apply month transform to literal with value {} of type {}", literal.ToString(), source_type()->ToString()); } + if (literal.IsNull()) { + return Literal::Null(iceberg::int32()); + } using namespace std::chrono; // NOLINT switch (source_type()->type_id()) { @@ -304,9 +310,7 @@ Result MonthTransform::Transform(const Literal& literal) { } } -Result> MonthTransform::ResultType() const { - return iceberg::int32(); -} +std::shared_ptr MonthTransform::ResultType() const { return iceberg::int32(); } Result> MonthTransform::Make( std::shared_ptr const& source_type) { @@ -335,6 +339,9 @@ Result DayTransform::Transform(const Literal& literal) { "Cannot apply day transform to literal with value {} of type {}", literal.ToString(), source_type()->ToString()); } + if (literal.IsNull()) { + return Literal::Null(iceberg::int32()); + } using namespace std::chrono; // NOLINT switch (source_type()->type_id()) { @@ -357,7 +364,7 @@ Result DayTransform::Transform(const Literal& literal) { } } -Result> DayTransform::ResultType() const { return iceberg::date(); } +std::shared_ptr DayTransform::ResultType() const { return iceberg::int32(); } Result> DayTransform::Make( std::shared_ptr const& source_type) { @@ -387,6 +394,10 @@ Result HourTransform::Transform(const Literal& literal) { literal.ToString(), source_type()->ToString()); } + if (literal.IsNull()) { + return Literal::Null(int32()); + } + using namespace std::chrono; // NOLINT switch (source_type()->type_id()) { case TypeId::kTimestamp: @@ -405,9 +416,7 @@ Result HourTransform::Transform(const Literal& literal) { } } -Result> HourTransform::ResultType() const { - return iceberg::int32(); -} +std::shared_ptr HourTransform::ResultType() const { return iceberg::int32(); } Result> HourTransform::Make( std::shared_ptr const& source_type) { @@ -429,10 +438,10 @@ VoidTransform::VoidTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kVoid, source_type) {} Result VoidTransform::Transform(const Literal& literal) { - return Literal::Null(); + return literal.IsNull() ? literal : Literal::Null(literal.type()); } -Result> VoidTransform::ResultType() const { return source_type(); } +std::shared_ptr VoidTransform::ResultType() const { return source_type(); } Result> VoidTransform::Make( std::shared_ptr const& source_type) { diff --git a/src/iceberg/transform_function.h b/src/iceberg/transform_function.h index af97a50cb..6d810640a 100644 --- a/src/iceberg/transform_function.h +++ b/src/iceberg/transform_function.h @@ -33,8 +33,8 @@ class IdentityTransform : public TransformFunction { /// \brief Returns the same Literal as the input. Result Transform(const Literal& literal) override; - /// \brief Returns the same type as the source type if it is valid. - Result> ResultType() const override; + /// \brief Returns the same type as source_type. + std::shared_ptr ResultType() const override; /// \brief Create an IdentityTransform. /// \param source_type Type of the input data. @@ -54,7 +54,7 @@ class BucketTransform : public TransformFunction { Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. - Result> ResultType() const override; + std::shared_ptr ResultType() const override; /// \brief Create a BucketTransform. /// \param source_type Type of the input data. @@ -78,7 +78,7 @@ class TruncateTransform : public TransformFunction { Result Transform(const Literal& literal) override; /// \brief Returns the same type as source_type. - Result> ResultType() const override; + std::shared_ptr ResultType() const override; /// \brief Create a TruncateTransform. /// \param source_type Type of the input data. @@ -101,7 +101,7 @@ class YearTransform : public TransformFunction { Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. - Result> ResultType() const override; + std::shared_ptr ResultType() const override; /// \brief Create a YearTransform. /// \param source_type Type of the input data. @@ -120,7 +120,7 @@ class MonthTransform : public TransformFunction { Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. - Result> ResultType() const override; + std::shared_ptr ResultType() const override; /// \brief Create a MonthTransform. /// \param source_type Type of the input data. @@ -139,7 +139,7 @@ class DayTransform : public TransformFunction { Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. - Result> ResultType() const override; + std::shared_ptr ResultType() const override; /// \brief Create a DayTransform. /// \param source_type Type of the input data. @@ -158,7 +158,7 @@ class HourTransform : public TransformFunction { Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. - Result> ResultType() const override; + std::shared_ptr ResultType() const override; /// \brief Create a HourTransform. /// \param source_type Type of the input data. @@ -176,8 +176,8 @@ class VoidTransform : public TransformFunction { /// \brief Returns a null literal. Result Transform(const Literal& literal) override; - /// \brief Returns null type or a sentinel type indicating void. - Result> ResultType() const override; + /// \brief Returns the same type as source_type. + std::shared_ptr ResultType() const override; /// \brief Create a VoidTransform. /// \param source_type Input type (ignored). diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index d0b61b8ac..e66f96daf 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -194,10 +194,6 @@ bool MapType::Equals(const Type& other) const { return fields_ == map.fields_; } -TypeId NullType::type_id() const { return TypeId::kNull; } -std::string NullType::ToString() const { return "null"; } -bool NullType::Equals(const Type& other) const { return other.type_id() == kTypeId; } - TypeId BooleanType::type_id() const { return kTypeId; } std::string BooleanType::ToString() const { return "boolean"; } bool BooleanType::Equals(const Type& other) const { return other.type_id() == kTypeId; } @@ -300,7 +296,6 @@ bool BinaryType::Equals(const Type& other) const { return other.type_id() == kTy return result; \ } -TYPE_FACTORY(null, NullType) TYPE_FACTORY(boolean, BooleanType) TYPE_FACTORY(int32, IntType) TYPE_FACTORY(int64, LongType) diff --git a/src/iceberg/type.h b/src/iceberg/type.h index f96decdc6..78c0141b1 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -191,23 +191,6 @@ class ICEBERG_EXPORT MapType : public NestedType { /// Primitive types do not have nested fields. /// @{ -/// \brief A data type that has no physical storage. -/// Technically, this is a primitive type, we treat it as a primitive type for -/// convenience. -class ICEBERG_EXPORT NullType : public PrimitiveType { - public: - constexpr static const TypeId kTypeId = TypeId::kNull; - - NullType() = default; - ~NullType() override = default; - - TypeId type_id() const override; - std::string ToString() const override; - - protected: - bool Equals(const Type& other) const override; -}; - /// \brief A data type representing a boolean (true or false). class ICEBERG_EXPORT BooleanType : public PrimitiveType { public: @@ -468,8 +451,6 @@ class ICEBERG_EXPORT UuidType : public PrimitiveType { /// Factory functions for creating primitive data types /// @{ -/// \brief Return a NullType instance. -ICEBERG_EXPORT const std::shared_ptr& null(); /// \brief Return a BooleanType instance. ICEBERG_EXPORT const std::shared_ptr& boolean(); /// \brief Return an IntType instance. diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 6e76f86ea..0135422c1 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -36,8 +36,6 @@ enum class TypeId { kStruct, kList, kMap, - kNull, // Note: A type having no physical storage. This is not an iceberg type, we add - // it to simplify the code logic. kBoolean, kInt, kLong, @@ -71,7 +69,6 @@ class LongType; class ListType; class MapType; class NestedType; -class NullType; class PartitionField; class PartitionSpec; class PrimitiveType; diff --git a/test/transform_test.cc b/test/transform_test.cc index 6c114cdf3..68952dcda 100644 --- a/test/transform_test.cc +++ b/test/transform_test.cc @@ -24,8 +24,8 @@ #include #include -#include +#include "iceberg/expression/literal.h" #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "matchers.h" @@ -131,7 +131,7 @@ TEST(TransformResultTypeTest, PositiveCases) { .expected_result_type = iceberg::int32()}, {.str = "day", .source_type = iceberg::timestamp(), - .expected_result_type = iceberg::date()}, + .expected_result_type = iceberg::int32()}, {.str = "hour", .source_type = iceberg::timestamp(), .expected_result_type = iceberg::int32()}, @@ -155,8 +155,7 @@ TEST(TransformResultTypeTest, PositiveCases) { ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind: " << c.str; auto result_type = transformPtr.value()->ResultType(); - ASSERT_TRUE(result_type.has_value()) << "Failed to get result type for: " << c.str; - EXPECT_EQ(result_type.value()->type_id(), c.expected_result_type->type_id()) + EXPECT_EQ(result_type->type_id(), c.expected_result_type->type_id()) << "Unexpected result type for: " << c.str; } } @@ -471,7 +470,7 @@ TEST(TransformLiteralTest, VoidTransform) { const std::vector cases = { {.source_type = iceberg::boolean(), .source = Literal::Boolean(true)}, {.source_type = iceberg::int32(), .source = Literal::Int(42)}, - {.source_type = iceberg::int32(), .source = Literal::Date(30000)}, + {.source_type = iceberg::date(), .source = Literal::Date(30000)}, {.source_type = iceberg::int64(), .source = Literal::Long(1234567890)}, {.source_type = iceberg::timestamp(), .source = Literal::Timestamp(1622547800000000)}, @@ -487,9 +486,73 @@ TEST(TransformLiteralTest, VoidTransform) { auto transformPtr = transform->Bind(c.source_type); ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind void transform"; auto result = transformPtr.value()->Transform(c.source); - EXPECT_EQ(result.value(), Literal::Null()) + EXPECT_TRUE(result->IsNull()) + << "Expected void transform to return null type for source: " + << c.source.ToString(); + EXPECT_EQ(result->type()->type_id(), c.source_type->type_id()) + << "Expected void transform to return same type as source for: " + << c.source.ToString(); + } +} + +TEST(TransformLiteralTest, NullLiteral) { + struct Case { + std::string str; + std::shared_ptr source_type; + Literal source; + std::shared_ptr expected_result_type; + }; + + const std::vector cases = { + {.str = "identity", + .source_type = iceberg::string(), + .source = Literal::Null(iceberg::string()), + .expected_result_type = iceberg::string()}, + {.str = "year", + .source_type = iceberg::timestamp(), + .source = Literal::Null(iceberg::timestamp()), + .expected_result_type = iceberg::int32()}, + {.str = "month", + .source_type = iceberg::timestamp(), + .source = Literal::Null(iceberg::timestamp()), + .expected_result_type = iceberg::int32()}, + {.str = "day", + .source_type = iceberg::timestamp(), + .source = Literal::Null(iceberg::timestamp()), + .expected_result_type = iceberg::int32()}, + {.str = "hour", + .source_type = iceberg::timestamp(), + .source = Literal::Null(iceberg::timestamp()), + .expected_result_type = iceberg::int32()}, + {.str = "void", + .source_type = iceberg::string(), + .source = Literal::Null(iceberg::string()), + .expected_result_type = iceberg::string()}, + {.str = "bucket[16]", + .source_type = iceberg::string(), + .source = Literal::Null(iceberg::string()), + .expected_result_type = iceberg::int32()}, + {.str = "truncate[32]", + .source_type = iceberg::string(), + .source = Literal::Null(iceberg::string()), + .expected_result_type = iceberg::string()}, + }; + + for (const auto& c : cases) { + auto result = TransformFromString(c.str); + ASSERT_TRUE(result.has_value()) << "Failed to parse: " << c.str; + + const auto& transform = result.value(); + const auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind: " << c.str; + + auto transform_result = transformPtr.value()->Transform(c.source); + EXPECT_TRUE(transform_result->IsNull()) << "Expected void transform to return null type for source: " << c.source.ToString(); + EXPECT_EQ(transform_result->type()->type_id(), c.expected_result_type->type_id()) + << "Expected void transform to return same type as source for: " + << c.source.ToString(); } } diff --git a/test/type_test.cc b/test/type_test.cc index df4aa241c..fca886928 100644 --- a/test/type_test.cc +++ b/test/type_test.cc @@ -87,14 +87,7 @@ TEST_P(TypeTest, StdFormat) { ASSERT_EQ(test_case.repr, std::format("{}", *test_case.type)); } -const static std::array kPrimitiveTypes = {{ - { - .name = "null", - .type = iceberg::null(), - .type_id = iceberg::TypeId::kNull, - .primitive = true, - .repr = "null", - }, +const static std::array kPrimitiveTypes = {{ { .name = "boolean", .type = iceberg::boolean(), From 080b10a1eddedbd432fcd2d355f4a3a8ee5a8794 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 6 Aug 2025 23:53:07 +0800 Subject: [PATCH 09/15] feat: add truncate_utils.h --- src/iceberg/transform_function.cc | 38 +++++++-------------- src/iceberg/util/truncate_utils.h | 55 +++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 26 deletions(-) create mode 100644 src/iceberg/util/truncate_utils.h diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index b8bf94f87..6d2256e7c 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -27,6 +27,7 @@ #include "iceberg/type.h" #include "iceberg/util/murmurhash3_internal.h" +#include "iceberg/util/truncate_utils.h" namespace iceberg { @@ -57,7 +58,7 @@ Result BucketTransform::Transform(const Literal& literal) { "Cannot apply bucket transform to literal with value {} of type {}", literal.ToString(), source_type()->ToString()); } - if (literal.IsNull()) { + if (literal.IsNull()) [[unlikely]] { return Literal::Null(iceberg::int32()); } @@ -135,7 +136,7 @@ Result TruncateTransform::Transform(const Literal& literal) { "Cannot apply truncate transform to literal with value {} of type {}", literal.ToString(), source_type()->ToString()); } - if (literal.IsNull()) { + if (literal.IsNull()) [[unlikely]] { // Return null as is return literal; } @@ -143,11 +144,11 @@ Result TruncateTransform::Transform(const Literal& literal) { switch (source_type()->type_id()) { case TypeId::kInt: { auto value = std::get(literal.value()); - return Literal::Int(value - (((value % width_) + width_) % width_)); + return Literal::Int(TruncateUtils::TruncateInt(value, width_)); } case TypeId::kLong: { auto value = std::get(literal.value()); - return Literal::Long(value - (((value % width_) + width_) % width_)); + return Literal::Long(TruncateUtils::TruncateLong(value, width_)); } case TypeId::kDecimal: { // TODO(zhjwpku): Handle decimal truncation logic here @@ -156,26 +157,11 @@ Result TruncateTransform::Transform(const Literal& literal) { case TypeId::kString: { // Strings are truncated to a valid UTF-8 string with no more than L code points. auto value = std::get(literal.value()); - size_t code_point_count = 0; - size_t safe_point = 0; - - for (size_t i = 0; i < value.size(); ++i) { - // Start of a new UTF-8 code point - if ((value[i] & 0xC0) != 0x80) { - code_point_count++; - if (code_point_count > static_cast(width_)) { - safe_point = i; - break; - } - } - } - - if (safe_point != 0) { - value.resize(safe_point); // Resize the string to the safe point - } - return Literal::String(value); + return Literal::String(TruncateUtils::TruncateUTF8(std::move(value), width_)); } case TypeId::kBinary: { + /// In contrast to strings, binary values do not have an assumed encoding and are + /// truncated to L bytes. auto value = std::get>(literal.value()); if (value.size() > static_cast(width_)) { value.resize(width_); @@ -221,7 +207,7 @@ Result YearTransform::Transform(const Literal& literal) { "Cannot apply year transform to literal with value {} of type {}", literal.ToString(), source_type()->ToString()); } - if (literal.IsNull()) { + if (literal.IsNull()) [[unlikely]] { return Literal::Null(iceberg::int32()); } @@ -274,7 +260,7 @@ Result MonthTransform::Transform(const Literal& literal) { "Cannot apply month transform to literal with value {} of type {}", literal.ToString(), source_type()->ToString()); } - if (literal.IsNull()) { + if (literal.IsNull()) [[unlikely]] { return Literal::Null(iceberg::int32()); } @@ -339,7 +325,7 @@ Result DayTransform::Transform(const Literal& literal) { "Cannot apply day transform to literal with value {} of type {}", literal.ToString(), source_type()->ToString()); } - if (literal.IsNull()) { + if (literal.IsNull()) [[unlikely]] { return Literal::Null(iceberg::int32()); } @@ -394,7 +380,7 @@ Result HourTransform::Transform(const Literal& literal) { literal.ToString(), source_type()->ToString()); } - if (literal.IsNull()) { + if (literal.IsNull()) [[unlikely]] { return Literal::Null(int32()); } diff --git a/src/iceberg/util/truncate_utils.h b/src/iceberg/util/truncate_utils.h new file mode 100644 index 000000000..ed11c167b --- /dev/null +++ b/src/iceberg/util/truncate_utils.h @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include + +#include "iceberg/iceberg_export.h" + +namespace iceberg { + +ICEBERG_EXPORT class TruncateUtils { + public: + static std::string TruncateUTF8(std::string&& source, size_t L) { + size_t code_point_count = 0; + size_t safe_point = 0; + + for (size_t i = 0; i < source.size(); ++i) { + // Start of a new UTF-8 code point + if ((source[i] & 0xC0) != 0x80) { + code_point_count++; + if (code_point_count > L) { + safe_point = i; + break; + } + } + } + + if (safe_point != 0) { + // Resize the string to the safe point + source.resize(safe_point); + } + + return std::move(source); + } +}; + +} // namespace iceberg From f4d4f5da2c545cf9cc6e51aa4d736f1dbd0c0cb1 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 6 Aug 2025 23:54:05 +0800 Subject: [PATCH 10/15] feat: add truncate_utils.h --- src/iceberg/transform_function.cc | 4 ++-- src/iceberg/util/truncate_utils.h | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index 6d2256e7c..c439ae851 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -144,11 +144,11 @@ Result TruncateTransform::Transform(const Literal& literal) { switch (source_type()->type_id()) { case TypeId::kInt: { auto value = std::get(literal.value()); - return Literal::Int(TruncateUtils::TruncateInt(value, width_)); + return Literal::Int(TruncateUtils::TruncateInteger(value, width_)); } case TypeId::kLong: { auto value = std::get(literal.value()); - return Literal::Long(TruncateUtils::TruncateLong(value, width_)); + return Literal::Long(TruncateUtils::TruncateInteger(value, width_)); } case TypeId::kDecimal: { // TODO(zhjwpku): Handle decimal truncation logic here diff --git a/src/iceberg/util/truncate_utils.h b/src/iceberg/util/truncate_utils.h index ed11c167b..d3b7f90a4 100644 --- a/src/iceberg/util/truncate_utils.h +++ b/src/iceberg/util/truncate_utils.h @@ -19,8 +19,8 @@ #pragma once -#include #include +#include #include "iceberg/iceberg_export.h" @@ -28,6 +28,13 @@ namespace iceberg { ICEBERG_EXPORT class TruncateUtils { public: + /// \brief Truncate a UTF-8 string to a specified number of code points. + /// + /// \param source The input string to truncate. + /// \param L The maximum number of code points allowed in the output string. + /// \return A valid UTF-8 string truncated to L code points. + /// If the input string is already valid and has fewer than L code points, it is + /// returned unchanged. static std::string TruncateUTF8(std::string&& source, size_t L) { size_t code_point_count = 0; size_t safe_point = 0; @@ -36,7 +43,7 @@ ICEBERG_EXPORT class TruncateUtils { // Start of a new UTF-8 code point if ((source[i] & 0xC0) != 0x80) { code_point_count++; - if (code_point_count > L) { + if (code_point_count > static_cast(L)) { safe_point = i; break; } @@ -50,6 +57,15 @@ ICEBERG_EXPORT class TruncateUtils { return std::move(source); } + + /// \brief Truncate an integer v, either int32_t or int64_t, to v - (v % W). + /// + /// The remainder, v % W, must be positive. For languages where % can produce negative + /// values, the correct truncate function is: v - (((v % W) + W) % W) + template + static inline T TruncateInteger(T v, size_t W) { + return v - (((v % W) + W) % W); + } }; } // namespace iceberg From 4c0766a2b04d9bf2e5bffc3d916bce6f66a01f74 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Thu, 7 Aug 2025 22:11:05 +0800 Subject: [PATCH 11/15] fix: use concepts to limit TTruncateInteger to int32_t and int64_t --- src/iceberg/expression/literal.cc | 28 +++++++++++----------------- src/iceberg/util/truncate_utils.h | 1 + 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index be6d34ddc..e3abb6a66 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -21,7 +21,6 @@ #include #include -#include #include "iceberg/exception.h" @@ -126,33 +125,28 @@ Literal::Literal(Value value, std::shared_ptr type) : value_(std::move(value)), type_(std::move(type)) {} // Factory methods +Literal Literal::Boolean(bool value) { return {Value{value}, boolean()}; } -Literal Literal::Boolean(bool value) { return {Value{value}, iceberg::boolean()}; } +Literal Literal::Int(int32_t value) { return {Value{value}, int32()}; } -Literal Literal::Int(int32_t value) { return {Value{value}, iceberg::int32()}; } +Literal Literal::Date(int32_t value) { return {Value{value}, date()}; } -Literal Literal::Date(int32_t value) { return {Value{value}, iceberg::date()}; } +Literal Literal::Long(int64_t value) { return {Value{value}, int64()}; } -Literal Literal::Long(int64_t value) { return {Value{value}, iceberg::int64()}; } +Literal Literal::Time(int64_t value) { return {Value{value}, time()}; } -Literal Literal::Time(int64_t value) { return {Value{value}, iceberg::time()}; } +Literal Literal::Timestamp(int64_t value) { return {Value{value}, timestamp()}; } -Literal Literal::Timestamp(int64_t value) { return {Value{value}, iceberg::timestamp()}; } +Literal Literal::TimestampTz(int64_t value) { return {Value{value}, timestamp_tz()}; } -Literal Literal::TimestampTz(int64_t value) { - return {Value{value}, iceberg::timestamp_tz()}; -} - -Literal Literal::Float(float value) { return {Value{value}, iceberg::float32()}; } +Literal Literal::Float(float value) { return {Value{value}, float32()}; } -Literal Literal::Double(double value) { return {Value{value}, iceberg::float64()}; } +Literal Literal::Double(double value) { return {Value{value}, float64()}; } -Literal Literal::String(std::string value) { - return {Value{std::move(value)}, iceberg::string()}; -} +Literal Literal::String(std::string value) { return {Value{std::move(value)}, string()}; } Literal Literal::Binary(std::vector value) { - return {Value{std::move(value)}, iceberg::binary()}; + return {Value{std::move(value)}, binary()}; } Result Literal::Deserialize(std::span data, diff --git a/src/iceberg/util/truncate_utils.h b/src/iceberg/util/truncate_utils.h index d3b7f90a4..1ef7a7a25 100644 --- a/src/iceberg/util/truncate_utils.h +++ b/src/iceberg/util/truncate_utils.h @@ -63,6 +63,7 @@ ICEBERG_EXPORT class TruncateUtils { /// The remainder, v % W, must be positive. For languages where % can produce negative /// values, the correct truncate function is: v - (((v % W) + W) % W) template + requires std::is_same_v || std::is_same_v static inline T TruncateInteger(T v, size_t W) { return v - (((v % W) + W) % W); } From 0bf304af43c065b55036f9e40b0f9e36b3e69228 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Fri, 8 Aug 2025 22:36:44 +0800 Subject: [PATCH 12/15] fix: review comments --- src/iceberg/transform_function.cc | 20 ++++++++++---------- src/iceberg/util/truncate_utils.h | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index c439ae851..4e008e5c6 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -59,7 +59,7 @@ Result BucketTransform::Transform(const Literal& literal) { literal.ToString(), source_type()->ToString()); } if (literal.IsNull()) [[unlikely]] { - return Literal::Null(iceberg::int32()); + return Literal::Null(int32()); } int32_t hash_value = 0; @@ -95,7 +95,7 @@ Result BucketTransform::Transform(const Literal& literal) { return Literal::Int(bucket_index); } -std::shared_ptr BucketTransform::ResultType() const { return iceberg::int32(); } +std::shared_ptr BucketTransform::ResultType() const { return int32(); } Result> BucketTransform::Make( std::shared_ptr const& source_type, int32_t num_buckets) { @@ -166,7 +166,7 @@ Result TruncateTransform::Transform(const Literal& literal) { if (value.size() > static_cast(width_)) { value.resize(width_); } - return Literal::Binary(value); + return Literal::Binary(std::move(value)); } default: std::unreachable(); @@ -208,7 +208,7 @@ Result YearTransform::Transform(const Literal& literal) { literal.ToString(), source_type()->ToString()); } if (literal.IsNull()) [[unlikely]] { - return Literal::Null(iceberg::int32()); + return Literal::Null(int32()); } using namespace std::chrono; // NOLINT @@ -231,7 +231,7 @@ Result YearTransform::Transform(const Literal& literal) { } } -std::shared_ptr YearTransform::ResultType() const { return iceberg::int32(); } +std::shared_ptr YearTransform::ResultType() const { return int32(); } Result> YearTransform::Make( std::shared_ptr const& source_type) { @@ -261,7 +261,7 @@ Result MonthTransform::Transform(const Literal& literal) { literal.ToString(), source_type()->ToString()); } if (literal.IsNull()) [[unlikely]] { - return Literal::Null(iceberg::int32()); + return Literal::Null(int32()); } using namespace std::chrono; // NOLINT @@ -296,7 +296,7 @@ Result MonthTransform::Transform(const Literal& literal) { } } -std::shared_ptr MonthTransform::ResultType() const { return iceberg::int32(); } +std::shared_ptr MonthTransform::ResultType() const { return int32(); } Result> MonthTransform::Make( std::shared_ptr const& source_type) { @@ -326,7 +326,7 @@ Result DayTransform::Transform(const Literal& literal) { literal.ToString(), source_type()->ToString()); } if (literal.IsNull()) [[unlikely]] { - return Literal::Null(iceberg::int32()); + return Literal::Null(int32()); } using namespace std::chrono; // NOLINT @@ -350,7 +350,7 @@ Result DayTransform::Transform(const Literal& literal) { } } -std::shared_ptr DayTransform::ResultType() const { return iceberg::int32(); } +std::shared_ptr DayTransform::ResultType() const { return int32(); } Result> DayTransform::Make( std::shared_ptr const& source_type) { @@ -402,7 +402,7 @@ Result HourTransform::Transform(const Literal& literal) { } } -std::shared_ptr HourTransform::ResultType() const { return iceberg::int32(); } +std::shared_ptr HourTransform::ResultType() const { return int32(); } Result> HourTransform::Make( std::shared_ptr const& source_type) { diff --git a/src/iceberg/util/truncate_utils.h b/src/iceberg/util/truncate_utils.h index 1ef7a7a25..46299358e 100644 --- a/src/iceberg/util/truncate_utils.h +++ b/src/iceberg/util/truncate_utils.h @@ -35,7 +35,7 @@ ICEBERG_EXPORT class TruncateUtils { /// \return A valid UTF-8 string truncated to L code points. /// If the input string is already valid and has fewer than L code points, it is /// returned unchanged. - static std::string TruncateUTF8(std::string&& source, size_t L) { + static std::string TruncateUTF8(std::string source, size_t L) { size_t code_point_count = 0; size_t safe_point = 0; From 1c08af1ed3735c81c4313400c610221eb5bab33f Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sat, 9 Aug 2025 10:09:02 +0800 Subject: [PATCH 13/15] fix: DayTransform wrong return type --- src/iceberg/transform_function.cc | 6 +++--- test/transform_test.cc | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index 4e008e5c6..0cc227e50 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -25,6 +25,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/type.h" #include "iceberg/util/murmurhash3_internal.h" #include "iceberg/util/truncate_utils.h" @@ -332,8 +333,7 @@ Result DayTransform::Transform(const Literal& literal) { using namespace std::chrono; // NOLINT switch (source_type()->type_id()) { case TypeId::kDate: { - // Day is the same as the date value - return literal; + return Literal::Int(std::get(literal.value())); } case TypeId::kTimestamp: case TypeId::kTimestampTz: { @@ -342,7 +342,7 @@ Result DayTransform::Transform(const Literal& literal) { auto timestamp = sys_time(microseconds{value}); auto days_since_epoch = floor(timestamp); - return Literal::Date( + return Literal::Int( static_cast(days_since_epoch.time_since_epoch().count())); } default: diff --git a/test/transform_test.cc b/test/transform_test.cc index 68952dcda..c1efcb56a 100644 --- a/test/transform_test.cc +++ b/test/transform_test.cc @@ -408,13 +408,13 @@ TEST(TransformFunctionTransformTest, DayTransform) { const std::vector cases = { {.source_type = iceberg::timestamp(), .source = Literal::Timestamp(1622547800000000), - .expected = Literal::Date(18779)}, + .expected = Literal::Int(18779)}, {.source_type = iceberg::timestamp_tz(), .source = Literal::TimestampTz(1622547800000000), - .expected = Literal::Date(18779)}, + .expected = Literal::Int(18779)}, {.source_type = iceberg::date(), .source = Literal::Date(30000), - .expected = Literal::Date(30000)}, + .expected = Literal::Int(30000)}, }; for (const auto& c : cases) { From 738f59c3362fcd2e94fa72758151fe6e8b215903 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sun, 10 Aug 2025 11:09:51 +0800 Subject: [PATCH 14/15] fix: ICEBERG_EXPORT position By upgrade https://github.com/pre-commit/mirrors-clang-format to v20.1.8 --- .pre-commit-config.yaml | 2 +- src/iceberg/util/string_utils.h | 2 +- src/iceberg/util/truncate_utils.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 52c8fe53f..04d60cf46 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,7 +30,7 @@ repos: - id: check-added-large-files - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v19.1.5 + rev: v20.1.8 hooks: - id: clang-format exclude: ^test/resources/.*\.json$ diff --git a/src/iceberg/util/string_utils.h b/src/iceberg/util/string_utils.h index 9ff250b66..558fc293c 100644 --- a/src/iceberg/util/string_utils.h +++ b/src/iceberg/util/string_utils.h @@ -27,7 +27,7 @@ namespace iceberg { -ICEBERG_EXPORT class StringUtils { +class ICEBERG_EXPORT StringUtils { public: static std::string ToLower(std::string_view str) { std::string input(str); diff --git a/src/iceberg/util/truncate_utils.h b/src/iceberg/util/truncate_utils.h index 46299358e..5e76135c5 100644 --- a/src/iceberg/util/truncate_utils.h +++ b/src/iceberg/util/truncate_utils.h @@ -26,7 +26,7 @@ namespace iceberg { -ICEBERG_EXPORT class TruncateUtils { +class ICEBERG_EXPORT TruncateUtils { public: /// \brief Truncate a UTF-8 string to a specified number of code points. /// From 8e25f03476e533a9c25c777bed9dc9825be241b1 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sun, 10 Aug 2025 13:46:13 +0800 Subject: [PATCH 15/15] fix: bump clang-tidy version to 22 --- .github/workflows/cpp-linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml index 2085a4942..90f446cc2 100644 --- a/.github/workflows/cpp-linter.yml +++ b/.github/workflows/cpp-linter.yml @@ -47,7 +47,7 @@ jobs: with: style: file tidy-checks: '' - version: 19 + version: 22 files-changed-only: true lines-changed-only: true thread-comments: true