diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml index 2085a4942..90f446cc2 100644 --- a/.github/workflows/cpp-linter.yml +++ b/.github/workflows/cpp-linter.yml @@ -47,7 +47,7 @@ jobs: with: style: file tidy-checks: '' - version: 19 + version: 22 files-changed-only: true lines-changed-only: true thread-comments: true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 52c8fe53f..04d60cf46 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,7 +30,7 @@ repos: - id: check-added-large-files - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v19.1.5 + rev: v20.1.8 hooks: - id: clang-format exclude: ^test/resources/.*\.json$ diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index d053f305f..e3abb6a66 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -21,7 +21,6 @@ #include #include -#include #include "iceberg/exception.h" @@ -126,22 +125,28 @@ Literal::Literal(Value value, std::shared_ptr type) : value_(std::move(value)), type_(std::move(type)) {} // Factory methods -Literal Literal::Boolean(bool value) { return {Value{value}, iceberg::boolean()}; } +Literal Literal::Boolean(bool value) { return {Value{value}, boolean()}; } -Literal Literal::Int(int32_t value) { return {Value{value}, iceberg::int32()}; } +Literal Literal::Int(int32_t value) { return {Value{value}, int32()}; } -Literal Literal::Long(int64_t value) { return {Value{value}, iceberg::int64()}; } +Literal Literal::Date(int32_t value) { return {Value{value}, date()}; } -Literal Literal::Float(float value) { return {Value{value}, iceberg::float32()}; } +Literal Literal::Long(int64_t value) { return {Value{value}, int64()}; } -Literal Literal::Double(double value) { return {Value{value}, iceberg::float64()}; } +Literal Literal::Time(int64_t value) { return {Value{value}, time()}; } -Literal Literal::String(std::string value) { - return {Value{std::move(value)}, iceberg::string()}; -} +Literal Literal::Timestamp(int64_t value) { return {Value{value}, timestamp()}; } + +Literal Literal::TimestampTz(int64_t value) { return {Value{value}, timestamp_tz()}; } + +Literal Literal::Float(float value) { return {Value{value}, float32()}; } + +Literal Literal::Double(double value) { return {Value{value}, float64()}; } + +Literal Literal::String(std::string value) { return {Value{std::move(value)}, string()}; } Literal Literal::Binary(std::vector value) { - return {Value{std::move(value)}, iceberg::binary()}; + return {Value{std::move(value)}, binary()}; } Result Literal::Deserialize(std::span data, @@ -188,8 +193,9 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const { return std::partial_ordering::unordered; } - // If either value is AboveMax or BelowMin, comparison is unordered - if (IsAboveMax() || IsBelowMin() || other.IsAboveMax() || other.IsBelowMin()) { + // If either value is AboveMax, BelowMin or null, comparison is unordered + if (IsAboveMax() || IsBelowMin() || other.IsAboveMax() || other.IsBelowMin() || + IsNull() || other.IsNull()) { return std::partial_ordering::unordered; } @@ -202,13 +208,16 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const { return this_val ? std::partial_ordering::greater : std::partial_ordering::less; } - case TypeId::kInt: { + case TypeId::kInt: + case TypeId::kDate: { auto this_val = std::get(value_); auto other_val = std::get(other.value_); return this_val <=> other_val; } - case TypeId::kLong: { + case TypeId::kLong: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { auto this_val = std::get(value_); auto other_val = std::get(other.value_); return this_val <=> other_val; @@ -253,6 +262,9 @@ std::string Literal::ToString() const { if (std::holds_alternative(value_)) { return "aboveMax"; } + if (std::holds_alternative(value_)) { + return "null"; + } switch (type_->type_id()) { case TypeId::kBoolean: { @@ -301,6 +313,8 @@ bool Literal::IsBelowMin() const { return std::holds_alternative(value bool Literal::IsAboveMax() const { return std::holds_alternative(value_); } +bool Literal::IsNull() const { return std::holds_alternative(value_); } + // LiteralCaster implementation Result LiteralCaster::CastTo(const Literal& literal, @@ -312,7 +326,8 @@ Result LiteralCaster::CastTo(const Literal& literal, // Handle special values if (std::holds_alternative(literal.value_) || - std::holds_alternative(literal.value_)) { + std::holds_alternative(literal.value_) || + std::holds_alternative(literal.value_)) { // Cannot cast type for special values return NotSupported("Cannot cast type for {}", literal.ToString()); } diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h index 739e20fc2..4c880ef3e 100644 --- a/src/iceberg/expression/literal.h +++ b/src/iceberg/expression/literal.h @@ -32,7 +32,7 @@ namespace iceberg { /// \brief Literal is a literal value that is associated with a primitive type. class ICEBERG_EXPORT Literal { - private: + public: /// \brief Sentinel value to indicate that the literal value is below the valid range /// of a specific primitive type. It can happen when casting a literal to a narrower /// primitive type. @@ -48,27 +48,35 @@ class ICEBERG_EXPORT Literal { bool operator==(const AboveMax&) const = default; std::strong_ordering operator<=>(const AboveMax&) const = default; }; - - using Value = std::variant, // for binary, fixed std::array, // for uuid and decimal BelowMin, AboveMax>; - public: /// \brief Factory methods for primitive types static Literal Boolean(bool value); static Literal Int(int32_t value); + static Literal Date(int32_t value); static Literal Long(int64_t value); + static Literal Time(int64_t value); + static Literal Timestamp(int64_t value); + static Literal TimestampTz(int64_t value); static Literal Float(float value); static Literal Double(double value); static Literal String(std::string value); static Literal Binary(std::vector value); + /// \brief Create a literal representing a null value. + static Literal Null(std::shared_ptr type) { + return {Value{std::monostate{}}, std::move(type)}; + } + /// \brief Restore a literal from single-value serialization. /// /// See [this spec](https://iceberg.apache.org/spec/#binary-single-value-serialization) @@ -85,6 +93,9 @@ class ICEBERG_EXPORT Literal { /// \brief Get the literal type. const std::shared_ptr& type() const; + /// \brief Get the literal value. + const Value& value() const { return value_; } + /// \brief Converts this literal to a literal of the given type. /// /// When a predicate is bound to a concrete data column, literals are converted to match @@ -123,6 +134,10 @@ class ICEBERG_EXPORT Literal { /// \return true if this literal represents a BelowMin value, false otherwise bool IsBelowMin() const; + /// Check if this literal is null. + /// \return true if this literal is null, false otherwise + bool IsNull() const; + std::string ToString() const; private: diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h index f09f15bba..3e6709fa1 100644 --- a/src/iceberg/transform.h +++ b/src/iceberg/transform.h @@ -25,7 +25,7 @@ #include #include -#include "iceberg/arrow_c_data.h" +#include "iceberg/expression/literal.h" #include "iceberg/iceberg_export.h" #include "iceberg/result.h" #include "iceberg/type_fwd.h" @@ -170,14 +170,16 @@ class ICEBERG_EXPORT TransformFunction { public: virtual ~TransformFunction() = default; TransformFunction(TransformType transform_type, std::shared_ptr source_type); - /// \brief Transform an input array to a new array - virtual Result Transform(const ArrowArray& data) = 0; + /// \brief Transform an input Literal to a new Literal + /// + /// All transforms must return null for a null input value. + virtual Result Transform(const Literal& literal) = 0; /// \brief Get the transform type TransformType transform_type() const; /// \brief Get the source type of transform function const std::shared_ptr& source_type() const; /// \brief Get the result type of transform function - virtual Result> ResultType() const = 0; + virtual std::shared_ptr ResultType() const = 0; friend bool operator==(const TransformFunction& lhs, const TransformFunction& rhs) { return lhs.Equals(rhs); diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index 9ddf6e9f7..0cc227e50 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -19,20 +19,25 @@ #include "iceberg/transform_function.h" +#include +#include +#include +#include +#include + +#include "iceberg/expression/literal.h" #include "iceberg/type.h" +#include "iceberg/util/murmurhash3_internal.h" +#include "iceberg/util/truncate_utils.h" namespace iceberg { IdentityTransform::IdentityTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kIdentity, source_type) {} -Result IdentityTransform::Transform(const ArrowArray& input) { - return NotImplemented("IdentityTransform::Transform"); -} +Result IdentityTransform::Transform(const Literal& literal) { return literal; } -Result> IdentityTransform::ResultType() const { - return source_type(); -} +std::shared_ptr IdentityTransform::ResultType() const { return source_type(); } Result> IdentityTransform::Make( std::shared_ptr const& source_type) { @@ -47,14 +52,52 @@ BucketTransform::BucketTransform(std::shared_ptr const& source_type, int32_t num_buckets) : TransformFunction(TransformType::kBucket, source_type), num_buckets_(num_buckets) {} -Result BucketTransform::Transform(const ArrowArray& input) { - return NotImplemented("BucketTransform::Transform"); -} +Result BucketTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply bucket transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + if (literal.IsNull()) [[unlikely]] { + return Literal::Null(int32()); + } -Result> BucketTransform::ResultType() const { - return iceberg::int32(); + int32_t hash_value = 0; + std::visit( + [&](auto&& value) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + MurmurHash3_x86_32(&value, sizeof(int32_t), 0, &hash_value); + } else if constexpr (std::is_same_v) { + MurmurHash3_x86_32(&value, sizeof(int64_t), 0, &hash_value); + } else if constexpr (std::is_same_v>) { + MurmurHash3_x86_32(value.data(), sizeof(uint8_t) * 16, 0, &hash_value); + } else if constexpr (std::is_same_v) { + MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value); + } else if constexpr (std::is_same_v>) { + MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value); + } else if constexpr (std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + std::unreachable(); + } else { + static_assert(false, "Unhandled type in BucketTransform::Transform"); + } + }, + literal.value()); + + // Calculate the bucket index + int32_t bucket_index = + (hash_value & std::numeric_limits::max()) % num_buckets_; + + return Literal::Int(bucket_index); } +std::shared_ptr BucketTransform::ResultType() const { return int32(); } + Result> BucketTransform::Make( std::shared_ptr const& source_type, int32_t num_buckets) { if (!source_type) { @@ -87,14 +130,52 @@ TruncateTransform::TruncateTransform(std::shared_ptr const& source_type, int32_t width) : TransformFunction(TransformType::kTruncate, source_type), width_(width) {} -Result TruncateTransform::Transform(const ArrowArray& input) { - return NotImplemented("TruncateTransform::Transform"); -} +Result TruncateTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply truncate transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + if (literal.IsNull()) [[unlikely]] { + // Return null as is + return literal; + } -Result> TruncateTransform::ResultType() const { - return source_type(); + switch (source_type()->type_id()) { + case TypeId::kInt: { + auto value = std::get(literal.value()); + return Literal::Int(TruncateUtils::TruncateInteger(value, width_)); + } + case TypeId::kLong: { + auto value = std::get(literal.value()); + return Literal::Long(TruncateUtils::TruncateInteger(value, width_)); + } + case TypeId::kDecimal: { + // TODO(zhjwpku): Handle decimal truncation logic here + return NotImplemented("Truncate for Decimal is not implemented yet"); + } + case TypeId::kString: { + // Strings are truncated to a valid UTF-8 string with no more than L code points. + auto value = std::get(literal.value()); + return Literal::String(TruncateUtils::TruncateUTF8(std::move(value), width_)); + } + case TypeId::kBinary: { + /// In contrast to strings, binary values do not have an assumed encoding and are + /// truncated to L bytes. + auto value = std::get>(literal.value()); + if (value.size() > static_cast(width_)) { + value.resize(width_); + } + return Literal::Binary(std::move(value)); + } + default: + std::unreachable(); + } } +std::shared_ptr TruncateTransform::ResultType() const { return source_type(); } + Result> TruncateTransform::Make( std::shared_ptr const& source_type, int32_t width) { if (!source_type) { @@ -120,14 +201,39 @@ Result> TruncateTransform::Make( YearTransform::YearTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kTruncate, source_type) {} -Result YearTransform::Transform(const ArrowArray& input) { - return NotImplemented("YearTransform::Transform"); -} +Result YearTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply year transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + if (literal.IsNull()) [[unlikely]] { + return Literal::Null(int32()); + } -Result> YearTransform::ResultType() const { - return iceberg::int32(); + using namespace std::chrono; // NOLINT + switch (source_type()->type_id()) { + case TypeId::kDate: { + auto value = std::get(literal.value()); + auto epoch = sys_days(year{1970} / January / 1); + auto ymd = year_month_day(epoch + days{value}); + return Literal::Int(static_cast(ymd.year())); + } + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + auto value = std::get(literal.value()); + // Convert microseconds-since-epoch into a `year_month_day` object + auto ymd = year_month_day(floor(sys_time(microseconds{value}))); + return Literal::Int(static_cast(ymd.year())); + } + default: + std::unreachable(); + } } +std::shared_ptr YearTransform::ResultType() const { return int32(); } + Result> YearTransform::Make( std::shared_ptr const& source_type) { if (!source_type) { @@ -148,14 +254,51 @@ Result> YearTransform::Make( MonthTransform::MonthTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kMonth, source_type) {} -Result MonthTransform::Transform(const ArrowArray& input) { - return NotImplemented("MonthTransform::Transform"); -} +Result MonthTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply month transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + if (literal.IsNull()) [[unlikely]] { + return Literal::Null(int32()); + } -Result> MonthTransform::ResultType() const { - return iceberg::int32(); + using namespace std::chrono; // NOLINT + switch (source_type()->type_id()) { + case TypeId::kDate: { + auto value = std::get(literal.value()); + auto epoch = sys_days(year{1970} / January / 1); + auto ymd = year_month_day(epoch + days{value}); + auto epoch_ymd = year_month_day(epoch); + auto delta = ymd.year() - epoch_ymd.year(); + // Calculate the month as months from 1970-01 + // Note: January is month 1, so we subtract 1 to get zero-based + // month count. + return Literal::Int(static_cast(delta.count() * 12 + + static_cast(ymd.month()) - 1)); + } + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + auto value = std::get(literal.value()); + // Convert microseconds-since-epoch into a `year_month_day` object + auto ymd = year_month_day(floor(sys_time(microseconds{value}))); + auto epoch_ymd = year_month_day(year{1970} / January / 1); + auto delta = ymd.year() - epoch_ymd.year(); + // Calculate the month as months from 1970-01 + // Note: January is month 1, so we subtract 1 to get zero-based + // month count. + return Literal::Int(static_cast(delta.count() * 12 + + static_cast(ymd.month()) - 1)); + } + default: + std::unreachable(); + } } +std::shared_ptr MonthTransform::ResultType() const { return int32(); } + Result> MonthTransform::Make( std::shared_ptr const& source_type) { if (!source_type) { @@ -176,11 +319,38 @@ Result> MonthTransform::Make( DayTransform::DayTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kDay, source_type) {} -Result DayTransform::Transform(const ArrowArray& input) { - return NotImplemented("DayTransform::Transform"); +Result DayTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply day transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + if (literal.IsNull()) [[unlikely]] { + return Literal::Null(int32()); + } + + using namespace std::chrono; // NOLINT + switch (source_type()->type_id()) { + case TypeId::kDate: { + return Literal::Int(std::get(literal.value())); + } + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + auto value = std::get(literal.value()); + // Convert microseconds to `sys_days` (chronological days since epoch) + auto timestamp = sys_time(microseconds{value}); + auto days_since_epoch = floor(timestamp); + + return Literal::Int( + static_cast(days_since_epoch.time_since_epoch().count())); + } + default: + std::unreachable(); + } } -Result> DayTransform::ResultType() const { return iceberg::date(); } +std::shared_ptr DayTransform::ResultType() const { return int32(); } Result> DayTransform::Make( std::shared_ptr const& source_type) { @@ -202,14 +372,38 @@ Result> DayTransform::Make( HourTransform::HourTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kHour, source_type) {} -Result HourTransform::Transform(const ArrowArray& input) { - return NotImplemented("HourTransform::Transform"); -} +Result HourTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply hour transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + + if (literal.IsNull()) [[unlikely]] { + return Literal::Null(int32()); + } + + using namespace std::chrono; // NOLINT + switch (source_type()->type_id()) { + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + auto value = std::get(literal.value()); + // Create a `sys_time` object from the microseconds value + auto timestamp = sys_time(microseconds{value}); + + // Convert the time since epoch directly into hours + auto hours_since_epoch = duration_cast(timestamp.time_since_epoch()).count(); -Result> HourTransform::ResultType() const { - return iceberg::int32(); + return Literal::Int(static_cast(hours_since_epoch)); + } + default: + std::unreachable(); + } } +std::shared_ptr HourTransform::ResultType() const { return int32(); } + Result> HourTransform::Make( std::shared_ptr const& source_type) { if (!source_type) { @@ -229,11 +423,11 @@ Result> HourTransform::Make( VoidTransform::VoidTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kVoid, source_type) {} -Result VoidTransform::Transform(const ArrowArray& input) { - return NotImplemented("VoidTransform::Transform"); +Result VoidTransform::Transform(const Literal& literal) { + return literal.IsNull() ? literal : Literal::Null(literal.type()); } -Result> VoidTransform::ResultType() const { return source_type(); } +std::shared_ptr VoidTransform::ResultType() const { return source_type(); } Result> VoidTransform::Make( std::shared_ptr const& source_type) { diff --git a/src/iceberg/transform_function.h b/src/iceberg/transform_function.h index 7fffd61f0..6d810640a 100644 --- a/src/iceberg/transform_function.h +++ b/src/iceberg/transform_function.h @@ -30,11 +30,11 @@ class IdentityTransform : public TransformFunction { /// \param source_type Type of the input data. explicit IdentityTransform(std::shared_ptr const& source_type); - /// \brief Returns the input array without modification. - Result Transform(const ArrowArray& input) override; + /// \brief Returns the same Literal as the input. + Result Transform(const Literal& literal) override; - /// \brief Returns the same type as the source type if it is valid. - Result> ResultType() const override; + /// \brief Returns the same type as source_type. + std::shared_ptr ResultType() const override; /// \brief Create an IdentityTransform. /// \param source_type Type of the input data. @@ -50,11 +50,11 @@ class BucketTransform : public TransformFunction { /// \param num_buckets Number of buckets to hash into. BucketTransform(std::shared_ptr const& source_type, int32_t num_buckets); - /// \brief Applies the bucket hash function to the input array. - Result Transform(const ArrowArray& input) override; + /// \brief Applies the bucket hash function to the input Literal. + Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. - Result> ResultType() const override; + std::shared_ptr ResultType() const override; /// \brief Create a BucketTransform. /// \param source_type Type of the input data. @@ -74,11 +74,11 @@ class TruncateTransform : public TransformFunction { /// \param width The width to truncate to (e.g., for strings or numbers). TruncateTransform(std::shared_ptr const& source_type, int32_t width); - /// \brief Truncates values in the input array to the specified width. - Result Transform(const ArrowArray& input) override; + /// \brief Truncates the input Literal to the specified width. + Result Transform(const Literal& literal) override; /// \brief Returns the same type as source_type. - Result> ResultType() const override; + std::shared_ptr ResultType() const override; /// \brief Create a TruncateTransform. /// \param source_type Type of the input data. @@ -97,11 +97,11 @@ class YearTransform : public TransformFunction { /// \param source_type Must be a timestamp type. explicit YearTransform(std::shared_ptr const& source_type); - /// \brief Extracts the year from each timestamp in the input array. - Result Transform(const ArrowArray& input) override; + /// \brief Extract a date or timestamp year, as years from 1970. + Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. - Result> ResultType() const override; + std::shared_ptr ResultType() const override; /// \brief Create a YearTransform. /// \param source_type Type of the input data. @@ -116,11 +116,11 @@ class MonthTransform : public TransformFunction { /// \param source_type Must be a timestamp type. explicit MonthTransform(std::shared_ptr const& source_type); - /// \brief Extracts the month (1-12) from each timestamp in the input array. - Result Transform(const ArrowArray& input) override; + /// \brief Extract a date or timestamp month, as months from 1970-01-01. + Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. - Result> ResultType() const override; + std::shared_ptr ResultType() const override; /// \brief Create a MonthTransform. /// \param source_type Type of the input data. @@ -135,11 +135,11 @@ class DayTransform : public TransformFunction { /// \param source_type Must be a timestamp type. explicit DayTransform(std::shared_ptr const& source_type); - /// \brief Extracts the day (1-31) from each timestamp in the input array. - Result Transform(const ArrowArray& input) override; + /// \brief Extract a date or timestamp day, as days from 1970-01-01. + Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. - Result> ResultType() const override; + std::shared_ptr ResultType() const override; /// \brief Create a DayTransform. /// \param source_type Type of the input data. @@ -154,11 +154,11 @@ class HourTransform : public TransformFunction { /// \param source_type Must be a timestamp type. explicit HourTransform(std::shared_ptr const& source_type); - /// \brief Extracts the hour (0-23) from each timestamp in the input array. - Result Transform(const ArrowArray& input) override; + /// \brief Extract a timestamp hour, as hours from 1970-01-01 00:00:00. + Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. - Result> ResultType() const override; + std::shared_ptr ResultType() const override; /// \brief Create a HourTransform. /// \param source_type Type of the input data. @@ -173,11 +173,11 @@ class VoidTransform : public TransformFunction { /// \param source_type Input type (ignored). explicit VoidTransform(std::shared_ptr const& source_type); - /// \brief Returns an all-null array of the same length as the input. - Result Transform(const ArrowArray& input) override; + /// \brief Returns a null literal. + Result Transform(const Literal& literal) override; - /// \brief Returns null type or a sentinel type indicating void. - Result> ResultType() const override; + /// \brief Returns the same type as source_type. + std::shared_ptr ResultType() const override; /// \brief Create a VoidTransform. /// \param source_type Input type (ignored). diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 83574eec0..0135422c1 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -120,6 +120,14 @@ class FileScanTask; class TableScan; class TableScanBuilder; +struct DataFile; +struct ManifestEntry; +struct ManifestFile; +struct ManifestList; + +class ManifestReader; +class ManifestListReader; + /// ---------------------------------------------------------------------------- /// TODO: Forward declarations below are not added yet. /// ---------------------------------------------------------------------------- @@ -131,12 +139,4 @@ class UpdateRequirement; class AppendFiles; -struct DataFile; -struct ManifestEntry; -struct ManifestFile; -struct ManifestList; - -class ManifestReader; -class ManifestListReader; - } // namespace iceberg diff --git a/src/iceberg/util/string_utils.h b/src/iceberg/util/string_utils.h index 9ff250b66..558fc293c 100644 --- a/src/iceberg/util/string_utils.h +++ b/src/iceberg/util/string_utils.h @@ -27,7 +27,7 @@ namespace iceberg { -ICEBERG_EXPORT class StringUtils { +class ICEBERG_EXPORT StringUtils { public: static std::string ToLower(std::string_view str) { std::string input(str); diff --git a/src/iceberg/util/truncate_utils.h b/src/iceberg/util/truncate_utils.h new file mode 100644 index 000000000..5e76135c5 --- /dev/null +++ b/src/iceberg/util/truncate_utils.h @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include + +#include "iceberg/iceberg_export.h" + +namespace iceberg { + +class ICEBERG_EXPORT TruncateUtils { + public: + /// \brief Truncate a UTF-8 string to a specified number of code points. + /// + /// \param source The input string to truncate. + /// \param L The maximum number of code points allowed in the output string. + /// \return A valid UTF-8 string truncated to L code points. + /// If the input string is already valid and has fewer than L code points, it is + /// returned unchanged. + static std::string TruncateUTF8(std::string source, size_t L) { + size_t code_point_count = 0; + size_t safe_point = 0; + + for (size_t i = 0; i < source.size(); ++i) { + // Start of a new UTF-8 code point + if ((source[i] & 0xC0) != 0x80) { + code_point_count++; + if (code_point_count > static_cast(L)) { + safe_point = i; + break; + } + } + } + + if (safe_point != 0) { + // Resize the string to the safe point + source.resize(safe_point); + } + + return std::move(source); + } + + /// \brief Truncate an integer v, either int32_t or int64_t, to v - (v % W). + /// + /// The remainder, v % W, must be positive. For languages where % can produce negative + /// values, the correct truncate function is: v - (((v % W) + W) % W) + template + requires std::is_same_v || std::is_same_v + static inline T TruncateInteger(T v, size_t W) { + return v - (((v % W) + W) % W); + } +}; + +} // namespace iceberg diff --git a/test/transform_test.cc b/test/transform_test.cc index 33149d14d..c1efcb56a 100644 --- a/test/transform_test.cc +++ b/test/transform_test.cc @@ -25,6 +25,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "matchers.h" @@ -40,12 +41,6 @@ TEST(TransformTest, Transform) { auto source_type = iceberg::string(); auto identity_transform = transform->Bind(source_type); ASSERT_TRUE(identity_transform); - - ArrowArray arrow_array; - auto result = identity_transform.value()->Transform(arrow_array); - ASSERT_FALSE(result); - EXPECT_EQ(ErrorKind::kNotImplemented, result.error().kind); - EXPECT_EQ("IdentityTransform::Transform", result.error().message); } TEST(TransformFunctionTest, CreateBucketTransform) { @@ -136,7 +131,7 @@ TEST(TransformResultTypeTest, PositiveCases) { .expected_result_type = iceberg::int32()}, {.str = "day", .source_type = iceberg::timestamp(), - .expected_result_type = iceberg::date()}, + .expected_result_type = iceberg::int32()}, {.str = "hour", .source_type = iceberg::timestamp(), .expected_result_type = iceberg::int32()}, @@ -160,8 +155,7 @@ TEST(TransformResultTypeTest, PositiveCases) { ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind: " << c.str; auto result_type = transformPtr.value()->ResultType(); - ASSERT_TRUE(result_type.has_value()) << "Failed to get result type for: " << c.str; - EXPECT_EQ(result_type.value()->type_id(), c.expected_result_type->type_id()) + EXPECT_EQ(result_type->type_id(), c.expected_result_type->type_id()) << "Unexpected result type for: " << c.str; } } @@ -193,4 +187,373 @@ TEST(TransformResultTypeTest, NegativeCases) { } } +TEST(TransformLiteralTest, IdentityTransform) { + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::boolean(), + .source = Literal::Boolean(true), + .expected = Literal::Boolean(true)}, + {.source_type = iceberg::int32(), + .source = Literal::Int(42), + .expected = Literal::Int(42)}, + {.source_type = iceberg::int32(), + .source = Literal::Date(30000), + .expected = Literal::Date(30000)}, + {.source_type = iceberg::int64(), + .source = Literal::Long(1234567890), + .expected = Literal::Long(1234567890)}, + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Timestamp(1622547800000000)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::TimestampTz(1622547800000000)}, + {.source_type = iceberg::float32(), + .source = Literal::Float(3.14), + .expected = Literal::Float(3.14)}, + {.source_type = iceberg::float64(), + .source = Literal::Double(1.23e-5), + .expected = Literal::Double(1.23e-5)}, + {.source_type = iceberg::string(), + .source = Literal::String("Hello, World!"), + .expected = Literal::String("Hello, World!")}, + {.source_type = iceberg::binary(), + .source = Literal::Binary({0x01, 0x02, 0x03}), + .expected = Literal::Binary({0x01, 0x02, 0x03})}, + }; + + for (const auto& c : cases) { + auto transform = Transform::Identity(); + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind identity transform"; + + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformLiteralTest, BucketTransform) { + constexpr int32_t num_buckets = 4; + auto transform = Transform::Bucket(num_buckets); + + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::int32(), + .source = Literal::Int(42), + .expected = Literal::Int(3)}, + {.source_type = iceberg::date(), + .source = Literal::Date(30000), + .expected = Literal::Int(2)}, + {.source_type = iceberg::int64(), + .source = Literal::Long(1234567890), + .expected = Literal::Int(3)}, + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Int(1)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::Int(1)}, + {.source_type = iceberg::string(), + .source = Literal::String("test"), + .expected = Literal::Int(3)}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind bucket transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformLiteralTest, TruncateTransform) { + struct Case { + std::shared_ptr source_type; + int32_t width; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::int32(), + .width = 5, + .source = Literal::Int(123456), + .expected = Literal::Int(123455)}, + {.source_type = iceberg::string(), + .width = 5, + .source = Literal::String("Hello, World!"), + .expected = Literal::String("Hello")}, + {.source_type = iceberg::string(), + .width = 5, + .source = Literal::String("😜🧐🤔🤪🥳😵‍💫😂"), + // Truncate to 5 utf-8 code points + .expected = Literal::String("😜🧐🤔🤪🥳")}, + {.source_type = iceberg::string(), + .width = 8, + .source = Literal::String("a😜b🧐c🤔d🤪e🥳"), + .expected = Literal::String("a😜b🧐c🤔d🤪")}, + {.source_type = iceberg::binary(), + .width = 5, + .source = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05, 0x06}), + .expected = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05})}, + }; + + for (const auto& c : cases) { + auto transform = Transform::Truncate(c.width); + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind truncate transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformLiteralTest, YearTransform) { + auto transform = Transform::Year(); + + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::timestamp(), + // 2021-06-01T11:43:20Z + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Int(2021)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::Int(2021)}, + {.source_type = iceberg::date(), + .source = Literal::Date(30000), + .expected = Literal::Int(2052)}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind year transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformLiteralTest, MonthTransform) { + auto transform = Transform::Month(); + + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Int(617)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::Int(617)}, + {.source_type = iceberg::date(), + .source = Literal::Date(30000), + .expected = Literal::Int(985)}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind month transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformFunctionTransformTest, DayTransform) { + auto transform = Transform::Day(); + + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Int(18779)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::Int(18779)}, + {.source_type = iceberg::date(), + .source = Literal::Date(30000), + .expected = Literal::Int(30000)}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind day transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformLiteralTest, HourTransform) { + auto transform = Transform::Hour(); + + struct Case { + std::shared_ptr source_type; + Literal source; + Literal expected; + }; + + const std::vector cases = { + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Int(450707)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::Int(450707)}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind hour transform"; + auto result = transformPtr.value()->Transform(c.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << c.source.ToString(); + + EXPECT_EQ(result.value(), c.expected) + << "Unexpected result for source: " << c.source.ToString(); + } +} + +TEST(TransformLiteralTest, VoidTransform) { + auto transform = Transform::Void(); + + struct Case { + std::shared_ptr source_type; + Literal source; + }; + + const std::vector cases = { + {.source_type = iceberg::boolean(), .source = Literal::Boolean(true)}, + {.source_type = iceberg::int32(), .source = Literal::Int(42)}, + {.source_type = iceberg::date(), .source = Literal::Date(30000)}, + {.source_type = iceberg::int64(), .source = Literal::Long(1234567890)}, + {.source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000)}, + {.source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000)}, + {.source_type = iceberg::float32(), .source = Literal::Float(3.14)}, + {.source_type = iceberg::float64(), .source = Literal::Double(1.23e-5)}, + {.source_type = iceberg::string(), .source = Literal::String("Hello, World!")}, + {.source_type = iceberg::binary(), .source = Literal::Binary({0x01, 0x02, 0x03})}, + }; + + for (const auto& c : cases) { + auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind void transform"; + auto result = transformPtr.value()->Transform(c.source); + EXPECT_TRUE(result->IsNull()) + << "Expected void transform to return null type for source: " + << c.source.ToString(); + EXPECT_EQ(result->type()->type_id(), c.source_type->type_id()) + << "Expected void transform to return same type as source for: " + << c.source.ToString(); + } +} + +TEST(TransformLiteralTest, NullLiteral) { + struct Case { + std::string str; + std::shared_ptr source_type; + Literal source; + std::shared_ptr expected_result_type; + }; + + const std::vector cases = { + {.str = "identity", + .source_type = iceberg::string(), + .source = Literal::Null(iceberg::string()), + .expected_result_type = iceberg::string()}, + {.str = "year", + .source_type = iceberg::timestamp(), + .source = Literal::Null(iceberg::timestamp()), + .expected_result_type = iceberg::int32()}, + {.str = "month", + .source_type = iceberg::timestamp(), + .source = Literal::Null(iceberg::timestamp()), + .expected_result_type = iceberg::int32()}, + {.str = "day", + .source_type = iceberg::timestamp(), + .source = Literal::Null(iceberg::timestamp()), + .expected_result_type = iceberg::int32()}, + {.str = "hour", + .source_type = iceberg::timestamp(), + .source = Literal::Null(iceberg::timestamp()), + .expected_result_type = iceberg::int32()}, + {.str = "void", + .source_type = iceberg::string(), + .source = Literal::Null(iceberg::string()), + .expected_result_type = iceberg::string()}, + {.str = "bucket[16]", + .source_type = iceberg::string(), + .source = Literal::Null(iceberg::string()), + .expected_result_type = iceberg::int32()}, + {.str = "truncate[32]", + .source_type = iceberg::string(), + .source = Literal::Null(iceberg::string()), + .expected_result_type = iceberg::string()}, + }; + + for (const auto& c : cases) { + auto result = TransformFromString(c.str); + ASSERT_TRUE(result.has_value()) << "Failed to parse: " << c.str; + + const auto& transform = result.value(); + const auto transformPtr = transform->Bind(c.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind: " << c.str; + + auto transform_result = transformPtr.value()->Transform(c.source); + EXPECT_TRUE(transform_result->IsNull()) + << "Expected void transform to return null type for source: " + << c.source.ToString(); + EXPECT_EQ(transform_result->type()->type_id(), c.expected_result_type->type_id()) + << "Expected void transform to return same type as source for: " + << c.source.ToString(); + } +} + } // namespace iceberg