diff --git a/.github/workflows/sanitizer_test.yml b/.github/workflows/sanitizer_test.yml index 6c9c50eef..51fee8a27 100644 --- a/.github/workflows/sanitizer_test.yml +++ b/.github/workflows/sanitizer_test.yml @@ -50,9 +50,9 @@ jobs: - name: Run Tests working-directory: build env: - ASAN_OPTIONS: log_path=out.log:detect_leaks=1:symbolize=1:strict_string_checks=1:halt_on_error=0:detect_container_overflow=0 + ASAN_OPTIONS: log_path=out.log:detect_leaks=1:symbolize=1:strict_string_checks=1:halt_on_error=1:detect_container_overflow=0 LSAN_OPTIONS: suppressions=${{ github.workspace }}/.github/lsan-suppressions.txt - UBSAN_OPTIONS: log_path=out.log:halt_on_error=0:print_stacktrace=1:suppressions=${{ github.workspace }}/.github/ubsan-suppressions.txt + UBSAN_OPTIONS: log_path=out.log:halt_on_error=1:print_stacktrace=1:suppressions=${{ github.workspace }}/.github/ubsan-suppressions.txt run: | ctest --output-on-failure - name: Save the test output diff --git a/CMakeLists.txt b/CMakeLists.txt index 9c2a4fe50..f58df9bc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_COMPILE_WARNING_AS_ERROR ON) option(ICEBERG_BUILD_STATIC "Build static library" ON) option(ICEBERG_BUILD_SHARED "Build shared library" OFF) diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake index 3826ee95f..b840e6386 100644 --- a/cmake_modules/IcebergThirdpartyToolchain.cmake +++ b/cmake_modules/IcebergThirdpartyToolchain.cmake @@ -71,6 +71,9 @@ function(resolve_arrow_dependency) set(ARROW_FILESYSTEM ON CACHE BOOL "" FORCE) + set(ARROW_JSON + ON + CACHE BOOL "" FORCE) set(ARROW_PARQUET ON CACHE BOOL "" FORCE) @@ -95,8 +98,8 @@ function(resolve_arrow_dependency) fetchcontent_declare(VendoredArrow ${FC_DECLARE_COMMON_OPTIONS} - GIT_REPOSITORY https://github.com/wgtmac/arrow.git - GIT_TAG 7d50c4ac803ad983734de5f418b7cd18f25b0dc9 + GIT_REPOSITORY https://github.com/apache/arrow.git + GIT_TAG 5f0aeb5de53fb25b59a52661a80071faef99a4a4 #URL ${ARROW_SOURCE_URL} #URL_HASH "SHA256=${ICEBERG_ARROW_BUILD_SHA256_CHECKSUM}" SOURCE_SUBDIR diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 4c660d7d4..a3a6cf566 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -22,6 +22,7 @@ set(ICEBERG_SOURCES catalog/in_memory_catalog.cc demo.cc expression/expression.cc + expression/literal.cc file_reader.cc json_internal.cc manifest_entry.cc @@ -38,6 +39,7 @@ set(ICEBERG_SOURCES sort_field.cc sort_order.cc statistics_file.cc + table.cc table_metadata.cc transform.cc transform_function.cc diff --git a/src/iceberg/avro/avro_data_util.cc b/src/iceberg/avro/avro_data_util.cc index 48ac7e677..e3b4b0638 100644 --- a/src/iceberg/avro/avro_data_util.cc +++ b/src/iceberg/avro/avro_data_util.cc @@ -17,16 +17,439 @@ * under the License. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iceberg/arrow/arrow_error_transform_internal.h" #include "iceberg/avro/avro_data_util_internal.h" +#include "iceberg/avro/avro_schema_util_internal.h" +#include "iceberg/schema.h" +#include "iceberg/schema_util.h" +#include "iceberg/util/checked_cast.h" +#include "iceberg/util/macros.h" namespace iceberg::avro { +using ::iceberg::arrow::ToErrorKind; + +namespace { + +/// \brief Forward declaration for mutual recursion. +Status AppendFieldToBuilder(const ::avro::NodePtr& avro_node, + const ::avro::GenericDatum& avro_datum, + const FieldProjection& projection, + const SchemaField& projected_field, + ::arrow::ArrayBuilder* array_builder); + +/// \brief Append Avro record data to Arrow struct builder. +Status AppendStructToBuilder(const ::avro::NodePtr& avro_node, + const ::avro::GenericDatum& avro_datum, + const std::span& projections, + const StructType& struct_type, + ::arrow::ArrayBuilder* array_builder) { + if (avro_node->type() != ::avro::AVRO_RECORD) { + return InvalidArgument("Expected Avro record, got type: {}", ToString(avro_node)); + } + const auto& avro_record = avro_datum.value<::avro::GenericRecord>(); + + auto* struct_builder = internal::checked_cast<::arrow::StructBuilder*>(array_builder); + ICEBERG_ARROW_RETURN_NOT_OK(struct_builder->Append()); + + for (size_t i = 0; i < projections.size(); ++i) { + const auto& field_projection = projections[i]; + const auto& expected_field = struct_type.fields()[i]; + auto* field_builder = struct_builder->field_builder(static_cast(i)); + + if (field_projection.kind == FieldProjection::Kind::kProjected) { + size_t avro_field_index = std::get(field_projection.from); + if (avro_field_index >= avro_record.fieldCount()) { + return InvalidArgument("Avro field index {} out of bound {}", avro_field_index, + avro_record.fieldCount()); + } + + const auto& avro_field_node = avro_node->leafAt(avro_field_index); + const auto& avro_field_datum = avro_record.fieldAt(avro_field_index); + ICEBERG_RETURN_UNEXPECTED(AppendFieldToBuilder(avro_field_node, avro_field_datum, + field_projection, expected_field, + field_builder)); + } else if (field_projection.kind == FieldProjection::Kind::kNull) { + ICEBERG_ARROW_RETURN_NOT_OK(field_builder->AppendNull()); + } else { + return NotImplemented("Unsupported field projection kind: {}", + ToString(field_projection.kind)); + } + } + return {}; +} + +/// \brief Append Avro array data to Arrow list builder. +Status AppendListToBuilder(const ::avro::NodePtr& avro_node, + const ::avro::GenericDatum& avro_datum, + const FieldProjection& element_projection, + const ListType& list_type, + ::arrow::ArrayBuilder* array_builder) { + if (avro_node->type() != ::avro::AVRO_ARRAY) { + return InvalidArgument("Expected Avro array, got type: {}", ToString(avro_node)); + } + const auto& avro_array = avro_datum.value<::avro::GenericArray>(); + + auto* list_builder = internal::checked_cast<::arrow::ListBuilder*>(array_builder); + ICEBERG_ARROW_RETURN_NOT_OK(list_builder->Append()); + + auto* value_builder = list_builder->value_builder(); + const auto& element_node = avro_node->leafAt(0); + const auto& element_field = list_type.fields().back(); + + for (const auto& element : avro_array.value()) { + ICEBERG_RETURN_UNEXPECTED(AppendFieldToBuilder( + element_node, element, element_projection, element_field, value_builder)); + } + return {}; +} + +/// \brief Append Avro map data to Arrow map builder. +Status AppendMapToBuilder(const ::avro::NodePtr& avro_node, + const ::avro::GenericDatum& avro_datum, + const FieldProjection& key_projection, + const FieldProjection& value_projection, + const MapType& map_type, ::arrow::ArrayBuilder* array_builder) { + auto* map_builder = internal::checked_cast<::arrow::MapBuilder*>(array_builder); + + if (avro_node->type() == ::avro::AVRO_MAP) { + // Handle regular Avro map: map + const auto& avro_map = avro_datum.value<::avro::GenericMap>(); + const auto& map_entries = avro_map.value(); + + const auto& key_node = avro_node->leafAt(0); + const auto& value_node = avro_node->leafAt(1); + + const auto& key_field = map_type.key(); + const auto& value_field = map_type.value(); + + ICEBERG_ARROW_RETURN_NOT_OK(map_builder->Append()); + auto* key_builder = map_builder->key_builder(); + auto* item_builder = map_builder->item_builder(); + + for (const auto& entry : map_entries) { + ICEBERG_RETURN_UNEXPECTED(AppendFieldToBuilder( + key_node, entry.first, key_projection, key_field, key_builder)); + ICEBERG_RETURN_UNEXPECTED(AppendFieldToBuilder( + value_node, entry.second, value_projection, value_field, item_builder)); + } + + return {}; + } else if (avro_node->type() == ::avro::AVRO_ARRAY && HasMapLogicalType(avro_node)) { + // Handle array-based map: list> + const auto& avro_array = avro_datum.value<::avro::GenericArray>(); + const auto& array_entries = avro_array.value(); + + const auto& key_field = map_type.key(); + const auto& value_field = map_type.value(); + + ICEBERG_ARROW_RETURN_NOT_OK(map_builder->Append()); + auto* key_builder = map_builder->key_builder(); + auto* item_builder = map_builder->item_builder(); + + const auto& record_node = avro_node->leafAt(0); + if (record_node->type() != ::avro::AVRO_RECORD || record_node->leaves() != 2) { + return InvalidArgument( + "Array-based map must contain records with exactly 2 fields, got: {}", + ToString(record_node)); + } + const auto& key_node = record_node->leafAt(0); + const auto& value_node = record_node->leafAt(1); + + for (const auto& entry : array_entries) { + const auto& record = entry.value<::avro::GenericRecord>(); + ICEBERG_RETURN_UNEXPECTED(AppendFieldToBuilder( + key_node, record.fieldAt(0), key_projection, key_field, key_builder)); + ICEBERG_RETURN_UNEXPECTED(AppendFieldToBuilder( + value_node, record.fieldAt(1), value_projection, value_field, item_builder)); + } + + return {}; + } else { + return InvalidArgument("Expected Avro map or array with map logical type, got: {}", + ToString(avro_node)); + } +} + +/// \brief Append nested Avro data to Arrow array builder based on type. +Status AppendNestedValueToBuilder(const ::avro::NodePtr& avro_node, + const ::avro::GenericDatum& avro_datum, + const std::span& projections, + const NestedType& projected_type, + ::arrow::ArrayBuilder* array_builder) { + switch (projected_type.type_id()) { + case TypeId::kStruct: { + const auto& struct_type = internal::checked_cast(projected_type); + return AppendStructToBuilder(avro_node, avro_datum, projections, struct_type, + array_builder); + } + + case TypeId::kList: { + if (projections.size() != 1) { + return InvalidArgument("Expected 1 projection for list, got: {}", + projections.size()); + } + const auto& list_type = internal::checked_cast(projected_type); + return AppendListToBuilder(avro_node, avro_datum, projections[0], list_type, + array_builder); + } + + case TypeId::kMap: { + if (projections.size() != 2) { + return InvalidArgument("Expected 2 projections for map, got: {}", + projections.size()); + } + const auto& map_type = internal::checked_cast(projected_type); + return AppendMapToBuilder(avro_node, avro_datum, projections[0], projections[1], + map_type, array_builder); + } + + default: + return InvalidArgument("Unsupported nested type: {}", projected_type.ToString()); + } +} + +Status AppendPrimitiveValueToBuilder(const ::avro::NodePtr& avro_node, + const ::avro::GenericDatum& avro_datum, + const SchemaField& projected_field, + ::arrow::ArrayBuilder* array_builder) { + const auto& projected_type = *projected_field.type(); + if (!projected_type.is_primitive()) { + return InvalidArgument("Expected primitive type, got: {}", projected_type.ToString()); + } + + switch (projected_type.type_id()) { + case TypeId::kBoolean: { + if (avro_node->type() != ::avro::AVRO_BOOL) { + return InvalidArgument("Expected Avro boolean for boolean field, got: {}", + ToString(avro_node)); + } + auto* builder = internal::checked_cast<::arrow::BooleanBuilder*>(array_builder); + ICEBERG_ARROW_RETURN_NOT_OK(builder->Append(avro_datum.value())); + return {}; + } + + case TypeId::kInt: { + if (avro_node->type() != ::avro::AVRO_INT) { + return InvalidArgument("Expected Avro int for int field, got: {}", + ToString(avro_node)); + } + auto* builder = internal::checked_cast<::arrow::Int32Builder*>(array_builder); + ICEBERG_ARROW_RETURN_NOT_OK(builder->Append(avro_datum.value())); + return {}; + } + + case TypeId::kLong: { + auto* builder = internal::checked_cast<::arrow::Int64Builder*>(array_builder); + if (avro_node->type() == ::avro::AVRO_LONG) { + ICEBERG_ARROW_RETURN_NOT_OK(builder->Append(avro_datum.value())); + } else if (avro_node->type() == ::avro::AVRO_INT) { + ICEBERG_ARROW_RETURN_NOT_OK( + builder->Append(static_cast(avro_datum.value()))); + } else { + return InvalidArgument("Expected Avro int/long for long field, got: {}", + ToString(avro_node)); + } + return {}; + } + + case TypeId::kFloat: { + if (avro_node->type() != ::avro::AVRO_FLOAT) { + return InvalidArgument("Expected Avro float for float field, got: {}", + ToString(avro_node)); + } + auto* builder = internal::checked_cast<::arrow::FloatBuilder*>(array_builder); + ICEBERG_ARROW_RETURN_NOT_OK(builder->Append(avro_datum.value())); + return {}; + } + + case TypeId::kDouble: { + auto* builder = internal::checked_cast<::arrow::DoubleBuilder*>(array_builder); + if (avro_node->type() == ::avro::AVRO_DOUBLE) { + ICEBERG_ARROW_RETURN_NOT_OK(builder->Append(avro_datum.value())); + } else if (avro_node->type() == ::avro::AVRO_FLOAT) { + ICEBERG_ARROW_RETURN_NOT_OK( + builder->Append(static_cast(avro_datum.value()))); + } else { + return InvalidArgument("Expected Avro float/double for double field, got: {}", + ToString(avro_node)); + } + return {}; + } + + case TypeId::kString: { + if (avro_node->type() != ::avro::AVRO_STRING) { + return InvalidArgument("Expected Avro string for string field, got: {}", + ToString(avro_node)); + } + auto* builder = internal::checked_cast<::arrow::StringBuilder*>(array_builder); + ICEBERG_ARROW_RETURN_NOT_OK(builder->Append(avro_datum.value())); + return {}; + } + + case TypeId::kBinary: { + if (avro_node->type() != ::avro::AVRO_BYTES) { + return InvalidArgument("Expected Avro bytes for binary field, got: {}", + ToString(avro_node)); + } + auto* builder = internal::checked_cast<::arrow::BinaryBuilder*>(array_builder); + const auto& bytes = avro_datum.value>(); + ICEBERG_ARROW_RETURN_NOT_OK( + builder->Append(bytes.data(), static_cast(bytes.size()))); + return {}; + } + + case TypeId::kFixed: { + if (avro_node->type() != ::avro::AVRO_FIXED) { + return InvalidArgument("Expected Avro fixed for fixed field, got: {}", + ToString(avro_node)); + } + const auto& fixed = avro_datum.value<::avro::GenericFixed>(); + const auto& fixed_type = internal::checked_cast(projected_type); + + if (static_cast(fixed.value().size()) != fixed_type.length()) { + return InvalidArgument("Expected Avro fixed[{}], got: {}", fixed_type.length(), + ToString(avro_node)); + } + + auto* builder = + internal::checked_cast<::arrow::FixedSizeBinaryBuilder*>(array_builder); + const auto& value = fixed.value(); + ICEBERG_ARROW_RETURN_NOT_OK( + builder->Append(reinterpret_cast(value.data()))); + return {}; + } + + case TypeId::kUuid: { + if (avro_node->type() != ::avro::AVRO_FIXED || + avro_node->logicalType().type() != ::avro::LogicalType::UUID) { + return InvalidArgument("Expected Avro fixed for uuid field, got: {}", + ToString(avro_node)); + } + + auto* builder = + internal::checked_cast<::arrow::FixedSizeBinaryBuilder*>(array_builder); + const auto& fixed = avro_datum.value<::avro::GenericFixed>(); + if (fixed.value().size() != 16) { + return InvalidArgument("Expected UUID fixed length 16, got: {}", + fixed.value().size()); + } + const auto& value = fixed.value(); + ICEBERG_ARROW_RETURN_NOT_OK( + builder->Append(reinterpret_cast(value.data()))); + return {}; + } + + case TypeId::kDecimal: { + if (avro_node->type() != ::avro::AVRO_FIXED || + avro_node->logicalType().type() != ::avro::LogicalType::DECIMAL) { + return InvalidArgument( + "Expected Avro fixed with decimal logical type for decimal field, got: {}", + ToString(avro_node)); + } + + const auto& fixed = avro_datum.value<::avro::GenericFixed>(); + const auto& value = fixed.value(); + ICEBERG_ARROW_ASSIGN_OR_RETURN( + auto decimal, ::arrow::Decimal128::FromBigEndian(value.data(), value.size())); + auto* builder = internal::checked_cast<::arrow::Decimal128Builder*>(array_builder); + ICEBERG_ARROW_RETURN_NOT_OK(builder->Append(decimal)); + return {}; + } + + case TypeId::kDate: { + if (avro_node->type() != ::avro::AVRO_INT || + avro_node->logicalType().type() != ::avro::LogicalType::DATE) { + return InvalidArgument( + "Expected Avro int with DATE logical type for date field, got: {}", + ToString(avro_node)); + } + auto* builder = internal::checked_cast<::arrow::Date32Builder*>(array_builder); + ICEBERG_ARROW_RETURN_NOT_OK(builder->Append(avro_datum.value())); + return {}; + } + + case TypeId::kTime: { + if (avro_node->type() != ::avro::AVRO_LONG || + avro_node->logicalType().type() != ::avro::LogicalType::TIME_MICROS) { + return InvalidArgument( + "Expected Avro long with TIME_MICROS for time field, got: {}", + ToString(avro_node)); + } + auto* builder = internal::checked_cast<::arrow::Time64Builder*>(array_builder); + ICEBERG_ARROW_RETURN_NOT_OK(builder->Append(avro_datum.value())); + return {}; + } + + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + if (avro_node->type() != ::avro::AVRO_LONG || + avro_node->logicalType().type() != ::avro::LogicalType::TIMESTAMP_MICROS) { + return InvalidArgument( + "Expected Avro long with TIMESTAMP_MICROS for timestamp field, got: {}", + ToString(avro_node)); + } + auto* builder = internal::checked_cast<::arrow::TimestampBuilder*>(array_builder); + ICEBERG_ARROW_RETURN_NOT_OK(builder->Append(avro_datum.value())); + return {}; + } + + default: + return InvalidArgument("Unsupported primitive type {} to append avro node {}", + projected_field.type()->ToString(), ToString(avro_node)); + } +} + +/// \brief Dispatch to appropriate handlers based on the projection kind. +Status AppendFieldToBuilder(const ::avro::NodePtr& avro_node, + const ::avro::GenericDatum& avro_datum, + const FieldProjection& projection, + const SchemaField& projected_field, + ::arrow::ArrayBuilder* array_builder) { + if (avro_node->type() == ::avro::AVRO_UNION) { + const auto& union_datum = avro_datum.value<::avro::GenericUnion>(); + size_t branch = union_datum.currentBranch(); + if (avro_node->leafAt(branch)->type() == ::avro::AVRO_NULL) { + ICEBERG_ARROW_RETURN_NOT_OK(array_builder->AppendNull()); + return {}; + } else { + return AppendFieldToBuilder(avro_node->leafAt(branch), union_datum.datum(), + projection, projected_field, array_builder); + } + } + + const auto& projected_type = *projected_field.type(); + if (projected_type.is_primitive()) { + return AppendPrimitiveValueToBuilder(avro_node, avro_datum, projected_field, + array_builder); + } else { + const auto& nested_type = internal::checked_cast(projected_type); + return AppendNestedValueToBuilder(avro_node, avro_datum, projection.children, + nested_type, array_builder); + } +} + +} // namespace + Status AppendDatumToBuilder(const ::avro::NodePtr& avro_node, const ::avro::GenericDatum& avro_datum, const SchemaProjection& projection, - const Schema& arrow_schema, + const Schema& projected_schema, ::arrow::ArrayBuilder* array_builder) { - return NotImplemented("AppendDatumToBuilder is not yet implemented"); + return AppendNestedValueToBuilder(avro_node, avro_datum, projection.fields, + projected_schema, array_builder); } } // namespace iceberg::avro diff --git a/src/iceberg/avro/avro_data_util_internal.h b/src/iceberg/avro/avro_data_util_internal.h index 4b96483e7..ad493688c 100644 --- a/src/iceberg/avro/avro_data_util_internal.h +++ b/src/iceberg/avro/avro_data_util_internal.h @@ -26,10 +26,21 @@ namespace iceberg::avro { +/// \brief Append an Avro datum to an Arrow array builder. +/// +/// This function handles schema evolution by using the provided projection to map +/// fields from the Avro data to the expected Arrow schema. +/// +/// \param avro_node The Avro schema node (must be a record at root level) +/// \param avro_datum The Avro data to append +/// \param projection Schema projection from `projected_schema` to `avro_node` +/// \param projected_schema The projected schema +/// \param array_builder The Arrow array builder to append to (must be a struct builder) +/// \return Status indicating success or failure Status AppendDatumToBuilder(const ::avro::NodePtr& avro_node, const ::avro::GenericDatum& avro_datum, const SchemaProjection& projection, - const Schema& arrow_schema, + const Schema& projected_schema, ::arrow::ArrayBuilder* array_builder); } // namespace iceberg::avro diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index 229c62b48..905d9802f 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -73,6 +73,22 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) { } // namespace +std::string ToString(const ::avro::NodePtr& node) { + std::stringstream ss; + ss << *node; + return ss.str(); +} + +std::string ToString(const ::avro::LogicalType& logical_type) { + std::stringstream ss; + logical_type.printJson(ss); + return ss.str(); +} + +std::string ToString(const ::avro::LogicalType::Type& logical_type) { + return ToString(::avro::LogicalType(logical_type)); +} + Status ToAvroNodeVisitor::Visit(const BooleanType& type, ::avro::NodePtr* node) { *node = std::make_shared<::avro::NodePrimitive>(::avro::AVRO_BOOL); return {}; @@ -383,33 +399,11 @@ Status HasIdVisitor::Visit(const ::avro::Schema& schema) { return Visit(schema.r namespace { -std::string ToString(const ::avro::NodePtr& node) { - std::stringstream ss; - ss << *node; - return ss.str(); -} - -std::string ToString(const ::avro::LogicalType& logical_type) { - std::stringstream ss; - logical_type.printJson(ss); - return ss.str(); -} - -std::string ToString(const ::avro::LogicalType::Type& logical_type) { - return ToString(::avro::LogicalType(logical_type)); -} - bool HasLogicalType(const ::avro::NodePtr& node, ::avro::LogicalType::Type expected_type) { return node->logicalType().type() == expected_type; } -bool HasMapLogicalType(const ::avro::NodePtr& node) { - return node->logicalType().type() == ::avro::LogicalType::CUSTOM && - node->logicalType().customLogicalType() != nullptr && - node->logicalType().customLogicalType()->name() == "map"; -} - std::optional GetAdjustToUtc(const ::avro::NodePtr& node) { if (node->customAttributes() == 0) { return std::nullopt; @@ -501,7 +495,7 @@ Status ValidateAvroSchemaEvolution(const Type& expected_type, case TypeId::kTimestamp: if (avro_node->type() == ::avro::AVRO_LONG && HasLogicalType(avro_node, ::avro::LogicalType::TIMESTAMP_MICROS) && - GetAdjustToUtc(avro_node).value_or("false") == "true") { + GetAdjustToUtc(avro_node).value_or("false") == "false") { return {}; } break; @@ -676,6 +670,10 @@ Result ProjectList(const ListType& list_type, ValidateAvroSchemaEvolution(*expected_element_field.type(), element_node)); } + // Set the element projection metadata but preserve its children + element_projection.kind = FieldProjection::Kind::kProjected; + element_projection.from = size_t{0}; + FieldProjection result; result.children.emplace_back(std::move(element_projection)); return result; @@ -771,6 +769,12 @@ Result ProjectNested(const Type& expected_type, } // namespace +bool HasMapLogicalType(const ::avro::NodePtr& node) { + return node->logicalType().type() == ::avro::LogicalType::CUSTOM && + node->logicalType().customLogicalType() != nullptr && + node->logicalType().customLogicalType()->name() == "map"; +} + Result Project(const Schema& expected_schema, const ::avro::NodePtr& avro_node, bool prune_source) { ICEBERG_ASSIGN_OR_RAISE( diff --git a/src/iceberg/avro/avro_schema_util_internal.h b/src/iceberg/avro/avro_schema_util_internal.h index 50ff9b239..07e949aef 100644 --- a/src/iceberg/avro/avro_schema_util_internal.h +++ b/src/iceberg/avro/avro_schema_util_internal.h @@ -135,4 +135,13 @@ class HasIdVisitor { Result Project(const Schema& expected_schema, const ::avro::NodePtr& avro_node, bool prune_source); +std::string ToString(const ::avro::NodePtr& node); +std::string ToString(const ::avro::LogicalType& logical_type); +std::string ToString(const ::avro::LogicalType::Type& logical_type); + +/// \brief Check if an Avro node has a map logical type. +/// \param node The Avro node to check. +/// \return True if the node has a map logical type, false otherwise. +bool HasMapLogicalType(const ::avro::NodePtr& node); + } // namespace iceberg::avro diff --git a/src/iceberg/catalog/in_memory_catalog.cc b/src/iceberg/catalog/in_memory_catalog.cc index 3e32ddc75..f2ad26685 100644 --- a/src/iceberg/catalog/in_memory_catalog.cc +++ b/src/iceberg/catalog/in_memory_catalog.cc @@ -21,18 +21,14 @@ #include #include // IWYU pragma: keep -#include -#include -#include #include "iceberg/exception.h" #include "iceberg/table.h" +#include "iceberg/table_metadata.h" #include "iceberg/util/macros.h" namespace iceberg { -namespace { - /// \brief A hierarchical namespace that manages namespaces and table metadata in-memory. /// /// Each InMemoryNamespace represents a namespace level and can contain properties, @@ -318,117 +314,56 @@ Result InMemoryNamespace::GetTableMetadataLocation( return it->second; } -} // namespace - -class ICEBERG_EXPORT InMemoryCatalogImpl { - public: - InMemoryCatalogImpl(std::string name, std::shared_ptr file_io, - std::string warehouse_location, - std::unordered_map properties); - - std::string_view name() const; - - Status CreateNamespace(const Namespace& ns, - const std::unordered_map& properties); - - Result> ListNamespaces(const Namespace& ns) const; - - Status DropNamespace(const Namespace& ns); - - Result NamespaceExists(const Namespace& ns) const; - - Result> GetNamespaceProperties( - const Namespace& ns) const; - - Status UpdateNamespaceProperties( - const Namespace& ns, const std::unordered_map& updates, - const std::unordered_set& removals); - - Result> ListTables(const Namespace& ns) const; - - Result> CreateTable( - const TableIdentifier& identifier, const Schema& schema, const PartitionSpec& spec, - const std::string& location, - const std::unordered_map& properties); - - Result> UpdateTable( - const TableIdentifier& identifier, - const std::vector>& requirements, - const std::vector>& updates); - - Result> StageCreateTable( - const TableIdentifier& identifier, const Schema& schema, const PartitionSpec& spec, - const std::string& location, - const std::unordered_map& properties); - - Result TableExists(const TableIdentifier& identifier) const; - - Status DropTable(const TableIdentifier& identifier, bool purge); - - Result> LoadTable(const TableIdentifier& identifier) const; - - Result> RegisterTable(const TableIdentifier& identifier, - const std::string& metadata_file_location); - - std::unique_ptr BuildTable(const TableIdentifier& identifier, - const Schema& schema) const; - - private: - std::string catalog_name_; - std::unordered_map properties_; - std::shared_ptr file_io_; - std::string warehouse_location_; - std::unique_ptr root_namespace_; - mutable std::recursive_mutex mutex_; -}; - -InMemoryCatalogImpl::InMemoryCatalogImpl( - std::string name, std::shared_ptr file_io, std::string warehouse_location, - std::unordered_map properties) +InMemoryCatalog::InMemoryCatalog( + std::string const& name, std::shared_ptr const& file_io, + std::string const& warehouse_location, + std::unordered_map const& properties) : catalog_name_(std::move(name)), properties_(std::move(properties)), file_io_(std::move(file_io)), warehouse_location_(std::move(warehouse_location)), root_namespace_(std::make_unique()) {} -std::string_view InMemoryCatalogImpl::name() const { return catalog_name_; } +InMemoryCatalog::~InMemoryCatalog() = default; + +std::string_view InMemoryCatalog::name() const { return catalog_name_; } -Status InMemoryCatalogImpl::CreateNamespace( +Status InMemoryCatalog::CreateNamespace( const Namespace& ns, const std::unordered_map& properties) { std::unique_lock lock(mutex_); return root_namespace_->CreateNamespace(ns, properties); } -Result> InMemoryCatalogImpl::ListNamespaces( +Result> +InMemoryCatalog::GetNamespaceProperties(const Namespace& ns) const { + std::unique_lock lock(mutex_); + return root_namespace_->GetProperties(ns); +} + +Result> InMemoryCatalog::ListNamespaces( const Namespace& ns) const { std::unique_lock lock(mutex_); return root_namespace_->ListNamespaces(ns); } -Status InMemoryCatalogImpl::DropNamespace(const Namespace& ns) { +Status InMemoryCatalog::DropNamespace(const Namespace& ns) { std::unique_lock lock(mutex_); return root_namespace_->DropNamespace(ns); } -Result InMemoryCatalogImpl::NamespaceExists(const Namespace& ns) const { +Result InMemoryCatalog::NamespaceExists(const Namespace& ns) const { std::unique_lock lock(mutex_); return root_namespace_->NamespaceExists(ns); } -Result> -InMemoryCatalogImpl::GetNamespaceProperties(const Namespace& ns) const { - std::unique_lock lock(mutex_); - return root_namespace_->GetProperties(ns); -} - -Status InMemoryCatalogImpl::UpdateNamespaceProperties( +Status InMemoryCatalog::UpdateNamespaceProperties( const Namespace& ns, const std::unordered_map& updates, const std::unordered_set& removals) { std::unique_lock lock(mutex_); return root_namespace_->UpdateNamespaceProperties(ns, updates, removals); } -Result> InMemoryCatalogImpl::ListTables( +Result> InMemoryCatalog::ListTables( const Namespace& ns) const { std::unique_lock lock(mutex_); const auto& table_names = root_namespace_->ListTables(ns); @@ -441,44 +376,58 @@ Result> InMemoryCatalogImpl::ListTables( return table_idents; } -Result> InMemoryCatalogImpl::CreateTable( +Result> InMemoryCatalog::CreateTable( const TableIdentifier& identifier, const Schema& schema, const PartitionSpec& spec, const std::string& location, const std::unordered_map& properties) { return NotImplemented("create table"); } -Result> InMemoryCatalogImpl::UpdateTable( +Result> InMemoryCatalog::UpdateTable( const TableIdentifier& identifier, const std::vector>& requirements, const std::vector>& updates) { return NotImplemented("update table"); } -Result> InMemoryCatalogImpl::StageCreateTable( +Result> InMemoryCatalog::StageCreateTable( const TableIdentifier& identifier, const Schema& schema, const PartitionSpec& spec, const std::string& location, const std::unordered_map& properties) { return NotImplemented("stage create table"); } -Result InMemoryCatalogImpl::TableExists(const TableIdentifier& identifier) const { +Result InMemoryCatalog::TableExists(const TableIdentifier& identifier) const { std::unique_lock lock(mutex_); return root_namespace_->TableExists(identifier); } -Status InMemoryCatalogImpl::DropTable(const TableIdentifier& identifier, bool purge) { +Status InMemoryCatalog::DropTable(const TableIdentifier& identifier, bool purge) { std::unique_lock lock(mutex_); // TODO(Guotao): Delete all metadata files if purge is true. return root_namespace_->UnregisterTable(identifier); } -Result> InMemoryCatalogImpl::LoadTable( +Result> InMemoryCatalog::LoadTable( const TableIdentifier& identifier) const { - return NotImplemented("load table"); + if (!file_io_) [[unlikely]] { + return NotSupported("file_io is not set for catalog {}", catalog_name_); + } + + std::unique_lock lock(mutex_); + auto metadata_location = root_namespace_->GetTableMetadataLocation(identifier); + ICEBERG_RETURN_UNEXPECTED(metadata_location); + + auto metadata = TableMetadataUtil::Read(*file_io_, metadata_location.value()); + ICEBERG_RETURN_UNEXPECTED(metadata); + + return std::make_shared( + identifier, std::move(metadata.value()), metadata_location.value(), file_io_, + std::static_pointer_cast( + std::const_pointer_cast(shared_from_this()))); } -Result> InMemoryCatalogImpl::RegisterTable( +Result> InMemoryCatalog::RegisterTable( const TableIdentifier& identifier, const std::string& metadata_file_location) { std::unique_lock lock(mutex_); if (!root_namespace_->NamespaceExists(identifier.ns)) { @@ -490,95 +439,6 @@ Result> InMemoryCatalogImpl::RegisterTable( return LoadTable(identifier); } -std::unique_ptr InMemoryCatalogImpl::BuildTable( - const TableIdentifier& identifier, const Schema& schema) const { - throw IcebergError("not implemented"); -} - -InMemoryCatalog::InMemoryCatalog( - std::string const& name, std::shared_ptr const& file_io, - std::string const& warehouse_location, - std::unordered_map const& properties) - : impl_(std::make_unique(name, file_io, warehouse_location, - properties)) {} - -InMemoryCatalog::~InMemoryCatalog() = default; - -std::string_view InMemoryCatalog::name() const { return impl_->name(); } - -Status InMemoryCatalog::CreateNamespace( - const Namespace& ns, const std::unordered_map& properties) { - return impl_->CreateNamespace(ns, properties); -} - -Result> -InMemoryCatalog::GetNamespaceProperties(const Namespace& ns) const { - return impl_->GetNamespaceProperties(ns); -} - -Result> InMemoryCatalog::ListNamespaces( - const Namespace& ns) const { - return impl_->ListNamespaces(ns); -} - -Status InMemoryCatalog::DropNamespace(const Namespace& ns) { - return impl_->DropNamespace(ns); -} - -Result InMemoryCatalog::NamespaceExists(const Namespace& ns) const { - return impl_->NamespaceExists(ns); -} - -Status InMemoryCatalog::UpdateNamespaceProperties( - const Namespace& ns, const std::unordered_map& updates, - const std::unordered_set& removals) { - return impl_->UpdateNamespaceProperties(ns, updates, removals); -} - -Result> InMemoryCatalog::ListTables( - const Namespace& ns) const { - return impl_->ListTables(ns); -} - -Result> InMemoryCatalog::CreateTable( - const TableIdentifier& identifier, const Schema& schema, const PartitionSpec& spec, - const std::string& location, - const std::unordered_map& properties) { - return impl_->CreateTable(identifier, schema, spec, location, properties); -} - -Result> InMemoryCatalog::UpdateTable( - const TableIdentifier& identifier, - const std::vector>& requirements, - const std::vector>& updates) { - return impl_->UpdateTable(identifier, requirements, updates); -} - -Result> InMemoryCatalog::StageCreateTable( - const TableIdentifier& identifier, const Schema& schema, const PartitionSpec& spec, - const std::string& location, - const std::unordered_map& properties) { - return impl_->StageCreateTable(identifier, schema, spec, location, properties); -} - -Result InMemoryCatalog::TableExists(const TableIdentifier& identifier) const { - return impl_->TableExists(identifier); -} - -Status InMemoryCatalog::DropTable(const TableIdentifier& identifier, bool purge) { - return impl_->DropTable(identifier, purge); -} - -Result> InMemoryCatalog::LoadTable( - const TableIdentifier& identifier) const { - return impl_->LoadTable(identifier); -} - -Result> InMemoryCatalog::RegisterTable( - const TableIdentifier& identifier, const std::string& metadata_file_location) { - return impl_->RegisterTable(identifier, metadata_file_location); -} - std::unique_ptr InMemoryCatalog::BuildTable( const TableIdentifier& identifier, const Schema& schema) const { throw IcebergError("not implemented"); diff --git a/src/iceberg/catalog/in_memory_catalog.h b/src/iceberg/catalog/in_memory_catalog.h index c8e24b5db..d32a61b5e 100644 --- a/src/iceberg/catalog/in_memory_catalog.h +++ b/src/iceberg/catalog/in_memory_catalog.h @@ -19,9 +19,12 @@ #pragma once +#include + #include "iceberg/catalog.h" namespace iceberg { + /** * @brief An in-memory implementation of the Iceberg Catalog interface. * @@ -32,7 +35,9 @@ namespace iceberg { * @note This class is **not** suitable for production use. * All data will be lost when the process exits. */ -class ICEBERG_EXPORT InMemoryCatalog : public Catalog { +class ICEBERG_EXPORT InMemoryCatalog + : public Catalog, + public std::enable_shared_from_this { public: InMemoryCatalog(std::string const& name, std::shared_ptr const& file_io, std::string const& warehouse_location, @@ -90,7 +95,12 @@ class ICEBERG_EXPORT InMemoryCatalog : public Catalog { const Schema& schema) const override; private: - std::unique_ptr impl_; + std::string catalog_name_; + std::unordered_map properties_; + std::shared_ptr file_io_; + std::string warehouse_location_; + std::unique_ptr root_namespace_; + mutable std::recursive_mutex mutex_; }; } // namespace iceberg diff --git a/src/iceberg/expression/expression.cc b/src/iceberg/expression/expression.cc index 77f341eb5..c6fa9406b 100644 --- a/src/iceberg/expression/expression.cc +++ b/src/iceberg/expression/expression.cc @@ -21,8 +21,6 @@ #include -#include "iceberg/result.h" - namespace iceberg { // True implementation @@ -31,7 +29,7 @@ const std::shared_ptr& True::Instance() { return instance; } -Result> True::Negate() const { return False::Instance(); } +std::shared_ptr True::Negate() const { return False::Instance(); } // False implementation const std::shared_ptr& False::Instance() { @@ -39,7 +37,7 @@ const std::shared_ptr& False::Instance() { return instance; } -Result> False::Negate() const { return True::Instance(); } +std::shared_ptr False::Negate() const { return True::Instance(); } // And implementation And::And(std::shared_ptr left, std::shared_ptr right) @@ -49,9 +47,11 @@ std::string And::ToString() const { return std::format("({} and {})", left_->ToString(), right_->ToString()); } -Result> And::Negate() const { - // TODO(yingcai-cy): Implement Or expression - return InvalidExpression("And negation not yet implemented"); +std::shared_ptr And::Negate() const { + // De Morgan's law: not(A and B) = (not A) or (not B) + auto left_negated = left_->Negate(); + auto right_negated = right_->Negate(); + return std::make_shared(left_negated, right_negated); } bool And::Equals(const Expression& expr) const { @@ -63,4 +63,28 @@ bool And::Equals(const Expression& expr) const { return false; } +// Or implementation +Or::Or(std::shared_ptr left, std::shared_ptr right) + : left_(std::move(left)), right_(std::move(right)) {} + +std::string Or::ToString() const { + return std::format("({} or {})", left_->ToString(), right_->ToString()); +} + +std::shared_ptr Or::Negate() const { + // De Morgan's law: not(A or B) = (not A) and (not B) + auto left_negated = left_->Negate(); + auto right_negated = right_->Negate(); + return std::make_shared(left_negated, right_negated); +} + +bool Or::Equals(const Expression& expr) const { + if (expr.op() == Operation::kOr) { + const auto& other = static_cast(expr); + return (left_->Equals(*other.left()) && right_->Equals(*other.right())) || + (left_->Equals(*other.right()) && right_->Equals(*other.left())); + } + return false; +} + } // namespace iceberg diff --git a/src/iceberg/expression/expression.h b/src/iceberg/expression/expression.h index 258c9ee2a..9ceae1c69 100644 --- a/src/iceberg/expression/expression.h +++ b/src/iceberg/expression/expression.h @@ -19,15 +19,14 @@ #pragma once -/// \file iceberg/expression.h +/// \file iceberg/expression/expression.h /// Expression interface for Iceberg table operations. #include #include -#include "iceberg/expected.h" +#include "iceberg/exception.h" #include "iceberg/iceberg_export.h" -#include "iceberg/result.h" namespace iceberg { @@ -67,8 +66,8 @@ class ICEBERG_EXPORT Expression { virtual Operation op() const = 0; /// \brief Returns the negation of this expression, equivalent to not(this). - virtual Result> Negate() const { - return InvalidExpression("Expression cannot be negated"); + virtual std::shared_ptr Negate() const { + throw IcebergError("Expression cannot be negated"); } /// \brief Returns whether this expression will accept the same values as another. @@ -94,7 +93,7 @@ class ICEBERG_EXPORT True : public Expression { std::string ToString() const override { return "true"; } - Result> Negate() const override; + std::shared_ptr Negate() const override; bool Equals(const Expression& other) const override { return other.op() == Operation::kTrue; @@ -114,7 +113,7 @@ class ICEBERG_EXPORT False : public Expression { std::string ToString() const override { return "false"; } - Result> Negate() const override; + std::shared_ptr Negate() const override; bool Equals(const Expression& other) const override { return other.op() == Operation::kFalse; @@ -150,7 +149,42 @@ class ICEBERG_EXPORT And : public Expression { std::string ToString() const override; - Result> Negate() const override; + std::shared_ptr Negate() const override; + + bool Equals(const Expression& other) const override; + + private: + std::shared_ptr left_; + std::shared_ptr right_; +}; + +/// \brief An Expression that represents a logical OR operation between two expressions. +/// +/// This expression evaluates to true if at least one of its child expressions +/// evaluates to true. +class ICEBERG_EXPORT Or : public Expression { + public: + /// \brief Constructs an Or expression from two sub-expressions. + /// + /// \param left The left operand of the OR expression + /// \param right The right operand of the OR expression + Or(std::shared_ptr left, std::shared_ptr right); + + /// \brief Returns the left operand of the OR expression. + /// + /// \return The left operand of the OR expression + const std::shared_ptr& left() const { return left_; } + + /// \brief Returns the right operand of the OR expression. + /// + /// \return The right operand of the OR expression + const std::shared_ptr& right() const { return right_; } + + Operation op() const override { return Operation::kOr; } + + std::string ToString() const override; + + std::shared_ptr Negate() const override; bool Equals(const Expression& other) const override; diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc new file mode 100644 index 000000000..8392f34cf --- /dev/null +++ b/src/iceberg/expression/literal.cc @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/expression/literal.h" + +#include +#include +#include + +#include "iceberg/exception.h" + +namespace iceberg { + +/// \brief LiteralCaster handles type casting operations for Literal. +/// This is an internal implementation class. +class LiteralCaster { + public: + /// Cast a Literal to the target type. + static Result CastTo(const Literal& literal, + const std::shared_ptr& target_type); + + /// Create a literal representing a value below the minimum for the given type. + static Literal BelowMinLiteral(std::shared_ptr type); + + /// Create a literal representing a value above the maximum for the given type. + static Literal AboveMaxLiteral(std::shared_ptr type); + + private: + /// Cast from Int type to target type. + static Result CastFromInt(const Literal& literal, + const std::shared_ptr& target_type); + + /// Cast from Long type to target type. + static Result CastFromLong(const Literal& literal, + const std::shared_ptr& target_type); + + /// Cast from Float type to target type. + static Result CastFromFloat(const Literal& literal, + const std::shared_ptr& target_type); +}; + +Literal LiteralCaster::BelowMinLiteral(std::shared_ptr type) { + return Literal(Literal::BelowMin{}, std::move(type)); +} + +Literal LiteralCaster::AboveMaxLiteral(std::shared_ptr type) { + return Literal(Literal::AboveMax{}, std::move(type)); +} + +Result LiteralCaster::CastFromInt( + const Literal& literal, const std::shared_ptr& target_type) { + auto int_val = std::get(literal.value_); + auto target_type_id = target_type->type_id(); + + switch (target_type_id) { + case TypeId::kLong: + return Literal::Long(static_cast(int_val)); + case TypeId::kFloat: + return Literal::Float(static_cast(int_val)); + case TypeId::kDouble: + return Literal::Double(static_cast(int_val)); + default: + return NotSupported("Cast from Int to {} is not implemented", + target_type->ToString()); + } +} + +Result LiteralCaster::CastFromLong( + const Literal& literal, const std::shared_ptr& target_type) { + auto long_val = std::get(literal.value_); + auto target_type_id = target_type->type_id(); + + switch (target_type_id) { + case TypeId::kInt: { + // Check for overflow + if (long_val >= std::numeric_limits::max()) { + return AboveMaxLiteral(target_type); + } + if (long_val <= std::numeric_limits::min()) { + return BelowMinLiteral(target_type); + } + return Literal::Int(static_cast(long_val)); + } + case TypeId::kFloat: + return Literal::Float(static_cast(long_val)); + case TypeId::kDouble: + return Literal::Double(static_cast(long_val)); + default: + return NotSupported("Cast from Long to {} is not supported", + target_type->ToString()); + } +} + +Result LiteralCaster::CastFromFloat( + const Literal& literal, const std::shared_ptr& target_type) { + auto float_val = std::get(literal.value_); + auto target_type_id = target_type->type_id(); + + switch (target_type_id) { + case TypeId::kDouble: + return Literal::Double(static_cast(float_val)); + default: + return NotSupported("Cast from Float to {} is not supported", + target_type->ToString()); + } +} + +// Constructor +Literal::Literal(Value value, std::shared_ptr type) + : value_(std::move(value)), type_(std::move(type)) {} + +// Factory methods +Literal Literal::Boolean(bool value) { + return {Value{value}, std::make_shared()}; +} + +Literal Literal::Int(int32_t value) { + return {Value{value}, std::make_shared()}; +} + +Literal Literal::Long(int64_t value) { + return {Value{value}, std::make_shared()}; +} + +Literal Literal::Float(float value) { + return {Value{value}, std::make_shared()}; +} + +Literal Literal::Double(double value) { + return {Value{value}, std::make_shared()}; +} + +Literal Literal::String(std::string value) { + return {Value{std::move(value)}, std::make_shared()}; +} + +Literal Literal::Binary(std::vector value) { + return {Value{std::move(value)}, std::make_shared()}; +} + +Result Literal::Deserialize(std::span data, + std::shared_ptr type) { + return NotImplemented("Deserialization of Literal is not implemented yet"); +} + +Result> Literal::Serialize() const { + return NotImplemented("Serialization of Literal is not implemented yet"); +} + +// Getters + +const std::shared_ptr& Literal::type() const { return type_; } + +// Cast method +Result Literal::CastTo(const std::shared_ptr& target_type) const { + return LiteralCaster::CastTo(*this, target_type); +} + +// Template function for floating point comparison following Iceberg rules: +// -NaN < NaN, but all NaN values (qNaN, sNaN) are treated as equivalent within their sign +template +std::strong_ordering CompareFloat(T lhs, T rhs) { + // If both are NaN, check their signs + bool all_nan = std::isnan(lhs) && std::isnan(rhs); + if (!all_nan) { + // If not both NaN, use strong ordering + return std::strong_order(lhs, rhs); + } + // Same sign NaN values are equivalent (no qNaN vs sNaN distinction), + // and -NAN < NAN. + bool lhs_is_negative = std::signbit(lhs); + bool rhs_is_negative = std::signbit(rhs); + return lhs_is_negative <=> rhs_is_negative; +} + +// Three-way comparison operator +std::partial_ordering Literal::operator<=>(const Literal& other) const { + // If types are different, comparison is unordered + if (type_->type_id() != other.type_->type_id()) { + return std::partial_ordering::unordered; + } + + // If either value is AboveMax or BelowMin, comparison is unordered + if (IsAboveMax() || IsBelowMin() || other.IsAboveMax() || other.IsBelowMin()) { + return std::partial_ordering::unordered; + } + + // Same type comparison for normal values + switch (type_->type_id()) { + case TypeId::kBoolean: { + auto this_val = std::get(value_); + auto other_val = std::get(other.value_); + if (this_val == other_val) return std::partial_ordering::equivalent; + return this_val ? std::partial_ordering::greater : std::partial_ordering::less; + } + + case TypeId::kInt: { + auto this_val = std::get(value_); + auto other_val = std::get(other.value_); + return this_val <=> other_val; + } + + case TypeId::kLong: { + auto this_val = std::get(value_); + auto other_val = std::get(other.value_); + return this_val <=> other_val; + } + + case TypeId::kFloat: { + auto this_val = std::get(value_); + auto other_val = std::get(other.value_); + // Use strong_ordering for floating point as spec requests + return CompareFloat(this_val, other_val); + } + + case TypeId::kDouble: { + auto this_val = std::get(value_); + auto other_val = std::get(other.value_); + // Use strong_ordering for floating point as spec requests + return CompareFloat(this_val, other_val); + } + + case TypeId::kString: { + auto& this_val = std::get(value_); + auto& other_val = std::get(other.value_); + return this_val <=> other_val; + } + + case TypeId::kBinary: { + auto& this_val = std::get>(value_); + auto& other_val = std::get>(other.value_); + return this_val <=> other_val; + } + + default: + // For unsupported types, return unordered + return std::partial_ordering::unordered; + } +} + +std::string Literal::ToString() const { + if (std::holds_alternative(value_)) { + return "belowMin"; + } + if (std::holds_alternative(value_)) { + return "aboveMax"; + } + + switch (type_->type_id()) { + case TypeId::kBoolean: { + return std::get(value_) ? "true" : "false"; + } + case TypeId::kInt: { + return std::to_string(std::get(value_)); + } + case TypeId::kLong: { + return std::to_string(std::get(value_)); + } + case TypeId::kFloat: { + return std::to_string(std::get(value_)); + } + case TypeId::kDouble: { + return std::to_string(std::get(value_)); + } + case TypeId::kString: { + return std::get(value_); + } + case TypeId::kBinary: { + const auto& binary_data = std::get>(value_); + std::string result; + result.reserve(binary_data.size() * 2); // 2 chars per byte + for (const auto& byte : binary_data) { + std::format_to(std::back_inserter(result), "{:02X}", byte); + } + return result; + } + case TypeId::kDecimal: + case TypeId::kUuid: + case TypeId::kFixed: + case TypeId::kDate: + case TypeId::kTime: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + throw IcebergError("Not implemented: ToString for " + type_->ToString()); + } + default: { + throw IcebergError("Unknown type: " + type_->ToString()); + } + } +} + +bool Literal::IsBelowMin() const { return std::holds_alternative(value_); } + +bool Literal::IsAboveMax() const { return std::holds_alternative(value_); } + +// LiteralCaster implementation + +Result LiteralCaster::CastTo(const Literal& literal, + const std::shared_ptr& target_type) { + if (*literal.type_ == *target_type) { + // If types are the same, return a copy of the current literal + return Literal(literal.value_, target_type); + } + + // Handle special values + if (std::holds_alternative(literal.value_) || + std::holds_alternative(literal.value_)) { + // Cannot cast type for special values + return NotSupported("Cannot cast type for {}", literal.ToString()); + } + + auto source_type_id = literal.type_->type_id(); + + // Delegate to specific cast functions based on source type + switch (source_type_id) { + case TypeId::kInt: + return CastFromInt(literal, target_type); + case TypeId::kLong: + return CastFromLong(literal, target_type); + case TypeId::kFloat: + return CastFromFloat(literal, target_type); + case TypeId::kDouble: + case TypeId::kBoolean: + case TypeId::kString: + case TypeId::kBinary: + break; + default: + break; + } + + return NotSupported("Cast from {} to {} is not implemented", literal.type_->ToString(), + target_type->ToString()); +} + +} // namespace iceberg diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h new file mode 100644 index 000000000..17752c488 --- /dev/null +++ b/src/iceberg/expression/literal.h @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "iceberg/result.h" +#include "iceberg/type.h" + +namespace iceberg { + +/// \brief Literal is a literal value that is associated with a primitive type. +class ICEBERG_EXPORT Literal { + private: + /// \brief Sentinel value to indicate that the literal value is below the valid range + /// of a specific primitive type. It can happen when casting a literal to a narrower + /// primitive type. + struct BelowMin { + bool operator==(const BelowMin&) const = default; + std::strong_ordering operator<=>(const BelowMin&) const = default; + }; + + /// \brief Sentinel value to indicate that the literal value is above the valid range + /// of a specific primitive type. It can happen when casting a literal to a narrower + /// primitive type. + struct AboveMax { + bool operator==(const AboveMax&) const = default; + std::strong_ordering operator<=>(const AboveMax&) const = default; + }; + + using Value = std::variant, // for binary, fixed + std::array, // for uuid and decimal + BelowMin, AboveMax>; + + public: + /// \brief Factory methods for primitive types + static Literal Boolean(bool value); + static Literal Int(int32_t value); + static Literal Long(int64_t value); + static Literal Float(float value); + static Literal Double(double value); + static Literal String(std::string value); + static Literal Binary(std::vector value); + + /// \brief Restore a literal from single-value serialization. + /// + /// See [this spec](https://iceberg.apache.org/spec/#binary-single-value-serialization) + /// for reference. + static Result Deserialize(std::span data, + std::shared_ptr type); + + /// \brief Perform single-value serialization. + /// + /// See [this spec](https://iceberg.apache.org/spec/#binary-single-value-serialization) + /// for reference. + Result> Serialize() const; + + /// \brief Get the literal type. + const std::shared_ptr& type() const; + + /// \brief Converts this literal to a literal of the given type. + /// + /// When a predicate is bound to a concrete data column, literals are converted to match + /// the bound column's type. This conversion process is more narrow than a cast and is + /// only intended for cases where substituting one type is a common mistake (e.g. 34 + /// instead of 34L) or where this API avoids requiring a concrete class (e.g., dates). + /// + /// If conversion to a target type is not supported, this method returns an error. + /// + /// This method may return BelowMin or AboveMax when the target type is not as wide as + /// the original type. These values indicate that the containing predicate can be + /// simplified. For example, std::numeric_limits::max()+1 converted to an int will + /// result in AboveMax and can simplify a < std::numeric_limits::max()+1 to always + /// true. + /// + /// \param target_type A primitive PrimitiveType + /// \return A Result containing a literal of the given type or an error if conversion + /// was not valid + Result CastTo(const std::shared_ptr& target_type) const; + + /// \brief Compare two PrimitiveLiterals. Both literals must have the same type + /// and should not be AboveMax or BelowMin. + std::partial_ordering operator<=>(const Literal& other) const; + + /// Check if this literal represents a value above the maximum allowed value + /// for its type. This occurs when casting from a wider type to a narrower type + /// and the value exceeds the target type's maximum. + /// \return true if this literal represents an AboveMax value, false otherwise + bool IsAboveMax() const; + + /// Check if this literal represents a value below the minimum allowed value + /// for its type. This occurs when casting from a wider type to a narrower type + /// and the value is less than the target type's minimum. + /// \return true if this literal represents a BelowMin value, false otherwise + bool IsBelowMin() const; + + std::string ToString() const; + + private: + Literal(Value value, std::shared_ptr type); + + friend class LiteralCaster; + + private: + Value value_; + std::shared_ptr type_; +}; + +} // namespace iceberg diff --git a/src/iceberg/file_format.h b/src/iceberg/file_format.h index eebb76d14..8e7396a0b 100644 --- a/src/iceberg/file_format.h +++ b/src/iceberg/file_format.h @@ -26,6 +26,7 @@ #include "iceberg/iceberg_export.h" #include "iceberg/result.h" +#include "iceberg/util/unreachable.h" namespace iceberg { @@ -49,6 +50,8 @@ ICEBERG_EXPORT inline std::string_view ToString(FileFormatType format_type) { case FileFormatType::kPuffin: return "puffin"; } + internal::Unreachable( + std::format("Invalid file format type: {}", static_cast(format_type))); } /// \brief Convert a string to a FileFormatType diff --git a/src/iceberg/json_internal.cc b/src/iceberg/json_internal.cc index 3614ed230..cc35d85e8 100644 --- a/src/iceberg/json_internal.cc +++ b/src/iceberg/json_internal.cc @@ -43,6 +43,7 @@ #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" #include "iceberg/util/timepoint.h" +#include "iceberg/util/unreachable.h" namespace iceberg { @@ -477,6 +478,8 @@ nlohmann::json ToJson(const Type& type) { case TypeId::kUuid: return "uuid"; } + internal::Unreachable( + std::format("Unknown type id: {}", static_cast(type.type_id()))); } nlohmann::json ToJson(const Schema& schema) { @@ -1053,7 +1056,9 @@ Status ParsePartitionSpecs(const nlohmann::json& json, int8_t format_version, int32_t next_partition_field_id = PartitionSpec::kLegacyPartitionDataIdStart; std::vector fields; for (const auto& entry_json : partition_spec_json) { - ICEBERG_ASSIGN_OR_RAISE(auto field, PartitionFieldFromJson(entry_json)); + ICEBERG_ASSIGN_OR_RAISE( + auto field, PartitionFieldFromJson( + entry_json, /*allow_field_id_missing=*/format_version == 1)); int32_t field_id = field->field_id(); if (field_id == SchemaField::kInvalidFieldId) { // If the field ID is not set, we need to assign a new one diff --git a/src/iceberg/manifest_entry.h b/src/iceberg/manifest_entry.h index 43db00ae9..0b1355a7f 100644 --- a/src/iceberg/manifest_entry.h +++ b/src/iceberg/manifest_entry.h @@ -19,7 +19,6 @@ #pragma once -#include #include #include #include @@ -27,6 +26,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/file_format.h" #include "iceberg/iceberg_export.h" #include "iceberg/result.h" @@ -77,9 +77,8 @@ struct ICEBERG_EXPORT DataFile { FileFormatType file_format; /// Field id: 102 /// Partition data tuple, schema based on the partition spec output using partition - /// field ids for the struct field ids - /// TODO(zhjwpku): use StructLike to represent partition data tuple - std::any partition; + /// field ids + std::vector partition; /// Field id: 103 /// Number of records in this file, or the cardinality of a deletion vector int64_t record_count = 0; diff --git a/src/iceberg/manifest_list.h b/src/iceberg/manifest_list.h index ddcc38f73..432397ec1 100644 --- a/src/iceberg/manifest_list.h +++ b/src/iceberg/manifest_list.h @@ -30,6 +30,7 @@ #include "iceberg/result.h" #include "iceberg/schema_field.h" #include "iceberg/type.h" +#include "iceberg/util/unreachable.h" namespace iceberg { @@ -221,6 +222,8 @@ ICEBERG_EXPORT constexpr std::string_view ToString(ManifestFile::Content type) n case ManifestFile::Content::kDeletes: return "deletes"; } + internal::Unreachable( + std::format("Unknown manifest content type: {}", static_cast(type))); } /// \brief Get the relative manifest content type from name diff --git a/src/iceberg/partition_field.h b/src/iceberg/partition_field.h index daec4049e..5206cf260 100644 --- a/src/iceberg/partition_field.h +++ b/src/iceberg/partition_field.h @@ -62,10 +62,6 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable { return lhs.Equals(rhs); } - friend bool operator!=(const PartitionField& lhs, const PartitionField& rhs) { - return !(lhs == rhs); - } - private: /// \brief Compare two fields for equality. [[nodiscard]] bool Equals(const PartitionField& other) const; diff --git a/src/iceberg/partition_spec.h b/src/iceberg/partition_spec.h index a18ba7b24..f105a27eb 100644 --- a/src/iceberg/partition_spec.h +++ b/src/iceberg/partition_spec.h @@ -75,10 +75,6 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { return lhs.Equals(rhs); } - friend bool operator!=(const PartitionSpec& lhs, const PartitionSpec& rhs) { - return !(lhs == rhs); - } - private: /// \brief Compare two partition specs for equality. [[nodiscard]] bool Equals(const PartitionSpec& other) const; diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index edc25d6bc..490acb6de 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -56,8 +56,6 @@ class ICEBERG_EXPORT Schema : public StructType { friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); } - friend bool operator!=(const Schema& lhs, const Schema& rhs) { return !(lhs == rhs); } - private: /// \brief Compare two schemas for equality. [[nodiscard]] bool Equals(const Schema& other) const; diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index afef71738..e947f2036 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -76,10 +76,6 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { return lhs.Equals(rhs); } - friend bool operator!=(const SchemaField& lhs, const SchemaField& rhs) { - return !(lhs == rhs); - } - private: /// \brief Compare two fields for equality. [[nodiscard]] bool Equals(const SchemaField& other) const; diff --git a/src/iceberg/schema_internal.cc b/src/iceberg/schema_internal.cc index 621807278..1ce279f7d 100644 --- a/src/iceberg/schema_internal.cc +++ b/src/iceberg/schema_internal.cc @@ -50,11 +50,9 @@ ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view n switch (type.type_id()) { case TypeId::kStruct: { - NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, NANOARROW_TYPE_STRUCT)); - const auto& struct_type = static_cast(type); const auto& fields = struct_type.fields(); - NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, fields.size())); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeStruct(schema, fields.size())); for (size_t i = 0; i < fields.size(); i++) { const auto& field = fields[i]; @@ -64,7 +62,7 @@ ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view n } } break; case TypeId::kList: { - NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, NANOARROW_TYPE_LIST)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_LIST)); const auto& list_type = static_cast(type); const auto& elem_field = list_type.fields()[0]; @@ -73,7 +71,7 @@ ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view n schema->children[0])); } break; case TypeId::kMap: { - NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, NANOARROW_TYPE_MAP)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_MAP)); const auto& map_type = static_cast(type); const auto& key_field = map_type.key(); @@ -86,61 +84,55 @@ ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view n schema->children[0]->children[1])); } break; case TypeId::kBoolean: - NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, NANOARROW_TYPE_BOOL)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_BOOL)); break; case TypeId::kInt: - NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, NANOARROW_TYPE_INT32)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_INT32)); break; case TypeId::kLong: - NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, NANOARROW_TYPE_INT64)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_INT64)); break; case TypeId::kFloat: - NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, NANOARROW_TYPE_FLOAT)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_FLOAT)); break; case TypeId::kDouble: - NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, NANOARROW_TYPE_DOUBLE)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_DOUBLE)); break; case TypeId::kDecimal: { - ArrowSchemaInit(schema); const auto& decimal_type = static_cast(type); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeDecimal(schema, NANOARROW_TYPE_DECIMAL128, decimal_type.precision(), decimal_type.scale())); } break; case TypeId::kDate: - NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, NANOARROW_TYPE_DATE32)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_DATE32)); break; case TypeId::kTime: { - ArrowSchemaInit(schema); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeDateTime(schema, NANOARROW_TYPE_TIME64, NANOARROW_TIME_UNIT_MICRO, /*timezone=*/nullptr)); } break; case TypeId::kTimestamp: { - ArrowSchemaInit(schema); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeDateTime(schema, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MICRO, /*timezone=*/nullptr)); } break; case TypeId::kTimestampTz: { - ArrowSchemaInit(schema); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeDateTime( schema, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MICRO, "UTC")); } break; case TypeId::kString: - NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, NANOARROW_TYPE_STRING)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_STRING)); break; case TypeId::kBinary: - NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, NANOARROW_TYPE_BINARY)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_BINARY)); break; case TypeId::kFixed: { - ArrowSchemaInit(schema); const auto& fixed_type = static_cast(type); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeFixedSize( schema, NANOARROW_TYPE_FIXED_SIZE_BINARY, fixed_type.length())); } break; case TypeId::kUuid: { - ArrowSchemaInit(schema); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeFixedSize( schema, NANOARROW_TYPE_FIXED_SIZE_BINARY, /*fixed_size=*/16)); NANOARROW_RETURN_NOT_OK( @@ -173,6 +165,8 @@ Status ToArrowSchema(const Schema& schema, ArrowSchema* out) { return InvalidArgument("Output Arrow schema cannot be null"); } + ArrowSchemaInit(out); + if (ArrowErrorCode errorCode = ToArrowSchema(schema, /*optional=*/false, /*name=*/"", /*field_id=*/std::nullopt, out); errorCode != NANOARROW_OK) { diff --git a/src/iceberg/schema_util.cc b/src/iceberg/schema_util.cc index 3e409efd7..4139a2b38 100644 --- a/src/iceberg/schema_util.cc +++ b/src/iceberg/schema_util.cc @@ -29,6 +29,7 @@ #include "iceberg/util/checked_cast.h" #include "iceberg/util/formatter_internal.h" #include "iceberg/util/macros.h" +#include "iceberg/util/unreachable.h" namespace iceberg { @@ -172,6 +173,8 @@ std::string_view ToString(FieldProjection::Kind kind) { case FieldProjection::Kind::kNull: return "null"; } + internal::Unreachable( + std::format("Unknown field projection kind: {}", static_cast(kind))); } std::string ToString(const FieldProjection& projection) { diff --git a/src/iceberg/schema_util.h b/src/iceberg/schema_util.h index 72b6b55d7..8ba21a7c9 100644 --- a/src/iceberg/schema_util.h +++ b/src/iceberg/schema_util.h @@ -19,12 +19,12 @@ #pragma once -#include #include #include #include #include +#include "iceberg/expression/literal.h" #include "iceberg/iceberg_export.h" #include "iceberg/result.h" #include "iceberg/type_fwd.h" @@ -48,14 +48,13 @@ struct ICEBERG_EXPORT FieldProjection { kNull, }; - /// \brief The field index in the source schema on the same nesting level when - /// `kind` is `kProjected`. - using SourceFieldIndex = size_t; - /// \brief A literal value used when `kind` is `kConstant` or `kDefault`. - /// TODO(gangwu): replace it with a specifically defined literal type - using Literal = std::any; /// \brief A variant to indicate how to set the value of the field. - using From = std::variant; + /// \note `std::monostate` is used to indicate that the field is not projected. + /// \note `size_t` is used to indicate the field index in the source schema on the same + /// nesting level when `kind` is `kProjected`. + /// \note `Literal` is used to indicate the value of the field when `kind` is + /// `kConstant` or `kDefault`. + using From = std::variant; /// \brief Format-specific attributes for the field. /// For example, for Parquet it might store column id and level info of the projected diff --git a/src/iceberg/schema_util_internal.h b/src/iceberg/schema_util_internal.h index 33aad93a4..e0faf4f6e 100644 --- a/src/iceberg/schema_util_internal.h +++ b/src/iceberg/schema_util_internal.h @@ -26,7 +26,7 @@ namespace iceberg { // Fix `from` field of `FieldProjection` to use pruned field index. -void PruneFieldProjection(FieldProjection& field_projection) { +inline void PruneFieldProjection(FieldProjection& field_projection) { std::map local_index_to_pruned_index; for (const auto& child_projection : field_projection.children) { if (child_projection.kind == FieldProjection::Kind::kProjected) { diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h index 2df6a44d1..c52feefbc 100644 --- a/src/iceberg/snapshot.h +++ b/src/iceberg/snapshot.h @@ -28,6 +28,7 @@ #include "iceberg/iceberg_export.h" #include "iceberg/result.h" #include "iceberg/util/timepoint.h" +#include "iceberg/util/unreachable.h" namespace iceberg { @@ -50,6 +51,8 @@ ICEBERG_EXPORT constexpr std::string_view SnapshotRefTypeToString( case SnapshotRefType::kTag: return "tag"; } + internal::Unreachable( + std::format("Invalid snapshot reference type: {}", static_cast(type))); } /// \brief Get the relative snapshot reference type from name ICEBERG_EXPORT constexpr Result SnapshotRefTypeFromString( @@ -80,9 +83,6 @@ struct ICEBERG_EXPORT SnapshotRef { return lhs.Equals(rhs); } - /// \brief Compare two branches for inequality. - friend bool operator!=(const Branch& lhs, const Branch& rhs) { return !(lhs == rhs); } - private: /// \brief Compare two branches for equality. bool Equals(const Branch& other) const; @@ -97,9 +97,6 @@ struct ICEBERG_EXPORT SnapshotRef { /// \brief Compare two tags for equality. friend bool operator==(const Tag& lhs, const Tag& rhs) { return lhs.Equals(rhs); } - /// \brief Compare two tags for inequality. - friend bool operator!=(const Tag& lhs, const Tag& rhs) { return !(lhs == rhs); } - private: /// \brief Compare two tags for equality. bool Equals(const Tag& other) const; @@ -117,11 +114,6 @@ struct ICEBERG_EXPORT SnapshotRef { return lhs.Equals(rhs); } - /// \brief Compare two snapshot refs for inequality. - friend bool operator!=(const SnapshotRef& lhs, const SnapshotRef& rhs) { - return !(lhs == rhs); - } - private: /// \brief Compare two snapshot refs for equality. bool Equals(const SnapshotRef& other) const; @@ -263,11 +255,6 @@ struct ICEBERG_EXPORT Snapshot { return lhs.Equals(rhs); } - /// \brief Compare two snapshots for inequality. - friend bool operator!=(const Snapshot& lhs, const Snapshot& rhs) { - return !(lhs == rhs); - } - private: /// \brief Compare two snapshots for equality. bool Equals(const Snapshot& other) const; diff --git a/src/iceberg/sort_field.h b/src/iceberg/sort_field.h index 263bbc65a..f25503354 100644 --- a/src/iceberg/sort_field.h +++ b/src/iceberg/sort_field.h @@ -113,10 +113,6 @@ class ICEBERG_EXPORT SortField : public util::Formattable { return lhs.Equals(rhs); } - friend bool operator!=(const SortField& lhs, const SortField& rhs) { - return !(lhs == rhs); - } - private: /// \brief Compare two fields for equality. [[nodiscard]] bool Equals(const SortField& other) const; diff --git a/src/iceberg/sort_order.h b/src/iceberg/sort_order.h index de4abbae2..6e491533c 100644 --- a/src/iceberg/sort_order.h +++ b/src/iceberg/sort_order.h @@ -55,10 +55,6 @@ class ICEBERG_EXPORT SortOrder : public util::Formattable { return lhs.Equals(rhs); } - friend bool operator!=(const SortOrder& lhs, const SortOrder& rhs) { - return !(lhs == rhs); - } - private: /// \brief Compare two sort orders for equality. bool Equals(const SortOrder& other) const; diff --git a/src/iceberg/statistics_file.h b/src/iceberg/statistics_file.h index 5bdc1c14c..7ec01d49d 100644 --- a/src/iceberg/statistics_file.h +++ b/src/iceberg/statistics_file.h @@ -50,11 +50,6 @@ struct ICEBERG_EXPORT BlobMetadata { lhs.source_snapshot_sequence_number == rhs.source_snapshot_sequence_number && lhs.fields == rhs.fields && lhs.properties == rhs.properties; } - - /// \brief Compare two BlobMetadatas for inequality. - friend bool operator!=(const BlobMetadata& lhs, const BlobMetadata& rhs) { - return !(lhs == rhs); - } }; /// \brief Represents a statistics file in the Puffin format @@ -77,11 +72,6 @@ struct ICEBERG_EXPORT StatisticsFile { lhs.file_footer_size_in_bytes == rhs.file_footer_size_in_bytes && lhs.blob_metadata == rhs.blob_metadata; } - - /// \brief Compare two StatisticsFiles for inequality. - friend bool operator!=(const StatisticsFile& lhs, const StatisticsFile& rhs) { - return !(lhs == rhs); - } }; /// \brief Represents a partition statistics file @@ -100,12 +90,6 @@ struct ICEBERG_EXPORT PartitionStatisticsFile { return lhs.snapshot_id == rhs.snapshot_id && lhs.path == rhs.path && lhs.file_size_in_bytes == rhs.file_size_in_bytes; } - - /// \brief Compare two PartitionStatisticsFiles for inequality. - friend bool operator!=(const PartitionStatisticsFile& lhs, - const PartitionStatisticsFile& rhs) { - return !(lhs == rhs); - } }; /// \brief Returns a string representation of a BlobMetadata diff --git a/src/iceberg/table.cc b/src/iceberg/table.cc new file mode 100644 index 000000000..c79f3786a --- /dev/null +++ b/src/iceberg/table.cc @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/table.h" + +#include "iceberg/partition_spec.h" +#include "iceberg/schema.h" +#include "iceberg/sort_order.h" +#include "iceberg/table_metadata.h" + +namespace iceberg { + +const std::string& Table::uuid() const { return metadata_->table_uuid; } + +Result> Table::schema() const { return metadata_->Schema(); } + +const std::shared_ptr>>& +Table::schemas() const { + if (!schemas_map_) { + schemas_map_ = + std::make_shared>>(); + for (const auto& schema : metadata_->schemas) { + if (schema->schema_id()) { + schemas_map_->emplace(schema->schema_id().value(), schema); + } + } + } + return schemas_map_; +} + +Result> Table::spec() const { + return metadata_->PartitionSpec(); +} + +const std::shared_ptr>>& +Table::specs() const { + if (!partition_spec_map_) { + partition_spec_map_ = + std::make_shared>>(); + for (const auto& spec : metadata_->partition_specs) { + partition_spec_map_->emplace(spec->spec_id(), spec); + } + } + return partition_spec_map_; +} + +Result> Table::sort_order() const { + return metadata_->SortOrder(); +} + +const std::shared_ptr>>& +Table::sort_orders() const { + if (!sort_orders_map_) { + sort_orders_map_ = + std::make_shared>>(); + for (const auto& order : metadata_->sort_orders) { + sort_orders_map_->emplace(order->order_id(), order); + } + } + return sort_orders_map_; +} + +const std::unordered_map& Table::properties() const { + return metadata_->properties; +} + +const std::string& Table::location() const { return metadata_->location; } + +Result> Table::current_snapshot() const { + return metadata_->Snapshot(); +} + +Result> Table::SnapshotById(int64_t snapshot_id) const { + auto iter = std::ranges::find_if(metadata_->snapshots, + [this, &snapshot_id](const auto& snapshot) { + return snapshot->snapshot_id == snapshot_id; + }); + if (iter == metadata_->snapshots.end()) { + return NotFound("Snapshot with ID {} is not found", snapshot_id); + } + return *iter; +} + +const std::vector>& Table::snapshots() const { + return metadata_->snapshots; +} + +const std::vector& Table::history() const { + return metadata_->snapshot_log; +} + +const std::shared_ptr& Table::io() const { return io_; } + +} // namespace iceberg diff --git a/src/iceberg/table.h b/src/iceberg/table.h index 11a9fc982..9db02b4b3 100644 --- a/src/iceberg/table.h +++ b/src/iceberg/table.h @@ -19,13 +19,13 @@ #pragma once -#include #include #include #include #include "iceberg/iceberg_export.h" -#include "iceberg/result.h" +#include "iceberg/snapshot.h" +#include "iceberg/table_identifier.h" #include "iceberg/type_fwd.h" namespace iceberg { @@ -35,77 +35,92 @@ class ICEBERG_EXPORT Table { public: virtual ~Table() = default; - /// \brief Return the full name for this table - virtual const std::string& name() const = 0; + /// \brief Construct a table. + /// \param[in] identifier The identifier of the table. + /// \param[in] metadata The metadata for the table. + /// \param[in] metadata_location The location of the table metadata file. + /// \param[in] io The FileIO to read and write table data and metadata files. + /// \param[in] catalog The catalog that this table belongs to. If null, the table will + /// be read-only. + Table(TableIdentifier identifier, std::shared_ptr metadata, + std::string metadata_location, std::shared_ptr io, + std::shared_ptr catalog) + : identifier_(std::move(identifier)), + metadata_(std::move(metadata)), + metadata_location_(std::move(metadata_location)), + io_(std::move(io)), + catalog_(std::move(catalog)) {}; + + /// \brief Return the identifier of this table + const TableIdentifier& name() const { return identifier_; } /// \brief Returns the UUID of the table - virtual const std::string& uuid() const = 0; + const std::string& uuid() const; - /// \brief Refresh the current table metadata - virtual Status Refresh() = 0; - - /// \brief Return the schema for this table - virtual const std::shared_ptr& schema() const = 0; + /// \brief Return the schema for this table, return NotFoundError if not found + Result> schema() const; /// \brief Return a map of schema for this table - virtual const std::unordered_map>& schemas() const = 0; + /// \note This method is **not** thread-safe in the current implementation. + const std::shared_ptr>>& schemas() + const; - /// \brief Return the partition spec for this table - virtual const std::shared_ptr& spec() const = 0; + /// \brief Return the partition spec for this table, return NotFoundError if not found + Result> spec() const; /// \brief Return a map of partition specs for this table - virtual const std::unordered_map>& specs() - const = 0; + /// \note This method is **not** thread-safe in the current implementation. + const std::shared_ptr>>& + specs() const; - /// \brief Return the sort order for this table - virtual const std::shared_ptr& sort_order() const = 0; + /// \brief Return the sort order for this table, return NotFoundError if not found + Result> sort_order() const; /// \brief Return a map of sort order IDs to sort orders for this table - virtual const std::unordered_map>& sort_orders() - const = 0; + /// \note This method is **not** thread-safe in the current implementation. + const std::shared_ptr>>& + sort_orders() const; /// \brief Return a map of string properties for this table - virtual const std::unordered_map& properties() const = 0; + const std::unordered_map& properties() const; /// \brief Return the table's base location - virtual const std::string& location() const = 0; + const std::string& location() const; - /// \brief Return the table's current snapshot - virtual const std::shared_ptr& current_snapshot() const = 0; + /// \brief Return the table's current snapshot, return NotFoundError if not found + Result> current_snapshot() const; - /// \brief Get the snapshot of this table with the given id, or null if there is no - /// matching snapshot + /// \brief Get the snapshot of this table with the given id /// /// \param snapshot_id the ID of the snapshot to get - /// \return the Snapshot with the given id - virtual Result> snapshot(int64_t snapshot_id) const = 0; + /// \return the Snapshot with the given id, return NotFoundError if not found + Result> SnapshotById(int64_t snapshot_id) const; /// \brief Get the snapshots of this table - virtual const std::vector>& snapshots() const = 0; + const std::vector>& snapshots() const; /// \brief Get the snapshot history of this table /// /// \return a vector of history entries - virtual const std::vector>& history() const = 0; - - /// \brief Create a new table scan for this table - /// - /// Once a table scan is created, it can be refined to project columns and filter data. - virtual std::unique_ptr NewScan() const = 0; - - /// \brief Create a new append API to add files to this table and commit - virtual std::shared_ptr NewAppend() = 0; - - /// \brief Create a new transaction API to commit multiple table operations at once - virtual std::unique_ptr NewTransaction() = 0; - - /// TODO(wgtmac): design of FileIO is not finalized yet. We intend to use an - /// IO-less design in the core library. - // /// \brief Returns a FileIO to read and write table data and metadata files - // virtual std::shared_ptr io() const = 0; - - /// \brief Returns a LocationProvider to provide locations for new data files - virtual std::unique_ptr location_provider() const = 0; + const std::vector& history() const; + + /// \brief Returns a FileIO to read and write table data and metadata files + const std::shared_ptr& io() const; + + private: + const TableIdentifier identifier_; + std::shared_ptr metadata_; + const std::string metadata_location_; + std::shared_ptr io_; + std::shared_ptr catalog_; + + // Cache lazy-initialized maps. + mutable std::shared_ptr>> + schemas_map_; + mutable std::shared_ptr>> + partition_spec_map_; + mutable std::shared_ptr>> + sort_orders_map_; }; } // namespace iceberg diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc index 4e112fd21..b820517b2 100644 --- a/src/iceberg/table_metadata.cc +++ b/src/iceberg/table_metadata.cc @@ -76,6 +76,16 @@ Result> TableMetadata::SortOrder() const { return *iter; } +Result> TableMetadata::Snapshot() const { + auto iter = std::ranges::find_if(snapshots, [this](const auto& snapshot) { + return snapshot->snapshot_id == current_snapshot_id; + }); + if (iter == snapshots.end()) { + return NotFound("Current snapshot with ID {} is not found", current_snapshot_id); + } + return *iter; +} + namespace { template diff --git a/src/iceberg/table_metadata.h b/src/iceberg/table_metadata.h index 9c7f37da7..c34091aee 100644 --- a/src/iceberg/table_metadata.h +++ b/src/iceberg/table_metadata.h @@ -44,10 +44,6 @@ struct ICEBERG_EXPORT SnapshotLogEntry { friend bool operator==(const SnapshotLogEntry& lhs, const SnapshotLogEntry& rhs) { return lhs.timestamp_ms == rhs.timestamp_ms && lhs.snapshot_id == rhs.snapshot_id; } - - friend bool operator!=(const SnapshotLogEntry& lhs, const SnapshotLogEntry& rhs) { - return !(lhs == rhs); - } }; /// \brief Represents a metadata log entry @@ -60,10 +56,6 @@ struct ICEBERG_EXPORT MetadataLogEntry { friend bool operator==(const MetadataLogEntry& lhs, const MetadataLogEntry& rhs) { return lhs.timestamp_ms == rhs.timestamp_ms && lhs.metadata_file == rhs.metadata_file; } - - friend bool operator!=(const MetadataLogEntry& lhs, const MetadataLogEntry& rhs) { - return !(lhs == rhs); - } }; /// \brief Represents the metadata for an Iceberg table @@ -135,12 +127,10 @@ struct ICEBERG_EXPORT TableMetadata { Result> PartitionSpec() const; /// \brief Get the current sort order, return NotFoundError if not found Result> SortOrder() const; + /// \brief Get the current snapshot, return NotFoundError if not found + Result> Snapshot() const; friend bool operator==(const TableMetadata& lhs, const TableMetadata& rhs); - - friend bool operator!=(const TableMetadata& lhs, const TableMetadata& rhs) { - return !(lhs == rhs); - } }; /// \brief Returns a string representation of a SnapshotLogEntry diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc index dc0529766..95f6fe717 100644 --- a/src/iceberg/transform.cc +++ b/src/iceberg/transform.cc @@ -24,6 +24,7 @@ #include "iceberg/transform_function.h" #include "iceberg/type.h" +#include "iceberg/util/unreachable.h" namespace iceberg { namespace { @@ -59,6 +60,8 @@ constexpr std::string_view TransformTypeToString(TransformType type) { case TransformType::kVoid: return kVoidName; } + internal::Unreachable( + std::format("Unknown transform type: {}", static_cast(type))); } std::shared_ptr Transform::Identity() { @@ -166,6 +169,8 @@ std::string Transform::ToString() const { return std::format("{}[{}]", TransformTypeToString(transform_type_), std::get(param_)); } + internal::Unreachable( + std::format("Unknown transform type: {}", static_cast(transform_type_))); } TransformFunction::TransformFunction(TransformType transform_type, diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h index 7ca4abcba..f09f15bba 100644 --- a/src/iceberg/transform.h +++ b/src/iceberg/transform.h @@ -135,11 +135,6 @@ class ICEBERG_EXPORT Transform : public util::Formattable { return lhs.Equals(rhs); } - /// \brief Inequality comparison. - friend bool operator!=(const Transform& lhs, const Transform& rhs) { - return !(lhs == rhs); - } - private: /// \brief Constructs a Transform of the specified type (for non-parametric types). /// \param transform_type The transform type (e.g., identity, year, day). @@ -188,10 +183,6 @@ class ICEBERG_EXPORT TransformFunction { return lhs.Equals(rhs); } - friend bool operator!=(const TransformFunction& lhs, const TransformFunction& rhs) { - return !(lhs == rhs); - } - private: /// \brief Compare two partition specs for equality. [[nodiscard]] virtual bool Equals(const TransformFunction& other) const; diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index e58cd0e7d..5186e9c15 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -42,6 +42,7 @@ StructType::StructType(std::vector fields) : fields_(std::move(fiel } TypeId StructType::type_id() const { return kTypeId; } + std::string StructType::ToString() const { std::string repr = "struct<\n"; for (const auto& field : fields_) { @@ -59,7 +60,7 @@ std::optional> StructType::GetFieldByI } std::optional> StructType::GetFieldByIndex( int32_t index) const { - if (index < 0 || index >= static_cast(fields_.size())) { + if (index < 0 || index >= static_cast(fields_.size())) { return std::nullopt; } return fields_[index]; diff --git a/src/iceberg/type.h b/src/iceberg/type.h index 9bb862316..09e088f7a 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -55,9 +55,6 @@ class ICEBERG_EXPORT Type : public iceberg::util::Formattable { /// \brief Compare two types for equality. friend bool operator==(const Type& lhs, const Type& rhs) { return lhs.Equals(rhs); } - /// \brief Compare two types for inequality. - friend bool operator!=(const Type& lhs, const Type& rhs) { return !(lhs == rhs); } - protected: /// \brief Compare two types for equality. [[nodiscard]] virtual bool Equals(const Type& other) const = 0; diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index a5996c426..cc5f0a7fb 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -99,6 +99,10 @@ class TransformFunction; struct PartitionStatisticsFile; struct Snapshot; struct SnapshotRef; + +struct MetadataLogEntry; +struct SnapshotLogEntry; + struct StatisticsFile; struct TableMetadata; @@ -113,7 +117,6 @@ enum class TransformType; /// TODO: Forward declarations below are not added yet. /// ---------------------------------------------------------------------------- -class HistoryEntry; class StructLike; class MetadataUpdate; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 863bb09ef..ca96187ab 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,21 +44,23 @@ target_sources(schema_test target_link_libraries(schema_test PRIVATE iceberg_static GTest::gtest_main GTest::gmock) add_test(NAME schema_test COMMAND schema_test) -add_executable(catalog_test) -target_sources(catalog_test PRIVATE in_memory_catalog_test.cc) -target_link_libraries(catalog_test PRIVATE iceberg_static GTest::gtest_main GTest::gmock) -add_test(NAME catalog_test COMMAND catalog_test) +add_executable(table_test) +target_include_directories(table_test PRIVATE "${CMAKE_BINARY_DIR}") +target_sources(table_test PRIVATE test_common.cc json_internal_test.cc table_test.cc + schema_json_test.cc) +target_link_libraries(table_test PRIVATE iceberg_static GTest::gtest_main GTest::gmock) +add_test(NAME table_test COMMAND table_test) add_executable(expression_test) -target_sources(expression_test PRIVATE expression_test.cc) +target_sources(expression_test PRIVATE expression_test.cc literal_test.cc) target_link_libraries(expression_test PRIVATE iceberg_static GTest::gtest_main GTest::gmock) add_test(NAME expression_test COMMAND expression_test) add_executable(json_serde_test) target_include_directories(json_serde_test PRIVATE "${CMAKE_BINARY_DIR}") -target_sources(json_serde_test PRIVATE json_internal_test.cc metadata_serde_test.cc - schema_json_test.cc) +target_sources(json_serde_test PRIVATE test_common.cc json_internal_test.cc + metadata_serde_test.cc schema_json_test.cc) target_link_libraries(json_serde_test PRIVATE iceberg_static GTest::gtest_main GTest::gmock) add_test(NAME json_serde_test COMMAND json_serde_test) @@ -71,7 +73,8 @@ add_test(NAME util_test COMMAND util_test) if(ICEBERG_BUILD_BUNDLE) add_executable(avro_test) - target_sources(avro_test PRIVATE avro_test.cc avro_schema_test.cc avro_stream_test.cc) + target_sources(avro_test PRIVATE avro_data_test.cc avro_test.cc avro_schema_test.cc + avro_stream_test.cc) target_link_libraries(avro_test PRIVATE iceberg_bundle_static GTest::gtest_main GTest::gmock) add_test(NAME avro_test COMMAND avro_test) @@ -82,4 +85,11 @@ if(ICEBERG_BUILD_BUNDLE) target_link_libraries(arrow_test PRIVATE iceberg_bundle_static GTest::gtest_main GTest::gmock) add_test(NAME arrow_test COMMAND arrow_test) + + add_executable(catalog_test) + target_include_directories(catalog_test PRIVATE "${CMAKE_BINARY_DIR}") + target_sources(catalog_test PRIVATE test_common.cc in_memory_catalog_test.cc) + target_link_libraries(catalog_test PRIVATE iceberg_bundle_static GTest::gtest_main + GTest::gmock) + add_test(NAME catalog_test COMMAND catalog_test) endif() diff --git a/test/arrow_test.cc b/test/arrow_test.cc index 52cef049a..e8714e9ef 100644 --- a/test/arrow_test.cc +++ b/test/arrow_test.cc @@ -283,6 +283,7 @@ TEST_P(FromArrowSchemaTest, PrimitiveType) { auto type_result = FromArrowSchema(exported_schema, /*schema_id=*/1); ASSERT_THAT(type_result, IsOk()); + ArrowSchemaRelease(&exported_schema); const auto& schema = type_result.value(); ASSERT_EQ(schema->schema_id(), 1); @@ -358,6 +359,7 @@ TEST(FromArrowSchemaTest, StructType) { auto schema_result = FromArrowSchema(exported_schema, /*schema_id=*/0); ASSERT_THAT(schema_result, IsOk()); + ArrowSchemaRelease(&exported_schema); const auto& iceberg_schema = schema_result.value(); ASSERT_EQ(iceberg_schema->schema_id(), 0); @@ -408,6 +410,7 @@ TEST(FromArrowSchemaTest, ListType) { auto schema_result = FromArrowSchema(exported_schema, /*schema_id=*/0); ASSERT_THAT(schema_result, IsOk()); + ArrowSchemaRelease(&exported_schema); const auto& iceberg_schema = schema_result.value(); ASSERT_EQ(iceberg_schema->schema_id(), 0); @@ -458,6 +461,7 @@ TEST(FromArrowSchemaTest, MapType) { auto schema_result = FromArrowSchema(exported_schema, /*schema_id=*/0); ASSERT_THAT(schema_result, IsOk()); + ArrowSchemaRelease(&exported_schema); const auto& iceberg_schema = schema_result.value(); ASSERT_EQ(iceberg_schema->schema_id(), 0); diff --git a/test/avro_data_test.cc b/test/avro_data_test.cc new file mode 100644 index 000000000..33b417734 --- /dev/null +++ b/test/avro_data_test.cc @@ -0,0 +1,763 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iceberg/avro/avro_data_util_internal.h" +#include "iceberg/avro/avro_schema_util_internal.h" +#include "iceberg/schema.h" +#include "iceberg/schema_internal.h" +#include "iceberg/schema_util.h" +#include "iceberg/type.h" +#include "matchers.h" + +namespace iceberg::avro { + +/// \brief Test case structure for parameterized primitive type tests +struct AppendDatumParam { + std::string name; + std::shared_ptr projected_type; + std::shared_ptr source_type; + std::function value_setter; + std::string expected_json; +}; + +/// \brief Helper function to create test data for a primitive type +std::vector<::avro::GenericDatum> CreateTestData( + const ::avro::NodePtr& avro_node, + const std::function& value_setter, int count = 3) { + std::vector<::avro::GenericDatum> avro_data; + for (int i = 0; i < count; ++i) { + ::avro::GenericDatum avro_datum(avro_node); + value_setter(avro_datum, i); + avro_data.push_back(avro_datum); + } + return avro_data; +} + +/// \brief Utility function to verify AppendDatumToBuilder behavior +void VerifyAppendDatumToBuilder(const Schema& projected_schema, + const ::avro::NodePtr& avro_node, + const std::vector<::avro::GenericDatum>& avro_data, + std::string_view expected_array_json) { + // Create 1 to 1 projection + auto projection_result = Project(projected_schema, avro_node, /*prune_source=*/false); + ASSERT_THAT(projection_result, IsOk()); + auto projection = std::move(projection_result.value()); + + // Create arrow schema and array builder + ArrowSchema arrow_c_schema; + ASSERT_THAT(ToArrowSchema(projected_schema, &arrow_c_schema), IsOk()); + auto arrow_schema = ::arrow::ImportSchema(&arrow_c_schema).ValueOrDie(); + auto arrow_struct_type = std::make_shared<::arrow::StructType>(arrow_schema->fields()); + auto builder = ::arrow::MakeBuilder(arrow_struct_type).ValueOrDie(); + + // Call AppendDatumToBuilder repeatedly to append the datum + for (const auto& avro_datum : avro_data) { + ASSERT_THAT(AppendDatumToBuilder(avro_node, avro_datum, projection, projected_schema, + builder.get()), + IsOk()); + } + + // Verify the result + auto array = builder->Finish().ValueOrDie(); + auto expected_array = + ::arrow::json::ArrayFromJSONString(arrow_struct_type, expected_array_json) + .ValueOrDie(); + ASSERT_TRUE(array->Equals(*expected_array)) + << "array: " << array->ToString() + << "\nexpected_array: " << expected_array->ToString(); +} + +/// \brief Test class for primitive types using parameterized tests +class AppendDatumToBuilderTest : public ::testing::TestWithParam {}; + +TEST_P(AppendDatumToBuilderTest, PrimitiveType) { + const auto& test_case = GetParam(); + + Schema projected_schema({SchemaField::MakeRequired( + /*field_id=*/1, /*name=*/"a", test_case.projected_type)}); + Schema source_schema({SchemaField::MakeRequired( + /*field_id=*/1, /*name=*/"a", test_case.source_type)}); + + ::avro::NodePtr avro_node; + EXPECT_THAT(ToAvroNodeVisitor{}.Visit(source_schema, &avro_node), IsOk()); + + auto avro_data = CreateTestData(avro_node, test_case.value_setter); + ASSERT_NO_FATAL_FAILURE(VerifyAppendDatumToBuilder(projected_schema, avro_node, + avro_data, test_case.expected_json)); +} + +// Define test cases for all primitive types +const std::vector kPrimitiveTestCases = { + { + .name = "Boolean", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + datum.value<::avro::GenericRecord>().fieldAt(0).value() = + (i % 2 == 0); + }, + .expected_json = R"([{"a": true}, {"a": false}, {"a": true}])", + }, + { + .name = "Int", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + datum.value<::avro::GenericRecord>().fieldAt(0).value() = i * 100; + }, + .expected_json = R"([{"a": 0}, {"a": 100}, {"a": 200}])", + }, + { + .name = "Long", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + datum.value<::avro::GenericRecord>().fieldAt(0).value() = + i * 1000000LL; + }, + .expected_json = R"([{"a": 0}, {"a": 1000000}, {"a": 2000000}])", + }, + { + .name = "Float", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + datum.value<::avro::GenericRecord>().fieldAt(0).value() = i * 3.14f; + }, + .expected_json = R"([{"a": 0.0}, {"a": 3.14}, {"a": 6.28}])", + }, + { + .name = "Double", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + datum.value<::avro::GenericRecord>().fieldAt(0).value() = + i * 1.234567890; + }, + .expected_json = R"([{"a": 0.0}, {"a": 1.234567890}, {"a": 2.469135780}])", + }, + { + .name = "String", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + datum.value<::avro::GenericRecord>().fieldAt(0).value() = + "test_string_" + std::to_string(i); + }, + .expected_json = + R"([{"a": "test_string_0"}, {"a": "test_string_1"}, {"a": "test_string_2"}])", + }, + { + .name = "Binary", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + datum.value<::avro::GenericRecord>() + .fieldAt(0) + .value>() = {static_cast('a' + i), + static_cast('b' + i), + static_cast('c' + i)}; + }, + .expected_json = R"([{"a": "abc"}, {"a": "bcd"}, {"a": "cde"}])", + }, + { + .name = "Fixed", + .projected_type = std::make_shared(4), + .source_type = std::make_shared(4), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + datum.value<::avro::GenericRecord>() + .fieldAt(0) + .value<::avro::GenericFixed>() + .value() = { + static_cast('a' + i), static_cast('b' + i), + static_cast('c' + i), static_cast('d' + i)}; + }, + .expected_json = R"([{"a": "abcd"}, {"a": "bcde"}, {"a": "cdef"}])", + }, + /// FIXME: NotImplemented: MakeBuilder: cannot construct builder for type + /// extension. Need to fix this in the upstream Arrow. + // { + // .name = "UUID", + // .projected_type = std::make_shared(), + // .source_type = std::make_shared(), + // .value_setter = + // [](::avro::GenericDatum& datum, int i) { + // datum.value<::avro::GenericRecord>() + // .fieldAt(0) + // .value<::avro::GenericFixed>() + // .value() = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', + // 'i', 'j', 'k', 'l', 'm', 'n', 'o', + // static_cast(i)}; + // }, + // .expected_json = R"([{"a": "abcdefghijklmnop"}, {"a": "bcdefghijklmnopq"}, + // {"a": "cdefghijklmnopqr"}])", + // }, + { + .name = "Decimal", + .projected_type = std::make_shared(10, 2), + .source_type = std::make_shared(10, 2), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + int32_t decimal_value = i * 1000 + i; + std::vector& fixed = datum.value<::avro::GenericRecord>() + .fieldAt(0) + .value<::avro::GenericFixed>() + .value(); + // The byte array must contain the two's-complement representation of + // the unscaled integer value in big-endian byte order. + for (uint8_t& rvalue : std::ranges::reverse_view(fixed)) { + rvalue = static_cast(decimal_value & 0xFF); + decimal_value >>= 8; + } + }, + .expected_json = R"([{"a": "0.00"}, {"a": "10.01"}, {"a": "20.02"}])", + }, + { + .name = "Date", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + // Date as days since epoch (1970-01-01) + // 0 = 1970-01-01, 1 = 1970-01-02, etc. + datum.value<::avro::GenericRecord>().fieldAt(0).value() = + 18000 + i; // ~2019-04-11 + i days + }, + .expected_json = R"([{"a": 18000}, {"a": 18001}, {"a": 18002}])", + }, + { + .name = "Time", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + // Time as microseconds since midnight + // 12:30:45.123456 + i seconds = 45045123456 + i*1000000 microseconds + datum.value<::avro::GenericRecord>().fieldAt(0).value() = + 45045123456LL + i * 1000000LL; + }, + .expected_json = + R"([{"a": 45045123456}, {"a": 45046123456}, {"a": 45047123456}])", + }, + { + .name = "Timestamp", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + datum.value<::avro::GenericRecord>().fieldAt(0).value() = + i * 1000000LL; + }, + .expected_json = R"([{"a": 0}, {"a": 1000000}, {"a": 2000000}])", + }, + { + .name = "TimestampTz", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + datum.value<::avro::GenericRecord>().fieldAt(0).value() = + 1672531200000000LL + i * 1000000LL; + }, + .expected_json = + R"([{"a": 1672531200000000}, {"a": 1672531201000000}, {"a": 1672531202000000}])", + }, + { + .name = "IntToLongPromotion", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + datum.value<::avro::GenericRecord>().fieldAt(0).value() = i * 100; + }, + .expected_json = R"([{"a": 0}, {"a": 100}, {"a": 200}])", + }, + { + .name = "FloatToDoublePromotion", + .projected_type = std::make_shared(), + .source_type = std::make_shared(), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + datum.value<::avro::GenericRecord>().fieldAt(0).value() = i * 1.0f; + }, + .expected_json = R"([{"a": 0.0}, {"a": 1.0}, {"a": 2.0}])", + }, + { + .name = "DecimalPrecisionPromotion", + .projected_type = std::make_shared(10, 2), + .source_type = std::make_shared(6, 2), + .value_setter = + [](::avro::GenericDatum& datum, int i) { + int32_t decimal_value = i * 1000 + i; + std::vector& fixed = datum.value<::avro::GenericRecord>() + .fieldAt(0) + .value<::avro::GenericFixed>() + .value(); + for (uint8_t& rvalue : std::ranges::reverse_view(fixed)) { + rvalue = static_cast(decimal_value & 0xFF); + decimal_value >>= 8; + } + }, + .expected_json = R"([{"a": "0.00"}, {"a": "10.01"}, {"a": "20.02"}])", + }, +}; + +INSTANTIATE_TEST_SUITE_P(AllPrimitiveTypes, AppendDatumToBuilderTest, + ::testing::ValuesIn(kPrimitiveTestCases), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +TEST(AppendDatumToBuilderTest, StructWithTwoFields) { + Schema iceberg_schema({ + SchemaField::MakeRequired(1, "id", std::make_shared()), + SchemaField::MakeRequired(2, "name", std::make_shared()), + }); + ::avro::NodePtr avro_node; + ASSERT_THAT(ToAvroNodeVisitor{}.Visit(iceberg_schema, &avro_node), IsOk()); + + std::vector<::avro::GenericDatum> avro_data; + ::avro::GenericDatum avro_datum(avro_node); + auto& record = avro_datum.value<::avro::GenericRecord>(); + record.fieldAt(0).value() = 42; + record.fieldAt(1).value() = "test"; + avro_data.push_back(avro_datum); + + ASSERT_NO_FATAL_FAILURE(VerifyAppendDatumToBuilder(iceberg_schema, avro_node, avro_data, + R"([{"id": 42, "name": "test"}])")); +} + +TEST(AppendDatumToBuilderTest, NestedStruct) { + Schema iceberg_schema({ + SchemaField::MakeRequired(1, "id", std::make_shared()), + SchemaField::MakeRequired( + 2, "person", + std::make_shared(std::vector{ + SchemaField::MakeRequired(3, "name", std::make_shared()), + SchemaField::MakeRequired(4, "age", std::make_shared()), + })), + }); + + ::avro::NodePtr avro_node; + ASSERT_THAT(ToAvroNodeVisitor{}.Visit(iceberg_schema, &avro_node), IsOk()); + + std::vector<::avro::GenericDatum> avro_data; + for (int i = 0; i < 2; ++i) { + ::avro::GenericDatum avro_datum(avro_node); + auto& record = avro_datum.value<::avro::GenericRecord>(); + + // Set id field + record.fieldAt(0).value() = i + 1; + + // Set nested person struct + auto& person_record = record.fieldAt(1).value<::avro::GenericRecord>(); + person_record.fieldAt(0).value() = "Person" + std::to_string(i); + person_record.fieldAt(1).value() = 25 + i; + + avro_data.push_back(avro_datum); + } + + const std::string expected_json = R"([ + {"id": 1, "person": {"name": "Person0", "age": 25}}, + {"id": 2, "person": {"name": "Person1", "age": 26}} + ])"; + ASSERT_NO_FATAL_FAILURE( + VerifyAppendDatumToBuilder(iceberg_schema, avro_node, avro_data, expected_json)); +} + +TEST(AppendDatumToBuilderTest, ListOfIntegers) { + Schema iceberg_schema({ + SchemaField::MakeRequired(1, "numbers", + std::make_shared(SchemaField::MakeRequired( + 2, "element", std::make_shared()))), + }); + + ::avro::NodePtr avro_node; + ASSERT_THAT(ToAvroNodeVisitor{}.Visit(iceberg_schema, &avro_node), IsOk()); + + std::vector<::avro::GenericDatum> avro_data; + for (int i = 0; i < 2; ++i) { + ::avro::GenericDatum avro_datum(avro_node); + auto& record = avro_datum.value<::avro::GenericRecord>(); + + // Create array with values [i*10, i*10+1, i*10+2] + auto& array = record.fieldAt(0).value<::avro::GenericArray>(); + for (int j = 0; j < 3; ++j) { + ::avro::GenericDatum element(avro_node->leafAt(0)->leafAt(0)); + element.value() = i * 10 + j; + array.value().push_back(element); + } + + avro_data.push_back(avro_datum); + } + + const std::string expected_json = R"([ + {"numbers": [0, 1, 2]}, + {"numbers": [10, 11, 12]} + ])"; + ASSERT_NO_FATAL_FAILURE( + VerifyAppendDatumToBuilder(iceberg_schema, avro_node, avro_data, expected_json)); +} + +TEST(AppendDatumToBuilderTest, ListOfStructs) { + Schema iceberg_schema({ + SchemaField::MakeRequired( + 1, "people", + std::make_shared(SchemaField::MakeRequired( + 2, "element", + std::make_shared(std::vector{ + SchemaField::MakeRequired(3, "name", std::make_shared()), + SchemaField::MakeRequired(4, "age", std::make_shared()), + })))), + }); + + ::avro::NodePtr avro_node; + ASSERT_THAT(ToAvroNodeVisitor{}.Visit(iceberg_schema, &avro_node), IsOk()); + + std::vector<::avro::GenericDatum> avro_data; + for (int i = 0; i < 2; ++i) { + ::avro::GenericDatum avro_datum(avro_node); + auto& record = avro_datum.value<::avro::GenericRecord>(); + + auto& array = record.fieldAt(0).value<::avro::GenericArray>(); + for (int j = 0; j < 2; ++j) { + ::avro::GenericDatum element(avro_node->leafAt(0)->leafAt(0)); + auto& person_record = element.value<::avro::GenericRecord>(); + person_record.fieldAt(0).value() = + "Person" + std::to_string(i) + "_" + std::to_string(j); + person_record.fieldAt(1).value() = 20 + i * 10 + j; + array.value().push_back(element); + } + + avro_data.push_back(avro_datum); + } + + const std::string expected_json = R"([ + {"people": [ + {"name": "Person0_0", "age": 20}, + {"name": "Person0_1", "age": 21} + ]}, + {"people": [ + {"name": "Person1_0", "age": 30}, + {"name": "Person1_1", "age": 31} + ]} + ])"; + ASSERT_NO_FATAL_FAILURE( + VerifyAppendDatumToBuilder(iceberg_schema, avro_node, avro_data, expected_json)); +} + +TEST(AppendDatumToBuilderTest, MapStringToInt) { + Schema iceberg_schema({ + SchemaField::MakeRequired( + 1, "scores", + std::make_shared( + SchemaField::MakeRequired(2, "key", std::make_shared()), + SchemaField::MakeRequired(3, "value", std::make_shared()))), + }); + + ::avro::NodePtr avro_node; + ASSERT_THAT(ToAvroNodeVisitor{}.Visit(iceberg_schema, &avro_node), IsOk()); + + std::vector<::avro::GenericDatum> avro_data; + for (int i = 0; i < 2; ++i) { + ::avro::GenericDatum avro_datum(avro_node); + auto& record = avro_datum.value<::avro::GenericRecord>(); + + auto& map = record.fieldAt(0).value<::avro::GenericMap>(); + auto& map_container = map.value(); + + map_container.emplace_back("score_" + std::to_string(i * 2), + ::avro::GenericDatum(static_cast(100 + i * 10))); + map_container.emplace_back( + "score_" + std::to_string(i * 2 + 1), + ::avro::GenericDatum(static_cast(100 + i * 10 + 5))); + + avro_data.push_back(avro_datum); + } + + const std::string expected_json = R"([ + {"scores": [["score_0", 100], ["score_1", 105]]}, + {"scores": [["score_2", 110], ["score_3", 115]]} + ])"; + ASSERT_NO_FATAL_FAILURE( + VerifyAppendDatumToBuilder(iceberg_schema, avro_node, avro_data, expected_json)); +} + +TEST(AppendDatumToBuilderTest, MapIntToStringAsArray) { + Schema iceberg_schema({ + SchemaField::MakeRequired( + 1, "names", + std::make_shared( + SchemaField::MakeRequired(2, "key", std::make_shared()), + SchemaField::MakeRequired(3, "value", std::make_shared()))), + }); + + ::avro::NodePtr avro_node; + ASSERT_THAT(ToAvroNodeVisitor{}.Visit(iceberg_schema, &avro_node), IsOk()); + + std::vector<::avro::GenericDatum> avro_data; + for (int i = 0; i < 2; ++i) { + ::avro::GenericDatum avro_datum(avro_node); + auto& record = avro_datum.value<::avro::GenericRecord>(); + + auto& array = record.fieldAt(0).value<::avro::GenericArray>(); + for (int j = 0; j < 2; ++j) { + ::avro::GenericDatum kv_pair(avro_node->leafAt(0)->leafAt(0)); + auto& kv_record = kv_pair.value<::avro::GenericRecord>(); + kv_record.fieldAt(0).value() = i * 10 + j; + kv_record.fieldAt(1).value() = "name_" + std::to_string(i * 10 + j); + array.value().push_back(kv_pair); + } + + avro_data.push_back(avro_datum); + } + + const std::string expected_json = R"([ + {"names": [[0, "name_0"], [1, "name_1"]]}, + {"names": [[10, "name_10"], [11, "name_11"]]} + ])"; + ASSERT_NO_FATAL_FAILURE( + VerifyAppendDatumToBuilder(iceberg_schema, avro_node, avro_data, expected_json)); +} + +TEST(AppendDatumToBuilderTest, MapStringToStruct) { + Schema iceberg_schema({ + SchemaField::MakeRequired( + 1, "users", + std::make_shared( + SchemaField::MakeRequired(2, "key", std::make_shared()), + SchemaField::MakeRequired( + 3, "value", + std::make_shared(std::vector{ + SchemaField::MakeRequired(4, "id", std::make_shared()), + SchemaField::MakeRequired(5, "email", + std::make_shared()), + })))), + }); + + ::avro::NodePtr avro_node; + ASSERT_THAT(ToAvroNodeVisitor{}.Visit(iceberg_schema, &avro_node), IsOk()); + + std::vector<::avro::GenericDatum> avro_data; + for (int i = 0; i < 2; ++i) { + ::avro::GenericDatum avro_datum(avro_node); + auto& record = avro_datum.value<::avro::GenericRecord>(); + + auto& map = record.fieldAt(0).value<::avro::GenericMap>(); + auto& map_container = map.value(); + + ::avro::GenericDatum struct_value(avro_node->leafAt(0)->leafAt(1)); + auto& struct_record = struct_value.value<::avro::GenericRecord>(); + struct_record.fieldAt(0).value() = 1000 + i; + struct_record.fieldAt(1).value() = + "user" + std::to_string(i) + "@example.com"; + + map_container.emplace_back("user_" + std::to_string(i), std::move(struct_value)); + + avro_data.push_back(avro_datum); + } + + const std::string expected_json = R"([ + {"users": [["user_0", {"id": 1000, "email": "user0@example.com"}]]}, + {"users": [["user_1", {"id": 1001, "email": "user1@example.com"}]]} + ])"; + ASSERT_NO_FATAL_FAILURE( + VerifyAppendDatumToBuilder(iceberg_schema, avro_node, avro_data, expected_json)); +} + +TEST(AppendDatumToBuilderTest, StructWithMissingOptionalField) { + Schema iceberg_schema({ + SchemaField::MakeRequired(1, "id", std::make_shared()), + SchemaField::MakeRequired(2, "name", std::make_shared()), + SchemaField::MakeOptional(3, "age", + std::make_shared()), // Missing in Avro + SchemaField::MakeOptional(4, "email", + std::make_shared()), // Missing in Avro + }); + + // Create Avro schema that only has id and name fields (missing age and email) + std::string avro_schema_json = R"({ + "type": "record", + "name": "person", + "fields": [ + {"name": "id", "type": "int", "field-id": 1}, + {"name": "name", "type": "string", "field-id": 2} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + std::vector<::avro::GenericDatum> avro_data; + for (int i = 0; i < 2; ++i) { + ::avro::GenericDatum avro_datum(avro_schema.root()); + auto& record = avro_datum.value<::avro::GenericRecord>(); + record.fieldAt(0).value() = i + 1; + record.fieldAt(1).value() = "Person" + std::to_string(i); + avro_data.push_back(avro_datum); + } + + const std::string expected_json = R"([ + {"id": 1, "name": "Person0", "age": null, "email": null}, + {"id": 2, "name": "Person1", "age": null, "email": null} + ])"; + ASSERT_NO_FATAL_FAILURE(VerifyAppendDatumToBuilder(iceberg_schema, avro_schema.root(), + avro_data, expected_json)); +} + +TEST(AppendDatumToBuilderTest, NestedStructWithMissingOptionalFields) { + Schema iceberg_schema({ + SchemaField::MakeRequired(1, "id", std::make_shared()), + SchemaField::MakeRequired( + 2, "person", + std::make_shared(std::vector{ + SchemaField::MakeRequired(3, "name", std::make_shared()), + SchemaField::MakeOptional(4, "age", + std::make_shared()), // Missing + SchemaField::MakeOptional(5, "phone", + std::make_shared()), // Missing + })), + SchemaField::MakeOptional(6, "department", + std::make_shared()), // Missing + }); + + // Create Avro schema with only id, person.name fields + std::string avro_schema_json = R"({ + "type": "record", + "name": "employee", + "fields": [ + {"name": "id", "type": "int", "field-id": 1}, + {"name": "person", "type": { + "type": "record", + "name": "person_info", + "fields": [ + {"name": "name", "type": "string", "field-id": 3} + ] + }, "field-id": 2} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + std::vector<::avro::GenericDatum> avro_data; + for (int i = 0; i < 2; ++i) { + ::avro::GenericDatum avro_datum(avro_schema.root()); + auto& record = avro_datum.value<::avro::GenericRecord>(); + + record.fieldAt(0).value() = i + 100; + + auto& person_record = record.fieldAt(1).value<::avro::GenericRecord>(); + person_record.fieldAt(0).value() = "Employee" + std::to_string(i); + + avro_data.push_back(avro_datum); + } + + const std::string expected_json = R"([ + {"id": 100, "person": {"name": "Employee0", "age": null, "phone": null}, "department": null}, + {"id": 101, "person": {"name": "Employee1", "age": null, "phone": null}, "department": null} + ])"; + ASSERT_NO_FATAL_FAILURE(VerifyAppendDatumToBuilder(iceberg_schema, avro_schema.root(), + avro_data, expected_json)); +} + +TEST(AppendDatumToBuilderTest, ListWithMissingOptionalElementFields) { + Schema iceberg_schema({ + SchemaField::MakeRequired( + 1, "people", + std::make_shared(SchemaField::MakeRequired( + 2, "element", + std::make_shared(std::vector{ + SchemaField::MakeRequired(3, "name", std::make_shared()), + SchemaField::MakeOptional( + 4, "age", + std::make_shared()), // Missing in Avro + SchemaField::MakeOptional( + 5, "email", + std::make_shared()), // Missing in Avro + })))), + }); + + // Create Avro schema with list of structs that only have name field + std::string avro_schema_json = R"({ + "type": "record", + "name": "people_list", + "fields": [ + {"name": "people", "type": { + "type": "array", + "items": { + "type": "record", + "name": "person", + "fields": [ + {"name": "name", "type": "string", "field-id": 3} + ] + }, + "element-id": 2 + }, "field-id": 1} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + std::vector<::avro::GenericDatum> avro_data; + for (int i = 0; i < 2; ++i) { + ::avro::GenericDatum avro_datum(avro_schema.root()); + auto& record = avro_datum.value<::avro::GenericRecord>(); + + auto& array = record.fieldAt(0).value<::avro::GenericArray>(); + for (int j = 0; j < 2; ++j) { + ::avro::GenericDatum element(avro_schema.root()->leafAt(0)->leafAt(0)); + auto& person_record = element.value<::avro::GenericRecord>(); + person_record.fieldAt(0).value() = + "Person" + std::to_string(i) + "_" + std::to_string(j); + array.value().push_back(element); + } + + avro_data.push_back(avro_datum); + } + + const std::string expected_json = R"([ + {"people": [ + {"name": "Person0_0", "age": null, "email": null}, + {"name": "Person0_1", "age": null, "email": null} + ]}, + {"people": [ + {"name": "Person1_0", "age": null, "email": null}, + {"name": "Person1_1", "age": null, "email": null} + ]} + ])"; + ASSERT_NO_FATAL_FAILURE(VerifyAppendDatumToBuilder(iceberg_schema, avro_schema.root(), + avro_data, expected_json)); +} + +} // namespace iceberg::avro diff --git a/test/expression_test.cc b/test/expression_test.cc index f722d62c5..c14c7d9a3 100644 --- a/test/expression_test.cc +++ b/test/expression_test.cc @@ -30,25 +30,17 @@ TEST(TrueFalseTest, Basic) { auto false_instance = False::Instance(); auto negated = false_instance->Negate(); - EXPECT_TRUE(negated.has_value()); - // Check that negated expression is True - auto true_expr = negated.value(); - EXPECT_EQ(true_expr->op(), Expression::Operation::kTrue); - - EXPECT_EQ(true_expr->ToString(), "true"); + EXPECT_EQ(negated->op(), Expression::Operation::kTrue); + EXPECT_EQ(negated->ToString(), "true"); // Test negation of True returns false auto true_instance = True::Instance(); negated = true_instance->Negate(); - EXPECT_TRUE(negated.has_value()); - // Check that negated expression is False - auto false_expr = negated.value(); - EXPECT_EQ(false_expr->op(), Expression::Operation::kFalse); - - EXPECT_EQ(false_expr->ToString(), "false"); + EXPECT_EQ(negated->op(), Expression::Operation::kFalse); + EXPECT_EQ(negated->ToString(), "false"); } TEST(ANDTest, Basic) { @@ -64,4 +56,102 @@ TEST(ANDTest, Basic) { EXPECT_EQ(and_expr->left()->op(), Expression::Operation::kTrue); EXPECT_EQ(and_expr->right()->op(), Expression::Operation::kTrue); } + +TEST(ORTest, Basic) { + // Create True and False expressions + auto true_expr = True::Instance(); + auto false_expr = False::Instance(); + + // Create an OR expression + auto or_expr = std::make_shared(true_expr, false_expr); + + EXPECT_EQ(or_expr->op(), Expression::Operation::kOr); + EXPECT_EQ(or_expr->ToString(), "(true or false)"); + EXPECT_EQ(or_expr->left()->op(), Expression::Operation::kTrue); + EXPECT_EQ(or_expr->right()->op(), Expression::Operation::kFalse); +} + +TEST(ORTest, Negation) { + // Test De Morgan's law: not(A or B) = (not A) and (not B) + auto true_expr = True::Instance(); + auto false_expr = False::Instance(); + + auto or_expr = std::make_shared(true_expr, false_expr); + auto negated_or = or_expr->Negate(); + + // Should become AND expression + EXPECT_EQ(negated_or->op(), Expression::Operation::kAnd); + EXPECT_EQ(negated_or->ToString(), "(false and true)"); +} + +TEST(ORTest, Equals) { + auto true_expr = True::Instance(); + auto false_expr = False::Instance(); + + // Test basic equality + auto or_expr1 = std::make_shared(true_expr, false_expr); + auto or_expr2 = std::make_shared(true_expr, false_expr); + EXPECT_TRUE(or_expr1->Equals(*or_expr2)); + + // Test commutativity: (A or B) equals (B or A) + auto or_expr3 = std::make_shared(false_expr, true_expr); + EXPECT_TRUE(or_expr1->Equals(*or_expr3)); + + // Test inequality with different expressions + auto or_expr4 = std::make_shared(true_expr, true_expr); + EXPECT_FALSE(or_expr1->Equals(*or_expr4)); + + // Test inequality with different operation types + auto and_expr = std::make_shared(true_expr, false_expr); + EXPECT_FALSE(or_expr1->Equals(*and_expr)); +} + +TEST(ANDTest, Negation) { + // Test De Morgan's law: not(A and B) = (not A) or (not B) + auto true_expr = True::Instance(); + auto false_expr = False::Instance(); + + auto and_expr = std::make_shared(true_expr, false_expr); + auto negated_and = and_expr->Negate(); + + // Should become OR expression + EXPECT_EQ(negated_and->op(), Expression::Operation::kOr); + EXPECT_EQ(negated_and->ToString(), "(false or true)"); +} + +TEST(ANDTest, Equals) { + auto true_expr = True::Instance(); + auto false_expr = False::Instance(); + + // Test basic equality + auto and_expr1 = std::make_shared(true_expr, false_expr); + auto and_expr2 = std::make_shared(true_expr, false_expr); + EXPECT_TRUE(and_expr1->Equals(*and_expr2)); + + // Test commutativity: (A and B) equals (B and A) + auto and_expr3 = std::make_shared(false_expr, true_expr); + EXPECT_TRUE(and_expr1->Equals(*and_expr3)); + + // Test inequality with different expressions + auto and_expr4 = std::make_shared(true_expr, true_expr); + EXPECT_FALSE(and_expr1->Equals(*and_expr4)); + + // Test inequality with different operation types + auto or_expr = std::make_shared(true_expr, false_expr); + EXPECT_FALSE(and_expr1->Equals(*or_expr)); +} + +TEST(ExpressionTest, BaseClassNegateThrowsException) { + // Create a mock expression that doesn't override Negate() + class MockExpression : public Expression { + public: + Operation op() const override { return Operation::kTrue; } + // Deliberately not overriding Negate() to test base class behavior + }; + + auto mock_expr = std::make_shared(); + + // Should throw IcebergError when calling Negate() on base class + EXPECT_THROW(mock_expr->Negate(), IcebergError); +} } // namespace iceberg diff --git a/test/in_memory_catalog_test.cc b/test/in_memory_catalog_test.cc index c76d78878..753dedcea 100644 --- a/test/in_memory_catalog_test.cc +++ b/test/in_memory_catalog_test.cc @@ -19,24 +19,41 @@ #include "iceberg/catalog/in_memory_catalog.h" +#include #include #include +#include "iceberg/arrow/arrow_fs_file_io.h" +#include "iceberg/table.h" +#include "iceberg/table_metadata.h" #include "matchers.h" +#include "temp_file_test_base.h" +#include "test_common.h" namespace iceberg { class InMemoryCatalogTest : public ::testing::Test { protected: void SetUp() override { - file_io_ = nullptr; // TODO(Guotao): A real FileIO instance needs to be constructed. + // generate a unique temporary file path for the test + temp_filepath_ = GenerateUniqueTempFilePathWithSuffix(".metadata.json"); + + file_io_ = std::make_shared( + std::make_shared<::arrow::fs::LocalFileSystem>()); std::unordered_map properties = {{"prop1", "val1"}}; - catalog_ = std::make_unique("test_catalog", file_io_, + catalog_ = std::make_shared("test_catalog", file_io_, "/tmp/warehouse/", properties); } + void TearDown() override { + // Clean up the temporary files created for the table metadata + std::error_code ec; + std::filesystem::remove_all(temp_filepath_, ec); + } + + std::string temp_filepath_; std::shared_ptr file_io_; - std::unique_ptr catalog_; + std::shared_ptr catalog_; }; TEST_F(InMemoryCatalogTest, CatalogName) { @@ -58,6 +75,21 @@ TEST_F(InMemoryCatalogTest, TableExists) { EXPECT_THAT(result, HasValue(::testing::Eq(false))); } +TEST_F(InMemoryCatalogTest, RegisterTable) { + TableIdentifier tableIdent{.ns = {}, .name = "t1"}; + + std::unique_ptr metadata; + ASSERT_NO_FATAL_FAILURE(ReadTableMetadata("TableMetadataV2Valid.json", &metadata)); + + auto status = TableMetadataUtil::Write(*file_io_, temp_filepath_, *metadata); + EXPECT_THAT(status, IsOk()); + + auto table = catalog_->RegisterTable(tableIdent, temp_filepath_); + EXPECT_THAT(table, IsOk()); + ASSERT_EQ(table.value()->name().name, "t1"); + ASSERT_EQ(table.value()->location(), "s3://bucket/test/location"); +} + TEST_F(InMemoryCatalogTest, DropTable) { TableIdentifier tableIdent{.ns = {}, .name = "t1"}; auto result = catalog_->DropTable(tableIdent, false); diff --git a/test/literal_test.cc b/test/literal_test.cc new file mode 100644 index 000000000..6821c569a --- /dev/null +++ b/test/literal_test.cc @@ -0,0 +1,386 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/expression/literal.h" + +#include +#include +#include + +#include + +#include "iceberg/type.h" +#include "matchers.h" + +namespace iceberg { + +// Boolean type tests +TEST(LiteralTest, BooleanBasics) { + auto true_literal = Literal::Boolean(true); + auto false_literal = Literal::Boolean(false); + + EXPECT_EQ(true_literal.type()->type_id(), TypeId::kBoolean); + EXPECT_EQ(false_literal.type()->type_id(), TypeId::kBoolean); + + EXPECT_EQ(true_literal.ToString(), "true"); + EXPECT_EQ(false_literal.ToString(), "false"); +} + +TEST(LiteralTest, BooleanComparison) { + auto true_literal = Literal::Boolean(true); + auto false_literal = Literal::Boolean(false); + auto another_true = Literal::Boolean(true); + + EXPECT_EQ(true_literal <=> another_true, std::partial_ordering::equivalent); + EXPECT_EQ(true_literal <=> false_literal, std::partial_ordering::greater); + EXPECT_EQ(false_literal <=> true_literal, std::partial_ordering::less); +} + +// Int type tests +TEST(LiteralTest, IntBasics) { + auto int_literal = Literal::Int(42); + auto negative_int = Literal::Int(-123); + + EXPECT_EQ(int_literal.type()->type_id(), TypeId::kInt); + EXPECT_EQ(negative_int.type()->type_id(), TypeId::kInt); + + EXPECT_EQ(int_literal.ToString(), "42"); + EXPECT_EQ(negative_int.ToString(), "-123"); +} + +TEST(LiteralTest, IntComparison) { + auto int1 = Literal::Int(10); + auto int2 = Literal::Int(20); + auto int3 = Literal::Int(10); + + EXPECT_EQ(int1 <=> int3, std::partial_ordering::equivalent); + EXPECT_EQ(int1 <=> int2, std::partial_ordering::less); + EXPECT_EQ(int2 <=> int1, std::partial_ordering::greater); +} + +TEST(LiteralTest, IntCastTo) { + auto int_literal = Literal::Int(42); + + // Cast to Long + auto long_result = int_literal.CastTo(std::make_shared()); + ASSERT_THAT(long_result, IsOk()); + EXPECT_EQ(long_result->type()->type_id(), TypeId::kLong); + EXPECT_EQ(long_result->ToString(), "42"); + + // Cast to Float + auto float_result = int_literal.CastTo(std::make_shared()); + ASSERT_THAT(float_result, IsOk()); + EXPECT_EQ(float_result->type()->type_id(), TypeId::kFloat); + + // Cast to Double + auto double_result = int_literal.CastTo(std::make_shared()); + ASSERT_THAT(double_result, IsOk()); + EXPECT_EQ(double_result->type()->type_id(), TypeId::kDouble); +} + +// Long type tests +TEST(LiteralTest, LongBasics) { + auto long_literal = Literal::Long(1234567890L); + auto negative_long = Literal::Long(-9876543210L); + + EXPECT_EQ(long_literal.type()->type_id(), TypeId::kLong); + EXPECT_EQ(negative_long.type()->type_id(), TypeId::kLong); + + EXPECT_EQ(long_literal.ToString(), "1234567890"); + EXPECT_EQ(negative_long.ToString(), "-9876543210"); +} + +TEST(LiteralTest, LongComparison) { + auto long1 = Literal::Long(100L); + auto long2 = Literal::Long(200L); + auto long3 = Literal::Long(100L); + + EXPECT_EQ(long1 <=> long3, std::partial_ordering::equivalent); + EXPECT_EQ(long1 <=> long2, std::partial_ordering::less); + EXPECT_EQ(long2 <=> long1, std::partial_ordering::greater); +} + +TEST(LiteralTest, LongCastTo) { + auto long_literal = Literal::Long(42L); + + // Cast to Int (within range) + auto int_result = long_literal.CastTo(std::make_shared()); + ASSERT_THAT(int_result, IsOk()); + EXPECT_EQ(int_result->type()->type_id(), TypeId::kInt); + EXPECT_EQ(int_result->ToString(), "42"); + + // Cast to Float + auto float_result = long_literal.CastTo(std::make_shared()); + ASSERT_THAT(float_result, IsOk()); + EXPECT_EQ(float_result->type()->type_id(), TypeId::kFloat); + + // Cast to Double + auto double_result = long_literal.CastTo(std::make_shared()); + ASSERT_THAT(double_result, IsOk()); + EXPECT_EQ(double_result->type()->type_id(), TypeId::kDouble); +} + +TEST(LiteralTest, LongCastToIntOverflow) { + // Test overflow cases + auto max_long = + Literal::Long(static_cast(std::numeric_limits::max()) + 1); + auto min_long = + Literal::Long(static_cast(std::numeric_limits::min()) - 1); + + auto max_result = max_long.CastTo(std::make_shared()); + ASSERT_THAT(max_result, IsOk()); + EXPECT_TRUE(max_result->IsAboveMax()); + + auto min_result = min_long.CastTo(std::make_shared()); + ASSERT_THAT(min_result, IsOk()); + EXPECT_TRUE(min_result->IsBelowMin()); +} + +// Float type tests +TEST(LiteralTest, FloatBasics) { + auto float_literal = Literal::Float(3.14f); + auto negative_float = Literal::Float(-2.71f); + + EXPECT_EQ(float_literal.type()->type_id(), TypeId::kFloat); + EXPECT_EQ(negative_float.type()->type_id(), TypeId::kFloat); + + EXPECT_EQ(float_literal.ToString(), "3.140000"); + EXPECT_EQ(negative_float.ToString(), "-2.710000"); +} + +TEST(LiteralTest, FloatComparison) { + auto float1 = Literal::Float(1.5f); + auto float2 = Literal::Float(2.5f); + auto float3 = Literal::Float(1.5f); + + EXPECT_EQ(float1 <=> float3, std::partial_ordering::equivalent); + EXPECT_EQ(float1 <=> float2, std::partial_ordering::less); + EXPECT_EQ(float2 <=> float1, std::partial_ordering::greater); +} + +TEST(LiteralTest, FloatCastTo) { + auto float_literal = Literal::Float(3.14f); + + // Cast to Double + auto double_result = float_literal.CastTo(std::make_shared()); + ASSERT_THAT(double_result, IsOk()); + EXPECT_EQ(double_result->type()->type_id(), TypeId::kDouble); +} + +// Double type tests +TEST(LiteralTest, DoubleBasics) { + auto double_literal = Literal::Double(std::numbers::pi); + auto negative_double = Literal::Double(-std::numbers::e); + + EXPECT_EQ(double_literal.type()->type_id(), TypeId::kDouble); + EXPECT_EQ(negative_double.type()->type_id(), TypeId::kDouble); + + EXPECT_EQ(double_literal.ToString(), "3.141593"); + EXPECT_EQ(negative_double.ToString(), "-2.718282"); +} + +TEST(LiteralTest, DoubleComparison) { + auto double1 = Literal::Double(1.5); + auto double2 = Literal::Double(2.5); + auto double3 = Literal::Double(1.5); + + EXPECT_EQ(double1 <=> double3, std::partial_ordering::equivalent); + EXPECT_EQ(double1 <=> double2, std::partial_ordering::less); + EXPECT_EQ(double2 <=> double1, std::partial_ordering::greater); +} + +// String type tests +TEST(LiteralTest, StringBasics) { + auto string_literal = Literal::String("hello world"); + auto empty_string = Literal::String(""); + + EXPECT_EQ(string_literal.type()->type_id(), TypeId::kString); + EXPECT_EQ(empty_string.type()->type_id(), TypeId::kString); + + EXPECT_EQ(string_literal.ToString(), "hello world"); + EXPECT_EQ(empty_string.ToString(), ""); +} + +TEST(LiteralTest, StringComparison) { + auto string1 = Literal::String("apple"); + auto string2 = Literal::String("banana"); + auto string3 = Literal::String("apple"); + + EXPECT_EQ(string1 <=> string3, std::partial_ordering::equivalent); + EXPECT_EQ(string1 <=> string2, std::partial_ordering::less); + EXPECT_EQ(string2 <=> string1, std::partial_ordering::greater); +} + +// Binary type tests +TEST(LiteralTest, BinaryBasics) { + std::vector data = {0x01, 0x02, 0x03, 0xFF}; + auto binary_literal = Literal::Binary(data); + auto empty_binary = Literal::Binary({}); + + EXPECT_EQ(binary_literal.type()->type_id(), TypeId::kBinary); + EXPECT_EQ(empty_binary.type()->type_id(), TypeId::kBinary); + + EXPECT_EQ(binary_literal.ToString(), "010203FF"); + EXPECT_EQ(empty_binary.ToString(), ""); +} + +TEST(LiteralTest, BinaryComparison) { + std::vector data1 = {0x01, 0x02}; + std::vector data2 = {0x01, 0x03}; + std::vector data3 = {0x01, 0x02}; + + auto binary1 = Literal::Binary(data1); + auto binary2 = Literal::Binary(data2); + auto binary3 = Literal::Binary(data3); + + EXPECT_EQ(binary1 <=> binary3, std::partial_ordering::equivalent); + EXPECT_EQ(binary1 <=> binary2, std::partial_ordering::less); + EXPECT_EQ(binary2 <=> binary1, std::partial_ordering::greater); +} + +// Cross-type comparison tests +TEST(LiteralTest, CrossTypeComparison) { + auto int_literal = Literal::Int(42); + auto string_literal = Literal::String("42"); + + // Different types should return unordered + EXPECT_EQ(int_literal <=> string_literal, std::partial_ordering::unordered); +} + +// Special value tests +TEST(LiteralTest, SpecialValues) { + auto int_literal = Literal::Int(42); + + EXPECT_FALSE(int_literal.IsAboveMax()); + EXPECT_FALSE(int_literal.IsBelowMin()); +} + +// Same type cast test +TEST(LiteralTest, SameTypeCast) { + auto int_literal = Literal::Int(42); + + auto same_type_result = int_literal.CastTo(std::make_shared()); + ASSERT_THAT(same_type_result, IsOk()); + EXPECT_EQ(same_type_result->type()->type_id(), TypeId::kInt); + EXPECT_EQ(same_type_result->ToString(), "42"); +} + +// Float special values tests +TEST(LiteralTest, FloatSpecialValuesComparison) { + // Create special float values + auto neg_nan = Literal::Float(-std::numeric_limits::quiet_NaN()); + auto neg_inf = Literal::Float(-std::numeric_limits::infinity()); + auto neg_value = Literal::Float(-1.5f); + auto neg_zero = Literal::Float(-0.0f); + auto pos_zero = Literal::Float(0.0f); + auto pos_value = Literal::Float(1.5f); + auto pos_inf = Literal::Float(std::numeric_limits::infinity()); + auto pos_nan = Literal::Float(std::numeric_limits::quiet_NaN()); + + // Test the ordering: -NaN < -Infinity < -value < -0 < 0 < value < Infinity < NaN + EXPECT_EQ(neg_nan <=> neg_inf, std::partial_ordering::less); + EXPECT_EQ(neg_inf <=> neg_value, std::partial_ordering::less); + EXPECT_EQ(neg_value <=> neg_zero, std::partial_ordering::less); + EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less); + EXPECT_EQ(pos_zero <=> pos_value, std::partial_ordering::less); + EXPECT_EQ(pos_value <=> pos_inf, std::partial_ordering::less); + EXPECT_EQ(pos_inf <=> pos_nan, std::partial_ordering::less); +} + +TEST(LiteralTest, FloatNaNComparison) { + auto nan1 = Literal::Float(std::numeric_limits::quiet_NaN()); + auto nan2 = Literal::Float(std::numeric_limits::quiet_NaN()); + auto signaling_nan = Literal::Float(std::numeric_limits::signaling_NaN()); + + // NaN should be equal to itself in strong ordering + EXPECT_EQ(nan1 <=> nan2, std::partial_ordering::equivalent); + EXPECT_EQ(nan1 <=> signaling_nan, std::partial_ordering::equivalent); +} + +TEST(LiteralTest, FloatInfinityComparison) { + auto neg_inf = Literal::Float(-std::numeric_limits::infinity()); + auto pos_inf = Literal::Float(std::numeric_limits::infinity()); + auto max_value = Literal::Float(std::numeric_limits::max()); + auto min_value = Literal::Float(std::numeric_limits::lowest()); + + EXPECT_EQ(neg_inf <=> min_value, std::partial_ordering::less); + EXPECT_EQ(max_value <=> pos_inf, std::partial_ordering::less); + EXPECT_EQ(neg_inf <=> pos_inf, std::partial_ordering::less); +} + +TEST(LiteralTest, FloatZeroComparison) { + auto neg_zero = Literal::Float(-0.0f); + auto pos_zero = Literal::Float(0.0f); + + // -0 should be less than +0 + EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less); +} + +// Double special values tests +TEST(LiteralTest, DoubleSpecialValuesComparison) { + // Create special double values + auto neg_nan = Literal::Double(-std::numeric_limits::quiet_NaN()); + auto neg_inf = Literal::Double(-std::numeric_limits::infinity()); + auto neg_value = Literal::Double(-1.5); + auto neg_zero = Literal::Double(-0.0); + auto pos_zero = Literal::Double(0.0); + auto pos_value = Literal::Double(1.5); + auto pos_inf = Literal::Double(std::numeric_limits::infinity()); + auto pos_nan = Literal::Double(std::numeric_limits::quiet_NaN()); + + // Test the ordering: -NaN < -Infinity < -value < -0 < 0 < value < Infinity < NaN + EXPECT_EQ(neg_nan <=> neg_inf, std::partial_ordering::less); + EXPECT_EQ(neg_inf <=> neg_value, std::partial_ordering::less); + EXPECT_EQ(neg_value <=> neg_zero, std::partial_ordering::less); + EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less); + EXPECT_EQ(pos_zero <=> pos_value, std::partial_ordering::less); + EXPECT_EQ(pos_value <=> pos_inf, std::partial_ordering::less); + EXPECT_EQ(pos_inf <=> pos_nan, std::partial_ordering::less); +} + +TEST(LiteralTest, DoubleNaNComparison) { + auto nan1 = Literal::Double(std::numeric_limits::quiet_NaN()); + auto nan2 = Literal::Double(std::numeric_limits::quiet_NaN()); + auto signaling_nan = Literal::Double(std::numeric_limits::signaling_NaN()); + + // NaN should be equal to itself in strong ordering + EXPECT_EQ(nan1 <=> nan2, std::partial_ordering::equivalent); + EXPECT_EQ(nan1 <=> signaling_nan, std::partial_ordering::equivalent); +} + +TEST(LiteralTest, DoubleInfinityComparison) { + auto neg_inf = Literal::Double(-std::numeric_limits::infinity()); + auto pos_inf = Literal::Double(std::numeric_limits::infinity()); + auto max_value = Literal::Double(std::numeric_limits::max()); + auto min_value = Literal::Double(std::numeric_limits::lowest()); + + EXPECT_EQ(neg_inf <=> min_value, std::partial_ordering::less); + EXPECT_EQ(max_value <=> pos_inf, std::partial_ordering::less); + EXPECT_EQ(neg_inf <=> pos_inf, std::partial_ordering::less); +} + +TEST(LiteralTest, DoubleZeroComparison) { + auto neg_zero = Literal::Double(-0.0); + auto pos_zero = Literal::Double(0.0); + + // -0 should be less than +0 + EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less); +} + +} // namespace iceberg diff --git a/test/metadata_io_test.cc b/test/metadata_io_test.cc index 7d987e25b..432101b07 100644 --- a/test/metadata_io_test.cc +++ b/test/metadata_io_test.cc @@ -50,27 +50,25 @@ class MetadataIOTest : public TempFileTestBase { /*optional=*/false); auto schema = std::make_shared(std::move(schema_fields), /*schema_id=*/1); - TableMetadata metadata{ - .format_version = 1, - .table_uuid = "1234567890", - .location = "s3://bucket/path", - .last_sequence_number = 0, - .schemas = {schema}, - .current_schema_id = 1, - .default_spec_id = 0, - .last_partition_id = 0, - .properties = {{"key", "value"}}, - .current_snapshot_id = 3051729675574597004, - .snapshots = {std::make_shared(Snapshot{ - .snapshot_id = 3051729675574597004, - .sequence_number = 0, - .timestamp_ms = TimePointMsFromUnixMs(1515100955770).value(), - .manifest_list = "s3://a/b/1.avro", - .summary = {{"operation", "append"}}, - })}, - .default_sort_order_id = 0, - .next_row_id = 0}; - return metadata; + return TableMetadata{.format_version = 1, + .table_uuid = "1234567890", + .location = "s3://bucket/path", + .last_sequence_number = 0, + .schemas = {schema}, + .current_schema_id = 1, + .default_spec_id = 0, + .last_partition_id = 0, + .properties = {{"key", "value"}}, + .current_snapshot_id = 3051729675574597004, + .snapshots = {std::make_shared(Snapshot{ + .snapshot_id = 3051729675574597004, + .sequence_number = 0, + .timestamp_ms = TimePointMsFromUnixMs(1515100955770).value(), + .manifest_list = "s3://a/b/1.avro", + .summary = {{"operation", "append"}}, + })}, + .default_sort_order_id = 0, + .next_row_id = 0}; } std::shared_ptr io_; diff --git a/test/metadata_serde_test.cc b/test/metadata_serde_test.cc index 4a78e8ce4..4c78f652f 100644 --- a/test/metadata_serde_test.cc +++ b/test/metadata_serde_test.cc @@ -20,13 +20,11 @@ #include #include #include -#include #include #include #include -#include "iceberg/json_internal.h" #include "iceberg/partition_field.h" #include "iceberg/partition_spec.h" #include "iceberg/schema.h" @@ -35,9 +33,9 @@ #include "iceberg/sort_field.h" #include "iceberg/sort_order.h" #include "iceberg/table_metadata.h" -#include "iceberg/test/test_config.h" #include "iceberg/transform.h" #include "iceberg/type.h" +#include "test_common.h" namespace iceberg { @@ -46,33 +44,6 @@ namespace { class MetadataSerdeTest : public ::testing::Test { protected: void SetUp() override {} - - static std::string GetResourcePath(const std::string& file_name) { - return std::string(ICEBERG_TEST_RESOURCES) + "/" + file_name; - } - - static void ReadJsonFile(const std::string& file_name, std::string* content) { - std::filesystem::path path{GetResourcePath(file_name)}; - ASSERT_TRUE(std::filesystem::exists(path)) - << "File does not exist: " << path.string(); - - std::ifstream file(path); - std::stringstream buffer; - buffer << file.rdbuf(); - *content = buffer.str(); - } - - static void ReadTableMetadata(const std::string& file_name, - std::unique_ptr* metadata) { - std::string json_content; - ReadJsonFile(file_name, &json_content); - - nlohmann::json json = nlohmann::json::parse(json_content); - auto result = TableMetadataFromJson(json); - ASSERT_TRUE(result.has_value()) << "Failed to parse table metadata from " << file_name - << ": " << result.error().message; - *metadata = std::move(result.value()); - } }; } // namespace @@ -112,6 +83,8 @@ TEST_F(MetadataSerdeTest, DeserializeV1Valid) { auto partition_spec = metadata->PartitionSpec(); ASSERT_TRUE(partition_spec.has_value()); EXPECT_EQ(*(partition_spec.value().get()), *expected_spec); + auto snapshot = metadata->Snapshot(); + ASSERT_FALSE(snapshot.has_value()); } TEST_F(MetadataSerdeTest, DeserializeV2Valid) { @@ -163,7 +136,11 @@ TEST_F(MetadataSerdeTest, DeserializeV2Valid) { ASSERT_TRUE(sort_order.has_value()); EXPECT_EQ(*(sort_order.value().get()), *expected_sort_order); + // Compare snapshot EXPECT_EQ(metadata->current_snapshot_id, 3055729675574597004); + auto snapshot = metadata->Snapshot(); + ASSERT_TRUE(snapshot.has_value()); + EXPECT_EQ(snapshot.value()->snapshot_id, 3055729675574597004); // Compare snapshots std::vector expected_snapshots{ diff --git a/test/table_test.cc b/test/table_test.cc new file mode 100644 index 000000000..56f475de0 --- /dev/null +++ b/test/table_test.cc @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/table.h" + +#include +#include +#include +#include +#include + +#include +#include + +#include "iceberg/partition_spec.h" +#include "iceberg/schema.h" +#include "iceberg/snapshot.h" +#include "iceberg/table_metadata.h" +#include "test_common.h" + +namespace iceberg { + +TEST(Table, TableV1) { + std::unique_ptr metadata; + ASSERT_NO_FATAL_FAILURE(ReadTableMetadata("TableMetadataV1Valid.json", &metadata)); + TableIdentifier tableIdent{.ns = {}, .name = "test_table_v1"}; + Table table(tableIdent, std::move(metadata), "s3://bucket/test/location/meta/", nullptr, + nullptr); + ASSERT_EQ(table.name().name, "test_table_v1"); + + // Check table schema + auto schema = table.schema(); + ASSERT_TRUE(schema.has_value()); + ASSERT_EQ(schema.value()->fields().size(), 3); + auto schemas = table.schemas(); + ASSERT_TRUE(schemas->empty()); + + // Check table spec + auto spec = table.spec(); + ASSERT_TRUE(spec.has_value()); + auto specs = table.specs(); + ASSERT_EQ(1UL, specs->size()); + + // Check table sort_order + auto sort_order = table.sort_order(); + ASSERT_TRUE(sort_order.has_value()); + auto sort_orders = table.sort_orders(); + ASSERT_EQ(1UL, sort_orders->size()); + + // Check table location + auto location = table.location(); + ASSERT_EQ(location, "s3://bucket/test/location"); + + // Check table snapshots + auto snapshots = table.snapshots(); + ASSERT_TRUE(snapshots.empty()); + + auto io = table.io(); + ASSERT_TRUE(io == nullptr); +} + +TEST(Table, TableV2) { + std::unique_ptr metadata; + ASSERT_NO_FATAL_FAILURE(ReadTableMetadata("TableMetadataV2Valid.json", &metadata)); + TableIdentifier tableIdent{.ns = {}, .name = "test_table_v2"}; + + Table table(tableIdent, std::move(metadata), "s3://bucket/test/location/meta/", nullptr, + nullptr); + ASSERT_EQ(table.name().name, "test_table_v2"); + + // Check table schema + auto schema = table.schema(); + ASSERT_TRUE(schema.has_value()); + ASSERT_EQ(schema.value()->fields().size(), 3); + auto schemas = table.schemas(); + ASSERT_FALSE(schemas->empty()); + + // Check partition spec + auto spec = table.spec(); + ASSERT_TRUE(spec.has_value()); + auto specs = table.specs(); + ASSERT_EQ(1UL, specs->size()); + + // Check sort order + auto sort_order = table.sort_order(); + ASSERT_TRUE(sort_order.has_value()); + auto sort_orders = table.sort_orders(); + ASSERT_EQ(1UL, sort_orders->size()); + + // Check table location + auto location = table.location(); + ASSERT_EQ(location, "s3://bucket/test/location"); + + // Check snapshot + auto snapshots = table.snapshots(); + ASSERT_EQ(2UL, snapshots.size()); + auto snapshot = table.current_snapshot(); + ASSERT_TRUE(snapshot.has_value()); + snapshot = table.SnapshotById(snapshot.value()->snapshot_id); + ASSERT_TRUE(snapshot.has_value()); + auto invalid_snapshot_id = 9999; + snapshot = table.SnapshotById(invalid_snapshot_id); + ASSERT_FALSE(snapshot.has_value()); +} + +} // namespace iceberg diff --git a/test/temp_file_test_base.h b/test/temp_file_test_base.h index 8e20e2ca5..39714118b 100644 --- a/test/temp_file_test_base.h +++ b/test/temp_file_test_base.h @@ -31,6 +31,46 @@ namespace iceberg { +/// \brief Get the test name for inclusion in the filename +inline std::string TestInfo() { + if (const auto info = ::testing::UnitTest::GetInstance()->current_test_info(); info) { + return std::format("{}_{}", info->test_suite_name(), info->name()); + } + return "unknown_test"; +} + +/// \brief Helper to generate a random alphanumeric string for unique filenames +inline std::string GenerateRandomString(size_t length) { + const std::string_view chars = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dist(0, static_cast(chars.size() - 1)); + + std::string result; + result.reserve(length); + for (size_t i = 0; i < length; ++i) { + result += chars[dist(gen)]; + } + return result; +} + +/// \brief Generates a unique temporary filepath that works across platforms +inline std::string GenerateUniqueTempFilePath() { + std::filesystem::path temp_dir = std::filesystem::temp_directory_path(); + std::string file_name = + std::format("iceberg_test_{}_{}.tmp", TestInfo(), GenerateRandomString(8)); + return (temp_dir / file_name).string(); +} + +/// \brief Create a temporary filepath with the specified suffix/extension +inline std::string GenerateUniqueTempFilePathWithSuffix(const std::string& suffix) { + std::filesystem::path temp_dir = std::filesystem::temp_directory_path(); + std::string file_name = + std::format("iceberg_test_{}_{}{}", TestInfo(), GenerateRandomString(8), suffix); + return (temp_dir / file_name).string(); +} + /// A base class for tests that need to create and manage temporary files. /// Provides utilities for creating platform-independent temporary files /// and ensures proper cleanup after tests run. @@ -83,46 +123,6 @@ class TempFileTestBase : public ::testing::Test { } } - /// \brief Generates a unique temporary filepath that works across platforms - std::string GenerateUniqueTempFilePath() const { - std::filesystem::path temp_dir = std::filesystem::temp_directory_path(); - std::string file_name = - std::format("iceberg_test_{}_{}.tmp", TestInfo(), GenerateRandomString(8)); - return (temp_dir / file_name).string(); - } - - /// \brief Create a temporary filepath with the specified suffix/extension - std::string GenerateUniqueTempFilePathWithSuffix(const std::string& suffix) { - std::filesystem::path temp_dir = std::filesystem::temp_directory_path(); - std::string file_name = - std::format("iceberg_test_{}_{}{}", TestInfo(), GenerateRandomString(8), suffix); - return (temp_dir / file_name).string(); - } - - /// \brief Helper to generate a random alphanumeric string for unique filenames - std::string GenerateRandomString(size_t length) const { - const std::string_view chars = - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution<> dist(0, static_cast(chars.size() - 1)); - - std::string result; - result.reserve(length); - for (size_t i = 0; i < length; ++i) { - result += chars[dist(gen)]; - } - return result; - } - - /// \brief Get the test name for inclusion in the filename - std::string TestInfo() const { - if (const auto info = ::testing::UnitTest::GetInstance()->current_test_info(); info) { - return std::format("{}_{}", info->test_suite_name(), info->name()); - } - return "unknown_test"; - } - /// \brief Creates a new temporary filepath and registers it for cleanup std::string CreateNewTempFilePath() { std::string filepath = GenerateUniqueTempFilePath(); diff --git a/test/test_common.cc b/test/test_common.cc new file mode 100644 index 000000000..25fa8f2c7 --- /dev/null +++ b/test/test_common.cc @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "test_common.h" + +#include +#include +#include +#include + +#include +#include + +#include "iceberg/json_internal.h" +#include "iceberg/test/test_config.h" + +namespace iceberg { + +std::string GetResourcePath(const std::string& file_name) { + return std::string(ICEBERG_TEST_RESOURCES) + "/" + file_name; +} + +void ReadJsonFile(const std::string& file_name, std::string* content) { + std::filesystem::path path{GetResourcePath(file_name)}; + ASSERT_TRUE(std::filesystem::exists(path)) << "File does not exist: " << path.string(); + + std::ifstream file(path); + std::stringstream buffer; + buffer << file.rdbuf(); + *content = buffer.str(); +} + +void ReadTableMetadata(const std::string& file_name, + std::unique_ptr* metadata) { + std::string json_content; + ReadJsonFile(file_name, &json_content); + + nlohmann::json json = nlohmann::json::parse(json_content); + auto result = TableMetadataFromJson(json); + ASSERT_TRUE(result.has_value()) << "Failed to parse table metadata from " << file_name + << ": " << result.error().message; + *metadata = std::move(result.value()); +} + +} // namespace iceberg diff --git a/test/test_common.h b/test/test_common.h new file mode 100644 index 000000000..a9dba8cad --- /dev/null +++ b/test/test_common.h @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include + +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief Get the full path to a resource file in the test resources directory +std::string GetResourcePath(const std::string& file_name); + +/// \brief Read a JSON file from the test resources directory +void ReadJsonFile(const std::string& file_name, std::string* content); + +/// \brief Read table metadata from a JSON file in the test resources directory +void ReadTableMetadata(const std::string& file_name, + std::unique_ptr* metadata); + +} // namespace iceberg