From 02fb059088d5583ace72ca3a2f9f06b52f711b45 Mon Sep 17 00:00:00 2001 From: Li Feiyang Date: Wed, 26 Nov 2025 17:40:29 +0800 Subject: [PATCH 1/2] refactor: move type visitor classes to type_util --- src/iceberg/CMakeLists.txt | 1 + src/iceberg/schema.cc | 292 +--------------------------------- src/iceberg/util/meson.build | 5 + src/iceberg/util/type_util.cc | 149 +++++++++++++++++ src/iceberg/util/type_util.h | 197 ++++++++++++++++++++++- 5 files changed, 352 insertions(+), 292 deletions(-) create mode 100644 src/iceberg/util/type_util.cc diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 9b3e3b892..d5429808c 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -77,6 +77,7 @@ set(ICEBERG_SOURCES util/temporal_util.cc util/timepoint.cc util/truncate_util.cc + util/type_util.cc util/uuid.cc) set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 8719f22b5..9b0fa5d23 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -29,91 +29,11 @@ #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/formatter_internal.h" #include "iceberg/util/macros.h" +#include "iceberg/util/type_util.h" #include "iceberg/util/visit_type.h" namespace iceberg { -class IdToFieldVisitor { - public: - explicit IdToFieldVisitor( - std::unordered_map>& - id_to_field); - Status Visit(const PrimitiveType& type); - Status Visit(const NestedType& type); - - private: - std::unordered_map>& id_to_field_; -}; - -class NameToIdVisitor { - public: - explicit NameToIdVisitor( - std::unordered_map>& name_to_id, - bool case_sensitive = true, - std::function quoting_func = {}); - Status Visit(const ListType& type, const std::string& path, - const std::string& short_path); - Status Visit(const MapType& type, const std::string& path, - const std::string& short_path); - Status Visit(const StructType& type, const std::string& path, - const std::string& short_path); - Status Visit(const PrimitiveType& type, const std::string& path, - const std::string& short_path); - void Finish(); - - private: - std::string BuildPath(std::string_view prefix, std::string_view field_name, - bool case_sensitive); - - private: - bool case_sensitive_; - std::unordered_map>& name_to_id_; - std::unordered_map> short_name_to_id_; - std::function quoting_func_; -}; - -class PositionPathVisitor { - public: - Status Visit(const PrimitiveType& type) { - if (current_field_id_ == kUnassignedFieldId) { - return InvalidSchema("Current field id is not assigned, type: {}", type.ToString()); - } - - if (auto ret = position_path_.try_emplace(current_field_id_, current_path_); - !ret.second) { - return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}", - current_field_id_, ret.first->second, current_path_); - } - - return {}; - } - - Status Visit(const StructType& type) { - for (size_t i = 0; i < type.fields().size(); ++i) { - const auto& field = type.fields()[i]; - current_field_id_ = field.field_id(); - current_path_.push_back(i); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this)); - current_path_.pop_back(); - } - return {}; - } - - // Non-struct types are not supported yet, but it is not an error. - Status Visit(const ListType& type) { return {}; } - Status Visit(const MapType& type) { return {}; } - - std::unordered_map> Finish() { - return std::move(position_path_); - } - - private: - constexpr static int32_t kUnassignedFieldId = -1; - int32_t current_field_id_ = kUnassignedFieldId; - std::vector current_path_; - std::unordered_map> position_path_; -}; - Schema::Schema(std::vector fields, std::optional schema_id) : StructType(std::move(fields)), schema_id_(schema_id) {} @@ -210,216 +130,6 @@ Result> Schema::GetAccessorById( return NotFound("Cannot get accessor for field id: {}", field_id); } -IdToFieldVisitor::IdToFieldVisitor( - std::unordered_map>& id_to_field) - : id_to_field_(id_to_field) {} - -Status IdToFieldVisitor::Visit(const PrimitiveType& type) { return {}; } - -Status IdToFieldVisitor::Visit(const NestedType& type) { - const auto& nested = internal::checked_cast(type); - const auto& fields = nested.fields(); - for (const auto& field : fields) { - auto it = id_to_field_.try_emplace(field.field_id(), std::cref(field)); - if (!it.second) { - return InvalidSchema("Duplicate field id found: {}", field.field_id()); - } - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this)); - } - return {}; -} - -NameToIdVisitor::NameToIdVisitor( - std::unordered_map>& name_to_id, - bool case_sensitive, std::function quoting_func) - : case_sensitive_(case_sensitive), - name_to_id_(name_to_id), - quoting_func_(std::move(quoting_func)) {} - -Status NameToIdVisitor::Visit(const ListType& type, const std::string& path, - const std::string& short_path) { - const auto& field = type.fields()[0]; - std::string new_path = BuildPath(path, field.name(), case_sensitive_); - std::string new_short_path; - if (field.type()->type_id() == TypeId::kStruct) { - new_short_path = short_path; - } else { - new_short_path = BuildPath(short_path, field.name(), case_sensitive_); - } - auto it = name_to_id_.try_emplace(new_path, field.field_id()); - if (!it.second) { - return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}", - it.first->first, it.first->second, field.field_id()); - } - short_name_to_id_.try_emplace(new_short_path, field.field_id()); - ICEBERG_RETURN_UNEXPECTED( - VisitTypeInline(*field.type(), this, new_path, new_short_path)); - return {}; -} - -Status NameToIdVisitor::Visit(const MapType& type, const std::string& path, - const std::string& short_path) { - std::string new_path, new_short_path; - const auto& fields = type.fields(); - for (const auto& field : fields) { - new_path = BuildPath(path, field.name(), case_sensitive_); - if (field.name() == MapType::kValueName && - field.type()->type_id() == TypeId::kStruct) { - new_short_path = short_path; - } else { - new_short_path = BuildPath(short_path, field.name(), case_sensitive_); - } - auto it = name_to_id_.try_emplace(new_path, field.field_id()); - if (!it.second) { - return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}", - it.first->first, it.first->second, field.field_id()); - } - short_name_to_id_.try_emplace(new_short_path, field.field_id()); - ICEBERG_RETURN_UNEXPECTED( - VisitTypeInline(*field.type(), this, new_path, new_short_path)); - } - return {}; -} - -Status NameToIdVisitor::Visit(const StructType& type, const std::string& path, - const std::string& short_path) { - const auto& fields = type.fields(); - std::string new_path, new_short_path; - for (const auto& field : fields) { - new_path = BuildPath(path, field.name(), case_sensitive_); - new_short_path = BuildPath(short_path, field.name(), case_sensitive_); - auto it = name_to_id_.try_emplace(new_path, field.field_id()); - if (!it.second) { - return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}", - it.first->first, it.first->second, field.field_id()); - } - short_name_to_id_.try_emplace(new_short_path, field.field_id()); - ICEBERG_RETURN_UNEXPECTED( - VisitTypeInline(*field.type(), this, new_path, new_short_path)); - } - return {}; -} - -Status NameToIdVisitor::Visit(const PrimitiveType& type, const std::string& path, - const std::string& short_path) { - return {}; -} - -std::string NameToIdVisitor::BuildPath(std::string_view prefix, - std::string_view field_name, bool case_sensitive) { - std::string quoted_name; - if (!quoting_func_) { - quoted_name = std::string(field_name); - } else { - quoted_name = quoting_func_(field_name); - } - if (case_sensitive) { - return prefix.empty() ? quoted_name : std::string(prefix) + "." + quoted_name; - } - return prefix.empty() ? StringUtils::ToLower(quoted_name) - : std::string(prefix) + "." + StringUtils::ToLower(quoted_name); -} - -void NameToIdVisitor::Finish() { - for (auto&& it : short_name_to_id_) { - name_to_id_.try_emplace(it.first, it.second); - } -} - -/// \brief Visitor for pruning columns based on selected field IDs. -/// -/// This visitor traverses a schema and creates a projected version containing only -/// the specified fields. When `select_full_types` is true, a field with all its -/// sub-fields are selected if its field-id has been selected; otherwise, only leaf -/// fields of selected field-ids are selected. -/// -/// \note It returns an error when projection is not successful. -class PruneColumnVisitor { - public: - PruneColumnVisitor(const std::unordered_set& selected_ids, - bool select_full_types) - : selected_ids_(selected_ids), select_full_types_(select_full_types) {} - - Result> Visit(const std::shared_ptr& type) const { - switch (type->type_id()) { - case TypeId::kStruct: - return Visit(internal::checked_pointer_cast(type)); - case TypeId::kList: - return Visit(internal::checked_pointer_cast(type)); - case TypeId::kMap: - return Visit(internal::checked_pointer_cast(type)); - default: - return nullptr; - } - } - - Result> Visit(const SchemaField& field) const { - if (selected_ids_.contains(field.field_id())) { - return (select_full_types_ || field.type()->is_primitive()) ? field.type() - : Visit(field.type()); - } - return Visit(field.type()); - } - - static SchemaField MakeField(const SchemaField& field, std::shared_ptr type) { - return {field.field_id(), std::string(field.name()), std::move(type), - field.optional(), std::string(field.doc())}; - } - - Result> Visit(const std::shared_ptr& type) const { - bool same_types = true; - std::vector selected_fields; - for (const auto& field : type->fields()) { - ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field)); - if (child_type) { - same_types = same_types && (child_type == field.type()); - selected_fields.emplace_back(MakeField(field, std::move(child_type))); - } - } - - if (selected_fields.empty()) { - return nullptr; - } else if (same_types && selected_fields.size() == type->fields().size()) { - return type; - } - return std::make_shared(std::move(selected_fields)); - } - - Result> Visit(const std::shared_ptr& type) const { - const auto& elem_field = type->fields()[0]; - ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field)); - if (elem_type == nullptr) { - return nullptr; - } else if (elem_type == elem_field.type()) { - return type; - } - return std::make_shared(MakeField(elem_field, std::move(elem_type))); - } - - Result> Visit(const std::shared_ptr& type) const { - const auto& key_field = type->fields()[0]; - const auto& value_field = type->fields()[1]; - ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field)); - ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field)); - - if (key_type == nullptr && value_type == nullptr) { - return nullptr; - } else if (value_type == value_field.type() && - (key_type == key_field.type() || key_type == nullptr)) { - return type; - } else if (value_type == nullptr) { - return InvalidArgument("Cannot project Map without value field"); - } - return std::make_shared( - (key_type == nullptr ? key_field : MakeField(key_field, std::move(key_type))), - MakeField(value_field, std::move(value_type))); - } - - private: - const std::unordered_set& selected_ids_; - const bool select_full_types_; -}; - Result> Schema::Select(std::span names, bool case_sensitive) const { const std::string kAllColumns = "*"; diff --git a/src/iceberg/util/meson.build b/src/iceberg/util/meson.build index 19c5ed1dc..30c42d02d 100644 --- a/src/iceberg/util/meson.build +++ b/src/iceberg/util/meson.build @@ -17,19 +17,24 @@ install_headers( [ + 'bucket_util.h', 'checked_cast.h', 'config.h', + 'conversions.h', 'decimal.h', 'endian.h', 'formattable.h', 'formatter.h', 'int128.h', + 'lazy.h', 'macros.h', 'partition_value_util.h', 'string_util.h', + 'temporal_util.h', 'timepoint.h', 'truncate_util.h', 'type_util.h', + 'uuid.h', 'visitor_generate.h', 'visit_type.h', ], diff --git a/src/iceberg/util/type_util.cc b/src/iceberg/util/type_util.cc new file mode 100644 index 000000000..773c0e60a --- /dev/null +++ b/src/iceberg/util/type_util.cc @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/type_util.h" + +#include "iceberg/result.h" +#include "iceberg/util/checked_cast.h" +#include "iceberg/util/string_util.h" +#include "iceberg/util/visit_type.h" + +namespace iceberg { + +// IdToFieldVisitor implementation + +IdToFieldVisitor::IdToFieldVisitor( + std::unordered_map>& id_to_field) + : id_to_field_(id_to_field) {} + +Status IdToFieldVisitor::Visit(const PrimitiveType& type) { return {}; } + +Status IdToFieldVisitor::Visit(const NestedType& type) { + const auto& nested = internal::checked_cast(type); + const auto& fields = nested.fields(); + for (const auto& field : fields) { + auto it = id_to_field_.try_emplace(field.field_id(), std::cref(field)); + if (!it.second) { + return InvalidSchema("Duplicate field id found: {}", field.field_id()); + } + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this)); + } + return {}; +} + +// NameToIdVisitor implementation + +NameToIdVisitor::NameToIdVisitor( + std::unordered_map>& name_to_id, + bool case_sensitive, std::function quoting_func) + : case_sensitive_(case_sensitive), + name_to_id_(name_to_id), + quoting_func_(std::move(quoting_func)) {} + +Status NameToIdVisitor::Visit(const ListType& type, const std::string& path, + const std::string& short_path) { + const auto& field = type.fields()[0]; + std::string new_path = BuildPath(path, field.name(), case_sensitive_); + std::string new_short_path; + if (field.type()->type_id() == TypeId::kStruct) { + new_short_path = short_path; + } else { + new_short_path = BuildPath(short_path, field.name(), case_sensitive_); + } + auto it = name_to_id_.try_emplace(new_path, field.field_id()); + if (!it.second) { + return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}", + it.first->first, it.first->second, field.field_id()); + } + short_name_to_id_.try_emplace(new_short_path, field.field_id()); + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*field.type(), this, new_path, new_short_path)); + return {}; +} + +Status NameToIdVisitor::Visit(const MapType& type, const std::string& path, + const std::string& short_path) { + std::string new_path, new_short_path; + const auto& fields = type.fields(); + for (const auto& field : fields) { + new_path = BuildPath(path, field.name(), case_sensitive_); + if (field.name() == MapType::kValueName && + field.type()->type_id() == TypeId::kStruct) { + new_short_path = short_path; + } else { + new_short_path = BuildPath(short_path, field.name(), case_sensitive_); + } + auto it = name_to_id_.try_emplace(new_path, field.field_id()); + if (!it.second) { + return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}", + it.first->first, it.first->second, field.field_id()); + } + short_name_to_id_.try_emplace(new_short_path, field.field_id()); + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*field.type(), this, new_path, new_short_path)); + } + return {}; +} + +Status NameToIdVisitor::Visit(const StructType& type, const std::string& path, + const std::string& short_path) { + const auto& fields = type.fields(); + std::string new_path, new_short_path; + for (const auto& field : fields) { + new_path = BuildPath(path, field.name(), case_sensitive_); + new_short_path = BuildPath(short_path, field.name(), case_sensitive_); + auto it = name_to_id_.try_emplace(new_path, field.field_id()); + if (!it.second) { + return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}", + it.first->first, it.first->second, field.field_id()); + } + short_name_to_id_.try_emplace(new_short_path, field.field_id()); + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*field.type(), this, new_path, new_short_path)); + } + return {}; +} + +Status NameToIdVisitor::Visit(const PrimitiveType& type, const std::string& path, + const std::string& short_path) { + return {}; +} + +std::string NameToIdVisitor::BuildPath(std::string_view prefix, + std::string_view field_name, bool case_sensitive) { + std::string quoted_name; + if (!quoting_func_) { + quoted_name = std::string(field_name); + } else { + quoted_name = quoting_func_(field_name); + } + if (case_sensitive) { + return prefix.empty() ? quoted_name : std::string(prefix) + "." + quoted_name; + } + return prefix.empty() ? StringUtils::ToLower(quoted_name) + : std::string(prefix) + "." + StringUtils::ToLower(quoted_name); +} + +void NameToIdVisitor::Finish() { + for (auto&& it : short_name_to_id_) { + name_to_id_.try_emplace(it.first, it.second); + } +} + +} // namespace iceberg diff --git a/src/iceberg/util/type_util.h b/src/iceberg/util/type_util.h index cad9da54a..cfd06e270 100644 --- a/src/iceberg/util/type_util.h +++ b/src/iceberg/util/type_util.h @@ -20,16 +20,211 @@ #pragma once #include +#include +#include #include +#include +#include +#include +#include +#include +#include "iceberg/result.h" +#include "iceberg/schema_field.h" #include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" +#include "iceberg/util/string_util.h" +#include "iceberg/util/visit_type.h" /// \file iceberg/util/type_util.h -/// Utility functions for Iceberg types. +/// Utility functions and visitors for Iceberg types. namespace iceberg { +/// \brief Visitor for building a map from field ID to SchemaField reference. +/// Corresponds to Java's IndexById visitor. +class IdToFieldVisitor { + public: + explicit IdToFieldVisitor( + std::unordered_map>& + id_to_field); + Status Visit(const PrimitiveType& type); + Status Visit(const NestedType& type); + + private: + std::unordered_map>& id_to_field_; +}; + +/// \brief Visitor for building a map from field name to field ID. +/// Corresponds to Java's IndexByName visitor. +class NameToIdVisitor { + public: + explicit NameToIdVisitor( + std::unordered_map>& name_to_id, + bool case_sensitive = true, + std::function quoting_func = {}); + Status Visit(const ListType& type, const std::string& path, + const std::string& short_path); + Status Visit(const MapType& type, const std::string& path, + const std::string& short_path); + Status Visit(const StructType& type, const std::string& path, + const std::string& short_path); + Status Visit(const PrimitiveType& type, const std::string& path, + const std::string& short_path); + void Finish(); + + private: + std::string BuildPath(std::string_view prefix, std::string_view field_name, + bool case_sensitive); + + private: + bool case_sensitive_; + std::unordered_map>& name_to_id_; + std::unordered_map> short_name_to_id_; + std::function quoting_func_; +}; + +/// \brief Visitor for building a map from field ID to position path. +/// Used for efficient field access in StructLike. +class PositionPathVisitor { + public: + Status Visit(const PrimitiveType& type) { + if (current_field_id_ == kUnassignedFieldId) { + return InvalidSchema("Current field id is not assigned, type: {}", type.ToString()); + } + + if (auto ret = position_path_.try_emplace(current_field_id_, current_path_); + !ret.second) { + return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}", + current_field_id_, ret.first->second, current_path_); + } + + return {}; + } + + Status Visit(const StructType& type) { + for (size_t i = 0; i < type.fields().size(); ++i) { + const auto& field = type.fields()[i]; + current_field_id_ = field.field_id(); + current_path_.push_back(i); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this)); + current_path_.pop_back(); + } + return {}; + } + + // Non-struct types are not supported yet, but it is not an error. + Status Visit(const ListType& type) { return {}; } + Status Visit(const MapType& type) { return {}; } + + std::unordered_map> Finish() { + return std::move(position_path_); + } + + private: + constexpr static int32_t kUnassignedFieldId = -1; + int32_t current_field_id_ = kUnassignedFieldId; + std::vector current_path_; + std::unordered_map> position_path_; +}; + +/// \brief Visitor for pruning columns based on selected field IDs. +/// Corresponds to Java's PruneColumns visitor. +/// +/// This visitor traverses a schema and creates a projected version containing only +/// the specified fields. When `select_full_types` is true, a field with all its +/// sub-fields are selected if its field-id has been selected; otherwise, only leaf +/// fields of selected field-ids are selected. +/// +/// \note It returns an error when projection is not successful. +class PruneColumnVisitor { + public: + PruneColumnVisitor(const std::unordered_set& selected_ids, + bool select_full_types) + : selected_ids_(selected_ids), select_full_types_(select_full_types) {} + + Result> Visit(const std::shared_ptr& type) const { + switch (type->type_id()) { + case TypeId::kStruct: + return Visit(internal::checked_pointer_cast(type)); + case TypeId::kList: + return Visit(internal::checked_pointer_cast(type)); + case TypeId::kMap: + return Visit(internal::checked_pointer_cast(type)); + default: + return nullptr; + } + } + + Result> Visit(const SchemaField& field) const { + if (selected_ids_.contains(field.field_id())) { + return (select_full_types_ || field.type()->is_primitive()) ? field.type() + : Visit(field.type()); + } + return Visit(field.type()); + } + + static SchemaField MakeField(const SchemaField& field, std::shared_ptr type) { + return {field.field_id(), std::string(field.name()), std::move(type), + field.optional(), std::string(field.doc())}; + } + + Result> Visit(const std::shared_ptr& type) const { + bool same_types = true; + std::vector selected_fields; + for (const auto& field : type->fields()) { + ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field)); + if (child_type) { + same_types = same_types && (child_type == field.type()); + selected_fields.emplace_back(MakeField(field, std::move(child_type))); + } + } + + if (selected_fields.empty()) { + return nullptr; + } else if (same_types && selected_fields.size() == type->fields().size()) { + return type; + } + return std::make_shared(std::move(selected_fields)); + } + + Result> Visit(const std::shared_ptr& type) const { + const auto& elem_field = type->fields()[0]; + ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field)); + if (elem_type == nullptr) { + return nullptr; + } else if (elem_type == elem_field.type()) { + return type; + } + return std::make_shared(MakeField(elem_field, std::move(elem_type))); + } + + Result> Visit(const std::shared_ptr& type) const { + const auto& key_field = type->fields()[0]; + const auto& value_field = type->fields()[1]; + ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field)); + ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field)); + + if (key_type == nullptr && value_type == nullptr) { + return nullptr; + } else if (value_type == value_field.type() && + (key_type == key_field.type() || key_type == nullptr)) { + return type; + } else if (value_type == nullptr) { + return InvalidArgument("Cannot project Map without value field"); + } + return std::make_shared( + (key_type == nullptr ? key_field : MakeField(key_field, std::move(key_type))), + MakeField(value_field, std::move(value_type))); + } + + private: + const std::unordered_set& selected_ids_; + const bool select_full_types_; +}; + /// \brief Index parent field IDs for all fields in a struct hierarchy. +/// Corresponds to Java's indexParents(Types.StructType struct). /// \param root_struct The root struct type to analyze /// \return A map from field ID to its parent struct field ID /// \note This function assumes the input StructType has already been validated: From 4fe379047a108420234d321b29434107540ca571 Mon Sep 17 00:00:00 2001 From: Li Feiyang Date: Mon, 1 Dec 2025 10:35:24 +0800 Subject: [PATCH 2/2] 1 --- src/iceberg/meson.build | 1 + src/iceberg/partition_spec.cc | 2 +- src/iceberg/schema.cc | 1 - src/iceberg/test/type_test.cc | 2 +- src/iceberg/util/type_util.cc | 156 +++++++++++++++++++++++++++++++- src/iceberg/util/type_util.h | 164 ++++------------------------------ 6 files changed, 171 insertions(+), 155 deletions(-) diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 16324c310..d52739be9 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -99,6 +99,7 @@ iceberg_sources = files( 'util/temporal_util.cc', 'util/timepoint.cc', 'util/truncate_util.cc', + 'util/type_util.cc', 'util/uuid.cc', ) diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc index f65f483e7..0c2dda124 100644 --- a/src/iceberg/partition_spec.cc +++ b/src/iceberg/partition_spec.cc @@ -108,7 +108,7 @@ bool PartitionSpec::Equals(const PartitionSpec& other) const { } Status PartitionSpec::Validate(const Schema& schema, bool allow_missing_fields) const { - std::unordered_map parents = indexParents(schema); + std::unordered_map parents = IndexParents(schema); for (const auto& partition_field : fields_) { ICEBERG_ASSIGN_OR_RAISE(auto source_field, schema.FindFieldById(partition_field.source_id())); diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 9b0fa5d23..ca0d943fa 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -27,7 +27,6 @@ #include "iceberg/schema_internal.h" #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep -#include "iceberg/util/formatter_internal.h" #include "iceberg/util/macros.h" #include "iceberg/util/type_util.h" #include "iceberg/util/visit_type.h" diff --git a/src/iceberg/test/type_test.cc b/src/iceberg/test/type_test.cc index 4568f9517..266ff6103 100644 --- a/src/iceberg/test/type_test.cc +++ b/src/iceberg/test/type_test.cc @@ -659,7 +659,7 @@ TEST(TypeTest, IndexParents) { points, }); - std::unordered_map parent_index = iceberg::indexParents(root_struct); + std::unordered_map parent_index = iceberg::IndexParents(root_struct); // Verify top-level fields have no parent ASSERT_EQ(parent_index.find(1), parent_index.end()); diff --git a/src/iceberg/util/type_util.cc b/src/iceberg/util/type_util.cc index 773c0e60a..016397f05 100644 --- a/src/iceberg/util/type_util.cc +++ b/src/iceberg/util/type_util.cc @@ -19,15 +19,16 @@ #include "iceberg/util/type_util.h" +#include + #include "iceberg/result.h" #include "iceberg/util/checked_cast.h" +#include "iceberg/util/formatter_internal.h" #include "iceberg/util/string_util.h" #include "iceberg/util/visit_type.h" namespace iceberg { -// IdToFieldVisitor implementation - IdToFieldVisitor::IdToFieldVisitor( std::unordered_map>& id_to_field) : id_to_field_(id_to_field) {} @@ -47,8 +48,6 @@ Status IdToFieldVisitor::Visit(const NestedType& type) { return {}; } -// NameToIdVisitor implementation - NameToIdVisitor::NameToIdVisitor( std::unordered_map>& name_to_id, bool case_sensitive, std::function quoting_func) @@ -146,4 +145,153 @@ void NameToIdVisitor::Finish() { } } +Status PositionPathVisitor::Visit(const PrimitiveType& type) { + if (current_field_id_ == kUnassignedFieldId) { + return InvalidSchema("Current field id is not assigned, type: {}", type.ToString()); + } + + if (auto ret = position_path_.try_emplace(current_field_id_, current_path_); + !ret.second) { + return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}", + current_field_id_, ret.first->second, current_path_); + } + + return {}; +} + +Status PositionPathVisitor::Visit(const StructType& type) { + for (size_t i = 0; i < type.fields().size(); ++i) { + const auto& field = type.fields()[i]; + current_field_id_ = field.field_id(); + current_path_.push_back(i); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this)); + current_path_.pop_back(); + } + return {}; +} + +// Non-struct types are not supported yet, but it is not an error. +Status PositionPathVisitor::Visit(const ListType& type) { return {}; } + +Status PositionPathVisitor::Visit(const MapType& type) { return {}; } + +std::unordered_map> PositionPathVisitor::Finish() { + return std::move(position_path_); +} + +PruneColumnVisitor::PruneColumnVisitor(const std::unordered_set& selected_ids, + bool select_full_types) + : selected_ids_(selected_ids), select_full_types_(select_full_types) {} + +Result> PruneColumnVisitor::Visit( + const std::shared_ptr& type) const { + switch (type->type_id()) { + case TypeId::kStruct: + return Visit(internal::checked_pointer_cast(type)); + case TypeId::kList: + return Visit(internal::checked_pointer_cast(type)); + case TypeId::kMap: + return Visit(internal::checked_pointer_cast(type)); + default: + return nullptr; + } +} + +Result> PruneColumnVisitor::Visit(const SchemaField& field) const { + if (selected_ids_.contains(field.field_id())) { + return (select_full_types_ || field.type()->is_primitive()) ? field.type() + : Visit(field.type()); + } + return Visit(field.type()); +} + +SchemaField PruneColumnVisitor::MakeField(const SchemaField& field, + std::shared_ptr type) { + return {field.field_id(), std::string(field.name()), std::move(type), field.optional(), + std::string(field.doc())}; +} + +Result> PruneColumnVisitor::Visit( + const std::shared_ptr& type) const { + bool same_types = true; + std::vector selected_fields; + for (const auto& field : type->fields()) { + ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field)); + if (child_type) { + same_types = same_types && (child_type == field.type()); + selected_fields.emplace_back(MakeField(field, std::move(child_type))); + } + } + + if (selected_fields.empty()) { + return nullptr; + } else if (same_types && selected_fields.size() == type->fields().size()) { + return type; + } + return std::make_shared(std::move(selected_fields)); +} + +Result> PruneColumnVisitor::Visit( + const std::shared_ptr& type) const { + const auto& elem_field = type->fields()[0]; + ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field)); + if (elem_type == nullptr) { + return nullptr; + } else if (elem_type == elem_field.type()) { + return type; + } + return std::make_shared(MakeField(elem_field, std::move(elem_type))); +} + +Result> PruneColumnVisitor::Visit( + const std::shared_ptr& type) const { + const auto& key_field = type->fields()[0]; + const auto& value_field = type->fields()[1]; + ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field)); + ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field)); + + if (key_type == nullptr && value_type == nullptr) { + return nullptr; + } else if (value_type == value_field.type() && + (key_type == key_field.type() || key_type == nullptr)) { + return type; + } else if (value_type == nullptr) { + return InvalidArgument("Cannot project Map without value field"); + } + return std::make_shared( + (key_type == nullptr ? key_field : MakeField(key_field, std::move(key_type))), + MakeField(value_field, std::move(value_type))); +} + +std::unordered_map IndexParents(const StructType& root_struct) { + std::unordered_map id_to_parent; + std::stack parent_id_stack; + + // Recursive function to visit and build parent relationships + std::function visit = [&](const Type& type) -> void { + switch (type.type_id()) { + case TypeId::kStruct: + case TypeId::kList: + case TypeId::kMap: { + const auto& nested_type = static_cast(type); + for (const auto& field : nested_type.fields()) { + if (!parent_id_stack.empty()) { + id_to_parent[field.field_id()] = parent_id_stack.top(); + } + parent_id_stack.push(field.field_id()); + visit(*field.type()); + parent_id_stack.pop(); + } + break; + } + + default: + break; + } + }; + + visit(root_struct); + return id_to_parent; +} + } // namespace iceberg diff --git a/src/iceberg/util/type_util.h b/src/iceberg/util/type_util.h index cfd06e270..7cc274b0e 100644 --- a/src/iceberg/util/type_util.h +++ b/src/iceberg/util/type_util.h @@ -21,20 +21,17 @@ #include #include -#include -#include #include #include #include #include #include +#include "iceberg/iceberg_export.h" #include "iceberg/result.h" #include "iceberg/schema_field.h" -#include "iceberg/type.h" -#include "iceberg/util/checked_cast.h" +#include "iceberg/type_fwd.h" #include "iceberg/util/string_util.h" -#include "iceberg/util/visit_type.h" /// \file iceberg/util/type_util.h /// Utility functions and visitors for Iceberg types. @@ -42,7 +39,6 @@ namespace iceberg { /// \brief Visitor for building a map from field ID to SchemaField reference. -/// Corresponds to Java's IndexById visitor. class IdToFieldVisitor { public: explicit IdToFieldVisitor( @@ -56,7 +52,6 @@ class IdToFieldVisitor { }; /// \brief Visitor for building a map from field name to field ID. -/// Corresponds to Java's IndexByName visitor. class NameToIdVisitor { public: explicit NameToIdVisitor( @@ -85,41 +80,13 @@ class NameToIdVisitor { }; /// \brief Visitor for building a map from field ID to position path. -/// Used for efficient field access in StructLike. class PositionPathVisitor { public: - Status Visit(const PrimitiveType& type) { - if (current_field_id_ == kUnassignedFieldId) { - return InvalidSchema("Current field id is not assigned, type: {}", type.ToString()); - } - - if (auto ret = position_path_.try_emplace(current_field_id_, current_path_); - !ret.second) { - return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}", - current_field_id_, ret.first->second, current_path_); - } - - return {}; - } - - Status Visit(const StructType& type) { - for (size_t i = 0; i < type.fields().size(); ++i) { - const auto& field = type.fields()[i]; - current_field_id_ = field.field_id(); - current_path_.push_back(i); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this)); - current_path_.pop_back(); - } - return {}; - } - - // Non-struct types are not supported yet, but it is not an error. - Status Visit(const ListType& type) { return {}; } - Status Visit(const MapType& type) { return {}; } - - std::unordered_map> Finish() { - return std::move(position_path_); - } + Status Visit(const PrimitiveType& type); + Status Visit(const StructType& type); + Status Visit(const ListType& type); + Status Visit(const MapType& type); + std::unordered_map> Finish(); private: constexpr static int32_t kUnassignedFieldId = -1; @@ -129,7 +96,6 @@ class PositionPathVisitor { }; /// \brief Visitor for pruning columns based on selected field IDs. -/// Corresponds to Java's PruneColumns visitor. /// /// This visitor traverses a schema and creates a projected version containing only /// the specified fields. When `select_full_types` is true, a field with all its @@ -140,83 +106,14 @@ class PositionPathVisitor { class PruneColumnVisitor { public: PruneColumnVisitor(const std::unordered_set& selected_ids, - bool select_full_types) - : selected_ids_(selected_ids), select_full_types_(select_full_types) {} - - Result> Visit(const std::shared_ptr& type) const { - switch (type->type_id()) { - case TypeId::kStruct: - return Visit(internal::checked_pointer_cast(type)); - case TypeId::kList: - return Visit(internal::checked_pointer_cast(type)); - case TypeId::kMap: - return Visit(internal::checked_pointer_cast(type)); - default: - return nullptr; - } - } - - Result> Visit(const SchemaField& field) const { - if (selected_ids_.contains(field.field_id())) { - return (select_full_types_ || field.type()->is_primitive()) ? field.type() - : Visit(field.type()); - } - return Visit(field.type()); - } + bool select_full_types); - static SchemaField MakeField(const SchemaField& field, std::shared_ptr type) { - return {field.field_id(), std::string(field.name()), std::move(type), - field.optional(), std::string(field.doc())}; - } - - Result> Visit(const std::shared_ptr& type) const { - bool same_types = true; - std::vector selected_fields; - for (const auto& field : type->fields()) { - ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field)); - if (child_type) { - same_types = same_types && (child_type == field.type()); - selected_fields.emplace_back(MakeField(field, std::move(child_type))); - } - } - - if (selected_fields.empty()) { - return nullptr; - } else if (same_types && selected_fields.size() == type->fields().size()) { - return type; - } - return std::make_shared(std::move(selected_fields)); - } - - Result> Visit(const std::shared_ptr& type) const { - const auto& elem_field = type->fields()[0]; - ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field)); - if (elem_type == nullptr) { - return nullptr; - } else if (elem_type == elem_field.type()) { - return type; - } - return std::make_shared(MakeField(elem_field, std::move(elem_type))); - } - - Result> Visit(const std::shared_ptr& type) const { - const auto& key_field = type->fields()[0]; - const auto& value_field = type->fields()[1]; - ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field)); - ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field)); - - if (key_type == nullptr && value_type == nullptr) { - return nullptr; - } else if (value_type == value_field.type() && - (key_type == key_field.type() || key_type == nullptr)) { - return type; - } else if (value_type == nullptr) { - return InvalidArgument("Cannot project Map without value field"); - } - return std::make_shared( - (key_type == nullptr ? key_field : MakeField(key_field, std::move(key_type))), - MakeField(value_field, std::move(value_type))); - } + Result> Visit(const std::shared_ptr& type) const; + Result> Visit(const SchemaField& field) const; + static SchemaField MakeField(const SchemaField& field, std::shared_ptr type); + Result> Visit(const std::shared_ptr& type) const; + Result> Visit(const std::shared_ptr& type) const; + Result> Visit(const std::shared_ptr& type) const; private: const std::unordered_set& selected_ids_; @@ -224,7 +121,6 @@ class PruneColumnVisitor { }; /// \brief Index parent field IDs for all fields in a struct hierarchy. -/// Corresponds to Java's indexParents(Types.StructType struct). /// \param root_struct The root struct type to analyze /// \return A map from field ID to its parent struct field ID /// \note This function assumes the input StructType has already been validated: @@ -232,35 +128,7 @@ class PruneColumnVisitor { /// - All field IDs must be unique across the entire schema hierarchy /// If the struct is part of a Schema, these invariants are enforced by /// StructType::InitFieldById which checks for duplicate field IDs. -static std::unordered_map indexParents(const StructType& root_struct) { - std::unordered_map id_to_parent; - std::stack parent_id_stack; - - // Recursive function to visit and build parent relationships - std::function visit = [&](const Type& type) -> void { - switch (type.type_id()) { - case TypeId::kStruct: - case TypeId::kList: - case TypeId::kMap: { - const auto& nested_type = static_cast(type); - for (const auto& field : nested_type.fields()) { - if (!parent_id_stack.empty()) { - id_to_parent[field.field_id()] = parent_id_stack.top(); - } - parent_id_stack.push(field.field_id()); - visit(*field.type()); - parent_id_stack.pop(); - } - break; - } - - default: - break; - } - }; - - visit(root_struct); - return id_to_parent; -} +ICEBERG_EXPORT std::unordered_map IndexParents( + const StructType& root_struct); } // namespace iceberg