From 96741af8dd7c32459c8be02cb505a2870af267b3 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 26 Nov 2025 17:42:51 +0800 Subject: [PATCH 1/3] refactor: implement table metadata cache for by ids mapping --- src/iceberg/table.cc | 45 ++++++------------------ src/iceberg/table.h | 24 +++++-------- src/iceberg/table_metadata.cc | 64 ++++++++++++++++++++++++++++++++++ src/iceberg/table_metadata.h | 35 +++++++++++++++++++ src/iceberg/test/table_test.cc | 12 +++---- 5 files changed, 125 insertions(+), 55 deletions(-) diff --git a/src/iceberg/table.cc b/src/iceberg/table.cc index 7c7df7c6f..9116a5e92 100644 --- a/src/iceberg/table.cc +++ b/src/iceberg/table.cc @@ -42,7 +42,8 @@ Table::Table(TableIdentifier identifier, std::shared_ptr metadata metadata_location_(std::move(metadata_location)), io_(std::move(io)), catalog_(std::move(catalog)), - properties_(TableProperties::FromMap(metadata_->properties)) {} + properties_(TableProperties::FromMap(metadata_->properties)), + metadata_cache_(std::make_unique(metadata_.get())) {} const std::string& Table::uuid() const { return metadata_->table_uuid; } @@ -57,60 +58,36 @@ Status Table::Refresh() { metadata_location_ = std::move(refreshed_table->metadata_location_); io_ = std::move(refreshed_table->io_); properties_ = std::move(refreshed_table->properties_); - - schemas_map_.reset(); - partition_spec_map_.reset(); - sort_orders_map_.reset(); + metadata_cache_ = std::make_unique(metadata_.get()); } return {}; } Result> Table::schema() const { return metadata_->Schema(); } -const std::shared_ptr>>& +Result>>> Table::schemas() const { - if (!schemas_map_) { - schemas_map_ = - std::make_shared>>(); - for (const auto& schema : metadata_->schemas) { - if (schema->schema_id()) { - schemas_map_->emplace(schema->schema_id().value(), schema); - } - } - } - return schemas_map_; + return metadata_cache_->GetSchemasById(); } Result> Table::spec() const { return metadata_->PartitionSpec(); } -const std::shared_ptr>>& +Result>>> Table::specs() const { - if (!partition_spec_map_) { - partition_spec_map_ = - std::make_shared>>(); - for (const auto& spec : metadata_->partition_specs) { - partition_spec_map_->emplace(spec->spec_id(), spec); - } - } - return partition_spec_map_; + return metadata_cache_->GetPartitionSpecsById(); } Result> Table::sort_order() const { return metadata_->SortOrder(); } -const std::shared_ptr>>& +Result< + std::reference_wrapper>>> Table::sort_orders() const { - if (!sort_orders_map_) { - sort_orders_map_ = - std::make_shared>>(); - for (const auto& order : metadata_->sort_orders) { - sort_orders_map_->emplace(order->order_id(), order); - } - } - return sort_orders_map_; + return metadata_cache_->GetSortOrdersById(); } const TableProperties& Table::properties() const { return *properties_; } diff --git a/src/iceberg/table.h b/src/iceberg/table.h index 672cac754..1ea5ce873 100644 --- a/src/iceberg/table.h +++ b/src/iceberg/table.h @@ -19,6 +19,7 @@ #pragma once +#include #include #include #include @@ -60,24 +61,24 @@ class ICEBERG_EXPORT Table { Result> schema() const; /// \brief Return a map of schema for this table - /// \note This method is **not** thread-safe in the current implementation. - const std::shared_ptr>>& schemas() - const; + Result< + std::reference_wrapper>>> + schemas() const; /// \brief Return the partition spec for this table, return NotFoundError if not found Result> spec() const; /// \brief Return a map of partition specs for this table - /// \note This method is **not** thread-safe in the current implementation. - const std::shared_ptr>>& + Result>>> specs() const; /// \brief Return the sort order for this table, return NotFoundError if not found Result> sort_order() const; /// \brief Return a map of sort order IDs to sort orders for this table - /// \note This method is **not** thread-safe in the current implementation. - const std::shared_ptr>>& + Result>>> sort_orders() const; /// \brief Return a map of string properties for this table @@ -124,14 +125,7 @@ class ICEBERG_EXPORT Table { std::shared_ptr io_; std::shared_ptr catalog_; std::unique_ptr properties_; - - // Cache lazy-initialized maps. - mutable std::shared_ptr>> - schemas_map_; - mutable std::shared_ptr>> - partition_spec_map_; - mutable std::shared_ptr>> - sort_orders_map_; + std::unique_ptr metadata_cache_; }; } // namespace iceberg diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc index 669e5a15b..b8ddc958c 100644 --- a/src/iceberg/table_metadata.cc +++ b/src/iceberg/table_metadata.cc @@ -162,6 +162,70 @@ bool operator==(const TableMetadata& lhs, const TableMetadata& rhs) { lhs.next_row_id == rhs.next_row_id; } +// TableMetadataCache implementation + +Result TableMetadataCache::GetSchemasById() const { + return schemas_map_.Get(metadata_); +} + +Result +TableMetadataCache::GetPartitionSpecsById() const { + return partition_specs_map_.Get(metadata_); +} + +Result TableMetadataCache::GetSortOrdersById() + const { + return sort_orders_map_.Get(metadata_); +} + +Result TableMetadataCache::GetSnapshotsById() const { + return snapshot_map_.Get(metadata_); +} + +Result TableMetadataCache::InitSchemasMap( + const TableMetadata* metadata) { + SchemasMap schemas_map; + schemas_map.reserve(metadata->schemas.size()); + for (const auto& schema : metadata->schemas) { + if (schema->schema_id()) { + schemas_map.emplace(schema->schema_id().value(), schema); + } + } + return schemas_map; +} + +Result TableMetadataCache::InitPartitionSpecsMap( + const TableMetadata* metadata) { + PartitionSpecsMap partition_specs_map; + partition_specs_map.reserve(metadata->partition_specs.size()); + for (const auto& spec : metadata->partition_specs) { + partition_specs_map.emplace(spec->spec_id(), spec); + } + return partition_specs_map; +} + +Result TableMetadataCache::InitSortOrdersMap( + const TableMetadata* metadata) { + SortOrdersMap sort_orders_map; + sort_orders_map.reserve(metadata->sort_orders.size()); + for (const auto& order : metadata->sort_orders) { + sort_orders_map.emplace(order->order_id(), order); + } + return sort_orders_map; +} + +Result TableMetadataCache::InitSnapshotMap( + const TableMetadata* metadata) { + SnapshotsMap snapshots_map; + snapshots_map.reserve(metadata->snapshots.size()); + for (const auto& snapshot : metadata->snapshots) { + snapshots_map.emplace(snapshot->snapshot_id, snapshot); + } + return snapshots_map; +} + +// TableMetadataUtil implementation + Result TableMetadataUtil::CodecFromFileName( std::string_view file_name) { if (file_name.find(".metadata.json") == std::string::npos) { diff --git a/src/iceberg/table_metadata.h b/src/iceberg/table_metadata.h index 2a998c7a1..5ecbe4766 100644 --- a/src/iceberg/table_metadata.h +++ b/src/iceberg/table_metadata.h @@ -30,6 +30,7 @@ #include "iceberg/iceberg_export.h" #include "iceberg/type_fwd.h" +#include "iceberg/util/lazy.h" #include "iceberg/util/timepoint.h" namespace iceberg { @@ -139,6 +140,40 @@ struct ICEBERG_EXPORT TableMetadata { const TableMetadata& rhs); }; +// Cache for table metadata mappings to facilitate fast lookups. +class ICEBERG_EXPORT TableMetadataCache { + public: + explicit TableMetadataCache(const TableMetadata* metadata) : metadata_(metadata) {} + + template + using ByIdMap = std::unordered_map>; + using SchemasMap = ByIdMap; + using PartitionSpecsMap = ByIdMap; + using SortOrdersMap = ByIdMap; + using SnapshotsMap = std::unordered_map>; + using SchemasMapRef = std::reference_wrapper; + using PartitionSpecsMapRef = std::reference_wrapper; + using SortOrdersMapRef = std::reference_wrapper; + using SnapshotsMapRef = std::reference_wrapper; + + Result GetSchemasById() const; + Result GetPartitionSpecsById() const; + Result GetSortOrdersById() const; + Result GetSnapshotsById() const; + + private: + static Result InitSchemasMap(const TableMetadata* metadata); + static Result InitPartitionSpecsMap(const TableMetadata* metadata); + static Result InitSortOrdersMap(const TableMetadata* metadata); + static Result InitSnapshotMap(const TableMetadata* metadata); + + const TableMetadata* metadata_; + Lazy schemas_map_; + Lazy partition_specs_map_; + Lazy sort_orders_map_; + Lazy snapshot_map_; +}; + /// \brief Returns a string representation of a SnapshotLogEntry ICEBERG_EXPORT std::string ToString(const SnapshotLogEntry& entry); diff --git a/src/iceberg/test/table_test.cc b/src/iceberg/test/table_test.cc index 680491034..362add9b5 100644 --- a/src/iceberg/test/table_test.cc +++ b/src/iceberg/test/table_test.cc @@ -49,19 +49,19 @@ TEST(Table, TableV1) { ASSERT_TRUE(schema.has_value()); ASSERT_EQ(schema.value()->fields().size(), 3); auto schemas = table.schemas(); - ASSERT_TRUE(schemas->empty()); + ASSERT_TRUE(schemas->get().empty()); // Check table spec auto spec = table.spec(); ASSERT_TRUE(spec.has_value()); auto specs = table.specs(); - ASSERT_EQ(1UL, specs->size()); + ASSERT_EQ(1UL, specs->get().size()); // Check table sort_order auto sort_order = table.sort_order(); ASSERT_TRUE(sort_order.has_value()); auto sort_orders = table.sort_orders(); - ASSERT_EQ(1UL, sort_orders->size()); + ASSERT_EQ(1UL, sort_orders->get().size()); // Check table location auto location = table.location(); @@ -89,19 +89,19 @@ TEST(Table, TableV2) { ASSERT_TRUE(schema.has_value()); ASSERT_EQ(schema.value()->fields().size(), 3); auto schemas = table.schemas(); - ASSERT_FALSE(schemas->empty()); + ASSERT_FALSE(schemas->get().empty()); // Check partition spec auto spec = table.spec(); ASSERT_TRUE(spec.has_value()); auto specs = table.specs(); - ASSERT_EQ(1UL, specs->size()); + ASSERT_EQ(1UL, specs->get().size()); // Check sort order auto sort_order = table.sort_order(); ASSERT_TRUE(sort_order.has_value()); auto sort_orders = table.sort_orders(); - ASSERT_EQ(1UL, sort_orders->size()); + ASSERT_EQ(1UL, sort_orders->get().size()); // Check table location auto location = table.location(); From 5e4a108c6cde3d81d2bfcd2d131d430bc689ce97 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 26 Nov 2025 23:45:20 +0800 Subject: [PATCH 2/3] address comments --- src/iceberg/table_metadata.cc | 46 +++++++++++++++-------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc index b8ddc958c..f130aba64 100644 --- a/src/iceberg/table_metadata.cc +++ b/src/iceberg/table_metadata.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -184,44 +185,37 @@ Result TableMetadataCache::GetSnapshotsById Result TableMetadataCache::InitSchemasMap( const TableMetadata* metadata) { - SchemasMap schemas_map; - schemas_map.reserve(metadata->schemas.size()); - for (const auto& schema : metadata->schemas) { - if (schema->schema_id()) { - schemas_map.emplace(schema->schema_id().value(), schema); - } - } - return schemas_map; + return metadata->schemas | std::views::filter([](const auto& schema) { + return schema->schema_id().has_value(); + }) | + std::views::transform([](const auto& schema) { + return std::pair(schema->schema_id().value(), schema); + }) | + std::ranges::to(); } Result TableMetadataCache::InitPartitionSpecsMap( const TableMetadata* metadata) { - PartitionSpecsMap partition_specs_map; - partition_specs_map.reserve(metadata->partition_specs.size()); - for (const auto& spec : metadata->partition_specs) { - partition_specs_map.emplace(spec->spec_id(), spec); - } - return partition_specs_map; + return metadata->partition_specs | std::views::transform([](const auto& spec) { + return std::pair(spec->spec_id(), spec); + }) | + std::ranges::to(); } Result TableMetadataCache::InitSortOrdersMap( const TableMetadata* metadata) { - SortOrdersMap sort_orders_map; - sort_orders_map.reserve(metadata->sort_orders.size()); - for (const auto& order : metadata->sort_orders) { - sort_orders_map.emplace(order->order_id(), order); - } - return sort_orders_map; + return metadata->sort_orders | std::views::transform([](const auto& order) { + return std::pair(order->order_id(), order); + }) | + std::ranges::to(); } Result TableMetadataCache::InitSnapshotMap( const TableMetadata* metadata) { - SnapshotsMap snapshots_map; - snapshots_map.reserve(metadata->snapshots.size()); - for (const auto& snapshot : metadata->snapshots) { - snapshots_map.emplace(snapshot->snapshot_id, snapshot); - } - return snapshots_map; + return metadata->snapshots | std::views::transform([](const auto& snapshot) { + return std::pair(snapshot->snapshot_id, snapshot); + }) | + std::ranges::to(); } // TableMetadataUtil implementation From d5d2ea2da548cbaf974c64550cda2b344fe0026d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 26 Nov 2025 23:50:20 +0800 Subject: [PATCH 3/3] Revert "address comments" This reverts commit 5e4a108c6cde3d81d2bfcd2d131d430bc689ce97. --- src/iceberg/table_metadata.cc | 46 ++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc index f130aba64..b8ddc958c 100644 --- a/src/iceberg/table_metadata.cc +++ b/src/iceberg/table_metadata.cc @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -185,37 +184,44 @@ Result TableMetadataCache::GetSnapshotsById Result TableMetadataCache::InitSchemasMap( const TableMetadata* metadata) { - return metadata->schemas | std::views::filter([](const auto& schema) { - return schema->schema_id().has_value(); - }) | - std::views::transform([](const auto& schema) { - return std::pair(schema->schema_id().value(), schema); - }) | - std::ranges::to(); + SchemasMap schemas_map; + schemas_map.reserve(metadata->schemas.size()); + for (const auto& schema : metadata->schemas) { + if (schema->schema_id()) { + schemas_map.emplace(schema->schema_id().value(), schema); + } + } + return schemas_map; } Result TableMetadataCache::InitPartitionSpecsMap( const TableMetadata* metadata) { - return metadata->partition_specs | std::views::transform([](const auto& spec) { - return std::pair(spec->spec_id(), spec); - }) | - std::ranges::to(); + PartitionSpecsMap partition_specs_map; + partition_specs_map.reserve(metadata->partition_specs.size()); + for (const auto& spec : metadata->partition_specs) { + partition_specs_map.emplace(spec->spec_id(), spec); + } + return partition_specs_map; } Result TableMetadataCache::InitSortOrdersMap( const TableMetadata* metadata) { - return metadata->sort_orders | std::views::transform([](const auto& order) { - return std::pair(order->order_id(), order); - }) | - std::ranges::to(); + SortOrdersMap sort_orders_map; + sort_orders_map.reserve(metadata->sort_orders.size()); + for (const auto& order : metadata->sort_orders) { + sort_orders_map.emplace(order->order_id(), order); + } + return sort_orders_map; } Result TableMetadataCache::InitSnapshotMap( const TableMetadata* metadata) { - return metadata->snapshots | std::views::transform([](const auto& snapshot) { - return std::pair(snapshot->snapshot_id, snapshot); - }) | - std::ranges::to(); + SnapshotsMap snapshots_map; + snapshots_map.reserve(metadata->snapshots.size()); + for (const auto& snapshot : metadata->snapshots) { + snapshots_map.emplace(snapshot->snapshot_id, snapshot); + } + return snapshots_map; } // TableMetadataUtil implementation