From 48ffd18fda0d74256b4171417bde4fa272be247f Mon Sep 17 00:00:00 2001 From: Xinli Shang Date: Wed, 26 Nov 2025 18:13:59 -0800 Subject: [PATCH] feat: add UpdateStatistics interface Add the UpdateStatistics interface for updating statistics files in tables. This interface extends PendingUpdate directly and provides methods for: - SetStatistics: Set statistics file for a snapshot - RemoveStatistics: Remove statistics file for a snapshot Statistics files are in Puffin format and help read table data more efficiently. The implementation includes comprehensive tests that verify behavior through the public API and follows the Java Iceberg API design. --- src/iceberg/test/CMakeLists.txt | 3 +- src/iceberg/test/update_statistics_test.cc | 251 +++++++++++++++++++++ src/iceberg/type_fwd.h | 2 + src/iceberg/update_statistics.h | 88 ++++++++ 4 files changed, 343 insertions(+), 1 deletion(-) create mode 100644 src/iceberg/test/update_statistics_test.cc create mode 100644 src/iceberg/update_statistics.h diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 25a03932d..e49ca40b1 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -86,7 +86,8 @@ add_iceberg_test(table_test table_metadata_builder_test.cc table_requirement_test.cc table_update_test.cc - test_common.cc) + test_common.cc + update_statistics_test.cc) add_iceberg_test(expression_test SOURCES diff --git a/src/iceberg/test/update_statistics_test.cc b/src/iceberg/test/update_statistics_test.cc new file mode 100644 index 000000000..6893ba048 --- /dev/null +++ b/src/iceberg/test/update_statistics_test.cc @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/update_statistics.h" + +#include +#include + +#include + +#include "iceberg/result.h" +#include "iceberg/statistics_file.h" +#include "iceberg/test/matchers.h" + +namespace iceberg { + +// Mock implementation of UpdateStatistics for testing +// This mock tracks which methods were called to verify behavior +class MockUpdateStatistics : public UpdateStatistics { + public: + MockUpdateStatistics() = default; + + Result> Apply() override { + if (should_fail_) { + return ValidationFailed("Mock validation failed"); + } + apply_called_ = true; + + // Return a vector of statistics files that reflects the configuration + // In a real implementation, this would return the statistics files + // that were set or modified + return statistics_files_; + } + + Status Commit() override { + if (should_fail_commit_) { + return CommitFailed("Mock commit failed"); + } + commit_called_ = true; + return {}; + } + + UpdateStatistics& SetStatistics(const StatisticsFile& statistics_file) override { + statistics_files_.push_back(statistics_file); + return *this; + } + + UpdateStatistics& RemoveStatistics(int64_t snapshot_id) override { + removed_snapshot_ids_.push_back(snapshot_id); + return *this; + } + + void SetShouldFail(bool fail) { should_fail_ = fail; } + void SetShouldFailCommit(bool fail) { should_fail_commit_ = fail; } + bool ApplyCalled() const { return apply_called_; } + bool CommitCalled() const { return commit_called_; } + + private: + bool should_fail_ = false; + bool should_fail_commit_ = false; + bool apply_called_ = false; + bool commit_called_ = false; + + std::vector statistics_files_; + std::vector removed_snapshot_ids_; +}; + +TEST(UpdateStatisticsTest, SetStatistics) { + MockUpdateStatistics update; + StatisticsFile stats{ + .snapshot_id = 100, + .path = "s3://bucket/metadata/stats-100.puffin", + .file_size_in_bytes = 1024, + .file_footer_size_in_bytes = 128, + .blob_metadata = {}, + }; + + update.SetStatistics(stats); + + // Verify through public API: Apply() should return statistics files + auto result = update.Apply(); + ASSERT_THAT(result, IsOk()); + + const auto& files = result.value(); + EXPECT_EQ(files.size(), 1); + EXPECT_EQ(files[0].snapshot_id, 100); + EXPECT_EQ(files[0].path, "s3://bucket/metadata/stats-100.puffin"); +} + +TEST(UpdateStatisticsTest, SetMultipleStatistics) { + MockUpdateStatistics update; + StatisticsFile stats1{ + .snapshot_id = 100, + .path = "s3://bucket/metadata/stats-100.puffin", + .file_size_in_bytes = 1024, + .file_footer_size_in_bytes = 128, + .blob_metadata = {}, + }; + StatisticsFile stats2{ + .snapshot_id = 200, + .path = "s3://bucket/metadata/stats-200.puffin", + .file_size_in_bytes = 2048, + .file_footer_size_in_bytes = 256, + .blob_metadata = {}, + }; + + update.SetStatistics(stats1).SetStatistics(stats2); + + // Verify through public API + auto result = update.Apply(); + ASSERT_THAT(result, IsOk()); + + const auto& files = result.value(); + EXPECT_EQ(files.size(), 2); + EXPECT_EQ(files[0].snapshot_id, 100); + EXPECT_EQ(files[1].snapshot_id, 200); +} + +TEST(UpdateStatisticsTest, RemoveStatistics) { + MockUpdateStatistics update; + update.RemoveStatistics(100); + + // Just verify it doesn't error + auto result = update.Apply(); + EXPECT_THAT(result, IsOk()); +} + +TEST(UpdateStatisticsTest, RemoveMultipleStatistics) { + MockUpdateStatistics update; + update.RemoveStatistics(100).RemoveStatistics(200).RemoveStatistics(300); + + // Verify it doesn't error + auto result = update.Apply(); + EXPECT_THAT(result, IsOk()); +} + +TEST(UpdateStatisticsTest, MethodChaining) { + MockUpdateStatistics update; + StatisticsFile stats{ + .snapshot_id = 100, + .path = "s3://bucket/metadata/stats-100.puffin", + .file_size_in_bytes = 1024, + .file_footer_size_in_bytes = 128, + .blob_metadata = {}, + }; + + // Test that all methods return the reference for chaining + update.SetStatistics(stats).RemoveStatistics(50).RemoveStatistics(75); + + // Verify through public API + auto result = update.Apply(); + ASSERT_THAT(result, IsOk()); + + const auto& files = result.value(); + EXPECT_EQ(files.size(), 1); + EXPECT_EQ(files[0].snapshot_id, 100); +} + +TEST(UpdateStatisticsTest, SetStatisticsWithBlobMetadata) { + MockUpdateStatistics update; + BlobMetadata blob{ + .type = "ndv", + .source_snapshot_id = 100, + .source_snapshot_sequence_number = 1, + .fields = {1, 2, 3}, + .properties = {{"key1", "value1"}, {"key2", "value2"}}, + }; + + StatisticsFile stats{ + .snapshot_id = 100, + .path = "s3://bucket/metadata/stats-100.puffin", + .file_size_in_bytes = 1024, + .file_footer_size_in_bytes = 128, + .blob_metadata = {blob}, + }; + + update.SetStatistics(stats); + + auto result = update.Apply(); + ASSERT_THAT(result, IsOk()); + + const auto& files = result.value(); + EXPECT_EQ(files.size(), 1); + EXPECT_EQ(files[0].blob_metadata.size(), 1); + EXPECT_EQ(files[0].blob_metadata[0].type, "ndv"); + EXPECT_EQ(files[0].blob_metadata[0].fields.size(), 3); +} + +TEST(UpdateStatisticsTest, ApplySuccess) { + MockUpdateStatistics update; + auto result = update.Apply(); + EXPECT_THAT(result, IsOk()); + EXPECT_TRUE(update.ApplyCalled()); +} + +TEST(UpdateStatisticsTest, ApplyValidationFailed) { + MockUpdateStatistics update; + update.SetShouldFail(true); + auto result = update.Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Mock validation failed")); +} + +TEST(UpdateStatisticsTest, CommitSuccess) { + MockUpdateStatistics update; + auto status = update.Commit(); + EXPECT_THAT(status, IsOk()); + EXPECT_TRUE(update.CommitCalled()); +} + +TEST(UpdateStatisticsTest, CommitFailed) { + MockUpdateStatistics update; + update.SetShouldFailCommit(true); + auto status = update.Commit(); + EXPECT_THAT(status, IsError(ErrorKind::kCommitFailed)); + EXPECT_THAT(status, HasErrorMessage("Mock commit failed")); +} + +TEST(UpdateStatisticsTest, InheritanceFromPendingUpdate) { + std::unique_ptr base_ptr = std::make_unique(); + auto status = base_ptr->Commit(); + EXPECT_THAT(status, IsOk()); +} + +TEST(UpdateStatisticsTest, InheritanceFromPendingUpdateTyped) { + std::unique_ptr>> typed_ptr = + std::make_unique(); + auto status = typed_ptr->Commit(); + EXPECT_THAT(status, IsOk()); + + auto result = typed_ptr->Apply(); + EXPECT_THAT(result, IsOk()); +} + +} // namespace iceberg diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 81681ebcd..6dfc81215 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -160,6 +160,8 @@ class PendingUpdate; template class PendingUpdateTyped; +class UpdateStatistics; + /// ---------------------------------------------------------------------------- /// TODO: Forward declarations below are not added yet. /// ---------------------------------------------------------------------------- diff --git a/src/iceberg/update_statistics.h b/src/iceberg/update_statistics.h new file mode 100644 index 000000000..e24e3a547 --- /dev/null +++ b/src/iceberg/update_statistics.h @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/update_statistics.h +/// API for updating statistics files in a table + +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/pending_update.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief API for updating statistics files in a table +/// +/// UpdateStatistics accumulates statistics file changes and commits them to +/// the table. This API allows setting and removing statistics files for +/// snapshots. +/// +/// Statistics are informational and used to read table data more efficiently. +/// A reader can choose to ignore statistics information. Statistics support +/// is not required to read the table correctly. +/// +/// When committing, changes are applied to the latest table metadata. Commit +/// conflicts are resolved by applying the changes to the new latest metadata +/// and reattempting the commit. +/// +/// Apply() returns a list of the statistics files that will be affected. +/// +/// Example usage: +/// \code +/// table.UpdateStatistics() +/// .SetStatistics(statistics_file) +/// .RemoveStatistics(old_snapshot_id) +/// .Commit(); +/// \endcode +class ICEBERG_EXPORT UpdateStatistics + : public PendingUpdateTyped> { + public: + ~UpdateStatistics() override = default; + + /// \brief Set the table's statistics file for a snapshot + /// + /// Sets the statistics file for a snapshot, replacing the previous statistics + /// file for the snapshot if any exists. The snapshot ID is taken from the + /// StatisticsFile object itself via statistics_file.snapshot_id. + /// + /// \param statistics_file The statistics file to set (contains snapshot ID) + /// \return Reference to this for method chaining + virtual UpdateStatistics& SetStatistics(const StatisticsFile& statistics_file) = 0; + + /// \brief Remove the table's statistics file for a snapshot + /// + /// Removes the statistics file associated with the specified snapshot. + /// + /// \param snapshot_id The ID of the snapshot whose statistics file should be removed + /// \return Reference to this for method chaining + virtual UpdateStatistics& RemoveStatistics(int64_t snapshot_id) = 0; + + // Non-copyable, movable (inherited from PendingUpdate) + UpdateStatistics(const UpdateStatistics&) = delete; + UpdateStatistics& operator=(const UpdateStatistics&) = delete; + + protected: + UpdateStatistics() = default; +}; + +} // namespace iceberg