From 064259c3faff310480729f78057026d5fadf11a0 Mon Sep 17 00:00:00 2001 From: HeartLinked Date: Thu, 14 Aug 2025 10:50:04 +0800 Subject: [PATCH 1/5] test: add ManifestReaderV2Test for non-partitioned manifests - Add V2NonPartitionedBasicTest to test manifest reader v2 functionality - Fix bounds values to match actual manifest file data - Use proper C++23 designated initializers for test data setup --- test/manifest_reader_test.cc | 93 +++++++++++++++++- ...df1bc9-830b-4015-aced-c060df36f150-m0.avro | Bin 0 -> 7207 bytes 2 files changed, 88 insertions(+), 5 deletions(-) create mode 100644 test/resources/2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro diff --git a/test/manifest_reader_test.cc b/test/manifest_reader_test.cc index 2fb20b73b..608b8d41e 100644 --- a/test/manifest_reader_test.cc +++ b/test/manifest_reader_test.cc @@ -19,13 +19,14 @@ #include "iceberg/manifest_reader.h" +#include + #include #include #include "iceberg/arrow/arrow_fs_file_io_internal.h" #include "iceberg/avro/avro_reader.h" #include "iceberg/avro/avro_register.h" -#include "iceberg/avro/avro_schema_util_internal.h" #include "iceberg/manifest_entry.h" #include "iceberg/schema.h" #include "temp_file_test_base.h" @@ -33,7 +34,7 @@ namespace iceberg { -class ManifestReaderTest : public TempFileTestBase { +class ManifestReaderV1Test : public TempFileTestBase { protected: static void SetUpTestSuite() { avro::AvroReader::Register(); } @@ -45,7 +46,7 @@ class ManifestReaderTest : public TempFileTestBase { avro::RegisterLogicalTypes(); } - std::vector prepare_manifest_entries() { + std::vector prepareV1ManifestEntries() { std::vector manifest_entries; std::string test_dir_prefix = "/tmp/db/db/iceberg_test/data/"; std::vector paths = { @@ -102,7 +103,7 @@ class ManifestReaderTest : public TempFileTestBase { std::shared_ptr file_io_; }; -TEST_F(ManifestReaderTest, BasicTest) { +TEST_F(ManifestReaderV1Test, V1PartitionedBasicTest) { iceberg::SchemaField partition_field(1000, "order_ts_hour", iceberg::int32(), true); auto partition_schema = std::make_shared(std::vector({partition_field})); @@ -115,7 +116,89 @@ TEST_F(ManifestReaderTest, BasicTest) { auto read_result = manifest_reader->Entries(); ASSERT_EQ(read_result.has_value(), true) << read_result.error().message; - auto expected_entries = prepare_manifest_entries(); + auto expected_entries = prepareV1ManifestEntries(); + ASSERT_EQ(read_result.value(), expected_entries); +} + +class ManifestReaderV2Test : public TempFileTestBase { + protected: + static void SetUpTestSuite() { avro::AvroReader::Register(); } + + void SetUp() override { + TempFileTestBase::SetUp(); + local_fs_ = std::make_shared<::arrow::fs::LocalFileSystem>(); + file_io_ = std::make_shared(local_fs_); + + avro::RegisterLogicalTypes(); + } + + std::vector prepareV2NonPartitionedManifestEntries() { + std::vector manifest_entries; + std::string test_dir_prefix = "/tmp/db/db/v2_manifest_non_partitioned/data/"; + + std::vector paths = { + "00000-0-b0f98903-6d21-45fd-9e0b-afbd4963e365-0-00001.parquet"}; + + std::vector file_sizes = {1344}; + std::vector record_counts = {4}; + + // Real bounds data extracted from the manifest + std::vector>> lower_bounds = { + {{1, {0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, + {2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 'f', 'o', 'u', 'r'}}, + {3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_', '1'}}, + {4, {0xcd, 0xcc, 0xcc, 0xcc, 0xcc, 0xdc, 0x5e, 0x40}}}}; + + std::vector>> upper_bounds = { + {{1, {0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, + {2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 't', 'w', 'o'}}, + {3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_', '4'}}, + {4, {0x14, 0xae, 0x47, 0xe1, 0x7a, 0x8c, 0x7c, 0x40}}}}; + + manifest_entries.emplace_back( + ManifestEntry{.status = ManifestStatus::kAdded, + .snapshot_id = 679879563479918846LL, + .sequence_number = std::nullopt, + .file_sequence_number = std::nullopt, + .data_file = std::make_shared( + DataFile{.file_path = test_dir_prefix + paths[0], + .file_format = FileFormatType::kParquet, + .record_count = record_counts[0], + .file_size_in_bytes = file_sizes[0], + .column_sizes = {{1, 56}, {2, 73}, {3, 66}, {4, 67}}, + .value_counts = {{1, 4}, {2, 4}, {3, 4}, {4, 4}}, + .null_value_counts = {{1, 0}, {2, 0}, {3, 0}, {4, 0}}, + .nan_value_counts = {{4, 0}}, + .lower_bounds = lower_bounds[0], + .upper_bounds = upper_bounds[0], + .key_metadata = {}, + .split_offsets = {4}, + .equality_ids = {}, + .sort_order_id = 0, + .first_row_id = std::nullopt, + .referenced_data_file = std::nullopt, + .content_offset = std::nullopt, + .content_size_in_bytes = std::nullopt})}); + return manifest_entries; + } + + std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_; + std::shared_ptr file_io_; +}; + +TEST_F(ManifestReaderV2Test, V2NonPartitionedBasicTest) { + std::string path = GetResourcePath("2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro"); + + auto manifest_reader_result = ManifestReader::MakeReader(path, file_io_, nullptr); + ASSERT_EQ(manifest_reader_result.has_value(), true) + << manifest_reader_result.error().message; + + auto manifest_reader = std::move(manifest_reader_result.value()); + auto read_result = manifest_reader->Entries(); + ASSERT_EQ(read_result.has_value(), true) << read_result.error().message; + ASSERT_EQ(read_result.value().size(), 1); + + auto expected_entries = prepareV2NonPartitionedManifestEntries(); ASSERT_EQ(read_result.value(), expected_entries); } diff --git a/test/resources/2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro b/test/resources/2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro new file mode 100644 index 0000000000000000000000000000000000000000..f8e6c1c413118be941de06f4a6f72e9bf886380f GIT binary patch literal 7207 zcmcIoZ-^CD6kpxVf<(0ZvxOh76GX;ado#2DL6Fi-Q`hhhN+!?y=H2H`&YOA7%v;-J zL4zPa7`X+CNKzuvhe*@D?T3;jG_2NIqCm)!2#xv}5(=GrXXf7f=Diu;zTM!lurv3b zbAG>j&bj9|Tk@~WSvcP_hDZs&({@ntN6JL$S3KXX7`~EGf;SyzLFvzCl%h$jf~WNF zKB&+qYDTefiLQtLGKx!Ht(Y!>7e#D&Boi649DAQKnhEx3d=J~2ybD4wrF}ZzSHM0_ z@fO?H$@k%sQ?V(gq&N#sC2tXBbkFEgJnTBF0_=x+IV&zP9G4;uQJ1i77K!KQi0!*0 z@Lb|2T;loIuXyk(B-UgRuyp~cl$U6&jmzGUU{*dqzQ1n|Yk z2%?-Kzf!hHCP0W$9($yKfL_WycZa+(F=BCP68fOJaz4H31PR&QlWjf_#FdVB=vU7B~uo@K_ zXBN9IrX9lcNeLP#e;#is`&G3kH>|3?5t^Xeh8bC;Mt+7 z52(_z%}~`B4@>YGW}HNomYo4rET1!wNrqJ^z>S7z<3`vnpg}yC&Yc`2-IZg9I*SBq zDkZS0#QANMo@<)iQSO^-n${G7j45Uuaxl2NlQT_~JE@KBStm>H1+w(MCbCVhDT1_Y z)BBnvi(d7#ULmg#V~w}Rpn_AAO_!U8w!vB$GGQ(FNOjREcVm|niA(iJAy+&2M{R{_PAd4g zvjMpAMi2uDunNeEX6mPEhw7EI@8Mt;rL!PRp4dkiF4}Oyhk-GSAw6t3@KJemrWe;@ zrUnhCK#T=Is)c>BJea^chKcKej+m<{e9Y~jV=ko6-0X7GAkdQcg+lWEReb6Ls13+# z5ZsY6j8vqMu3$kZYrK-&@Kt&4Q{FF^LS0>B;(GMQuNj?dkebI1vdIWO+|>C7{sZ=w^=MzT$@C(z-aVn2wnyCapUWlLp*HnDVBL+1Fq8phGyhZ{^6cLxeBN zKBh2)<%UW|OD-8*Dw*cPmjwY2Ul*4UCxox?0D=~iDguNXareaM{RBU}J6fhk zKaW<=)o^Wu51_?_Ux1<&!eG^j))53EJduO3*e;s!9sTswnx#wTPPQ8Pe=odt>-gV)?f&Iq_;&yD-l=DyMs3b$aL6$(@flmsgbknZNM*5wGP>eE8O)!?#zi{O Date: Thu, 14 Aug 2025 11:14:13 +0800 Subject: [PATCH 2/5] fix: correct partition field handling for non-partitioned tables - Fix ManifestEntry::operator== logic with proper parentheses grouping - Add null pointer safety in DataFile::Type() for non-partitioned tables - Change partition field type validation from list to struct in manifest reader - Support empty partition structs for non-partitioned tables (n_children == 0) - Update test bounds values to match actual manifest file data This resolves issues with non-partitioned table support and fixes critical bugs in object comparison and type validation. --- src/iceberg/manifest_entry.cc | 11 +++++++---- src/iceberg/manifest_reader_internal.cc | 16 +++++++++------- test/manifest_reader_test.cc | 1 - 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/iceberg/manifest_entry.cc b/src/iceberg/manifest_entry.cc index 4a9aaa2df..2d67a5349 100644 --- a/src/iceberg/manifest_entry.cc +++ b/src/iceberg/manifest_entry.cc @@ -29,13 +29,16 @@ namespace iceberg { bool ManifestEntry::operator==(const ManifestEntry& other) const { return status == other.status && snapshot_id == other.snapshot_id && - sequence_number == other.sequence_number && - file_sequence_number == other.file_sequence_number && - (data_file && other.data_file && *data_file == *other.data_file) || - (!data_file && !other.data_file); + sequence_number == other.sequence_number && + file_sequence_number == other.file_sequence_number && + ((data_file && other.data_file && *data_file == *other.data_file) || + (!data_file && !other.data_file)); } std::shared_ptr DataFile::Type(std::shared_ptr partition_type) { + if (!partition_type) { + partition_type = std::make_shared(std::vector{}); + } return std::make_shared(std::vector{ kContent, kFilePath, diff --git a/src/iceberg/manifest_reader_internal.cc b/src/iceberg/manifest_reader_internal.cc index 251e0d71d..4ddd6cf21 100644 --- a/src/iceberg/manifest_reader_internal.cc +++ b/src/iceberg/manifest_reader_internal.cc @@ -368,15 +368,17 @@ Status ParseDataFile(const std::shared_ptr& data_file_schema, break; case 3: { if (view_of_file_field->storage_type != ArrowType::NANOARROW_TYPE_STRUCT) { - return InvalidManifest("Field:{} should be a list.", field_name); + return InvalidManifest("Field:{} should be a struct.", field_name); } - auto view_of_partition = view_of_file_field->children[0]; - for (size_t row_idx = 0; row_idx < view_of_partition->length; row_idx++) { - if (ArrowArrayViewIsNull(view_of_partition, row_idx)) { - break; + if (view_of_file_field->n_children > 0) { + auto view_of_partition = view_of_file_field->children[0]; + for (size_t row_idx = 0; row_idx < view_of_partition->length; row_idx++) { + if (ArrowArrayViewIsNull(view_of_partition, row_idx)) { + break; + } + ICEBERG_RETURN_UNEXPECTED( + ParseLiteral(view_of_partition, row_idx, manifest_entries)); } - ICEBERG_RETURN_UNEXPECTED( - ParseLiteral(view_of_partition, row_idx, manifest_entries)); } } break; case 4: diff --git a/test/manifest_reader_test.cc b/test/manifest_reader_test.cc index 608b8d41e..092a2c3fc 100644 --- a/test/manifest_reader_test.cc +++ b/test/manifest_reader_test.cc @@ -142,7 +142,6 @@ class ManifestReaderV2Test : public TempFileTestBase { std::vector file_sizes = {1344}; std::vector record_counts = {4}; - // Real bounds data extracted from the manifest std::vector>> lower_bounds = { {{1, {0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, {2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 'f', 'o', 'u', 'r'}}, From dcd60acb99917673d4bfdd28efe22b0fd00b191a Mon Sep 17 00:00:00 2001 From: HeartLinked Date: Thu, 14 Aug 2025 14:10:04 +0800 Subject: [PATCH 3/5] fix compare --- src/iceberg/manifest_reader_internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iceberg/manifest_reader_internal.cc b/src/iceberg/manifest_reader_internal.cc index 4ddd6cf21..e3b9becc7 100644 --- a/src/iceberg/manifest_reader_internal.cc +++ b/src/iceberg/manifest_reader_internal.cc @@ -372,7 +372,7 @@ Status ParseDataFile(const std::shared_ptr& data_file_schema, } if (view_of_file_field->n_children > 0) { auto view_of_partition = view_of_file_field->children[0]; - for (size_t row_idx = 0; row_idx < view_of_partition->length; row_idx++) { + for (int64_t row_idx = 0; row_idx < view_of_partition->length; row_idx++) { if (ArrowArrayViewIsNull(view_of_partition, row_idx)) { break; } From 4be162f6cf3d414ed0131410de69a5043e19e4c0 Mon Sep 17 00:00:00 2001 From: HeartLinked Date: Fri, 15 Aug 2025 17:57:23 +0800 Subject: [PATCH 4/5] fix typo --- test/manifest_reader_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/manifest_reader_test.cc b/test/manifest_reader_test.cc index 092a2c3fc..325de3879 100644 --- a/test/manifest_reader_test.cc +++ b/test/manifest_reader_test.cc @@ -46,7 +46,7 @@ class ManifestReaderV1Test : public TempFileTestBase { avro::RegisterLogicalTypes(); } - std::vector prepareV1ManifestEntries() { + std::vector PrepareV1ManifestEntries() { std::vector manifest_entries; std::string test_dir_prefix = "/tmp/db/db/iceberg_test/data/"; std::vector paths = { @@ -116,7 +116,7 @@ TEST_F(ManifestReaderV1Test, V1PartitionedBasicTest) { auto read_result = manifest_reader->Entries(); ASSERT_EQ(read_result.has_value(), true) << read_result.error().message; - auto expected_entries = prepareV1ManifestEntries(); + auto expected_entries = PrepareV1ManifestEntries(); ASSERT_EQ(read_result.value(), expected_entries); } From 8670f02d5a389a7967a25cae141bf6d430609901 Mon Sep 17 00:00:00 2001 From: HeartLinked Date: Mon, 18 Aug 2025 18:40:39 +0800 Subject: [PATCH 5/5] fix review --- src/iceberg/manifest_entry.cc | 3 ++- src/iceberg/partition_spec.cc | 4 ++-- test/manifest_reader_test.cc | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/iceberg/manifest_entry.cc b/src/iceberg/manifest_entry.cc index 2d67a5349..d387aad68 100644 --- a/src/iceberg/manifest_entry.cc +++ b/src/iceberg/manifest_entry.cc @@ -22,6 +22,7 @@ #include #include +#include "iceberg/schema.h" #include "iceberg/schema_field.h" #include "iceberg/type.h" @@ -37,7 +38,7 @@ bool ManifestEntry::operator==(const ManifestEntry& other) const { std::shared_ptr DataFile::Type(std::shared_ptr partition_type) { if (!partition_type) { - partition_type = std::make_shared(std::vector{}); + partition_type = PartitionSpec::Unpartitioned()->schema(); } return std::make_shared(std::vector{ kContent, diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc index 47794e7c6..2b41950b4 100644 --- a/src/iceberg/partition_spec.cc +++ b/src/iceberg/partition_spec.cc @@ -46,8 +46,8 @@ PartitionSpec::PartitionSpec(std::shared_ptr schema, int32_t spec_id, const std::shared_ptr& PartitionSpec::Unpartitioned() { static const std::shared_ptr unpartitioned = std::make_shared( - /*schema=*/nullptr, kInitialSpecId, std::vector{}, - kLegacyPartitionDataIdStart - 1); + /*schema=*/std::make_shared(std::vector{}), kInitialSpecId, + std::vector{}, kLegacyPartitionDataIdStart - 1); return unpartitioned; } diff --git a/test/manifest_reader_test.cc b/test/manifest_reader_test.cc index 325de3879..05224dbf8 100644 --- a/test/manifest_reader_test.cc +++ b/test/manifest_reader_test.cc @@ -132,7 +132,7 @@ class ManifestReaderV2Test : public TempFileTestBase { avro::RegisterLogicalTypes(); } - std::vector prepareV2NonPartitionedManifestEntries() { + std::vector PrepareV2NonPartitionedManifestEntries() { std::vector manifest_entries; std::string test_dir_prefix = "/tmp/db/db/v2_manifest_non_partitioned/data/"; @@ -197,7 +197,7 @@ TEST_F(ManifestReaderV2Test, V2NonPartitionedBasicTest) { ASSERT_EQ(read_result.has_value(), true) << read_result.error().message; ASSERT_EQ(read_result.value().size(), 1); - auto expected_entries = prepareV2NonPartitionedManifestEntries(); + auto expected_entries = PrepareV2NonPartitionedManifestEntries(); ASSERT_EQ(read_result.value(), expected_entries); }