Skip to content

Commit aeac91e

Browse files
committed
feat: assign fresh field ids for new schema
1 parent 7b95952 commit aeac91e

File tree

8 files changed

+444
-21
lines changed

8 files changed

+444
-21
lines changed

src/iceberg/schema.cc

Lines changed: 63 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,11 @@
3333

3434
namespace iceberg {
3535

36-
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
37-
: StructType(std::move(fields)), schema_id_(schema_id) {}
36+
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id,
37+
std::vector<int32_t> identifier_field_ids)
38+
: StructType(std::move(fields)),
39+
schema_id_(schema_id),
40+
identifier_field_ids_(std::move(identifier_field_ids)) {}
3841

3942
std::optional<int32_t> Schema::schema_id() const { return schema_id_; }
4043

@@ -48,15 +51,16 @@ std::string Schema::ToString() const {
4851
}
4952

5053
bool Schema::Equals(const Schema& other) const {
51-
return schema_id_ == other.schema_id_ && fields_ == other.fields_;
54+
return schema_id_ == other.schema_id_ && fields_ == other.fields_ &&
55+
identifier_field_ids_ == other.identifier_field_ids_;
5256
}
5357

5458
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
5559
std::string_view name, bool case_sensitive) const {
5660
if (case_sensitive) {
57-
ICEBERG_ASSIGN_OR_RAISE(auto name_to_id, name_to_id_.Get(*this));
58-
auto it = name_to_id.get().find(name);
59-
if (it == name_to_id.get().end()) {
61+
ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
62+
auto it = name_id_map.get().name_to_id.find(name);
63+
if (it == name_id_map.get().name_to_id.end()) {
6064
return std::nullopt;
6165
};
6266
return FindFieldById(it->second);
@@ -77,21 +81,22 @@ Schema::InitIdToFieldMap(const Schema& self) {
7781
return id_to_field;
7882
}
7983

80-
Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
81-
Schema::InitNameToIdMap(const Schema& self) {
82-
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> name_to_id;
83-
NameToIdVisitor visitor(name_to_id, /*case_sensitive=*/true);
84+
Result<Schema::NameIdMap> Schema::InitNameIdMap(const Schema& self) {
85+
NameIdMap name_id_map;
86+
NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name,
87+
/*case_sensitive=*/true);
8488
ICEBERG_RETURN_UNEXPECTED(
8589
VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
8690
visitor.Finish();
87-
return name_to_id;
91+
return name_id_map;
8892
}
8993

9094
Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
9195
Schema::InitLowerCaseNameToIdMap(const Schema& self) {
9296
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
9397
lowercase_name_to_id;
94-
NameToIdVisitor visitor(lowercase_name_to_id, /*case_sensitive=*/false);
98+
NameToIdVisitor visitor(lowercase_name_to_id, /*id_to_name=*/nullptr,
99+
/*case_sensitive=*/false);
95100
ICEBERG_RETURN_UNEXPECTED(
96101
VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
97102
visitor.Finish();
@@ -108,6 +113,16 @@ Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFie
108113
return it->second;
109114
}
110115

116+
Result<std::optional<std::string_view>> Schema::FindColumnNameById(
117+
int32_t field_id) const {
118+
ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
119+
auto it = name_id_map.get().id_to_name.find(field_id);
120+
if (it == name_id_map.get().id_to_name.end()) {
121+
return std::nullopt;
122+
}
123+
return it->second;
124+
}
125+
111126
Result<std::unordered_map<int32_t, std::vector<size_t>>> Schema::InitIdToPositionPath(
112127
const Schema& self) {
113128
PositionPathVisitor visitor;
@@ -179,4 +194,40 @@ Result<std::unique_ptr<Schema>> Schema::Project(
179194
std::nullopt);
180195
}
181196

197+
const std::vector<int32_t>& Schema::IdentifierFieldIds() const {
198+
return identifier_field_ids_;
199+
}
200+
201+
Result<std::vector<std::string>> Schema::IdentifierFieldNames() const {
202+
using std::ranges::to;
203+
using std::views::transform;
204+
std::vector<std::string> names;
205+
names.reserve(identifier_field_ids_.size());
206+
for (auto id : identifier_field_ids_) {
207+
ICEBERG_ASSIGN_OR_RAISE(auto name, FindColumnNameById(id));
208+
if (!name.has_value()) {
209+
return InvalidSchema("Cannot find the field of the specified field id: {}", id);
210+
}
211+
names.emplace_back(name.value());
212+
}
213+
return names;
214+
}
215+
216+
Result<std::unique_ptr<Schema>> Schema::Make(
217+
std::vector<SchemaField> fields, int32_t schema_id,
218+
const std::vector<std::string>& identifier_field_names) {
219+
auto schema = std::make_unique<Schema>(std::move(fields), schema_id);
220+
221+
std::vector<int32_t> fresh_identifier_ids;
222+
for (const auto& name : identifier_field_names) {
223+
ICEBERG_ASSIGN_OR_RAISE(auto field, schema->FindFieldByName(name));
224+
if (!field) {
225+
return InvalidSchema("Cannot find identifier field: {}", name);
226+
}
227+
fresh_identifier_ids.push_back(field.value().get().field_id());
228+
}
229+
schema->identifier_field_ids_ = std::move(fresh_identifier_ids);
230+
return schema;
231+
}
232+
182233
} // namespace iceberg

src/iceberg/schema.h

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ class ICEBERG_EXPORT Schema : public StructType {
4949
static constexpr int32_t kInvalidColumnId = -1;
5050

5151
explicit Schema(std::vector<SchemaField> fields,
52-
std::optional<int32_t> schema_id = std::nullopt);
52+
std::optional<int32_t> schema_id = std::nullopt,
53+
std::vector<int32_t> identifier_field_ids = {});
5354

5455
/// \brief Get the schema ID.
5556
///
@@ -78,6 +79,12 @@ class ICEBERG_EXPORT Schema : public StructType {
7879
Result<std::optional<std::reference_wrapper<const SchemaField>>> FindFieldById(
7980
int32_t field_id) const;
8081

82+
/// \brief Returns the full column name for the given id.
83+
///
84+
/// \param field_id The id of the field to get the full name for.
85+
/// \return The full name of the field with the given id, or std::nullopt if not found.
86+
Result<std::optional<std::string_view>> FindColumnNameById(int32_t field_id) const;
87+
8188
/// \brief Get the accessor to access the field by field id.
8289
///
8390
/// \param field_id The id of the field to get the accessor for.
@@ -103,26 +110,49 @@ class ICEBERG_EXPORT Schema : public StructType {
103110
Result<std::unique_ptr<Schema>> Project(
104111
const std::unordered_set<int32_t>& field_ids) const;
105112

113+
const std::vector<int32_t>& IdentifierFieldIds() const;
114+
Result<std::vector<std::string>> IdentifierFieldNames() const;
115+
106116
friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
107117

118+
static Result<std::unique_ptr<Schema>> Make(
119+
std::vector<SchemaField> fields, int32_t schema_id = kInitialSchemaId,
120+
const std::vector<std::string>& identifier_field_names = {});
121+
108122
private:
109123
/// \brief Compare two schemas for equality.
110124
bool Equals(const Schema& other) const;
111125

126+
struct NameIdMap {
127+
/// \brief Mapping from full field name to ID
128+
///
129+
/// \note Short names for maps and lists are included for any name that does not
130+
/// conflict with a canonical name. For example, a list, 'l', of structs with field
131+
/// 'x' will produce short name 'l.x' in addition to canonical name 'l.element.x'.
132+
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> name_to_id;
133+
134+
/// \brief Mapping from field ID to full name
135+
///
136+
/// \note Canonical names, but not short names are set, for example
137+
/// 'list.element.field' instead of 'list.field'.
138+
std::unordered_map<int32_t, std::string> id_to_name;
139+
};
140+
112141
static Result<std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>>
113142
InitIdToFieldMap(const Schema&);
114-
static Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
115-
InitNameToIdMap(const Schema&);
143+
static Result<NameIdMap> InitNameIdMap(const Schema&);
116144
static Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
117145
InitLowerCaseNameToIdMap(const Schema&);
118146
static Result<std::unordered_map<int32_t, std::vector<size_t>>> InitIdToPositionPath(
119147
const Schema&);
120148

121149
const std::optional<int32_t> schema_id_;
150+
/// Field IDs that uniquely identify rows in the table.
151+
std::vector<int32_t> identifier_field_ids_;
122152
/// Mapping from field id to field.
123153
Lazy<InitIdToFieldMap> id_to_field_;
124154
/// Mapping from field name to field id.
125-
Lazy<InitNameToIdMap> name_to_id_;
155+
Lazy<InitNameIdMap> name_id_map_;
126156
/// Mapping from lowercased field name to field id
127157
Lazy<InitLowerCaseNameToIdMap> lowercase_name_to_id_;
128158
/// Mapping from field id to (nested) position path to access the field.

src/iceberg/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ endfunction()
5454

5555
add_iceberg_test(schema_test
5656
SOURCES
57+
assign_id_visitor_test.cc
5758
name_mapping_test.cc
5859
partition_field_test.cc
5960
partition_spec_test.cc
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include <memory>
21+
22+
#include <gmock/gmock.h>
23+
#include <gtest/gtest.h>
24+
25+
#include "iceberg/schema.h"
26+
#include "iceberg/schema_field.h"
27+
#include "iceberg/test/matchers.h"
28+
#include "iceberg/type.h"
29+
#include "iceberg/util/type_util.h"
30+
31+
namespace iceberg {
32+
33+
namespace {
34+
35+
Schema CreateFlatSchema() {
36+
return Schema({
37+
SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
38+
SchemaField::MakeOptional(/*field_id=*/20, "name", iceberg::string()),
39+
SchemaField::MakeOptional(/*field_id=*/30, "age", iceberg::int32()),
40+
SchemaField::MakeRequired(/*field_id=*/40, "data", iceberg::float64()),
41+
});
42+
}
43+
44+
std::shared_ptr<Type> CreateListOfStruct() {
45+
return std::make_shared<ListType>(SchemaField::MakeOptional(
46+
/*field_id=*/101, "element",
47+
std::make_shared<StructType>(std::vector<SchemaField>{
48+
SchemaField::MakeOptional(/*field_id=*/102, "x", iceberg::int32()),
49+
SchemaField::MakeRequired(/*field_id=*/103, "y", iceberg::string()),
50+
})));
51+
}
52+
53+
std::shared_ptr<Type> CreateMapWithStructValue() {
54+
return std::make_shared<MapType>(
55+
SchemaField::MakeRequired(/*field_id=*/201, "key", iceberg::string()),
56+
SchemaField::MakeRequired(
57+
/*field_id=*/202, "value",
58+
std::make_shared<StructType>(std::vector<SchemaField>{
59+
SchemaField::MakeRequired(/*field_id=*/203, "id", iceberg::int64()),
60+
SchemaField::MakeOptional(/*field_id=*/204, "name", iceberg::string()),
61+
})));
62+
}
63+
64+
std::shared_ptr<Type> CreateNestedStruct() {
65+
return std::make_shared<StructType>(std::vector<SchemaField>{
66+
SchemaField::MakeRequired(/*field_id=*/301, "outer_id", iceberg::int64()),
67+
SchemaField::MakeRequired(
68+
/*field_id=*/302, "nested",
69+
std::make_shared<StructType>(std::vector<SchemaField>{
70+
SchemaField::MakeOptional(/*field_id=*/303, "inner_id", iceberg::int32()),
71+
SchemaField::MakeRequired(/*field_id=*/304, "inner_name",
72+
iceberg::string()),
73+
})),
74+
});
75+
}
76+
77+
Schema CreateNestedSchema(std::vector<int32_t> identifier_field_ids = {}) {
78+
return Schema(
79+
{
80+
SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
81+
SchemaField::MakeOptional(/*field_id=*/20, "list", CreateListOfStruct()),
82+
SchemaField::MakeOptional(/*field_id=*/30, "map", CreateMapWithStructValue()),
83+
SchemaField::MakeRequired(/*field_id=*/40, "struct", CreateNestedStruct()),
84+
},
85+
Schema::kInitialSchemaId, std::move(identifier_field_ids));
86+
}
87+
88+
} // namespace
89+
90+
TEST(AssignFreshIdVisitorTest, FlatSchema) {
91+
Schema schema = CreateFlatSchema();
92+
93+
std::atomic<int32_t> id = 0;
94+
auto next_id = [&id]() { return ++id; };
95+
ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
96+
AssignFreshIds(Schema::kInitialSchemaId, schema, next_id));
97+
98+
ASSERT_EQ(fresh_schema->fields().size(), schema.fields().size());
99+
EXPECT_EQ(Schema(
100+
{
101+
SchemaField::MakeRequired(/*field_id=*/1, "id", iceberg::int64()),
102+
SchemaField::MakeOptional(/*field_id=*/2, "name", iceberg::string()),
103+
SchemaField::MakeOptional(/*field_id=*/3, "age", iceberg::int32()),
104+
SchemaField::MakeRequired(/*field_id=*/4, "data", iceberg::float64()),
105+
},
106+
Schema::kInitialSchemaId),
107+
*fresh_schema);
108+
}
109+
110+
TEST(AssignFreshIdVisitorTest, NestedSchema) {
111+
Schema schema = CreateNestedSchema();
112+
std::atomic<int32_t> id = 0;
113+
auto next_id = [&id]() { return ++id; };
114+
ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
115+
AssignFreshIds(Schema::kInitialSchemaId, schema, next_id));
116+
117+
ASSERT_EQ(4, fresh_schema->fields().size());
118+
for (int32_t i = 0; i < fresh_schema->fields().size(); ++i) {
119+
EXPECT_EQ(i + 1, fresh_schema->fields()[i].field_id());
120+
}
121+
122+
auto list_field = fresh_schema->fields()[1];
123+
auto list_type = std::dynamic_pointer_cast<ListType>(list_field.type());
124+
ASSERT_TRUE(list_type);
125+
auto list_element_field = list_type->fields()[0];
126+
EXPECT_EQ(5, list_element_field.field_id());
127+
auto list_element_type =
128+
std::dynamic_pointer_cast<StructType>(list_element_field.type());
129+
ASSERT_TRUE(list_element_type);
130+
EXPECT_EQ(StructType(std::vector<SchemaField>{
131+
SchemaField::MakeOptional(/*field_id=*/6, "x", iceberg::int32()),
132+
SchemaField::MakeRequired(/*field_id=*/7, "y", iceberg::string()),
133+
}),
134+
*list_element_type);
135+
136+
auto map_field = fresh_schema->fields()[2];
137+
auto map_type = std::dynamic_pointer_cast<MapType>(map_field.type());
138+
ASSERT_TRUE(map_type);
139+
EXPECT_EQ(8, map_type->fields()[0].field_id());
140+
auto map_value_field = map_type->fields()[1];
141+
EXPECT_EQ(9, map_value_field.field_id());
142+
auto map_value_type = std::dynamic_pointer_cast<StructType>(map_value_field.type());
143+
ASSERT_TRUE(map_value_type);
144+
EXPECT_EQ(StructType(std::vector<SchemaField>{
145+
SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
146+
SchemaField::MakeOptional(/*field_id=*/11, "name", iceberg::string()),
147+
}),
148+
*map_value_type);
149+
150+
auto struct_field = fresh_schema->fields()[3];
151+
auto struct_type = std::dynamic_pointer_cast<StructType>(struct_field.type());
152+
ASSERT_TRUE(struct_type);
153+
154+
auto expect_nested_struct_type = std::make_shared<StructType>(std::vector<SchemaField>{
155+
SchemaField::MakeOptional(/*field_id=*/14, "inner_id", iceberg::int32()),
156+
SchemaField::MakeRequired(/*field_id=*/15, "inner_name", iceberg::string()),
157+
});
158+
EXPECT_EQ(StructType(std::vector<SchemaField>{
159+
SchemaField::MakeRequired(/*field_id=*/12, "outer_id", iceberg::int64()),
160+
SchemaField::MakeRequired(
161+
/*field_id=*/13, "nested", expect_nested_struct_type)}),
162+
*struct_type);
163+
164+
auto nested_struct_field = struct_type->fields()[1];
165+
auto nested_struct_type =
166+
std::dynamic_pointer_cast<StructType>(nested_struct_field.type());
167+
ASSERT_TRUE(nested_struct_type);
168+
EXPECT_EQ(*expect_nested_struct_type, *nested_struct_type);
169+
}
170+
171+
TEST(AssignFreshIdVisitorTest, RefreshIdentifierId) {
172+
std::atomic<int32_t> id = 0;
173+
auto next_id = [&id]() { return ++id; };
174+
175+
Schema invalid_schema = CreateNestedSchema({10, 400});
176+
// Invalid identified field id
177+
auto result = AssignFreshIds(Schema::kInitialSchemaId, invalid_schema, next_id);
178+
EXPECT_THAT(result, IsError(ErrorKind::kInvalidSchema));
179+
EXPECT_THAT(result, HasErrorMessage("Can not find"));
180+
181+
id = 0;
182+
Schema schema = CreateNestedSchema({10, 301});
183+
ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
184+
AssignFreshIds(Schema::kInitialSchemaId, schema, next_id));
185+
EXPECT_THAT(fresh_schema->IdentifierFieldIds(), testing::ElementsAre(1, 12));
186+
}
187+
188+
} // namespace iceberg

0 commit comments

Comments
 (0)