Skip to content

Commit cf0fd37

Browse files
authored
fix: Validate identifier fields in Schema::Make (#450)
1 parent 95045a8 commit cf0fd37

File tree

9 files changed

+257
-134
lines changed

9 files changed

+257
-134
lines changed

src/iceberg/expression/residual_evaluator.cc

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -299,16 +299,12 @@ class UnpartitionedResidualEvaluator : public ResidualEvaluator {
299299
public:
300300
explicit UnpartitionedResidualEvaluator(std::shared_ptr<Expression> expr)
301301
: ResidualEvaluator(std::move(expr), *PartitionSpec::Unpartitioned(),
302-
*kEmptySchema_, true) {}
302+
*Schema::EmptySchema(), true) {}
303303

304304
Result<std::shared_ptr<Expression>> ResidualFor(
305305
const StructLike& /*partition_data*/) const override {
306306
return expr_;
307307
}
308-
309-
private:
310-
// Store an empty schema to avoid dangling reference when passing to base class
311-
inline static const std::shared_ptr<Schema> kEmptySchema_ = Schema::EmptySchema();
312308
};
313309

314310
} // namespace

src/iceberg/json_internal.cc

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -488,9 +488,8 @@ Result<std::unique_ptr<Schema>> SchemaFromJson(const nlohmann::json& json) {
488488
auto identifier_field_ids,
489489
GetJsonValueOrDefault<std::vector<int32_t>>(json, kIdentifierFieldIds));
490490

491-
return std::make_unique<Schema>(std::move(fields),
492-
schema_id_opt.value_or(Schema::kInitialSchemaId),
493-
std::move(identifier_field_ids));
491+
return Schema::Make(std::move(fields), schema_id_opt.value_or(Schema::kInitialSchemaId),
492+
std::move(identifier_field_ids));
494493
}
495494

496495
nlohmann::json ToJson(const PartitionField& partition_field) {

src/iceberg/schema.cc

Lines changed: 79 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
#include <format>
2323
#include <functional>
24+
#include <stack>
2425

2526
#include "iceberg/result.h"
2627
#include "iceberg/row/struct_like.h"
@@ -34,11 +35,25 @@
3435

3536
namespace iceberg {
3637

37-
Schema::Schema(std::vector<SchemaField> fields, int32_t schema_id,
38-
std::vector<int32_t> identifier_field_ids)
39-
: StructType(std::move(fields)),
40-
schema_id_(schema_id),
41-
identifier_field_ids_(std::move(identifier_field_ids)) {}
38+
Schema::Schema(std::vector<SchemaField> fields, int32_t schema_id)
39+
: StructType(std::move(fields)), schema_id_(schema_id) {}
40+
41+
Result<std::unique_ptr<Schema>> Schema::Make(std::vector<SchemaField> fields,
42+
int32_t schema_id,
43+
std::vector<int32_t> identifier_field_ids) {
44+
auto schema = std::make_unique<Schema>(std::move(fields), schema_id);
45+
46+
if (!identifier_field_ids.empty()) {
47+
auto id_to_parent = IndexParents(*schema);
48+
for (auto field_id : identifier_field_ids) {
49+
ICEBERG_RETURN_UNEXPECTED(
50+
ValidateIdentifierFields(field_id, *schema, id_to_parent));
51+
}
52+
}
53+
54+
schema->identifier_field_ids_ = std::move(identifier_field_ids);
55+
return schema;
56+
}
4257

4358
Result<std::unique_ptr<Schema>> Schema::Make(
4459
std::vector<SchemaField> fields, int32_t schema_id,
@@ -53,10 +68,69 @@ Result<std::unique_ptr<Schema>> Schema::Make(
5368
}
5469
fresh_identifier_ids.push_back(field.value().get().field_id());
5570
}
71+
72+
if (!fresh_identifier_ids.empty()) {
73+
auto id_to_parent = IndexParents(*schema);
74+
for (auto field_id : fresh_identifier_ids) {
75+
ICEBERG_RETURN_UNEXPECTED(
76+
ValidateIdentifierFields(field_id, *schema, id_to_parent));
77+
}
78+
}
79+
5680
schema->identifier_field_ids_ = std::move(fresh_identifier_ids);
5781
return schema;
5882
}
5983

84+
Status Schema::ValidateIdentifierFields(
85+
int32_t field_id, const Schema& schema,
86+
const std::unordered_map<int32_t, int32_t>& id_to_parent) {
87+
ICEBERG_ASSIGN_OR_RAISE(auto field_opt, schema.FindFieldById(field_id));
88+
ICEBERG_PRECHECK(field_opt.has_value(),
89+
"Cannot add field {} as an identifier field: field does not exist",
90+
field_id);
91+
92+
const SchemaField& field = field_opt.value().get();
93+
ICEBERG_PRECHECK(
94+
field.type()->is_primitive(),
95+
"Cannot add field {} as an identifier field: not a primitive type field", field_id);
96+
ICEBERG_PRECHECK(!field.optional(),
97+
"Cannot add field {} as an identifier field: not a required field",
98+
field_id);
99+
ICEBERG_PRECHECK(
100+
field.type()->type_id() != TypeId::kDouble &&
101+
field.type()->type_id() != TypeId::kFloat,
102+
"Cannot add field {} as an identifier field: must not be float or double field",
103+
field_id);
104+
105+
// check whether the nested field is in a chain of required struct fields
106+
// exploring from root for better error message for list and map types
107+
std::stack<int32_t> ancestors;
108+
auto parent_it = id_to_parent.find(field.field_id());
109+
while (parent_it != id_to_parent.end()) {
110+
ancestors.push(parent_it->second);
111+
parent_it = id_to_parent.find(parent_it->second);
112+
}
113+
114+
while (!ancestors.empty()) {
115+
ICEBERG_ASSIGN_OR_RAISE(auto parent_opt, schema.FindFieldById(ancestors.top()));
116+
ICEBERG_PRECHECK(
117+
parent_opt.has_value(),
118+
"Cannot add field {} as an identifier field: parent field id {} does not exist",
119+
field_id, ancestors.top());
120+
const SchemaField& parent = parent_opt.value().get();
121+
ICEBERG_PRECHECK(
122+
parent.type()->type_id() == TypeId::kStruct,
123+
"Cannot add field {} as an identifier field: must not be nested in {}", field_id,
124+
*parent.type());
125+
ICEBERG_PRECHECK(!parent.optional(),
126+
"Cannot add field {} as an identifier field: must not be nested in "
127+
"optional field {}",
128+
field_id, parent.field_id());
129+
ancestors.pop();
130+
}
131+
return {};
132+
}
133+
60134
const std::shared_ptr<Schema>& Schema::EmptySchema() {
61135
static const auto empty_schema =
62136
std::make_shared<Schema>(std::vector<SchemaField>{}, kInitialSchemaId);

src/iceberg/schema.h

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,47 @@ class ICEBERG_EXPORT Schema : public StructType {
5252
/// \brief Special value to select all columns from manifest files.
5353
static constexpr std::string_view kAllColumns = "*";
5454

55-
explicit Schema(std::vector<SchemaField> fields, int32_t schema_id = kInitialSchemaId,
56-
std::vector<int32_t> identifier_field_ids = {});
55+
explicit Schema(std::vector<SchemaField> fields, int32_t schema_id = kInitialSchemaId);
56+
57+
/// \brief Create a schema.
58+
///
59+
/// \param fields The fields that make up the schema.
60+
/// \param schema_id The unique identifier for this schema (default:kInitialSchemaId).
61+
/// \param identifier_field_ids Field IDs that uniquely identify rows in the table.
62+
/// \return A new Schema instance or Status if failed.
63+
static Result<std::unique_ptr<Schema>> Make(std::vector<SchemaField> fields,
64+
int32_t schema_id,
65+
std::vector<int32_t> identifier_field_ids);
5766

5867
/// \brief Create a schema.
5968
///
6069
/// \param fields The fields that make up the schema.
6170
/// \param schema_id The unique identifier for this schema (default: kInitialSchemaId).
6271
/// \param identifier_field_names Canonical names of fields that uniquely identify rows
63-
/// in the table (default: empty). \return A new Schema instance or Status if failed.
72+
/// in the table.
73+
/// \return A new Schema instance or Status if failed.
6474
static Result<std::unique_ptr<Schema>> Make(
65-
std::vector<SchemaField> fields, int32_t schema_id = kInitialSchemaId,
66-
const std::vector<std::string>& identifier_field_names = {});
75+
std::vector<SchemaField> fields, int32_t schema_id,
76+
const std::vector<std::string>& identifier_field_names);
77+
78+
/// \brief Validate that the identifier field with the given ID is valid for the schema
79+
///
80+
/// This method checks that the specified field ID represents a valid identifier field
81+
/// according to Iceberg's identifier field requirements. It verifies that the field:
82+
/// - exists in the schema
83+
/// - is a primitive type
84+
/// - is not optional (required field)
85+
/// - is not a float or double type
86+
/// - is not nested within optional or non-struct parent fields
87+
///
88+
/// \param field_id The ID of the field to validate as an identifier field.
89+
/// \param schema The schema containing the field to validate.
90+
/// \param id_to_parent A mapping from field IDs to their parent field IDs for nested
91+
/// field validation.
92+
/// \return Status indicating success or failure of the validation.
93+
static Status ValidateIdentifierFields(
94+
int32_t field_id, const Schema& schema,
95+
const std::unordered_map<int32_t, int32_t>& id_to_parent);
6796

6897
/// \brief Get an empty schema.
6998
///

src/iceberg/table_metadata.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -958,9 +958,9 @@ Result<int32_t> TableMetadataBuilder::Impl::AddSchema(const Schema& schema,
958958

959959
metadata_.last_column_id = new_last_column_id;
960960

961-
auto new_schema =
962-
std::make_shared<Schema>(schema.fields() | std::ranges::to<std::vector>(),
963-
new_schema_id, schema.IdentifierFieldIds());
961+
ICEBERG_ASSIGN_OR_RAISE(std::shared_ptr<Schema> new_schema,
962+
Schema::Make(schema.fields() | std::ranges::to<std::vector>(),
963+
new_schema_id, schema.IdentifierFieldIds()))
964964

965965
if (!schema_found) {
966966
metadata_.schemas.push_back(new_schema);

src/iceberg/test/assign_id_visitor_test.cc

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,9 @@ std::shared_ptr<Type> CreateNestedStruct() {
7474
});
7575
}
7676

77-
Schema CreateNestedSchema(std::vector<int32_t> identifier_field_ids = {}) {
78-
return Schema(
77+
Result<std::unique_ptr<Schema>> CreateNestedSchema(
78+
std::vector<int32_t> identifier_field_ids = {}) {
79+
return Schema::Make(
7980
{
8081
SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
8182
SchemaField::MakeOptional(/*field_id=*/20, "list", CreateListOfStruct()),
@@ -108,11 +109,11 @@ TEST(AssignFreshIdVisitorTest, FlatSchema) {
108109
}
109110

110111
TEST(AssignFreshIdVisitorTest, NestedSchema) {
111-
Schema schema = CreateNestedSchema();
112+
ICEBERG_UNWRAP_OR_FAIL(auto schema, CreateNestedSchema());
112113
std::atomic<int32_t> id = 0;
113114
auto next_id = [&id]() { return ++id; };
114115
ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
115-
AssignFreshIds(Schema::kInitialSchemaId, schema, next_id));
116+
AssignFreshIds(Schema::kInitialSchemaId, *schema, next_id));
116117

117118
ASSERT_EQ(4, fresh_schema->fields().size());
118119
for (int32_t i = 0; i < fresh_schema->fields().size(); ++i) {
@@ -169,20 +170,16 @@ TEST(AssignFreshIdVisitorTest, NestedSchema) {
169170
}
170171

171172
TEST(AssignFreshIdVisitorTest, RefreshIdentifierId) {
172-
std::atomic<int32_t> id = 0;
173+
int32_t id = 0;
173174
auto next_id = [&id]() { return ++id; };
174175

175-
Schema invalid_schema = CreateNestedSchema({10, 400});
176-
// Invalid identified field id
177-
auto result = AssignFreshIds(Schema::kInitialSchemaId, invalid_schema, next_id);
178-
EXPECT_THAT(result, IsError(ErrorKind::kInvalidSchema));
179-
EXPECT_THAT(result, HasErrorMessage("Cannot find"));
180-
181-
id = 0;
182-
Schema schema = CreateNestedSchema({10, 301});
176+
ICEBERG_UNWRAP_OR_FAIL(auto schema, CreateNestedSchema({10, 301}));
183177
ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
184-
AssignFreshIds(Schema::kInitialSchemaId, schema, next_id));
178+
AssignFreshIds(Schema::kInitialSchemaId, *schema, next_id));
185179
EXPECT_THAT(fresh_schema->IdentifierFieldIds(), testing::ElementsAre(1, 12));
180+
ICEBERG_UNWRAP_OR_FAIL(auto identifier_field_names,
181+
fresh_schema->IdentifierFieldNames());
182+
EXPECT_THAT(identifier_field_names, testing::ElementsAre("id", "struct.outer_id"));
186183
}
187184

188185
} // namespace iceberg

src/iceberg/test/metadata_serde_test.cc

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -139,12 +139,13 @@ TEST(MetadataSerdeTest, DeserializeV2Valid) {
139139
/*optional=*/false)},
140140
/*schema_id=*/0);
141141

142-
auto expected_schema_2 = std::make_shared<Schema>(
143-
std::vector<SchemaField>{SchemaField::MakeRequired(1, "x", int64()),
144-
SchemaField::MakeRequired(2, "y", int64()),
145-
SchemaField::MakeRequired(3, "z", int64())},
146-
/*schema_id=*/1,
147-
/*identifier_field_ids=*/std::vector<int32_t>{1, 2});
142+
ICEBERG_UNWRAP_OR_FAIL(
143+
std::shared_ptr<Schema> expected_schema_2,
144+
Schema::Make(std::vector<SchemaField>{SchemaField::MakeRequired(1, "x", int64()),
145+
SchemaField::MakeRequired(2, "y", int64()),
146+
SchemaField::MakeRequired(3, "z", int64())},
147+
/*schema_id=*/1,
148+
/*identifier_field_ids=*/std::vector<int32_t>{1, 2}));
148149

149150
auto expected_spec_result = PartitionSpec::Make(
150151
/*spec_id=*/0,

0 commit comments

Comments
 (0)