Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ set(ICEBERG_SOURCES
util/temporal_util.cc
util/timepoint.cc
util/truncate_util.cc
util/type_util.cc
util/uuid.cc)

set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
Expand Down
1 change: 1 addition & 0 deletions src/iceberg/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ iceberg_sources = files(
'util/temporal_util.cc',
'util/timepoint.cc',
'util/truncate_util.cc',
'util/type_util.cc',
'util/uuid.cc',
)

Expand Down
2 changes: 1 addition & 1 deletion src/iceberg/partition_spec.cc
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ bool PartitionSpec::Equals(const PartitionSpec& other) const {
}

Status PartitionSpec::Validate(const Schema& schema, bool allow_missing_fields) const {
std::unordered_map<int32_t, int32_t> parents = indexParents(schema);
std::unordered_map<int32_t, int32_t> parents = IndexParents(schema);
for (const auto& partition_field : fields_) {
ICEBERG_ASSIGN_OR_RAISE(auto source_field,
schema.FindFieldById(partition_field.source_id()));
Expand Down
293 changes: 1 addition & 292 deletions src/iceberg/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,93 +27,12 @@
#include "iceberg/schema_internal.h"
#include "iceberg/type.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
#include "iceberg/util/formatter_internal.h"
#include "iceberg/util/macros.h"
#include "iceberg/util/type_util.h"
#include "iceberg/util/visit_type.h"

namespace iceberg {

/// \brief Type visitor that fills a caller-owned map from field id to a reference
/// to the corresponding SchemaField.
class IdToFieldVisitor {
public:
/// Binds the visitor to `id_to_field`. Only a reference to the map is stored, so
/// the map has to stay alive for as long as the visitor is used.
explicit IdToFieldVisitor(
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
id_to_field);
/// Records nothing for a primitive type and returns a default-constructed Status.
Status Visit(const PrimitiveType& type);
/// Inserts every field of `type` into the map and then visits that field's type.
/// Returns InvalidSchema when a field id is already present in the map.
Status Visit(const NestedType& type);

private:
// Output map owned by the caller; held by reference.
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field_;
};

/// \brief Type visitor that fills a caller-owned map from dotted field path to
/// field id.
///
/// Two paths are tracked for each field: the full path, written straight into the
/// output map, and a "short" path that leaves out the list-element / map-value name
/// when that element or value is a struct. Short paths are buffered internally and
/// merged into the output map by Finish().
class NameToIdVisitor {
public:
/// \param name_to_id Output map; only a reference is stored.
/// \param case_sensitive When false, each field name is lower-cased before it is
/// appended to a path.
/// \param quoting_func Optional transformation applied to every field name before
/// it is appended to a path; an empty function leaves names untouched.
explicit NameToIdVisitor(
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id,
bool case_sensitive = true,
std::function<std::string(std::string_view)> quoting_func = {});
/// Registers the list's first field under `path` / `short_path` and visits its
/// type. Returns InvalidSchema if the full path is already registered.
Status Visit(const ListType& type, const std::string& path,
const std::string& short_path);
/// Registers every field of the map under `path` / `short_path` and visits its
/// type. Returns InvalidSchema if a full path is already registered.
Status Visit(const MapType& type, const std::string& path,
const std::string& short_path);
/// Registers every field of the struct under `path` / `short_path` and visits its
/// type. Returns InvalidSchema if a full path is already registered.
Status Visit(const StructType& type, const std::string& path,
const std::string& short_path);
/// Registers nothing for a primitive type.
Status Visit(const PrimitiveType& type, const std::string& path,
const std::string& short_path);
/// Copies the buffered short paths into the output map; paths that are already
/// present in the output map keep their existing id.
void Finish();

private:
/// Appends `field_name` (quoted via `quoting_func_` when set, lower-cased when
/// `case_sensitive` is false) to `prefix`, separated by "." unless `prefix` is
/// empty.
std::string BuildPath(std::string_view prefix, std::string_view field_name,
bool case_sensitive);

private:
// Whether field names keep their original case when paths are built.
bool case_sensitive_;
// Output map owned by the caller; receives the full paths directly.
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id_;
// Short paths collected during the visit; merged into name_to_id_ by Finish().
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> short_name_to_id_;
// Optional per-name quoting hook; may be empty.
std::function<std::string(std::string_view)> quoting_func_;
};

/// \brief Type visitor that records, for each primitive field reached through
/// structs only, the sequence of child positions leading to it.
///
/// The result maps a field id to the list of indexes taken at every struct level on
/// the way down to that field.
class PositionPathVisitor {
 public:
  /// Stores the current position path under the id of the field being visited.
  /// Fails with InvalidSchema when no enclosing struct has assigned a field id yet,
  /// or when the id has already been recorded.
  Status Visit(const PrimitiveType& type) {
    if (current_field_id_ == kUnassignedFieldId) {
      return InvalidSchema("Current field id is not assigned, type: {}", type.ToString());
    }

    const auto [slot, inserted] =
        position_path_.try_emplace(current_field_id_, current_path_);
    if (!inserted) {
      return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}",
                           current_field_id_, slot->second, current_path_);
    }

    return {};
  }

  /// Visits every child of the struct with that child's index appended to the
  /// current path; the index is removed again once the child has been visited.
  Status Visit(const StructType& type) {
    const auto& members = type.fields();
    for (size_t pos = 0; pos < members.size(); ++pos) {
      const auto& member = members[pos];
      current_field_id_ = member.field_id();
      current_path_.push_back(pos);
      ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*member.type(), this));
      current_path_.pop_back();
    }
    return {};
  }

  // Non-struct types are not supported yet, but it is not an error.
  Status Visit(const ListType& type) { return {}; }
  Status Visit(const MapType& type) { return {}; }

  /// Hands the collected paths to the caller by moving them out of the visitor.
  std::unordered_map<int32_t, std::vector<size_t>> Finish() {
    return std::move(position_path_);
  }

 private:
  // Sentinel meaning "no struct field has been entered yet".
  constexpr static int32_t kUnassignedFieldId = -1;
  // Id of the struct field whose type is currently being visited.
  int32_t current_field_id_ = kUnassignedFieldId;
  // Child indexes taken from the root down to the current field.
  std::vector<size_t> current_path_;
  // Accumulated result: field id -> position path.
  std::unordered_map<int32_t, std::vector<size_t>> position_path_;
};

/// \brief Construct a schema from its fields and an optional schema id.
///
/// `fields` is moved into the StructType base; `schema_id` is stored as given.
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
: StructType(std::move(fields)), schema_id_(schema_id) {}

Expand Down Expand Up @@ -210,216 +129,6 @@ Result<std::unique_ptr<StructLikeAccessor>> Schema::GetAccessorById(
return NotFound("Cannot get accessor for field id: {}", field_id);
}

/// Binds the visitor to the caller's map. Only a reference is kept, so
/// `id_to_field` has to outlive the visitor.
IdToFieldVisitor::IdToFieldVisitor(
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field)
: id_to_field_(id_to_field) {}

/// Nothing is recorded for a primitive type; `type` is unused and a
/// default-constructed Status is returned.
Status IdToFieldVisitor::Visit(const PrimitiveType& type) { return {}; }

/// \brief Index every field of a nested type by its field id.
///
/// Each field is inserted into the bound map before its own type is visited, so
/// nested fields end up in the same map. Returns InvalidSchema as soon as a field
/// id is encountered a second time, and propagates any error raised while visiting
/// a child type.
Status IdToFieldVisitor::Visit(const NestedType& type) {
  for (const auto& child : type.fields()) {
    const bool inserted =
        id_to_field_.try_emplace(child.field_id(), std::cref(child)).second;
    if (!inserted) {
      return InvalidSchema("Duplicate field id found: {}", child.field_id());
    }
    ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*child.type(), this));
  }
  return {};
}

/// Stores the case-sensitivity flag, a reference to the caller's output map and
/// the (moved-in) quoting function. `name_to_id` has to outlive the visitor because
/// only a reference to it is kept.
NameToIdVisitor::NameToIdVisitor(
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id,
bool case_sensitive, std::function<std::string(std::string_view)> quoting_func)
: case_sensitive_(case_sensitive),
name_to_id_(name_to_id),
quoting_func_(std::move(quoting_func)) {}

/// \brief Register the element field of a list and descend into its type.
///
/// The full path always gains the element's name. The short path gains it only when
/// the element is not a struct, so fields of a struct element are reachable without
/// the element name in between.
///
/// \return InvalidSchema if the full path is already registered; otherwise the
/// outcome of visiting the element type.
Status NameToIdVisitor::Visit(const ListType& type, const std::string& path,
                              const std::string& short_path) {
  const auto& element = type.fields()[0];

  std::string full_name = BuildPath(path, element.name(), case_sensitive_);
  std::string short_name =
      element.type()->type_id() == TypeId::kStruct
          ? short_path
          : BuildPath(short_path, element.name(), case_sensitive_);

  if (const auto [pos, added] = name_to_id_.try_emplace(full_name, element.field_id());
      !added) {
    return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
                         pos->first, pos->second, element.field_id());
  }
  // A short path that is already taken keeps its first id.
  short_name_to_id_.try_emplace(short_name, element.field_id());

  ICEBERG_RETURN_UNEXPECTED(
      VisitTypeInline(*element.type(), this, full_name, short_name));
  return {};
}

/// \brief Register the fields of a map and descend into each field's type.
///
/// Every field extends the full path with its own name. The short path is extended
/// as well, except for a field named MapType::kValueName whose type is a struct: in
/// that case the incoming short path is reused unchanged.
///
/// \return InvalidSchema if a full path is already registered; otherwise the first
/// error produced while visiting a field's type, or success.
Status NameToIdVisitor::Visit(const MapType& type, const std::string& path,
                              const std::string& short_path) {
  for (const auto& entry : type.fields()) {
    std::string full_name = BuildPath(path, entry.name(), case_sensitive_);

    const bool is_struct_value = entry.name() == MapType::kValueName &&
                                 entry.type()->type_id() == TypeId::kStruct;
    std::string short_name =
        is_struct_value ? short_path
                        : BuildPath(short_path, entry.name(), case_sensitive_);

    if (const auto [pos, added] = name_to_id_.try_emplace(full_name, entry.field_id());
        !added) {
      return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
                           pos->first, pos->second, entry.field_id());
    }
    // A short path that is already taken keeps its first id.
    short_name_to_id_.try_emplace(short_name, entry.field_id());

    ICEBERG_RETURN_UNEXPECTED(
        VisitTypeInline(*entry.type(), this, full_name, short_name));
  }
  return {};
}

/// \brief Register every field of a struct and descend into each field's type.
///
/// Both the full path and the short path are extended with the field's name.
///
/// \return InvalidSchema if a full path is already registered; otherwise the first
/// error produced while visiting a field's type, or success.
Status NameToIdVisitor::Visit(const StructType& type, const std::string& path,
                              const std::string& short_path) {
  for (const auto& member : type.fields()) {
    std::string full_name = BuildPath(path, member.name(), case_sensitive_);
    std::string short_name = BuildPath(short_path, member.name(), case_sensitive_);

    if (const auto [pos, added] = name_to_id_.try_emplace(full_name, member.field_id());
        !added) {
      return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
                           pos->first, pos->second, member.field_id());
    }
    // A short path that is already taken keeps its first id.
    short_name_to_id_.try_emplace(short_name, member.field_id());

    ICEBERG_RETURN_UNEXPECTED(
        VisitTypeInline(*member.type(), this, full_name, short_name));
  }
  return {};
}

/// Nothing is registered for a primitive type; all three parameters are unused and
/// a default-constructed Status is returned.
Status NameToIdVisitor::Visit(const PrimitiveType& type, const std::string& path,
const std::string& short_path) {
return {};
}

/// \brief Append one field name to a dotted path.
///
/// The name is first passed through `quoting_func_` when one is set, then
/// lower-cased when `case_sensitive` is false. `prefix` itself is never modified.
///
/// \return The processed name alone when `prefix` is empty, otherwise
/// `prefix + "." + name`.
std::string NameToIdVisitor::BuildPath(std::string_view prefix,
                                       std::string_view field_name, bool case_sensitive) {
  std::string segment =
      quoting_func_ ? quoting_func_(field_name) : std::string(field_name);
  if (!case_sensitive) {
    segment = StringUtils::ToLower(segment);
  }

  if (prefix.empty()) {
    return segment;
  }
  return std::string(prefix) + "." + segment;
}

/// \brief Merge the buffered short paths into the output map.
///
/// A short path is added only when the output map has no entry under that name, so
/// ids registered under a full path are never overwritten.
void NameToIdVisitor::Finish() {
  for (const auto& [short_name, field_id] : short_name_to_id_) {
    name_to_id_.try_emplace(short_name, field_id);
  }
}

/// \brief Projects a type down to a chosen set of field ids.
///
/// The visitor walks a type tree and rebuilds it with only the requested fields.
/// With `select_full_types` set, selecting a field id keeps that field together with
/// everything nested beneath it. Without it, a selected primitive field is kept,
/// while a selected non-primitive field is still descended into and only survives
/// through the selected fields below it.
///
/// \note A null type in the result means nothing under the visited type was
/// selected. An error is returned when the projection cannot be built.
class PruneColumnVisitor {
 public:
  /// \param selected_ids Field ids to keep; held by reference, so the set has to
  /// outlive the visitor.
  /// \param select_full_types Whether a selected field keeps its whole sub-tree.
  PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
                     bool select_full_types)
      : selected_ids_(selected_ids), select_full_types_(select_full_types) {}

  /// Builds a copy of `field` that carries `type` instead of the field's own type.
  static SchemaField MakeField(const SchemaField& field, std::shared_ptr<Type> type) {
    return {field.field_id(), std::string(field.name()), std::move(type),
            field.optional(), std::string(field.doc())};
  }

  /// Dispatches on the runtime type id. Anything that is not a struct, list or map
  /// yields a null type.
  Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const {
    const auto kind = type->type_id();
    if (kind == TypeId::kStruct) {
      return Visit(internal::checked_pointer_cast<StructType>(type));
    }
    if (kind == TypeId::kList) {
      return Visit(internal::checked_pointer_cast<ListType>(type));
    }
    if (kind == TypeId::kMap) {
      return Visit(internal::checked_pointer_cast<MapType>(type));
    }
    return nullptr;
  }

  /// Decides what remains of a single field: its type unchanged when the field is
  /// selected and either full types are requested or the type is primitive,
  /// otherwise whatever projecting the field's type produces.
  Result<std::shared_ptr<Type>> Visit(const SchemaField& field) const {
    const bool keep_as_is = selected_ids_.contains(field.field_id()) &&
                            (select_full_types_ || field.type()->is_primitive());
    if (keep_as_is) {
      return field.type();
    }
    return Visit(field.type());
  }

  /// Projects a struct. Returns null when no child survives, the original struct
  /// when every child survives with its original type object, and a new struct made
  /// of the surviving children otherwise.
  Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type) const {
    std::vector<SchemaField> kept;
    bool unchanged = true;
    for (const auto& field : type->fields()) {
      ICEBERG_ASSIGN_OR_RAISE(auto projected, Visit(field));
      if (!projected) {
        continue;
      }
      if (projected != field.type()) {
        unchanged = false;
      }
      kept.emplace_back(MakeField(field, std::move(projected)));
    }

    if (kept.empty()) {
      return nullptr;
    }
    if (unchanged && kept.size() == type->fields().size()) {
      return type;
    }
    return std::make_shared<StructType>(std::move(kept));
  }

  /// Projects a list through its first (element) field. Returns null when the
  /// element is dropped, the original list when the element type is untouched, and
  /// a new list around the projected element otherwise.
  Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type) const {
    const auto& element = type->fields()[0];
    ICEBERG_ASSIGN_OR_RAISE(auto projected, Visit(element));
    if (projected == nullptr) {
      return nullptr;
    }
    if (projected == element.type()) {
      return type;
    }
    return std::make_shared<ListType>(MakeField(element, std::move(projected)));
  }

  /// Projects a map through its first (key) and second (value) fields. Returns null
  /// when both are dropped and the original map when the value type is untouched and
  /// the key is either untouched or dropped. Dropping the value while keeping a key
  /// is an InvalidArgument error. In the remaining case a new map is built, reusing
  /// the original key field when the key was dropped.
  Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type) const {
    const auto& key_field = type->fields()[0];
    const auto& value_field = type->fields()[1];
    ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field));
    ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field));

    const bool key_dropped = key_type == nullptr;
    const bool value_dropped = value_type == nullptr;

    if (key_dropped && value_dropped) {
      return nullptr;
    }
    if (value_type == value_field.type() &&
        (key_type == key_field.type() || key_dropped)) {
      return type;
    }
    if (value_dropped) {
      return InvalidArgument("Cannot project Map without value field");
    }
    return std::make_shared<MapType>(
        (key_dropped ? key_field : MakeField(key_field, std::move(key_type))),
        MakeField(value_field, std::move(value_type)));
  }

 private:
  // Ids requested by the caller; the referenced set is not owned.
  const std::unordered_set<int32_t>& selected_ids_;
  // True when a selected field keeps all of its nested fields.
  const bool select_full_types_;
};

Result<std::unique_ptr<Schema>> Schema::Select(std::span<const std::string> names,
bool case_sensitive) const {
const std::string kAllColumns = "*";
Expand Down
2 changes: 1 addition & 1 deletion src/iceberg/test/type_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,7 @@ TEST(TypeTest, IndexParents) {
points,
});

std::unordered_map<int32_t, int32_t> parent_index = iceberg::indexParents(root_struct);
std::unordered_map<int32_t, int32_t> parent_index = iceberg::IndexParents(root_struct);

// Verify top-level fields have no parent
ASSERT_EQ(parent_index.find(1), parent_index.end());
Expand Down
5 changes: 5 additions & 0 deletions src/iceberg/util/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,24 @@

install_headers(
[
'bucket_util.h',
'checked_cast.h',
'config.h',
'conversions.h',
'decimal.h',
'endian.h',
'formattable.h',
'formatter.h',
'int128.h',
'lazy.h',
'macros.h',
'partition_value_util.h',
'string_util.h',
'temporal_util.h',
'timepoint.h',
'truncate_util.h',
'type_util.h',
'uuid.h',
'visitor_generate.h',
'visit_type.h',
],
Expand Down
Loading
Loading