Skip to content

Commit 0fe048c

Browse files
Update vendored DuckDB sources to 365e207bc4
1 parent 33e1d56 commit 0fe048c

54 files changed

Lines changed: 1385 additions & 695 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

src/duckdb/extension/parquet/column_writer.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,9 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(ClientContext &cont
288288
//! Construct the column schema
289289
auto variant_column =
290290
ParquetColumnSchema::FromLogicalType(name, type, max_define, max_repeat, 0, null_type, allow_geometry);
291+
if (field_id && field_id->set) {
292+
variant_column.field_id = field_id->field_id;
293+
}
291294
vector<unique_ptr<ColumnWriter>> child_writers;
292295
child_writers.reserve(child_types.size());
293296

src/duckdb/extension/parquet/include/column_writer.hpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ class ColumnWriter {
122122
}
123123
return false;
124124
}
125-
virtual LogicalType TransformedType() {
125+
virtual LogicalType TransformedType() const {
126126
throw NotImplementedException("Writer does not have a transformed type");
127127
}
128128
virtual unique_ptr<Expression> TransformExpression(unique_ptr<BoundReferenceExpression> expr) {
@@ -179,6 +179,18 @@ class ColumnWriter {
179179
virtual void Write(ColumnWriterState &state, Vector &vector, idx_t count) = 0;
180180
virtual void FinalizeWrite(ColumnWriterState &state) = 0;
181181

182+
public:
183+
template <class TARGET>
184+
TARGET &Cast() {
185+
DynamicCastCheck<TARGET>(this);
186+
return reinterpret_cast<TARGET &>(*this);
187+
}
188+
template <class TARGET>
189+
const TARGET &Cast() const {
190+
D_ASSERT(dynamic_cast<const TARGET *>(this));
191+
return reinterpret_cast<const TARGET &>(*this);
192+
}
193+
182194
protected:
183195
void HandleDefineLevels(ColumnWriterState &state, ColumnWriterState *parent, const ValidityMask &validity,
184196
const idx_t count, const uint16_t define_value, const uint16_t null_value) const;
@@ -189,6 +201,8 @@ class ColumnWriter {
189201

190202
public:
191203
ParquetWriter &writer;
204+
//! The parent writer (if this is a nested field)
205+
optional_ptr<ColumnWriter> parent;
192206
ParquetColumnSchema column_schema;
193207
vector<string> schema_path;
194208
bool can_have_nulls;

src/duckdb/extension/parquet/include/reader/variant_column_reader.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class VariantColumnReader : public ColumnReader {
3535
idx_t GroupRowsAvailable() override;
3636
uint64_t TotalCompressedSize() override;
3737
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override;
38+
//! Converts the Parquet layout type of a shredded 'typed_value' field into a structured type, written to
//! 'logical_type'. Returns false when the layout cannot be converted (the statistics code treats this as
//! "no stats available").
//! NOTE(review): the contents of 'logical_type' on failure are not visible from here - confirm before relying on it.
static bool TypedValueLayoutToType(const LogicalType &typed_value, LogicalType &logical_type);
3839

3940
protected:
4041
idx_t metadata_reader_idx;

src/duckdb/extension/parquet/include/writer/list_column_writer.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ class ListColumnWriter : public ColumnWriter {
4848

4949
protected:
5050
ColumnWriter &GetChildWriter();
51+
const ColumnWriter &GetChildWriter() const;
5152
};
5253

5354
} // namespace duckdb

src/duckdb/extension/parquet/include/writer/struct_column_writer.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ class StructColumnWriter : public ColumnWriter {
1818
vector<unique_ptr<ColumnWriter>> child_writers_p)
1919
: ColumnWriter(writer, std::move(column_schema), std::move(schema_path_p)) {
2020
child_writers = std::move(child_writers_p);
21+
for (auto &writer : child_writers) {
22+
writer->parent = *this;
23+
}
2124
}
2225
~StructColumnWriter() override = default;
2326

src/duckdb/extension/parquet/include/writer/variant_column_writer.hpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,12 @@ struct VariantAnalyzeData {
2828
public:
2929
//! Map for every value what type it is
3030
variant_type_map type_map = {};
31+
uint32_t decimal_width;
32+
uint32_t decimal_scale;
33+
bool decimal_consistent = false;
34+
idx_t total_count = 0;
35+
3136
//! Map for every decimal value what physical type it has
32-
array<idx_t, 3> decimal_type_map = {};
3337
unique_ptr<ObjectAnalyzeData> object_data = nullptr;
3438
unique_ptr<ArrayAnalyzeData> array_data = nullptr;
3539
};
@@ -80,7 +84,7 @@ class VariantColumnWriter : public StructColumnWriter {
8084
bool HasTransform() override {
8185
return true;
8286
}
83-
LogicalType TransformedType() override {
87+
LogicalType TransformedType() const override {
8488
child_list_t<LogicalType> children;
8589
for (auto &writer : child_writers) {
8690
auto &child_name = writer->Schema().name;

src/duckdb/extension/parquet/parquet_statistics.cpp

Lines changed: 137 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,16 @@
77
#include "parquet_reader.hpp"
88
#include "reader/string_column_reader.hpp"
99
#include "reader/struct_column_reader.hpp"
10+
#include "reader/variant_column_reader.hpp"
1011
#include "zstd/common/xxhash.hpp"
1112
#include "duckdb/common/types/blob.hpp"
1213
#include "duckdb/common/types/time.hpp"
1314
#include "duckdb/common/types/value.hpp"
1415
#include "duckdb/storage/statistics/struct_stats.hpp"
16+
#include "duckdb/storage/statistics/list_stats.hpp"
1517
#include "duckdb/planner/filter/constant_filter.hpp"
1618
#include "reader/uuid_column_reader.hpp"
19+
#include "duckdb/common/type_visitor.hpp"
1720

1821
namespace duckdb {
1922

@@ -319,17 +322,100 @@ Value ParquetStatisticsUtils::ConvertValueInternal(const LogicalType &type, cons
319322
}
320323
}
321324

325+
//! Fills in the UINTEGER 'untyped_value_index' statistics from the BLOB statistics of a Parquet 'value' column.
//! The validity of the input is always copied over; the numeric range is only set when the input min and max are
//! both empty.
//! Returns false if there are no input statistics, true otherwise.
static bool ConvertUnshreddedStats(BaseStatistics &result, optional_ptr<BaseStatistics> input_p) {
	D_ASSERT(result.GetType().id() == LogicalTypeId::UINTEGER);
	if (!input_p) {
		return false;
	}

	auto &value_stats = *input_p;
	D_ASSERT(value_stats.GetType().id() == LogicalTypeId::BLOB);
	result.CopyValidity(value_stats);

	const auto min_blob = StringStats::Min(value_stats);
	const auto max_blob = StringStats::Max(value_stats);
	const bool no_untyped_values = min_blob.empty() && max_blob.empty();

	if (result.CanHaveNoNull() && no_untyped_values) {
		//! All non-shredded values are NULL or VARIANT_NULL, set the stats to indicate this
		NumericStats::SetMin<uint32_t>(result, 0);
		NumericStats::SetMax<uint32_t>(result, 0);
		result.SetHasNoNull();
	}
	return true;
}
350+
351+
static bool ConvertShreddedStats(BaseStatistics &result, optional_ptr<BaseStatistics> input_p);
352+
353+
//! Converts the statistics of a single shredded item.
//! Both 'result' and 'input' are STRUCT statistics with two children: child 0 of the input ('value') is converted
//! into child 0 of the result ('untyped_value_index'), child 1 ('typed_value') is converted recursively.
//! Returns false as soon as either conversion fails.
static bool ConvertShreddedStatsItem(BaseStatistics &result, BaseStatistics &input) {
	D_ASSERT(result.GetType().id() == LogicalTypeId::STRUCT);
	D_ASSERT(input.GetType().id() == LogicalTypeId::STRUCT);

	//! Target side: (untyped_value_index, typed_value)
	auto &target_untyped_index = StructStats::GetChildStats(result, 0);
	auto &target_typed_value = StructStats::GetChildStats(result, 1);

	//! Source side: (value, typed_value)
	auto &source_value = StructStats::GetChildStats(input, 0);
	auto &source_typed_value = StructStats::GetChildStats(input, 1);

	//! Short-circuits: the typed value is only converted when the untyped part succeeded
	return ConvertUnshreddedStats(target_untyped_index, source_value) &&
	       ConvertShreddedStats(target_typed_value, source_typed_value);
}
372+
//! Converts the statistics of a Parquet 'typed_value' column into the shredded statistics layout of 'result'.
//! LIST and STRUCT results are converted item by item through ConvertShreddedStatsItem, for any other type the
//! input statistics are copied as-is.
//! Returns false if there are no input statistics or if a nested conversion fails.
static bool ConvertShreddedStats(BaseStatistics &result, optional_ptr<BaseStatistics> input_p) {
	if (!input_p) {
		return false;
	}
	auto &source = *input_p;
	result.CopyValidity(source);

	switch (result.GetType().id()) {
	case LogicalTypeId::LIST: {
		//! Lists have a single child: the list element
		auto &target_element = ListStats::GetChildStats(result);
		auto &source_element = ListStats::GetChildStats(source);
		return ConvertShreddedStatsItem(target_element, source_element);
	}
	case LogicalTypeId::STRUCT: {
		//! Every field of the struct is a shredded item of its own, bail out on the first failure
		const auto child_count = StructType::GetChildCount(result.GetType());
		for (idx_t child_idx = 0; child_idx < child_count; child_idx++) {
			auto &target_field = StructStats::GetChildStats(result, child_idx);
			auto &source_field = StructStats::GetChildStats(source, child_idx);
			if (!ConvertShreddedStatsItem(target_field, source_field)) {
				return false;
			}
		}
		return true;
	}
	default:
		break;
	}
	//! Not a nested type: take over the input statistics unchanged
	result.Copy(source);
	return true;
}
322400
unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema,
323401
const vector<ColumnChunk> &columns,
324402
bool can_have_nan) {
325403
// Not supported types
326404
auto &type = schema.type;
327-
if (type.id() == LogicalTypeId::ARRAY || type.id() == LogicalTypeId::MAP || type.id() == LogicalTypeId::LIST) {
405+
if (type.id() == LogicalTypeId::ARRAY || type.id() == LogicalTypeId::MAP) {
328406
return nullptr;
329407
}
330408

331409
unique_ptr<BaseStatistics> row_group_stats;
332410

411+
if (type.id() == LogicalTypeId::LIST) {
412+
auto list_stats = ListStats::CreateUnknown(type);
413+
auto &child_schema = schema.children[0];
414+
auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns, can_have_nan);
415+
ListStats::SetChildStats(list_stats, std::move(child_stats));
416+
row_group_stats = list_stats.ToUnique();
417+
return row_group_stats;
418+
}
333419
// Structs are handled differently (they don't have stats)
334420
if (type.id() == LogicalTypeId::STRUCT) {
335421
auto struct_stats = StructStats::CreateUnknown(type);
@@ -340,19 +426,52 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
340426
StructStats::SetChildStats(struct_stats, i, std::move(child_stats));
341427
}
342428
row_group_stats = struct_stats.ToUnique();
343-
344-
// null count is generic
345-
if (row_group_stats) {
346-
row_group_stats->Set(StatsInfo::CAN_HAVE_NULL_AND_VALID_VALUES);
347-
}
348429
return row_group_stats;
349430
} else if (schema.schema_type == ParquetColumnSchemaType::VARIANT) {
350-
//! FIXME: there are situations where VARIANT columns can have stats
351-
return nullptr;
431+
auto children_count = schema.children.size();
432+
if (children_count != 3) {
433+
return nullptr;
434+
}
435+
//! Create the VARIANT stats
436+
auto &typed_value = schema.children[2];
437+
LogicalType logical_type;
438+
if (!VariantColumnReader::TypedValueLayoutToType(typed_value.type, logical_type)) {
439+
//! We couldn't convert the parquet typed_value to a structured type (likely because a nested 'typed_value'
440+
//! field is missing)
441+
return nullptr;
442+
}
443+
auto shredding_type = TypeVisitor::VisitReplace(logical_type, [](const LogicalType &type) {
444+
return LogicalType::STRUCT({{"untyped_value_index", LogicalType::UINTEGER}, {"typed_value", type}});
445+
});
446+
auto variant_stats = VariantStats::CreateShredded(shredding_type);
447+
448+
//! Take the root stats
449+
auto &shredded_stats = VariantStats::GetShreddedStats(variant_stats);
450+
auto &untyped_value_index_stats = StructStats::GetChildStats(shredded_stats, 0);
451+
auto &typed_value_stats = StructStats::GetChildStats(shredded_stats, 1);
452+
453+
//! Convert the root 'value' -> 'untyped_value_index'
454+
auto &value = schema.children[1];
455+
D_ASSERT(value.name == "value");
456+
auto value_stats = ParquetStatisticsUtils::TransformColumnStatistics(value, columns, can_have_nan);
457+
if (!ConvertUnshreddedStats(untyped_value_index_stats, value_stats.get())) {
458+
//! Couldn't convert the stats, or there are no stats
459+
return nullptr;
460+
}
461+
462+
auto parquet_typed_value_stats =
463+
ParquetStatisticsUtils::TransformColumnStatistics(typed_value, columns, can_have_nan);
464+
if (!ConvertShreddedStats(typed_value_stats, parquet_typed_value_stats.get())) {
465+
//! Couldn't convert the stats, or there are no stats
466+
return nullptr;
467+
}
468+
//! Set validity to UNKNOWN
469+
variant_stats.SetHasNoNull();
470+
variant_stats.SetHasNull();
471+
return variant_stats.ToUnique();
352472
}
353473

354474
// Otherwise, it's a standard column with stats
355-
356475
auto &column_chunk = columns[schema.column_index];
357476
if (!column_chunk.__isset.meta_data || !column_chunk.meta_data.__isset.statistics) {
358477
// no stats present for row group
@@ -393,16 +512,18 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
393512
row_group_stats = CreateNumericStats(type, schema, parquet_stats);
394513
}
395514
break;
515+
case LogicalTypeId::BLOB:
396516
case LogicalTypeId::VARCHAR: {
397517
auto string_stats = StringStats::CreateUnknown(type);
398-
if (parquet_stats.__isset.min_value && StringColumnReader::IsValid(parquet_stats.min_value, true)) {
518+
const bool is_varchar = type.id() == LogicalTypeId::VARCHAR;
519+
if (parquet_stats.__isset.min_value && StringColumnReader::IsValid(parquet_stats.min_value, is_varchar)) {
399520
StringStats::SetMin(string_stats, parquet_stats.min_value);
400-
} else if (parquet_stats.__isset.min && StringColumnReader::IsValid(parquet_stats.min, true)) {
521+
} else if (parquet_stats.__isset.min && StringColumnReader::IsValid(parquet_stats.min, is_varchar)) {
401522
StringStats::SetMin(string_stats, parquet_stats.min);
402523
}
403-
if (parquet_stats.__isset.max_value && StringColumnReader::IsValid(parquet_stats.max_value, true)) {
524+
if (parquet_stats.__isset.max_value && StringColumnReader::IsValid(parquet_stats.max_value, is_varchar)) {
404525
StringStats::SetMax(string_stats, parquet_stats.max_value);
405-
} else if (parquet_stats.__isset.max && StringColumnReader::IsValid(parquet_stats.max, true)) {
526+
} else if (parquet_stats.__isset.max && StringColumnReader::IsValid(parquet_stats.max, is_varchar)) {
406527
StringStats::SetMax(string_stats, parquet_stats.max);
407528
}
408529
row_group_stats = string_stats.ToUnique();
@@ -456,7 +577,9 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
456577
break;
457578
}
458579
default:
459-
// no stats for you
580+
// no specific stats, only create unknown stats to hold validity information
581+
auto unknown_stats = BaseStatistics::CreateUnknown(type);
582+
row_group_stats = unknown_stats.ToUnique();
460583
break;
461584
} // end of type switch
462585

0 commit comments

Comments
 (0)