77#include " parquet_reader.hpp"
88#include " reader/string_column_reader.hpp"
99#include " reader/struct_column_reader.hpp"
10+ #include " reader/variant_column_reader.hpp"
1011#include " zstd/common/xxhash.hpp"
1112#include " duckdb/common/types/blob.hpp"
1213#include " duckdb/common/types/time.hpp"
1314#include " duckdb/common/types/value.hpp"
1415#include " duckdb/storage/statistics/struct_stats.hpp"
16+ #include " duckdb/storage/statistics/list_stats.hpp"
1517#include " duckdb/planner/filter/constant_filter.hpp"
1618#include " reader/uuid_column_reader.hpp"
19+ #include " duckdb/common/type_visitor.hpp"
1720
1821namespace duckdb {
1922
@@ -319,17 +322,100 @@ Value ParquetStatisticsUtils::ConvertValueInternal(const LogicalType &type, cons
319322 }
320323}
321324
325+ static bool ConvertUnshreddedStats (BaseStatistics &result, optional_ptr<BaseStatistics> input_p) {
326+ D_ASSERT (result.GetType ().id () == LogicalTypeId::UINTEGER);
327+
328+ if (!input_p) {
329+ return false ;
330+ }
331+ auto &input = *input_p;
332+ D_ASSERT (input.GetType ().id () == LogicalTypeId::BLOB);
333+ result.CopyValidity (input);
334+
335+ auto min = StringStats::Min (input);
336+ auto max = StringStats::Max (input);
337+
338+ if (!result.CanHaveNoNull ()) {
339+ return true ;
340+ }
341+
342+ if (min.empty () && max.empty ()) {
343+ // ! All non-shredded values are NULL or VARIANT_NULL, set the stats to indicate this
344+ NumericStats::SetMin<uint32_t >(result, 0 );
345+ NumericStats::SetMax<uint32_t >(result, 0 );
346+ result.SetHasNoNull ();
347+ }
348+ return true ;
349+ }
350+
351+ static bool ConvertShreddedStats (BaseStatistics &result, optional_ptr<BaseStatistics> input_p);
352+
353+ static bool ConvertShreddedStatsItem (BaseStatistics &result, BaseStatistics &input) {
354+ D_ASSERT (result.GetType ().id () == LogicalTypeId::STRUCT);
355+ D_ASSERT (input.GetType ().id () == LogicalTypeId::STRUCT);
356+
357+ auto &untyped_value_index_stats = StructStats::GetChildStats (result, 0 );
358+ auto &typed_value_result = StructStats::GetChildStats (result, 1 );
359+
360+ auto &value_stats = StructStats::GetChildStats (input, 0 );
361+ auto &typed_value_input = StructStats::GetChildStats (input, 1 );
362+
363+ if (!ConvertUnshreddedStats (untyped_value_index_stats, value_stats)) {
364+ return false ;
365+ }
366+ if (!ConvertShreddedStats (typed_value_result, typed_value_input)) {
367+ return false ;
368+ }
369+ return true ;
370+ }
371+
372+ static bool ConvertShreddedStats (BaseStatistics &result, optional_ptr<BaseStatistics> input_p) {
373+ if (!input_p) {
374+ return false ;
375+ }
376+ auto &input = *input_p;
377+ result.CopyValidity (input);
378+
379+ auto type_id = result.GetType ().id ();
380+ if (type_id == LogicalTypeId::LIST) {
381+ auto &child_result = ListStats::GetChildStats (result);
382+ auto &child_input = ListStats::GetChildStats (input);
383+ return ConvertShreddedStatsItem (child_result, child_input);
384+ }
385+ if (type_id == LogicalTypeId::STRUCT) {
386+ auto field_count = StructType::GetChildCount (result.GetType ());
387+ for (idx_t i = 0 ; i < field_count; i++) {
388+ auto &result_field = StructStats::GetChildStats (result, i);
389+ auto &input_field = StructStats::GetChildStats (input, i);
390+ if (!ConvertShreddedStatsItem (result_field, input_field)) {
391+ return false ;
392+ }
393+ }
394+ return true ;
395+ }
396+ result.Copy (input);
397+ return true ;
398+ }
399+
322400unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics (const ParquetColumnSchema &schema,
323401 const vector<ColumnChunk> &columns,
324402 bool can_have_nan) {
325403 // Not supported types
326404 auto &type = schema.type ;
327- if (type.id () == LogicalTypeId::ARRAY || type.id () == LogicalTypeId::MAP || type. id () == LogicalTypeId::LIST ) {
405+ if (type.id () == LogicalTypeId::ARRAY || type.id () == LogicalTypeId::MAP) {
328406 return nullptr ;
329407 }
330408
331409 unique_ptr<BaseStatistics> row_group_stats;
332410
411+ if (type.id () == LogicalTypeId::LIST) {
412+ auto list_stats = ListStats::CreateUnknown (type);
413+ auto &child_schema = schema.children [0 ];
414+ auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics (child_schema, columns, can_have_nan);
415+ ListStats::SetChildStats (list_stats, std::move (child_stats));
416+ row_group_stats = list_stats.ToUnique ();
417+ return row_group_stats;
418+ }
333419 // Structs are handled differently (they dont have stats)
334420 if (type.id () == LogicalTypeId::STRUCT) {
335421 auto struct_stats = StructStats::CreateUnknown (type);
@@ -340,19 +426,52 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
340426 StructStats::SetChildStats (struct_stats, i, std::move (child_stats));
341427 }
342428 row_group_stats = struct_stats.ToUnique ();
343-
344- // null count is generic
345- if (row_group_stats) {
346- row_group_stats->Set (StatsInfo::CAN_HAVE_NULL_AND_VALID_VALUES);
347- }
348429 return row_group_stats;
349430 } else if (schema.schema_type == ParquetColumnSchemaType::VARIANT) {
350- // ! FIXME: there are situations where VARIANT columns can have stats
351- return nullptr ;
431+ auto children_count = schema.children .size ();
432+ if (children_count != 3 ) {
433+ return nullptr ;
434+ }
435+ // ! Create the VARIANT stats
436+ auto &typed_value = schema.children [2 ];
437+ LogicalType logical_type;
438+ if (!VariantColumnReader::TypedValueLayoutToType (typed_value.type , logical_type)) {
439+ // ! We couldn't convert the parquet typed_value to a structured type (likely because a nested 'typed_value'
440+ // ! field is missing)
441+ return nullptr ;
442+ }
443+ auto shredding_type = TypeVisitor::VisitReplace (logical_type, [](const LogicalType &type) {
444+ return LogicalType::STRUCT ({{" untyped_value_index" , LogicalType::UINTEGER}, {" typed_value" , type}});
445+ });
446+ auto variant_stats = VariantStats::CreateShredded (shredding_type);
447+
448+ // ! Take the root stats
449+ auto &shredded_stats = VariantStats::GetShreddedStats (variant_stats);
450+ auto &untyped_value_index_stats = StructStats::GetChildStats (shredded_stats, 0 );
451+ auto &typed_value_stats = StructStats::GetChildStats (shredded_stats, 1 );
452+
453+ // ! Convert the root 'value' -> 'untyped_value_index'
454+ auto &value = schema.children [1 ];
455+ D_ASSERT (value.name == " value" );
456+ auto value_stats = ParquetStatisticsUtils::TransformColumnStatistics (value, columns, can_have_nan);
457+ if (!ConvertUnshreddedStats (untyped_value_index_stats, value_stats.get ())) {
458+ // ! Couldn't convert the stats, or there are no stats
459+ return nullptr ;
460+ }
461+
462+ auto parquet_typed_value_stats =
463+ ParquetStatisticsUtils::TransformColumnStatistics (typed_value, columns, can_have_nan);
464+ if (!ConvertShreddedStats (typed_value_stats, parquet_typed_value_stats.get ())) {
465+ // ! Couldn't convert the stats, or there are no stats
466+ return nullptr ;
467+ }
468+ // ! Set validity to UNKNOWN
469+ variant_stats.SetHasNoNull ();
470+ variant_stats.SetHasNull ();
471+ return variant_stats.ToUnique ();
352472 }
353473
354474 // Otherwise, its a standard column with stats
355-
356475 auto &column_chunk = columns[schema.column_index ];
357476 if (!column_chunk.__isset .meta_data || !column_chunk.meta_data .__isset .statistics ) {
358477 // no stats present for row group
@@ -393,16 +512,18 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
393512 row_group_stats = CreateNumericStats (type, schema, parquet_stats);
394513 }
395514 break ;
515+ case LogicalTypeId::BLOB:
396516 case LogicalTypeId::VARCHAR: {
397517 auto string_stats = StringStats::CreateUnknown (type);
398- if (parquet_stats.__isset .min_value && StringColumnReader::IsValid (parquet_stats.min_value , true )) {
518+ const bool is_varchar = type.id () == LogicalTypeId::VARCHAR;
519+ if (parquet_stats.__isset .min_value && StringColumnReader::IsValid (parquet_stats.min_value , is_varchar)) {
399520 StringStats::SetMin (string_stats, parquet_stats.min_value );
400- } else if (parquet_stats.__isset .min && StringColumnReader::IsValid (parquet_stats.min , true )) {
521+ } else if (parquet_stats.__isset .min && StringColumnReader::IsValid (parquet_stats.min , is_varchar )) {
401522 StringStats::SetMin (string_stats, parquet_stats.min );
402523 }
403- if (parquet_stats.__isset .max_value && StringColumnReader::IsValid (parquet_stats.max_value , true )) {
524+ if (parquet_stats.__isset .max_value && StringColumnReader::IsValid (parquet_stats.max_value , is_varchar )) {
404525 StringStats::SetMax (string_stats, parquet_stats.max_value );
405- } else if (parquet_stats.__isset .max && StringColumnReader::IsValid (parquet_stats.max , true )) {
526+ } else if (parquet_stats.__isset .max && StringColumnReader::IsValid (parquet_stats.max , is_varchar )) {
406527 StringStats::SetMax (string_stats, parquet_stats.max );
407528 }
408529 row_group_stats = string_stats.ToUnique ();
@@ -456,7 +577,9 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
456577 break ;
457578 }
458579 default :
459- // no stats for you
580+ // no specific stats, only create unknown stats to hold validity information
581+ auto unknown_stats = BaseStatistics::CreateUnknown (type);
582+ row_group_stats = unknown_stats.ToUnique ();
460583 break ;
461584 } // end of type switch
462585
0 commit comments