diff --git a/be/src/exec/scan/file_scanner.cpp b/be/src/exec/scan/file_scanner.cpp index 64a0616674f040..e71b4a451562a7 100644 --- a/be/src/exec/scan/file_scanner.cpp +++ b/be/src/exec/scan/file_scanner.cpp @@ -58,6 +58,7 @@ #include "exprs/vexpr_fwd.h" #include "exprs/vslot_ref.h" #include "format/arrow/arrow_stream_reader.h" +#include "format/count_reader.h" #include "format/csv/csv_reader.h" #include "format/json/new_json_reader.h" #include "format/native/native_reader.h" @@ -125,6 +126,25 @@ FileScanner::FileScanner(RuntimeState* state, FileScanLocalState* local_state, i _input_tuple_desc = state->desc_tbl().get_tuple_descriptor(_params->src_tuple_id); _real_tuple_desc = _input_tuple_desc == nullptr ? _output_tuple_desc : _input_tuple_desc; _is_load = (_input_tuple_desc != nullptr); + _configure_file_scan_handlers(); +} + +void FileScanner::_configure_file_scan_handlers() { + if (_is_load) { + _init_src_block_handler = &FileScanner::_init_src_block_for_load; + _process_src_block_after_read_handler = + &FileScanner::_process_src_block_after_read_for_load; + _should_push_down_predicates_handler = &FileScanner::_should_push_down_predicates_for_load; + _should_enable_condition_cache_handler = + &FileScanner::_should_enable_condition_cache_for_load; + } else { + _init_src_block_handler = &FileScanner::_init_src_block_for_query; + _process_src_block_after_read_handler = + &FileScanner::_process_src_block_after_read_for_query; + _should_push_down_predicates_handler = &FileScanner::_should_push_down_predicates_for_query; + _should_enable_condition_cache_handler = + &FileScanner::_should_enable_condition_cache_for_query; + } } Status FileScanner::init(RuntimeState* state, const VExprContextSPtrs& conjuncts) { @@ -457,12 +477,6 @@ Status FileScanner::_get_block_wrapped(RuntimeState* state, Block* block, bool* // For query job, simply set _src_block_ptr to block. 
size_t read_rows = 0; RETURN_IF_ERROR(_init_src_block(block)); - if (_need_iceberg_rowid_column && _current_range.__isset.table_format_params && - _current_range.table_format_params.table_format_type == "iceberg") { - if (auto* iceberg_reader = dynamic_cast(_cur_reader.get())) { - iceberg_reader->set_row_id_column_position(_iceberg_rowid_column_pos); - } - } { SCOPED_TIMER(_get_block_timer); @@ -480,23 +494,7 @@ Status FileScanner::_get_block_wrapped(RuntimeState* state, Block* block, bool* // If the push_down_agg_type is COUNT, no need to do the rest, // because we only save a number in block. if (_get_push_down_agg_type() != TPushAggOp::type::COUNT) { - // Convert the src block columns type to string in-place. - RETURN_IF_ERROR(_cast_to_input_block(block)); - // FileReader can fill partition and missing columns itself - if (!_cur_reader->fill_all_columns()) { - // Fill rows in src block with partition columns from path. (e.g. Hive partition columns) - RETURN_IF_ERROR(_fill_columns_from_path(read_rows)); - // Fill columns not exist in file with null or default value - RETURN_IF_ERROR(_fill_missing_columns(read_rows)); - } - // Apply _pre_conjunct_ctxs to filter src block. - RETURN_IF_ERROR(_pre_filter_src_block()); - - // Convert src block to output block (dest block), string to dest data type and apply filters. - RETURN_IF_ERROR(_convert_to_output_block(block)); - // Truncate char columns or varchar columns if size is smaller than file columns - // or not found in the file column schema. - RETURN_IF_ERROR(_truncate_char_or_varchar_columns(block)); + RETURN_IF_ERROR(_process_src_block_after_read(block)); } } break; @@ -517,16 +515,15 @@ Status FileScanner::_get_block_wrapped(RuntimeState* state, Block* block, bool* * This is a temporary method, and will be replaced by tvf. 
*/ Status FileScanner::_check_output_block_types() { - if (_is_load) { - TFileFormatType::type format_type = _params->format_type; - if (format_type == TFileFormatType::FORMAT_PARQUET || - format_type == TFileFormatType::FORMAT_ORC) { - for (auto slot : _output_tuple_desc->slots()) { - if (is_complex_type(slot->type()->get_primitive_type())) { - return Status::InternalError( - "Parquet/orc doesn't support complex types in broker/stream load, " - "please use tvf(table value function) to insert complex types."); - } + // Only called from _init_src_block_for_load, so _is_load is always true. + TFileFormatType::type format_type = _params->format_type; + if (format_type == TFileFormatType::FORMAT_PARQUET || + format_type == TFileFormatType::FORMAT_ORC) { + for (auto slot : _output_tuple_desc->slots()) { + if (is_complex_type(slot->type()->get_primitive_type())) { + return Status::InternalError( + "Parquet/orc doesn't support complex types in broker/stream load, " + "please use tvf(table value function) to insert complex types."); } } } @@ -534,29 +531,22 @@ Status FileScanner::_check_output_block_types() { } Status FileScanner::_init_src_block(Block* block) { - if (!_is_load) { - _src_block_ptr = block; - - bool update_name_to_idx = _src_block_name_to_idx.empty(); - _iceberg_rowid_column_pos = -1; - if (_need_iceberg_rowid_column && _current_range.__isset.table_format_params && - _current_range.table_format_params.table_format_type == "iceberg") { - int row_id_idx = block->get_position_by_name(BeConsts::ICEBERG_ROWID_COL); - if (row_id_idx >= 0) { - _iceberg_rowid_column_pos = row_id_idx; - if (!update_name_to_idx && - !_src_block_name_to_idx.contains(BeConsts::ICEBERG_ROWID_COL)) { - update_name_to_idx = true; - } - } - } + DCHECK(_init_src_block_handler != nullptr); + return (this->*_init_src_block_handler)(block); +} - // Build name to index map only once on first call - if (update_name_to_idx) { - _src_block_name_to_idx = block->get_name_to_pos_map(); - } - return 
Status::OK(); +Status FileScanner::_init_src_block_for_query(Block* block) { + _src_block_ptr = block; + + // Build name to index map only once on first call. + if (_src_block_name_to_idx.empty()) { + _src_block_name_to_idx = block->get_name_to_pos_map(); } + return Status::OK(); +} + +Status FileScanner::_init_src_block_for_load(Block* block) { + static_cast(block); RETURN_IF_ERROR(_check_output_block_types()); // if (_src_block_init) { @@ -605,9 +595,7 @@ Status FileScanner::_init_src_block(Block* block) { } Status FileScanner::_cast_to_input_block(Block* block) { - if (!_is_load) { - return Status::OK(); - } + // Only called from _process_src_block_after_read_for_load, so _is_load is always true. SCOPED_TIMER(_cast_to_input_block_timer); // cast primitive type(PT0) to primitive type(PT1) uint32_t idx = 0; @@ -640,84 +628,8 @@ Status FileScanner::_cast_to_input_block(Block* block) { return Status::OK(); } -Status FileScanner::_fill_columns_from_path(size_t rows) { - if (!_fill_partition_from_path) { - return Status::OK(); - } - DataTypeSerDe::FormatOptions _text_formatOptions; - for (auto& kv : _partition_col_descs) { - auto doris_column = - _src_block_ptr->get_by_position(_src_block_name_to_idx[kv.first]).column; - // _src_block_ptr points to a mutable block created by this class itself, so const_cast can be used here. - IColumn* col_ptr = const_cast(doris_column.get()); - auto& [value, slot_desc] = kv.second; - auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); - Slice slice(value.data(), value.size()); - uint64_t num_deserialized = 0; - if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, - &num_deserialized, - _text_formatOptions) != Status::OK()) { - return Status::InternalError("Failed to fill partition column: {}={}", - slot_desc->col_name(), value); - } - if (num_deserialized != rows) { - return Status::InternalError( - "Failed to fill partition column: {}={} ." 
- "Number of rows expected to be written : {}, number of rows actually written : " - "{}", - slot_desc->col_name(), value, num_deserialized, rows); - } - } - return Status::OK(); -} - -Status FileScanner::_fill_missing_columns(size_t rows) { - if (_missing_cols.empty()) { - return Status::OK(); - } - - SCOPED_TIMER(_fill_missing_columns_timer); - for (auto& kv : _missing_col_descs) { - if (kv.second == nullptr) { - // no default column, fill with null - auto mutable_column = _src_block_ptr->get_by_position(_src_block_name_to_idx[kv.first]) - .column->assume_mutable(); - auto* nullable_column = static_cast(mutable_column.get()); - nullable_column->insert_many_defaults(rows); - } else { - // fill with default value - auto& ctx = kv.second; - ColumnPtr result_column_ptr; - // PT1 => dest primitive type - RETURN_IF_ERROR(ctx->execute(_src_block_ptr, result_column_ptr)); - if (result_column_ptr->use_count() == 1) { - // call resize because the first column of _src_block_ptr may not be filled by reader, - // so _src_block_ptr->rows() may return wrong result, cause the column created by `ctx->execute()` - // has only one row. - auto mutable_column = result_column_ptr->assume_mutable(); - mutable_column->resize(rows); - // result_column_ptr maybe a ColumnConst, convert it to a normal column - result_column_ptr = result_column_ptr->convert_to_full_column_if_const(); - auto origin_column_type = - _src_block_ptr->get_by_position(_src_block_name_to_idx[kv.first]).type; - bool is_nullable = origin_column_type->is_nullable(); - if (!_src_block_name_to_idx.contains(kv.first)) { - return Status::InternalError("Column {} not found in src block {}", kv.first, - _src_block_ptr->dump_structure()); - } - _src_block_ptr->replace_by_position( - _src_block_name_to_idx[kv.first], - is_nullable ? 
make_nullable(result_column_ptr) : result_column_ptr); - } - } - } - return Status::OK(); -} - Status FileScanner::_pre_filter_src_block() { - if (!_is_load) { - return Status::OK(); - } + // Only called from _process_src_block_after_read_for_load, so _is_load is always true. if (!_pre_conjunct_ctxs.empty()) { SCOPED_TIMER(_pre_filter_timer); auto origin_column_num = _src_block_ptr->columns(); @@ -730,9 +642,7 @@ Status FileScanner::_pre_filter_src_block() { } Status FileScanner::_convert_to_output_block(Block* block) { - if (!_is_load) { - return Status::OK(); - } + // Only called from _process_src_block_after_read_for_load, so _is_load is always true. SCOPED_TIMER(_convert_to_output_block_timer); // The block is passed from scanner context's free blocks, // which is initialized by output columns @@ -852,6 +762,118 @@ Status FileScanner::_convert_to_output_block(Block* block) { return Status::OK(); } +Status FileScanner::_process_src_block_after_read(Block* block) { + DCHECK(_process_src_block_after_read_handler != nullptr); + return (this->*_process_src_block_after_read_handler)(block); +} + +Status FileScanner::_process_src_block_after_read_for_query(Block* block) { + // Truncate CHAR/VARCHAR columns when target size is smaller than file schema. + // This is needed for external table queries with truncate_char_or_varchar_columns=true. + RETURN_IF_ERROR(_truncate_char_or_varchar_columns(block)); + return Status::OK(); +} + +Status FileScanner::_fill_columns_from_path(size_t rows) { + if (_partition_col_descs.empty()) { + return Status::OK(); + } + DataTypeSerDe::FormatOptions text_format_options; + for (auto& kv : _partition_col_descs) { + auto doris_column = + _src_block_ptr->get_by_position(_src_block_name_to_idx[kv.first]).column; + IColumn* col_ptr = const_cast(doris_column.get()); + // Skip if the reader already filled this column (e.g. ORC/Parquet readers + // fill partition columns internally via on_fill_partition_columns). 
+ if (col_ptr->size() >= rows) { + continue; + } + auto& [value, slot_desc] = kv.second; + auto text_serde = slot_desc->get_data_type_ptr()->get_serde(); + Slice slice(value.data(), value.size()); + uint64_t num_deserialized = 0; + if (_partition_value_is_null.contains(kv.first) && _partition_value_is_null[kv.first]) { + col_ptr->insert_many_defaults(rows); + } else if (text_serde->deserialize_column_from_fixed_json( + *col_ptr, slice, rows, &num_deserialized, text_format_options) != + Status::OK()) { + return Status::InternalError("Failed to fill partition column: {}={}", + slot_desc->col_name(), value); + } else if (num_deserialized != rows) { + return Status::InternalError( + "Failed to fill partition column: {}={}. " + "Number of rows expected: {}, actual: {}", + slot_desc->col_name(), value, rows, num_deserialized); + } + } + return Status::OK(); +} + +Status FileScanner::_fill_missing_columns(size_t rows) { + // For columns in the table that are not from the file and not partition columns, + // fill with default values or NULL. 
+ for (const auto& col_desc : _column_descs) { + if (col_desc.category != ColumnCategory::REGULAR && + col_desc.category != ColumnCategory::GENERATED) { + continue; + } + if (_is_file_slot.contains(col_desc.slot_desc->id())) { + continue; + } + auto it = _src_block_name_to_idx.find(col_desc.name); + if (it == _src_block_name_to_idx.end()) { + continue; + } + auto doris_column = _src_block_ptr->get_by_position(it->second).column; + IColumn* col_ptr = const_cast(doris_column.get()); + if (col_ptr->size() >= rows) { + continue; + } + size_t need_rows = rows - col_ptr->size(); + if (col_desc.default_expr != nullptr) { + Block default_block; + default_block.insert( + ColumnWithTypeAndName(col_desc.slot_desc->get_data_type_ptr()->create_column(), + col_desc.slot_desc->get_data_type_ptr(), col_desc.name)); + int result_column_id = 0; + RETURN_IF_ERROR(col_desc.default_expr->execute(&default_block, &result_column_id)); + auto& default_col = default_block.get_by_position(result_column_id).column; + for (size_t i = 0; i < need_rows; ++i) { + col_ptr->insert_from(*default_col, 0); + } + } else { + col_ptr->insert_many_defaults(need_rows); + } + } + return Status::OK(); +} + +Status FileScanner::_process_src_block_after_read_for_load(Block* block) { + // Convert the src block columns type in-place. + RETURN_IF_ERROR(_cast_to_input_block(block)); + // Compute row count from file columns (partition columns may be empty at this point). + size_t rows = 0; + for (size_t i = 0; i < _src_block_ptr->columns(); ++i) { + size_t s = _src_block_ptr->get_by_position(i).column->size(); + if (s > rows) { + rows = s; + } + } + // Fill partition columns from path for readers that do not handle them internally + // (e.g., CSV, JSON readers in broker/stream load). + RETURN_IF_ERROR(_fill_columns_from_path(rows)); + // Fill missing columns (non-file, non-partition) with default values or NULL. + RETURN_IF_ERROR(_fill_missing_columns(rows)); + // Apply _pre_conjunct_ctxs to filter src block. 
+ RETURN_IF_ERROR(_pre_filter_src_block()); + + // Convert src block to output block (dest block), then apply filters. + RETURN_IF_ERROR(_convert_to_output_block(block)); + // Truncate CHAR/VARCHAR columns when target size is smaller than file schema. + RETURN_IF_ERROR(_truncate_char_or_varchar_columns(block)); + return Status::OK(); +} + Status FileScanner::_truncate_char_or_varchar_columns(Block* block) { // Truncate char columns or varchar columns if size is smaller than file columns // or not found in the file column schema. @@ -927,6 +949,18 @@ Status FileScanner::_create_row_id_column_iterator() { return Status::OK(); } +void FileScanner::_fill_base_init_context(ReaderInitContext* ctx) { + ctx->column_descs = &_column_descs; + ctx->col_name_to_block_idx = &_src_block_name_to_idx; + ctx->state = _state; + ctx->tuple_descriptor = _real_tuple_desc; + ctx->row_descriptor = _default_val_row_desc.get(); + ctx->params = _params; + ctx->range = &_current_range; + ctx->table_info_node = TableSchemaChangeHelper::ConstNode::get_instance(); + ctx->push_down_agg_type = _get_push_down_agg_type(); +} + Status FileScanner::_get_next_reader() { while (true) { if (_cur_reader) { @@ -949,7 +983,7 @@ Status FileScanner::_get_next_reader() { const TFileRangeDesc& range = _current_range; _current_range_path = range.path; - if (!_partition_slot_descs.empty()) { + if (!_partition_slot_index_map.empty()) { // we need get partition columns first for runtime filter partition pruning RETURN_IF_ERROR(_generate_partition_columns()); @@ -991,11 +1025,13 @@ Status FileScanner::_get_next_reader() { } } - // JNI reader can only push down column value range - bool push_down_predicates = !_is_load && format_type != TFileFormatType::FORMAT_JNI; + bool push_down_predicates = _should_push_down_predicates(format_type); bool need_to_get_parsed_schema = false; switch (format_type) { case TFileFormatType::FORMAT_JNI: { + ReaderInitContext jni_ctx; + _fill_base_init_context(&jni_ctx); + if 
(range.__isset.table_format_params && range.table_format_params.table_format_type == "max_compute") { const auto* mc_desc = static_cast( @@ -1006,7 +1042,7 @@ Status FileScanner::_get_next_reader() { std::unique_ptr mc_reader = MaxComputeJniReader::create_unique( mc_desc, range.table_format_params.max_compute_params, _file_slot_descs, range, _state, _profile); - init_status = mc_reader->init_reader(); + init_status = static_cast(mc_reader.get())->init_reader(&jni_ctx); _cur_reader = std::move(mc_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "paimon") { @@ -1022,39 +1058,47 @@ Status FileScanner::_get_next_reader() { cpp_reader->set_predicate(std::move(predicate)); } } - init_status = cpp_reader->init_reader(); + init_status = + static_cast(cpp_reader.get())->init_reader(&jni_ctx); _cur_reader = std::move(cpp_reader); } else { - _cur_reader = PaimonJniReader::create_unique(_file_slot_descs, _state, _profile, - range, _params); - init_status = ((PaimonJniReader*)(_cur_reader.get()))->init_reader(); + auto paimon_reader = PaimonJniReader::create_unique(_file_slot_descs, _state, + _profile, range, _params); + init_status = + static_cast(paimon_reader.get())->init_reader(&jni_ctx); + _cur_reader = std::move(paimon_reader); } } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "hudi") { - _cur_reader = HudiJniReader::create_unique(*_params, - range.table_format_params.hudi_params, - _file_slot_descs, _state, _profile); - init_status = ((HudiJniReader*)_cur_reader.get())->init_reader(); - + auto hudi_reader = HudiJniReader::create_unique( + *_params, range.table_format_params.hudi_params, _file_slot_descs, _state, + _profile); + init_status = static_cast(hudi_reader.get())->init_reader(&jni_ctx); + _cur_reader = std::move(hudi_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "trino_connector") { - _cur_reader = 
TrinoConnectorJniReader::create_unique(_file_slot_descs, _state, - _profile, range); - init_status = ((TrinoConnectorJniReader*)(_cur_reader.get()))->init_reader(); + auto trino_reader = TrinoConnectorJniReader::create_unique(_file_slot_descs, _state, + _profile, range); + init_status = + static_cast(trino_reader.get())->init_reader(&jni_ctx); + _cur_reader = std::move(trino_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "jdbc") { // Extract jdbc params from table_format_params std::map jdbc_params( range.table_format_params.jdbc_params.begin(), range.table_format_params.jdbc_params.end()); - _cur_reader = JdbcJniReader::create_unique(_file_slot_descs, _state, _profile, - jdbc_params); - init_status = ((JdbcJniReader*)(_cur_reader.get()))->init_reader(); + auto jdbc_reader = JdbcJniReader::create_unique(_file_slot_descs, _state, _profile, + jdbc_params); + init_status = static_cast(jdbc_reader.get())->init_reader(&jni_ctx); + _cur_reader = std::move(jdbc_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "iceberg") { - _cur_reader = IcebergSysTableJniReader::create_unique(_file_slot_descs, _state, - _profile, range, _params); - init_status = ((IcebergSysTableJniReader*)(_cur_reader.get()))->init_reader(); + auto iceberg_sys_reader = IcebergSysTableJniReader::create_unique( + _file_slot_descs, _state, _profile, range, _params); + init_status = static_cast(iceberg_sys_reader.get()) + ->init_reader(&jni_ctx); + _cur_reader = std::move(iceberg_sys_reader); } // Set col_name_to_block_idx for JNI readers to avoid repeated map creation if (_cur_reader) { @@ -1068,23 +1112,10 @@ Status FileScanner::_get_next_reader() { auto file_meta_cache_ptr = _should_enable_file_meta_cache() ? 
ExecEnv::GetInstance()->file_meta_cache() : nullptr; - std::unique_ptr parquet_reader = ParquetReader::create_unique( - _profile, *_params, range, _state->query_options().batch_size, - &_state->timezone_obj(), _io_ctx.get(), _state, file_meta_cache_ptr, - _state->query_options().enable_parquet_lazy_mat); - - if (_row_id_column_iterator_pair.second != -1) { - RETURN_IF_ERROR(_create_row_id_column_iterator()); - parquet_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); - } - - // ATTN: the push down agg type may be set back to NONE, - // see IcebergTableReader::init_row_filters for example. - parquet_reader->set_push_down_agg_type(_get_push_down_agg_type()); if (push_down_predicates) { RETURN_IF_ERROR(_process_late_arrival_conjuncts()); } - RETURN_IF_ERROR(_init_parquet_reader(std::move(parquet_reader), file_meta_cache_ptr)); + RETURN_IF_ERROR(_init_parquet_reader(file_meta_cache_ptr)); need_to_get_parsed_schema = true; break; @@ -1093,20 +1124,10 @@ Status FileScanner::_get_next_reader() { auto file_meta_cache_ptr = _should_enable_file_meta_cache() ? 
ExecEnv::GetInstance()->file_meta_cache() : nullptr; - std::unique_ptr orc_reader = OrcReader::create_unique( - _profile, _state, *_params, range, _state->query_options().batch_size, - _state->timezone(), _io_ctx.get(), file_meta_cache_ptr, - _state->query_options().enable_orc_lazy_mat); - if (_row_id_column_iterator_pair.second != -1) { - RETURN_IF_ERROR(_create_row_id_column_iterator()); - orc_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); - } - - orc_reader->set_push_down_agg_type(_get_push_down_agg_type()); if (push_down_predicates) { RETURN_IF_ERROR(_process_late_arrival_conjuncts()); } - RETURN_IF_ERROR(_init_orc_reader(std::move(orc_reader), file_meta_cache_ptr)); + RETURN_IF_ERROR(_init_orc_reader(file_meta_cache_ptr)); need_to_get_parsed_schema = true; break; @@ -1122,15 +1143,20 @@ Status FileScanner::_get_next_reader() { case TFileFormatType::FORMAT_PROTO: { auto reader = CsvReader::create_unique(_state, _profile, &_counter, *_params, range, _file_slot_descs, _io_ctx.get()); - - init_status = reader->init_reader(_is_load); + CsvInitContext csv_ctx; + _fill_base_init_context(&csv_ctx); + csv_ctx.is_load = _is_load; + init_status = static_cast(reader.get())->init_reader(&csv_ctx); _cur_reader = std::move(reader); break; } case TFileFormatType::FORMAT_TEXT: { auto reader = TextReader::create_unique(_state, _profile, &_counter, *_params, range, _file_slot_descs, _io_ctx.get()); - init_status = reader->init_reader(_is_load); + CsvInitContext text_ctx; + _fill_base_init_context(&text_ctx); + text_ctx.is_load = _is_load; + init_status = static_cast(reader.get())->init_reader(&text_ctx); _cur_reader = std::move(reader); break; } @@ -1138,39 +1164,53 @@ Status FileScanner::_get_next_reader() { _cur_reader = NewJsonReader::create_unique(_state, _profile, &_counter, *_params, range, _file_slot_descs, &_scanner_eof, _io_ctx.get()); - init_status = ((NewJsonReader*)(_cur_reader.get())) - ->init_reader(_col_default_value_ctx, _is_load); + 
JsonInitContext json_ctx; + _fill_base_init_context(&json_ctx); + json_ctx.col_default_value_ctx = &_col_default_value_ctx; + json_ctx.is_load = _is_load; + init_status = _cur_reader->init_reader(&json_ctx); break; } case TFileFormatType::FORMAT_WAL: { _cur_reader = WalReader::create_unique(_state); - init_status = ((WalReader*)(_cur_reader.get()))->init_reader(_output_tuple_desc); + WalInitContext wal_ctx; + _fill_base_init_context(&wal_ctx); + wal_ctx.output_tuple_descriptor = _output_tuple_desc; + init_status = _cur_reader->init_reader(&wal_ctx); break; } case TFileFormatType::FORMAT_NATIVE: { auto reader = NativeReader::create_unique(_profile, *_params, range, _io_ctx.get(), _state); - init_status = reader->init_reader(); + ReaderInitContext native_ctx; + _fill_base_init_context(&native_ctx); + init_status = static_cast(reader.get())->init_reader(&native_ctx); _cur_reader = std::move(reader); need_to_get_parsed_schema = false; break; } case TFileFormatType::FORMAT_ARROW: { + ReaderInitContext arrow_ctx; + _fill_base_init_context(&arrow_ctx); + if (range.__isset.table_format_params && range.table_format_params.table_format_type == "remote_doris") { - _cur_reader = + auto doris_reader = RemoteDorisReader::create_unique(_file_slot_descs, _state, _profile, range); - init_status = ((RemoteDorisReader*)(_cur_reader.get()))->init_reader(); - if (_cur_reader) { - static_cast(_cur_reader.get()) - ->set_col_name_to_block_idx(&_src_block_name_to_idx); + init_status = + static_cast(doris_reader.get())->init_reader(&arrow_ctx); + if (doris_reader) { + doris_reader->set_col_name_to_block_idx(&_src_block_name_to_idx); } + _cur_reader = std::move(doris_reader); } else { - _cur_reader = + auto arrow_reader = ArrowStreamReader::create_unique(_state, _profile, &_counter, *_params, range, _file_slot_descs, _io_ctx.get()); - init_status = ((ArrowStreamReader*)(_cur_reader.get()))->init_reader(); + init_status = + static_cast(arrow_reader.get())->init_reader(&arrow_ctx); + 
_cur_reader = std::move(arrow_reader); } break; } @@ -1204,16 +1244,13 @@ Status FileScanner::_get_next_reader() { return Status::InternalError("failed to init reader, err: {}", init_status.to_string()); } - _cur_reader->set_push_down_agg_type(_get_push_down_agg_type()); - if (_get_push_down_agg_type() == TPushAggOp::type::COUNT && - range.__isset.table_format_params && - range.table_format_params.table_level_row_count >= 0) { - // This is a table level count push down operation, no need to call - // _set_fill_or_truncate_columns. - // in _set_fill_or_truncate_columns, we will use [range.start_offset, end offset] - // to filter the row group. But if this is count push down, the offset is undefined, - // causing incorrect row group filter and may return empty result. - } else { + // For table-level COUNT pushdown, offsets are undefined so we must skip + // _set_fill_or_truncate_columns (it uses [start_offset, end_offset] to + // filter row groups, which would produce incorrect empty results). + bool is_table_level_count = _get_push_down_agg_type() == TPushAggOp::type::COUNT && + range.__isset.table_format_params && + range.table_format_params.table_level_row_count >= 0; + if (!is_table_level_count) { Status status = _set_fill_or_truncate_columns(need_to_get_parsed_schema); if (status.is()) { // all parquet row groups are filtered continue; @@ -1222,14 +1259,32 @@ Status FileScanner::_get_next_reader() { status.to_string()); } } + + // Unified COUNT(*) pushdown: replace the real reader with CountReader + // decorator if the reader accepts COUNT and can provide a total row count. 
+ if (_cur_reader->get_push_down_agg_type() == TPushAggOp::type::COUNT) { + int64_t total_rows = -1; + if (is_table_level_count) { + // FE-provided count (may account for table-format deletions) + total_rows = range.table_format_params.table_level_row_count; + } else if (_cur_reader->supports_count_pushdown()) { + // File metadata count (ORC footer / Parquet row groups) + total_rows = _cur_reader->get_total_rows(); + } + if (total_rows >= 0) { + auto batch_size = _state->query_options().batch_size; + _cur_reader = std::make_unique(total_rows, batch_size, + std::move(_cur_reader)); + } + } _cur_reader_eof = false; break; } return Status::OK(); } -Status FileScanner::_init_parquet_reader(std::unique_ptr&& parquet_reader, - FileMetaCache* file_meta_cache_ptr) { +Status FileScanner::_init_parquet_reader(FileMetaCache* file_meta_cache_ptr, + std::unique_ptr parquet_reader) { const TFileRangeDesc& range = _current_range; Status init_status = Status::OK(); @@ -1237,227 +1292,237 @@ Status FileScanner::_init_parquet_reader(std::unique_ptr&& parque _local_state ? 
_local_state->cast()._slot_id_to_predicates : phmap::flat_hash_map>> {}; + + // Build unified ParquetInitContext (shared by all Parquet reader variants) + ParquetInitContext pctx; + _fill_base_init_context(&pctx); + pctx.conjuncts = &_push_down_conjuncts; + pctx.slot_id_to_predicates = &slot_id_to_predicates; + pctx.colname_to_slot_id = _col_name_to_slot_id; + pctx.not_single_slot_filter_conjuncts = &_not_single_slot_filter_conjuncts; + pctx.slot_id_to_filter_conjuncts = &_slot_id_to_filter_conjuncts; + if (range.__isset.table_format_params && range.table_format_params.table_format_type == "iceberg") { + // IcebergParquetReader IS-A ParquetReader (CRTP mixin), no wrapping needed std::unique_ptr iceberg_reader = IcebergParquetReader::create_unique( - std::move(parquet_reader), _profile, _state, *_params, range, _kv_cache, - _io_ctx.get(), file_meta_cache_ptr); - if (_need_iceberg_rowid_column) { - iceberg_reader->set_need_row_id_column(true); + _kv_cache, _profile, *_params, range, _state->query_options().batch_size, + &_state->timezone_obj(), _io_ctx.get(), _state, file_meta_cache_ptr); + + // Transfer properties + if (_row_id_column_iterator_pair.second != -1) { + RETURN_IF_ERROR(_create_row_id_column_iterator()); + iceberg_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); } if (_row_lineage_columns.row_id_column_idx != -1 || _row_lineage_columns.last_updated_sequence_number_column_idx != -1) { - std::shared_ptr row_lineage_columns; - row_lineage_columns = std::make_shared(); + auto row_lineage_columns = std::make_shared(); row_lineage_columns->row_id_column_idx = _row_lineage_columns.row_id_column_idx; row_lineage_columns->last_updated_sequence_number_column_idx = _row_lineage_columns.last_updated_sequence_number_column_idx; + const auto& iceberg_params = range.table_format_params.iceberg_params; + row_lineage_columns->first_row_id = + iceberg_params.__isset.first_row_id ? 
iceberg_params.first_row_id : -1; + row_lineage_columns->last_updated_sequence_number = + iceberg_params.__isset.last_updated_sequence_number + ? iceberg_params.last_updated_sequence_number + : -1; iceberg_reader->set_row_lineage_columns(std::move(row_lineage_columns)); } iceberg_reader->set_push_down_agg_type(_get_push_down_agg_type()); - init_status = iceberg_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, - slot_id_to_predicates, _real_tuple_desc, _default_val_row_desc.get(), - _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts); + init_status = static_cast(iceberg_reader.get())->init_reader(&pctx); _cur_reader = std::move(iceberg_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "paimon") { - std::unique_ptr paimon_reader = PaimonParquetReader::create_unique( - std::move(parquet_reader), _profile, _state, *_params, range, _kv_cache, - _io_ctx.get(), file_meta_cache_ptr); - init_status = paimon_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, - slot_id_to_predicates, _real_tuple_desc, _default_val_row_desc.get(), - _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts); - RETURN_IF_ERROR(paimon_reader->init_row_filters()); + // PaimonParquetReader IS-A ParquetReader, no wrapping needed + auto paimon_reader = PaimonParquetReader::create_unique( + _profile, *_params, range, _state->query_options().batch_size, + &_state->timezone_obj(), _kv_cache, _io_ctx.get(), _state, file_meta_cache_ptr); + if (_row_id_column_iterator_pair.second != -1) { + RETURN_IF_ERROR(_create_row_id_column_iterator()); + paimon_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); + } + paimon_reader->set_push_down_agg_type(_get_push_down_agg_type()); + init_status = static_cast(paimon_reader.get())->init_reader(&pctx); _cur_reader = std::move(paimon_reader); } else if 
(range.__isset.table_format_params && range.table_format_params.table_format_type == "hudi") { - std::unique_ptr hudi_reader = HudiParquetReader::create_unique( - std::move(parquet_reader), _profile, _state, *_params, range, _io_ctx.get(), - file_meta_cache_ptr); - init_status = hudi_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, - slot_id_to_predicates, _real_tuple_desc, _default_val_row_desc.get(), - _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts); + // HudiParquetReader IS-A ParquetReader, no wrapping needed + auto hudi_reader = HudiParquetReader::create_unique( + _profile, *_params, range, _state->query_options().batch_size, + &_state->timezone_obj(), _io_ctx.get(), _state, file_meta_cache_ptr); + if (_row_id_column_iterator_pair.second != -1) { + RETURN_IF_ERROR(_create_row_id_column_iterator()); + hudi_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); + } + hudi_reader->set_push_down_agg_type(_get_push_down_agg_type()); + init_status = static_cast(hudi_reader.get())->init_reader(&pctx); _cur_reader = std::move(hudi_reader); } else if (range.table_format_params.table_format_type == "hive") { - auto hive_reader = HiveParquetReader::create_unique(std::move(parquet_reader), _profile, - _state, *_params, range, _io_ctx.get(), - &_is_file_slot, file_meta_cache_ptr); - init_status = hive_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, - slot_id_to_predicates, _real_tuple_desc, _default_val_row_desc.get(), - _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts); + auto hive_reader = HiveParquetReader::create_unique( + _profile, *_params, range, _state->query_options().batch_size, + &_state->timezone_obj(), _io_ctx.get(), _state, &_is_file_slot, file_meta_cache_ptr, + _state->query_options().enable_parquet_lazy_mat); + if (_row_id_column_iterator_pair.second != -1) { + 
RETURN_IF_ERROR(_create_row_id_column_iterator()); + hive_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); + } + hive_reader->set_push_down_agg_type(_get_push_down_agg_type()); + init_status = static_cast(hive_reader.get())->init_reader(&pctx); _cur_reader = std::move(hive_reader); } else if (range.table_format_params.table_format_type == "tvf") { - const FieldDescriptor* parquet_meta = nullptr; - RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&parquet_meta)); - DCHECK(parquet_meta != nullptr); - - // TVF will first `get_parsed_schema` to obtain file information from BE, and FE will convert - // the column names to lowercase (because the query process is case-insensitive), - // so the lowercase file column names are used here to match the read columns. - std::shared_ptr tvf_info_node = nullptr; - RETURN_IF_ERROR(TableSchemaChangeHelper::BuildTableInfoUtil::by_parquet_name( - _real_tuple_desc, *parquet_meta, tvf_info_node)); - init_status = parquet_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, - slot_id_to_predicates, _real_tuple_desc, _default_val_row_desc.get(), - _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts, tvf_info_node); + if (!parquet_reader) { + parquet_reader = ParquetReader::create_unique( + _profile, *_params, range, _state->query_options().batch_size, + &_state->timezone_obj(), _io_ctx.get(), _state, file_meta_cache_ptr, + _state->query_options().enable_parquet_lazy_mat); + } + parquet_reader->set_push_down_agg_type(_get_push_down_agg_type()); + if (_row_id_column_iterator_pair.second != -1) { + RETURN_IF_ERROR(_create_row_id_column_iterator()); + parquet_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); + } + init_status = static_cast(parquet_reader.get())->init_reader(&pctx); _cur_reader = std::move(parquet_reader); } else if (_is_load) { - const FieldDescriptor* parquet_meta = nullptr; - 
RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&parquet_meta)); - DCHECK(parquet_meta != nullptr); - - // Load is case-insensitive, so you to match the columns in the file. - std::map file_lower_name_to_native; - for (const auto& parquet_field : parquet_meta->get_fields_schema()) { - file_lower_name_to_native.emplace(doris::to_lower(parquet_field.name), - parquet_field.name); - } - auto load_info_node = std::make_shared(); - for (const auto slot : _real_tuple_desc->slots()) { - if (file_lower_name_to_native.contains(slot->col_name())) { - load_info_node->add_children(slot->col_name(), - file_lower_name_to_native[slot->col_name()], - TableSchemaChangeHelper::ConstNode::get_instance()); - // For Load, `file_scanner` will create block columns using the file type, - // there is no schema change when reading inside the struct, - // so use `TableSchemaChangeHelper::ConstNode`. - } else { - load_info_node->add_not_exist_children(slot->col_name()); - } + if (!parquet_reader) { + parquet_reader = ParquetReader::create_unique( + _profile, *_params, range, _state->query_options().batch_size, + &_state->timezone_obj(), _io_ctx.get(), _state, file_meta_cache_ptr, + _state->query_options().enable_parquet_lazy_mat); } - - init_status = parquet_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, - slot_id_to_predicates, _real_tuple_desc, _default_val_row_desc.get(), - _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts, load_info_node); + parquet_reader->set_push_down_agg_type(_get_push_down_agg_type()); + init_status = static_cast(parquet_reader.get())->init_reader(&pctx); _cur_reader = std::move(parquet_reader); } return init_status; } -Status FileScanner::_init_orc_reader(std::unique_ptr&& orc_reader, - FileMetaCache* file_meta_cache_ptr) { +Status FileScanner::_init_orc_reader(FileMetaCache* file_meta_cache_ptr, + std::unique_ptr orc_reader) { const TFileRangeDesc& range = _current_range; Status 
init_status = Status::OK(); + // Build unified OrcInitContext (shared by all ORC reader variants) + OrcInitContext octx; + _fill_base_init_context(&octx); + octx.conjuncts = &_push_down_conjuncts; + octx.not_single_slot_filter_conjuncts = &_not_single_slot_filter_conjuncts; + octx.slot_id_to_filter_conjuncts = &_slot_id_to_filter_conjuncts; + if (range.__isset.table_format_params && range.table_format_params.table_format_type == "transactional_hive") { - std::unique_ptr tran_orc_reader = - TransactionalHiveReader::create_unique(std::move(orc_reader), _profile, _state, - *_params, range, _io_ctx.get(), - file_meta_cache_ptr); - init_status = tran_orc_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, _real_tuple_desc, - _default_val_row_desc.get(), &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts); - RETURN_IF_ERROR(tran_orc_reader->init_row_filters()); + // TransactionalHiveReader IS-A OrcReader, no wrapping needed + auto tran_orc_reader = TransactionalHiveReader::create_unique( + _profile, _state, *_params, range, _state->query_options().batch_size, + _state->timezone(), _io_ctx.get(), file_meta_cache_ptr); + if (_row_id_column_iterator_pair.second != -1) { + RETURN_IF_ERROR(_create_row_id_column_iterator()); + tran_orc_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); + } + tran_orc_reader->set_push_down_agg_type(_get_push_down_agg_type()); + init_status = static_cast(tran_orc_reader.get())->init_reader(&octx); + _cur_reader = std::move(tran_orc_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "iceberg") { + // IcebergOrcReader IS-A OrcReader (CRTP mixin), no wrapping needed std::unique_ptr iceberg_reader = IcebergOrcReader::create_unique( - std::move(orc_reader), _profile, _state, *_params, range, _kv_cache, _io_ctx.get(), - file_meta_cache_ptr); - if (_need_iceberg_rowid_column) { - iceberg_reader->set_need_row_id_column(true); + 
_kv_cache, _profile, _state, *_params, range, _state->query_options().batch_size, + _state->timezone(), _io_ctx.get(), file_meta_cache_ptr); + + // Transfer properties + if (_row_id_column_iterator_pair.second != -1) { + RETURN_IF_ERROR(_create_row_id_column_iterator()); + iceberg_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); } if (_row_lineage_columns.row_id_column_idx != -1 || _row_lineage_columns.last_updated_sequence_number_column_idx != -1) { - std::shared_ptr row_lineage_columns; - row_lineage_columns = std::make_shared(); + auto row_lineage_columns = std::make_shared(); row_lineage_columns->row_id_column_idx = _row_lineage_columns.row_id_column_idx; row_lineage_columns->last_updated_sequence_number_column_idx = _row_lineage_columns.last_updated_sequence_number_column_idx; + const auto& iceberg_params = range.table_format_params.iceberg_params; + row_lineage_columns->first_row_id = + iceberg_params.__isset.first_row_id ? iceberg_params.first_row_id : -1; + row_lineage_columns->last_updated_sequence_number = + iceberg_params.__isset.last_updated_sequence_number + ? 
iceberg_params.last_updated_sequence_number + : -1; iceberg_reader->set_row_lineage_columns(std::move(row_lineage_columns)); } - init_status = iceberg_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, _real_tuple_desc, - _default_val_row_desc.get(), _col_name_to_slot_id, - &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); + iceberg_reader->set_push_down_agg_type(_get_push_down_agg_type()); + init_status = static_cast(iceberg_reader.get())->init_reader(&octx); + _cur_reader = std::move(iceberg_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "paimon") { - std::unique_ptr paimon_reader = PaimonOrcReader::create_unique( - std::move(orc_reader), _profile, _state, *_params, range, _kv_cache, _io_ctx.get(), - file_meta_cache_ptr); - - init_status = paimon_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, _real_tuple_desc, - _default_val_row_desc.get(), &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts); - RETURN_IF_ERROR(paimon_reader->init_row_filters()); + // PaimonOrcReader IS-A OrcReader, no wrapping needed + auto paimon_reader = PaimonOrcReader::create_unique( + _profile, _state, *_params, range, _state->query_options().batch_size, + _state->timezone(), _kv_cache, _io_ctx.get(), file_meta_cache_ptr); + if (_row_id_column_iterator_pair.second != -1) { + RETURN_IF_ERROR(_create_row_id_column_iterator()); + paimon_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); + } + paimon_reader->set_push_down_agg_type(_get_push_down_agg_type()); + init_status = static_cast(paimon_reader.get())->init_reader(&octx); + _cur_reader = std::move(paimon_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "hudi") { - std::unique_ptr hudi_reader = - HudiOrcReader::create_unique(std::move(orc_reader), _profile, _state, *_params, - range, _io_ctx.get(), 
file_meta_cache_ptr); - - init_status = hudi_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, _real_tuple_desc, - _default_val_row_desc.get(), &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts); + // HudiOrcReader IS-A OrcReader, no wrapping needed + auto hudi_reader = HudiOrcReader::create_unique( + _profile, _state, *_params, range, _state->query_options().batch_size, + _state->timezone(), _io_ctx.get(), file_meta_cache_ptr); + if (_row_id_column_iterator_pair.second != -1) { + RETURN_IF_ERROR(_create_row_id_column_iterator()); + hudi_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); + } + hudi_reader->set_push_down_agg_type(_get_push_down_agg_type()); + init_status = static_cast(hudi_reader.get())->init_reader(&octx); + _cur_reader = std::move(hudi_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "hive") { - std::unique_ptr hive_reader = HiveOrcReader::create_unique( - std::move(orc_reader), _profile, _state, *_params, range, _io_ctx.get(), - &_is_file_slot, file_meta_cache_ptr); - - init_status = hive_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, _real_tuple_desc, - _default_val_row_desc.get(), &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts); + auto hive_reader = HiveOrcReader::create_unique( + _profile, _state, *_params, range, _state->query_options().batch_size, + _state->timezone(), _io_ctx.get(), &_is_file_slot, file_meta_cache_ptr, + _state->query_options().enable_orc_lazy_mat); + if (_row_id_column_iterator_pair.second != -1) { + RETURN_IF_ERROR(_create_row_id_column_iterator()); + hive_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); + } + hive_reader->set_push_down_agg_type(_get_push_down_agg_type()); + init_status = static_cast(hive_reader.get())->init_reader(&octx); + _cur_reader = std::move(hive_reader); } else if (range.__isset.table_format_params 
&& range.table_format_params.table_format_type == "tvf") { - const orc::Type* orc_type_ptr = nullptr; - RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr)); - - std::shared_ptr tvf_info_node = nullptr; - RETURN_IF_ERROR(TableSchemaChangeHelper::BuildTableInfoUtil::by_orc_name( - _real_tuple_desc, orc_type_ptr, tvf_info_node)); - init_status = orc_reader->init_reader( - &_file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, false, - _real_tuple_desc, _default_val_row_desc.get(), &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts, tvf_info_node); + if (!orc_reader) { + orc_reader = OrcReader::create_unique( + _profile, _state, *_params, range, _state->query_options().batch_size, + _state->timezone(), _io_ctx.get(), file_meta_cache_ptr, + _state->query_options().enable_orc_lazy_mat); + } + orc_reader->set_push_down_agg_type(_get_push_down_agg_type()); + if (_row_id_column_iterator_pair.second != -1) { + RETURN_IF_ERROR(_create_row_id_column_iterator()); + orc_reader->set_row_id_column_iterator(_row_id_column_iterator_pair); + } + init_status = static_cast(orc_reader.get())->init_reader(&octx); _cur_reader = std::move(orc_reader); } else if (_is_load) { - const orc::Type* orc_type_ptr = nullptr; - RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr)); - - std::map file_lower_name_to_native; - for (uint64_t idx = 0; idx < orc_type_ptr->getSubtypeCount(); idx++) { - file_lower_name_to_native.emplace(doris::to_lower(orc_type_ptr->getFieldName(idx)), - orc_type_ptr->getFieldName(idx)); - } - - auto load_info_node = std::make_shared(); - for (const auto slot : _real_tuple_desc->slots()) { - if (file_lower_name_to_native.contains(slot->col_name())) { - load_info_node->add_children(slot->col_name(), - file_lower_name_to_native[slot->col_name()], - TableSchemaChangeHelper::ConstNode::get_instance()); - } else { - load_info_node->add_not_exist_children(slot->col_name()); - } + if (!orc_reader) { + orc_reader = OrcReader::create_unique( + 
_profile, _state, *_params, range, _state->query_options().batch_size, + _state->timezone(), _io_ctx.get(), file_meta_cache_ptr, + _state->query_options().enable_orc_lazy_mat); } - init_status = orc_reader->init_reader( - &_file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, false, - _real_tuple_desc, _default_val_row_desc.get(), &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts, load_info_node); + orc_reader->set_push_down_agg_type(_get_push_down_agg_type()); + init_status = static_cast(orc_reader.get())->init_reader(&octx); _cur_reader = std::move(orc_reader); } @@ -1465,15 +1530,11 @@ Status FileScanner::_init_orc_reader(std::unique_ptr&& orc_reader, } Status FileScanner::_set_fill_or_truncate_columns(bool need_to_get_parsed_schema) { - _missing_cols.clear(); _slot_lower_name_to_col_type.clear(); + std::unordered_map name_to_col_type; - RETURN_IF_ERROR(_cur_reader->get_columns(&name_to_col_type, &_missing_cols)); - if (_need_iceberg_rowid_column && _current_range.__isset.table_format_params && - _current_range.table_format_params.table_format_type == "iceberg") { - _missing_cols.erase(BeConsts::ICEBERG_ROWID_COL); - _missing_cols.erase(to_lower(BeConsts::ICEBERG_ROWID_COL)); - } + RETURN_IF_ERROR(_cur_reader->get_columns(&name_to_col_type)); + for (const auto& [col_name, col_type] : name_to_col_type) { auto col_name_lower = to_lower(col_name); if (_partition_col_descs.contains(col_name_lower)) { @@ -1492,33 +1553,6 @@ Status FileScanner::_set_fill_or_truncate_columns(bool need_to_get_parsed_schema _slot_lower_name_to_col_type.emplace(col_name_lower, col_type); } - if (!_fill_partition_from_path && config::enable_iceberg_partition_column_fallback) { - // check if the cols of _partition_col_descs are in _missing_cols - // if so, set _fill_partition_from_path to true and remove the col from _missing_cols - for (const auto& [col_name, col_type] : _partition_col_descs) { - if (_missing_cols.contains(col_name)) { - 
_fill_partition_from_path = true; - _missing_cols.erase(col_name); - } - } - } - - RETURN_IF_ERROR(_generate_missing_columns()); - if (_fill_partition_from_path) { - RETURN_IF_ERROR(_cur_reader->set_fill_columns(_partition_col_descs, _missing_col_descs)); - } else { - // If the partition columns are not from path, we only fill the missing columns. - RETURN_IF_ERROR(_cur_reader->set_fill_columns({}, _missing_col_descs)); - } - if (VLOG_NOTICE_IS_ON && !_missing_cols.empty() && _is_load) { - fmt::memory_buffer col_buf; - for (auto& col : _missing_cols) { - fmt::format_to(col_buf, " {}", col); - } - VLOG_NOTICE << fmt::format("Unknown columns:{} in file {}", fmt::to_string(col_buf), - _current_range.path); - } - RETURN_IF_ERROR(_generate_truncate_columns(need_to_get_parsed_schema)); return Status::OK(); } @@ -1588,19 +1622,21 @@ Status FileScanner::read_lines_from_range(const TFileRangeDesc& range, std::unique_ptr parquet_reader = ParquetReader::create_unique( _profile, *_params, range, 1, &_state->timezone_obj(), _io_ctx.get(), _state, file_meta_cache_ptr, false); - - RETURN_IF_ERROR(parquet_reader->read_by_rows(row_ids)); RETURN_IF_ERROR( - _init_parquet_reader(std::move(parquet_reader), file_meta_cache_ptr)); + _init_parquet_reader(file_meta_cache_ptr, std::move(parquet_reader))); + // _init_parquet_reader may create a new table-format specific reader + // (e.g., HiveParquetReader) that replaces the original parquet_reader. + // We need to re-apply read_by_rows to the actual _cur_reader. 
+ RETURN_IF_ERROR(_cur_reader->read_by_rows(row_ids)); break; } case TFileFormatType::FORMAT_ORC: { std::unique_ptr orc_reader = OrcReader::create_unique( _profile, _state, *_params, range, 1, _state->timezone(), _io_ctx.get(), file_meta_cache_ptr, false); - - RETURN_IF_ERROR(orc_reader->read_by_rows(row_ids)); - RETURN_IF_ERROR(_init_orc_reader(std::move(orc_reader), file_meta_cache_ptr)); + RETURN_IF_ERROR(_init_orc_reader(file_meta_cache_ptr, std::move(orc_reader))); + // Same as above: re-apply read_by_rows to the actual _cur_reader. + RETURN_IF_ERROR(_cur_reader->read_by_rows(row_ids)); break; } default: { @@ -1639,41 +1675,33 @@ Status FileScanner::_generate_partition_columns() { _partition_col_descs.clear(); _partition_value_is_null.clear(); const TFileRangeDesc& range = _current_range; - if (range.__isset.columns_from_path && !_partition_slot_descs.empty()) { - for (const auto& slot_desc : _partition_slot_descs) { - if (slot_desc) { - auto it = _partition_slot_index_map.find(slot_desc->id()); - if (it == std::end(_partition_slot_index_map)) { - return Status::InternalError("Unknown source slot descriptor, slot_id={}", - slot_desc->id()); - } - const std::string& column_from_path = range.columns_from_path[it->second]; - _partition_col_descs.emplace(slot_desc->col_name(), - std::make_tuple(column_from_path, slot_desc)); - if (range.__isset.columns_from_path_is_null) { - _partition_value_is_null.emplace(slot_desc->col_name(), - range.columns_from_path_is_null[it->second]); - } - } - } + if (!range.__isset.columns_from_path_keys) { + return Status::OK(); } - return Status::OK(); -} -Status FileScanner::_generate_missing_columns() { - _missing_col_descs.clear(); - if (!_missing_cols.empty()) { - for (auto* slot_desc : _real_tuple_desc->slots()) { - if (!_missing_cols.contains(slot_desc->col_name())) { - continue; - } + std::unordered_map partition_name_to_key_index; + int index = 0; + for (const auto& key : range.columns_from_path_keys) { + 
partition_name_to_key_index.emplace(key, index++); + } - auto it = _col_default_value_ctx.find(slot_desc->col_name()); - if (it == _col_default_value_ctx.end()) { - return Status::InternalError("failed to find default value expr for slot: {}", - slot_desc->col_name()); + // Iterate _column_descs to find PARTITION_KEY columns instead of _partition_slot_descs. + for (const auto& col_desc : _column_descs) { + if (col_desc.category != ColumnCategory::PARTITION_KEY) { + continue; + } + auto pit = partition_name_to_key_index.find(col_desc.name); + if (pit != partition_name_to_key_index.end()) { + int values_index = pit->second; + if (range.__isset.columns_from_path && values_index < range.columns_from_path.size()) { + _partition_col_descs.emplace( + col_desc.name, + std::make_tuple(range.columns_from_path[values_index], col_desc.slot_desc)); + if (range.__isset.columns_from_path_is_null) { + _partition_value_is_null.emplace(col_desc.name, + range.columns_from_path_is_null[values_index]); + } } - _missing_col_descs.emplace(slot_desc->col_name(), it->second); } } return Status::OK(); @@ -1705,7 +1733,6 @@ Status FileScanner::_init_expr_ctxes() { } _num_of_columns_from_file = _params->num_of_columns_from_file; - for (const auto& slot_info : _params->required_slots) { auto slot_id = slot_info.slot_id; auto it = full_src_slot_map.find(slot_id); @@ -1713,43 +1740,59 @@ Status FileScanner::_init_expr_ctxes() { return Status::InternalError( fmt::format("Unknown source slot descriptor, slot_id={}", slot_id)); } + + ColumnDescriptor col_desc; + col_desc.name = it->second->col_name(); + col_desc.slot_desc = it->second; + + // Read category from Thrift if available (new FE), otherwise fall back + // to slot_info.is_file_slot + partition_name_to_key_index_map for broker/stream load + // where the FE does not set TColumnCategory. 
+ if (slot_info.__isset.category) { + switch (slot_info.category) { + case TColumnCategory::REGULAR: + col_desc.category = ColumnCategory::REGULAR; + break; + case TColumnCategory::PARTITION_KEY: + col_desc.category = ColumnCategory::PARTITION_KEY; + break; + case TColumnCategory::SYNTHESIZED: + col_desc.category = ColumnCategory::SYNTHESIZED; + break; + case TColumnCategory::GENERATED: + col_desc.category = ColumnCategory::GENERATED; + break; + } + } else if (partition_name_to_key_index_map.contains(it->second->col_name()) && + !slot_info.is_file_slot) { + col_desc.category = ColumnCategory::PARTITION_KEY; + } + if (it->second->col_name().starts_with(BeConsts::GLOBAL_ROWID_COL)) { _row_id_column_iterator_pair.second = _default_val_row_desc->get_column_id(slot_id); continue; } - if (it->second->col_name() == BeConsts::ICEBERG_ROWID_COL) { - _need_iceberg_rowid_column = true; - continue; - } + bool is_row_lineage_col = false; if (it->second->col_name() == IcebergTableReader::ROW_LINEAGE_ROW_ID) { _row_lineage_columns.row_id_column_idx = _default_val_row_desc->get_column_id(slot_id); + is_row_lineage_col = true; } if (it->second->col_name() == IcebergTableReader::ROW_LINEAGE_LAST_UPDATED_SEQ_NUMBER) { _row_lineage_columns.last_updated_sequence_number_column_idx = _default_val_row_desc->get_column_id(slot_id); + is_row_lineage_col = true; } - - if (slot_info.is_file_slot) { - _is_file_slot.emplace(slot_id); - _file_slot_descs.emplace_back(it->second); - _file_col_names.push_back(it->second->col_name()); + if (is_row_lineage_col) { + col_desc.category = ColumnCategory::SYNTHESIZED; } + // Derive is_file_slot from category + bool is_file_slot = (col_desc.category == ColumnCategory::REGULAR || + col_desc.category == ColumnCategory::GENERATED); + if (partition_name_to_key_index_map.contains(it->second->col_name())) { - if (slot_info.is_file_slot) { - // If there is slot which is both a partition column and a file column, - // we should not fill the partition column from 
path. - _fill_partition_from_path = false; - } else if (!_fill_partition_from_path) { - // This should not happen - return Status::InternalError( - "Partition column {} is not a file column, but there is already a column " - "which is both a partition column and a file column.", - it->second->col_name()); - } - _partition_slot_descs.emplace_back(it->second); if (_is_load) { auto iti = full_src_index_map.find(slot_id); _partition_slot_index_map.emplace(slot_id, iti->second - _num_of_columns_from_file); @@ -1758,20 +1801,66 @@ Status FileScanner::_init_expr_ctxes() { _partition_slot_index_map.emplace(slot_id, kit->second); } } + + if (is_file_slot) { + _is_file_slot.emplace(slot_id); + _file_slot_descs.emplace_back(it->second); + _file_col_names.push_back(it->second->col_name()); + } + + _column_descs.push_back(col_desc); } // set column name to default value expr map - for (auto* slot_desc : _real_tuple_desc->slots()) { + // new inline TFileScanSlotInfo.default_value_expr (preferred) + for (const auto& slot_info : _params->required_slots) { + auto slot_id = slot_info.slot_id; + auto it = full_src_slot_map.find(slot_id); + if (it == std::end(full_src_slot_map)) { + continue; + } + const std::string& col_name = it->second->col_name(); + VExprContextSPtr ctx; - auto it = _params->default_value_of_src_slot.find(slot_desc->id()); - if (it != std::end(_params->default_value_of_src_slot)) { - if (!it->second.nodes.empty()) { - RETURN_IF_ERROR(VExpr::create_expr_tree(it->second, ctx)); - RETURN_IF_ERROR(ctx->prepare(_state, *_default_val_row_desc)); - RETURN_IF_ERROR(ctx->open(_state)); - } + bool has_default = false; + + // Prefer inline default_value_expr from TFileScanSlotInfo (new FE) + if (slot_info.__isset.default_value_expr && !slot_info.default_value_expr.nodes.empty()) { + RETURN_IF_ERROR(VExpr::create_expr_tree(slot_info.default_value_expr, ctx)); + RETURN_IF_ERROR(ctx->prepare(_state, *_default_val_row_desc)); + RETURN_IF_ERROR(ctx->open(_state)); + has_default = 
true; + } else if (slot_info.__isset.default_value_expr) { + // Empty nodes means null default (same as legacy empty TExpr) + has_default = true; + } + + // // Fall back to legacy default_value_of_src_slot map (old FE) + // if (!has_default) { + // auto legacy_it = _params->default_value_of_src_slot.find(slot_id); + // if (legacy_it != std::end(_params->default_value_of_src_slot)) { + // if (!legacy_it->second.nodes.empty()) { + // RETURN_IF_ERROR(VExpr::create_expr_tree(legacy_it->second, ctx)); + // RETURN_IF_ERROR(ctx->prepare(_state, *_default_val_row_desc)); + // RETURN_IF_ERROR(ctx->open(_state)); + // } + // has_default = true; + // } + // } + + if (has_default) { // if expr is empty, the default value will be null - _col_default_value_ctx.emplace(slot_desc->col_name(), ctx); + _col_default_value_ctx.emplace(col_name, ctx); + } + } + + // Populate default_expr in each ColumnDescriptor from _col_default_value_ctx. + // This makes default values available to readers via column_descs, eliminating the + // need for the separate _generate_missing_columns roundtrip. 
+ for (auto& col_desc : _column_descs) { + auto it = _col_default_value_ctx.find(col_desc.name); + if (it != _col_default_value_ctx.end()) { + col_desc.default_expr = it->second; } } @@ -1816,10 +1905,34 @@ Status FileScanner::_init_expr_ctxes() { } bool FileScanner::_should_enable_condition_cache() { - return _condition_cache_digest != 0 && !_is_load && + DCHECK(_should_enable_condition_cache_handler != nullptr); + return _condition_cache_digest != 0 && (this->*_should_enable_condition_cache_handler)() && (!_conjuncts.empty() || !_push_down_conjuncts.empty()); } +bool FileScanner::_should_enable_condition_cache_for_load() const { + return false; +} + +bool FileScanner::_should_enable_condition_cache_for_query() const { + return true; +} + +bool FileScanner::_should_push_down_predicates(TFileFormatType::type format_type) const { + DCHECK(_should_push_down_predicates_handler != nullptr); + return (this->*_should_push_down_predicates_handler)(format_type); +} + +bool FileScanner::_should_push_down_predicates_for_load(TFileFormatType::type format_type) const { + static_cast(format_type); + return false; +} + +bool FileScanner::_should_push_down_predicates_for_query(TFileFormatType::type format_type) const { + // JNI readers handle predicate conversion in their own paths. 
+ return format_type != TFileFormatType::FORMAT_JNI; +} + void FileScanner::_init_reader_condition_cache() { _condition_cache = nullptr; _condition_cache_ctx = nullptr; diff --git a/be/src/exec/scan/file_scanner.h b/be/src/exec/scan/file_scanner.h index 6c1125f0a603a5..022847e165f067 100644 --- a/be/src/exec/scan/file_scanner.h +++ b/be/src/exec/scan/file_scanner.h @@ -90,7 +90,9 @@ class FileScanner : public Scanner { : Scanner(state, profile), _params(params), _col_name_to_slot_id(colname_to_slot_id), - _real_tuple_desc(tuple_desc) {}; + _real_tuple_desc(tuple_desc) { + _configure_file_scan_handlers(); + }; Status read_lines_from_range(const TFileRangeDesc& range, const std::list& row_ids, Block* result_block, const ExternalFileMappingInfo& external_info, @@ -107,6 +109,9 @@ class FileScanner : public Scanner { Status _get_next_reader(); + // Build a ReaderInitContext with shared fields from FileScanner members. + void _fill_base_init_context(ReaderInitContext* ctx); + // TODO: cast input block columns type to string. Status _cast_src_block(Block* block) { return Status::OK(); } @@ -128,10 +133,10 @@ class FileScanner : public Scanner { std::vector _file_slot_descs; // col names from _file_slot_descs std::vector _file_col_names; + // Unified column descriptors for init_reader (includes file, partition, missing, synthesized cols) + std::vector _column_descs; - // Partition source slot descriptors - std::vector _partition_slot_descs; - // Partition slot id to index in _partition_slot_descs + // Partition slot id to partition key index (for matching columns_from_path) std::unordered_map _partition_slot_index_map; // created from param.expr_of_dest_slot // For query, it saves default value expr of all dest columns, or nullptr for NULL. @@ -152,8 +157,6 @@ class FileScanner : public Scanner { // Get from GenericReader, save the existing columns in file to their type. 
std::unordered_map _slot_lower_name_to_col_type; // Get from GenericReader, save columns that required by scan but not exist in file. - // These columns will be filled by default value or null. - std::unordered_set _missing_cols; // The col lowercase name of source file to type of source file. std::map _source_file_col_name_types; @@ -192,7 +195,6 @@ class FileScanner : public Scanner { std::unordered_map> _partition_col_descs; std::unordered_map _partition_value_is_null; - std::unordered_map _missing_col_descs; // idx of skip_bitmap_col in _input_tuple_desc int32_t _skip_bitmap_col_idx {-1}; @@ -232,13 +234,17 @@ class FileScanner : public Scanner { std::pair, int> _row_id_column_iterator_pair = {nullptr, -1}; - bool _need_iceberg_rowid_column = false; - int _iceberg_rowid_column_pos = -1; // for iceberg row lineage RowLineageColumns _row_lineage_columns; int64_t _last_bytes_read_from_local = 0; int64_t _last_bytes_read_from_remote = 0; + Status (FileScanner::*_init_src_block_handler)(Block* block) = nullptr; + Status (FileScanner::*_process_src_block_after_read_handler)(Block* block) = nullptr; + bool (FileScanner::*_should_push_down_predicates_handler)( + TFileFormatType::type format_type) const = nullptr; + bool (FileScanner::*_should_enable_condition_cache_handler)() const = nullptr; + // Condition cache for external tables uint64_t _condition_cache_digest = 0; segment_v2::ConditionCache::ExternalCacheKey _condition_cache_key; @@ -246,18 +252,25 @@ class FileScanner : public Scanner { std::shared_ptr _condition_cache_ctx; int64_t _condition_cache_hit_count = 0; + void _configure_file_scan_handlers(); + Status _init_expr_ctxes(); Status _init_src_block(Block* block); - Status _check_output_block_types(); - Status _cast_to_input_block(Block* block); + Status _init_src_block_for_load(Block* block); + Status _init_src_block_for_query(Block* block); + Status _process_src_block_after_read(Block* block); + Status _process_src_block_after_read_for_load(Block* block); 
+ Status _process_src_block_after_read_for_query(Block* block); Status _fill_columns_from_path(size_t rows); Status _fill_missing_columns(size_t rows); + Status _check_output_block_types(); + Status _cast_to_input_block(Block* block); Status _pre_filter_src_block(); Status _convert_to_output_block(Block* block); Status _truncate_char_or_varchar_columns(Block* block); void _truncate_char_or_varchar_column(Block* block, int idx, int len); Status _generate_partition_columns(); - Status _generate_missing_columns(); + bool _check_partition_prune_expr(const VExprSPtr& expr); void _init_runtime_filter_partition_prune_ctxs(); void _init_runtime_filter_partition_prune_block(); @@ -267,10 +280,10 @@ class FileScanner : public Scanner { void _get_slot_ids(VExpr* expr, std::vector* slot_ids); Status _generate_truncate_columns(bool need_to_get_parsed_schema); Status _set_fill_or_truncate_columns(bool need_to_get_parsed_schema); - Status _init_orc_reader(std::unique_ptr&& orc_reader, - FileMetaCache* file_meta_cache_ptr); - Status _init_parquet_reader(std::unique_ptr&& parquet_reader, - FileMetaCache* file_meta_cache_ptr); + Status _init_orc_reader(FileMetaCache* file_meta_cache_ptr, + std::unique_ptr orc_reader = nullptr); + Status _init_parquet_reader(FileMetaCache* file_meta_cache_ptr, + std::unique_ptr parquet_reader = nullptr); Status _create_row_id_column_iterator(); TFileFormatType::type _get_current_format_type() { @@ -291,6 +304,11 @@ class FileScanner : public Scanner { } bool _should_enable_condition_cache(); + bool _should_enable_condition_cache_for_load() const; + bool _should_enable_condition_cache_for_query() const; + bool _should_push_down_predicates(TFileFormatType::type format_type) const; + bool _should_push_down_predicates_for_load(TFileFormatType::type format_type) const; + bool _should_push_down_predicates_for_query(TFileFormatType::type format_type) const; void _init_reader_condition_cache(); void _finalize_reader_condition_cache(); diff --git 
a/be/src/format/arrow/arrow_stream_reader.cpp b/be/src/format/arrow/arrow_stream_reader.cpp index be6e8acb19fc64..4bbb8cbd39bb4a 100644 --- a/be/src/format/arrow/arrow_stream_reader.cpp +++ b/be/src/format/arrow/arrow_stream_reader.cpp @@ -65,7 +65,7 @@ Status ArrowStreamReader::init_reader() { return Status::OK(); } -Status ArrowStreamReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +Status ArrowStreamReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { bool has_next = false; RETURN_IF_ERROR(_pip_stream->HasNext(&has_next)); if (!has_next) { @@ -126,8 +126,8 @@ Status ArrowStreamReader::get_next_block(Block* block, size_t* read_rows, bool* return Status::OK(); } -Status ArrowStreamReader::get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) { +Status ArrowStreamReader::_get_columns_impl( + std::unordered_map* name_to_type) { for (const auto& slot : _file_slot_descs) { name_to_type->emplace(slot->col_name(), slot->type()); } diff --git a/be/src/format/arrow/arrow_stream_reader.h b/be/src/format/arrow/arrow_stream_reader.h index 7076df158d2b82..df20d8fd920a26 100644 --- a/be/src/format/arrow/arrow_stream_reader.h +++ b/be/src/format/arrow/arrow_stream_reader.h @@ -55,10 +55,12 @@ class ArrowStreamReader : public GenericReader { Status init_reader(); - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; - Status get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) override; + Status _get_columns_impl(std::unordered_map* name_to_type) override; + +protected: + Status _do_init_reader(ReaderInitContext* /*ctx*/) override { return init_reader(); } private: RuntimeState* _state; diff --git a/be/src/format/column_descriptor.h b/be/src/format/column_descriptor.h new file mode 100644 index 00000000000000..37126fff39f0fa --- /dev/null +++ b/be/src/format/column_descriptor.h 
@@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "exprs/vexpr_fwd.h" + +namespace doris { +class SlotDescriptor; + +/// Column categories for table format reading. +/// +/// Each column requested by the query is classified into one of these categories. +/// The category determines how the column's value is obtained: +/// - REGULAR: Read directly from the data file (Parquet/ORC). +/// If the column is absent from a file (schema evolution), +/// its default_expr is used to produce a default value. +/// - PARTITION_KEY: Filled from partition metadata (e.g. Hive path partitions). +/// - SYNTHESIZED: Never in the data file; fully computed at runtime +/// (e.g. Doris V2 __DORIS_ICEBERG_ROWID_COL__). +/// - GENERATED: May or may not exist in the data file. If present but null, +/// the value is backfilled at runtime (e.g. Iceberg V3 _row_id). +enum class ColumnCategory { + REGULAR, + PARTITION_KEY, + SYNTHESIZED, + GENERATED, +}; + +/// Describes a column requested by the query, along with its category. 
+struct ColumnDescriptor { + std::string name; + const SlotDescriptor* slot_desc = nullptr; + ColumnCategory category = ColumnCategory::REGULAR; + /// Default value expression when this column is missing from the data file. + /// nullptr means fill with NULL. Built once per table scan in FileScanner. + VExprContextSPtr default_expr; +}; + +} // namespace doris diff --git a/be/src/format/count_reader.h b/be/src/format/count_reader.h new file mode 100644 index 00000000000000..65764f5d028d11 --- /dev/null +++ b/be/src/format/count_reader.h @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "format/generic_reader.h" + +namespace doris { +#include "common/compile_check_begin.h" + +/// A lightweight reader that emits row counts without reading any actual data. +/// Used as a decorator to replace the real reader when COUNT(*) push down is active. +/// +/// Instead of duplicating the COUNT short-circuit logic in every format reader +/// (ORC, Parquet, etc.), FileScanner creates a CountReader after the real reader +/// is initialized and the total row count is known. 
The CountReader then serves +/// all subsequent get_next_block calls by simply resizing columns. +/// +/// This cleanly separates the "how many rows" concern from the actual data reading, +/// eliminating duplicated COUNT blocks across format readers. +class CountReader : public GenericReader { +public: + /// @param total_rows Total number of rows to emit (post-filter). + /// @param batch_size Maximum rows per batch. + /// @param inner_reader The original reader, kept alive for profile collection + /// and lifecycle management. Ownership is transferred. + CountReader(int64_t total_rows, size_t batch_size, + std::unique_ptr inner_reader = nullptr) + : _remaining_rows(total_rows), + _batch_size(batch_size), + _inner_reader(std::move(inner_reader)) { + set_push_down_agg_type(TPushAggOp::type::COUNT); + } + + ~CountReader() override = default; + + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override { + auto rows = std::min(_remaining_rows, static_cast(_batch_size)); + _remaining_rows -= rows; + + auto mutate_columns = block->mutate_columns(); + for (auto& col : mutate_columns) { + col->resize(rows); + } + block->set_columns(std::move(mutate_columns)); + + *read_rows = rows; + *eof = (_remaining_rows == 0); + return Status::OK(); + } + + /// CountReader counts rows by definition. + bool count_read_rows() override { return true; } + + /// Delegate to inner reader if available, otherwise return our total. + int64_t get_total_rows() const override { + return _inner_reader ? _inner_reader->get_total_rows() : _initial_total_rows(); + } + + Status close() override { + if (_inner_reader) { + return _inner_reader->close(); + } + return Status::OK(); + } + + /// Access the inner reader for profile collection or other lifecycle needs. 
+ GenericReader* inner_reader() const { return _inner_reader.get(); } + +protected: + void _collect_profile_before_close() override { + if (_inner_reader) { + _inner_reader->collect_profile_before_close(); + } + } + +private: + int64_t _initial_total_rows() const { return _remaining_rows; } + + int64_t _remaining_rows; + size_t _batch_size; + std::unique_ptr _inner_reader; +}; + +#include "common/compile_check_end.h" +} // namespace doris diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp index 63045f8bfebd7d..10d79126bb67c9 100644 --- a/be/src/format/csv/csv_reader.cpp +++ b/be/src/format/csv/csv_reader.cpp @@ -307,8 +307,78 @@ Status CsvReader::init_reader(bool is_load) { return Status::OK(); } +// ---- Unified init_reader(ReaderInitContext*) overrides ---- + +Status CsvReader::_open_file_reader(ReaderInitContext* base_ctx) { + _start_offset = _range.start_offset; + if (_start_offset == 0) { + if (_params.__isset.file_attributes && _params.file_attributes.__isset.header_type && + !_params.file_attributes.header_type.empty()) { + std::string header_type = to_lower(_params.file_attributes.header_type); + if (header_type == BeConsts::CSV_WITH_NAMES) { + _skip_lines = 1; + } else if (header_type == BeConsts::CSV_WITH_NAMES_AND_TYPES) { + _skip_lines = 2; + } + } else if (_params.file_attributes.__isset.skip_lines) { + _skip_lines = _params.file_attributes.skip_lines; + } + } else if (_start_offset != 0) { + if ((_file_compress_type != TFileCompressType::PLAIN) || + (_file_compress_type == TFileCompressType::UNKNOWN && + _file_format_type != TFileFormatType::FORMAT_CSV_PLAIN)) { + return Status::InternalError("For now we do not support split compressed file"); + } + int64_t pre_read_len = std::min( + static_cast(_params.file_attributes.text_params.line_delimiter.size()), + _start_offset); + _start_offset -= pre_read_len; + _size += pre_read_len; + _skip_lines = 1; + } + + RETURN_IF_ERROR(_init_options()); + 
RETURN_IF_ERROR(_create_file_reader(false)); + return Status::OK(); +} + +Status CsvReader::_do_init_reader(ReaderInitContext* base_ctx) { + auto* ctx = checked_context_cast(base_ctx); + _is_load = ctx->is_load; + + _use_nullable_string_opt.resize(_file_slot_descs.size()); + for (int i = 0; i < _file_slot_descs.size(); ++i) { + auto data_type_ptr = _file_slot_descs[i]->get_data_type_ptr(); + if (data_type_ptr->is_nullable() && is_string_type(data_type_ptr->get_primitive_type())) { + _use_nullable_string_opt[i] = 1; + } + } + + RETURN_IF_ERROR(_create_decompressor()); + RETURN_IF_ERROR(_create_line_reader()); + + if (!_is_load) { + DCHECK(_params.__isset.column_idxs); + _col_idxs = _params.column_idxs; + int idx = 0; + for (const auto& slot_info : _params.required_slots) { + if (slot_info.is_file_slot) { + _file_slot_idx_map.push_back(idx); + } + idx++; + } + } else { + int i = 0; + for (const auto& desc [[maybe_unused]] : _file_slot_descs) { + _col_idxs.push_back(i++); + } + } + _line_reader_eof = false; + return Status::OK(); +} + // !FIXME: Here we should use MutableBlock -Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +Status CsvReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { if (_line_reader_eof) { *eof = true; return Status::OK(); @@ -397,8 +467,7 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { return Status::OK(); } -Status CsvReader::get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) { +Status CsvReader::_get_columns_impl(std::unordered_map* name_to_type) { for (const auto& slot : _file_slot_descs) { name_to_type->emplace(slot->col_name(), slot->type()); } diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h index 4e24be28d15b95..f9bcb06697c10c 100644 --- a/be/src/format/csv/csv_reader.h +++ b/be/src/format/csv/csv_reader.h @@ -32,7 +32,7 @@ #include "common/status.h" #include "core/data_type/data_type.h" #include 
"format/file_reader/new_plain_text_line_reader.h" -#include "format/generic_reader.h" +#include "format/table/table_format_reader.h" #include "io/file_factory.h" #include "io/fs/file_reader_writer_fwd.h" #include "util/decompressor.h" @@ -52,6 +52,11 @@ struct IOContext; struct ScannerCounter; class Block; +/// CSV/Text-specific initialization context. +struct CsvInitContext final : public ReaderInitContext { + bool is_load = false; +}; + class LineFieldSplitterIf { public: virtual ~LineFieldSplitterIf() = default; @@ -166,7 +171,7 @@ class PlainCsvTextFieldSplitter : public BaseCsvTextFieldSplitter* name_to_type, - std::unordered_set* missing_cols) override; + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status _get_columns_impl(std::unordered_map* name_to_type) override; Status init_schema_reader() override; // get schema of csv file from first one line or first two lines. @@ -193,6 +197,10 @@ class CsvReader : public GenericReader { Status close() override; protected: + // ---- Unified init_reader(ReaderInitContext*) overrides ---- + Status _open_file_reader(ReaderInitContext* ctx) override; + Status _do_init_reader(ReaderInitContext* ctx) override; + // init options for type serde virtual Status _init_options(); virtual Status _create_line_reader(); diff --git a/be/src/format/generic_reader.cpp b/be/src/format/generic_reader.cpp new file mode 100644 index 00000000000000..11f9c19bcc7da1 --- /dev/null +++ b/be/src/format/generic_reader.cpp @@ -0,0 +1,23 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/generic_reader.h" + +namespace doris { +// GenericReader has no out-of-line method implementations. +// Column-filling logic lives in TableFormatReader (table_format_reader.cpp). +} // namespace doris diff --git a/be/src/format/generic_reader.h b/be/src/format/generic_reader.h index c08b0427847feb..61b5c5a0ab8e42 100644 --- a/be/src/format/generic_reader.h +++ b/be/src/format/generic_reader.h @@ -19,10 +19,27 @@ #include +#include +#include +#include +#include +#include +#include +#include + #include "common/status.h" +#include "core/column/column.h" +#include "core/column/column_nullable.h" +#include "core/data_type/data_type.h" +#include "exprs/vexpr.h" +#include "exprs/vexpr_context.h" #include "exprs/vexpr_fwd.h" +#include "format/column_descriptor.h" +#include "format/table/table_schema_change_helper.h" #include "runtime/descriptors.h" +#include "runtime/runtime_state.h" #include "storage/predicate/block_column_predicate.h" +#include "storage/segment/common.h" #include "util/profile_collector.h" namespace doris { @@ -45,21 +62,87 @@ struct ConditionCacheContext { static constexpr int GRANULE_SIZE = 2048; }; -// This a reader interface for all file readers. -// A GenericReader is responsible for reading a file and return -// a set of blocks with specified schema, +/// Base context for the unified init_reader(ReaderInitContext*) template method. +/// Contains fields shared by ALL reader types. Format-specific readers define +/// subclasses (ParquetInitContext, OrcInitContext, etc.) with extra fields. 
+/// FileScanner allocates the appropriate subclass and populates the shared fields +/// before calling init_reader(). +struct ReaderInitContext { + virtual ~ReaderInitContext() = default; + + // ---- Owned by FileScanner, shared by all readers ---- + std::vector* column_descs = nullptr; + std::unordered_map* col_name_to_block_idx = nullptr; + RuntimeState* state = nullptr; + const TupleDescriptor* tuple_descriptor = nullptr; + const RowDescriptor* row_descriptor = nullptr; + const TFileScanRangeParams* params = nullptr; + const TFileRangeDesc* range = nullptr; + TPushAggOp::type push_down_agg_type = TPushAggOp::type::NONE; + + // ---- Output slots (filled by on_before_init_reader) ---- + std::vector column_names; + std::shared_ptr table_info_node = + TableSchemaChangeHelper::ConstNode::get_instance(); + std::set column_ids; + std::set filter_column_ids; +}; + +/// Safe downcast for ReaderInitContext subclasses. +/// Uses dynamic_cast + DORIS_CHECK: crashes on type mismatch (per Doris coding standards). +template +To* checked_context_cast(From* ptr) { + auto* result = dynamic_cast(ptr); + DORIS_CHECK(result != nullptr); + return result; +} + +/// Base reader interface for all file readers. +/// A GenericReader is responsible for reading a file and returning +/// a set of blocks with specified schema. +/// +/// Provides hook virtual methods that implement the Template Method pattern: +/// init_reader: _open_file_reader → on_before_init_reader → _do_init_reader → on_after_init_reader +/// get_next_block: on_before_read_block → _do_get_next_block → on_after_read_block +/// +/// Column-filling logic (partition/missing/synthesized) lives in TableFormatReader. 
class GenericReader : public ProfileCollector { public: GenericReader() : _push_down_agg_type(TPushAggOp::type::NONE) {} void set_push_down_agg_type(TPushAggOp::type push_down_agg_type) { - _push_down_agg_type = push_down_agg_type; + if (!_push_down_agg_type_locked) { + _push_down_agg_type = push_down_agg_type; + } } + // Lock the current push_down_agg_type so FileScanner cannot override it. + // Used by readers that must disable COUNT pushdown (e.g., ACID deletes, Paimon DV). + void lock_push_down_agg_type() { _push_down_agg_type_locked = true; } + TPushAggOp::type get_push_down_agg_type() const { return _push_down_agg_type; } - virtual Status get_next_block(Block* block, size_t* read_rows, bool* eof) = 0; + /// Template method for reading blocks. + /// Calls: on_before_read_block → _do_get_next_block → on_after_read_block + Status get_next_block(Block* block, size_t* read_rows, bool* eof) { + RETURN_IF_ERROR(on_before_read_block(block)); + RETURN_IF_ERROR(_do_get_next_block(block, read_rows, eof)); + RETURN_IF_ERROR(on_after_read_block(block, read_rows)); + return Status::OK(); + } // Type is always nullable to process illegal values. - virtual Status get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) { + // Results are cached after the first successful call. 
+ Status get_columns(std::unordered_map* name_to_type) { + if (_get_columns_cached) { + *name_to_type = _cached_name_to_type; + return Status::OK(); + } + RETURN_IF_ERROR(_get_columns_impl(name_to_type)); + _cached_name_to_type = *name_to_type; + _get_columns_cached = true; + + return Status::OK(); + } + + virtual Status _get_columns_impl(std::unordered_map* name_to_type) { return Status::NotSupported("get_columns is not implemented"); } @@ -75,20 +158,6 @@ class GenericReader : public ProfileCollector { } ~GenericReader() override = default; - /// If the underlying FileReader has filled the partition&missing columns, - /// The FileScanner does not need to fill - virtual bool fill_all_columns() const { return _fill_all_columns; } - - /// Tell the underlying FileReader the partition&missing columns, - /// and the FileReader determine to fill columns or not. - /// Should set _fill_all_columns = true, if fill the columns. - virtual Status set_fill_columns( - const std::unordered_map>& - partition_columns, - const std::unordered_map& missing_columns) { - return Status::OK(); - } - virtual Status close() { return Status::OK(); } Status read_by_rows(const std::list& row_ids) { @@ -102,21 +171,106 @@ class GenericReader : public ProfileCollector { /// can skip some pages/rowgroups through indexes. virtual bool count_read_rows() { return false; } + /// Returns true if on_before_init_reader has already set _column_descs. + bool has_column_descs() const { return _column_descs != nullptr; } + + /// Unified initialization entry point (NVI pattern). + /// Enforces the template method sequence for ALL readers: + /// _open_file_reader → on_before_init_reader → _do_init_reader → on_after_init_reader + /// Subclasses implement _open_file_reader and _do_init_reader(ReaderInitContext*). + /// FileScanner constructs the appropriate ReaderInitContext subclass and calls this. 
+ /// + /// NOTE: During migration, readers not yet ported to this API still use their + /// format-specific init_reader(...) methods. This method is non-virtual so it + /// cannot be accidentally overridden. + Status init_reader(ReaderInitContext* ctx) { + // Apply push_down_agg_type early so _open_file_reader and _do_init_reader + // can use it (e.g., PaimonCppReader skips full init on COUNT pushdown). + // on_after_init_reader may reset this (e.g., Iceberg with equality deletes). + set_push_down_agg_type(ctx->push_down_agg_type); + + RETURN_IF_ERROR(_open_file_reader(ctx)); + + // Standalone readers (delete file readers, push handler) set column_descs=nullptr + // and pre-populate column_names directly. Skip hooks for them. + if (ctx->column_descs != nullptr) { + RETURN_IF_ERROR(on_before_init_reader(ctx)); + } + + RETURN_IF_ERROR(_do_init_reader(ctx)); + + if (ctx->column_descs != nullptr) { + RETURN_IF_ERROR(on_after_init_reader(ctx)); + } + + return Status::OK(); + } + + /// Hook called before core init. Default just sets _column_descs. + /// TableFormatReader overrides with partition/missing column computation. + /// ORC/Parquet/Hive/Iceberg further override with format-specific schema matching. + virtual Status on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + return Status::OK(); + } + protected: + // ---- Init-time hooks (Template Method for init_reader) ---- + + /// Opens the file and prepares I/O resources before hooks run. Override in + /// subclasses to open files, read metadata, set up decompressors, etc. + /// For Parquet/ORC, opens the file and reads footer metadata. + /// For CSV/JSON, opens the file, creates decompressors, and sets up line readers. + /// Default is no-op (for JNI, Native, Arrow readers). + virtual Status _open_file_reader(ReaderInitContext* /*ctx*/) { return Status::OK(); } + + /// Core initialization (format-specific). Subclasses override to perform + /// their actual parsing engine setup. 
The context should be downcast to + /// the appropriate subclass using checked_context_cast. + /// Default returns NotSupported — readers not yet migrated to the unified + /// init_reader(ReaderInitContext*) API still use their old init methods. + virtual Status _do_init_reader(ReaderInitContext* /*ctx*/) { + return Status::NotSupported( + "_do_init_reader(ReaderInitContext*) not yet implemented for this reader"); + } + + // ---- Existing init-time hooks ---- + + /// Called after core init completes. Subclasses override to process + /// delete files, deletion vectors, etc. + virtual Status on_after_init_reader(ReaderInitContext* /*ctx*/) { return Status::OK(); } + + // ---- Read-time hooks ---- + + /// Called before reading a block. Subclasses override to modify block + /// structure (e.g. add ACID columns, expand for equality delete). + virtual Status on_before_read_block(Block* block) { return Status::OK(); } + + /// Core block reading. Subclasses must override with actual read logic. + virtual Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) = 0; + + /// Called after reading a block. Subclasses override to post-process + /// (e.g. remove ACID columns, apply equality delete filter). + virtual Status on_after_read_block(Block* block, size_t* read_rows) { return Status::OK(); } + virtual Status _set_read_one_line_impl() { return Status::NotSupported("read_by_rows is not implemented for this reader."); } const size_t _MIN_BATCH_SIZE = 4064; // 4094 - 32(padding) - /// Whether the underlying FileReader has filled the partition&missing columns - bool _fill_all_columns = false; TPushAggOp::type _push_down_agg_type {}; + bool _push_down_agg_type_locked = false; public: // Pass condition cache context to the reader for HIT/MISS tracking. virtual void set_condition_cache_context(std::shared_ptr ctx) {} + // Returns true if this reader can produce an accurate total row count from metadata + // without reading actual data. 
Used to determine if CountReader decorator can be applied. + // Only ORC and Parquet readers support this (via file footer metadata). + virtual bool supports_count_pushdown() const { return false; } + // Returns the total number of rows the reader will produce. // Used to pre-allocate condition cache with the correct number of granules. virtual int64_t get_total_rows() const { return 0; } @@ -133,6 +287,21 @@ class GenericReader : public ProfileCollector { // Cache to save some common part such as file footer. // Maybe null if not used FileMetaCache* _meta_cache = nullptr; + + // ---- Column descriptors (set by init_reader, owned by FileScanner) ---- + const std::vector* _column_descs = nullptr; + + // ---- get_columns cache ---- + bool _get_columns_cached = false; + std::unordered_map _cached_name_to_type; +}; + +/// Provides an accessor for the current batch's row positions within the file. +/// Implemented by RowGroupReader (Parquet) and OrcReader. +class RowPositionProvider { +public: + virtual ~RowPositionProvider() = default; + virtual const std::vector& current_batch_row_positions() const = 0; }; #include "common/compile_check_end.h" diff --git a/be/src/format/jni/jni_reader.cpp b/be/src/format/jni/jni_reader.cpp index 22e26d829c2010..e074fce9650d24 100644 --- a/be/src/format/jni/jni_reader.cpp +++ b/be/src/format/jni/jni_reader.cpp @@ -110,10 +110,10 @@ Status JniReader::open(RuntimeState* state, RuntimeProfile* profile) { } // ========================================================================= -// JniReader::get_next_block (merged from JniConnector::get_next_block) +// JniReader::_do_get_next_block (merged from JniConnector::get_next_block) // ========================================================================= -Status JniReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +Status JniReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { JNIEnv* env = nullptr; RETURN_IF_ERROR(Jni::Env::Get(&env)); long 
meta_address = 0; diff --git a/be/src/format/jni/jni_reader.h b/be/src/format/jni/jni_reader.h index 87c0c9c0d828e1..9d1cfd4b404f35 100644 --- a/be/src/format/jni/jni_reader.h +++ b/be/src/format/jni/jni_reader.h @@ -87,8 +87,7 @@ class JniReader : public GenericReader { */ Status open(RuntimeState* state, RuntimeProfile* profile); - Status get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) override { + Status _get_columns_impl(std::unordered_map* name_to_type) override { for (const auto& desc : _file_slot_descs) { name_to_type->emplace(desc->col_name(), desc->type()); } @@ -98,7 +97,7 @@ class JniReader : public GenericReader { /** * Read next batch from Java scanner and fill the block. */ - virtual Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; /** * Get table schema from Java scanner (used by Avro schema discovery). diff --git a/be/src/format/json/new_json_reader.cpp b/be/src/format/json/new_json_reader.cpp index cecfcf3f0dcf54..4c22bec476b743 100644 --- a/be/src/format/json/new_json_reader.cpp +++ b/be/src/format/json/new_json_reader.cpp @@ -197,7 +197,53 @@ Status NewJsonReader::init_reader( return Status::OK(); } -Status NewJsonReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +// ---- Unified init_reader(ReaderInitContext*) overrides ---- + +Status NewJsonReader::_open_file_reader(ReaderInitContext* /*ctx*/) { + RETURN_IF_ERROR(_get_range_params()); + RETURN_IF_ERROR(_open_file_reader(false)); + return Status::OK(); +} + +Status NewJsonReader::_do_init_reader(ReaderInitContext* base_ctx) { + auto* ctx = checked_context_cast(base_ctx); + _is_load = ctx->is_load; + + RETURN_IF_ERROR(_get_column_default_value(_file_slot_descs, *ctx->col_default_value_ctx)); + for (auto* slot_desc : _file_slot_descs) { + _serdes.emplace_back(slot_desc->get_data_type_ptr()->get_serde()); + } + + // Create decompressor 
(needed by line reader below) + RETURN_IF_ERROR(Decompressor::create_decompressor(_file_compress_type, &_decompressor)); + + if (LIKELY(_read_json_by_line)) { + RETURN_IF_ERROR(_open_line_reader()); + } + RETURN_IF_ERROR(_parse_jsonpath_and_json_root()); + + if (_parsed_jsonpaths.empty()) { + _vhandle_json_callback = &NewJsonReader::_simdjson_handle_simple_json; + } else { + if (_strip_outer_array) { + _vhandle_json_callback = &NewJsonReader::_simdjson_handle_flat_array_complex_json; + } else { + _vhandle_json_callback = &NewJsonReader::_simdjson_handle_nested_complex_json; + } + } + _ondemand_json_parser = std::make_unique(); + for (int i = 0; i < _file_slot_descs.size(); ++i) { + _slot_desc_index[StringRef {_file_slot_descs[i]->col_name()}] = i; + if (_file_slot_descs[i]->is_skip_bitmap_col()) { + skip_bitmap_col_idx = i; + } + } + _simdjson_ondemand_padding_buffer.resize(_padded_size); + _simdjson_ondemand_unscape_padding_buffer.resize(_padded_size); + return Status::OK(); +} + +Status NewJsonReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { if (_reader_eof) { *eof = true; return Status::OK(); @@ -228,8 +274,8 @@ Status NewJsonReader::get_next_block(Block* block, size_t* read_rows, bool* eof) return Status::OK(); } -Status NewJsonReader::get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) { +Status NewJsonReader::_get_columns_impl( + std::unordered_map* name_to_type) { for (const auto& slot : _file_slot_descs) { name_to_type->emplace(slot->col_name(), slot->type()); } diff --git a/be/src/format/json/new_json_reader.h b/be/src/format/json/new_json_reader.h index 4d803fc1050b19..4c6bb4e3609c78 100644 --- a/be/src/format/json/new_json_reader.h +++ b/be/src/format/json/new_json_reader.h @@ -36,8 +36,8 @@ #include "core/string_ref.h" #include "core/types.h" #include "exprs/json_functions.h" -#include "format/generic_reader.h" #include "format/line_reader.h" +#include "format/table/table_format_reader.h" #include 
"io/file_factory.h" #include "io/fs/file_reader_writer_fwd.h" #include "runtime/runtime_profile.h" @@ -63,7 +63,14 @@ struct ScannerCounter; class Block; class IColumn; -class NewJsonReader : public GenericReader { +/// JSON-specific initialization context. +/// Extends ReaderInitContext with default value context (unique to JSON reader). +struct JsonInitContext final : public ReaderInitContext { + const std::unordered_map* col_default_value_ctx = nullptr; + bool is_load = false; +}; + +class NewJsonReader : public TableFormatReader { ENABLE_FACTORY_CREATOR(NewJsonReader); public: @@ -80,14 +87,17 @@ class NewJsonReader : public GenericReader { Status init_reader( const std::unordered_map& col_default_value_ctx, bool is_load); - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; - Status get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) override; + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status _get_columns_impl(std::unordered_map* name_to_type) override; Status init_schema_reader() override; Status get_parsed_schema(std::vector* col_names, std::vector* col_types) override; protected: + // ---- Unified init_reader(ReaderInitContext*) overrides ---- + Status _open_file_reader(ReaderInitContext* ctx) override; + Status _do_init_reader(ReaderInitContext* ctx) override; + void _collect_profile_before_close() override; private: diff --git a/be/src/format/native/native_reader.cpp b/be/src/format/native/native_reader.cpp index 8693a3e9a22066..90599b223b4e52 100644 --- a/be/src/format/native/native_reader.cpp +++ b/be/src/format/native/native_reader.cpp @@ -146,7 +146,7 @@ Status NativeReader::init_reader() { return Status::OK(); } -Status NativeReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +Status NativeReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { if (_eof) { *read_rows = 0; *eof = true; @@ -219,9 +219,7 @@ Status 
NativeReader::get_next_block(Block* block, size_t* read_rows, bool* eof) return Status::OK(); } -Status NativeReader::get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) { - missing_cols->clear(); +Status NativeReader::_get_columns_impl(std::unordered_map* name_to_type) { RETURN_IF_ERROR(init_reader()); if (!_schema_inited) { diff --git a/be/src/format/native/native_reader.h b/be/src/format/native/native_reader.h index 65d70816628eea..1eb8df868eff0b 100644 --- a/be/src/format/native/native_reader.h +++ b/be/src/format/native/native_reader.h @@ -25,7 +25,7 @@ #include #include "common/status.h" -#include "format/generic_reader.h" +#include "format/table/table_format_reader.h" #include "io/fs/file_reader_writer_fwd.h" namespace doris { @@ -46,7 +46,7 @@ class Block; // it will read a sequence of Blocks encoded in Doris Native binary format. // // NOTE: current implementation is just a skeleton and will be filled step by step. -class NativeReader : public GenericReader { +class NativeReader : public TableFormatReader { public: ENABLE_FACTORY_CREATOR(NativeReader); @@ -58,10 +58,9 @@ class NativeReader : public GenericReader { // Initialize underlying file reader and any format specific state. 
Status init_reader(); - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; - Status get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) override; + Status _get_columns_impl(std::unordered_map* name_to_type) override; Status init_schema_reader() override; @@ -74,6 +73,7 @@ class NativeReader : public GenericReader { protected: void _collect_profile_before_close() override {} + Status _do_init_reader(ReaderInitContext* /*ctx*/) override { return init_reader(); } private: RuntimeProfile* _profile = nullptr; diff --git a/be/src/format/orc/vorc_reader.cpp b/be/src/format/orc/vorc_reader.cpp index a1a46d8565226b..c7b1454f35aebb 100644 --- a/be/src/format/orc/vorc_reader.cpp +++ b/be/src/format/orc/vorc_reader.cpp @@ -292,6 +292,7 @@ OrcReader::OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& r : _profile(nullptr), _scan_params(params), _scan_range(range), + _batch_size(_MIN_BATCH_SIZE), _ctz(ctz), _file_system(nullptr), _io_ctx(io_ctx), @@ -309,6 +310,7 @@ OrcReader::OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& r : _profile(nullptr), _scan_params(params), _scan_range(range), + _batch_size(_MIN_BATCH_SIZE), _ctz(ctz), _file_system(nullptr), _io_ctx(io_ctx_holder ? 
io_ctx_holder.get() : nullptr), @@ -461,31 +463,37 @@ Status OrcReader::_create_file_reader() { return Status::OK(); } -Status OrcReader::init_reader( - const std::vector* column_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, bool is_acid, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts, - std::shared_ptr table_info_node_ptr, - const std::set& column_ids, const std::set& filter_column_ids) { - _table_column_names = column_names; - _col_name_to_block_idx = col_name_to_block_idx; - _lazy_read_ctx.conjuncts = conjuncts; - _is_acid = is_acid; - _tuple_descriptor = tuple_descriptor; - _row_descriptor = row_descriptor; - _table_info_node_ptr = table_info_node_ptr; - _column_ids = column_ids; - _filter_column_ids = filter_column_ids; - - if (not_single_slot_filter_conjuncts != nullptr && !not_single_slot_filter_conjuncts->empty()) { +// ---- Unified init_reader(ReaderInitContext*) overrides ---- + +Status OrcReader::_open_file_reader(ReaderInitContext* /*ctx*/) { + if (_state != nullptr) { + _orc_tiny_stripe_threshold_bytes = _state->query_options().orc_tiny_stripe_threshold_bytes; + _orc_once_max_read_bytes = _state->query_options().orc_once_max_read_bytes; + _orc_max_merge_distance_bytes = _state->query_options().orc_max_merge_distance_bytes; + } + return _create_file_reader(); +} + +Status OrcReader::_do_init_reader(ReaderInitContext* base_ctx) { + auto* ctx = checked_context_cast(base_ctx); + _table_column_names = base_ctx->column_names; + _col_name_to_block_idx = base_ctx->col_name_to_block_idx; + if (ctx->conjuncts != nullptr) { + _lazy_read_ctx.conjuncts = *ctx->conjuncts; + } + _tuple_descriptor = ctx->tuple_descriptor; + _row_descriptor = ctx->row_descriptor; + _table_info_node_ptr = base_ctx->table_info_node; + _column_ids = base_ctx->column_ids; + _filter_column_ids = 
base_ctx->filter_column_ids; + + if (ctx->not_single_slot_filter_conjuncts != nullptr && + !ctx->not_single_slot_filter_conjuncts->empty()) { _not_single_slot_filter_conjuncts.insert(_not_single_slot_filter_conjuncts.end(), - not_single_slot_filter_conjuncts->begin(), - not_single_slot_filter_conjuncts->end()); + ctx->not_single_slot_filter_conjuncts->begin(), + ctx->not_single_slot_filter_conjuncts->end()); } - _slot_id_to_filter_conjuncts = slot_id_to_filter_conjuncts; + _slot_id_to_filter_conjuncts = ctx->slot_id_to_filter_conjuncts; _obj_pool = std::make_unique(); if (_state != nullptr) { @@ -494,8 +502,146 @@ Status OrcReader::init_reader( _orc_max_merge_distance_bytes = _state->query_options().orc_max_merge_distance_bytes; } - RETURN_IF_ERROR(_create_file_reader()); + // _create_file_reader() is called by init_reader template method before hooks. + // For standalone _do_init_reader callers (tvf, load, etc.), open the file here if not already opened. + if (_reader == nullptr) { + RETURN_IF_ERROR(_create_file_reader()); + } RETURN_IF_ERROR(_init_read_columns()); + + // Compute missing columns and file↔table column mapping. + // This runs in _do_init_reader (not on_before_init_reader) because table-format readers + // (Iceberg, Paimon, Hive, Hudi) override on_before_init_reader completely. 
+ if (has_column_descs()) { + _fill_missing_cols.clear(); + _fill_missing_defaults.clear(); + for (const auto& col_name : _table_column_names) { + if (!_table_info_node_ptr->children_column_exists(col_name)) { + _fill_missing_cols.insert(col_name); + } + } + if (_column_descs && !_fill_missing_cols.empty()) { + for (const auto& desc : *_column_descs) { + if (_fill_missing_cols.contains(desc.name) && + !_fill_partition_values.contains(desc.name)) { + _fill_missing_defaults[desc.name] = desc.default_expr; + } + } + } + for (const auto& table_column_name : _table_column_names) { + if (_fill_missing_cols.contains(table_column_name)) { + continue; + } + const auto file_column_name = + _table_info_node_ptr->children_file_column_name(table_column_name); + _read_file_cols.emplace_back(file_column_name); + _read_table_cols.emplace_back(table_column_name); + } + } + + // Register row-position-based synthesized column handler. + // _row_id_column_iterator_pair, _row_lineage_columns, and _iceberg_rowid_params + // are all set before init_reader by FileScanner. + // This must be outside has_column_descs() guard because standalone readers + // (e.g., orc_read_lines tests) also use row_id columns. + if (_row_id_column_iterator_pair.first != nullptr || _iceberg_rowid_params.enabled || + (_row_lineage_columns != nullptr && + (_row_lineage_columns->need_row_ids() || + _row_lineage_columns->has_last_updated_sequence_number_column()))) { + register_synthesized_column_handler( + BeConsts::ROWID_COL, [this](Block* block, size_t rows) -> Status { + return _fill_row_id_columns(block, _row_reader->getRowNumber()); + }); + } + + // Standalone callers (column_descs == nullptr) skip on_before_init_reader, + // so _read_file_cols etc. are not populated. Use table_info_node for name mapping + // when available (e.g., ACID delete reader), otherwise fall back to 1:1 mapping. 
+ if (!has_column_descs() && _read_file_cols.empty()) { + for (const auto& col_name : _table_column_names) { + if (_table_info_node_ptr && _table_info_node_ptr->children_column_exists(col_name)) { + _read_file_cols.emplace_back( + _table_info_node_ptr->children_file_column_name(col_name)); + } else { + _read_file_cols.emplace_back(col_name); + } + _read_table_cols.emplace_back(col_name); + } + } + + // ---- Inlined set_fill_columns logic (partition/missing/synthesized classification) ---- + SCOPED_RAW_TIMER(&_statistics.set_fill_column_time); + + // 1. Collect predicate columns from conjuncts for lazy materialization + std::unordered_map> predicate_table_columns; + _collect_predicate_columns_from_conjuncts(predicate_table_columns); + + // 2. Classify read/partition/missing/synthesized columns into lazy vs predicate groups + _classify_columns_for_lazy_read(predicate_table_columns, _fill_partition_values, + _fill_missing_defaults); + + // 3. Init search argument for min-max filtering + if (_lazy_read_ctx.conjuncts.empty()) { + _lazy_read_ctx.can_lazy_read = false; + } else if (_enable_filter_by_min_max) { + auto res = _init_search_argument(_push_down_exprs); + if (_state->query_options().check_orc_init_sargs_success && !res) { + std::stringstream ss; + for (const auto& conjunct : _lazy_read_ctx.conjuncts) { + ss << conjunct->root()->debug_string() << "\n"; + } + return Status::InternalError( + "Session variable check_orc_init_sargs_success is set, but " + "_init_search_argument returns false because all exprs can not be pushed " + "down:\n " + + ss.str()); + } + } + + // 4. Create ORC row reader (includes tiny stripe optimization and type map) + RETURN_IF_ERROR(_init_orc_row_reader()); + + // 5. 
Build filter conjuncts from not_single_slot and predicate_partition_columns + if (!_not_single_slot_filter_conjuncts.empty()) { + _filter_conjuncts.insert(_filter_conjuncts.end(), _not_single_slot_filter_conjuncts.begin(), + _not_single_slot_filter_conjuncts.end()); + _disable_dict_filter = true; + } + if (_slot_id_to_filter_conjuncts && !_slot_id_to_filter_conjuncts->empty()) { + for (auto& kv : _lazy_read_ctx.predicate_partition_columns) { + auto& [value, slot_desc] = kv.second; + auto iter = _slot_id_to_filter_conjuncts->find(slot_desc->id()); + if (iter != _slot_id_to_filter_conjuncts->end()) { + for (const auto& conjunct_ctx : iter->second) { + _filter_conjuncts.push_back(conjunct_ctx); + } + } + } + } + + return Status::OK(); +} + +Status OrcReader::on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; + RETURN_IF_ERROR( + _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values)); + for (auto& desc : *ctx->column_descs) { + if (desc.category == ColumnCategory::REGULAR || + desc.category == ColumnCategory::GENERATED) { + ctx->column_names.push_back(desc.name); + } + } + + // Build table_info_node from ORC file type with case-insensitive recursive matching. + // _reader is available here because init_reader calls _create_file_reader() before this hook. + // tuple_descriptor may be null in unit tests that only set column_descs. 
+ if (ctx->tuple_descriptor != nullptr) { + RETURN_IF_ERROR(TableSchemaChangeHelper::BuildTableInfoUtil::by_orc_name( + ctx->tuple_descriptor, &_reader->getType(), ctx->table_info_node)); + } + return Status::OK(); } @@ -524,6 +670,8 @@ void OrcReader::set_iceberg_rowid_params(const std::string& file_path, int32_t p _iceberg_rowid_params.row_id_column_pos = row_id_column_pos; } +// set_iceberg_rowid_params removed: now handled by ColumnProcessor + Status OrcReader::_init_read_columns() { SCOPED_RAW_TIMER(&_statistics.init_column_time); const auto& root_type = _reader->getType(); @@ -568,17 +716,6 @@ Status OrcReader::_init_read_columns() { } } - for (size_t i = 0; i < _table_column_names->size(); ++i) { - const auto& table_column_name = (*_table_column_names)[i]; - if (!_table_info_node_ptr->children_column_exists(table_column_name)) { - _missing_cols.emplace_back(table_column_name); - continue; - } - const auto file_column_name = - _table_info_node_ptr->children_file_column_name(table_column_name); - _read_file_cols.emplace_back(file_column_name); - _read_table_cols.emplace_back(table_column_name); - } return Status::OK(); } @@ -1142,15 +1279,8 @@ bool OrcReader::_init_search_argument(const VExprSPtrs& exprs) { return true; } -Status OrcReader::set_fill_columns( - const std::unordered_map>& - partition_columns, - const std::unordered_map& missing_columns) { - SCOPED_RAW_TIMER(&_statistics.set_fill_column_time); - - // std::unordered_map> - std::unordered_map> predicate_table_columns; - // visit_slot for lazy mat. 
+void OrcReader::_collect_predicate_columns_from_conjuncts( + std::unordered_map>& predicate_table_columns) { std::function visit_slot = [&](VExpr* expr) { if (expr->is_slot_ref()) { VSlotRef* slot_ref = static_cast(expr); @@ -1171,13 +1301,10 @@ Status OrcReader::set_fill_columns( auto expr = conjunct->root(); if (expr->is_rf_wrapper()) { - // REF: src/runtime_filter/runtime_filter_consumer.cpp auto* runtime_filter = static_cast(expr.get()); - auto filter_impl = runtime_filter->get_impl(); visit_slot(filter_impl.get()); - // only support push down for filter row group : MAX_FILTER, MAX_FILTER, MINMAX_FILTER, IN_FILTER if ((runtime_filter->node_type() == TExprNodeType::BINARY_PRED) && (runtime_filter->op() == TExprOpcode::GE || runtime_filter->op() == TExprOpcode::LE)) { @@ -1185,7 +1312,6 @@ Status OrcReader::set_fill_columns( } else if (runtime_filter->node_type() == TExprNodeType::IN_PRED && runtime_filter->op() == TExprOpcode::FILTER_IN) { auto* direct_in_predicate = static_cast(filter_impl.get()); - int max_in_size = _state->query_options().__isset.max_pushdown_conditions_per_column ? _state->query_options().max_pushdown_conditions_per_column @@ -1194,7 +1320,6 @@ Status OrcReader::set_fill_columns( direct_in_predicate->get_set_func()->size() > max_in_size) { continue; } - VExprSPtr new_in_slot = nullptr; if (direct_in_predicate->get_slot_in_expr(new_in_slot)) { expr = new_in_slot; @@ -1205,13 +1330,10 @@ Status OrcReader::set_fill_columns( continue; } } else if (VTopNPred* topn_pred = typeid_cast(expr.get())) { - // top runtime filter : only le && ge. DCHECK(topn_pred->children().size() > 0); visit_slot(topn_pred->children()[0].get()); - VExprSPtr binary_expr; if (topn_pred->get_binary_expr(binary_expr)) { - // for min-max filter. 
expr = binary_expr; } else { continue; @@ -1224,7 +1346,13 @@ Status OrcReader::set_fill_columns( _push_down_exprs.emplace_back(expr); } } +} +void OrcReader::_classify_columns_for_lazy_read( + const std::unordered_map>& predicate_table_columns, + const std::unordered_map>& + partition_columns, + const std::unordered_map& missing_columns) { if (_is_acid) { _lazy_read_ctx.predicate_orc_columns.insert( _lazy_read_ctx.predicate_orc_columns.end(), @@ -1258,7 +1386,6 @@ Status OrcReader::set_fill_columns( } else { _lazy_read_ctx.predicate_columns.first.emplace_back(iter->first); _lazy_read_ctx.predicate_columns.second.emplace_back(iter->second.second); - _lazy_read_ctx.predicate_orc_columns.emplace_back( _table_info_node_ptr->children_file_column_name(iter->first)); if (check_iceberg_row_lineage_column_idx(read_table_col) != -1) { @@ -1283,16 +1410,13 @@ Status OrcReader::set_fill_columns( if (iter == predicate_table_columns.end()) { _lazy_read_ctx.missing_columns.emplace(kv.first, kv.second); } else { - //For check missing column : missing column == xx, missing column is null,missing column is not null. if (_slot_id_to_filter_conjuncts->find(iter->second.second) != _slot_id_to_filter_conjuncts->end()) { for (const auto& ctx : _slot_id_to_filter_conjuncts->find(iter->second.second)->second) { - _filter_conjuncts.emplace_back(ctx); // todo ?????? + _filter_conjuncts.emplace_back(ctx); } } - - // predicate_missing_columns is VLiteral.To fill in default values for missing columns. 
_lazy_read_ctx.predicate_missing_columns.emplace(kv.first, kv.second); if (check_iceberg_row_lineage_column_idx(kv.first) != -1) { _enable_lazy_mat = false; @@ -1304,40 +1428,25 @@ Status OrcReader::set_fill_columns( !_lazy_read_ctx.lazy_read_columns.empty()) { _lazy_read_ctx.can_lazy_read = true; } +} - if (_lazy_read_ctx.conjuncts.empty()) { - _lazy_read_ctx.can_lazy_read = false; - } else if (_enable_filter_by_min_max) { - auto res = _init_search_argument(_push_down_exprs); - if (_state->query_options().check_orc_init_sargs_success && !res) { - std::stringstream ss; - for (const auto& conjunct : _lazy_read_ctx.conjuncts) { - ss << conjunct->root()->debug_string() << "\n"; - } - std::string conjuncts_str = ss.str(); - return Status::InternalError( - "Session variable check_orc_init_sargs_success is set, but " - "_init_search_argument returns false because all exprs can not be pushed " - "down:\n " + - conjuncts_str); - } - } +Status OrcReader::_init_orc_row_reader() { try { _row_reader_options.range(_range_start_offset, _range_size); - _row_reader_options.setTimezoneName(_ctz == "CST" ? "Asia/Shanghai" : _ctz); + std::string tz = _ctz.empty() ? "UTC" : (_ctz == "CST" ? "Asia/Shanghai" : _ctz); + _row_reader_options.setTimezoneName(tz); if (!_column_ids.empty()) { std::list column_ids_list(_column_ids.begin(), _column_ids.end()); _row_reader_options.includeTypes(column_ids_list); - } else { // If column_ids is empty, include all top-level columns to be read. + } else { _row_reader_options.include(_read_file_cols); } _row_reader_options.setEnableLazyDecoding(true); - //orc reader should not use the tiny stripe optimization when reading by row id. 
+ // Tiny stripe optimization (skip when reading by row id) if (!_read_by_rows) { uint64_t number_of_stripes = _reader->getNumberOfStripes(); auto all_stripes_needed = _reader->getNeedReadStripes(_row_reader_options); - int64_t range_end_offset = _range_start_offset + _range_size; bool all_tiny_stripes = true; @@ -1356,7 +1465,6 @@ Status OrcReader::set_fill_columns( all_tiny_stripes = false; break; } - tiny_stripe_ranges.emplace_back(strip_start_offset, strip_end_offset); } if (all_tiny_stripes && number_of_stripes > 0) { @@ -1366,7 +1474,6 @@ Status OrcReader::set_fill_columns( _orc_once_max_read_bytes); auto range_finder = std::make_shared( std::move(prefetch_merge_ranges)); - auto* orc_input_stream_ptr = static_cast(_reader->getStream()); orc_input_stream_ptr->set_all_tiny_stripes(); auto& orc_file_reader = orc_input_stream_ptr->get_file_reader(); @@ -1376,6 +1483,7 @@ Status OrcReader::set_fill_columns( } } + // Merge predicate partition/missing back if can't lazy read if (!_lazy_read_ctx.can_lazy_read) { for (auto& kv : _lazy_read_ctx.predicate_partition_columns) { _lazy_read_ctx.partition_columns.emplace(kv.first, kv.second); @@ -1385,8 +1493,7 @@ Status OrcReader::set_fill_columns( } } - _fill_all_columns = true; - // create orc row reader + // Create ORC row reader if (_lazy_read_ctx.can_lazy_read) { _row_reader_options.filter(_lazy_read_ctx.predicate_orc_columns); _orc_filter = std::make_unique(this); @@ -1397,6 +1504,7 @@ Status OrcReader::set_fill_columns( _row_reader = _reader->createRowReader(_row_reader_options, _orc_filter.get(), _string_dict_filter.get()); + // Build column name → index and type maps _batch = _row_reader->createRowBatch(_batch_size); // Derive the first row in this scan range from ORC RowReader's initial state. 
@@ -1457,94 +1565,6 @@ Status OrcReader::set_fill_columns( } } - if (!_not_single_slot_filter_conjuncts.empty()) { - _filter_conjuncts.insert(_filter_conjuncts.end(), _not_single_slot_filter_conjuncts.begin(), - _not_single_slot_filter_conjuncts.end()); - _disable_dict_filter = true; - } - - if (_slot_id_to_filter_conjuncts && !_slot_id_to_filter_conjuncts->empty()) { - // Add predicate_partition_columns in _slot_id_to_filter_conjuncts(single slot conjuncts) - // to _filter_conjuncts, others should be added from not_single_slot_filter_conjuncts. - for (auto& kv : _lazy_read_ctx.predicate_partition_columns) { - auto& [value, slot_desc] = kv.second; - auto iter = _slot_id_to_filter_conjuncts->find(slot_desc->id()); - if (iter != _slot_id_to_filter_conjuncts->end()) { - for (const auto& ctx : iter->second) { - _filter_conjuncts.push_back(ctx); - } - } - } - } - return Status::OK(); -} - -Status OrcReader::_fill_partition_columns( - Block* block, uint64_t rows, - const std::unordered_map>& - partition_columns) { - DataTypeSerDe::FormatOptions _text_formatOptions; - for (const auto& kv : partition_columns) { - auto col_ptr = block->get_by_position((*_col_name_to_block_idx)[kv.first]) - .column->assume_mutable(); - const auto& [value, slot_desc] = kv.second; - auto text_serde = slot_desc->get_data_type_ptr()->get_serde(); - Slice slice(value.data(), value.size()); - uint64_t num_deserialized = 0; - if (text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, &num_deserialized, - _text_formatOptions) != Status::OK()) { - return Status::InternalError("Failed to fill partition column: {}={}", - slot_desc->col_name(), value); - } - if (num_deserialized != rows) { - return Status::InternalError( - "Failed to fill partition column: {}={} ." 
- "Number of rows expected to be written : {}, number of rows actually " - "written : " - "{}", - slot_desc->col_name(), value, num_deserialized, rows); - } - } - return Status::OK(); -} - -Status OrcReader::_fill_missing_columns( - Block* block, uint64_t rows, - const std::unordered_map& missing_columns) { - for (const auto& kv : missing_columns) { - if (!_col_name_to_block_idx->contains(kv.first)) { - return Status::InternalError("Failed to find missing column: {}, block: {}", kv.first, - block->dump_structure()); - } - if (kv.second == nullptr) { - // no default column, fill with null - auto mutable_column = block->get_by_position((*_col_name_to_block_idx)[kv.first]) - .column->assume_mutable(); - auto* nullable_column = static_cast(mutable_column.get()); - nullable_column->insert_many_defaults(rows); - } else { - // fill with default value - const auto& ctx = kv.second; - // PT1 => dest primitive type - ColumnPtr result_column_ptr; - RETURN_IF_ERROR(ctx->execute(block, result_column_ptr)); - if (result_column_ptr->use_count() == 1) { - // call resize because the first column of _src_block_ptr may not be filled by reader, - // so _src_block_ptr->rows() may return wrong result, cause the column created by `ctx->execute()` - // has only one row. - auto mutable_column = result_column_ptr->assume_mutable(); - mutable_column->resize(rows); - // result_column_ptr maybe a ColumnConst, convert it to a normal column - result_column_ptr = result_column_ptr->convert_to_full_column_if_const(); - auto origin_column_type = - block->get_by_position((*_col_name_to_block_idx)[kv.first]).type; - bool is_nullable = origin_column_type->is_nullable(); - block->replace_by_position( - (*_col_name_to_block_idx)[kv.first], - is_nullable ? 
make_nullable(result_column_ptr) : result_column_ptr); - } - } - } return Status::OK(); } @@ -1639,6 +1659,8 @@ Status OrcReader::_append_iceberg_rowid_column(Block* block, size_t rows, int64_ return Status::OK(); } +// _append_iceberg_rowid_column removed: now handled by ColumnProcessor.fill_synthesized_columns + void OrcReader::_init_system_properties() { if (_scan_range.__isset.file_type) { // for compatibility @@ -1819,16 +1841,12 @@ DataTypePtr OrcReader::convert_to_doris_type(const orc::Type* orc_type) { } } -Status OrcReader::get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) { +Status OrcReader::_get_columns_impl(std::unordered_map* name_to_type) { const auto& root_type = _reader->getType(); for (int i = 0; i < root_type.getSubtypeCount(); ++i) { name_to_type->emplace(root_type.getFieldName(i), convert_to_doris_type(root_type.getSubtype(i))); } - for (auto& col : _missing_cols) { - missing_cols->insert(col); - } return Status::OK(); } @@ -2381,7 +2399,7 @@ std::string OrcReader::get_field_name_lower_case(const orc::Type* orc_type, int return name; } -Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +Status OrcReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { RETURN_IF_ERROR(_get_next_block_impl(block, read_rows, eof)); if (*eof) { COUNTER_UPDATE(_orc_profile.selected_row_group_count, @@ -2435,21 +2453,6 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo *read_rows = 0; return Status::OK(); } - if (_push_down_agg_type == TPushAggOp::type::COUNT) { - auto rows = std::min(get_remaining_rows(), (int64_t)_batch_size); - - set_remaining_rows(get_remaining_rows() - rows); - auto mutate_columns = block->mutate_columns(); - for (auto& col : mutate_columns) { - col->resize(rows); - } - block->set_columns(std::move(mutate_columns)); - *read_rows = rows; - if (get_remaining_rows() == 0) { - *eof = true; - } - return Status::OK(); - } if 
(!_seek_to_read_one_line()) { *eof = true; @@ -2482,6 +2485,7 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo } // After nextBatch(), getRowNumber() returns the start of the batch just read. _last_read_row_number = _row_reader->getRowNumber(); + // Use _batch->numElements (not rr) because ORC's nextBatch has an // internal do-while loop: when the filter callback rejects an entire // batch, the loop retries with the next batch. The return value (rr) @@ -2490,6 +2494,7 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo // to that iteration's batch size (Reader.cc:1427), giving the correct // next-read position. _current_read_position = _last_read_row_number + _batch->numElements; + } catch (std::exception& e) { std::string _err_msg = e.what(); if (_io_ctx && _io_ctx->should_stop && _err_msg == "stop") { @@ -2545,13 +2550,30 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo #endif } - RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements, - _lazy_read_ctx.partition_columns)); - RETURN_IF_ERROR( - _fill_missing_columns(block, _batch->numElements, _lazy_read_ctx.missing_columns)); + { + std::vector part_cols; + for (const auto& kv : _lazy_read_ctx.partition_columns) { + part_cols.push_back(kv.first); + } + RETURN_IF_ERROR(on_fill_partition_columns(block, _batch->numElements, part_cols)); + } + { + std::vector miss_cols; + for (const auto& kv : _lazy_read_ctx.missing_columns) { + miss_cols.push_back(kv.first); + } + RETURN_IF_ERROR(on_fill_missing_columns(block, _batch->numElements, miss_cols)); + } - RETURN_IF_ERROR(_fill_row_id_columns(block, start_row)); - RETURN_IF_ERROR(_append_iceberg_rowid_column(block, block->rows(), start_row)); + // Build sequential row positions for RowPositionProvider + _current_batch_row_positions.resize(block->rows()); + for (size_t i = 0; i < block->rows(); ++i) { + _current_batch_row_positions[i] = + static_cast(start_row + 
static_cast(i)); + } + if (has_synthesized_column_handlers()) { + RETURN_IF_ERROR(fill_synthesized_columns(block, block->rows())); + } if (block->rows() == 0) { RETURN_IF_ERROR(_convert_dict_cols_to_string_cols(block, nullptr)); @@ -2616,6 +2638,7 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo } // After nextBatch(), getRowNumber() returns the start of the batch just read. _last_read_row_number = _row_reader->getRowNumber(); + _current_read_position = _last_read_row_number + _batch->numElements; } catch (std::exception& e) { std::string _err_msg = e.what(); @@ -2678,13 +2701,30 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo #endif } - RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements, - _lazy_read_ctx.partition_columns)); - RETURN_IF_ERROR( - _fill_missing_columns(block, _batch->numElements, _lazy_read_ctx.missing_columns)); + { + std::vector part_cols; + for (const auto& kv : _lazy_read_ctx.partition_columns) { + part_cols.push_back(kv.first); + } + RETURN_IF_ERROR(on_fill_partition_columns(block, _batch->numElements, part_cols)); + } + { + std::vector miss_cols; + for (const auto& kv : _lazy_read_ctx.missing_columns) { + miss_cols.push_back(kv.first); + } + RETURN_IF_ERROR(on_fill_missing_columns(block, _batch->numElements, miss_cols)); + } - RETURN_IF_ERROR(_fill_row_id_columns(block, start_row)); - RETURN_IF_ERROR(_append_iceberg_rowid_column(block, block->rows(), start_row)); + // Build sequential row positions for RowPositionProvider + _current_batch_row_positions.resize(block->rows()); + for (size_t i = 0; i < block->rows(); ++i) { + _current_batch_row_positions[i] = + static_cast(start_row + static_cast(i)); + } + if (has_synthesized_column_handlers()) { + RETURN_IF_ERROR(fill_synthesized_columns(block, block->rows())); + } if (block->rows() == 0) { RETURN_IF_ERROR(_convert_dict_cols_to_string_cols(block, nullptr)); @@ -2836,10 +2876,9 @@ void 
OrcReader::_build_delete_row_filter(const Block* block, size_t rows) { auto bucket_id = bucket_id_column.get_int(i); auto row_id = row_id_column.get_int(i); - TransactionalHiveReader::AcidRowID transactional_row_id = { - .original_transaction = original_transaction, - .bucket = bucket_id, - .row_id = row_id}; + AcidRowID transactional_row_id = {.original_transaction = original_transaction, + .bucket = bucket_id, + .row_id = row_id}; if (_delete_rows->contains(transactional_row_id)) { _pos_delete_filter_data[i] = 0; } @@ -2903,9 +2942,20 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s column_ptr->sanity_check(); #endif } - RETURN_IF_ERROR( - _fill_partition_columns(block, size, _lazy_read_ctx.predicate_partition_columns)); - RETURN_IF_ERROR(_fill_missing_columns(block, size, _lazy_read_ctx.predicate_missing_columns)); + { + std::vector pred_part_cols; + for (const auto& kv : _lazy_read_ctx.predicate_partition_columns) { + pred_part_cols.push_back(kv.first); + } + RETURN_IF_ERROR(on_fill_partition_columns(block, size, pred_part_cols)); + } + { + std::vector pred_miss_cols; + for (const auto& kv : _lazy_read_ctx.predicate_missing_columns) { + pred_miss_cols.push_back(kv.first); + } + RETURN_IF_ERROR(on_fill_missing_columns(block, size, pred_miss_cols)); + } if (_lazy_read_ctx.resize_first_column) { // VExprContext.execute has an optimization, the filtering is executed when block->rows() > 0 // The following process may be tricky and time-consuming, but we have no other way. 
diff --git a/be/src/format/orc/vorc_reader.h b/be/src/format/orc/vorc_reader.h index 4d7a73d19f6f6e..a4ae7f1e9c6fde 100644 --- a/be/src/format/orc/vorc_reader.h +++ b/be/src/format/orc/vorc_reader.h @@ -41,9 +41,9 @@ #include "exprs/vslot_ref.h" #include "format/column_type_convert.h" #include "format/format_common.h" -#include "format/generic_reader.h" #include "format/table/table_format_reader.h" -#include "format/table/transactional_hive_reader.h" +#include "format/table/table_schema_change_helper.h" +#include "format/table/transactional_hive_common.h" #include "io/file_factory.h" #include "io/fs/buffered_reader.h" #include "io/fs/file_reader.h" @@ -54,7 +54,6 @@ #include "orc/Vector.hh" #include "orc/sargs/Literal.hh" #include "runtime/runtime_profile.h" -#include "storage/olap_common.h" namespace doris { class RuntimeState; @@ -85,6 +84,18 @@ namespace doris { #include "common/compile_check_begin.h" class ORCFileInputStream; +/// ORC-specific initialization context. +/// Extends ReaderInitContext with conjuncts and filter fields. +/// Note: ORC does NOT use slot_id_to_predicates (unlike Parquet). +struct OrcInitContext final : public ReaderInitContext { + // Safe default for standalone readers (delete file readers) without conjuncts. 
+ static inline const VExprContextSPtrs EMPTY_CONJUNCTS {}; + + const VExprContextSPtrs* conjuncts = &EMPTY_CONJUNCTS; + const VExprContextSPtrs* not_single_slot_filter_conjuncts = nullptr; + const std::unordered_map* slot_id_to_filter_conjuncts = nullptr; +}; + struct LazyReadContext { VExprContextSPtrs conjuncts; bool can_lazy_read = false; @@ -117,7 +128,7 @@ struct LazyReadContext { size_t filter_phase_rows = 0; }; -class OrcReader : public GenericReader { +class OrcReader : public TableFormatReader, public RowPositionProvider { ENABLE_FACTORY_CREATOR(OrcReader); public: @@ -161,30 +172,21 @@ class OrcReader : public GenericReader { FileMetaCache* meta_cache = nullptr, bool enable_lazy_mat = true); ~OrcReader() override = default; - //If you want to read the file by index instead of column name, set hive_use_column_names to false. - Status init_reader( - const std::vector* column_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, bool is_acid, - const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts, - std::shared_ptr table_info_node_ptr = - TableSchemaChangeHelper::ConstNode::get_instance(), - const std::set& column_ids = {}, - const std::set& filter_column_ids = {}); - - Status set_fill_columns( - const std::unordered_map>& - partition_columns, - const std::unordered_map& missing_columns) override; - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + // Override to build table_info_node from ORC file type using by_orc_name. + // Subclasses (HiveOrcReader, IcebergOrcReader) call GenericReader::on_before_init_reader + // directly, so this OrcReader-level override only applies to plain OrcReader (TVF, load). 
+ Status on_before_init_reader(ReaderInitContext* ctx) override; +protected: + // ---- Unified init_reader(ReaderInitContext*) overrides ---- + Status _open_file_reader(ReaderInitContext* ctx) override; + Status _do_init_reader(ReaderInitContext* ctx) override; + +public: int64_t size() const; - Status get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) override; + Status _get_columns_impl(std::unordered_map* name_to_type) override; Status init_schema_reader() override; @@ -195,9 +197,7 @@ class OrcReader : public GenericReader { _position_delete_ordered_rowids = delete_rows; } - void set_delete_rows(const TransactionalHiveReader::AcidRowIDSet* delete_rows) { - _delete_rows = delete_rows; - } + void set_delete_rows(const AcidRowIDSet* delete_rows) { _delete_rows = delete_rows; } Status filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t size, void* arg); @@ -240,6 +240,8 @@ class OrcReader : public GenericReader { _condition_cache_ctx = std::move(ctx); } + bool supports_count_pushdown() const override { return true; } + int64_t get_total_rows() const override { return _row_reader ? _row_reader->getNumberOfRows() : 0; } @@ -250,10 +252,36 @@ class OrcReader : public GenericReader { (_delete_rows != nullptr && !_delete_rows->empty()); } + // RowPositionProvider implementation + const std::vector& current_batch_row_positions() const override { + return _current_batch_row_positions; + } + protected: void _collect_profile_before_close() override; void _filter_rows_by_condition_cache(size_t* read_rows, bool* eof); + // Core block reading implementation + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; + + // ORC fills partition/missing columns per-batch internally, + // so suppress TableFormatReader's default on_after_read_block fill. 
+ Status on_after_read_block(Block* /*block*/, size_t* /*read_rows*/) override { + return Status::OK(); + } + + // Protected accessors so CRTP mixin subclasses can reach private members + io::IOContext* get_io_ctx() const { return _io_ctx; } + std::unordered_map*& col_name_to_block_idx_ref() { + return _col_name_to_block_idx; + } + RuntimeProfile* get_profile() const { return _profile; } + RuntimeState* get_state() const { return _state; } + const TFileScanRangeParams& get_scan_params() const { return _scan_params; } + const TFileRangeDesc& get_scan_range() const { return _scan_range; } + const TupleDescriptor* get_tuple_descriptor() const { return _tuple_descriptor; } + const RowDescriptor* get_row_descriptor() const { return _row_descriptor; } + private: struct IcebergRowIdParams { bool enabled = false; @@ -330,6 +358,20 @@ class OrcReader : public GenericReader { static bool _check_acid_schema(const orc::Type& type); + // ---- set_fill_columns sub-functions ---- + // Collect predicate columns from conjuncts for lazy materialization. + void _collect_predicate_columns_from_conjuncts( + std::unordered_map>& predicate_table_columns); + // Classify read/partition/missing columns into lazy vs predicate groups. + void _classify_columns_for_lazy_read( + const std::unordered_map>& + predicate_table_columns, + const std::unordered_map>& + partition_columns, + const std::unordered_map& missing_columns); + // Create ORC row reader with proper options, tiny stripe optimization, and type map. 
+ Status _init_orc_row_reader(); + // functions for building search argument until _init_search_argument // Get predicate type from slot reference std::pair _get_orc_predicate_type(const VSlotRef* slot_ref); @@ -360,13 +402,6 @@ class OrcReader : public GenericReader { void _build_delete_row_filter(const Block* block, size_t rows); Status _get_next_block_impl(Block* block, size_t* read_rows, bool* eof); - Status _fill_partition_columns( - Block* block, uint64_t rows, - const std::unordered_map>& - partition_columns); - Status _fill_missing_columns( - Block* block, uint64_t rows, - const std::unordered_map& missing_columns); void _init_system_properties(); void _init_file_description(); @@ -679,6 +714,13 @@ class OrcReader : public GenericReader { Status _set_read_one_line_impl() override { _batch_size = 1; + // If the ORC row reader already exists, the batch was created earlier + // (during _do_init_reader) with the original _batch_size (capped to + // _MIN_BATCH_SIZE = 4064). We must recreate it with the new size of 1 + // so that nextBatch() returns at most 1 row per call. 
+ if (_row_reader) { + _batch = _row_reader->createRowBatch(_batch_size); + } return Status::OK(); } @@ -693,13 +735,18 @@ class OrcReader : public GenericReader { io::FileDescription _file_description; size_t _batch_size; int64_t _range_start_offset; + +protected: + size_t get_batch_size() const { return _batch_size; } + +private: int64_t _range_size; std::string _ctz; cctz::time_zone _time_zone; // The columns of the table to be read (contain columns that do not exist) - const std::vector* _table_column_names; + std::vector _table_column_names; // The columns of the file to be read (file column name) std::list _read_file_cols; @@ -707,9 +754,6 @@ class OrcReader : public GenericReader { // The columns of the table to be read (table column name) std::list _read_table_cols; - // _read_table_cols + _missing_cols = _table_column_names - std::list _missing_cols; - // file column name to std::vector idx. std::unordered_map _colname_to_idx; @@ -745,20 +789,25 @@ class OrcReader : public GenericReader { io::IOContext* _io_ctx = nullptr; std::shared_ptr _io_ctx_holder; + const TupleDescriptor* _tuple_descriptor = nullptr; + const RowDescriptor* _row_descriptor = nullptr; bool _enable_lazy_mat = true; bool _enable_filter_by_min_max = true; std::vector _decimal_scale_params; size_t _decimal_scale_params_index; +protected: bool _is_acid = false; - std::unique_ptr _filter; + // Protected so Iceberg subclasses can register synthesized columns + // in on_before_init_reader. 
LazyReadContext _lazy_read_ctx; - const TransactionalHiveReader::AcidRowIDSet* _delete_rows = nullptr; + +private: + std::unique_ptr _filter; + const AcidRowIDSet* _delete_rows = nullptr; std::unique_ptr _delete_rows_filter_ptr; - const TupleDescriptor* _tuple_descriptor = nullptr; - const RowDescriptor* _row_descriptor = nullptr; VExprContextSPtrs _not_single_slot_filter_conjuncts; const std::unordered_map* _slot_id_to_filter_conjuncts = nullptr; VExprContextSPtrs _dict_filter_conjuncts; @@ -790,6 +839,8 @@ class OrcReader : public GenericReader { IcebergRowIdParams _iceberg_rowid_params; std::shared_ptr _row_lineage_columns; + std::vector _current_batch_row_positions; + // Through this node, you can find the file column based on the table column. std::shared_ptr _table_info_node_ptr = TableSchemaChangeHelper::ConstNode::get_instance(); diff --git a/be/src/format/parquet/schema_desc.cpp b/be/src/format/parquet/schema_desc.cpp index f12b7bc4102fb7..6501e0b7a91a10 100644 --- a/be/src/format/parquet/schema_desc.cpp +++ b/be/src/format/parquet/schema_desc.cpp @@ -30,7 +30,8 @@ #include "core/data_type/data_type_map.h" #include "core/data_type/data_type_struct.h" #include "core/data_type/define_primitive_type.h" -#include "format/table/table_format_reader.h" +#include "format/generic_reader.h" +#include "format/table/table_schema_change_helper.h" #include "util/slice.h" #include "util/string_util.h" diff --git a/be/src/format/parquet/vparquet_column_reader.h b/be/src/format/parquet/vparquet_column_reader.h index 02a94731bab380..3fa25667f18516 100644 --- a/be/src/format/parquet/vparquet_column_reader.h +++ b/be/src/format/parquet/vparquet_column_reader.h @@ -29,12 +29,15 @@ #include "common/status.h" #include "core/data_type/data_type.h" +#include "format/generic_reader.h" #include "format/parquet/parquet_column_convert.h" #include "format/parquet/parquet_common.h" #include "format/parquet/vparquet_column_chunk_reader.h" -#include "format/table/table_format_reader.h" 
+#include "format/table/table_schema_change_helper.h" #include "io/fs/buffered_reader.h" #include "io/fs/file_reader_writer_fwd.h" +#include "parquet_column_convert.h" +#include "vparquet_column_chunk_reader.h" namespace cctz { class time_zone; diff --git a/be/src/format/parquet/vparquet_group_reader.cpp b/be/src/format/parquet/vparquet_group_reader.cpp index 2cb13bfd6003f8..0efeea70c10eb8 100644 --- a/be/src/format/parquet/vparquet_group_reader.cpp +++ b/be/src/format/parquet/vparquet_group_reader.cpp @@ -29,7 +29,6 @@ #include #include "common/config.h" -#include "common/consts.h" #include "common/logging.h" #include "common/object_pool.h" #include "common/status.h" @@ -39,13 +38,10 @@ #include "core/column/column_const.h" #include "core/column/column_nullable.h" #include "core/column/column_string.h" -#include "core/column/column_struct.h" #include "core/column/column_vector.h" #include "core/custom_allocator.h" #include "core/data_type/data_type.h" -#include "core/data_type/data_type_number.h" #include "core/data_type/data_type_string.h" -#include "core/data_type/data_type_struct.h" #include "core/data_type/define_primitive_type.h" #include "core/pod_array.h" #include "core/types.h" @@ -79,63 +75,6 @@ struct IOContext; namespace doris { #include "common/compile_check_begin.h" -namespace { -Status build_iceberg_rowid_column(const DataTypePtr& type, const std::string& file_path, - const std::vector& row_ids, int32_t partition_spec_id, - const std::string& partition_data_json, - MutableColumnPtr* column_out) { - if (type == nullptr || column_out == nullptr) { - return Status::InvalidArgument("Invalid iceberg rowid column type or output column"); - } - - MutableColumnPtr column = type->create_column(); - ColumnNullable* nullable_col = check_and_get_column(column.get()); - ColumnStruct* struct_col = nullptr; - if (nullable_col != nullptr) { - struct_col = - check_and_get_column(nullable_col->get_nested_column_ptr().get()); - } else { - struct_col = 
check_and_get_column(column.get()); - } - - if (struct_col == nullptr || struct_col->tuple_size() < 4) { - return Status::InternalError("Invalid iceberg rowid column structure"); - } - - size_t num_rows = row_ids.size(); - auto& file_path_col = struct_col->get_column(0); - auto& row_pos_col = struct_col->get_column(1); - auto& spec_id_col = struct_col->get_column(2); - auto& partition_data_col = struct_col->get_column(3); - - file_path_col.reserve(num_rows); - row_pos_col.reserve(num_rows); - spec_id_col.reserve(num_rows); - partition_data_col.reserve(num_rows); - - for (size_t i = 0; i < num_rows; ++i) { - file_path_col.insert_data(file_path.data(), file_path.size()); - } - for (size_t i = 0; i < num_rows; ++i) { - int64_t row_pos = static_cast(row_ids[i]); - row_pos_col.insert_data(reinterpret_cast(&row_pos), sizeof(row_pos)); - } - for (size_t i = 0; i < num_rows; ++i) { - int32_t spec_id = partition_spec_id; - spec_id_col.insert_data(reinterpret_cast(&spec_id), sizeof(spec_id)); - } - for (size_t i = 0; i < num_rows; ++i) { - partition_data_col.insert_data(partition_data_json.data(), partition_data_json.size()); - } - - if (nullable_col != nullptr) { - nullable_col->get_null_map_data().resize_fill(num_rows, 0); - } - - *column_out = std::move(column); - return Status::OK(); -} -} // namespace const std::vector RowGroupReader::NO_DELETE = {}; static constexpr uint32_t MAX_DICT_CODE_PREDICATE_TO_REWRITE = std::numeric_limits::max(); @@ -386,12 +325,15 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ bool modify_row_ids = false; RETURN_IF_ERROR(_read_empty_batch(batch_size, read_rows, batch_eof, &modify_row_ids)); - RETURN_IF_ERROR( - _fill_partition_columns(block, *read_rows, _lazy_read_ctx.partition_columns)); - RETURN_IF_ERROR(_fill_missing_columns(block, *read_rows, _lazy_read_ctx.missing_columns)); - - RETURN_IF_ERROR(_fill_row_id_columns(block, *read_rows, modify_row_ids)); - 
RETURN_IF_ERROR(_append_iceberg_rowid_column(block, *read_rows, modify_row_ids)); + DCHECK(_table_format_reader); + RETURN_IF_ERROR(_table_format_reader->on_fill_partition_columns( + block, *read_rows, _lazy_read_ctx.partition_col_names)); + RETURN_IF_ERROR(_table_format_reader->on_fill_missing_columns( + block, *read_rows, _lazy_read_ctx.missing_col_names)); + if (_table_format_reader->has_synthesized_column_handlers()) { + RETURN_IF_ERROR(_get_current_batch_row_id(*read_rows)); + } + RETURN_IF_ERROR(_table_format_reader->fill_synthesized_columns(block, *read_rows)); Status st = VExprContext::filter_block(_lazy_read_ctx.conjuncts, block, block->columns()); *read_rows = block->rows(); @@ -405,11 +347,16 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ int64_t batch_base_row = _total_read_rows; RETURN_IF_ERROR((_read_column_data(block, _lazy_read_ctx.all_read_columns, batch_size, read_rows, batch_eof, filter_map))); - RETURN_IF_ERROR( - _fill_partition_columns(block, *read_rows, _lazy_read_ctx.partition_columns)); - RETURN_IF_ERROR(_fill_missing_columns(block, *read_rows, _lazy_read_ctx.missing_columns)); - RETURN_IF_ERROR(_fill_row_id_columns(block, *read_rows, false)); - RETURN_IF_ERROR(_append_iceberg_rowid_column(block, *read_rows, false)); + DCHECK(_table_format_reader); + RETURN_IF_ERROR(_table_format_reader->on_fill_partition_columns( + block, *read_rows, _lazy_read_ctx.partition_col_names)); + RETURN_IF_ERROR(_table_format_reader->on_fill_missing_columns( + block, *read_rows, _lazy_read_ctx.missing_col_names)); + + if (_table_format_reader->has_synthesized_column_handlers()) { + RETURN_IF_ERROR(_get_current_batch_row_id(*read_rows)); + } + RETURN_IF_ERROR(_table_format_reader->fill_synthesized_columns(block, *read_rows)); #ifndef NDEBUG for (auto col : *block) { @@ -683,12 +630,15 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re } pre_raw_read_rows += pre_read_rows; - 
RETURN_IF_ERROR(_fill_partition_columns(block, pre_read_rows, - _lazy_read_ctx.predicate_partition_columns)); - RETURN_IF_ERROR(_fill_missing_columns(block, pre_read_rows, - _lazy_read_ctx.predicate_missing_columns)); - RETURN_IF_ERROR(_fill_row_id_columns(block, pre_read_rows, false)); - RETURN_IF_ERROR(_append_iceberg_rowid_column(block, pre_read_rows, false)); + DCHECK(_table_format_reader); + RETURN_IF_ERROR(_table_format_reader->on_fill_partition_columns( + block, pre_read_rows, _lazy_read_ctx.predicate_partition_col_names)); + RETURN_IF_ERROR(_table_format_reader->on_fill_missing_columns( + block, pre_read_rows, _lazy_read_ctx.predicate_missing_col_names)); + if (_table_format_reader->has_synthesized_column_handlers()) { + RETURN_IF_ERROR(_get_current_batch_row_id(pre_read_rows)); + } + RETURN_IF_ERROR(_table_format_reader->fill_synthesized_columns(block, pre_read_rows)); RETURN_IF_ERROR(_build_pos_delete_filter(pre_read_rows)); @@ -706,7 +656,7 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re bool can_filter_all = false; bool resize_first_column = _lazy_read_ctx.resize_first_column; - if (resize_first_column && _iceberg_rowid_params.enabled) { + if (resize_first_column && _table_format_reader->has_synthesized_column_handlers()) { int row_id_idx = block->get_position_by_name(doris::BeConsts::ICEBERG_ROWID_COL); if (row_id_idx == 0) { resize_first_column = false; @@ -716,7 +666,7 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re SCOPED_RAW_TIMER(&_predicate_filter_time); // generate filter vector - if (resize_first_column) { + if (_lazy_read_ctx.resize_first_column) { // VExprContext.execute has an optimization, the filtering is executed when block->rows() > 0 // The following process may be tricky and time-consuming, but we have no other way. 
block->get_by_position(0).column->assume_mutable()->resize(pre_read_rows); @@ -742,7 +692,7 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re _mark_condition_cache_granules(result_filter.data(), pre_read_rows, batch_base_row); } - if (resize_first_column) { + if (_lazy_read_ctx.resize_first_column) { // We have to clean the first column to insert right data. block->get_by_position(0).column->assume_mutable()->clear(); } @@ -775,7 +725,7 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re .column->assume_mutable() ->clear(); } - if (_iceberg_rowid_params.enabled) { + if (_table_format_reader->has_synthesized_column_handlers()) { int row_id_idx = block->get_position_by_name(doris::BeConsts::ICEBERG_ROWID_COL); if (row_id_idx >= 0) { @@ -843,7 +793,7 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re SCOPED_RAW_TIMER(&_predicate_filter_time); if (filter_map.has_filter()) { std::vector predicate_columns = _lazy_read_ctx.all_predicate_col_ids; - if (_iceberg_rowid_params.enabled) { + if (_table_format_reader->has_synthesized_column_handlers()) { int row_id_idx = block->get_position_by_name(doris::BeConsts::ICEBERG_ROWID_COL); if (row_id_idx >= 0 && std::find(predicate_columns.begin(), predicate_columns.end(), @@ -877,8 +827,11 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re *read_rows = column_size; *batch_eof = pre_eof; - RETURN_IF_ERROR(_fill_partition_columns(block, column_size, _lazy_read_ctx.partition_columns)); - RETURN_IF_ERROR(_fill_missing_columns(block, column_size, _lazy_read_ctx.missing_columns)); + DCHECK(_table_format_reader); + RETURN_IF_ERROR(_table_format_reader->on_fill_partition_columns( + block, column_size, _lazy_read_ctx.partition_col_names)); + RETURN_IF_ERROR(_table_format_reader->on_fill_missing_columns( + block, column_size, _lazy_read_ctx.missing_col_names)); #ifndef NDEBUG for (auto col : *block) { 
col.column->sanity_check(); @@ -920,77 +873,6 @@ Status RowGroupReader::_rebuild_filter_map(FilterMap& filter_map, return Status::OK(); } -Status RowGroupReader::_fill_partition_columns( - Block* block, size_t rows, - const std::unordered_map>& - partition_columns) { - DataTypeSerDe::FormatOptions _text_formatOptions; - for (const auto& kv : partition_columns) { - auto doris_column = block->get_by_position((*_col_name_to_block_idx)[kv.first]).column; - // obtained from block*, it is a mutable object. - auto* col_ptr = const_cast(doris_column.get()); - const auto& [value, slot_desc] = kv.second; - auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); - Slice slice(value.data(), value.size()); - uint64_t num_deserialized = 0; - // Be careful when reading empty rows from parquet row groups. - if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, - &num_deserialized, - _text_formatOptions) != Status::OK()) { - return Status::InternalError("Failed to fill partition column: {}={}", - slot_desc->col_name(), value); - } - if (num_deserialized != rows) { - return Status::InternalError( - "Failed to fill partition column: {}={} ." 
- "Number of rows expected to be written : {}, number of rows actually written : " - "{}", - slot_desc->col_name(), value, num_deserialized, rows); - } - } - return Status::OK(); -} - -Status RowGroupReader::_fill_missing_columns( - Block* block, size_t rows, - const std::unordered_map& missing_columns) { - for (const auto& kv : missing_columns) { - if (!_col_name_to_block_idx->contains(kv.first)) { - return Status::InternalError("Missing column: {} not found in block {}", kv.first, - block->dump_structure()); - } - if (kv.second == nullptr) { - // no default column, fill with null - auto mutable_column = block->get_by_position((*_col_name_to_block_idx)[kv.first]) - .column->assume_mutable(); - auto* nullable_column = assert_cast(mutable_column.get()); - nullable_column->insert_many_defaults(rows); - } else { - // fill with default value - const auto& ctx = kv.second; - ColumnPtr result_column_ptr; - // PT1 => dest primitive type - RETURN_IF_ERROR(ctx->execute(block, result_column_ptr)); - if (result_column_ptr->use_count() == 1) { - // call resize because the first column of _src_block_ptr may not be filled by reader, - // so _src_block_ptr->rows() may return wrong result, cause the column created by `ctx->execute()` - // has only one row. - auto mutable_column = result_column_ptr->assume_mutable(); - mutable_column->resize(rows); - // result_column_ptr maybe a ColumnConst, convert it to a normal column - result_column_ptr = result_column_ptr->convert_to_full_column_if_const(); - auto origin_column_type = - block->get_by_position((*_col_name_to_block_idx)[kv.first]).type; - bool is_nullable = origin_column_type->is_nullable(); - block->replace_by_position( - (*_col_name_to_block_idx)[kv.first], - is_nullable ? 
make_nullable(result_column_ptr) : result_column_ptr); - } - } - } - return Status::OK(); -} - Status RowGroupReader::_read_empty_batch(size_t batch_size, size_t* read_rows, bool* batch_eof, bool* modify_row_ids) { *modify_row_ids = false; @@ -1017,8 +899,8 @@ Status RowGroupReader::_read_empty_batch(size_t batch_size, size_t* read_rows, b _position_delete_ctx.current_row_id = end_row_id; *batch_eof = _position_delete_ctx.current_row_id == _position_delete_ctx.last_row_id; - if (_row_id_column_iterator_pair.first != nullptr || _iceberg_rowid_params.enabled || - (_row_lineage_columns != nullptr && _row_lineage_columns->need_row_ids())) { + if (_row_id_column_iterator_pair.first != nullptr || + _table_format_reader->has_synthesized_column_handlers()) { *modify_row_ids = true; _current_batch_row_ids.clear(); _current_batch_row_ids.resize(*read_rows); @@ -1042,7 +924,7 @@ Status RowGroupReader::_read_empty_batch(size_t batch_size, size_t* read_rows, b _remaining_rows = 0; *batch_eof = true; } - if (_iceberg_rowid_params.enabled) { + if (_table_format_reader->has_synthesized_column_handlers()) { *modify_row_ids = true; RETURN_IF_ERROR(_get_current_batch_row_id(*read_rows)); } @@ -1079,15 +961,9 @@ Status RowGroupReader::_get_current_batch_row_id(size_t read_rows) { return Status::OK(); } -Status RowGroupReader::_fill_row_id_columns(Block* block, size_t read_rows, - bool is_current_row_ids) { - const bool need_row_ids = - _row_id_column_iterator_pair.first != nullptr || - (_row_lineage_columns != nullptr && _row_lineage_columns->need_row_ids()); - if (need_row_ids && !is_current_row_ids) { - RETURN_IF_ERROR(_get_current_batch_row_id(read_rows)); - } +Status RowGroupReader::fill_topn_row_id(Block* block, size_t read_rows) { if (_row_id_column_iterator_pair.first != nullptr) { + // _get_current_batch_row_id must be called before fill_synthesized_columns auto col = block->get_by_position(_row_id_column_iterator_pair.second) .column->assume_mutable(); 
RETURN_IF_ERROR(_row_id_column_iterator_pair.first->read_by_rowids( @@ -1132,56 +1008,6 @@ Status RowGroupReader::_fill_row_id_columns(Block* block, size_t read_rows, return Status::OK(); } -Status RowGroupReader::_append_iceberg_rowid_column(Block* block, size_t read_rows, - bool is_current_row_ids) { - if (!_iceberg_rowid_params.enabled) { - return Status::OK(); - } - if (!is_current_row_ids) { - RETURN_IF_ERROR(_get_current_batch_row_id(read_rows)); - } - - int row_id_idx = block->get_position_by_name(doris::BeConsts::ICEBERG_ROWID_COL); - if (row_id_idx >= 0) { - auto& col_with_type = block->get_by_position(static_cast(row_id_idx)); - MutableColumnPtr row_id_column; - RETURN_IF_ERROR(build_iceberg_rowid_column( - col_with_type.type, _iceberg_rowid_params.file_path, _current_batch_row_ids, - _iceberg_rowid_params.partition_spec_id, _iceberg_rowid_params.partition_data_json, - &row_id_column)); - col_with_type.column = std::move(row_id_column); - } else { - DataTypes field_types; - field_types.push_back(std::make_shared()); - field_types.push_back(std::make_shared()); - field_types.push_back(std::make_shared()); - field_types.push_back(std::make_shared()); - - std::vector field_names = {"file_path", "row_position", "partition_spec_id", - "partition_data"}; - - auto row_id_type = std::make_shared(field_types, field_names); - MutableColumnPtr row_id_column; - RETURN_IF_ERROR(build_iceberg_rowid_column( - row_id_type, _iceberg_rowid_params.file_path, _current_batch_row_ids, - _iceberg_rowid_params.partition_spec_id, _iceberg_rowid_params.partition_data_json, - &row_id_column)); - int insert_pos = _iceberg_rowid_params.row_id_column_pos; - if (insert_pos < 0 || insert_pos > static_cast(block->columns())) { - insert_pos = static_cast(block->columns()); - } - block->insert(static_cast(insert_pos), - ColumnWithTypeAndName(std::move(row_id_column), row_id_type, - doris::BeConsts::ICEBERG_ROWID_COL)); - } - - if (_col_name_to_block_idx != nullptr) { - 
*_col_name_to_block_idx = block->get_name_to_pos_map(); - } - - return Status::OK(); -} - Status RowGroupReader::_build_pos_delete_filter(size_t read_rows) { if (!_position_delete_ctx.has_filter) { _pos_delete_filter_ptr.reset(nullptr); diff --git a/be/src/format/parquet/vparquet_group_reader.h b/be/src/format/parquet/vparquet_group_reader.h index fa1e1127f959a6..dfb593f1ae7054 100644 --- a/be/src/format/parquet/vparquet_group_reader.h +++ b/be/src/format/parquet/vparquet_group_reader.h @@ -27,14 +27,17 @@ #include #include "common/status.h" +#include "core/block/block.h" #include "core/column/column.h" #include "exprs/vexpr_fwd.h" #include "format/parquet/parquet_common.h" #include "format/parquet/vparquet_column_reader.h" #include "format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" #include "io/fs/file_reader_writer_fwd.h" #include "storage/id_manager.h" -#include "storage/utils.h" +#include "storage/segment/common.h" +#include "vparquet_column_reader.h" namespace cctz { class time_zone; @@ -67,15 +70,8 @@ namespace doris { #include "common/compile_check_begin.h" // TODO: we need to determine it by test. -class RowGroupReader : public ProfileCollector { +class RowGroupReader : public ProfileCollector, public RowPositionProvider { public: - struct IcebergRowIdParams { - bool enabled = false; - std::string file_path; - int32_t partition_spec_id = 0; - std::string partition_data_json; - int row_id_column_pos = -1; - }; std::shared_ptr _table_info_node_ptr; static const std::vector NO_DELETE; @@ -92,11 +88,6 @@ class RowGroupReader : public ProfileCollector { // all conjuncts: in sql, join runtime filter, topn runtime filter. 
VExprContextSPtrs conjuncts; - // ParquetReader::set_fill_columns(xxx, xxx) will set these two members - std::unordered_map> - fill_partition_columns; - std::unordered_map fill_missing_columns; - phmap::flat_hash_map>> slot_id_to_predicates; bool can_lazy_read = false; @@ -122,6 +113,14 @@ class RowGroupReader : public ProfileCollector { std::unordered_map missing_columns; // should turn off filtering by page index, lazy read and dict filter if having complex type bool has_complex_type = false; + + // ColumnProcessor path: column name lists for each category. + // Predicate phase: columns involved in predicate filtering. + std::vector predicate_partition_col_names; + std::vector predicate_missing_col_names; + // Remaining phase: columns filled after lazy reads. + std::vector partition_col_names; + std::vector missing_col_names; }; /** @@ -188,6 +187,8 @@ class RowGroupReader : public ProfileCollector { ParquetColumnReader::ColumnStatistics merged_column_statistics(); void set_remaining_rows(int64_t rows) { _remaining_rows = rows; } + Status fill_topn_row_id(Block* block, size_t read_rows); + int64_t get_remaining_rows() { return _remaining_rows; } // Filters read_ranges by removing row chunks whose condition cache granules are all-false. 
@@ -202,8 +203,11 @@ class RowGroupReader : public ProfileCollector { _row_id_column_iterator_pair = iterator_pair; } - void set_iceberg_rowid_params(const IcebergRowIdParams& params) { - _iceberg_rowid_params = params; + void set_table_format_reader(TableFormatReader* reader) { _table_format_reader = reader; } + + // RowPositionProvider interface + const std::vector& current_batch_row_positions() const override { + return _current_batch_row_ids; } void set_row_lineage_columns(std::shared_ptr row_lineage_columns) { @@ -242,13 +246,7 @@ class RowGroupReader : public ProfileCollector { Status _rebuild_filter_map(FilterMap& filter_map, DorisUniqueBufferPtr& filter_map_data, size_t pre_read_rows) const; - Status _fill_partition_columns( - Block* block, size_t rows, - const std::unordered_map>& - partition_columns); - Status _fill_missing_columns( - Block* block, size_t rows, - const std::unordered_map& missing_columns); + Status _build_pos_delete_filter(size_t read_rows); Status _filter_block(Block* block, int column_to_keep, const std::vector& columns_to_filter); @@ -265,8 +263,6 @@ class RowGroupReader : public ProfileCollector { int64_t batch_seq_start); Status _get_current_batch_row_id(size_t read_rows); - Status _fill_row_id_columns(Block* block, size_t read_rows, bool is_current_row_ids); - Status _append_iceberg_rowid_column(Block* block, size_t read_rows, bool is_current_row_ids); io::FileReaderSPtr _file_reader; std::unordered_map> @@ -313,7 +309,7 @@ class RowGroupReader : public ProfileCollector { std::vector _current_batch_row_ids; std::unordered_map* _col_name_to_block_idx = nullptr; - IcebergRowIdParams _iceberg_rowid_params; + TableFormatReader* _table_format_reader = nullptr; }; #include "common/compile_check_end.h" diff --git a/be/src/format/parquet/vparquet_reader.cpp b/be/src/format/parquet/vparquet_reader.cpp index 6de215608f72c8..b9838d7d6a051b 100644 --- a/be/src/format/parquet/vparquet_reader.cpp +++ b/be/src/format/parquet/vparquet_reader.cpp @@ 
-393,30 +393,53 @@ void ParquetReader::_init_file_description() { } } -Status ParquetReader::init_reader( - const std::vector& all_column_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, - phmap::flat_hash_map>>& - slot_id_to_predicates, - const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts, - std::shared_ptr table_info_node_ptr, bool filter_groups, - const std::set& column_ids, const std::set& filter_column_ids) { - _col_name_to_block_idx = col_name_to_block_idx; - _tuple_descriptor = tuple_descriptor; - _row_descriptor = row_descriptor; - _colname_to_slot_id = colname_to_slot_id; - _not_single_slot_filter_conjuncts = not_single_slot_filter_conjuncts; - _slot_id_to_filter_conjuncts = slot_id_to_filter_conjuncts; - _table_info_node_ptr = table_info_node_ptr; - _filter_groups = filter_groups; - _column_ids = column_ids; - _filter_column_ids = filter_column_ids; +Status ParquetReader::on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; + RETURN_IF_ERROR( + _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values)); + for (auto& desc : *ctx->column_descs) { + if (desc.category == ColumnCategory::REGULAR || + desc.category == ColumnCategory::GENERATED) { + ctx->column_names.push_back(desc.name); + } + } - RETURN_IF_ERROR(_open_file()); + // Build table_info_node from Parquet file metadata with case-insensitive recursive matching. + // File is already opened by init_reader before this hook, so metadata is available. + // tuple_descriptor may be null in unit tests that only set column_descs. 
+ if (ctx->tuple_descriptor != nullptr) { + const FieldDescriptor* field_desc = nullptr; + RETURN_IF_ERROR(get_file_metadata_schema(&field_desc)); + RETURN_IF_ERROR(TableSchemaChangeHelper::BuildTableInfoUtil::by_parquet_name( + ctx->tuple_descriptor, *field_desc, ctx->table_info_node)); + } + + return Status::OK(); +} + +Status ParquetReader::_open_file_reader(ReaderInitContext* /*ctx*/) { + return _open_file(); +} + +Status ParquetReader::_do_init_reader(ReaderInitContext* base_ctx) { + auto* ctx = checked_context_cast(base_ctx); + _col_name_to_block_idx = base_ctx->col_name_to_block_idx; + _tuple_descriptor = ctx->tuple_descriptor; + _row_descriptor = ctx->row_descriptor; + _colname_to_slot_id = ctx->colname_to_slot_id; + _not_single_slot_filter_conjuncts = ctx->not_single_slot_filter_conjuncts; + _slot_id_to_filter_conjuncts = ctx->slot_id_to_filter_conjuncts; + _filter_groups = ctx->filter_groups; + _table_info_node_ptr = base_ctx->table_info_node; + _column_ids = base_ctx->column_ids; + _filter_column_ids = base_ctx->filter_column_ids; + + // _open_file() is called by init_reader template method before hooks. + // For standalone _do_init_reader callers (tvf, load, etc.), open the file here if not already opened. 
+ if (_file_metadata == nullptr) { + RETURN_IF_ERROR(_open_file()); + } _t_metadata = &(_file_metadata->to_thrift()); if (_file_metadata == nullptr) { return Status::InternalError("failed to init parquet reader, please open reader first"); @@ -429,30 +452,127 @@ Status ParquetReader::init_reader( } _current_row_group_index = RowGroupReader::RowGroupIndex {-1, 0, 0}; - _table_column_names = &all_column_names; - auto schema_desc = _file_metadata->schema(); - - std::map required_file_columns; //file column -> table column - for (auto table_column_name : all_column_names) { - if (_table_info_node_ptr->children_column_exists(table_column_name)) { - required_file_columns.emplace( - _table_info_node_ptr->children_file_column_name(table_column_name), - table_column_name); - } else { - _missing_cols.emplace_back(table_column_name); + // Compute missing columns and file↔table column mapping. + // This runs in _do_init_reader (not on_before_init_reader) because table-format readers + // (Iceberg, Paimon, Hive, Hudi) override on_before_init_reader completely. + if (has_column_descs()) { + _fill_missing_cols.clear(); + _fill_missing_defaults.clear(); + for (const auto& col_name : base_ctx->column_names) { + if (!_table_info_node_ptr->children_column_exists(col_name)) { + _fill_missing_cols.insert(col_name); + } } + if (_column_descs && !_fill_missing_cols.empty()) { + for (const auto& desc : *_column_descs) { + if (_fill_missing_cols.contains(desc.name) && + !_fill_partition_values.contains(desc.name)) { + _fill_missing_defaults[desc.name] = desc.default_expr; + } + } + } + // Resolve file-column ↔ table-column mapping in file-schema order. + // Iterating schema_desc preserves the physical column order for efficient reads. 
+ auto schema_desc = _file_metadata->schema(); + std::map required_file_columns; + for (const auto& table_column_name : base_ctx->column_names) { + if (_fill_missing_cols.contains(table_column_name)) { + continue; + } + auto file_col = _table_info_node_ptr->children_file_column_name(table_column_name); + required_file_columns.emplace(file_col, table_column_name); + } + for (int i = 0; i < schema_desc.size(); ++i) { + const auto& name = schema_desc.get_column(i)->name; + if (required_file_columns.contains(name)) { + _read_file_columns.emplace_back(name); + _read_table_columns.emplace_back(required_file_columns[name]); + _read_table_columns_set.insert(required_file_columns[name]); + } + } + // Register row-position-based synthesized column handler. } - for (int i = 0; i < schema_desc.size(); ++i) { - const auto& name = schema_desc.get_column(i)->name; - if (required_file_columns.contains(name)) { - _read_file_columns.emplace_back(name); - _read_table_columns.emplace_back(required_file_columns[name]); - _read_table_columns_set.insert(required_file_columns[name]); + + // Register row-position-based synthesized column handler. + // _row_id_column_iterator_pair and _row_lineage_columns are set before init_reader + // by FileScanner. This must be outside has_column_descs() guard because standalone + // readers also need synthesized column handlers. + if (_row_id_column_iterator_pair.first != nullptr || + (_row_lineage_columns != nullptr && + (_row_lineage_columns->need_row_ids() || + _row_lineage_columns->has_last_updated_sequence_number_column()))) { + register_synthesized_column_handler( + BeConsts::ROWID_COL, [this](Block* block, size_t rows) -> Status { + if (_current_group_reader) { + return _current_group_reader->fill_topn_row_id(block, rows); + } + return Status::OK(); + }); + } + + // Standalone callers (column_descs == nullptr) skip on_before_init_reader, + // so _read_file_columns etc. are not populated. 
Use table_info_node for name mapping + // when available, otherwise fall back to 1:1 mapping using file schema. + // Must iterate in file schema order (not user column order) so that + // _generate_random_access_ranges sees monotonically increasing chunk offsets. + if (!has_column_descs() && _read_file_columns.empty()) { + auto schema_desc = _file_metadata->schema(); + // Build map: file_col_name -> table_col_name for requested columns. + std::unordered_map required_file_columns; + for (const auto& col_name : base_ctx->column_names) { + std::string file_col_name = col_name; + if (_table_info_node_ptr && _table_info_node_ptr->children_column_exists(col_name)) { + file_col_name = _table_info_node_ptr->children_file_column_name(col_name); + } + required_file_columns[file_col_name] = col_name; + } + // Iterate file schema to preserve physical column order. + for (int i = 0; i < schema_desc.size(); ++i) { + const auto& name = schema_desc.get_column(i)->name; + if (required_file_columns.contains(name)) { + _read_file_columns.emplace_back(name); + _read_table_columns.emplace_back(required_file_columns[name]); + _read_table_columns_set.insert(required_file_columns[name]); + } } } + // build column predicates for column lazy read - _lazy_read_ctx.conjuncts = conjuncts; - _lazy_read_ctx.slot_id_to_predicates = slot_id_to_predicates; + if (ctx->conjuncts != nullptr) { + _lazy_read_ctx.conjuncts = *ctx->conjuncts; + } + if (ctx->slot_id_to_predicates != nullptr) { + _lazy_read_ctx.slot_id_to_predicates = *ctx->slot_id_to_predicates; + } + + // ---- Inlined set_fill_columns logic (partition/missing/synthesized classification) ---- + + // 1. Collect predicate columns from conjuncts for lazy materialization + std::unordered_map> predicate_columns; + _collect_predicate_columns_from_conjuncts(predicate_columns); + + // 2. 
Classify read/partition/missing/synthesized columns into lazy vs predicate groups + _classify_columns_for_lazy_read(predicate_columns, _fill_partition_values, + _fill_missing_defaults); + + // 3. Populate col_names vectors for ColumnProcessor path + for (auto& kv : _lazy_read_ctx.predicate_partition_columns) { + _lazy_read_ctx.predicate_partition_col_names.emplace_back(kv.first); + } + for (auto& kv : _lazy_read_ctx.predicate_missing_columns) { + _lazy_read_ctx.predicate_missing_col_names.emplace_back(kv.first); + } + for (auto& kv : _lazy_read_ctx.partition_columns) { + _lazy_read_ctx.partition_col_names.emplace_back(kv.first); + } + for (auto& kv : _lazy_read_ctx.missing_columns) { + _lazy_read_ctx.missing_col_names.emplace_back(kv.first); + } + + if (_filter_groups && (_total_groups == 0 || _t_metadata->num_rows == 0 || _range_size < 0)) { + return Status::EndOfFile("No row group to read"); + } + return Status::OK(); } @@ -478,18 +598,8 @@ bool ParquetReader::_type_matches(const int cid) const { !is_complex_type(table_col_type->get_primitive_type()); } -Status ParquetReader::set_fill_columns( - const std::unordered_map>& - partition_columns, - const std::unordered_map& missing_columns) { - _lazy_read_ctx.fill_partition_columns = partition_columns; - _lazy_read_ctx.fill_missing_columns = missing_columns; - - // std::unordered_map> - std::unordered_map> predicate_columns; - - // TODO(gabriel): we should try to clear too much structs which are used to represent conjuncts and predicates. - // visit_slot for lazy mat. 
+void ParquetReader::_collect_predicate_columns_from_conjuncts( + std::unordered_map>& predicate_columns) { std::function visit_slot = [&](VExpr* expr) { if (expr->is_slot_ref()) { VSlotRef* slot_ref = static_cast(expr); @@ -505,19 +615,18 @@ Status ParquetReader::set_fill_columns( visit_slot(child.get()); } }; + for (const auto& conjunct : _lazy_read_ctx.conjuncts) { auto expr = conjunct->root(); - if (expr->is_rf_wrapper()) { - // REF: src/runtime_filter/runtime_filter_consumer.cpp VRuntimeFilterWrapper* runtime_filter = assert_cast(expr.get()); - auto filter_impl = runtime_filter->get_impl(); visit_slot(filter_impl.get()); } else { visit_slot(expr.get()); } } + if (!_lazy_read_ctx.slot_id_to_predicates.empty()) { auto and_pred = AndBlockColumnPredicate::create_unique(); for (const auto& entry : _lazy_read_ctx.slot_id_to_predicates) { @@ -533,7 +642,13 @@ Status ParquetReader::set_fill_columns( _push_down_predicates.push_back(std::move(and_pred)); } } +} +void ParquetReader::_classify_columns_for_lazy_read( + const std::unordered_map>& predicate_columns, + const std::unordered_map>& + partition_columns, + const std::unordered_map& missing_columns) { const FieldDescriptor& schema = _file_metadata->schema(); auto check_iceberg_row_lineage_column_idx = [&](const auto& col_name) -> int { @@ -585,7 +700,7 @@ Status ParquetReader::set_fill_columns( _lazy_read_ctx.all_predicate_col_ids.emplace_back(_row_id_column_iterator_pair.second); } - for (auto& kv : _lazy_read_ctx.fill_partition_columns) { + for (auto& kv : partition_columns) { auto iter = predicate_columns.find(kv.first); if (iter == predicate_columns.end()) { _lazy_read_ctx.partition_columns.emplace(kv.first, kv.second); @@ -595,7 +710,7 @@ Status ParquetReader::set_fill_columns( } } - for (auto& kv : _lazy_read_ctx.fill_missing_columns) { + for (auto& kv : missing_columns) { auto iter = predicate_columns.find(kv.first); if (iter != predicate_columns.end()) { //For check missing column : missing column == xx, 
missing column is null,missing column is not null. @@ -605,7 +720,6 @@ Status ParquetReader::set_fill_columns( _lazy_read_ctx.missing_columns_conjuncts.emplace_back(ctx); } } - _lazy_read_ctx.predicate_missing_columns.emplace(kv.first, kv.second); _lazy_read_ctx.all_predicate_col_ids.emplace_back(iter->second.first); } else if (auto row_lineage_idx = check_iceberg_row_lineage_column_idx(kv.first); @@ -630,12 +744,6 @@ Status ParquetReader::set_fill_columns( _lazy_read_ctx.missing_columns.emplace(kv.first, kv.second); } } - - if (_filter_groups && (_total_groups == 0 || _t_metadata->num_rows == 0 || _range_size < 0)) { - return Status::EndOfFile("No row group to read"); - } - _fill_all_columns = true; - return Status::OK(); } // init file reader and file metadata for parsing schema @@ -657,22 +765,8 @@ Status ParquetReader::get_parsed_schema(std::vector* col_names, return Status::OK(); } -void ParquetReader::set_iceberg_rowid_params(const std::string& file_path, - int32_t partition_spec_id, - const std::string& partition_data_json, - int row_id_column_pos) { - _iceberg_rowid_params.enabled = true; - _iceberg_rowid_params.file_path = file_path; - _iceberg_rowid_params.partition_spec_id = partition_spec_id; - _iceberg_rowid_params.partition_data_json = partition_data_json; - _iceberg_rowid_params.row_id_column_pos = row_id_column_pos; - if (_current_group_reader != nullptr) { - _current_group_reader->set_iceberg_rowid_params(_iceberg_rowid_params); - } -} - -Status ParquetReader::get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) { +Status ParquetReader::_get_columns_impl( + std::unordered_map* name_to_type) { const auto& schema_desc = _file_metadata->schema(); std::unordered_set column_names; schema_desc.get_column_names(&column_names); @@ -680,13 +774,10 @@ Status ParquetReader::get_columns(std::unordered_map* auto field = schema_desc.get_column(name); name_to_type->emplace(name, field->data_type); } - for (auto& col : _missing_cols) { 
- missing_cols->insert(col); - } return Status::OK(); } -Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +Status ParquetReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { if (_current_group_reader == nullptr || _row_group_eof) { Status st = _next_row_group_reader(); if (!st.ok() && !st.is()) { @@ -700,24 +791,6 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) return Status::OK(); } } - if (_push_down_agg_type == TPushAggOp::type::COUNT) { - auto rows = std::min(_current_group_reader->get_remaining_rows(), (int64_t)_batch_size); - - _current_group_reader->set_remaining_rows(_current_group_reader->get_remaining_rows() - - rows); - auto mutate_columns = block->mutate_columns(); - for (auto& col : mutate_columns) { - col->resize(rows); - } - block->set_columns(std::move(mutate_columns)); - - *read_rows = rows; - if (_current_group_reader->get_remaining_rows() == 0) { - _current_group_reader.reset(nullptr); - } - - return Status::OK(); - } SCOPED_RAW_TIMER(&_reader_statistics.column_read_time); Status batch_st = @@ -759,7 +832,13 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) RowGroupReader::PositionDeleteContext ParquetReader::_get_position_delete_ctx( const tparquet::RowGroup& row_group, const RowGroupReader::RowGroupIndex& row_group_index) { + LOG(INFO) << "[PosDeleteDebug] _get_position_delete_ctx: _delete_rows=" + << (_delete_rows ? 
"set(" + std::to_string(_delete_rows->size()) + ")" : "null") + << " row_group.num_rows=" << row_group.num_rows + << " first_row=" << row_group_index.first_row; if (_delete_rows == nullptr) { + LOG(INFO) << "[PosDeleteDebug] _get_position_delete_ctx: NO delete rows, returning " + "no-filter ctx"; return RowGroupReader::PositionDeleteContext(row_group.num_rows, row_group_index.first_row); } const int64_t* delete_rows = &(*_delete_rows)[0]; @@ -871,9 +950,6 @@ Status ParquetReader::_next_row_group_reader() { : group_file_reader, _read_table_columns, _current_row_group_index.row_group_id, row_group, _ctz, _io_ctx, position_delete_ctx, _lazy_read_ctx, _state, _column_ids, _filter_column_ids)); - if (_iceberg_rowid_params.enabled) { - _current_group_reader->set_iceberg_rowid_params(_iceberg_rowid_params); - } _row_group_eof = false; _current_group_reader->set_current_row_group_idx(_current_row_group_index); @@ -883,6 +959,7 @@ Status ParquetReader::_next_row_group_reader() { if (_condition_cache_ctx) { _current_group_reader->set_condition_cache_context(_condition_cache_ctx); } + _current_group_reader->set_table_format_reader(this); _current_group_reader->_table_info_node_ptr = _table_info_node_ptr; return _current_group_reader->init(_file_metadata->schema(), candidate_row_ranges, _col_offsets, diff --git a/be/src/format/parquet/vparquet_reader.h b/be/src/format/parquet/vparquet_reader.h index 5172c8efdb9df3..b347608a5bc397 100644 --- a/be/src/format/parquet/vparquet_reader.h +++ b/be/src/format/parquet/vparquet_reader.h @@ -30,12 +30,12 @@ #include #include "common/status.h" -#include "format/generic_reader.h" #include "format/parquet/parquet_common.h" #include "format/parquet/parquet_predicate.h" #include "format/parquet/vparquet_column_reader.h" #include "format/parquet/vparquet_group_reader.h" #include "format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" #include "io/file_factory.h" #include "io/fs/file_meta_cache.h" #include 
"io/fs/file_reader.h" @@ -69,7 +69,26 @@ struct RowLineageColumns; namespace doris { #include "common/compile_check_begin.h" -class ParquetReader : public GenericReader { + +/// Parquet-specific initialization context. +/// Extends ReaderInitContext with predicate pushdown fields. +struct ParquetInitContext final : public ReaderInitContext { + // Safe defaults for standalone readers (delete file readers, push handler) + // that don't have conjuncts/predicates. Dereferenced by _do_init_reader. + static inline const VExprContextSPtrs EMPTY_CONJUNCTS {}; + static inline phmap::flat_hash_map>> + EMPTY_SLOT_PREDICATES {}; + + const VExprContextSPtrs* conjuncts = &EMPTY_CONJUNCTS; + phmap::flat_hash_map>>* + slot_id_to_predicates = &EMPTY_SLOT_PREDICATES; + const std::unordered_map* colname_to_slot_id = nullptr; + const VExprContextSPtrs* not_single_slot_filter_conjuncts = nullptr; + const std::unordered_map* slot_id_to_filter_conjuncts = nullptr; + bool filter_groups = true; +}; + +class ParquetReader : public TableFormatReader { ENABLE_FACTORY_CREATOR(ParquetReader); public: @@ -123,23 +142,17 @@ class ParquetReader : public GenericReader { void set_file_reader(io::FileReaderSPtr file_reader); #endif - Status init_reader( - const std::vector& all_column_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, - phmap::flat_hash_map>>& - slot_id_to_predicates, - const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts, - std::shared_ptr table_info_node_ptr = - TableSchemaChangeHelper::ConstNode::get_instance(), - bool filter_groups = true, const std::set& column_ids = {}, - const std::set& filter_column_ids = {}); - - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + // Override to build table_info_node from Parquet file 
metadata using by_parquet_name. + // Subclasses (HiveParquetReader, etc.) call GenericReader::on_before_init_reader directly, + // so this override only applies to plain ParquetReader (TVF, load). + Status on_before_init_reader(ReaderInitContext* ctx) override; + +protected: + // ---- Unified init_reader(ReaderInitContext*) overrides ---- + Status _open_file_reader(ReaderInitContext* ctx) override; + Status _do_init_reader(ReaderInitContext* ctx) override; +public: Status close() override; // set the delete rows in current parquet file @@ -147,8 +160,7 @@ class ParquetReader : public GenericReader { int64_t size() const { return _file_reader->size(); } - Status get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) override; + Status _get_columns_impl(std::unordered_map* name_to_type) override; Status init_schema_reader() override; @@ -159,12 +171,6 @@ class ParquetReader : public GenericReader { const tparquet::FileMetaData* get_meta_data() const { return _t_metadata; } - // Partition columns will not be materialized in parquet files. So we should fill it with missing columns. - Status set_fill_columns( - const std::unordered_map>& - partition_columns, - const std::unordered_map& missing_columns) override; - Status get_file_metadata_schema(const FieldDescriptor** ptr); void set_row_id_column_iterator( @@ -172,8 +178,11 @@ class ParquetReader : public GenericReader { _row_id_column_iterator_pair = iterator_pair; } - void set_iceberg_rowid_params(const std::string& file_path, int32_t partition_spec_id, - const std::string& partition_data_json, int row_id_column_pos); + /// Access current batch row positions (delegates to RowGroupReader). + /// Used by IcebergReaderMixin to build $row_id column. 
+ const std::vector& current_batch_row_positions() const { + return _current_group_reader->current_batch_row_positions(); + } void set_row_lineage_columns(std::shared_ptr row_lineage_columns) { _row_lineage_columns = std::move(row_lineage_columns); @@ -183,15 +192,43 @@ class ParquetReader : public GenericReader { void set_condition_cache_context(std::shared_ptr ctx) override; + bool supports_count_pushdown() const override { return true; } + int64_t get_total_rows() const override; bool has_delete_operations() const override { return _delete_rows != nullptr && !_delete_rows->empty(); } + /// Disable row-group range filtering (needed when reading delete files + /// whose TFileRangeDesc has size=-1). + void set_filter_groups(bool v) { _filter_groups = v; } + protected: void _collect_profile_before_close() override; + // Core block reading implementation + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; + + // Parquet fills partition/missing columns per-batch internally via RowGroupReader, + // so suppress TableFormatReader's default on_after_read_block fill. 
+ Status on_after_read_block(Block* /*block*/, size_t* /*read_rows*/) override { + return Status::OK(); + } + + // Protected accessors so CRTP mixin subclasses can reach private members + io::IOContext* get_io_ctx() const { return _io_ctx; } + std::unordered_map*& col_name_to_block_idx_ref() { + return _col_name_to_block_idx; + } + RuntimeProfile* get_profile() const { return _profile; } + RuntimeState* get_state() const { return _state; } + const TFileScanRangeParams& get_scan_params() const { return _scan_params; } + const TFileRangeDesc& get_scan_range() const { return _scan_range; } + const TupleDescriptor* get_tuple_descriptor() const { return _tuple_descriptor; } + const RowDescriptor* get_row_descriptor() const { return _row_descriptor; } + const FileMetaData* get_file_metadata() const { return _file_metadata; } + private: struct ParquetProfile { RuntimeProfile::Counter* filtered_row_groups = nullptr; @@ -239,6 +276,15 @@ class ParquetReader : public GenericReader { RuntimeProfile::Counter* bloom_filter_read_time = nullptr; }; + // ---- set_fill_columns sub-functions ---- + void _collect_predicate_columns_from_conjuncts( + std::unordered_map>& predicate_columns); + void _classify_columns_for_lazy_read( + const std::unordered_map>& predicate_columns, + const std::unordered_map>& + partition_columns, + const std::unordered_map& missing_columns); + Status _open_file(); void _init_profile(); void _close_internal(); @@ -290,9 +336,6 @@ class ParquetReader : public GenericReader { bool _exists_in_file(const std::string& expr_name) const; bool _type_matches(const int cid) const; - RuntimeProfile* _profile = nullptr; - const TFileScanRangeParams& _scan_params; - const TFileRangeDesc& _scan_range; io::FileSystemProperties _system_properties; io::FileDescription _file_description; @@ -305,7 +348,6 @@ class ParquetReader : public GenericReader { // after _file_reader. Otherwise, there may be heap-use-after-free bug. 
ObjLRUCache::CacheHandle _meta_cache_handle; std::unique_ptr _file_metadata_ptr; - const FileMetaData* _file_metadata = nullptr; const tparquet::FileMetaData* _t_metadata = nullptr; // _tracing_file_reader wraps _file_reader. @@ -336,10 +378,10 @@ class ParquetReader : public GenericReader { const std::vector* _delete_rows = nullptr; int64_t _delete_rows_index = 0; - // Used for column lazy read. - RowGroupReader::LazyReadContext _lazy_read_ctx; - // parquet file reader object + RuntimeProfile* _profile = nullptr; + const TFileScanRangeParams& _scan_params; + const TFileRangeDesc& _scan_range; size_t _batch_size; int64_t _range_start_offset; int64_t _range_size; @@ -347,10 +389,6 @@ class ParquetReader : public GenericReader { std::unordered_map _col_offsets; - std::vector _missing_cols; - // _table_column_names = _missing_cols + _read_table_columns - const std::vector* _table_column_names = nullptr; - ReaderStatistics _reader_statistics; ParquetColumnReader::ColumnStatistics _column_statistics; ParquetProfile _parquet_profile; @@ -358,11 +396,14 @@ class ParquetReader : public GenericReader { io::IOContext* _io_ctx = nullptr; std::shared_ptr _io_ctx_holder; RuntimeState* _state = nullptr; + const TupleDescriptor* _tuple_descriptor = nullptr; + const RowDescriptor* _row_descriptor = nullptr; + const FileMetaData* _file_metadata = nullptr; + // Pointer to external column name to block index mapping (from FileScanner) + std::unordered_map* _col_name_to_block_idx = nullptr; bool _enable_lazy_mat = true; bool _enable_filter_by_min_max = true; bool _enable_filter_by_bloom_filter = true; - const TupleDescriptor* _tuple_descriptor = nullptr; - const RowDescriptor* _row_descriptor = nullptr; const std::unordered_map* _colname_to_slot_id = nullptr; const VExprContextSPtrs* _not_single_slot_filter_conjuncts = nullptr; const std::unordered_map* _slot_id_to_filter_conjuncts = nullptr; @@ -373,14 +414,16 @@ class ParquetReader : public GenericReader { std::shared_ptr 
_row_lineage_columns; protected: + // Used for column lazy read. Protected so Iceberg/Paimon subclasses can + // register synthesized columns in on_before_init_reader. + RowGroupReader::LazyReadContext _lazy_read_ctx; bool _filter_groups = true; - RowGroupReader::IcebergRowIdParams _iceberg_rowid_params; + size_t get_batch_size() const { return _batch_size; } +private: std::set _column_ids; std::set _filter_column_ids; - std::unordered_map* _col_name_to_block_idx = nullptr; - std::vector> _push_down_predicates; Arena _arena; }; diff --git a/be/src/format/table/equality_delete.cpp b/be/src/format/table/equality_delete.cpp index d1e3954836a81b..ee799a14a5b4a7 100644 --- a/be/src/format/table/equality_delete.cpp +++ b/be/src/format/table/equality_delete.cpp @@ -52,8 +52,14 @@ Status SimpleEqualityDelete::filter_data_block( DCHECK(_delete_col_ids.size() == 1); auto column_field_id = _delete_col_ids[0]; - auto column_and_type = data_block->get_by_position( - col_name_to_block_idx->at(id_to_block_column_name.at(column_field_id))); + const auto& block_col_name = id_to_block_column_name.at(column_field_id); + auto block_idx = col_name_to_block_idx->at(block_col_name); + LOG(INFO) << "[EqDeleteDebug] SimpleEqualityDelete::filter_data_block: field_id=" + << column_field_id << ", block_col_name=" << block_col_name + << ", block_idx=" << block_idx << ", delete_block_rows=" << _delete_block->rows() + << ", data_block_rows=" << data_block->rows(); + + auto column_and_type = data_block->get_by_position(block_idx); size_t rows = data_block->rows(); // _filter: 1 => in _hybrid_set; 0 => not in _hybrid_set diff --git a/be/src/format/table/hive/hive_orc_nested_column_utils.cpp b/be/src/format/table/hive/hive_orc_nested_column_utils.cpp index 8ff065490e8cf7..0e014c95e5dcf6 100644 --- a/be/src/format/table/hive/hive_orc_nested_column_utils.cpp +++ b/be/src/format/table/hive/hive_orc_nested_column_utils.cpp @@ -25,7 +25,8 @@ #include #include "common/logging.h" -#include 
"format/table/table_format_reader.h" +#include "format/generic_reader.h" +#include "format/table/table_schema_change_helper.h" #include "orc/Type.hh" namespace doris { diff --git a/be/src/format/table/hive/hive_orc_nested_column_utils.h b/be/src/format/table/hive/hive_orc_nested_column_utils.h index 6cc28e001cf1b5..a410f8d29ea198 100644 --- a/be/src/format/table/hive/hive_orc_nested_column_utils.h +++ b/be/src/format/table/hive/hive_orc_nested_column_utils.h @@ -23,7 +23,7 @@ #include #include -#include "format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" namespace orc { class Type; diff --git a/be/src/format/table/hive/hive_parquet_nested_column_utils.cpp b/be/src/format/table/hive/hive_parquet_nested_column_utils.cpp index 5096a328b1de04..d990ff0b86f685 100644 --- a/be/src/format/table/hive/hive_parquet_nested_column_utils.cpp +++ b/be/src/format/table/hive/hive_parquet_nested_column_utils.cpp @@ -25,7 +25,7 @@ #include #include "format/parquet/schema_desc.h" -#include "format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" namespace doris { #include "common/compile_check_begin.h" diff --git a/be/src/format/table/hive/hive_parquet_nested_column_utils.h b/be/src/format/table/hive/hive_parquet_nested_column_utils.h index 5e4b528800c823..1e953ef5ea34ae 100644 --- a/be/src/format/table/hive/hive_parquet_nested_column_utils.h +++ b/be/src/format/table/hive/hive_parquet_nested_column_utils.h @@ -23,7 +23,7 @@ #include #include -#include "format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" namespace doris { #include "common/compile_check_begin.h" diff --git a/be/src/format/table/hive_reader.cpp b/be/src/format/table/hive_reader.cpp index 205becbc7b62d0..fc6dfbc025c235 100644 --- a/be/src/format/table/hive_reader.cpp +++ b/be/src/format/table/hive_reader.cpp @@ -28,73 +28,68 @@ namespace doris { #include "common/compile_check_begin.h" -Status 
HiveReader::get_next_block_inner(Block* block, size_t* read_rows, bool* eof) { - RETURN_IF_ERROR(_file_format_reader->get_next_block(block, read_rows, eof)); - return Status::OK(); -}; - -Status HiveOrcReader::init_reader( - const std::vector& read_table_col_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts) { - auto* orc_reader = static_cast(_file_format_reader.get()); +Status HiveOrcReader::on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; + RETURN_IF_ERROR( + _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values)); + for (auto& desc : *ctx->column_descs) { + if (desc.category == ColumnCategory::REGULAR || + desc.category == ColumnCategory::GENERATED) { + ctx->column_names.push_back(desc.name); + } + } + // Get file type (available because _create_file_reader() runs before this hook) const orc::Type* orc_type_ptr = nullptr; - RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr)); + RETURN_IF_ERROR(get_file_type(&orc_type_ptr)); bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr); - if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) { - // Directly use the table column name to match the file column name, but pay attention to the case issue. 
- RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(tuple_descriptor, orc_type_ptr, - table_info_node_ptr, _is_file_slot)); + // Build table_info_node based on config + if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) { + RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr, + ctx->table_info_node, _is_file_slot)); } else { - // hive1 / use index - std::map slot_map; // table_name to slot - for (const auto& slot : tuple_descriptor->slots()) { + ctx->table_info_node = std::make_shared(); + std::map slot_map; + for (const auto& slot : ctx->tuple_descriptor->slots()) { slot_map.emplace(slot->col_name_lower_case(), slot); } - // For top-level columns, use indexes to match, and for sub-columns, still use name to match columns. - for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) { - auto table_column_name = read_table_col_names[idx]; - auto file_index = _params.column_idxs[idx]; + for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) { + auto table_column_name = ctx->column_names[idx]; + auto file_index = get_scan_params().column_idxs[idx]; if (file_index >= orc_type_ptr->getSubtypeCount()) { - table_info_node_ptr->add_not_exist_children(table_column_name); + ctx->table_info_node->add_not_exist_children(table_column_name); } else { auto field_node = std::make_shared(); - // For sub-columns, still use name to match columns. 
RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name( slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index), field_node)); - table_info_node_ptr->add_children( + ctx->table_info_node->add_children( table_column_name, orc_type_ptr->getFieldName(file_index), field_node); } slot_map.erase(table_column_name); } for (const auto& [partition_col_name, _] : slot_map) { - table_info_node_ptr->add_not_exist_children(partition_col_name); + ctx->table_info_node->add_not_exist_children(partition_col_name); } } + // Compute column_ids auto column_id_result = ColumnIdResult(); - if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) { - column_id_result = _create_column_ids(orc_type_ptr, tuple_descriptor); + if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) { + column_id_result = _create_column_ids(orc_type_ptr, ctx->tuple_descriptor); } else { column_id_result = - _create_column_ids_by_top_level_col_index(orc_type_ptr, tuple_descriptor); + _create_column_ids_by_top_level_col_index(orc_type_ptr, ctx->tuple_descriptor); } + ctx->column_ids = std::move(column_id_result.column_ids); + ctx->filter_column_ids = std::move(column_id_result.filter_column_ids); - const auto& column_ids = column_id_result.column_ids; - const auto& filter_column_ids = column_id_result.filter_column_ids; - - return orc_reader->init_reader(&read_table_col_names, col_name_to_block_idx, conjuncts, false, - tuple_descriptor, row_descriptor, - not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts, - table_info_node_ptr, column_ids, filter_column_ids); + // _is_acid is false by default, no need to set explicitly + return Status::OK(); } ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type, @@ -210,86 +205,69 @@ ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index( return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); } -Status HiveParquetReader::init_reader( - const std::vector& 
read_table_col_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, - phmap::flat_hash_map>>& - slot_id_to_predicates, - const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts) { - auto* parquet_reader = static_cast(_file_format_reader.get()); +Status HiveParquetReader::on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; + RETURN_IF_ERROR( + _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values)); + for (auto& desc : *ctx->column_descs) { + if (desc.category == ColumnCategory::REGULAR || + desc.category == ColumnCategory::GENERATED) { + ctx->column_names.push_back(desc.name); + } + } + + // Get file metadata schema (available because _open_file() runs before this hook) const FieldDescriptor* field_desc = nullptr; - RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&field_desc)); + RETURN_IF_ERROR(get_file_metadata_schema(&field_desc)); DCHECK(field_desc != nullptr); - if (_state->query_options().hive_parquet_use_column_names) { - // Directly use the table column name to match the file column name, but pay attention to the case issue. 
- RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(tuple_descriptor, *field_desc, - table_info_node_ptr, _is_file_slot)); - } else { // use idx - std::map slot_map; //table_name to slot - for (const auto& slot : tuple_descriptor->slots()) { + // Build table_info_node based on config + if (get_state()->query_options().hive_parquet_use_column_names) { + RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc, + ctx->table_info_node, _is_file_slot)); + } else { + ctx->table_info_node = std::make_shared(); + std::map slot_map; + for (const auto& slot : ctx->tuple_descriptor->slots()) { slot_map.emplace(slot->col_name_lower_case(), slot); } - // For top-level columns, use indexes to match, and for sub-columns, still use name to match columns. auto parquet_fields_schema = field_desc->get_fields_schema(); - for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) { - auto table_column_name = read_table_col_names[idx]; - auto file_index = _params.column_idxs[idx]; + for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) { + auto table_column_name = ctx->column_names[idx]; + auto file_index = get_scan_params().column_idxs[idx]; if (file_index >= parquet_fields_schema.size()) { - // Non-partitioning columns, which may be columns added later. - table_info_node_ptr->add_not_exist_children(table_column_name); + ctx->table_info_node->add_not_exist_children(table_column_name); } else { - // Non-partitioning columns, columns that exist in both the table and the file. auto field_node = std::make_shared(); - // for sub-columns, still use name to match columns. 
RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name( slot_map[table_column_name]->type(), parquet_fields_schema[file_index], field_node)); - table_info_node_ptr->add_children( + ctx->table_info_node->add_children( table_column_name, parquet_fields_schema[file_index].name, field_node); } - slot_map.erase(table_column_name); } - /* - * `_params.column_idxs` only have `isIsFileSlot()`, so we need add `partition slot`. - * eg: - * Table : A, B, C, D (D: partition column) - * Parquet file : A, B - * Column C is obtained by add column. - * - * sql : select * from table; - * slot : A, B, C, D - * _params.column_idxs: 0, 1, 2 (There is no 3, because column D is the partition column) - * - */ for (const auto& [partition_col_name, _] : slot_map) { - table_info_node_ptr->add_not_exist_children(partition_col_name); + ctx->table_info_node->add_not_exist_children(partition_col_name); } } + // Compute column_ids for lazy materialization auto column_id_result = ColumnIdResult(); - if (_state->query_options().hive_parquet_use_column_names) { - column_id_result = _create_column_ids(field_desc, tuple_descriptor); + if (get_state()->query_options().hive_parquet_use_column_names) { + column_id_result = _create_column_ids(field_desc, ctx->tuple_descriptor); } else { - column_id_result = _create_column_ids_by_top_level_col_index(field_desc, tuple_descriptor); + column_id_result = + _create_column_ids_by_top_level_col_index(field_desc, ctx->tuple_descriptor); } + ctx->column_ids = std::move(column_id_result.column_ids); + ctx->filter_column_ids = std::move(column_id_result.filter_column_ids); - const auto& column_ids = column_id_result.column_ids; - const auto& filter_column_ids = column_id_result.filter_column_ids; - - RETURN_IF_ERROR(init_row_filters()); - - return parquet_reader->init_reader( - read_table_col_names, col_name_to_block_idx, conjuncts, slot_id_to_predicates, - tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts, - 
slot_id_to_filter_conjuncts, table_info_node_ptr, true, column_ids, filter_column_ids); + _filter_groups = true; + return Status::OK(); } ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc, diff --git a/be/src/format/table/hive_reader.h b/be/src/format/table/hive_reader.h index c741b434f166dc..32c4101993dfe3 100644 --- a/be/src/format/table/hive_reader.h +++ b/be/src/format/table/hive_reader.h @@ -21,50 +21,25 @@ #include "format/orc/vorc_reader.h" #include "format/parquet/vparquet_reader.h" -#include "format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" namespace doris { #include "common/compile_check_begin.h" -// By holding a parquet/orc reader, used to read the parquet/orc table of hive. -class HiveReader : public TableFormatReader, public TableSchemaChangeHelper { -public: - HiveReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, - RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, - io::IOContext* io_ctx, const std::set* is_file_slot, - FileMetaCache* meta_cache) - : TableFormatReader(std::move(file_format_reader), state, profile, params, range, - io_ctx, meta_cache), - _is_file_slot(is_file_slot) {}; - - ~HiveReader() override = default; - - Status get_next_block_inner(Block* block, size_t* read_rows, bool* eof) final; - - Status init_row_filters() final { return Status::OK(); }; - -protected: - // https://github.com/apache/doris/pull/23369 - const std::set* _is_file_slot = nullptr; -}; - -class HiveOrcReader final : public HiveReader { +class HiveOrcReader final : public OrcReader, public TableSchemaChangeHelper { public: ENABLE_FACTORY_CREATOR(HiveOrcReader); - HiveOrcReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, - RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx, - const std::set* is_file_slot, FileMetaCache* meta_cache) - : 
HiveReader(std::move(file_format_reader), profile, state, params, range, io_ctx, - is_file_slot, meta_cache) {}; + HiveOrcReader(RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, + const TFileRangeDesc& range, size_t batch_size, const std::string& ctz, + io::IOContext* io_ctx, const std::set* is_file_slot, + FileMetaCache* meta_cache = nullptr, bool enable_lazy_mat = true) + : OrcReader(profile, state, params, range, batch_size, ctz, io_ctx, meta_cache, + enable_lazy_mat), + _is_file_slot(is_file_slot) {} + ~HiveOrcReader() final = default; - Status init_reader( - const std::vector& read_table_col_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts); +protected: + Status on_before_init_reader(ReaderInitContext* ctx) override; private: static ColumnIdResult _create_column_ids(const orc::Type* orc_type, @@ -72,29 +47,26 @@ class HiveOrcReader final : public HiveReader { static ColumnIdResult _create_column_ids_by_top_level_col_index( const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor); + + const std::set* _is_file_slot = nullptr; }; -class HiveParquetReader final : public HiveReader { +class HiveParquetReader final : public ParquetReader, public TableSchemaChangeHelper { public: ENABLE_FACTORY_CREATOR(HiveParquetReader); - HiveParquetReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, - RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx, - const std::set* is_file_slot, FileMetaCache* meta_cache) - : HiveReader(std::move(file_format_reader), profile, state, params, range, io_ctx, - is_file_slot, meta_cache) {}; + HiveParquetReader(RuntimeProfile* profile, const TFileScanRangeParams& params, + const TFileRangeDesc& 
range, size_t batch_size, const cctz::time_zone* ctz, + io::IOContext* io_ctx, RuntimeState* state, + const std::set* is_file_slot, FileMetaCache* meta_cache = nullptr, + bool enable_lazy_mat = true) + : ParquetReader(profile, params, range, batch_size, ctz, io_ctx, state, meta_cache, + enable_lazy_mat), + _is_file_slot(is_file_slot) {} + ~HiveParquetReader() final = default; - Status init_reader( - const std::vector& read_table_col_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, - phmap::flat_hash_map>>& - slot_id_to_predicates, - const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts); +protected: + Status on_before_init_reader(ReaderInitContext* ctx) override; private: static ColumnIdResult _create_column_ids(const FieldDescriptor* field_desc, @@ -102,6 +74,8 @@ class HiveParquetReader final : public HiveReader { static ColumnIdResult _create_column_ids_by_top_level_col_index( const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor); + + const std::set* _is_file_slot = nullptr; }; #include "common/compile_check_end.h" -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/format/table/hudi_jni_reader.h b/be/src/format/table/hudi_jni_reader.h index 47bc6bc8de2df0..514cfe68171068 100644 --- a/be/src/format/table/hudi_jni_reader.h +++ b/be/src/format/table/hudi_jni_reader.h @@ -50,6 +50,9 @@ class HudiJniReader : public JniReader { ~HudiJniReader() override = default; Status init_reader(); + +protected: + Status _do_init_reader(ReaderInitContext* /*ctx*/) override { return init_reader(); } }; #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/format/table/hudi_reader.cpp b/be/src/format/table/hudi_reader.cpp index 2e296d158adef1..631b32d368b1c3 100644 --- 
a/be/src/format/table/hudi_reader.cpp +++ b/be/src/format/table/hudi_reader.cpp @@ -20,39 +20,61 @@ #include #include "common/status.h" -#include "runtime/runtime_state.h" namespace doris { #include "common/compile_check_begin.h" -Status HudiReader::get_next_block_inner(Block* block, size_t* read_rows, bool* eof) { - RETURN_IF_ERROR(_file_format_reader->get_next_block(block, read_rows, eof)); - return Status::OK(); -}; - -Status HudiParquetReader::init_reader( - const std::vector& read_table_col_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, - phmap::flat_hash_map>>& - slot_id_to_predicates, - const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts) { - auto* parquet_reader = static_cast(_file_format_reader.get()); +// ============================================================================ +// HudiParquetReader: on_before_init_reader +// ============================================================================ +Status HudiParquetReader::on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; + // Get parquet file metadata schema (file already opened by init_reader) const FieldDescriptor* field_desc = nullptr; - RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&field_desc)); + RETURN_IF_ERROR(get_file_metadata_schema(&field_desc)); DCHECK(field_desc != nullptr); - auto parquet_fields_schema = field_desc->get_fields_schema(); + // Build table_info_node using field_id matching (shared with Paimon/Iceberg) RETURN_IF_ERROR(gen_table_info_node_by_field_id( - _params, _range.table_format_params.hudi_params.schema_id, tuple_descriptor, - *field_desc)); - return parquet_reader->init_reader(read_table_col_names, col_name_to_block_idx, conjuncts, 
- slot_id_to_predicates, tuple_descriptor, row_descriptor, - colname_to_slot_id, not_single_slot_filter_conjuncts, - slot_id_to_filter_conjuncts, table_info_node_ptr); + get_scan_params(), get_scan_range().table_format_params.hudi_params.schema_id, + get_tuple_descriptor(), *field_desc)); + ctx->table_info_node = table_info_node_ptr; + + // Extract column names from descriptors + for (const auto& desc : *ctx->column_descs) { + if (desc.category == ColumnCategory::REGULAR || + desc.category == ColumnCategory::GENERATED) { + ctx->column_names.push_back(desc.name); + } + } + return Status::OK(); +} + +// ============================================================================ +// HudiOrcReader: on_before_init_reader +// ============================================================================ +Status HudiOrcReader::on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; + // Get ORC file type (file already opened by init_reader) + const orc::Type* orc_type_ptr = nullptr; + RETURN_IF_ERROR(get_file_type(&orc_type_ptr)); + + // Build table_info_node using field_id matching + RETURN_IF_ERROR(gen_table_info_node_by_field_id( + get_scan_params(), get_scan_range().table_format_params.hudi_params.schema_id, + get_tuple_descriptor(), orc_type_ptr)); + ctx->table_info_node = table_info_node_ptr; + + // Extract column names from descriptors + for (const auto& desc : *ctx->column_descs) { + if (desc.category == ColumnCategory::REGULAR || + desc.category == ColumnCategory::GENERATED) { + ctx->column_names.push_back(desc.name); + } + } + return Status::OK(); } #include "common/compile_check_end.h" diff --git a/be/src/format/table/hudi_reader.h b/be/src/format/table/hudi_reader.h index 319c6c5af05f60..c3628ac6044a01 100644 --- a/be/src/format/table/hudi_reader.h +++ b/be/src/format/table/hudi_reader.h @@ -20,76 +20,42 @@ #include "format/orc/vorc_reader.h" #include 
"format/parquet/vparquet_reader.h" -#include "format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" namespace doris { #include "common/compile_check_begin.h" -class HudiReader : public TableFormatReader, public TableSchemaChangeHelper { -public: - HudiReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, - RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, - io::IOContext* io_ctx, FileMetaCache* meta_cache) - : TableFormatReader(std::move(file_format_reader), state, profile, params, range, - io_ctx, meta_cache) {}; - - ~HudiReader() override = default; - - Status get_next_block_inner(Block* block, size_t* read_rows, bool* eof) final; - Status init_row_filters() final { return Status::OK(); }; -}; - -class HudiParquetReader final : public HudiReader { +// HudiParquetReader: directly inherits ParquetReader (no composition wrapping). +// Schema mapping is done in on_before_init_reader hook via field_id matching. 
+class HudiParquetReader final : public ParquetReader, public TableSchemaChangeHelper { public: ENABLE_FACTORY_CREATOR(HudiParquetReader); - HudiParquetReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, - RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx, FileMetaCache* meta_cache) - : HudiReader(std::move(file_format_reader), profile, state, params, range, io_ctx, - meta_cache) {}; + HudiParquetReader(RuntimeProfile* profile, const TFileScanRangeParams& params, + const TFileRangeDesc& range, size_t batch_size, const cctz::time_zone* ctz, + io::IOContext* io_ctx, RuntimeState* state, + FileMetaCache* meta_cache = nullptr, bool enable_lazy_mat = true) + : ParquetReader(profile, params, range, batch_size, ctz, io_ctx, state, meta_cache, + enable_lazy_mat) {} ~HudiParquetReader() final = default; - Status init_reader( - const std::vector& read_table_col_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, - phmap::flat_hash_map>>& - slot_id_to_predicates, - const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts); +protected: + Status on_before_init_reader(ReaderInitContext* ctx) override; }; -class HudiOrcReader final : public HudiReader { +// HudiOrcReader: directly inherits OrcReader (no composition wrapping). 
+class HudiOrcReader final : public OrcReader, public TableSchemaChangeHelper { public: ENABLE_FACTORY_CREATOR(HudiOrcReader); - HudiOrcReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, - RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx, FileMetaCache* meta_cache) - : HudiReader(std::move(file_format_reader), profile, state, params, range, io_ctx, - meta_cache) {}; + HudiOrcReader(RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, + const TFileRangeDesc& range, size_t batch_size, const std::string& ctz, + io::IOContext* io_ctx, FileMetaCache* meta_cache = nullptr, + bool enable_lazy_mat = true) + : OrcReader(profile, state, params, range, batch_size, ctz, io_ctx, meta_cache, + enable_lazy_mat) {} ~HudiOrcReader() final = default; - Status init_reader( - const std::vector& read_table_col_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts) { - auto* orc_reader = static_cast(_file_format_reader.get()); - const orc::Type* orc_type_ptr = nullptr; - RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr)); - RETURN_IF_ERROR(gen_table_info_node_by_field_id( - _params, _range.table_format_params.hudi_params.schema_id, tuple_descriptor, - orc_type_ptr)); - - return orc_reader->init_reader(&read_table_col_names, col_name_to_block_idx, conjuncts, - false, tuple_descriptor, row_descriptor, - not_single_slot_filter_conjuncts, - slot_id_to_filter_conjuncts, table_info_node_ptr); - } +protected: + Status on_before_init_reader(ReaderInitContext* ctx) override; }; #include "common/compile_check_end.h" -} // namespace doris \ No newline at end of file +} // namespace doris diff --git 
a/be/src/format/table/iceberg/iceberg_orc_nested_column_utils.cpp b/be/src/format/table/iceberg/iceberg_orc_nested_column_utils.cpp index da9ad8168106a2..c2f0593b3f59b1 100644 --- a/be/src/format/table/iceberg/iceberg_orc_nested_column_utils.cpp +++ b/be/src/format/table/iceberg/iceberg_orc_nested_column_utils.cpp @@ -24,7 +24,8 @@ #include #include -#include "format/table/table_format_reader.h" +#include "format/generic_reader.h" +#include "format/table/table_schema_change_helper.h" #include "orc/Type.hh" namespace doris { diff --git a/be/src/format/table/iceberg/iceberg_orc_nested_column_utils.h b/be/src/format/table/iceberg/iceberg_orc_nested_column_utils.h index 142dee706b89f6..cc5761854736fc 100644 --- a/be/src/format/table/iceberg/iceberg_orc_nested_column_utils.h +++ b/be/src/format/table/iceberg/iceberg_orc_nested_column_utils.h @@ -21,7 +21,7 @@ #include #include -#include "format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" namespace orc { class Type; diff --git a/be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp b/be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp index e84cc3700d0f32..a9ad8f27d0c6bc 100644 --- a/be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp +++ b/be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp @@ -26,7 +26,7 @@ #include #include "format/parquet/schema_desc.h" -#include "format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" namespace doris { #include "common/compile_check_begin.h" diff --git a/be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.h b/be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.h index 5d16d1053c898e..fd47ed37c69fe8 100644 --- a/be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.h +++ b/be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.h @@ -23,7 +23,7 @@ #include #include -#include 
"format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" namespace doris { #include "common/compile_check_begin.h" diff --git a/be/src/format/table/iceberg_delete_file_reader_helper.cpp b/be/src/format/table/iceberg_delete_file_reader_helper.cpp index 55525a0635cc29..2e7045c81ad551 100644 --- a/be/src/format/table/iceberg_delete_file_reader_helper.cpp +++ b/be/src/format/table/iceberg_delete_file_reader_helper.cpp @@ -34,7 +34,6 @@ #include "core/data_type/data_type_number.h" #include "core/data_type/data_type_string.h" #include "exec/common/endian.h" -#include "exprs/vexpr_context.h" #include "format/orc/vorc_reader.h" #include "format/parquet/vparquet_column_chunk_reader.h" #include "format/parquet/vparquet_reader.h" @@ -42,7 +41,6 @@ #include "format/table/iceberg_reader.h" #include "format/table/table_format_reader.h" #include "io/hdfs_builder.h" -#include "runtime/descriptors.h" #include "runtime/runtime_state.h" #include "storage/predicate/column_predicate.h" @@ -121,16 +119,11 @@ Status init_parquet_delete_reader(ParquetReader* reader, bool* dictionary_coded) return Status::InvalidArgument("invalid parquet delete reader arguments"); } - phmap::flat_hash_map>> slot_id_to_predicates; - RETURN_IF_ERROR(reader->init_reader(DELETE_COL_NAMES, &DELETE_COL_NAME_TO_BLOCK_IDX, {}, - slot_id_to_predicates, nullptr, nullptr, nullptr, nullptr, - nullptr, TableSchemaChangeHelper::ConstNode::get_instance(), - false)); - - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - RETURN_IF_ERROR(reader->set_fill_columns(partition_columns, missing_columns)); + ParquetInitContext ctx; + ctx.column_names = DELETE_COL_NAMES; + ctx.col_name_to_block_idx = &DELETE_COL_NAME_TO_BLOCK_IDX; + ctx.filter_groups = false; + RETURN_IF_ERROR(reader->init_reader(&ctx)); const tparquet::FileMetaData* meta_data = reader->get_meta_data(); *dictionary_coded = true; @@ -150,14 +143,10 @@ Status init_orc_delete_reader(OrcReader* reader) { 
return Status::InvalidArgument("orc delete reader is null"); } - RETURN_IF_ERROR(reader->init_reader(&DELETE_COL_NAMES, &DELETE_COL_NAME_TO_BLOCK_IDX, {}, false, - nullptr, nullptr, nullptr, nullptr, - TableSchemaChangeHelper::ConstNode::get_instance())); - - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - RETURN_IF_ERROR(reader->set_fill_columns(partition_columns, missing_columns)); + OrcInitContext ctx; + ctx.column_names = DELETE_COL_NAMES; + ctx.col_name_to_block_idx = &DELETE_COL_NAME_TO_BLOCK_IDX; + RETURN_IF_ERROR(reader->init_reader(&ctx)); return Status::OK(); } diff --git a/be/src/format/table/iceberg_reader.cpp b/be/src/format/table/iceberg_reader.cpp index 574324f4d55a02..b9afea2fb2abd7 100644 --- a/be/src/format/table/iceberg_reader.cpp +++ b/be/src/format/table/iceberg_reader.cpp @@ -29,15 +29,20 @@ #include #include #include -#include #include "common/compiler_util.h" // IWYU pragma: keep +#include "common/consts.h" #include "common/status.h" #include "core/assert_cast.h" #include "core/block/block.h" #include "core/block/column_with_type_and_name.h" #include "core/column/column.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" #include "core/data_type/data_type_factory.hpp" +#include "core/data_type/define_primitive_type.h" +#include "core/data_type/primitive_type.h" +#include "core/string_ref.h" #include "exprs/aggregate/aggregate_function.h" #include "format/format_common.h" #include "format/generic_reader.h" @@ -47,9 +52,8 @@ #include "format/table/deletion_vector_reader.h" #include "format/table/iceberg/iceberg_orc_nested_column_utils.h" #include "format/table/iceberg/iceberg_parquet_nested_column_utils.h" -#include "format/table/iceberg_delete_file_reader_helper.h" #include "format/table/nested_column_access_helper.h" -#include "format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" #include "runtime/runtime_state.h" #include "util/coding.h" @@ 
-69,42 +73,6 @@ class VExprContext; } // namespace doris namespace doris { -namespace { - -class GroupedDeleteRowsVisitor final : public IcebergPositionDeleteVisitor { -public: - using DeleteRows = std::vector; - using DeleteFile = phmap::parallel_flat_hash_map< - std::string, std::unique_ptr, std::hash, std::equal_to<>, - std::allocator>>, 8, - std::mutex>; - - explicit GroupedDeleteRowsVisitor(DeleteFile* position_delete) - : _position_delete(position_delete) {} - - Status visit(const std::string& file_path, int64_t pos) override { - if (_position_delete == nullptr) { - return Status::InvalidArgument("position delete map is null"); - } - - auto iter = _position_delete->find(file_path); - DeleteRows* delete_rows = nullptr; - if (iter == _position_delete->end()) { - delete_rows = new DeleteRows; - (*_position_delete)[file_path] = std::unique_ptr(delete_rows); - } else { - delete_rows = iter->second.get(); - } - delete_rows->push_back(pos); - return Status::OK(); - } - -private: - DeleteFile* _position_delete; -}; - -} // namespace - const std::string IcebergOrcReader::ICEBERG_ORC_ATTRIBUTE = "iceberg.id"; bool IcebergTableReader::_is_fully_dictionary_encoded( @@ -157,461 +125,185 @@ bool IcebergTableReader::_is_fully_dictionary_encoded( return true; } -IcebergTableReader::IcebergTableReader(std::unique_ptr file_format_reader, - RuntimeProfile* profile, RuntimeState* state, - const TFileScanRangeParams& params, - const TFileRangeDesc& range, ShardedKVCache* kv_cache, - io::IOContext* io_ctx, FileMetaCache* meta_cache) - : TableFormatReader(std::move(file_format_reader), state, profile, params, range, io_ctx, - meta_cache), - _kv_cache(kv_cache) { - static const char* iceberg_profile = "IcebergProfile"; - ADD_TIMER(_profile, iceberg_profile); - _iceberg_profile.num_delete_files = - ADD_CHILD_COUNTER(_profile, "NumDeleteFiles", TUnit::UNIT, iceberg_profile); - _iceberg_profile.num_delete_rows = - ADD_CHILD_COUNTER(_profile, "NumDeleteRows", TUnit::UNIT, 
iceberg_profile); - _iceberg_profile.delete_files_read_time = - ADD_CHILD_TIMER(_profile, "DeleteFileReadTime", iceberg_profile); - _iceberg_profile.delete_rows_sort_time = - ADD_CHILD_TIMER(_profile, "DeleteRowsSortTime", iceberg_profile); - _iceberg_profile.parse_delete_file_time = - ADD_CHILD_TIMER(_profile, "ParseDeleteFileTime", iceberg_profile); -} - -Status IcebergTableReader::get_next_block_inner(Block* block, size_t* read_rows, bool* eof) { - RETURN_IF_ERROR(_expand_block_if_need(block)); - - RETURN_IF_ERROR(_file_format_reader->get_next_block(block, read_rows, eof)); +// ============================================================================ +// IcebergParquetReader: on_before_init_reader (Parquet-specific schema matching) +// ============================================================================ +Status IcebergParquetReader::on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; + _file_format = Fileformat::PARQUET; - if (_equality_delete_impls.size() > 0) { - std::unique_ptr filter = - std::make_unique(block->rows(), 1); - for (auto& equality_delete_impl : _equality_delete_impls) { - RETURN_IF_ERROR(equality_delete_impl->filter_data_block( - block, _col_name_to_block_idx, _id_to_block_column_name, *filter)); + // Get file metadata schema first (available because _open_file() already ran) + const FieldDescriptor* field_desc = nullptr; + RETURN_IF_ERROR(this->get_file_metadata_schema(&field_desc)); + DCHECK(field_desc != nullptr); + + // Build table_info_node by field_id or name matching. + // This must happen BEFORE column classification so we can use children_column_exists + // to check if a column exists in the file (by field ID, not name). 
+ if (!get_scan_params().__isset.history_schema_info || + get_scan_params().history_schema_info.empty()) [[unlikely]] { + RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc, + ctx->table_info_node)); + } else { + bool exist_field_id = true; + RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_field_id( + get_scan_params().history_schema_info.front().root_field, *field_desc, + ctx->table_info_node, exist_field_id)); + if (!exist_field_id) { + RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc, + ctx->table_info_node)); + } + } + + std::unordered_set partition_col_names; + if (ctx->range->__isset.columns_from_path_keys) { + partition_col_names.insert(ctx->range->columns_from_path_keys.begin(), + ctx->range->columns_from_path_keys.end()); + } + + // Single pass: classify columns, detect $row_id, handle partition fallback. + bool has_partition_from_path = false; + for (auto& desc : *ctx->column_descs) { + if (desc.category == ColumnCategory::SYNTHESIZED && + desc.name == BeConsts::ICEBERG_ROWID_COL) { + _need_row_id_column = true; + this->register_synthesized_column_handler(BeConsts::ICEBERG_ROWID_COL, + [this](Block* block, size_t rows) -> Status { + return _fill_iceberg_row_id(block, rows); + }); + continue; + } + if (desc.category == ColumnCategory::REGULAR) { + // Partition fallback: if column is a partition key and NOT in the file + // (checked via field ID matching in table_info_node), read from path instead. 
+ if (partition_col_names.contains(desc.name) && + !ctx->table_info_node->children_column_exists(desc.name)) { + if (config::enable_iceberg_partition_column_fallback) { + desc.category = ColumnCategory::PARTITION_KEY; + has_partition_from_path = true; + continue; + } + } + ctx->column_names.push_back(desc.name); + } else if (desc.category == ColumnCategory::GENERATED) { + ctx->column_names.push_back(desc.name); } - Block::filter_block_internal(block, *filter, block->columns()); } - *read_rows = block->rows(); - return _shrink_block_if_need(block); -} - -Status IcebergTableReader::init_row_filters() { - // We get the count value by doris's be, so we don't need to read the delete file - if (_push_down_agg_type == TPushAggOp::type::COUNT && _table_level_row_count > 0) { - return Status::OK(); + // Set up partition value extraction if any partition columns need filling from path + if (has_partition_from_path) { + RETURN_IF_ERROR(_extract_partition_values(*ctx->range, ctx->tuple_descriptor, + _fill_partition_values)); } - const auto& table_desc = _range.table_format_params.iceberg_params; - const auto& version = table_desc.format_version; - if (version < MIN_SUPPORT_DELETE_FILES_VERSION) { - return Status::OK(); - } + _all_required_col_names = ctx->column_names; - auto* parquet_reader = dynamic_cast(_file_format_reader.get()); - auto* orc_reader = dynamic_cast(_file_format_reader.get()); - - // Initialize file information for $row_id generation - // Extract from table_desc which contains current file's metadata - if (_need_row_id_column) { - std::string file_path = table_desc.original_file_path; - int32_t partition_spec_id = 0; - std::string partition_data_json; - if (table_desc.__isset.partition_spec_id) { - partition_spec_id = table_desc.partition_spec_id; - } - if (table_desc.__isset.partition_data_json) { - partition_data_json = table_desc.partition_data_json; - } + // Create column IDs from field descriptor + auto column_id_result = _create_column_ids(field_desc, 
ctx->tuple_descriptor); + ctx->column_ids = std::move(column_id_result.column_ids); + ctx->filter_column_ids = std::move(column_id_result.filter_column_ids); - if (parquet_reader != nullptr) { - parquet_reader->set_iceberg_rowid_params(file_path, partition_spec_id, - partition_data_json, _row_id_column_position); - } else if (orc_reader != nullptr) { - orc_reader->set_iceberg_rowid_params(file_path, partition_spec_id, partition_data_json, - _row_id_column_position); - } - LOG(INFO) << "Initialized $row_id generation for file: " << file_path - << ", partition_spec_id: " << partition_spec_id; + // Build field_id -> block_column_name mapping for equality delete filtering. + // This was previously done in init_reader() column matching (pre-CRTP refactoring). + for (const auto* slot : ctx->tuple_descriptor->slots()) { + _id_to_block_column_name.emplace(slot->col_unique_id(), slot->col_name()); } - std::vector position_delete_files; - std::vector equality_delete_files; - std::vector deletion_vector_files; - for (const TIcebergDeleteFileDesc& desc : table_desc.delete_files) { - if (desc.content == POSITION_DELETE) { - position_delete_files.emplace_back(desc); - } else if (desc.content == EQUALITY_DELETE) { - equality_delete_files.emplace_back(desc); - } else if (desc.content == DELETION_VECTOR) { - deletion_vector_files.emplace_back(desc); - } - } + // Process delete files (must happen before _do_init_reader so expand col IDs are included) + RETURN_IF_ERROR(_init_row_filters()); - if (!equality_delete_files.empty()) { - RETURN_IF_ERROR(_process_equality_delete(equality_delete_files)); - _file_format_reader->set_push_down_agg_type(TPushAggOp::NONE); + // Add expand column IDs for equality delete and remap expand column names + // to match master's behavior: + // - Use field_id to find the actual file column name in Parquet schema + // - Prefix with __equality_delete_column__ to avoid name conflicts + // - Correctly map table_col_name → file_col_name in table_info_node + 
const static std::string EQ_DELETE_PRE = "__equality_delete_column__"; + std::unordered_map field_id_to_file_col_name; + for (int i = 0; i < field_desc->size(); ++i) { + auto field_schema = field_desc->get_column(i); + if (field_schema) { + field_id_to_file_col_name[field_schema->field_id] = field_schema->name; + } } - if (!deletion_vector_files.empty()) { - if (deletion_vector_files.size() != 1) [[unlikely]] { - /* - * Deletion vectors are a binary representation of deletes for a single data file that is more efficient - * at execution time than position delete files. Unlike equality or position delete files, there can be - * at most one deletion vector for a given data file in a snapshot. - */ - return Status::DataQualityError("This iceberg data file has multiple DVs."); + // Rebuild _expand_col_names with proper file-column-based names + std::vector new_expand_col_names; + for (size_t i = 0; i < _expand_col_names.size(); ++i) { + const auto& old_name = _expand_col_names[i]; + // Find the field_id for this expand column + int field_id = -1; + for (auto& [fid, name] : _id_to_block_column_name) { + if (name == old_name) { + field_id = fid; + break; + } } - RETURN_IF_ERROR( - read_deletion_vector(table_desc.original_file_path, deletion_vector_files[0])); - - _file_format_reader->set_push_down_agg_type(TPushAggOp::NONE); - // Readers can safely ignore position delete files if there is a DV for a data file. 
- } else if (!position_delete_files.empty()) { - RETURN_IF_ERROR( - _position_delete_base(table_desc.original_file_path, position_delete_files)); - _file_format_reader->set_push_down_agg_type(TPushAggOp::NONE); - } - COUNTER_UPDATE(_iceberg_profile.num_delete_files, table_desc.delete_files.size()); - return Status::OK(); -} + std::string file_col_name = old_name; + auto it = field_id_to_file_col_name.find(field_id); + if (it != field_id_to_file_col_name.end()) { + file_col_name = it->second; + } -void IcebergTableReader::_generate_equality_delete_block( - Block* block, const std::vector& equality_delete_col_names, - const std::vector& equality_delete_col_types) { - for (int i = 0; i < equality_delete_col_names.size(); ++i) { - DataTypePtr data_type = make_nullable(equality_delete_col_types[i]); - MutableColumnPtr data_column = data_type->create_column(); - block->insert(ColumnWithTypeAndName(std::move(data_column), data_type, - equality_delete_col_names[i])); - } -} + std::string table_col_name = EQ_DELETE_PRE + file_col_name; -Status IcebergTableReader::_expand_block_if_need(Block* block) { - std::set names; - auto block_names = block->get_names(); - names.insert(block_names.begin(), block_names.end()); - for (auto& col : _expand_columns) { - col.column->assume_mutable()->clear(); - if (names.contains(col.name)) { - return Status::InternalError("Wrong expand column '{}'", col.name); + // Update _id_to_block_column_name + if (field_id >= 0) { + _id_to_block_column_name[field_id] = table_col_name; } - names.insert(col.name); - (*_col_name_to_block_idx)[col.name] = static_cast(block->columns()); - block->insert(col); - } - return Status::OK(); -} -Status IcebergTableReader::_shrink_block_if_need(Block* block) { - std::set positions_to_erase; - for (const std::string& expand_col : _expand_col_names) { - if (!_col_name_to_block_idx->contains(expand_col)) { - return Status::InternalError("Wrong erase column '{}', block: {}", expand_col, - block->dump_names()); + // 
Update _expand_columns name + if (i < _expand_columns.size()) { + _expand_columns[i].name = table_col_name; } - positions_to_erase.emplace((*_col_name_to_block_idx)[expand_col]); - } - block->erase(positions_to_erase); - for (const std::string& expand_col : _expand_col_names) { - _col_name_to_block_idx->erase(expand_col); - } - return Status::OK(); -} -Status IcebergTableReader::_position_delete_base( - const std::string data_file_path, const std::vector& delete_files) { - std::vector delete_rows_array; - int64_t num_delete_rows = 0; - for (const auto& delete_file : delete_files) { - SCOPED_TIMER(_iceberg_profile.delete_files_read_time); - Status create_status = Status::OK(); - auto* delete_file_cache = _kv_cache->get( - _delet_file_cache_key(delete_file.path), [&]() -> DeleteFile* { - auto* position_delete = new DeleteFile; - create_status = _read_position_delete_file(delete_file, position_delete); - - if (!create_status) { - return nullptr; - } - - return position_delete; - }); - if (create_status.is()) { - continue; - } else if (!create_status.ok()) { - return create_status; - } + new_expand_col_names.push_back(table_col_name); - DeleteFile& delete_file_map = *((DeleteFile*)delete_file_cache); - auto get_value = [&](const auto& v) { - DeleteRows* row_ids = v.second.get(); - if (!row_ids->empty()) { - delete_rows_array.emplace_back(row_ids); - num_delete_rows += row_ids->size(); + // Add column IDs + if (it != field_id_to_file_col_name.end()) { + for (int j = 0; j < field_desc->size(); ++j) { + auto field_schema = field_desc->get_column(j); + if (field_schema && field_schema->field_id == field_id) { + ctx->column_ids.insert(field_schema->get_column_id()); + break; + } } - }; - delete_file_map.if_contains(data_file_path, get_value); - } - // Use a KV cache to store the delete rows corresponding to a data file path. - // The Parquet/ORC reader holds a reference (pointer) to this cached entry. 
- // This allows delete rows to be reused when a single data file is split into - // multiple splits, avoiding excessive memory usage when delete rows are large. - if (num_delete_rows > 0) { - SCOPED_TIMER(_iceberg_profile.delete_rows_sort_time); - _iceberg_delete_rows = - _kv_cache->get(data_file_path, - [&]() -> DeleteRows* { - auto* data_file_position_delete = new DeleteRows; - _sort_delete_rows(delete_rows_array, num_delete_rows, - *data_file_position_delete); - - return data_file_position_delete; - } - - ); - set_delete_rows(); - COUNTER_UPDATE(_iceberg_profile.num_delete_rows, num_delete_rows); - } - return Status::OK(); -} - -Status IcebergTableReader::_read_position_delete_file(const TIcebergDeleteFileDesc& delete_file, - DeleteFile* position_delete) { - GroupedDeleteRowsVisitor visitor(position_delete); - IcebergDeleteFileReaderOptions options; - options.state = _state; - options.profile = _profile; - options.scan_params = &_params; - options.io_ctx = _io_ctx; - options.meta_cache = _meta_cache; - options.fs_name = &_range.fs_name; - options.batch_size = READ_DELETE_FILE_BATCH_SIZE; - return read_iceberg_position_delete_file(delete_file, options, &visitor); -} + } -/** - * https://iceberg.apache.org/spec/#position-delete-files - * The rows in the delete file must be sorted by file_path then position to optimize filtering rows while scanning. - * Sorting by file_path allows filter pushdown by file in columnar storage formats. - * Sorting by position allows filtering rows while scanning, to avoid keeping deletes in memory. 
- */ -void IcebergTableReader::_sort_delete_rows( - const std::vector*>& delete_rows_array, int64_t num_delete_rows, - std::vector& result) { - if (delete_rows_array.empty()) { - return; - } - if (delete_rows_array.size() == 1) { - result.resize(num_delete_rows); - memcpy(result.data(), delete_rows_array.front()->data(), sizeof(int64_t) * num_delete_rows); - return; - } - if (delete_rows_array.size() == 2) { - result.resize(num_delete_rows); - std::merge(delete_rows_array.front()->begin(), delete_rows_array.front()->end(), - delete_rows_array.back()->begin(), delete_rows_array.back()->end(), - result.begin()); - return; + // Register in table_info_node: table_col_name → file_col_name + ctx->column_names.push_back(table_col_name); + ctx->table_info_node->add_children(table_col_name, file_col_name, + TableSchemaChangeHelper::ConstNode::get_instance()); } + _expand_col_names = std::move(new_expand_col_names); - using vec_pair = std::pair::iterator, std::vector::iterator>; - result.resize(num_delete_rows); - auto row_id_iter = result.begin(); - auto iter_end = result.end(); - std::vector rows_array; - for (auto* rows : delete_rows_array) { - if (!rows->empty()) { - rows_array.emplace_back(rows->begin(), rows->end()); - } + // Debug logging + for (const auto& name : _expand_col_names) { + LOG(INFO) << "[EqDeleteDebug] final expand col: " << name; } - size_t array_size = rows_array.size(); - while (row_id_iter != iter_end) { - int64_t min_index = 0; - int64_t min = *rows_array[0].first; - for (size_t i = 0; i < array_size; ++i) { - if (*rows_array[i].first < min) { - min_index = i; - min = *rows_array[i].first; - } - } - *row_id_iter++ = min; - rows_array[min_index].first++; - if (UNLIKELY(rows_array[min_index].first == rows_array[min_index].second)) { - rows_array.erase(rows_array.begin() + min_index); - array_size--; - } - } -} - -Status IcebergParquetReader::init_reader( - const std::vector& file_col_names, - std::unordered_map* col_name_to_block_idx, - const 
VExprContextSPtrs& conjuncts, - phmap::flat_hash_map>>& - slot_id_to_predicates, - const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts) { - _file_format = Fileformat::PARQUET; - _col_name_to_block_idx = col_name_to_block_idx; - auto* parquet_reader = static_cast(_file_format_reader.get()); - RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&_data_file_field_desc)); - DCHECK(_data_file_field_desc != nullptr); - if (_row_lineage_columns != nullptr) { - const auto& table_desc = _range.table_format_params.iceberg_params; - _row_lineage_columns->first_row_id = - table_desc.__isset.first_row_id ? table_desc.first_row_id : -1; - _row_lineage_columns->last_updated_sequence_number = - table_desc.__isset.last_updated_sequence_number - ? table_desc.last_updated_sequence_number - : -1; - parquet_reader->set_row_lineage_columns(_row_lineage_columns); + for (auto& [fid, name] : _id_to_block_column_name) { + LOG(INFO) << "[EqDeleteDebug] final _id_to_block_column_name[" << fid << "] = " << name; } - auto column_id_result = _create_column_ids(_data_file_field_desc, tuple_descriptor); - auto& column_ids = column_id_result.column_ids; - const auto& filter_column_ids = column_id_result.filter_column_ids; - - RETURN_IF_ERROR(init_row_filters()); - _all_required_col_names = file_col_names; - - if (!_params.__isset.history_schema_info || _params.history_schema_info.empty()) [[unlikely]] { - RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name( - tuple_descriptor, *_data_file_field_desc, table_info_node_ptr)); - } else { - std::set read_col_name_set(file_col_names.begin(), file_col_names.end()); + // Enable group filtering for Iceberg + _filter_groups = true; - bool exist_field_id = true; - for (int idx = 0; idx < _data_file_field_desc->size(); idx++) { - if 
(_data_file_field_desc->get_column(idx)->field_id == -1) { - // the data file may be from hive table migrated to iceberg, field id is missing - exist_field_id = false; - break; - } - } - const auto& table_schema = _params.history_schema_info.front().root_field; - - table_info_node_ptr = std::make_shared(); - if (exist_field_id) { - // id -> table column name. columns that need read data file. - std::unordered_map> id_to_table_field; - for (const auto& table_field : table_schema.fields) { - auto field = table_field.field_ptr; - DCHECK(field->__isset.name); - if (!read_col_name_set.contains(field->name)) { - continue; - } - id_to_table_field.emplace(field->id, field); - } - - for (int idx = 0; idx < _data_file_field_desc->size(); idx++) { - const auto& data_file_field = _data_file_field_desc->get_column(idx); - auto data_file_column_id = _data_file_field_desc->get_column(idx)->field_id; - - if (id_to_table_field.contains(data_file_column_id)) { - const auto& table_field = id_to_table_field[data_file_column_id]; - - std::shared_ptr field_node = nullptr; - RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_field_id( - *table_field, *data_file_field, exist_field_id, field_node)); - table_info_node_ptr->add_children(table_field->name, data_file_field->name, - field_node); - - _id_to_block_column_name.emplace(data_file_column_id, table_field->name); - id_to_table_field.erase(data_file_column_id); - } else if (_equality_delete_col_ids.contains(data_file_column_id)) { - // Columns that need to be read for equality delete. - const static std::string EQ_DELETE_PRE = "__equality_delete_column__"; - - // Construct table column names that avoid duplication with current table schema. - // As the columns currently being read may have been deleted in the latest - // table structure or have undergone a series of schema changes... 
- std::string table_column_name = EQ_DELETE_PRE + data_file_field->name; - table_info_node_ptr->add_children( - table_column_name, data_file_field->name, - std::make_shared()); - - _id_to_block_column_name.emplace(data_file_column_id, table_column_name); - _expand_col_names.emplace_back(table_column_name); - auto expand_data_type = make_nullable(data_file_field->data_type); - _expand_columns.emplace_back( - ColumnWithTypeAndName {expand_data_type->create_column(), - expand_data_type, table_column_name}); - - _all_required_col_names.emplace_back(table_column_name); - column_ids.insert(data_file_field->get_column_id()); - } - } - for (const auto& [id, table_field] : id_to_table_field) { - table_info_node_ptr->add_not_exist_children(table_field->name); - } - } else { - if (!_equality_delete_col_ids.empty()) [[unlikely]] { - return Status::InternalError( - "Can not read missing field id data file when have equality delete"); - } - std::map file_column_idx_map; - for (size_t idx = 0; idx < _data_file_field_desc->size(); idx++) { - file_column_idx_map.emplace(_data_file_field_desc->get_column(idx)->name, idx); - } - - for (const auto& table_field : table_schema.fields) { - DCHECK(table_field.__isset.field_ptr); - DCHECK(table_field.field_ptr->__isset.name); - const auto& table_column_name = table_field.field_ptr->name; - if (!read_col_name_set.contains(table_column_name)) { - continue; - } - if (!table_field.field_ptr->__isset.name_mapping || - table_field.field_ptr->name_mapping.size() == 0) { - return Status::DataQualityError( - "name_mapping must be set when read missing field id data file."); - } - bool have_mapping = false; - for (const auto& mapped_name : table_field.field_ptr->name_mapping) { - if (file_column_idx_map.contains(mapped_name)) { - std::shared_ptr field_node = nullptr; - const auto& file_field = _data_file_field_desc->get_column( - file_column_idx_map.at(mapped_name)); - RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_field_id( - 
*table_field.field_ptr, *file_field, exist_field_id, field_node)); - table_info_node_ptr->add_children(table_column_name, file_field->name, - field_node); - have_mapping = true; - break; - } - } - if (!have_mapping) { - table_info_node_ptr->add_not_exist_children(table_column_name); - } - } - } - } - - return parquet_reader->init_reader( - _all_required_col_names, _col_name_to_block_idx, conjuncts, slot_id_to_predicates, - tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts, - slot_id_to_filter_conjuncts, table_info_node_ptr, true, column_ids, filter_column_ids); + return Status::OK(); } +// ============================================================================ +// IcebergParquetReader: _create_column_ids +// ============================================================================ ColumnIdResult IcebergParquetReader::_create_column_ids(const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) { - // First, assign column IDs to the field descriptor auto* mutable_field_desc = const_cast(field_desc); mutable_field_desc->assign_ids(); - // map top-level table column iceberg_id -> FieldSchema* std::unordered_map iceberg_id_to_field_schema_map; - for (int i = 0; i < field_desc->size(); ++i) { auto field_schema = field_desc->get_column(i); if (!field_schema) continue; - int iceberg_id = field_schema->field_id; iceberg_id_to_field_schema_map[iceberg_id] = field_schema; } @@ -619,7 +311,6 @@ ColumnIdResult IcebergParquetReader::_create_column_ids(const FieldDescriptor* f std::set column_ids; std::set filter_column_ids; - // helper to process access paths for a given top-level parquet field auto process_access_paths = [](const FieldSchema* parquet_field, const std::vector& access_paths, std::set& out_ids) { @@ -633,23 +324,19 @@ ColumnIdResult IcebergParquetReader::_create_column_ids(const FieldDescriptor* f for (const auto* slot : tuple_descriptor->slots()) { auto it = 
iceberg_id_to_field_schema_map.find(slot->col_unique_id()); if (it == iceberg_id_to_field_schema_map.end()) { - // Column not found in file (e.g., partition column, added column) continue; } auto field_schema = it->second; - // primitive (non-nested) types: direct mapping by name if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { column_ids.insert(field_schema->column_id); - if (slot->is_predicate()) { filter_column_ids.insert(field_schema->column_id); } continue; } - // complex types: const auto& all_access_paths = slot->all_access_paths(); process_access_paths(field_schema, all_access_paths, column_ids); @@ -661,173 +348,209 @@ ColumnIdResult IcebergParquetReader::_create_column_ids(const FieldDescriptor* f return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); } -Status IcebergOrcReader::init_reader( - const std::vector& file_col_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts) { - _file_format = Fileformat::ORC; - _col_name_to_block_idx = col_name_to_block_idx; - auto* orc_reader = static_cast(_file_format_reader.get()); - RETURN_IF_ERROR(orc_reader->get_file_type(&_data_file_type_desc)); - std::vector data_file_col_names; - std::vector data_file_col_types; - RETURN_IF_ERROR(orc_reader->get_parsed_schema(&data_file_col_names, &data_file_col_types)); - if (_row_lineage_columns != nullptr) { - const auto& table_desc = _range.table_format_params.iceberg_params; - _row_lineage_columns->first_row_id = - table_desc.__isset.first_row_id ? table_desc.first_row_id : -1; - _row_lineage_columns->last_updated_sequence_number = - table_desc.__isset.last_updated_sequence_number - ? 
table_desc.last_updated_sequence_number - : -1; - orc_reader->set_row_lineage_columns(_row_lineage_columns); +// ============================================================================ +// IcebergParquetReader: _read_position_delete_file +// ============================================================================ +Status IcebergParquetReader::_read_position_delete_file(const TFileRangeDesc* delete_range, + DeleteFile* position_delete) { + ParquetReader parquet_delete_reader(get_profile(), get_scan_params(), *delete_range, + READ_DELETE_FILE_BATCH_SIZE, &get_state()->timezone_obj(), + get_io_ctx(), get_state(), _meta_cache); + // The delete file range has size=-1 (read whole file). We must disable + // row group filtering before init; otherwise _do_init_reader returns EndOfFile + // when _filter_groups && _range_size < 0. + ParquetInitContext delete_ctx; + delete_ctx.filter_groups = false; + delete_ctx.column_names = delete_file_col_names; + delete_ctx.col_name_to_block_idx = + const_cast*>(&DELETE_COL_NAME_TO_BLOCK_IDX); + RETURN_IF_ERROR(parquet_delete_reader.init_reader(&delete_ctx)); + + const tparquet::FileMetaData* meta_data = parquet_delete_reader.get_meta_data(); + bool dictionary_coded = true; + for (const auto& row_group : meta_data->row_groups) { + const auto& column_chunk = row_group.columns[ICEBERG_FILE_PATH_INDEX]; + if (!(column_chunk.__isset.meta_data && has_dict_page(column_chunk.meta_data))) { + dictionary_coded = false; + break; + } } + DataTypePtr data_type_file_path {new DataTypeString}; + DataTypePtr data_type_pos {new DataTypeInt64}; + bool eof = false; + while (!eof) { + Block block = {dictionary_coded + ? 
ColumnWithTypeAndName {ColumnDictI32::create( + FieldType::OLAP_FIELD_TYPE_VARCHAR), + data_type_file_path, ICEBERG_FILE_PATH} + : ColumnWithTypeAndName {data_type_file_path, ICEBERG_FILE_PATH}, - auto column_id_result = _create_column_ids(_data_file_type_desc, tuple_descriptor); - auto& column_ids = column_id_result.column_ids; - const auto& filter_column_ids = column_id_result.filter_column_ids; + {data_type_pos, ICEBERG_ROW_POS}}; + size_t read_rows = 0; + RETURN_IF_ERROR(parquet_delete_reader.get_next_block(&block, &read_rows, &eof)); - RETURN_IF_ERROR(init_row_filters()); + if (read_rows <= 0) { + break; + } + _gen_position_delete_file_range(block, position_delete, read_rows, dictionary_coded); + } + return Status::OK(); +}; - _all_required_col_names = file_col_names; - if (!_params.__isset.history_schema_info || _params.history_schema_info.empty()) [[unlikely]] { - RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(tuple_descriptor, _data_file_type_desc, - table_info_node_ptr)); - } else { - std::set read_col_name_set(file_col_names.begin(), file_col_names.end()); +// ============================================================================ +// IcebergOrcReader: on_before_init_reader (ORC-specific schema matching) +// ============================================================================ +Status IcebergOrcReader::on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; + _file_format = Fileformat::ORC; + // Get ORC file type first (available because _create_file_reader() already ran) + const orc::Type* orc_type_ptr = nullptr; + RETURN_IF_ERROR(this->get_file_type(&orc_type_ptr)); + + // Build table_info_node by field_id or name matching. + // This must happen BEFORE column classification so we can use children_column_exists + // to check if a column exists in the file (by field ID, not name). 
+ if (!get_scan_params().__isset.history_schema_info || + get_scan_params().history_schema_info.empty()) [[unlikely]] { + RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr, + ctx->table_info_node)); + } else { bool exist_field_id = true; - for (size_t idx = 0; idx < _data_file_type_desc->getSubtypeCount(); idx++) { - if (!_data_file_type_desc->getSubtype(idx)->hasAttributeKey(ICEBERG_ORC_ATTRIBUTE)) { - exist_field_id = false; - break; - } + RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_field_id( + get_scan_params().history_schema_info.front().root_field, orc_type_ptr, + ICEBERG_ORC_ATTRIBUTE, ctx->table_info_node, exist_field_id)); + if (!exist_field_id) { + RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr, + ctx->table_info_node)); + } + } + + std::unordered_set partition_col_names; + if (ctx->range->__isset.columns_from_path_keys) { + partition_col_names.insert(ctx->range->columns_from_path_keys.begin(), + ctx->range->columns_from_path_keys.end()); + } + + // Single pass: classify columns, detect $row_id, handle partition fallback. + bool has_partition_from_path = false; + for (auto& desc : *ctx->column_descs) { + if (desc.category == ColumnCategory::SYNTHESIZED && + desc.name == BeConsts::ICEBERG_ROWID_COL) { + _need_row_id_column = true; + this->register_synthesized_column_handler(BeConsts::ICEBERG_ROWID_COL, + [this](Block* block, size_t rows) -> Status { + return _fill_iceberg_row_id(block, rows); + }); + continue; } - - const auto& table_schema = _params.history_schema_info.front().root_field; - table_info_node_ptr = std::make_shared(); - if (exist_field_id) { - // id -> table column name. columns that need read data file. 
- std::unordered_map> id_to_table_field; - for (const auto& table_field : table_schema.fields) { - auto field = table_field.field_ptr; - DCHECK(field->__isset.name); - if (!read_col_name_set.contains(field->name)) { + if (desc.category == ColumnCategory::REGULAR) { + // Partition fallback: if column is a partition key and NOT in the file + // (checked via field ID matching in table_info_node), read from path instead. + if (partition_col_names.contains(desc.name) && + !ctx->table_info_node->children_column_exists(desc.name)) { + if (config::enable_iceberg_partition_column_fallback) { + desc.category = ColumnCategory::PARTITION_KEY; + has_partition_from_path = true; continue; } - - id_to_table_field.emplace(field->id, field); } + ctx->column_names.push_back(desc.name); + } else if (desc.category == ColumnCategory::GENERATED) { + ctx->column_names.push_back(desc.name); + } + } - for (int idx = 0; idx < _data_file_type_desc->getSubtypeCount(); idx++) { - const auto& data_file_field = _data_file_type_desc->getSubtype(idx); - auto data_file_column_id = - std::stoi(data_file_field->getAttributeValue(ICEBERG_ORC_ATTRIBUTE)); - auto const& file_column_name = _data_file_type_desc->getFieldName(idx); - - if (id_to_table_field.contains(data_file_column_id)) { - const auto& table_field = id_to_table_field[data_file_column_id]; - - std::shared_ptr field_node = nullptr; - RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_field_id( - *table_field, data_file_field, ICEBERG_ORC_ATTRIBUTE, exist_field_id, - field_node)); - table_info_node_ptr->add_children(table_field->name, file_column_name, - field_node); - - _id_to_block_column_name.emplace(data_file_column_id, table_field->name); - id_to_table_field.erase(data_file_column_id); - } else if (_equality_delete_col_ids.contains(data_file_column_id)) { - // Columns that need to be read for equality delete. 
- const static std::string EQ_DELETE_PRE = "__equality_delete_column__"; - - // Construct table column names that avoid duplication with current table schema. - // As the columns currently being read may have been deleted in the latest - // table structure or have undergone a series of schema changes... - std::string table_column_name = EQ_DELETE_PRE + file_column_name; - table_info_node_ptr->add_children( - table_column_name, file_column_name, - std::make_shared()); - - _id_to_block_column_name.emplace(data_file_column_id, table_column_name); - _expand_col_names.emplace_back(table_column_name); - - auto expand_data_type = make_nullable(data_file_col_types[idx]); - _expand_columns.emplace_back( - ColumnWithTypeAndName {expand_data_type->create_column(), - expand_data_type, table_column_name}); - - _all_required_col_names.emplace_back(table_column_name); - column_ids.insert(data_file_field->getColumnId()); - } - } - for (const auto& [id, table_field] : id_to_table_field) { - table_info_node_ptr->add_not_exist_children(table_field->name); - } - } else { - if (!_equality_delete_col_ids.empty()) [[unlikely]] { - return Status::InternalError( - "Can not read missing field id data file when have equality delete"); - } - std::map file_column_idx_map; - for (int idx = 0; idx < _data_file_type_desc->getSubtypeCount(); idx++) { - auto const& file_column_name = _data_file_type_desc->getFieldName(idx); - file_column_idx_map.emplace(file_column_name, idx); + if (has_partition_from_path) { + RETURN_IF_ERROR(_extract_partition_values(*ctx->range, ctx->tuple_descriptor, + _fill_partition_values)); + } + + _all_required_col_names = ctx->column_names; + + // Create column IDs from ORC type + auto column_id_result = _create_column_ids(orc_type_ptr, ctx->tuple_descriptor); + ctx->column_ids = std::move(column_id_result.column_ids); + ctx->filter_column_ids = std::move(column_id_result.filter_column_ids); + + // Build field_id -> block_column_name mapping for equality delete filtering. 
+ for (const auto* slot : ctx->tuple_descriptor->slots()) { + _id_to_block_column_name.emplace(slot->col_unique_id(), slot->col_name()); + } + + // Process delete files (must happen before _do_init_reader so expand col IDs are included) + RETURN_IF_ERROR(_init_row_filters()); + + // Add expand column IDs for equality delete and remap expand column names + // (matching master's behavior with __equality_delete_column__ prefix) + const static std::string EQ_DELETE_PRE = "__equality_delete_column__"; + std::unordered_map field_id_to_file_col_name; + for (uint64_t i = 0; i < orc_type_ptr->getSubtypeCount(); ++i) { + std::string col_name = orc_type_ptr->getFieldName(i); + const orc::Type* sub_type = orc_type_ptr->getSubtype(i); + if (sub_type->hasAttributeKey(ICEBERG_ORC_ATTRIBUTE)) { + int fid = std::stoi(sub_type->getAttributeValue(ICEBERG_ORC_ATTRIBUTE)); + field_id_to_file_col_name[fid] = col_name; + } + } + + std::vector new_expand_col_names; + for (size_t i = 0; i < _expand_col_names.size(); ++i) { + const auto& old_name = _expand_col_names[i]; + int field_id = -1; + for (auto& [fid, name] : _id_to_block_column_name) { + if (name == old_name) { + field_id = fid; + break; } + } - for (const auto& table_field : table_schema.fields) { - DCHECK(table_field.__isset.field_ptr); - DCHECK(table_field.field_ptr->__isset.name); - const auto& table_column_name = table_field.field_ptr->name; - if (!read_col_name_set.contains(table_column_name)) { - continue; - } - if (!table_field.field_ptr->__isset.name_mapping || - table_field.field_ptr->name_mapping.size() == 0) { - return Status::DataQualityError( - "name_mapping must be set when read missing field id data file."); - } - auto have_mapping = false; - for (const auto& mapped_name : table_field.field_ptr->name_mapping) { - if (file_column_idx_map.contains(mapped_name)) { - auto file_column_idx = file_column_idx_map.at(mapped_name); - std::shared_ptr field_node = nullptr; - const auto& file_field = 
_data_file_type_desc->getSubtype(file_column_idx); - RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_field_id( - *table_field.field_ptr, file_field, ICEBERG_ORC_ATTRIBUTE, - exist_field_id, field_node)); - table_info_node_ptr->add_children( - table_column_name, - _data_file_type_desc->getFieldName(file_column_idx), field_node); - have_mapping = true; - break; - } - } - if (!have_mapping) { - table_info_node_ptr->add_not_exist_children(table_column_name); + std::string file_col_name = old_name; + auto it = field_id_to_file_col_name.find(field_id); + if (it != field_id_to_file_col_name.end()) { + file_col_name = it->second; + } + + std::string table_col_name = EQ_DELETE_PRE + file_col_name; + + if (field_id >= 0) { + _id_to_block_column_name[field_id] = table_col_name; + } + if (i < _expand_columns.size()) { + _expand_columns[i].name = table_col_name; + } + new_expand_col_names.push_back(table_col_name); + + // Add column IDs + if (it != field_id_to_file_col_name.end()) { + for (uint64_t j = 0; j < orc_type_ptr->getSubtypeCount(); ++j) { + const orc::Type* sub_type = orc_type_ptr->getSubtype(j); + if (orc_type_ptr->getFieldName(j) == file_col_name) { + ctx->column_ids.insert(sub_type->getColumnId()); + break; } } } + + ctx->column_names.push_back(table_col_name); + ctx->table_info_node->add_children(table_col_name, file_col_name, + TableSchemaChangeHelper::ConstNode::get_instance()); } + _expand_col_names = std::move(new_expand_col_names); - return orc_reader->init_reader(&_all_required_col_names, _col_name_to_block_idx, conjuncts, - false, tuple_descriptor, row_descriptor, - not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts, - table_info_node_ptr, column_ids, filter_column_ids); + return Status::OK(); } +// ============================================================================ +// IcebergOrcReader: _create_column_ids +// ============================================================================ ColumnIdResult IcebergOrcReader::_create_column_ids(const 
orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) { - // map top-level table column iceberg_id -> orc::Type* std::unordered_map iceberg_id_to_orc_type_map; for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) { auto orc_sub_type = orc_type->getSubtype(i); if (!orc_sub_type) continue; - if (!orc_sub_type->hasAttributeKey(ICEBERG_ORC_ATTRIBUTE)) { continue; } @@ -838,7 +561,6 @@ ColumnIdResult IcebergOrcReader::_create_column_ids(const orc::Type* orc_type, std::set column_ids; std::set filter_column_ids; - // helper to process access paths for a given top-level orc field auto process_access_paths = [](const orc::Type* orc_field, const std::vector& access_paths, std::set& out_ids) { @@ -852,12 +574,10 @@ ColumnIdResult IcebergOrcReader::_create_column_ids(const orc::Type* orc_type, for (const auto* slot : tuple_descriptor->slots()) { auto it = iceberg_id_to_orc_type_map.find(slot->col_unique_id()); if (it == iceberg_id_to_orc_type_map.end()) { - // Column not found in file continue; } const orc::Type* orc_field = it->second; - // primitive (non-nested) types if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { column_ids.insert(orc_field->getColumnId()); @@ -867,7 +587,6 @@ ColumnIdResult IcebergOrcReader::_create_column_ids(const orc::Type* orc_type, continue; } - // complex types const auto& all_access_paths = slot->all_access_paths(); process_access_paths(orc_field, all_access_paths, column_ids); @@ -880,378 +599,33 @@ ColumnIdResult IcebergOrcReader::_create_column_ids(const orc::Type* orc_type, return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); } -// Directly read the deletion vector using the `content_offset` and -// `content_size_in_bytes` provided by FE in `delete_file_desc`. -// These two fields indicate the location of a blob in storage. 
-// Since the current format is `deletion-vector-v1`, which does not -// compress any blobs, we can temporarily skip parsing the Puffin footer. -Status IcebergTableReader::read_deletion_vector(const std::string& data_file_path, - const TIcebergDeleteFileDesc& delete_file_desc) { - Status create_status = Status::OK(); - SCOPED_TIMER(_iceberg_profile.delete_files_read_time); - _iceberg_delete_rows = _kv_cache->get(data_file_path, [&]() -> DeleteRows* { - auto* delete_rows = new DeleteRows; - - TFileRangeDesc delete_range; - // must use __set() method to make sure __isset is true - delete_range.__set_fs_name(_range.fs_name); - delete_range.path = delete_file_desc.path; - delete_range.start_offset = delete_file_desc.content_offset; - delete_range.size = delete_file_desc.content_size_in_bytes; - delete_range.file_size = -1; - - // We may consider caching the DeletionVectorReader when reading Puffin files, - // where the underlying reader is an `InMemoryFileReader` and a single data file is - // split into multiple splits. However, we need to ensure that the underlying - // reader supports multi-threaded access. 
- DeletionVectorReader dv_reader(_state, _profile, _params, delete_range, _io_ctx); - create_status = dv_reader.open(); - if (!create_status.ok()) [[unlikely]] { - return nullptr; - } - - size_t buffer_size = delete_range.size; - std::vector buf(buffer_size); - if (buffer_size < 12) [[unlikely]] { - // Minimum size: 4 bytes length + 4 bytes magic + 4 bytes CRC32 - create_status = Status::DataQualityError("Deletion vector file size too small: {}", - buffer_size); - return nullptr; - } - - create_status = dv_reader.read_at(delete_range.start_offset, {buf.data(), buffer_size}); - if (!create_status) [[unlikely]] { - return nullptr; - } - // The serialized blob contains: - // - // Combined length of the vector and magic bytes stored as 4 bytes, big-endian - // A 4-byte magic sequence, D1 D3 39 64 - // The vector, serialized as described below - // A CRC-32 checksum of the magic bytes and serialized vector as 4 bytes, big-endian - - auto total_length = BigEndian::Load32(buf.data()); - if (total_length + 8 != buffer_size) [[unlikely]] { - create_status = Status::DataQualityError( - "Deletion vector length mismatch, expected: {}, actual: {}", total_length + 8, - buffer_size); - return nullptr; - } - - constexpr static char MAGIC_NUMBER[] = {'\xD1', '\xD3', '\x39', '\x64'}; - if (memcmp(buf.data() + sizeof(total_length), MAGIC_NUMBER, 4)) [[unlikely]] { - create_status = Status::DataQualityError("Deletion vector magic number mismatch"); - return nullptr; - } - - roaring::Roaring64Map bitmap; - SCOPED_TIMER(_iceberg_profile.parse_delete_file_time); - try { - bitmap = roaring::Roaring64Map::readSafe(buf.data() + 8, buffer_size - 12); - } catch (const std::runtime_error& e) { - create_status = Status::DataQualityError("Decode roaring bitmap failed, {}", e.what()); - return nullptr; - } - // skip CRC-32 checksum - - delete_rows->reserve(bitmap.cardinality()); - for (auto it = bitmap.begin(); it != bitmap.end(); it++) { - delete_rows->push_back(*it); - } - 
COUNTER_UPDATE(_iceberg_profile.num_delete_rows, delete_rows->size()); - return delete_rows; - }); - - RETURN_IF_ERROR(create_status); - if (!_iceberg_delete_rows->empty()) [[likely]] { - set_delete_rows(); - } - return Status::OK(); -} - -// Similar to the code structure of IcebergOrcReader::_process_equality_delete, -// but considering the significant differences in how parquet/orc obtains -// attributes/column IDs, it is not easy to combine them. -Status IcebergParquetReader::_process_equality_delete( - const std::vector& delete_files) { - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - - std::map data_file_id_to_field_schema; - for (int idx = 0; idx < _data_file_field_desc->size(); ++idx) { - auto field_schema = _data_file_field_desc->get_column(idx); - if (_data_file_field_desc->get_column(idx)->field_id == -1) { - return Status::DataQualityError("Iceberg equality delete data file missing field id."); - } - data_file_id_to_field_schema[_data_file_field_desc->get_column(idx)->field_id] = - field_schema; - } - - for (const auto& delete_file : delete_files) { - TFileRangeDesc delete_desc; - // must use __set() method to make sure __isset is true - delete_desc.__set_fs_name(_range.fs_name); - delete_desc.path = delete_file.path; - delete_desc.start_offset = 0; - delete_desc.size = -1; - delete_desc.file_size = -1; - - if (!delete_file.__isset.field_ids) [[unlikely]] { - return Status::InternalError( - "missing delete field ids when reading equality delete file"); - } - auto& read_column_field_ids = delete_file.field_ids; - std::set read_column_field_ids_set; - for (const auto& field_id : read_column_field_ids) { - read_column_field_ids_set.insert(field_id); - _equality_delete_col_ids.insert(field_id); - } - - auto delete_reader = ParquetReader::create_unique( - _profile, _params, delete_desc, READ_DELETE_FILE_BATCH_SIZE, - &_state->timezone_obj(), _io_ctx, _state, _meta_cache); - 
RETURN_IF_ERROR(delete_reader->init_schema_reader()); - - // the column that to read equality delete file. - // (delete file may be have extra columns that don't need to read) - std::vector delete_col_names; - std::vector delete_col_types; - std::vector delete_col_ids; - std::unordered_map delete_col_name_to_block_idx; - - const FieldDescriptor* delete_field_desc = nullptr; - RETURN_IF_ERROR(delete_reader->get_file_metadata_schema(&delete_field_desc)); - DCHECK(delete_field_desc != nullptr); - - auto eq_file_node = std::make_shared(); - for (const auto& delete_file_field : delete_field_desc->get_fields_schema()) { - if (delete_file_field.field_id == -1) [[unlikely]] { // missing delete_file_field id - // equality delete file must have delete_file_field id to match column. - return Status::DataQualityError( - "missing delete_file_field id when reading equality delete file"); - } else if (read_column_field_ids_set.contains(delete_file_field.field_id)) { - // the column that need to read. - if (delete_file_field.children.size() > 0) [[unlikely]] { // complex column - return Status::InternalError( - "can not support read complex column in equality delete file"); - } else if (!data_file_id_to_field_schema.contains(delete_file_field.field_id)) - [[unlikely]] { - return Status::DataQualityError( - "can not find delete field id in data file schema when reading " - "equality delete file"); - } - auto data_file_field = data_file_id_to_field_schema[delete_file_field.field_id]; - if (data_file_field->data_type->get_primitive_type() != - delete_file_field.data_type->get_primitive_type()) [[unlikely]] { - return Status::NotSupported( - "Not Support type change in equality delete, field: {}, delete " - "file type: {}, data file type: {}", - delete_file_field.field_id, delete_file_field.data_type->get_name(), - data_file_field->data_type->get_name()); - } - - std::string filed_lower_name = to_lower(delete_file_field.name); - eq_file_node->add_children(filed_lower_name, 
delete_file_field.name, - std::make_shared()); - - delete_col_ids.emplace_back(delete_file_field.field_id); - delete_col_names.emplace_back(filed_lower_name); - delete_col_types.emplace_back(make_nullable(delete_file_field.data_type)); - - read_column_field_ids_set.erase(delete_file_field.field_id); - } else { - // delete file may be have extra columns that don't need to read - } - } - if (!read_column_field_ids_set.empty()) [[unlikely]] { - return Status::DataQualityError("some field ids not found in equality delete file."); - } - - for (uint32_t idx = 0; idx < delete_col_names.size(); ++idx) { - delete_col_name_to_block_idx[delete_col_names[idx]] = idx; - } - phmap::flat_hash_map>> tmp; - RETURN_IF_ERROR(delete_reader->init_reader(delete_col_names, &delete_col_name_to_block_idx, - {}, tmp, nullptr, nullptr, nullptr, nullptr, - nullptr, eq_file_node, false)); - RETURN_IF_ERROR(delete_reader->set_fill_columns(partition_columns, missing_columns)); - - if (!_equality_delete_block_map.contains(delete_col_ids)) { - _equality_delete_block_map.emplace(delete_col_ids, _equality_delete_blocks.size()); - Block block; - _generate_equality_delete_block(&block, delete_col_names, delete_col_types); - _equality_delete_blocks.emplace_back(block); - } - Block& eq_file_block = _equality_delete_blocks[_equality_delete_block_map[delete_col_ids]]; - bool eof = false; - while (!eof) { - Block tmp_block; - _generate_equality_delete_block(&tmp_block, delete_col_names, delete_col_types); - size_t read_rows = 0; - RETURN_IF_ERROR(delete_reader->get_next_block(&tmp_block, &read_rows, &eof)); - if (read_rows > 0) { - MutableBlock mutable_block(&eq_file_block); - RETURN_IF_ERROR(mutable_block.merge(tmp_block)); - } - } - } - - for (const auto& [delete_col_ids, block_idx] : _equality_delete_block_map) { - auto& eq_file_block = _equality_delete_blocks[block_idx]; - auto equality_delete_impl = - EqualityDeleteBase::get_delete_impl(&eq_file_block, delete_col_ids); - 
RETURN_IF_ERROR(equality_delete_impl->init(_profile)); - _equality_delete_impls.emplace_back(std::move(equality_delete_impl)); +// ============================================================================ +// IcebergOrcReader: _read_position_delete_file +// ============================================================================ +Status IcebergOrcReader::_read_position_delete_file(const TFileRangeDesc* delete_range, + DeleteFile* position_delete) { + OrcReader orc_delete_reader(get_profile(), get_state(), get_scan_params(), *delete_range, + READ_DELETE_FILE_BATCH_SIZE, get_state()->timezone(), get_io_ctx(), + _meta_cache); + OrcInitContext delete_ctx; + delete_ctx.column_names = delete_file_col_names; + delete_ctx.col_name_to_block_idx = + const_cast*>(&DELETE_COL_NAME_TO_BLOCK_IDX); + RETURN_IF_ERROR(orc_delete_reader.init_reader(&delete_ctx)); + + bool eof = false; + DataTypePtr data_type_file_path {new DataTypeString}; + DataTypePtr data_type_pos {new DataTypeInt64}; + while (!eof) { + Block block = {{data_type_file_path, ICEBERG_FILE_PATH}, {data_type_pos, ICEBERG_ROW_POS}}; + + size_t read_rows = 0; + RETURN_IF_ERROR(orc_delete_reader.get_next_block(&block, &read_rows, &eof)); + + _gen_position_delete_file_range(block, position_delete, read_rows, false); } return Status::OK(); } -Status IcebergOrcReader::_process_equality_delete( - const std::vector& delete_files) { - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - - std::map data_file_id_to_field_idx; - for (int idx = 0; idx < _data_file_type_desc->getSubtypeCount(); ++idx) { - if (!_data_file_type_desc->getSubtype(idx)->hasAttributeKey(ICEBERG_ORC_ATTRIBUTE)) { - return Status::DataQualityError("Iceberg equality delete data file missing field id."); - } - auto field_id = std::stoi( - _data_file_type_desc->getSubtype(idx)->getAttributeValue(ICEBERG_ORC_ATTRIBUTE)); - data_file_id_to_field_idx[field_id] = idx; - } - - for (const auto& delete_file : delete_files) { - 
TFileRangeDesc delete_desc; - // must use __set() method to make sure __isset is true - delete_desc.__set_fs_name(_range.fs_name); - delete_desc.path = delete_file.path; - delete_desc.start_offset = 0; - delete_desc.size = -1; - delete_desc.file_size = -1; - - if (!delete_file.__isset.field_ids) [[unlikely]] { - return Status::InternalError( - "missing delete field ids when reading equality delete file"); - } - auto& read_column_field_ids = delete_file.field_ids; - std::set read_column_field_ids_set; - for (const auto& field_id : read_column_field_ids) { - read_column_field_ids_set.insert(field_id); - _equality_delete_col_ids.insert(field_id); - } - - auto delete_reader = OrcReader::create_unique(_profile, _state, _params, delete_desc, - READ_DELETE_FILE_BATCH_SIZE, - _state->timezone(), _io_ctx, _meta_cache); - RETURN_IF_ERROR(delete_reader->init_schema_reader()); - // delete file schema - std::vector delete_file_col_names; - std::vector delete_file_col_types; - RETURN_IF_ERROR( - delete_reader->get_parsed_schema(&delete_file_col_names, &delete_file_col_types)); - - // the column that to read equality delete file. - // (delete file maybe have extra columns that don't need to read) - std::vector delete_col_names; - std::vector delete_col_types; - std::vector delete_col_ids; - std::unordered_map delete_col_name_to_block_idx; - - const orc::Type* delete_field_desc = nullptr; - RETURN_IF_ERROR(delete_reader->get_file_type(&delete_field_desc)); - DCHECK(delete_field_desc != nullptr); - - auto eq_file_node = std::make_shared(); - - for (size_t idx = 0; idx < delete_field_desc->getSubtypeCount(); idx++) { - auto delete_file_field = delete_field_desc->getSubtype(idx); - - if (!delete_file_field->hasAttributeKey(ICEBERG_ORC_ATTRIBUTE)) - [[unlikely]] { // missing delete_file_field id - // equality delete file must have delete_file_field id to match column. 
- return Status::DataQualityError( - "missing delete_file_field id when reading equality delete file"); - } else { - auto delete_field_id = - std::stoi(delete_file_field->getAttributeValue(ICEBERG_ORC_ATTRIBUTE)); - if (read_column_field_ids_set.contains(delete_field_id)) { - // the column that need to read. - if (is_complex_type(delete_file_col_types[idx]->get_primitive_type())) - [[unlikely]] { - return Status::InternalError( - "can not support read complex column in equality delete file."); - } else if (!data_file_id_to_field_idx.contains(delete_field_id)) [[unlikely]] { - return Status::DataQualityError( - "can not find delete field id in data file schema when reading " - "equality delete file"); - } - - auto data_file_field = _data_file_type_desc->getSubtype( - data_file_id_to_field_idx[delete_field_id]); - - if (delete_file_field->getKind() != data_file_field->getKind()) [[unlikely]] { - return Status::NotSupported( - "Not Support type change in equality delete, field: {}, delete " - "file type: {}, data file type: {}", - delete_field_id, delete_file_field->getKind(), - data_file_field->getKind()); - } - std::string filed_lower_name = to_lower(delete_field_desc->getFieldName(idx)); - eq_file_node->add_children( - filed_lower_name, delete_field_desc->getFieldName(idx), - std::make_shared()); - - delete_col_ids.emplace_back(delete_field_id); - delete_col_names.emplace_back(filed_lower_name); - delete_col_types.emplace_back(make_nullable(delete_file_col_types[idx])); - read_column_field_ids_set.erase(delete_field_id); - } - } - } - if (!read_column_field_ids_set.empty()) [[unlikely]] { - return Status::DataQualityError("some field ids not found in equality delete file."); - } - - for (uint32_t idx = 0; idx < delete_col_names.size(); ++idx) { - delete_col_name_to_block_idx[delete_col_names[idx]] = idx; - } - - RETURN_IF_ERROR(delete_reader->init_reader(&delete_col_names, &delete_col_name_to_block_idx, - {}, false, nullptr, nullptr, nullptr, nullptr, - 
eq_file_node)); - RETURN_IF_ERROR(delete_reader->set_fill_columns(partition_columns, missing_columns)); - - if (!_equality_delete_block_map.contains(delete_col_ids)) { - _equality_delete_block_map.emplace(delete_col_ids, _equality_delete_blocks.size()); - Block block; - _generate_equality_delete_block(&block, delete_col_names, delete_col_types); - _equality_delete_blocks.emplace_back(block); - } - Block& eq_file_block = _equality_delete_blocks[_equality_delete_block_map[delete_col_ids]]; - bool eof = false; - while (!eof) { - Block tmp_block; - _generate_equality_delete_block(&tmp_block, delete_col_names, delete_col_types); - size_t read_rows = 0; - RETURN_IF_ERROR(delete_reader->get_next_block(&tmp_block, &read_rows, &eof)); - if (read_rows > 0) { - MutableBlock mutable_block(&eq_file_block); - RETURN_IF_ERROR(mutable_block.merge(tmp_block)); - } - } - } - - for (const auto& [delete_col_ids, block_idx] : _equality_delete_block_map) { - auto& eq_file_block = _equality_delete_blocks[block_idx]; - auto equality_delete_impl = - EqualityDeleteBase::get_delete_impl(&eq_file_block, delete_col_ids); - RETURN_IF_ERROR(equality_delete_impl->init(_profile)); - _equality_delete_impls.emplace_back(std::move(equality_delete_impl)); - } - return Status::OK(); -} #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/format/table/iceberg_reader.h b/be/src/format/table/iceberg_reader.h index e8f33c9ee29006..d21c661f207ad7 100644 --- a/be/src/format/table/iceberg_reader.h +++ b/be/src/format/table/iceberg_reader.h @@ -26,15 +26,18 @@ #include #include "common/status.h" -#include "exprs/vslot_ref.h" +#include "core/column/column_dictionary.h" +#include "core/data_type/define_primitive_type.h" +#include "core/data_type/primitive_type.h" +#include "core/types.h" #include "format/orc/vorc_reader.h" #include "format/parquet/vparquet_reader.h" -#include "format/table/equality_delete.h" -#include "format/table/table_format_reader.h" -#include 
"storage/olap_scan_common.h" +#include "format/table/iceberg_reader_mixin.h" +#include "storage/olap_common.h" namespace tparquet { class KeyValue; +class ColumnMetaData; } // namespace tparquet namespace doris { @@ -70,191 +73,92 @@ struct RowLineageColumns { } }; -class IcebergTableReader : public TableFormatReader, public TableSchemaChangeHelper { -public: +struct IcebergTableReader { static constexpr const char* ROW_LINEAGE_ROW_ID = "_row_id"; static constexpr const char* ROW_LINEAGE_LAST_UPDATED_SEQ_NUMBER = "_last_updated_sequence_number"; - IcebergTableReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, - RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, ShardedKVCache* kv_cache, io::IOContext* io_ctx, - FileMetaCache* meta_cache); - ~IcebergTableReader() override = default; - - void set_need_row_id_column(bool need) { _need_row_id_column = need; } - bool need_row_id_column() const { return _need_row_id_column; } - void set_row_id_column_position(int position) { _row_id_column_position = position; } - - Status init_row_filters() final; - - Status get_next_block_inner(Block* block, size_t* read_rows, bool* eof) final; - - enum { DATA, POSITION_DELETE, EQUALITY_DELETE, DELETION_VECTOR }; - enum Fileformat { NONE, PARQUET, ORC, AVRO }; - - virtual void set_delete_rows() = 0; - - bool has_delete_operations() const override { - return _equality_delete_impls.size() > 0 || TableFormatReader::has_delete_operations(); - } - - Status read_deletion_vector(const std::string& data_file_path, - const TIcebergDeleteFileDesc& delete_file_desc); - - void set_row_lineage_columns(std::shared_ptr row_lineage_columns) { - _row_lineage_columns = std::move(row_lineage_columns); - } - static bool _is_fully_dictionary_encoded(const tparquet::ColumnMetaData& column_metadata); - -protected: - struct IcebergProfile { - RuntimeProfile::Counter* num_delete_files; - RuntimeProfile::Counter* num_delete_rows; - RuntimeProfile::Counter* 
delete_files_read_time; - RuntimeProfile::Counter* delete_rows_sort_time; - RuntimeProfile::Counter* parse_delete_file_time; - }; - using DeleteRows = std::vector; - using DeleteFile = phmap::parallel_flat_hash_map< - std::string, std::unique_ptr, std::hash, std::equal_to<>, - std::allocator>>, 8, - std::mutex>; - - // $row_id metadata column generation state - bool _need_row_id_column = false; - int _row_id_column_position = -1; - /** - * https://iceberg.apache.org/spec/#position-delete-files - * The rows in the delete file must be sorted by file_path then position to optimize filtering rows while scanning. - * Sorting by file_path allows filter pushdown by file in columnar storage formats. - * Sorting by position allows filtering rows while scanning, to avoid keeping deletes in memory. - */ - static void _sort_delete_rows(const std::vector*>& delete_rows_array, - int64_t num_delete_rows, std::vector& result); - - static std::string _delet_file_cache_key(const std::string& path) { return "delete_" + path; } - - Status _position_delete_base(const std::string data_file_path, - const std::vector& delete_files); - virtual Status _process_equality_delete( - const std::vector& delete_files) = 0; - void _generate_equality_delete_block(Block* block, - const std::vector& equality_delete_col_names, - const std::vector& equality_delete_col_types); - // Equality delete should read the primary columns. 
Add the missing columns - Status _expand_block_if_need(Block* block); - // Remove the added delete columns - Status _shrink_block_if_need(Block* block); - - // owned by scan node - ShardedKVCache* _kv_cache; - IcebergProfile _iceberg_profile; - // _iceberg_delete_rows from kv_cache - const std::vector* _iceberg_delete_rows = nullptr; - - // Pointer to external column name to block index mapping (from FileScanner) - // Used to dynamically add expand columns for equality delete - std::unordered_map* _col_name_to_block_idx = nullptr; - - Fileformat _file_format = Fileformat::NONE; - - const int64_t MIN_SUPPORT_DELETE_FILES_VERSION = 2; - const int READ_DELETE_FILE_BATCH_SIZE = 102400; - - // Read a position delete file from the full Iceberg delete descriptor. - Status _read_position_delete_file(const TIcebergDeleteFileDesc&, DeleteFile*); - - // read table colummn + extra equality delete columns - std::vector _all_required_col_names; - - // extra equality delete name and type - std::vector _expand_col_names; - std::vector _expand_columns; - - // all ids that need read for eq delete (from all qe delte file.) - std::set _equality_delete_col_ids; - // eq delete column ids -> location of _equality_delete_blocks / _equality_delete_impls - std::map, int> _equality_delete_block_map; - // EqualityDeleteBase stores raw pointers to these blocks, so do not modify this vector after - // creating entries in _equality_delete_impls. - std::vector _equality_delete_blocks; - std::vector> _equality_delete_impls; - - // id -> block column name. 
- std::unordered_map _id_to_block_column_name; - - std::shared_ptr _row_lineage_columns; }; -class IcebergParquetReader final : public IcebergTableReader { +// IcebergParquetReader: inherits ParquetReader via IcebergReaderMixin CRTP +class IcebergParquetReader final : public IcebergReaderMixin { public: ENABLE_FACTORY_CREATOR(IcebergParquetReader); - IcebergParquetReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, - RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, ShardedKVCache* kv_cache, - io::IOContext* io_ctx, FileMetaCache* meta_cache) - : IcebergTableReader(std::move(file_format_reader), profile, state, params, range, - kv_cache, io_ctx, meta_cache) {} - Status init_reader( - const std::vector& file_col_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, - phmap::flat_hash_map>>& - slot_id_to_predicates, - const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts); + IcebergParquetReader(ShardedKVCache* kv_cache, RuntimeProfile* profile, + const TFileScanRangeParams& params, const TFileRangeDesc& range, + size_t batch_size, const cctz::time_zone* ctz, io::IOContext* io_ctx, + RuntimeState* state, FileMetaCache* meta_cache) + : IcebergReaderMixin(kv_cache, profile, params, range, batch_size, ctz, + io_ctx, state, meta_cache) {} void set_delete_rows() final { - auto* parquet_reader = (ParquetReader*)(_file_format_reader.get()); - parquet_reader->set_delete_rows(_iceberg_delete_rows); + LOG(INFO) << "[PosDeleteDebug] IcebergParquetReader::set_delete_rows: _iceberg_delete_rows=" + << (_iceberg_delete_rows + ? 
"set(" + std::to_string(_iceberg_delete_rows->size()) + ")" + : "null"); + // Call ParquetReader's set_delete_rows(const vector*) + ParquetReader::set_delete_rows(_iceberg_delete_rows); + } + +protected: + // Parquet-specific schema matching via on_before_init_reader hook + Status on_before_init_reader(ReaderInitContext* ctx) override; + + std::unique_ptr _create_equality_reader( + const TFileRangeDesc& delete_desc) final { + return ParquetReader::create_unique(this->get_profile(), this->get_scan_params(), + delete_desc, READ_DELETE_FILE_BATCH_SIZE, + &this->get_state()->timezone_obj(), this->get_io_ctx(), + this->get_state(), this->_meta_cache); } -private: static ColumnIdResult _create_column_ids(const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor); - Status _process_equality_delete(const std::vector& delete_files) final; - const FieldDescriptor* _data_file_field_desc = nullptr; +private: + Status _read_position_delete_file(const TFileRangeDesc* delete_range, + DeleteFile* position_delete) final; }; -class IcebergOrcReader final : public IcebergTableReader { + +// IcebergOrcReader: inherits OrcReader via IcebergReaderMixin CRTP +class IcebergOrcReader final : public IcebergReaderMixin { public: ENABLE_FACTORY_CREATOR(IcebergOrcReader); - IcebergOrcReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, - RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, ShardedKVCache* kv_cache, io::IOContext* io_ctx, + IcebergOrcReader(ShardedKVCache* kv_cache, RuntimeProfile* profile, RuntimeState* state, + const TFileScanRangeParams& params, const TFileRangeDesc& range, + size_t batch_size, const std::string& ctz, io::IOContext* io_ctx, FileMetaCache* meta_cache) - : IcebergTableReader(std::move(file_format_reader), profile, state, params, range, - kv_cache, io_ctx, meta_cache) {} + : IcebergReaderMixin(kv_cache, profile, state, params, range, batch_size, + ctz, io_ctx, meta_cache) {} void 
set_delete_rows() final { - auto* orc_reader = (OrcReader*)_file_format_reader.get(); - orc_reader->set_position_delete_rowids(_iceberg_delete_rows); + // Call OrcReader's set_position_delete_rowids + this->set_position_delete_rowids(_iceberg_delete_rows); } - Status init_reader( - const std::vector& file_col_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts); - -private: - Status _process_equality_delete(const std::vector& delete_files) final; +protected: + // ORC-specific schema matching via on_before_init_reader hook + Status on_before_init_reader(ReaderInitContext* ctx) override; + + std::unique_ptr _create_equality_reader( + const TFileRangeDesc& delete_desc) override { + return OrcReader::create_unique(this->get_profile(), this->get_state(), + this->get_scan_params(), delete_desc, + READ_DELETE_FILE_BATCH_SIZE, this->get_state()->timezone(), + this->get_io_ctx(), this->_meta_cache); + } static ColumnIdResult _create_column_ids(const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor); -private: static const std::string ICEBERG_ORC_ATTRIBUTE; - const orc::Type* _data_file_type_desc = nullptr; + +private: + Status _read_position_delete_file(const TFileRangeDesc* delete_range, + DeleteFile* position_delete) final; }; #include "common/compile_check_end.h" diff --git a/be/src/format/table/iceberg_reader_mixin.h b/be/src/format/table/iceberg_reader_mixin.h new file mode 100644 index 00000000000000..f598757ab20f01 --- /dev/null +++ b/be/src/format/table/iceberg_reader_mixin.h @@ -0,0 +1,899 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "common/consts.h" +#include "common/status.h" +#include "core/block/block.h" +#include "core/column/column_dictionary.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/column/column_struct.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "format/generic_reader.h" +#include "format/table/deletion_vector_reader.h" +#include "format/table/equality_delete.h" +#include "format/table/table_schema_change_helper.h" +#include "runtime/runtime_profile.h" +#include "runtime/runtime_state.h" +#include "storage/olap_common.h" + +namespace doris { +class TIcebergDeleteFileDesc; +} // namespace doris + +namespace doris { +#include "common/compile_check_begin.h" + +class ShardedKVCache; + +// CRTP mixin for Iceberg reader functionality. +// BaseReader should be ParquetReader or OrcReader. +// Inherits BaseReader + TableSchemaChangeHelper, providing shared Iceberg logic +// (delete files, deletion vectors, equality delete, $row_id synthesis). 
+// +// Inheritance chain: +// IcebergParquetReader -> IcebergReaderMixin -> ParquetReader -> GenericReader +// IcebergOrcReader -> IcebergReaderMixin -> OrcReader -> GenericReader +template +class IcebergReaderMixin : public BaseReader, public TableSchemaChangeHelper { +public: + struct PositionDeleteRange { + std::vector data_file_path; + std::vector> range; + }; + + // Forward BaseReader constructor arguments + Iceberg-specific kv_cache + template + IcebergReaderMixin(ShardedKVCache* kv_cache, Args&&... args) + : BaseReader(std::forward(args)...), _kv_cache(kv_cache) { + // Initialize table-level row count from scan range (replicates master's + // GenericReader constructor logic). + const auto& range = this->get_scan_range(); + if (range.table_format_params.__isset.table_level_row_count) { + _table_level_row_count = range.table_format_params.table_level_row_count; + } + + static const char* iceberg_profile = "IcebergProfile"; + ADD_TIMER(this->get_profile(), iceberg_profile); + _iceberg_profile.num_delete_files = ADD_CHILD_COUNTER(this->get_profile(), "NumDeleteFiles", + TUnit::UNIT, iceberg_profile); + _iceberg_profile.num_delete_rows = ADD_CHILD_COUNTER(this->get_profile(), "NumDeleteRows", + TUnit::UNIT, iceberg_profile); + _iceberg_profile.delete_files_read_time = + ADD_CHILD_TIMER(this->get_profile(), "DeleteFileReadTime", iceberg_profile); + _iceberg_profile.delete_rows_sort_time = + ADD_CHILD_TIMER(this->get_profile(), "DeleteRowsSortTime", iceberg_profile); + _iceberg_profile.parse_delete_file_time = + ADD_CHILD_TIMER(this->get_profile(), "ParseDeleteFileTime", iceberg_profile); + } + + ~IcebergReaderMixin() override = default; + + void set_current_file_info(const std::string& file_path, int32_t partition_spec_id, + const std::string& partition_data_json) { + _current_file_path = file_path; + _partition_spec_id = partition_spec_id; + _partition_data_json = partition_data_json; + } + + enum { DATA, POSITION_DELETE, EQUALITY_DELETE, DELETION_VECTOR }; + 
enum Fileformat { NONE, PARQUET, ORC, AVRO }; + + virtual void set_delete_rows() = 0; + + // Replicates master's GenericReader::get_next_block(). + // Handles table-level count pushdown by returning FE-provided _table_level_row_count + // directly (without reading any files). For all other cases, resets push_down_agg_type + // to NONE and delegates to the base reader — because in master, ParquetReader/OrcReader + // never had _push_down_agg_type == COUNT (it was absorbed by the GenericReader layer). + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override { + if (this->_push_down_agg_type == TPushAggOp::type::COUNT && _table_level_row_count >= 0) { + auto rows = std::min(_table_level_row_count, (int64_t)this->get_batch_size()); + _table_level_row_count -= rows; + auto mutate_columns = block->mutate_columns(); + for (auto& col : mutate_columns) { + col->resize(rows); + } + block->set_columns(std::move(mutate_columns)); + *read_rows = rows; + if (_table_level_row_count == 0) { + *eof = true; + } + return Status::OK(); + } + this->set_push_down_agg_type(TPushAggOp::NONE); + return BaseReader::_do_get_next_block(block, read_rows, eof); + } + +protected: + // ---- Hook implementations ---- + + // Called before reading a block: expand block for equality delete columns + detect row_id + Status on_before_read_block(Block* block) override { + RETURN_IF_ERROR(_expand_block_if_need(block)); + return Status::OK(); + } + + /// Fill Iceberg $row_id synthesized column. Registered as handler during init. + Status _fill_iceberg_row_id(Block* block, size_t rows) { + int row_id_pos = block->get_position_by_name(BeConsts::ICEBERG_ROWID_COL); + if (row_id_pos < 0) { + return Status::OK(); + } + + // Lazy-init file info: only set when $row_id is actually needed. 
+ const auto& table_desc = this->get_scan_range().table_format_params.iceberg_params; + std::string file_path = table_desc.original_file_path; + int32_t partition_spec_id = + table_desc.__isset.partition_spec_id ? table_desc.partition_spec_id : 0; + std::string partition_data_json; + if (table_desc.__isset.partition_data_json) { + partition_data_json = table_desc.partition_data_json; + } + set_current_file_info(file_path, partition_spec_id, partition_data_json); + + const auto& row_ids = this->current_batch_row_positions(); + auto& col_with_type = block->get_by_position(static_cast(row_id_pos)); + MutableColumnPtr row_id_column; + RETURN_IF_ERROR(_build_iceberg_rowid_column(col_with_type.type, _current_file_path, row_ids, + _partition_spec_id, _partition_data_json, + &row_id_column)); + col_with_type.column = std::move(row_id_column); + return Status::OK(); + } + + // Called after reading a block: apply equality delete filter + shrink block + Status on_after_read_block(Block* block, size_t* read_rows) override { + if (!_equality_delete_impls.empty()) { + LOG(INFO) << "[EqDeleteDebug] on_after_read_block: block has " << block->rows() + << " rows, columns: " << block->dump_names(); + for (auto& [fid, name] : _id_to_block_column_name) { + LOG(INFO) << "[EqDeleteDebug] _id_to_block_column_name[" << fid << "] = " << name; + } + if (this->col_name_to_block_idx_ref()) { + for (auto& [name, idx] : *this->col_name_to_block_idx_ref()) { + LOG(INFO) << "[EqDeleteDebug] col_name_to_block_idx[" << name << "] = " << idx; + } + } + std::unique_ptr filter = + std::make_unique(block->rows(), 1); + for (auto& equality_delete_impl : _equality_delete_impls) { + RETURN_IF_ERROR(equality_delete_impl->filter_data_block( + block, this->col_name_to_block_idx_ref(), _id_to_block_column_name, + *filter)); + } + size_t kept = 0; + for (size_t i = 0; i < filter->size(); i++) { + if ((*filter)[i]) kept++; + } + LOG(INFO) << "[EqDeleteDebug] after filter: kept " << kept << " of " << 
block->rows(); + Block::filter_block_internal(block, *filter, block->columns()); + *read_rows = block->rows(); + } + return _shrink_block_if_need(block); + } + + // ---- Shared Iceberg methods ---- + + Status _init_row_filters(); + Status _position_delete_base(const std::string data_file_path, + const std::vector& delete_files); + Status _equality_delete_base(const std::vector& delete_files); + Status read_deletion_vector(const std::string& data_file_path, + const TIcebergDeleteFileDesc& delete_file_desc); + + Status _expand_block_if_need(Block* block); + Status _shrink_block_if_need(Block* block); + + // Type aliases — must be defined before member function declarations that use them. + using DeleteRows = std::vector; + using DeleteFile = phmap::parallel_flat_hash_map< + std::string, std::unique_ptr, std::hash, std::equal_to<>, + std::allocator>>, 8, + std::mutex>; + + PositionDeleteRange _get_range(const ColumnDictI32& file_path_column); + PositionDeleteRange _get_range(const ColumnString& file_path_column); + static void _sort_delete_rows(const std::vector*>& delete_rows_array, + int64_t num_delete_rows, std::vector& result); + void _gen_position_delete_file_range(Block& block, DeleteFile* position_delete, + size_t read_rows, bool file_path_column_dictionary_coded); + void _generate_equality_delete_block(Block* block, + const std::vector& equality_delete_col_names, + const std::vector& equality_delete_col_types); + + // Pure virtual: format-specific delete file reading + virtual Status _read_position_delete_file(const TFileRangeDesc*, DeleteFile*) = 0; + virtual std::unique_ptr _create_equality_reader( + const TFileRangeDesc& delete_desc) = 0; + + static std::string _delet_file_cache_key(const std::string& path) { return "delete_" + path; } + + /// Build the Iceberg V2 row-id struct column. 
+ static Status _build_iceberg_rowid_column(const DataTypePtr& type, const std::string& file_path, + const std::vector& row_ids, + int32_t partition_spec_id, + const std::string& partition_data_json, + MutableColumnPtr* column_out) { + if (type == nullptr || column_out == nullptr) { + return Status::InvalidArgument("Invalid iceberg rowid column type or output column"); + } + MutableColumnPtr column = type->create_column(); + ColumnNullable* nullable_col = check_and_get_column(column.get()); + ColumnStruct* struct_col = nullptr; + if (nullable_col != nullptr) { + struct_col = + check_and_get_column(nullable_col->get_nested_column_ptr().get()); + } else { + struct_col = check_and_get_column(column.get()); + } + if (struct_col == nullptr || struct_col->tuple_size() < 4) { + return Status::InternalError("Invalid iceberg rowid column structure"); + } + size_t num_rows = row_ids.size(); + auto& file_path_col = struct_col->get_column(0); + auto& row_pos_col = struct_col->get_column(1); + auto& spec_id_col = struct_col->get_column(2); + auto& partition_data_col = struct_col->get_column(3); + file_path_col.reserve(num_rows); + row_pos_col.reserve(num_rows); + spec_id_col.reserve(num_rows); + partition_data_col.reserve(num_rows); + for (size_t i = 0; i < num_rows; ++i) { + file_path_col.insert_data(file_path.data(), file_path.size()); + } + for (size_t i = 0; i < num_rows; ++i) { + int64_t row_pos = static_cast(row_ids[i]); + row_pos_col.insert_data(reinterpret_cast(&row_pos), sizeof(row_pos)); + } + for (size_t i = 0; i < num_rows; ++i) { + int32_t spec_id = partition_spec_id; + spec_id_col.insert_data(reinterpret_cast(&spec_id), sizeof(spec_id)); + } + for (size_t i = 0; i < num_rows; ++i) { + partition_data_col.insert_data(partition_data_json.data(), partition_data_json.size()); + } + if (nullable_col != nullptr) { + nullable_col->get_null_map_data().resize_fill(num_rows, 0); + } + *column_out = std::move(column); + return Status::OK(); + } + + struct IcebergProfile { + 
RuntimeProfile::Counter* num_delete_files; + RuntimeProfile::Counter* num_delete_rows; + RuntimeProfile::Counter* delete_files_read_time; + RuntimeProfile::Counter* delete_rows_sort_time; + RuntimeProfile::Counter* parse_delete_file_time; + }; + + bool _need_row_id_column = false; + std::string _current_file_path; + int32_t _partition_spec_id = 0; + std::string _partition_data_json; + + ShardedKVCache* _kv_cache; + int64_t _table_level_row_count = -1; + IcebergProfile _iceberg_profile; + const std::vector* _iceberg_delete_rows = nullptr; + std::vector _expand_col_names; + std::vector _expand_columns; + std::vector _all_required_col_names; + Fileformat _file_format = Fileformat::NONE; + + const int64_t MIN_SUPPORT_DELETE_FILES_VERSION = 2; + const std::string ICEBERG_FILE_PATH = "file_path"; + const std::string ICEBERG_ROW_POS = "pos"; + const std::vector delete_file_col_names {ICEBERG_FILE_PATH, ICEBERG_ROW_POS}; + const std::unordered_map DELETE_COL_NAME_TO_BLOCK_IDX = { + {ICEBERG_FILE_PATH, 0}, {ICEBERG_ROW_POS, 1}}; + const int ICEBERG_FILE_PATH_INDEX = 0; + const int ICEBERG_FILE_POS_INDEX = 1; + const int READ_DELETE_FILE_BATCH_SIZE = 102400; + + // all ids that need read for eq delete (from all eq delete files) + std::set _equality_delete_col_ids; + // eq delete column ids -> location of _equality_delete_blocks / _equality_delete_impls + std::map, int> _equality_delete_block_map; + // EqualityDeleteBase stores raw pointers to these blocks, so do not modify this vector after + // creating entries in _equality_delete_impls. 
+ std::vector _equality_delete_blocks; + std::vector> _equality_delete_impls; + + // id -> block column name + std::unordered_map _id_to_block_column_name; + + // File column names used during init + std::vector _file_col_names; +}; + +// ============================================================================ +// Template method implementations (must be in header for templates) +// ============================================================================ + +template +Status IcebergReaderMixin::_init_row_filters() { + // COUNT(*) short-circuit + if (this->_push_down_agg_type == TPushAggOp::type::COUNT && + this->get_scan_range().table_format_params.__isset.table_level_row_count && + this->get_scan_range().table_format_params.table_level_row_count > 0) { + return Status::OK(); + } + + const auto& table_desc = this->get_scan_range().table_format_params.iceberg_params; + const auto& version = table_desc.format_version; + if (version < MIN_SUPPORT_DELETE_FILES_VERSION) { + return Status::OK(); + } + + std::vector position_delete_files; + std::vector equality_delete_files; + std::vector deletion_vector_files; + for (const TIcebergDeleteFileDesc& desc : table_desc.delete_files) { + if (desc.content == POSITION_DELETE) { + position_delete_files.emplace_back(desc); + } else if (desc.content == EQUALITY_DELETE) { + equality_delete_files.emplace_back(desc); + } else if (desc.content == DELETION_VECTOR) { + deletion_vector_files.emplace_back(desc); + } + } + LOG(INFO) << "[IcebergDebug] _init_row_filters: total_delete_files=" + << table_desc.delete_files.size() << ", position=" << position_delete_files.size() + << ", equality=" << equality_delete_files.size() + << ", dv=" << deletion_vector_files.size(); + + if (!equality_delete_files.empty()) { + RETURN_IF_ERROR(_equality_delete_base(equality_delete_files)); + this->set_push_down_agg_type(TPushAggOp::NONE); + } + + if (!deletion_vector_files.empty()) { + if (deletion_vector_files.size() != 1) [[unlikely]] { + /* + * 
Deletion vectors are a binary representation of deletes for a single data file that is more efficient + * at execution time than position delete files. Unlike equality or position delete files, there can be + * at most one deletion vector for a given data file in a snapshot. + */ + return Status::DataQualityError("This iceberg data file has multiple DVs."); + } + RETURN_IF_ERROR( + read_deletion_vector(table_desc.original_file_path, deletion_vector_files[0])); + this->set_push_down_agg_type(TPushAggOp::NONE); + } else if (!position_delete_files.empty()) { + RETURN_IF_ERROR( + _position_delete_base(table_desc.original_file_path, position_delete_files)); + this->set_push_down_agg_type(TPushAggOp::NONE); + } + + COUNTER_UPDATE(_iceberg_profile.num_delete_files, table_desc.delete_files.size()); + return Status::OK(); +} + +template +Status IcebergReaderMixin::_equality_delete_base( + const std::vector& delete_files) { + std::unordered_map> + partition_columns; + std::unordered_map missing_columns; + + for (const auto& delete_file : delete_files) { + TFileRangeDesc delete_desc; + delete_desc.__set_fs_name(this->get_scan_range().fs_name); + delete_desc.path = delete_file.path; + delete_desc.start_offset = 0; + delete_desc.size = -1; + delete_desc.file_size = -1; + + if (!delete_file.__isset.field_ids) [[unlikely]] { + return Status::InternalError( + "missing delete field ids when reading equality delete file"); + } + auto& read_column_field_ids = delete_file.field_ids; + std::set read_column_field_ids_set; + for (const auto& field_id : read_column_field_ids) { + read_column_field_ids_set.insert(field_id); + _equality_delete_col_ids.insert(field_id); + } + + std::unique_ptr delete_reader = _create_equality_reader(delete_desc); + RETURN_IF_ERROR(delete_reader->init_schema_reader()); + + std::vector equality_delete_col_names; + std::vector equality_delete_col_types; + + // Build delete col names/types/ids by matching field_ids from delete file schema. 
+ // Master iterates delete file's FieldDescriptor and uses field_id to match, + // NOT idx-based pairing (get_parsed_schema order != field_ids order). + std::vector delete_col_names; + std::vector delete_col_types; + std::vector delete_col_ids; + std::unordered_map delete_col_name_to_block_idx; + + if (auto* parquet_reader = typeid_cast(delete_reader.get())) { + LOG(INFO) << "[EqDeleteDebug] step1: parquet delete reader cast OK"; + const FieldDescriptor* delete_field_desc = nullptr; + auto st1 = parquet_reader->get_file_metadata_schema(&delete_field_desc); + if (!st1.ok()) { + LOG(WARNING) << "[EqDeleteDebug] get_file_metadata_schema FAILED: " << st1; + return st1; + } + LOG(INFO) << "[EqDeleteDebug] step2: get_file_metadata_schema OK, fields=" + << delete_field_desc->get_fields_schema().size(); + DCHECK(delete_field_desc != nullptr); + + for (const auto& delete_file_field : delete_field_desc->get_fields_schema()) { + LOG(INFO) << "[EqDeleteDebug] step3: field name=" << delete_file_field.name + << ", field_id=" << delete_file_field.field_id << ", in_set=" + << read_column_field_ids_set.contains(delete_file_field.field_id); + if (delete_file_field.field_id == -1) [[unlikely]] { + return Status::DataQualityError( + "missing field id when reading equality delete file"); + } + if (!read_column_field_ids_set.contains(delete_file_field.field_id)) { + continue; + } + if (delete_file_field.children.size() > 0) [[unlikely]] { + return Status::InternalError( + "can not support read complex column in equality delete file"); + } + + delete_col_ids.emplace_back(delete_file_field.field_id); + delete_col_names.emplace_back(delete_file_field.name); + delete_col_types.emplace_back(make_nullable(delete_file_field.data_type)); + + int field_id = delete_file_field.field_id; + if (!_id_to_block_column_name.contains(field_id)) { + _id_to_block_column_name.emplace(field_id, delete_file_field.name); + _expand_col_names.emplace_back(delete_file_field.name); + _expand_columns.emplace_back( 
+ make_nullable(delete_file_field.data_type)->create_column(), + make_nullable(delete_file_field.data_type), delete_file_field.name); + } + } + LOG(INFO) << "[EqDeleteDebug] step4: after loop, delete_col_names.size=" + << delete_col_names.size(); + for (uint32_t idx = 0; idx < delete_col_names.size(); ++idx) { + delete_col_name_to_block_idx[delete_col_names[idx]] = idx; + } + // Delete files have TFileRangeDesc.size=-1, which would cause + // set_fill_columns to return EndOfFile("No row group to read") + // when _filter_groups is true. Master passes filter_groups=false. + ParquetInitContext eq_delete_ctx; + eq_delete_ctx.filter_groups = false; + eq_delete_ctx.column_names = delete_col_names; + eq_delete_ctx.col_name_to_block_idx = &delete_col_name_to_block_idx; + auto st2 = parquet_reader->init_reader(&eq_delete_ctx); + if (!st2.ok()) { + LOG(WARNING) << "[EqDeleteDebug] _do_init_reader for delete reader FAILED: " << st2; + return st2; + } + LOG(INFO) << "[EqDeleteDebug] step5: _do_init_reader OK"; + } else if (auto* orc_reader = typeid_cast(delete_reader.get())) { + // For ORC: use get_parsed_schema with field_ids from delete_file + // ORC field_ids come from the Thrift descriptor, not from ORC metadata + RETURN_IF_ERROR(delete_reader->get_parsed_schema(&equality_delete_col_names, + &equality_delete_col_types)); + for (uint32_t idx = 0; idx < equality_delete_col_names.size(); ++idx) { + if (idx < read_column_field_ids.size()) { + int field_id = read_column_field_ids[idx]; + if (!read_column_field_ids_set.contains(field_id)) continue; + delete_col_ids.emplace_back(field_id); + delete_col_names.emplace_back(equality_delete_col_names[idx]); + delete_col_types.emplace_back(make_nullable(equality_delete_col_types[idx])); + if (!_id_to_block_column_name.contains(field_id)) { + _id_to_block_column_name.emplace(field_id, equality_delete_col_names[idx]); + _expand_col_names.emplace_back(equality_delete_col_names[idx]); + _expand_columns.emplace_back( + 
make_nullable(equality_delete_col_types[idx])->create_column(), + make_nullable(equality_delete_col_types[idx]), + equality_delete_col_names[idx]); + } + } + } + for (uint32_t idx = 0; idx < delete_col_names.size(); ++idx) { + delete_col_name_to_block_idx[delete_col_names[idx]] = idx; + } + OrcInitContext eq_delete_ctx; + eq_delete_ctx.column_names = delete_col_names; + eq_delete_ctx.col_name_to_block_idx = &delete_col_name_to_block_idx; + RETURN_IF_ERROR(orc_reader->init_reader(&eq_delete_ctx)); + } else { + return Status::InternalError("Unsupported format of delete file"); + } + + LOG(INFO) << "[EqDeleteDebug] after init, delete_col_ids.size=" << delete_col_ids.size() + << ", delete_col_names.size=" << delete_col_names.size(); + for (size_t i = 0; i < delete_col_names.size(); i++) { + LOG(INFO) << "[EqDeleteDebug] delete_col[" << i << "]: name=" << delete_col_names[i] + << (i < delete_col_ids.size() + ? ", field_id=" + std::to_string(delete_col_ids[i]) + : ""); + } + + if (!_equality_delete_block_map.contains(delete_col_ids)) { + _equality_delete_block_map.emplace(delete_col_ids, _equality_delete_blocks.size()); + Block block; + _generate_equality_delete_block(&block, delete_col_names, delete_col_types); + _equality_delete_blocks.emplace_back(block); + } + Block& eq_file_block = _equality_delete_blocks[_equality_delete_block_map[delete_col_ids]]; + + bool eof = false; + while (!eof) { + Block tmp_block; + _generate_equality_delete_block(&tmp_block, delete_col_names, delete_col_types); + size_t read_rows = 0; + auto st = delete_reader->get_next_block(&tmp_block, &read_rows, &eof); + if (!st.ok()) { + LOG(WARNING) << "[EqDeleteDebug] delete_reader->get_next_block failed: " << st; + return st; + } + LOG(INFO) << "[EqDeleteDebug] read delete file: rows=" << read_rows << ", eof=" << eof; + if (read_rows > 0) { + MutableBlock mutable_block(&eq_file_block); + RETURN_IF_ERROR(mutable_block.merge(tmp_block)); + } + } + } + + for (const auto& [delete_col_ids, block_idx] : 
_equality_delete_block_map) { + auto& eq_file_block = _equality_delete_blocks[block_idx]; + auto equality_delete_impl = + EqualityDeleteBase::get_delete_impl(&eq_file_block, delete_col_ids); + RETURN_IF_ERROR(equality_delete_impl->init(this->get_profile())); + _equality_delete_impls.emplace_back(std::move(equality_delete_impl)); + } + LOG(INFO) << "[EqDeleteDebug] _equality_delete_base done: impls=" + << _equality_delete_impls.size() << ", expand_cols=" << _expand_col_names.size(); + return Status::OK(); +} + +template +void IcebergReaderMixin::_generate_equality_delete_block( + Block* block, const std::vector& equality_delete_col_names, + const std::vector& equality_delete_col_types) { + for (int i = 0; i < equality_delete_col_names.size(); ++i) { + DataTypePtr data_type = make_nullable(equality_delete_col_types[i]); + MutableColumnPtr data_column = data_type->create_column(); + block->insert(ColumnWithTypeAndName(std::move(data_column), data_type, + equality_delete_col_names[i])); + } +} + +template +Status IcebergReaderMixin::_expand_block_if_need(Block* block) { + std::set names; + auto block_names = block->get_names(); + names.insert(block_names.begin(), block_names.end()); + for (auto& col : _expand_columns) { + col.column->assume_mutable()->clear(); + if (names.contains(col.name)) { + return Status::InternalError("Wrong expand column '{}'", col.name); + } + names.insert(col.name); + (*this->col_name_to_block_idx_ref())[col.name] = static_cast(block->columns()); + block->insert(col); + } + return Status::OK(); +} + +template +Status IcebergReaderMixin::_shrink_block_if_need(Block* block) { + std::set positions_to_erase; + for (const std::string& expand_col : _expand_col_names) { + if (!this->col_name_to_block_idx_ref()->contains(expand_col)) { + return Status::InternalError("Wrong erase column '{}', block: {}", expand_col, + block->dump_names()); + } + positions_to_erase.emplace((*this->col_name_to_block_idx_ref())[expand_col]); + } + 
block->erase(positions_to_erase); + for (const std::string& expand_col : _expand_col_names) { + this->col_name_to_block_idx_ref()->erase(expand_col); + } + return Status::OK(); +} + +template +Status IcebergReaderMixin::_position_delete_base( + const std::string data_file_path, const std::vector& delete_files) { + std::vector delete_rows_array; + int64_t num_delete_rows = 0; + for (const auto& delete_file : delete_files) { + SCOPED_TIMER(_iceberg_profile.delete_files_read_time); + Status create_status = Status::OK(); + auto* delete_file_cache = _kv_cache->template get( + _delet_file_cache_key(delete_file.path), [&]() -> DeleteFile* { + auto* position_delete = new DeleteFile; + TFileRangeDesc delete_file_range; + delete_file_range.__set_fs_name(this->get_scan_range().fs_name); + delete_file_range.path = delete_file.path; + delete_file_range.start_offset = 0; + delete_file_range.size = -1; + delete_file_range.file_size = -1; + create_status = _read_position_delete_file(&delete_file_range, position_delete); + if (!create_status) { + return nullptr; + } + return position_delete; + }); + if (create_status.is()) { + LOG(INFO) << "[IcebergDebug] _position_delete_base: END_OF_FILE for " + << delete_file.path << ", skipping"; + continue; + } else if (!create_status.ok()) { + LOG(INFO) << "[IcebergDebug] _position_delete_base: ERROR reading " << delete_file.path + << ": " << create_status.to_string(); + return create_status; + } + LOG(INFO) << "[IcebergDebug] _position_delete_base: cache returned, " + << "delete_file_cache=" << (void*)delete_file_cache + << ", delete_file=" << delete_file.path; + + DeleteFile& delete_file_map = *((DeleteFile*)delete_file_cache); + auto get_value = [&](const auto& v) { + DeleteRows* row_ids = v.second.get(); + if (!row_ids->empty()) { + delete_rows_array.emplace_back(row_ids); + num_delete_rows += row_ids->size(); + } + }; + delete_file_map.if_contains(data_file_path, get_value); + LOG(INFO) << "[IcebergDebug] _position_delete_base: 
data_file_path=" << data_file_path + << ", delete_file=" << delete_file.path + << ", num_delete_rows_so_far=" << num_delete_rows + << ", delete_file_map_size=" << delete_file_map.size(); + // Log all keys in the delete file map for debugging + delete_file_map.for_each([&](const auto& kv) { + LOG(INFO) << "[IcebergDebug] _position_delete_base: map_key=" << kv.first + << ", rows=" << kv.second->size(); + }); + } + if (num_delete_rows > 0) { + SCOPED_TIMER(_iceberg_profile.delete_rows_sort_time); + _iceberg_delete_rows = + _kv_cache->template get(data_file_path, [&]() -> DeleteRows* { + auto* data_file_position_delete = new DeleteRows; + _sort_delete_rows(delete_rows_array, num_delete_rows, + *data_file_position_delete); + return data_file_position_delete; + }); + set_delete_rows(); + COUNTER_UPDATE(_iceberg_profile.num_delete_rows, num_delete_rows); + } else { + LOG(INFO) << "[IcebergDebug] _position_delete_base: NO delete rows matched for " + << data_file_path; + } + return Status::OK(); +} + +template +typename IcebergReaderMixin::PositionDeleteRange +IcebergReaderMixin::_get_range(const ColumnDictI32& file_path_column) { + PositionDeleteRange range; + size_t read_rows = file_path_column.get_data().size(); + const int* code_path = file_path_column.get_data().data(); + const int* code_path_start = code_path; + const int* code_path_end = code_path + read_rows; + while (code_path < code_path_end) { + int code = code_path[0]; + const int* code_end = std::upper_bound(code_path, code_path_end, code); + range.data_file_path.emplace_back(file_path_column.get_value(code).to_string()); + range.range.emplace_back(code_path - code_path_start, code_end - code_path_start); + code_path = code_end; + } + return range; +} + +template +typename IcebergReaderMixin::PositionDeleteRange +IcebergReaderMixin::_get_range(const ColumnString& file_path_column) { + PositionDeleteRange range; + size_t read_rows = file_path_column.size(); + size_t index = 0; + while (index < read_rows) { + 
StringRef data_path = file_path_column.get_data_at(index); + size_t left = index - 1; + size_t right = read_rows; + while (left + 1 != right) { + size_t mid = left + (right - left) / 2; + if (file_path_column.get_data_at(mid) > data_path) { + right = mid; + } else { + left = mid; + } + } + range.data_file_path.emplace_back(data_path.to_string()); + range.range.emplace_back(index, left + 1); + index = left + 1; + } + return range; +} + +template +void IcebergReaderMixin::_sort_delete_rows( + const std::vector*>& delete_rows_array, int64_t num_delete_rows, + std::vector& result) { + if (delete_rows_array.empty()) { + return; + } + if (delete_rows_array.size() == 1) { + result.resize(num_delete_rows); + memcpy(result.data(), delete_rows_array.front()->data(), sizeof(int64_t) * num_delete_rows); + return; + } + if (delete_rows_array.size() == 2) { + result.resize(num_delete_rows); + std::merge(delete_rows_array.front()->begin(), delete_rows_array.front()->end(), + delete_rows_array.back()->begin(), delete_rows_array.back()->end(), + result.begin()); + return; + } + + using vec_pair = std::pair::iterator, std::vector::iterator>; + result.resize(num_delete_rows); + auto row_id_iter = result.begin(); + auto iter_end = result.end(); + std::vector rows_array; + for (auto* rows : delete_rows_array) { + if (!rows->empty()) { + rows_array.emplace_back(rows->begin(), rows->end()); + } + } + size_t array_size = rows_array.size(); + while (row_id_iter != iter_end) { + int64_t min_index = 0; + int64_t min = *rows_array[0].first; + for (size_t i = 0; i < array_size; ++i) { + if (*rows_array[i].first < min) { + min_index = i; + min = *rows_array[i].first; + } + } + *row_id_iter++ = min; + rows_array[min_index].first++; + if (UNLIKELY(rows_array[min_index].first == rows_array[min_index].second)) { + rows_array.erase(rows_array.begin() + min_index); + array_size--; + } + } +} + +template +void IcebergReaderMixin::_gen_position_delete_file_range( + Block& block, DeleteFile* 
position_delete, size_t read_rows, + bool file_path_column_dictionary_coded) { + SCOPED_TIMER(_iceberg_profile.parse_delete_file_time); + auto name_to_pos_map = block.get_name_to_pos_map(); + ColumnPtr path_column = block.get_by_position(name_to_pos_map[ICEBERG_FILE_PATH]).column; + DCHECK_EQ(path_column->size(), read_rows); + ColumnPtr pos_column = block.get_by_position(name_to_pos_map[ICEBERG_ROW_POS]).column; + using ColumnType = typename PrimitiveTypeTraits::ColumnType; + const int64_t* src_data = assert_cast(*pos_column).get_data().data(); + PositionDeleteRange range; + if (file_path_column_dictionary_coded) { + range = _get_range(assert_cast(*path_column)); + } else { + range = _get_range(assert_cast(*path_column)); + } + for (int i = 0; i < range.range.size(); ++i) { + std::string key = range.data_file_path[i]; + auto iter = position_delete->find(key); + DeleteRows* delete_rows; + if (iter == position_delete->end()) { + delete_rows = new DeleteRows; + std::unique_ptr delete_rows_ptr(delete_rows); + (*position_delete)[key] = std::move(delete_rows_ptr); + } else { + delete_rows = iter->second.get(); + } + const int64_t* cpy_start = src_data + range.range[i].first; + const int64_t cpy_count = range.range[i].second - range.range[i].first; + int64_t origin_size = delete_rows->size(); + delete_rows->resize(origin_size + cpy_count); + int64_t* dest_position = &(*delete_rows)[origin_size]; + memcpy(dest_position, cpy_start, cpy_count * sizeof(int64_t)); + } +} + +template +Status IcebergReaderMixin::read_deletion_vector( + const std::string& data_file_path, const TIcebergDeleteFileDesc& delete_file_desc) { + Status create_status = Status::OK(); + SCOPED_TIMER(_iceberg_profile.delete_files_read_time); + _iceberg_delete_rows = _kv_cache->template get< + DeleteRows>(data_file_path, [&]() -> DeleteRows* { + auto* delete_rows = new DeleteRows; + + TFileRangeDesc delete_range; + delete_range.__set_fs_name(this->get_scan_range().fs_name); + delete_range.path = 
delete_file_desc.path; + delete_range.start_offset = delete_file_desc.content_offset; + delete_range.size = delete_file_desc.content_size_in_bytes; + delete_range.file_size = -1; + + DeletionVectorReader dv_reader(this->get_state(), this->get_profile(), + this->get_scan_params(), delete_range, this->get_io_ctx()); + create_status = dv_reader.open(); + if (!create_status.ok()) [[unlikely]] { + return nullptr; + } + + size_t buffer_size = delete_range.size; + std::vector buf(buffer_size); + if (buffer_size < 12) [[unlikely]] { + create_status = Status::DataQualityError("Deletion vector file size too small: {}", + buffer_size); + return nullptr; + } + + create_status = dv_reader.read_at(delete_range.start_offset, {buf.data(), buffer_size}); + if (!create_status) [[unlikely]] { + return nullptr; + } + + auto total_length = BigEndian::Load32(buf.data()); + if (total_length + 8 != buffer_size) [[unlikely]] { + create_status = Status::DataQualityError( + "Deletion vector length mismatch, expected: {}, actual: {}", total_length + 8, + buffer_size); + return nullptr; + } + + constexpr static char MAGIC_NUMBER[] = {'\xD1', '\xD3', '\x39', '\x64'}; + if (memcmp(buf.data() + sizeof(total_length), MAGIC_NUMBER, 4)) [[unlikely]] { + create_status = Status::DataQualityError("Deletion vector magic number mismatch"); + return nullptr; + } + + roaring::Roaring64Map bitmap; + SCOPED_TIMER(_iceberg_profile.parse_delete_file_time); + try { + bitmap = roaring::Roaring64Map::readSafe(buf.data() + 8, buffer_size - 12); + } catch (const std::runtime_error& e) { + create_status = Status::DataQualityError("Decode roaring bitmap failed, {}", e.what()); + return nullptr; + } + + delete_rows->reserve(bitmap.cardinality()); + for (auto it = bitmap.begin(); it != bitmap.end(); it++) { + delete_rows->push_back(*it); + } + COUNTER_UPDATE(_iceberg_profile.num_delete_rows, delete_rows->size()); + return delete_rows; + }); + + RETURN_IF_ERROR(create_status); + if (!_iceberg_delete_rows->empty()) 
[[likely]] { + set_delete_rows(); + } + return Status::OK(); +} + +#include "common/compile_check_end.h" +} // namespace doris diff --git a/be/src/format/table/iceberg_sys_table_jni_reader.h b/be/src/format/table/iceberg_sys_table_jni_reader.h index f5bc69f6776772..c9232fb468ae3c 100644 --- a/be/src/format/table/iceberg_sys_table_jni_reader.h +++ b/be/src/format/table/iceberg_sys_table_jni_reader.h @@ -53,6 +53,9 @@ class IcebergSysTableJniReader : public JniReader { Status init_reader(); +protected: + Status _do_init_reader(ReaderInitContext* /*ctx*/) override { return init_reader(); } + private: Status _init_status; }; diff --git a/be/src/format/table/jdbc_jni_reader.cpp b/be/src/format/table/jdbc_jni_reader.cpp index 89071563c653ca..101f3b8bef131b 100644 --- a/be/src/format/table/jdbc_jni_reader.cpp +++ b/be/src/format/table/jdbc_jni_reader.cpp @@ -105,7 +105,7 @@ bool JdbcJniReader::_is_special_type(PrimitiveType type) { type == PrimitiveType::TYPE_QUANTILE_STATE || type == PrimitiveType::TYPE_JSONB; } -Status JdbcJniReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +Status JdbcJniReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { // Identify columns with special types (bitmap, HLL, quantile_state, JSONB) // and temporarily replace them with string columns for JNI data transfer. // This follows the same pattern as the old vjdbc_connector.cpp _get_reader_params. 
@@ -144,7 +144,7 @@ Status JdbcJniReader::get_next_block(Block* block, size_t* read_rows, bool* eof) } // Call parent to do the actual JNI read with string columns - RETURN_IF_ERROR(JniReader::get_next_block(block, read_rows, eof)); + RETURN_IF_ERROR(JniReader::_do_get_next_block(block, read_rows, eof)); // Cast string columns back to their target types if (*read_rows > 0 && !special_columns.empty()) { diff --git a/be/src/format/table/jdbc_jni_reader.h b/be/src/format/table/jdbc_jni_reader.h index c0ef978682c66e..f12f2ba234eede 100644 --- a/be/src/format/table/jdbc_jni_reader.h +++ b/be/src/format/table/jdbc_jni_reader.h @@ -77,7 +77,10 @@ class JdbcJniReader : public JniReader { * Before reading, replaces block columns of special types with string columns. * After reading, casts the string data back to the target types. */ - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; + +protected: + Status _do_init_reader(ReaderInitContext* /*ctx*/) override { return init_reader(); } private: std::map _jdbc_params; diff --git a/be/src/format/table/max_compute_jni_reader.h b/be/src/format/table/max_compute_jni_reader.h index 71a1e74f4f85a2..5b11b6eed8310f 100644 --- a/be/src/format/table/max_compute_jni_reader.h +++ b/be/src/format/table/max_compute_jni_reader.h @@ -55,6 +55,9 @@ class MaxComputeJniReader : public JniReader { ~MaxComputeJniReader() override = default; Status init_reader(); + +protected: + Status _do_init_reader(ReaderInitContext* /*ctx*/) override { return init_reader(); } }; #include "common/compile_check_end.h" diff --git a/be/src/format/table/nested_column_access_helper.h b/be/src/format/table/nested_column_access_helper.h index 5b3d03b358b394..b184eabea3d72f 100644 --- a/be/src/format/table/nested_column_access_helper.h +++ b/be/src/format/table/nested_column_access_helper.h @@ -21,7 +21,7 @@ #include #include -#include 
"format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" namespace doris { #include "common/compile_check_begin.h" diff --git a/be/src/format/table/paimon_cpp_reader.cpp b/be/src/format/table/paimon_cpp_reader.cpp index e4b182c41edfc7..4e45c72b96a239 100644 --- a/be/src/format/table/paimon_cpp_reader.cpp +++ b/be/src/format/table/paimon_cpp_reader.cpp @@ -70,7 +70,7 @@ Status PaimonCppReader::init_reader() { return _init_paimon_reader(); } -Status PaimonCppReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +Status PaimonCppReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { if (_push_down_agg_type == TPushAggOp::type::COUNT && _remaining_table_level_row_count >= 0) { auto rows = std::min(_remaining_table_level_row_count, (int64_t)_state->query_options().batch_size); @@ -144,8 +144,8 @@ Status PaimonCppReader::get_next_block(Block* block, size_t* read_rows, bool* eo return Status::OK(); } -Status PaimonCppReader::get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) { +Status PaimonCppReader::_get_columns_impl( + std::unordered_map* name_to_type) { for (const auto& slot : _file_slot_descs) { name_to_type->emplace(slot->col_name(), slot->type()); } diff --git a/be/src/format/table/paimon_cpp_reader.h b/be/src/format/table/paimon_cpp_reader.h index d567b1b24bb71a..309e21ae55a570 100644 --- a/be/src/format/table/paimon_cpp_reader.h +++ b/be/src/format/table/paimon_cpp_reader.h @@ -59,14 +59,16 @@ class PaimonCppReader : public GenericReader { ~PaimonCppReader() override; Status init_reader(); - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; - Status get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) override; + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status _get_columns_impl(std::unordered_map* name_to_type) override; Status close() override; void 
set_predicate(std::shared_ptr predicate) { _predicate = std::move(predicate); } +protected: + Status _do_init_reader(ReaderInitContext* /*ctx*/) override { return init_reader(); } + private: Status _init_paimon_reader(); Status _decode_split(std::shared_ptr* split); diff --git a/be/src/format/table/paimon_jni_reader.cpp b/be/src/format/table/paimon_jni_reader.cpp index 12e6171b3a5305..0a21ee10ef07d5 100644 --- a/be/src/format/table/paimon_jni_reader.cpp +++ b/be/src/format/table/paimon_jni_reader.cpp @@ -101,7 +101,7 @@ PaimonJniReader::PaimonJniReader(const std::vector& file_slot_d } } -Status PaimonJniReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +Status PaimonJniReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { if (_push_down_agg_type == TPushAggOp::type::COUNT && _remaining_table_level_row_count >= 0) { auto rows = std::min(_remaining_table_level_row_count, (int64_t)_state->query_options().batch_size); @@ -118,7 +118,7 @@ Status PaimonJniReader::get_next_block(Block* block, size_t* read_rows, bool* eo return Status::OK(); } - return JniReader::get_next_block(block, read_rows, eof); + return JniReader::_do_get_next_block(block, read_rows, eof); } Status PaimonJniReader::init_reader() { diff --git a/be/src/format/table/paimon_jni_reader.h b/be/src/format/table/paimon_jni_reader.h index 548f1c6485a2f7..77cd4fdc518a49 100644 --- a/be/src/format/table/paimon_jni_reader.h +++ b/be/src/format/table/paimon_jni_reader.h @@ -54,10 +54,13 @@ class PaimonJniReader : public JniReader { ~PaimonJniReader() override = default; - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; Status init_reader(); +protected: + Status _do_init_reader(ReaderInitContext* /*ctx*/) override { return init_reader(); } + private: int64_t _remaining_table_level_row_count; }; diff --git a/be/src/format/table/paimon_reader.cpp 
b/be/src/format/table/paimon_reader.cpp index 0667ad8efff7f4..fdc29b5a27d185 100644 --- a/be/src/format/table/paimon_reader.cpp +++ b/be/src/format/table/paimon_reader.cpp @@ -25,34 +25,55 @@ namespace doris { #include "common/compile_check_begin.h" -PaimonReader::PaimonReader(std::unique_ptr file_format_reader, - RuntimeProfile* profile, RuntimeState* state, - const TFileScanRangeParams& params, const TFileRangeDesc& range, - ShardedKVCache* kv_cache, io::IOContext* io_ctx, - FileMetaCache* meta_cache) - : TableFormatReader(std::move(file_format_reader), state, profile, params, range, io_ctx, - meta_cache), - _kv_cache(kv_cache) { + +// ============================================================================ +// PaimonOrcReader +// ============================================================================ +void PaimonOrcReader::_init_paimon_profile() { static const char* paimon_profile = "PaimonProfile"; - ADD_TIMER(_profile, paimon_profile); + ADD_TIMER(get_profile(), paimon_profile); _paimon_profile.num_delete_rows = - ADD_CHILD_COUNTER(_profile, "NumDeleteRows", TUnit::UNIT, paimon_profile); + ADD_CHILD_COUNTER(get_profile(), "NumDeleteRows", TUnit::UNIT, paimon_profile); _paimon_profile.delete_files_read_time = - ADD_CHILD_TIMER(_profile, "DeleteFileReadTime", paimon_profile); + ADD_CHILD_TIMER(get_profile(), "DeleteFileReadTime", paimon_profile); _paimon_profile.parse_deletion_vector_time = - ADD_CHILD_TIMER(_profile, "ParseDeletionVectorTime", paimon_profile); + ADD_CHILD_TIMER(get_profile(), "ParseDeletionVectorTime", paimon_profile); +} + +Status PaimonOrcReader::on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; + const orc::Type* orc_type_ptr = nullptr; + RETURN_IF_ERROR(get_file_type(&orc_type_ptr)); + + RETURN_IF_ERROR(gen_table_info_node_by_field_id( + get_scan_params(), get_scan_range().table_format_params.paimon_params.schema_id, + 
get_tuple_descriptor(), orc_type_ptr)); + ctx->table_info_node = table_info_node_ptr; + + for (const auto& desc : *ctx->column_descs) { + if (desc.category == ColumnCategory::REGULAR || + desc.category == ColumnCategory::GENERATED) { + ctx->column_names.push_back(desc.name); + } + } + return Status::OK(); +} + +Status PaimonOrcReader::on_after_init_reader(ReaderInitContext* /*ctx*/) { + return _init_deletion_vector(); } -Status PaimonReader::init_row_filters() { - const auto& table_desc = _range.table_format_params.paimon_params; +Status PaimonOrcReader::_init_deletion_vector() { + const auto& table_desc = get_scan_range().table_format_params.paimon_params; if (!table_desc.__isset.deletion_file) { return Status::OK(); } - // set push down agg type to NONE because we can not do count push down opt - // if there are delete files. - if (!_range.table_format_params.paimon_params.__isset.row_count) { - _file_format_reader->set_push_down_agg_type(TPushAggOp::NONE); + // Cannot do count push down if there are delete files + if (!get_scan_range().table_format_params.paimon_params.__isset.row_count) { + set_push_down_agg_type(TPushAggOp::NONE); + lock_push_down_agg_type(); } const auto& deletion_file = table_desc.deletion_file; @@ -70,33 +91,29 @@ Status PaimonReader::init_row_filters() { auto* delete_rows = new DeleteRows; TFileRangeDesc delete_range; - // must use __set() method to make sure __isset is true - delete_range.__set_fs_name(_range.fs_name); + delete_range.__set_fs_name(get_scan_range().fs_name); delete_range.path = deletion_file.path; delete_range.start_offset = deletion_file.offset; delete_range.size = deletion_file.length + 4; delete_range.file_size = -1; - DeletionVectorReader dv_reader(_state, _profile, _params, delete_range, _io_ctx); + DeletionVectorReader dv_reader(get_state(), get_profile(), get_scan_params(), delete_range, + get_io_ctx()); create_status = dv_reader.open(); if (!create_status.ok()) [[unlikely]] { return nullptr; } - // the reason of 
adding 4: https://github.com/apache/paimon/issues/3313 size_t bytes_read = deletion_file.length + 4; - // TODO: better way to alloc memeory std::vector buffer(bytes_read); create_status = dv_reader.read_at(deletion_file.offset, {buffer.data(), bytes_read}); if (!create_status.ok()) [[unlikely]] { return nullptr; } - // parse deletion vector const char* buf = buffer.data(); uint32_t actual_length; std::memcpy(reinterpret_cast(&actual_length), buf, 4); - // change byte order to big endian std::reverse(reinterpret_cast(&actual_length), reinterpret_cast(&actual_length) + 4); buf += 4; @@ -109,7 +126,6 @@ Status PaimonReader::init_row_filters() { } uint32_t magic_number; std::memcpy(reinterpret_cast(&magic_number), buf, 4); - // change byte order to big endian std::reverse(reinterpret_cast(&magic_number), reinterpret_cast(&magic_number) + 4); buf += 4; @@ -139,14 +155,144 @@ Status PaimonReader::init_row_filters() { }); RETURN_IF_ERROR(create_status); if (!_delete_rows->empty()) [[likely]] { - set_delete_rows(); + set_position_delete_rowids(_delete_rows); } return Status::OK(); } -Status PaimonReader::get_next_block_inner(Block* block, size_t* read_rows, bool* eof) { - RETURN_IF_ERROR(_file_format_reader->get_next_block(block, read_rows, eof)); +// ============================================================================ +// PaimonParquetReader +// ============================================================================ +void PaimonParquetReader::_init_paimon_profile() { + static const char* paimon_profile = "PaimonProfile"; + ADD_TIMER(get_profile(), paimon_profile); + _paimon_profile.num_delete_rows = + ADD_CHILD_COUNTER(get_profile(), "NumDeleteRows", TUnit::UNIT, paimon_profile); + _paimon_profile.delete_files_read_time = + ADD_CHILD_TIMER(get_profile(), "DeleteFileReadTime", paimon_profile); + _paimon_profile.parse_deletion_vector_time = + ADD_CHILD_TIMER(get_profile(), "ParseDeletionVectorTime", paimon_profile); +} + +Status 
PaimonParquetReader::on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; + const FieldDescriptor* field_desc = nullptr; + RETURN_IF_ERROR(get_file_metadata_schema(&field_desc)); + DCHECK(field_desc != nullptr); + + RETURN_IF_ERROR(gen_table_info_node_by_field_id( + get_scan_params(), get_scan_range().table_format_params.paimon_params.schema_id, + get_tuple_descriptor(), *field_desc)); + ctx->table_info_node = table_info_node_ptr; + + for (const auto& desc : *ctx->column_descs) { + if (desc.category == ColumnCategory::REGULAR || + desc.category == ColumnCategory::GENERATED) { + ctx->column_names.push_back(desc.name); + } + } return Status::OK(); } + +Status PaimonParquetReader::on_after_init_reader(ReaderInitContext* /*ctx*/) { + return _init_deletion_vector(); +} + +Status PaimonParquetReader::_init_deletion_vector() { + const auto& table_desc = get_scan_range().table_format_params.paimon_params; + if (!table_desc.__isset.deletion_file) { + return Status::OK(); + } + + if (!get_scan_range().table_format_params.paimon_params.__isset.row_count) { + set_push_down_agg_type(TPushAggOp::NONE); + lock_push_down_agg_type(); + } + const auto& deletion_file = table_desc.deletion_file; + + Status create_status = Status::OK(); + + std::string key; + key.resize(deletion_file.path.size() + sizeof(deletion_file.offset)); + memcpy(key.data(), deletion_file.path.data(), deletion_file.path.size()); + memcpy(key.data() + deletion_file.path.size(), &deletion_file.offset, + sizeof(deletion_file.offset)); + + SCOPED_TIMER(_paimon_profile.delete_files_read_time); + using DeleteRows = std::vector; + _delete_rows = _kv_cache->get(key, [&]() -> DeleteRows* { + auto* delete_rows = new DeleteRows; + + TFileRangeDesc delete_range; + delete_range.__set_fs_name(get_scan_range().fs_name); + delete_range.path = deletion_file.path; + delete_range.start_offset = deletion_file.offset; + delete_range.size = 
deletion_file.length + 4; + delete_range.file_size = -1; + + DeletionVectorReader dv_reader(get_state(), get_profile(), get_scan_params(), delete_range, + get_io_ctx()); + create_status = dv_reader.open(); + if (!create_status.ok()) [[unlikely]] { + return nullptr; + } + + size_t bytes_read = deletion_file.length + 4; + std::vector buffer(bytes_read); + create_status = dv_reader.read_at(deletion_file.offset, {buffer.data(), bytes_read}); + if (!create_status.ok()) [[unlikely]] { + return nullptr; + } + + const char* buf = buffer.data(); + uint32_t actual_length; + std::memcpy(reinterpret_cast(&actual_length), buf, 4); + std::reverse(reinterpret_cast(&actual_length), + reinterpret_cast(&actual_length) + 4); + buf += 4; + if (actual_length != bytes_read - 4) [[unlikely]] { + create_status = Status::RuntimeError( + "DeletionVector deserialize error: length not match, " + "actual length: {}, expect length: {}", + actual_length, bytes_read - 4); + return nullptr; + } + uint32_t magic_number; + std::memcpy(reinterpret_cast(&magic_number), buf, 4); + std::reverse(reinterpret_cast(&magic_number), + reinterpret_cast(&magic_number) + 4); + buf += 4; + const static uint32_t MAGIC_NUMBER = 1581511376; + if (magic_number != MAGIC_NUMBER) [[unlikely]] { + create_status = Status::RuntimeError( + "DeletionVector deserialize error: invalid magic number {}", magic_number); + return nullptr; + } + + roaring::Roaring roaring_bitmap; + SCOPED_TIMER(_paimon_profile.parse_deletion_vector_time); + try { + roaring_bitmap = roaring::Roaring::readSafe(buf, bytes_read - 4); + } catch (const std::runtime_error& e) { + create_status = Status::RuntimeError( + "DeletionVector deserialize error: failed to deserialize roaring bitmap, {}", + e.what()); + return nullptr; + } + delete_rows->reserve(roaring_bitmap.cardinality()); + for (auto it = roaring_bitmap.begin(); it != roaring_bitmap.end(); it++) { + delete_rows->push_back(*it); + } + COUNTER_UPDATE(_paimon_profile.num_delete_rows, 
delete_rows->size()); + return delete_rows; + }); + RETURN_IF_ERROR(create_status); + if (!_delete_rows->empty()) [[likely]] { + ParquetReader::set_delete_rows(_delete_rows); + } + return Status::OK(); +} + #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/format/table/paimon_reader.h b/be/src/format/table/paimon_reader.h index de16c63cdd9a75..b0ab2da9e28f68 100644 --- a/be/src/format/table/paimon_reader.h +++ b/be/src/format/table/paimon_reader.h @@ -22,116 +22,83 @@ #include "format/orc/vorc_reader.h" #include "format/parquet/vparquet_reader.h" -#include "format/table/table_format_reader.h" +#include "format/table/table_schema_change_helper.h" namespace doris { #include "common/compile_check_begin.h" -class PaimonReader : public TableFormatReader, public TableSchemaChangeHelper { +class ShardedKVCache; + +// PaimonOrcReader: directly inherits OrcReader (no composition wrapping). +// Schema mapping in on_before_init_reader, deletion vector reading in on_after_init_reader. 
+class PaimonOrcReader final : public OrcReader, public TableSchemaChangeHelper { public: - PaimonReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, - RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, ShardedKVCache* kv_cache, io::IOContext* io_ctx, - FileMetaCache* meta_cache); + ENABLE_FACTORY_CREATOR(PaimonOrcReader); + PaimonOrcReader(RuntimeProfile* profile, RuntimeState* state, + const TFileScanRangeParams& params, const TFileRangeDesc& range, + size_t batch_size, const std::string& ctz, ShardedKVCache* kv_cache, + io::IOContext* io_ctx, FileMetaCache* meta_cache = nullptr, + bool enable_lazy_mat = true) + : OrcReader(profile, state, params, range, batch_size, ctz, io_ctx, meta_cache, + enable_lazy_mat), + _kv_cache(kv_cache) { + _init_paimon_profile(); + } + ~PaimonOrcReader() final = default; - ~PaimonReader() override = default; +protected: + Status on_before_init_reader(ReaderInitContext* ctx) override; - Status init_row_filters() final; + Status on_after_init_reader(ReaderInitContext* /*ctx*/) override; - Status get_next_block_inner(Block* block, size_t* read_rows, bool* eof) final; +private: + void _init_paimon_profile(); + Status _init_deletion_vector(); -protected: struct PaimonProfile { - RuntimeProfile::Counter* num_delete_rows; - RuntimeProfile::Counter* delete_files_read_time; - RuntimeProfile::Counter* parse_deletion_vector_time; + RuntimeProfile::Counter* num_delete_rows = nullptr; + RuntimeProfile::Counter* delete_files_read_time = nullptr; + RuntimeProfile::Counter* parse_deletion_vector_time = nullptr; }; - // _delete_rows from kv_cache. + const std::vector* _delete_rows = nullptr; - // owned by scan node ShardedKVCache* _kv_cache; PaimonProfile _paimon_profile; - - virtual void set_delete_rows() = 0; }; -class PaimonOrcReader final : public PaimonReader { +// PaimonParquetReader: directly inherits ParquetReader (no composition wrapping). 
+class PaimonParquetReader final : public ParquetReader, public TableSchemaChangeHelper { public: - ENABLE_FACTORY_CREATOR(PaimonOrcReader); - PaimonOrcReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, - RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, ShardedKVCache* kv_cache, io::IOContext* io_ctx, - FileMetaCache* meta_cache) - : PaimonReader(std::move(file_format_reader), profile, state, params, range, kv_cache, - io_ctx, meta_cache) {}; - ~PaimonOrcReader() final = default; - - void set_delete_rows() final { - (reinterpret_cast(_file_format_reader.get())) - ->set_position_delete_rowids(_delete_rows); + ENABLE_FACTORY_CREATOR(PaimonParquetReader); + PaimonParquetReader(RuntimeProfile* profile, const TFileScanRangeParams& params, + const TFileRangeDesc& range, size_t batch_size, const cctz::time_zone* ctz, + ShardedKVCache* kv_cache, io::IOContext* io_ctx, RuntimeState* state, + FileMetaCache* meta_cache = nullptr, bool enable_lazy_mat = true) + : ParquetReader(profile, params, range, batch_size, ctz, io_ctx, state, meta_cache, + enable_lazy_mat), + _kv_cache(kv_cache) { + _init_paimon_profile(); } + ~PaimonParquetReader() final = default; - Status init_reader( - const std::vector& read_table_col_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts) { - auto* orc_reader = static_cast(_file_format_reader.get()); - const orc::Type* orc_type_ptr = nullptr; - RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr)); - RETURN_IF_ERROR(gen_table_info_node_by_field_id( - _params, _range.table_format_params.paimon_params.schema_id, tuple_descriptor, - orc_type_ptr)); - - return orc_reader->init_reader(&read_table_col_names, col_name_to_block_idx, conjuncts, - false, tuple_descriptor, 
row_descriptor, - not_single_slot_filter_conjuncts, - slot_id_to_filter_conjuncts, table_info_node_ptr); - } -}; +protected: + Status on_before_init_reader(ReaderInitContext* ctx) override; -class PaimonParquetReader final : public PaimonReader { -public: - ENABLE_FACTORY_CREATOR(PaimonParquetReader); - PaimonParquetReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, - RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, ShardedKVCache* kv_cache, - io::IOContext* io_ctx, FileMetaCache* meta_cache) - : PaimonReader(std::move(file_format_reader), profile, state, params, range, kv_cache, - io_ctx, meta_cache) {}; - ~PaimonParquetReader() final = default; + Status on_after_init_reader(ReaderInitContext* /*ctx*/) override; - void set_delete_rows() final { - (reinterpret_cast(_file_format_reader.get())) - ->set_delete_rows(_delete_rows); - } +private: + void _init_paimon_profile(); + Status _init_deletion_vector(); - Status init_reader( - const std::vector& read_table_col_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, - phmap::flat_hash_map>>& - slot_id_to_predicates, - const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts) { - auto* parquet_reader = static_cast(_file_format_reader.get()); - - const FieldDescriptor* field_desc = nullptr; - RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&field_desc)); - DCHECK(field_desc != nullptr); - - RETURN_IF_ERROR(gen_table_info_node_by_field_id( - _params, _range.table_format_params.paimon_params.schema_id, tuple_descriptor, - *field_desc)); - - return parquet_reader->init_reader(read_table_col_names, col_name_to_block_idx, conjuncts, - slot_id_to_predicates, tuple_descriptor, row_descriptor, - colname_to_slot_id, 
not_single_slot_filter_conjuncts, - slot_id_to_filter_conjuncts, table_info_node_ptr); - } + struct PaimonProfile { + RuntimeProfile::Counter* num_delete_rows = nullptr; + RuntimeProfile::Counter* delete_files_read_time = nullptr; + RuntimeProfile::Counter* parse_deletion_vector_time = nullptr; + }; + + const std::vector* _delete_rows = nullptr; + ShardedKVCache* _kv_cache; + PaimonProfile _paimon_profile; }; + #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/format/table/parquet_metadata_reader.cpp b/be/src/format/table/parquet_metadata_reader.cpp index 6a032e67f76ba5..7df80f673cb602 100644 --- a/be/src/format/table/parquet_metadata_reader.cpp +++ b/be/src/format/table/parquet_metadata_reader.cpp @@ -798,7 +798,7 @@ Status ParquetMetadataReader::_init_from_scan_range(const TMetaScanRange& scan_r return Status::OK(); } -Status ParquetMetadataReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +Status ParquetMetadataReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { if (_eof) { *eof = true; *read_rows = 0; diff --git a/be/src/format/table/parquet_metadata_reader.h b/be/src/format/table/parquet_metadata_reader.h index 33eef93037e65e..d1b338d17f7085 100644 --- a/be/src/format/table/parquet_metadata_reader.h +++ b/be/src/format/table/parquet_metadata_reader.h @@ -54,9 +54,12 @@ class ParquetMetadataReader : public GenericReader { ~ParquetMetadataReader() override; Status init_reader(); - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; Status close() override; +protected: + Status _do_init_reader(ReaderInitContext* /*ctx*/) override { return init_reader(); } + private: Status _init_from_scan_range(const TMetaScanRange& scan_range); Status _build_rows(std::vector& columns); diff --git a/be/src/format/table/remote_doris_reader.cpp b/be/src/format/table/remote_doris_reader.cpp index 
fde4dc49896db5..f97ed62c4fd37c 100644 --- a/be/src/format/table/remote_doris_reader.cpp +++ b/be/src/format/table/remote_doris_reader.cpp @@ -59,7 +59,7 @@ Status RemoteDorisReader::init_reader() { return Status::OK(); } -Status RemoteDorisReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +Status RemoteDorisReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { arrow::flight::FlightStreamChunk chunk; RETURN_DORIS_STATUS_IF_ERROR(_stream->Next().Value(&chunk)); @@ -95,11 +95,12 @@ Status RemoteDorisReader::get_next_block(Block* block, size_t* read_rows, bool* } *read_rows += num_rows; + return Status::OK(); } -Status RemoteDorisReader::get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) { +Status RemoteDorisReader::_get_columns_impl( + std::unordered_map* name_to_type) { for (const auto& slot : _file_slot_descs) { name_to_type->emplace(slot->col_name(), slot->type()); } diff --git a/be/src/format/table/remote_doris_reader.h b/be/src/format/table/remote_doris_reader.h index d8ea431fda288f..d4d6beaac345ce 100644 --- a/be/src/format/table/remote_doris_reader.h +++ b/be/src/format/table/remote_doris_reader.h @@ -27,6 +27,7 @@ #include "common/status.h" #include "format/jni/jni_reader.h" +#include "format/table/table_format_reader.h" #include "storage/olap_scan_common.h" namespace doris { @@ -38,7 +39,7 @@ class Block; namespace doris { #include "common/compile_check_begin.h" -class RemoteDorisReader : public GenericReader { +class RemoteDorisReader : public TableFormatReader { ENABLE_FACTORY_CREATOR(RemoteDorisReader); public: @@ -49,10 +50,9 @@ class RemoteDorisReader : public GenericReader { Status init_reader(); - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; - Status get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) override; + Status 
_get_columns_impl(std::unordered_map* name_to_type) override; Status close() override; @@ -64,6 +64,9 @@ class RemoteDorisReader : public GenericReader { _col_name_to_block_idx = col_name_to_block_idx; } +protected: + Status _do_init_reader(ReaderInitContext* /*ctx*/) override { return init_reader(); } + private: arrow::Status init_stream(); const TFileRangeDesc& _range; diff --git a/be/src/format/table/table_format_reader.cpp b/be/src/format/table/table_format_reader.cpp index 09144f04ebd625..fe7afaa3514bb8 100644 --- a/be/src/format/table/table_format_reader.cpp +++ b/be/src/format/table/table_format_reader.cpp @@ -17,658 +17,95 @@ #include "format/table/table_format_reader.h" -#include +#include +#include -#include -#include - -#include "common/status.h" -#include "core/block/block.h" -#include "core/data_type/data_type_array.h" -#include "core/data_type/data_type_map.h" -#include "core/data_type/data_type_struct.h" -#include "format/generic_reader.h" +#include "runtime/descriptors.h" #include "util/string_util.h" namespace doris { -#include "common/compile_check_begin.h" -const Status TableSchemaChangeHelper::BuildTableInfoUtil::SCHEMA_ERROR = Status::NotSupported( - "In the parquet/orc reader, it is not possible to read scenarios where the complex column " - "types" - "of the table and the file are inconsistent."); - -Status TableSchemaChangeHelper::BuildTableInfoUtil::by_parquet_name( - const TupleDescriptor* table_tuple_descriptor, const FieldDescriptor& parquet_field_desc, - std::shared_ptr& node, - const std::set* is_file_slot) { - auto struct_node = std::make_shared(); - auto parquet_fields_schema = parquet_field_desc.get_fields_schema(); - std::map file_column_name_idx_map; - for (size_t idx = 0; idx < parquet_fields_schema.size(); idx++) { - file_column_name_idx_map.emplace(to_lower(parquet_fields_schema[idx].name), idx); - } - - for (const auto& slot : table_tuple_descriptor->slots()) { - const auto& table_column_name = slot->col_name(); - // 
https://github.com/apache/doris/pull/23369/files - if ((is_file_slot == nullptr || is_file_slot->contains(slot->id())) && - file_column_name_idx_map.contains(table_column_name)) { - auto file_column_idx = file_column_name_idx_map[table_column_name]; - std::shared_ptr field_node = nullptr; - RETURN_IF_ERROR(by_parquet_name(slot->type(), parquet_fields_schema[file_column_idx], - field_node)); - - struct_node->add_children(table_column_name, - parquet_fields_schema[file_column_idx].name, field_node); - } else { - struct_node->add_not_exist_children(table_column_name); - } - } - - node = struct_node; - return Status::OK(); -}; - -Status TableSchemaChangeHelper::BuildTableInfoUtil::by_parquet_name( - const DataTypePtr& table_data_type, const FieldSchema& file_field, - std::shared_ptr& node) { - switch (table_data_type->get_primitive_type()) { - case TYPE_MAP: { - if (file_field.data_type->get_primitive_type() != TYPE_MAP) [[unlikely]] { - return SCHEMA_ERROR; - } - MOCK_REMOVE(DCHECK(file_field.children.size() == 2)); - std::shared_ptr key_node = nullptr; - - { - const auto& key_type = assert_cast( - assert_cast(remove_nullable(table_data_type).get()) - ->get_key_type()); - - RETURN_IF_ERROR(by_parquet_name(key_type, file_field.children[0], key_node)); - } - - std::shared_ptr value_node = nullptr; - { - const auto& value_type = assert_cast( - assert_cast(remove_nullable(table_data_type).get()) - ->get_value_type()); - - RETURN_IF_ERROR(by_parquet_name(value_type, file_field.children[1], value_node)); - } - node = std::make_shared(key_node, value_node); - break; - } - case TYPE_ARRAY: { - if (file_field.data_type->get_primitive_type() != TYPE_ARRAY) [[unlikely]] { - return SCHEMA_ERROR; - } - MOCK_REMOVE(DCHECK(file_field.children.size() == 1)); - - std::shared_ptr element_node = nullptr; - const auto& element_type = assert_cast( - assert_cast(remove_nullable(table_data_type).get()) - ->get_nested_type()); - - RETURN_IF_ERROR(by_parquet_name(element_type, 
file_field.children[0], element_node)); - - node = std::make_shared(element_node); - break; - } - case TYPE_STRUCT: { - if (file_field.data_type->get_primitive_type() != TYPE_STRUCT) [[unlikely]] { - return SCHEMA_ERROR; - } - - auto struct_node = std::make_shared(); - - const auto struct_data_type = - assert_cast(remove_nullable(table_data_type).get()); - - std::map parquet_field_names; - for (size_t idx = 0; idx < file_field.children.size(); idx++) { - parquet_field_names.emplace(to_lower(file_field.children[idx].name), idx); - } - for (size_t idx = 0; idx < struct_data_type->get_elements().size(); idx++) { - const auto& doris_field_name = struct_data_type->get_element_name(idx); - if (parquet_field_names.contains(doris_field_name)) { - auto parquet_field_idx = parquet_field_names[doris_field_name]; - std::shared_ptr field_node = nullptr; - - RETURN_IF_ERROR(by_parquet_name(struct_data_type->get_element(idx), - file_field.children[parquet_field_idx], - field_node)); - struct_node->add_children(doris_field_name, - file_field.children[parquet_field_idx].name, field_node); - } else { - struct_node->add_not_exist_children(doris_field_name); +/* static */ +Status TableFormatReader::_extract_partition_values( + const TFileRangeDesc& range, const TupleDescriptor* tuple_descriptor, + std::unordered_map>& + partition_values) { + partition_values.clear(); + if (range.__isset.columns_from_path_keys && tuple_descriptor != nullptr) { + std::unordered_map name_to_slot; + for (auto* slot : tuple_descriptor->slots()) { + name_to_slot[slot->col_name()] = slot; + } + for (size_t i = 0; i < range.columns_from_path_keys.size(); i++) { + const auto& key = range.columns_from_path_keys[i]; + const auto& value = range.columns_from_path[i]; + auto slot_it = name_to_slot.find(key); + if (slot_it != name_to_slot.end()) { + partition_values.emplace(key, std::make_tuple(value, slot_it->second)); } } - node = struct_node; - break; - } - default: { - node = std::make_shared(); - break; - } } - 
return Status::OK(); } -Status TableSchemaChangeHelper::BuildTableInfoUtil::by_orc_name( - const TupleDescriptor* table_tuple_descriptor, const orc::Type* orc_type_ptr, - std::shared_ptr& node, - const std::set* is_file_slot) { - auto struct_node = std::make_shared(); +Status TableFormatReader::on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; + RETURN_IF_ERROR( + _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values)); - std::map file_column_name_idx_map; - for (uint64_t idx = 0; idx < orc_type_ptr->getSubtypeCount(); idx++) { - // to_lower for match table column name. - file_column_name_idx_map.emplace(to_lower(orc_type_ptr->getFieldName(idx)), idx); - } - - for (const auto& slot : table_tuple_descriptor->slots()) { - const auto& table_column_name = slot->col_name(); - if ((is_file_slot == nullptr || is_file_slot->contains(slot->id())) && - file_column_name_idx_map.contains(table_column_name)) { - auto file_column_idx = file_column_name_idx_map[table_column_name]; - std::shared_ptr field_node = nullptr; - RETURN_IF_ERROR(by_orc_name(slot->type(), orc_type_ptr->getSubtype(file_column_idx), - field_node)); - struct_node->add_children(table_column_name, - orc_type_ptr->getFieldName(file_column_idx), field_node); - } else { - struct_node->add_not_exist_children(table_column_name); + for (auto& desc : *ctx->column_descs) { + if (desc.category == ColumnCategory::REGULAR || + desc.category == ColumnCategory::GENERATED) { + ctx->column_names.push_back(desc.name); } } - node = struct_node; - return Status::OK(); -} -Status TableSchemaChangeHelper::BuildTableInfoUtil::by_orc_name( - const DataTypePtr& table_data_type, const orc::Type* orc_root, - std::shared_ptr& node) { - switch (table_data_type->get_primitive_type()) { - case TYPE_MAP: { - if (orc_root->getKind() != orc::TypeKind::MAP) [[unlikely]] { - return SCHEMA_ERROR; - } - 
MOCK_REMOVE(DCHECK(orc_root->getSubtypeCount() == 2)); - - std::shared_ptr key_node = nullptr; - const auto& key_type = assert_cast( - assert_cast(remove_nullable(table_data_type).get()) - ->get_key_type()); - RETURN_IF_ERROR(by_orc_name(key_type, orc_root->getSubtype(0), key_node)); - - std::shared_ptr value_node = nullptr; - const auto& value_type = assert_cast( - assert_cast(remove_nullable(table_data_type).get()) - ->get_value_type()); - RETURN_IF_ERROR(by_orc_name(value_type, orc_root->getSubtype(1), value_node)); - node = std::make_shared(key_node, value_node); - - break; - } - case TYPE_ARRAY: { - if (orc_root->getKind() != orc::TypeKind::LIST) [[unlikely]] { - return SCHEMA_ERROR; - } - MOCK_REMOVE(DCHECK(orc_root->getSubtypeCount() == 1)); - - std::shared_ptr element_node = nullptr; - const auto& element_type = assert_cast( - assert_cast(remove_nullable(table_data_type).get()) - ->get_nested_type()); + // Build default table_info_node from file column names (case-insensitive matching). + // Subclasses (OrcReader, ParquetReader, Hive, Iceberg, etc.) override on_before_init_reader + // and build their own table_info_node AFTER calling _extract_partition_values. + // For simple readers (CSV, JSON, etc.) that do NOT override, we build it here. 
+ std::unordered_map file_columns; + RETURN_IF_ERROR(get_columns(&file_columns)); - RETURN_IF_ERROR(by_orc_name(element_type, orc_root->getSubtype(0), element_node)); - node = std::make_shared(element_node); - break; + // lowercase file name → original file name + std::unordered_map lower_to_native; + for (const auto& [name, _] : file_columns) { + lower_to_native[doris::to_lower(name)] = name; } - case TYPE_STRUCT: { - if (orc_root->getKind() != orc::TypeKind::STRUCT) [[unlikely]] { - return SCHEMA_ERROR; - } - auto struct_node = std::make_shared(); - - const auto struct_data_type = - assert_cast(remove_nullable(table_data_type).get()); - std::map orc_field_names; - for (uint64_t idx = 0; idx < orc_root->getSubtypeCount(); idx++) { - orc_field_names.emplace(to_lower(orc_root->getFieldName(idx)), idx); - } - for (size_t idx = 0; idx < struct_data_type->get_elements().size(); idx++) { - const auto& doris_field_name = struct_data_type->get_element_name(idx); - - if (orc_field_names.contains(doris_field_name)) { - std::shared_ptr field_node = nullptr; - - auto orc_field_idx = orc_field_names[doris_field_name]; - RETURN_IF_ERROR(by_orc_name(struct_data_type->get_element(idx), - orc_root->getSubtype(orc_field_idx), field_node)); - struct_node->add_children(doris_field_name, orc_root->getFieldName(orc_field_idx), - field_node); - } else { - struct_node->add_not_exist_children(doris_field_name); + // Auto-compute missing columns for simple readers (CSV/JSON/Arrow/etc.). + // Parquet/ORC readers compute their own _fill_missing_defaults in _do_init_reader. 
+ if (_column_descs) { + for (const auto& desc : *_column_descs) { + if (desc.category != ColumnCategory::REGULAR && + desc.category != ColumnCategory::GENERATED) { + continue; } - } - node = struct_node; - break; - } - default: { - node = std::make_shared(); - break; - } - } - return Status::OK(); -} - -Status TableSchemaChangeHelper::BuildTableInfoUtil::by_table_field_id( - const schema::external::TField table_schema, const schema::external::TField file_schema, - std::shared_ptr& node) { - switch (table_schema.type.type) { - case TPrimitiveType::MAP: { - if (file_schema.type.type != TPrimitiveType::MAP) [[unlikely]] { - return SCHEMA_ERROR; - } - MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.map_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.__isset.key_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.__isset.value_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.key_field.field_ptr != nullptr)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.value_field.field_ptr != nullptr)); - - MOCK_REMOVE(DCHECK(file_schema.__isset.nestedField)); - MOCK_REMOVE(DCHECK(file_schema.nestedField.__isset.map_field)); - MOCK_REMOVE(DCHECK(file_schema.nestedField.map_field.__isset.key_field)); - MOCK_REMOVE(DCHECK(file_schema.nestedField.map_field.__isset.value_field)); - MOCK_REMOVE(DCHECK(file_schema.nestedField.map_field.key_field.field_ptr != nullptr)); - MOCK_REMOVE(DCHECK(file_schema.nestedField.map_field.value_field.field_ptr != nullptr)); - - std::shared_ptr key_node = nullptr; - RETURN_IF_ERROR(by_table_field_id(*table_schema.nestedField.map_field.key_field.field_ptr, - *file_schema.nestedField.map_field.key_field.field_ptr, - key_node)); - - std::shared_ptr value_node = nullptr; - RETURN_IF_ERROR(by_table_field_id(*table_schema.nestedField.map_field.value_field.field_ptr, - *file_schema.nestedField.map_field.value_field.field_ptr, - value_node)); - - 
node = std::make_shared(key_node, value_node); - break; - } - case TPrimitiveType::ARRAY: { - if (file_schema.type.type != TPrimitiveType::ARRAY) [[unlikely]] { - return SCHEMA_ERROR; - } - - MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.array_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.array_field.__isset.item_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.array_field.item_field.field_ptr != nullptr)); - - MOCK_REMOVE(DCHECK(file_schema.__isset.nestedField)); - MOCK_REMOVE(DCHECK(file_schema.nestedField.__isset.array_field)); - MOCK_REMOVE(DCHECK(file_schema.nestedField.array_field.__isset.item_field)); - MOCK_REMOVE(DCHECK(file_schema.nestedField.array_field.item_field.field_ptr != nullptr)); - - std::shared_ptr item_node = nullptr; - RETURN_IF_ERROR(by_table_field_id( - *table_schema.nestedField.array_field.item_field.field_ptr, - *file_schema.nestedField.array_field.item_field.field_ptr, item_node)); - - node = std::make_shared(item_node); - break; - } - case TPrimitiveType::STRUCT: { - if (file_schema.type.type != TPrimitiveType::STRUCT) [[unlikely]] { - return SCHEMA_ERROR; - } - MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.struct_field)); - - MOCK_REMOVE(DCHECK(file_schema.__isset.nestedField)); - MOCK_REMOVE(DCHECK(file_schema.nestedField.__isset.struct_field)); - - RETURN_IF_ERROR(by_table_field_id(table_schema.nestedField.struct_field, - file_schema.nestedField.struct_field, node)); - break; - } - default: { - node = std::make_shared(); - break; - } - } - - return Status::OK(); -} - -Status TableSchemaChangeHelper::BuildTableInfoUtil::by_table_field_id( - const schema::external::TStructField& table_schema, - const schema::external::TStructField& file_schema, - std::shared_ptr& node) { - std::map file_field_id_to_idx; - for (size_t idx = 0; idx < file_schema.fields.size(); ++idx) { - 
file_field_id_to_idx.emplace(file_schema.fields[idx].field_ptr->id, idx); - } - auto struct_node = std::make_shared(); - - for (const auto& table_field : table_schema.fields) { - const auto& table_column_name = table_field.field_ptr->name; - - if (file_field_id_to_idx.contains(table_field.field_ptr->id)) { - const auto& file_field = - file_schema.fields.at(file_field_id_to_idx[table_field.field_ptr->id]); - - std::shared_ptr field_node = nullptr; - RETURN_IF_ERROR( - by_table_field_id(*table_field.field_ptr, *file_field.field_ptr, field_node)); - - struct_node->add_children(table_column_name, file_field.field_ptr->name, field_node); - } else { - struct_node->add_not_exist_children(table_column_name); - } - } - node = std::move(struct_node); - return Status::OK(); -} - -Status TableSchemaChangeHelper::BuildTableInfoUtil::by_parquet_field_id( - const schema::external::TField& table_schema, const FieldSchema& parquet_field, - const bool exist_field_id, std::shared_ptr& node) { - switch (table_schema.type.type) { - case TPrimitiveType::MAP: { - if (parquet_field.data_type->get_primitive_type() != TYPE_MAP) [[unlikely]] { - return SCHEMA_ERROR; - } - MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.map_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.__isset.key_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.__isset.value_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.key_field.field_ptr != nullptr)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.value_field.field_ptr != nullptr)); - - MOCK_REMOVE(DCHECK(parquet_field.children.size() == 2)); - - std::shared_ptr key_node = nullptr; - std::shared_ptr value_node = nullptr; - - RETURN_IF_ERROR(by_parquet_field_id(*table_schema.nestedField.map_field.key_field.field_ptr, - parquet_field.children[0], exist_field_id, key_node)); - - RETURN_IF_ERROR( - 
by_parquet_field_id(*table_schema.nestedField.map_field.value_field.field_ptr, - parquet_field.children[1], exist_field_id, value_node)); - - node = std::make_shared(key_node, value_node); - break; - } - case TPrimitiveType::ARRAY: { - if (parquet_field.data_type->get_primitive_type() != TYPE_ARRAY) [[unlikely]] { - return SCHEMA_ERROR; - } - MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.array_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.array_field.__isset.item_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.array_field.item_field.field_ptr != nullptr)); - - MOCK_REMOVE(DCHECK(parquet_field.children.size() == 1)); - - std::shared_ptr element_node = nullptr; - RETURN_IF_ERROR( - by_parquet_field_id(*table_schema.nestedField.array_field.item_field.field_ptr, - parquet_field.children[0], exist_field_id, element_node)); - - node = std::make_shared(element_node); - break; - } - case TPrimitiveType::STRUCT: { - if (parquet_field.data_type->get_primitive_type() != TYPE_STRUCT) [[unlikely]] { - return SCHEMA_ERROR; - } - MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.struct_field)); - - auto struct_node = std::make_shared(); - - if (exist_field_id) { - std::map file_column_id_idx_map; - for (size_t idx = 0; idx < parquet_field.children.size(); idx++) { - DCHECK_NE(parquet_field.children[idx].field_id, -1); - file_column_id_idx_map.emplace(parquet_field.children[idx].field_id, idx); + // Skip columns already handled as partition columns to avoid double-fill. 
+ if (_fill_partition_values.contains(desc.name)) { + continue; } - - for (const auto& table_field : table_schema.nestedField.struct_field.fields) { - const auto& table_column_name = table_field.field_ptr->name; - if (file_column_id_idx_map.contains(table_field.field_ptr->id)) { - const auto& file_field = parquet_field.children.at( - file_column_id_idx_map[table_field.field_ptr->id]); - std::shared_ptr field_node = nullptr; - RETURN_IF_ERROR(by_parquet_field_id(*table_field.field_ptr, file_field, - exist_field_id, field_node)); - struct_node->add_children(table_column_name, file_field.name, field_node); - } else { - struct_node->add_not_exist_children(table_column_name); - } - } - } else { - std::map file_column_idx_map; - for (size_t idx = 0; idx < parquet_field.children.size(); idx++) { - file_column_idx_map.emplace(parquet_field.children[idx].name, idx); + if (!lower_to_native.contains(doris::to_lower(desc.name))) { + _fill_missing_defaults[desc.name] = desc.default_expr; + _fill_missing_cols.insert(desc.name); } - - for (const auto& table_field : table_schema.nestedField.struct_field.fields) { - const auto& table_column_name = table_field.field_ptr->name; - if (!table_field.field_ptr->__isset.name_mapping || - table_field.field_ptr->name_mapping.size() == 0) { - return Status::DataQualityError( - "name_mapping must be set when read missing field id data file."); - } - - auto have_mapping = false; - for (const auto& mapped_name : table_field.field_ptr->name_mapping) { - if (file_column_idx_map.contains(mapped_name)) { - std::shared_ptr field_node = nullptr; - const auto& file_field = - parquet_field.children.at(file_column_idx_map.at(mapped_name)); - RETURN_IF_ERROR(by_parquet_field_id(*table_field.field_ptr, file_field, - exist_field_id, field_node)); - struct_node->add_children(table_column_name, file_field.name, field_node); - have_mapping = true; - break; - } - } - if (!have_mapping) { - struct_node->add_not_exist_children(table_column_name); - } - } - } - 
node = struct_node; - break; - } - default: { - node = std::make_shared(); - break; - } - } - return Status::OK(); -} - -Status TableSchemaChangeHelper::BuildTableInfoUtil::by_orc_field_id( - const schema::external::TField& table_schema, const orc::Type* orc_root, - const std::string& field_id_attribute_key, const bool exist_field_id, - std::shared_ptr& node) { - switch (table_schema.type.type) { - case TPrimitiveType::MAP: { - if (orc_root->getKind() != orc::TypeKind::MAP) [[unlikely]] { - return SCHEMA_ERROR; } - MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.map_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.__isset.key_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.__isset.value_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.key_field.field_ptr != nullptr)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.value_field.field_ptr != nullptr)); - - MOCK_REMOVE(DCHECK(orc_root->getSubtypeCount() == 2)); - - std::shared_ptr key_node = nullptr; - std::shared_ptr value_node = nullptr; - - RETURN_IF_ERROR(by_orc_field_id(*table_schema.nestedField.map_field.key_field.field_ptr, - orc_root->getSubtype(0), field_id_attribute_key, - exist_field_id, key_node)); - - RETURN_IF_ERROR(by_orc_field_id(*table_schema.nestedField.map_field.value_field.field_ptr, - orc_root->getSubtype(1), field_id_attribute_key, - exist_field_id, value_node)); - - node = std::make_shared(key_node, value_node); - break; } - case TPrimitiveType::ARRAY: { - if (orc_root->getKind() != orc::TypeKind::LIST) [[unlikely]] { - return SCHEMA_ERROR; - } - MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.array_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.array_field.__isset.item_field)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.array_field.item_field.field_ptr != nullptr)); - - 
MOCK_REMOVE(DCHECK(orc_root->getSubtypeCount() == 1)); - - std::shared_ptr element_node = nullptr; - RETURN_IF_ERROR(by_orc_field_id(*table_schema.nestedField.array_field.item_field.field_ptr, - orc_root->getSubtype(0), field_id_attribute_key, - exist_field_id, element_node)); - - node = std::make_shared(element_node); - break; - } - case TPrimitiveType::STRUCT: { - if (orc_root->getKind() != orc::TypeKind::STRUCT) [[unlikely]] { - return SCHEMA_ERROR; - } - MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); - MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.struct_field)); - - auto struct_node = std::make_shared(); - if (exist_field_id) { - std::map file_column_id_idx_map; - for (size_t idx = 0; idx < orc_root->getSubtypeCount(); idx++) { - auto field_id = std::stoi( - orc_root->getSubtype(idx)->getAttributeValue(field_id_attribute_key)); - file_column_id_idx_map.emplace(field_id, idx); - } - for (const auto& table_field : table_schema.nestedField.struct_field.fields) { - const auto& table_column_name = table_field.field_ptr->name; - if (file_column_id_idx_map.contains(table_field.field_ptr->id)) { - auto file_field_idx = file_column_id_idx_map[table_field.field_ptr->id]; - const auto& file_field = orc_root->getSubtype(file_field_idx); - std::shared_ptr field_node = nullptr; - RETURN_IF_ERROR(by_orc_field_id(*table_field.field_ptr, file_field, - field_id_attribute_key, exist_field_id, - field_node)); - struct_node->add_children(table_column_name, - orc_root->getFieldName(file_field_idx), field_node); - } else { - struct_node->add_not_exist_children(table_column_name); - } - } + auto info_node = std::make_shared(); + for (const auto* slot : ctx->tuple_descriptor->slots()) { + auto it = lower_to_native.find(slot->col_name_lower_case()); + if (it != lower_to_native.end()) { + info_node->add_children(slot->col_name(), it->second, + TableSchemaChangeHelper::ConstNode::get_instance()); } else { - std::map file_column_idx_map; - - for (size_t idx = 0; idx < 
orc_root->getSubtypeCount(); idx++) { - file_column_idx_map.emplace(orc_root->getFieldName(idx), idx); - } - - for (const auto& table_field : table_schema.nestedField.struct_field.fields) { - const auto& table_column_name = table_field.field_ptr->name; - if (!table_field.field_ptr->__isset.name_mapping || - table_field.field_ptr->name_mapping.size() == 0) { - return Status::DataQualityError( - "name_mapping must be set when read missing field id data file."); - } - auto have_mapping = false; - for (const auto& mapped_name : table_field.field_ptr->name_mapping) { - if (file_column_idx_map.contains(mapped_name)) { - std::shared_ptr field_node = nullptr; - auto file_field_idx = file_column_idx_map.at(mapped_name); - const auto& file_field = orc_root->getSubtype(file_field_idx); - - RETURN_IF_ERROR(by_orc_field_id(*table_field.field_ptr, file_field, - field_id_attribute_key, exist_field_id, - field_node)); - struct_node->add_children(table_column_name, - orc_root->getFieldName(file_field_idx), - field_node); - have_mapping = true; - break; - } - } - if (!have_mapping) { - struct_node->add_not_exist_children(table_column_name); - } - } + info_node->add_not_exist_children(slot->col_name()); } - - node = struct_node; - break; - } - default: { - node = std::make_shared(); - break; - } } + ctx->table_info_node = info_node; return Status::OK(); } -std::string TableSchemaChangeHelper::debug(const std::shared_ptr& root, size_t level) { - std::string ans; - - auto indent = [](size_t level) { return std::string(level * 2, ' '); }; - - std::string prefix = indent(level); - - if (std::dynamic_pointer_cast(root)) { - ans += prefix + "ScalarNode\n"; - } else if (auto struct_node = std::dynamic_pointer_cast(root)) { - ans += prefix + "StructNode\n"; - for (const auto& [table_col_name, value] : struct_node->get_children()) { - ans += indent(level + 1) + table_col_name; - if (value.exists) { - ans += " (file: " + value.column_name + ")"; - } else { - ans += " (not exists)"; - } - ans 
+= "\n"; - if (value.node) { - ans += debug(value.node, level + 2); - } - } - } else if (auto array_node = std::dynamic_pointer_cast(root)) { - ans += prefix + "ArrayNode\n"; - ans += indent(level + 1) + "Element:\n"; - ans += debug(array_node->get_element_node(), level + 2); - } else if (auto map_node = std::dynamic_pointer_cast(root)) { - ans += prefix + "MapNode\n"; - ans += indent(level + 1) + "Key:\n"; - ans += debug(map_node->get_key_node(), level + 2); - ans += indent(level + 1) + "Value:\n"; - ans += debug(map_node->get_value_node(), level + 2); - } else if (std::dynamic_pointer_cast(root)) { - ans += prefix + "ConstNode\n"; - } else { - ans += prefix + "UnknownNodeType\n"; - } - - return ans; -} -#include "common/compile_check_end.h" -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/format/table/table_format_reader.h b/be/src/format/table/table_format_reader.h index 55c93e773bed80..d889420944203a 100644 --- a/be/src/format/table/table_format_reader.h +++ b/be/src/format/table/table_format_reader.h @@ -17,406 +17,188 @@ #pragma once -#include -#include +#include #include +#include +#include +#include +#include #include "common/status.h" -#include "core/block/block.h" -#include "core/data_type/data_type_array.h" -#include "core/data_type/data_type_map.h" -#include "core/data_type/data_type_struct.h" +#include "core/column/column.h" +#include "core/column/column_nullable.h" +#include "exprs/vexpr_fwd.h" #include "format/generic_reader.h" -#include "format/parquet/schema_desc.h" -#include "runtime/runtime_profile.h" -#include "runtime/runtime_state.h" -#include "storage/olap_scan_common.h" -#include "util/string_util.h" namespace doris { class TFileRangeDesc; -class Block; +class TupleDescriptor; +class SlotDescriptor; } // namespace doris namespace doris { #include "common/compile_check_begin.h" + +/// Intermediate base class for "table readers" used by FileScanner. 
+/// +/// Owns all column-filling state and logic: +/// - partition column values (from path metadata) +/// - missing column defaults (columns not in file) +/// - synthesized column handlers (e.g. Iceberg $row_id) +/// +/// Provides default on_after_read_block that auto-fills these columns. +/// Parquet/ORC override to no-op (they fill per-batch internally). +/// +/// Also provides the default on_before_init_reader for simple readers +/// (CSV, JSON, etc.) that auto-computes partition/missing columns. +/// ORC/Parquet override on_before_init_reader with format-specific schema matching. class TableFormatReader : public GenericReader { public: - TableFormatReader(std::unique_ptr file_format_reader, RuntimeState* state, - RuntimeProfile* profile, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx, FileMetaCache* meta_cache) - : _file_format_reader(std::move(file_format_reader)), - _state(state), - _profile(profile), - _params(params), - _range(range), - _io_ctx(io_ctx) { - _meta_cache = meta_cache; - if (range.table_format_params.__isset.table_level_row_count) { - _table_level_row_count = range.table_format_params.table_level_row_count; - } else { - _table_level_row_count = -1; + /// Get missing columns computed by on_before_init_reader / get_columns(). + const std::unordered_set& missing_cols() const { return _fill_missing_cols; } + + // ---- Fill-column hooks (called by RowGroupReader and ORC per-batch reading) ---- + + /// Fill partition columns from metadata values. 
+ virtual Status on_fill_partition_columns(Block* block, size_t rows, + const std::vector& cols) { + DataTypeSerDe::FormatOptions text_format_options; + for (const auto& col_name : cols) { + auto it = _fill_partition_values.find(col_name); + if (it == _fill_partition_values.end()) { + continue; + } + auto col_ptr = block->get_by_position((*_fill_col_name_to_block_idx)[col_name]) + .column->assume_mutable(); + const auto& [value, slot_desc] = it->second; + auto text_serde = slot_desc->get_data_type_ptr()->get_serde(); + Slice slice(value.data(), value.size()); + uint64_t num_deserialized = 0; + if (text_serde->deserialize_column_from_fixed_json( + *col_ptr, slice, rows, &num_deserialized, text_format_options) != + Status::OK()) { + return Status::InternalError("Failed to fill partition column: {}={}", + slot_desc->col_name(), value); + } + if (num_deserialized != rows) { + return Status::InternalError( + "Failed to fill partition column: {}={}. " + "Expected rows: {}, actual: {}", + slot_desc->col_name(), value, num_deserialized, rows); + } } + return Status::OK(); } - ~TableFormatReader() override = default; - Status get_next_block(Block* block, size_t* read_rows, bool* eof) final { - if (_push_down_agg_type == TPushAggOp::type::COUNT && _table_level_row_count >= 0) { - auto rows = - std::min(_table_level_row_count, (int64_t)_state->query_options().batch_size); - _table_level_row_count -= rows; - auto mutate_columns = block->mutate_columns(); - for (auto& col : mutate_columns) { - col->resize(rows); + + /// Fill missing columns with default values or null. 
+ virtual Status on_fill_missing_columns(Block* block, size_t rows, + const std::vector& cols) { + for (const auto& col_name : cols) { + if (!_fill_col_name_to_block_idx->contains(col_name)) { + return Status::InternalError("Missing column: {} not found in block {}", col_name, + block->dump_structure()); } - block->set_columns(std::move(mutate_columns)); - *read_rows = rows; - if (_table_level_row_count == 0) { - *eof = true; + auto it = _fill_missing_defaults.find(col_name); + VExprContextSPtr ctx = (it != _fill_missing_defaults.end()) ? it->second : nullptr; + + if (ctx == nullptr) { + auto mutable_column = + block->get_by_position((*_fill_col_name_to_block_idx)[col_name]) + .column->assume_mutable(); + auto* nullable_column = static_cast(mutable_column.get()); + nullable_column->insert_many_defaults(rows); + } else { + ColumnPtr result_column_ptr; + RETURN_IF_ERROR(ctx->execute(block, result_column_ptr)); + if (result_column_ptr->use_count() == 1) { + auto mutable_column = result_column_ptr->assume_mutable(); + mutable_column->resize(rows); + result_column_ptr = result_column_ptr->convert_to_full_column_if_const(); + auto origin_column_type = + block->get_by_position((*_fill_col_name_to_block_idx)[col_name]).type; + bool is_nullable = origin_column_type->is_nullable(); + block->replace_by_position( + (*_fill_col_name_to_block_idx)[col_name], + is_nullable ? 
make_nullable(result_column_ptr) : result_column_ptr); + } } - - return Status::OK(); } - return get_next_block_inner(block, read_rows, eof); + return Status::OK(); } - virtual Status get_next_block_inner(Block* block, size_t* read_rows, bool* eof) = 0; - - Status get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) final { - return _file_format_reader->get_columns(name_to_type, missing_cols); - } + // ---- Synthesized column handler registry ---- - Status get_parsed_schema(std::vector* col_names, - std::vector* col_types) override { - return _file_format_reader->get_parsed_schema(col_names, col_types); - } + using SynthesizedColumnHandler = std::function; - Status set_fill_columns( - const std::unordered_map>& - partition_columns, - const std::unordered_map& missing_columns) final { - return _file_format_reader->set_fill_columns(partition_columns, missing_columns); + void register_synthesized_column_handler(const std::string& col_name, + SynthesizedColumnHandler handler) { + _synthesized_col_handlers.emplace_back(col_name, std::move(handler)); } - bool fill_all_columns() const override { return _file_format_reader->fill_all_columns(); } - - virtual Status init_row_filters() = 0; - - bool count_read_rows() override { return _file_format_reader->count_read_rows(); } - - void set_condition_cache_context(std::shared_ptr ctx) override { - _file_format_reader->set_condition_cache_context(std::move(ctx)); + Status fill_synthesized_columns(Block* block, size_t rows) { + for (auto& [name, handler] : _synthesized_col_handlers) { + RETURN_IF_ERROR(handler(block, rows)); + } + return Status::OK(); } - bool has_delete_operations() const override { - return _file_format_reader->has_delete_operations(); + /// Unified fill for partition + missing + synthesized columns. + /// Called automatically by on_after_read_block for simple readers. + /// Parquet/ORC call individual on_fill_* methods per-batch internally. 
+ Status fill_remaining_columns(Block* block, size_t rows) { + std::vector part_col_names; + for (auto& kv : _fill_partition_values) { + part_col_names.push_back(kv.first); + } + RETURN_IF_ERROR(on_fill_partition_columns(block, rows, part_col_names)); + std::vector miss_col_names; + for (auto& kv : _fill_missing_defaults) { + miss_col_names.push_back(kv.first); + } + RETURN_IF_ERROR(on_fill_missing_columns(block, rows, miss_col_names)); + RETURN_IF_ERROR(fill_synthesized_columns(block, rows)); + return Status::OK(); } - int64_t get_total_rows() const override { return _file_format_reader->get_total_rows(); } + bool has_synthesized_column_handlers() const { return !_synthesized_col_handlers.empty(); } -protected: - std::string _table_format; // hudi, iceberg, paimon - std::unique_ptr _file_format_reader; // parquet, orc - RuntimeState* _state = nullptr; // for query options - RuntimeProfile* _profile = nullptr; - const TFileScanRangeParams& _params; - const TFileRangeDesc& _range; - io::IOContext* _io_ctx = nullptr; - int64_t _table_level_row_count = -1; // for optimization of count(*) push down - void _collect_profile_before_close() override { - if (_file_format_reader != nullptr) { - _file_format_reader->collect_profile_before_close(); - } + /// Fill generated columns. Default is no-op. 
+ virtual Status on_fill_generated_columns(Block* block, size_t rows, + const std::vector& cols) { + return Status::OK(); } -}; - -class TableSchemaChangeHelper { -public: - ~TableSchemaChangeHelper() = default; - - class Node { - public: - virtual ~Node() = default; - virtual std::shared_ptr get_children_node(std::string table_column_name) const { - throw std::logic_error("get_children_node should not be called on base TableInfoNode"); - }; - virtual std::shared_ptr get_children_node_by_file_column_name( - std::string file_column_name) const { - throw std::logic_error( - "get_children_node_by_file_column_name should not be called on base " - "TableInfoNode"); - }; - - virtual std::string children_file_column_name(std::string table_column_name) const { - throw std::logic_error( - "children_file_column_name should not be called on base TableInfoNode"); - } - - virtual bool children_column_exists(std::string table_column_name) const { - throw std::logic_error( - "children_column_exists should not be called on base TableInfoNode"); - } - - virtual std::shared_ptr get_element_node() const { - throw std::logic_error("get_element_node should not be called on base TableInfoNode"); - } - - virtual std::shared_ptr get_key_node() const { - throw std::logic_error("get_key_node should not be called on base TableInfoNode"); - } - virtual std::shared_ptr get_value_node() const { - throw std::logic_error("get_value_node should not be called on base TableInfoNode"); - } - - virtual void add_not_exist_children(std::string table_column_name) { - throw std::logic_error( - "add_not_exist_children should not be called on base TableInfoNode"); - }; - - virtual void add_children(std::string table_column_name, std::string file_column_name, - std::shared_ptr children_node) { - throw std::logic_error("add_children should not be called on base TableInfoNode"); - } - }; - - class ScalarNode : public Node {}; - - class StructNode : public Node { - struct StructChild { - const std::shared_ptr 
node; - const std::string column_name; - const bool exists; - }; - - // table column name -> { node, file_column_name, exists_in_file} - std::map children; - - public: - std::shared_ptr get_children_node(std::string table_column_name) const override { - DCHECK(children.contains(table_column_name)); - DCHECK(children_column_exists(table_column_name)); - return children.at(table_column_name).node; - } - - std::shared_ptr get_children_node_by_file_column_name( - std::string file_column_name) const override { - // Search for the child by file column name - for (const auto& [table_name, child] : children) { - if (child.exists && child.column_name == file_column_name) { - return child.node; - } - } - // Not found - throw or return nullptr - throw std::runtime_error("File column name '" + file_column_name + - "' not found in struct children"); - } - - std::string children_file_column_name(std::string table_column_name) const override { - DCHECK(children.contains(table_column_name)); - DCHECK(children_column_exists(table_column_name)); - return children.at(table_column_name).column_name; - } - - bool children_column_exists(std::string table_column_name) const override { - DCHECK(children.contains(table_column_name)); - return children.at(table_column_name).exists; - } - - void add_not_exist_children(std::string table_column_name) override { - children.emplace(table_column_name, StructChild {nullptr, "", false}); - } - - void add_children(std::string table_column_name, std::string file_column_name, - std::shared_ptr children_node) override { - children.emplace(table_column_name, - StructChild {children_node, file_column_name, true}); - } - - const std::map& get_children() const { return children; } - }; - - class ArrayNode : public Node { - std::shared_ptr _element_node; - - public: - ArrayNode(const std::shared_ptr& element_node) : _element_node(element_node) {} - - std::shared_ptr get_element_node() const override { return _element_node; } - }; - - class MapNode : public 
Node { - std::shared_ptr _key_node; - std::shared_ptr _value_node; - - public: - MapNode(const std::shared_ptr& key_node, const std::shared_ptr& value_node) - : _key_node(key_node), _value_node(value_node) {} - - std::shared_ptr get_key_node() const override { return _key_node; } - - std::shared_ptr get_value_node() const override { return _value_node; } - }; - - class ConstNode : public Node { - // If you can be sure that there has been no schema change between the table and the file, - // you can use constNode (of course, you need to pay attention to case sensitivity). - public: - std::shared_ptr get_children_node(std::string table_column_name) const override { - return get_instance(); - }; - - std::shared_ptr get_children_node_by_file_column_name( - std::string file_column_name) const override { - return get_instance(); - }; - - std::string children_file_column_name(std::string table_column_name) const override { - return table_column_name; - } - - bool children_column_exists(std::string table_column_name) const override { return true; } - - std::shared_ptr get_element_node() const override { return get_instance(); } - - std::shared_ptr get_key_node() const override { return get_instance(); } - - std::shared_ptr get_value_node() const override { return get_instance(); } - - static const std::shared_ptr& get_instance() { - static const std::shared_ptr instance = std::make_shared(); - return instance; - } - }; - - static std::string debug(const std::shared_ptr& root, size_t level = 0); - -protected: - // Whenever external components invoke the Parquet/ORC reader (e.g., init_reader, get_next_block, set_fill_columns), - // the parameters passed in are based on `table column names`. - // The table_info_node_ptr assists the Parquet/ORC reader in mapping these to the actual - // `file columns name` to be read and enables min/max filtering. 
- std::shared_ptr table_info_node_ptr = std::make_shared(); + /// Default on_before_init_reader for simple readers (CSV, JSON, etc.). + /// Auto-computes partition values, missing columns, and table_info_node. + /// ORC/Parquet/Hive/Iceberg override with format-specific schema matching. + Status on_before_init_reader(ReaderInitContext* ctx) override; protected: - Status gen_table_info_node_by_field_id(const TFileScanRangeParams& params, - int64_t split_schema_id, - const TupleDescriptor* tuple_descriptor, - const FieldDescriptor& parquet_field_desc) { - if (!params.__isset.history_schema_info) [[unlikely]] { - RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name( - tuple_descriptor, parquet_field_desc, table_info_node_ptr)); - return Status::OK(); + /// Default on_after_read_block: auto-fill partition/missing/synthesized columns. + /// Parquet/ORC override to no-op (they fill per-batch internally). + Status on_after_read_block(Block* block, size_t* read_rows) override { + if (*read_rows > 0 && _push_down_agg_type != TPushAggOp::type::COUNT) { + RETURN_IF_ERROR(fill_remaining_columns(block, *read_rows)); } - return gen_table_info_node_by_field_id(params, split_schema_id); - } - - Status gen_table_info_node_by_field_id(const TFileScanRangeParams& params, - int64_t split_schema_id, - const TupleDescriptor* tuple_descriptor, - const orc::Type* orc_type_ptr) { - if (!params.__isset.history_schema_info) [[unlikely]] { - RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(tuple_descriptor, orc_type_ptr, - table_info_node_ptr)); - return Status::OK(); - } - return gen_table_info_node_by_field_id(params, split_schema_id); - } - -private: - // The filed id of both the table and the file come from the pass from fe. 
(params.history_schema_info) - Status gen_table_info_node_by_field_id(const TFileScanRangeParams& params, - int64_t split_schema_id) { - if (params.current_schema_id == split_schema_id) { - table_info_node_ptr = ConstNode::get_instance(); - return Status::OK(); - } - - int32_t table_schema_idx = -1; - int32_t file_schema_idx = -1; - //todo : Perhaps this process can be optimized by pre-generating a map - for (int32_t idx = 0; idx < params.history_schema_info.size(); idx++) { - if (params.history_schema_info[idx].schema_id == params.current_schema_id) { - table_schema_idx = idx; - } else if (params.history_schema_info[idx].schema_id == split_schema_id) { - file_schema_idx = idx; - } - } - - if (table_schema_idx == -1 || file_schema_idx == -1) [[unlikely]] { - return Status::InternalError( - "miss table/file schema info, table_schema_idx:{} file_schema_idx:{}", - table_schema_idx, file_schema_idx); - } - RETURN_IF_ERROR(BuildTableInfoUtil::by_table_field_id( - params.history_schema_info.at(table_schema_idx).root_field, - params.history_schema_info.at(file_schema_idx).root_field, table_info_node_ptr)); return Status::OK(); } -public: - /* Schema change Util. Used to generate `std::shared_ptr node`. - Passed node to parquet/orc reader to find file columns based on table columns, - */ - struct BuildTableInfoUtil { - static const Status SCHEMA_ERROR; - - // todo : Maybe I can use templates to implement this functionality. - - // for hive parquet : The table column names passed from fe are lowercase, so use lowercase file column names to match table column names. 
- static Status by_parquet_name(const TupleDescriptor* table_tuple_descriptor, - const FieldDescriptor& parquet_field_desc, - std::shared_ptr& node, - const std::set* is_file_slot = nullptr); - - // for hive parquet - static Status by_parquet_name(const DataTypePtr& table_data_type, - const FieldSchema& file_field, - std::shared_ptr& node); - - // for hive orc: The table column names passed from fe are lowercase, so use lowercase file column names to match table column names. - static Status by_orc_name(const TupleDescriptor* table_tuple_descriptor, - const orc::Type* orc_type_ptr, - std::shared_ptr& node, - const std::set* is_file_slot = nullptr); - // for hive orc - static Status by_orc_name(const DataTypePtr& table_data_type, const orc::Type* orc_root, - std::shared_ptr& node); - - // for paimon hudi: Use the field id in the `table schema` and `history table schema` to match columns. - static Status by_table_field_id(const schema::external::TField table_schema, - const schema::external::TField file_schema, - std::shared_ptr& node); - - // for paimon hudi - static Status by_table_field_id(const schema::external::TStructField& table_schema, - const schema::external::TStructField& file_schema, - std::shared_ptr& node); - - // for iceberg parquet - static Status by_parquet_field_id(const schema::external::TField& table_schema, - const FieldSchema& parquet_field, - const bool exist_field_id, - std::shared_ptr& node); - - // for iceberg orc - static Status by_orc_field_id(const schema::external::TField& table_schema, - const orc::Type* orc_root, - const std::string& field_id_attribute_key, - const bool exist_field_id, - std::shared_ptr& node); - }; -}; - -struct ColumnIdResult { - std::set column_ids; - std::set filter_column_ids; - - ColumnIdResult() = default; // Add default constructor - - ColumnIdResult(std::set column_ids_, std::set filter_column_ids_) - : column_ids(std::move(column_ids_)), - filter_column_ids(std::move(filter_column_ids_)) {} + /// Extracts 
partition key→value pairs from the file range. + /// Static utility called by on_before_init_reader implementations. + static Status _extract_partition_values( + const TFileRangeDesc& range, const TupleDescriptor* tuple_descriptor, + std::unordered_map>& + partition_values); + + // ---- Fill column data (set by on_before_init_reader / _do_init_reader) ---- + std::unordered_map> + _fill_partition_values; + std::unordered_map _fill_missing_defaults; + std::unordered_map* _fill_col_name_to_block_idx = nullptr; + std::unordered_set _fill_missing_cols; + + // ---- Synthesized column handlers ---- + std::vector> _synthesized_col_handlers; }; #include "common/compile_check_end.h" diff --git a/be/src/format/table/table_schema_change_helper.cpp b/be/src/format/table/table_schema_change_helper.cpp new file mode 100644 index 00000000000000..48492b58dad967 --- /dev/null +++ b/be/src/format/table/table_schema_change_helper.cpp @@ -0,0 +1,657 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "table_schema_change_helper.h" + +#include + +#include +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_map.h" +#include "core/data_type/data_type_struct.h" +#include "format/generic_reader.h" +#include "util/string_util.h" + +namespace doris { +#include "common/compile_check_begin.h" +const Status TableSchemaChangeHelper::BuildTableInfoUtil::SCHEMA_ERROR = Status::NotSupported( + "In the parquet/orc reader, it is not possible to read scenarios where the complex column " + "types" + "of the table and the file are inconsistent."); + +Status TableSchemaChangeHelper::BuildTableInfoUtil::by_parquet_name( + const TupleDescriptor* table_tuple_descriptor, const FieldDescriptor& parquet_field_desc, + std::shared_ptr& node, + const std::set* is_file_slot) { + auto struct_node = std::make_shared(); + auto parquet_fields_schema = parquet_field_desc.get_fields_schema(); + std::map file_column_name_idx_map; + for (size_t idx = 0; idx < parquet_fields_schema.size(); idx++) { + file_column_name_idx_map.emplace(to_lower(parquet_fields_schema[idx].name), idx); + } + + for (const auto& slot : table_tuple_descriptor->slots()) { + const auto& table_column_name = slot->col_name(); + // https://github.com/apache/doris/pull/23369/files + if ((is_file_slot == nullptr || is_file_slot->contains(slot->id())) && + file_column_name_idx_map.contains(table_column_name)) { + auto file_column_idx = file_column_name_idx_map[table_column_name]; + std::shared_ptr field_node = nullptr; + RETURN_IF_ERROR(by_parquet_name(slot->type(), parquet_fields_schema[file_column_idx], + field_node)); + + struct_node->add_children(table_column_name, + parquet_fields_schema[file_column_idx].name, field_node); + } else { + struct_node->add_not_exist_children(table_column_name); + } + } + + node = struct_node; + return Status::OK(); +}; + +Status 
TableSchemaChangeHelper::BuildTableInfoUtil::by_parquet_name( + const DataTypePtr& table_data_type, const FieldSchema& file_field, + std::shared_ptr& node) { + switch (table_data_type->get_primitive_type()) { + case TYPE_MAP: { + if (file_field.data_type->get_primitive_type() != TYPE_MAP) [[unlikely]] { + return SCHEMA_ERROR; + } + MOCK_REMOVE(DCHECK(file_field.children.size() == 2)); + std::shared_ptr key_node = nullptr; + + { + const auto& key_type = assert_cast( + assert_cast(remove_nullable(table_data_type).get()) + ->get_key_type()); + + RETURN_IF_ERROR(by_parquet_name(key_type, file_field.children[0], key_node)); + } + + std::shared_ptr value_node = nullptr; + { + const auto& value_type = assert_cast( + assert_cast(remove_nullable(table_data_type).get()) + ->get_value_type()); + + RETURN_IF_ERROR(by_parquet_name(value_type, file_field.children[1], value_node)); + } + node = std::make_shared(key_node, value_node); + break; + } + case TYPE_ARRAY: { + if (file_field.data_type->get_primitive_type() != TYPE_ARRAY) [[unlikely]] { + return SCHEMA_ERROR; + } + MOCK_REMOVE(DCHECK(file_field.children.size() == 1)); + + std::shared_ptr element_node = nullptr; + const auto& element_type = assert_cast( + assert_cast(remove_nullable(table_data_type).get()) + ->get_nested_type()); + + RETURN_IF_ERROR(by_parquet_name(element_type, file_field.children[0], element_node)); + + node = std::make_shared(element_node); + break; + } + case TYPE_STRUCT: { + if (file_field.data_type->get_primitive_type() != TYPE_STRUCT) [[unlikely]] { + return SCHEMA_ERROR; + } + + auto struct_node = std::make_shared(); + + const auto struct_data_type = + assert_cast(remove_nullable(table_data_type).get()); + + std::map parquet_field_names; + for (size_t idx = 0; idx < file_field.children.size(); idx++) { + parquet_field_names.emplace(to_lower(file_field.children[idx].name), idx); + } + for (size_t idx = 0; idx < struct_data_type->get_elements().size(); idx++) { + const auto& doris_field_name = 
struct_data_type->get_element_name(idx); + + if (parquet_field_names.contains(doris_field_name)) { + auto parquet_field_idx = parquet_field_names[doris_field_name]; + std::shared_ptr field_node = nullptr; + + RETURN_IF_ERROR(by_parquet_name(struct_data_type->get_element(idx), + file_field.children[parquet_field_idx], + field_node)); + struct_node->add_children(doris_field_name, + file_field.children[parquet_field_idx].name, field_node); + } else { + struct_node->add_not_exist_children(doris_field_name); + } + } + node = struct_node; + break; + } + default: { + node = std::make_shared(); + break; + } + } + + return Status::OK(); +} + +Status TableSchemaChangeHelper::BuildTableInfoUtil::by_orc_name( + const TupleDescriptor* table_tuple_descriptor, const orc::Type* orc_type_ptr, + std::shared_ptr& node, + const std::set* is_file_slot) { + auto struct_node = std::make_shared(); + + std::map file_column_name_idx_map; + for (uint64_t idx = 0; idx < orc_type_ptr->getSubtypeCount(); idx++) { + // to_lower for match table column name. 
+ file_column_name_idx_map.emplace(to_lower(orc_type_ptr->getFieldName(idx)), idx); + } + + for (const auto& slot : table_tuple_descriptor->slots()) { + const auto& table_column_name = slot->col_name(); + if ((is_file_slot == nullptr || is_file_slot->contains(slot->id())) && + file_column_name_idx_map.contains(table_column_name)) { + auto file_column_idx = file_column_name_idx_map[table_column_name]; + std::shared_ptr field_node = nullptr; + RETURN_IF_ERROR(by_orc_name(slot->type(), orc_type_ptr->getSubtype(file_column_idx), + field_node)); + struct_node->add_children(table_column_name, + orc_type_ptr->getFieldName(file_column_idx), field_node); + } else { + struct_node->add_not_exist_children(table_column_name); + } + } + node = struct_node; + return Status::OK(); +} + +Status TableSchemaChangeHelper::BuildTableInfoUtil::by_orc_name( + const DataTypePtr& table_data_type, const orc::Type* orc_root, + std::shared_ptr& node) { + switch (table_data_type->get_primitive_type()) { + case TYPE_MAP: { + if (orc_root->getKind() != orc::TypeKind::MAP) [[unlikely]] { + return SCHEMA_ERROR; + } + MOCK_REMOVE(DCHECK(orc_root->getSubtypeCount() == 2)); + + std::shared_ptr key_node = nullptr; + const auto& key_type = assert_cast( + assert_cast(remove_nullable(table_data_type).get()) + ->get_key_type()); + RETURN_IF_ERROR(by_orc_name(key_type, orc_root->getSubtype(0), key_node)); + + std::shared_ptr value_node = nullptr; + const auto& value_type = assert_cast( + assert_cast(remove_nullable(table_data_type).get()) + ->get_value_type()); + RETURN_IF_ERROR(by_orc_name(value_type, orc_root->getSubtype(1), value_node)); + node = std::make_shared(key_node, value_node); + + break; + } + case TYPE_ARRAY: { + if (orc_root->getKind() != orc::TypeKind::LIST) [[unlikely]] { + return SCHEMA_ERROR; + } + MOCK_REMOVE(DCHECK(orc_root->getSubtypeCount() == 1)); + + std::shared_ptr element_node = nullptr; + const auto& element_type = assert_cast( + 
assert_cast(remove_nullable(table_data_type).get()) + ->get_nested_type()); + + RETURN_IF_ERROR(by_orc_name(element_type, orc_root->getSubtype(0), element_node)); + node = std::make_shared(element_node); + break; + } + case TYPE_STRUCT: { + if (orc_root->getKind() != orc::TypeKind::STRUCT) [[unlikely]] { + return SCHEMA_ERROR; + } + auto struct_node = std::make_shared(); + + const auto struct_data_type = + assert_cast(remove_nullable(table_data_type).get()); + std::map orc_field_names; + for (uint64_t idx = 0; idx < orc_root->getSubtypeCount(); idx++) { + orc_field_names.emplace(to_lower(orc_root->getFieldName(idx)), idx); + } + + for (size_t idx = 0; idx < struct_data_type->get_elements().size(); idx++) { + const auto& doris_field_name = struct_data_type->get_element_name(idx); + + if (orc_field_names.contains(doris_field_name)) { + std::shared_ptr field_node = nullptr; + + auto orc_field_idx = orc_field_names[doris_field_name]; + RETURN_IF_ERROR(by_orc_name(struct_data_type->get_element(idx), + orc_root->getSubtype(orc_field_idx), field_node)); + struct_node->add_children(doris_field_name, orc_root->getFieldName(orc_field_idx), + field_node); + } else { + struct_node->add_not_exist_children(doris_field_name); + } + } + node = struct_node; + break; + } + default: { + node = std::make_shared(); + break; + } + } + return Status::OK(); +} + +Status TableSchemaChangeHelper::BuildTableInfoUtil::by_table_field_id( + const schema::external::TField table_schema, const schema::external::TField file_schema, + std::shared_ptr& node) { + switch (table_schema.type.type) { + case TPrimitiveType::MAP: { + if (file_schema.type.type != TPrimitiveType::MAP) [[unlikely]] { + return SCHEMA_ERROR; + } + MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.map_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.__isset.key_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.__isset.value_field)); + 
MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.key_field.field_ptr != nullptr)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.value_field.field_ptr != nullptr)); + + MOCK_REMOVE(DCHECK(file_schema.__isset.nestedField)); + MOCK_REMOVE(DCHECK(file_schema.nestedField.__isset.map_field)); + MOCK_REMOVE(DCHECK(file_schema.nestedField.map_field.__isset.key_field)); + MOCK_REMOVE(DCHECK(file_schema.nestedField.map_field.__isset.value_field)); + MOCK_REMOVE(DCHECK(file_schema.nestedField.map_field.key_field.field_ptr != nullptr)); + MOCK_REMOVE(DCHECK(file_schema.nestedField.map_field.value_field.field_ptr != nullptr)); + + std::shared_ptr key_node = nullptr; + RETURN_IF_ERROR(by_table_field_id(*table_schema.nestedField.map_field.key_field.field_ptr, + *file_schema.nestedField.map_field.key_field.field_ptr, + key_node)); + + std::shared_ptr value_node = nullptr; + RETURN_IF_ERROR(by_table_field_id(*table_schema.nestedField.map_field.value_field.field_ptr, + *file_schema.nestedField.map_field.value_field.field_ptr, + value_node)); + + node = std::make_shared(key_node, value_node); + break; + } + case TPrimitiveType::ARRAY: { + if (file_schema.type.type != TPrimitiveType::ARRAY) [[unlikely]] { + return SCHEMA_ERROR; + } + + MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.array_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.array_field.__isset.item_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.array_field.item_field.field_ptr != nullptr)); + + MOCK_REMOVE(DCHECK(file_schema.__isset.nestedField)); + MOCK_REMOVE(DCHECK(file_schema.nestedField.__isset.array_field)); + MOCK_REMOVE(DCHECK(file_schema.nestedField.array_field.__isset.item_field)); + MOCK_REMOVE(DCHECK(file_schema.nestedField.array_field.item_field.field_ptr != nullptr)); + + std::shared_ptr item_node = nullptr; + RETURN_IF_ERROR(by_table_field_id( + *table_schema.nestedField.array_field.item_field.field_ptr, + 
*file_schema.nestedField.array_field.item_field.field_ptr, item_node)); + + node = std::make_shared(item_node); + break; + } + case TPrimitiveType::STRUCT: { + if (file_schema.type.type != TPrimitiveType::STRUCT) [[unlikely]] { + return SCHEMA_ERROR; + } + MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.struct_field)); + + MOCK_REMOVE(DCHECK(file_schema.__isset.nestedField)); + MOCK_REMOVE(DCHECK(file_schema.nestedField.__isset.struct_field)); + + RETURN_IF_ERROR(by_table_field_id(table_schema.nestedField.struct_field, + file_schema.nestedField.struct_field, node)); + break; + } + default: { + node = std::make_shared(); + break; + } + } + + return Status::OK(); +} + +Status TableSchemaChangeHelper::BuildTableInfoUtil::by_table_field_id( + const schema::external::TStructField& table_schema, + const schema::external::TStructField& file_schema, + std::shared_ptr& node) { + std::map file_field_id_to_idx; + for (size_t idx = 0; idx < file_schema.fields.size(); ++idx) { + file_field_id_to_idx.emplace(file_schema.fields[idx].field_ptr->id, idx); + } + auto struct_node = std::make_shared(); + + for (const auto& table_field : table_schema.fields) { + const auto& table_column_name = table_field.field_ptr->name; + + if (file_field_id_to_idx.contains(table_field.field_ptr->id)) { + const auto& file_field = + file_schema.fields.at(file_field_id_to_idx[table_field.field_ptr->id]); + + std::shared_ptr field_node = nullptr; + RETURN_IF_ERROR( + by_table_field_id(*table_field.field_ptr, *file_field.field_ptr, field_node)); + + struct_node->add_children(table_column_name, file_field.field_ptr->name, field_node); + } else { + struct_node->add_not_exist_children(table_column_name); + } + } + node = std::move(struct_node); + return Status::OK(); +} + +Status TableSchemaChangeHelper::BuildTableInfoUtil::by_parquet_field_id( + const schema::external::TStructField& table_schema, + const FieldDescriptor& parquet_field_desc, + 
std::shared_ptr& node, bool& exist_field_id) { + auto struct_node = std::make_shared(); + auto parquet_fields_schema = parquet_field_desc.get_fields_schema(); + std::map file_column_id_idx_map; + for (size_t idx = 0; idx < parquet_fields_schema.size(); idx++) { + if (parquet_fields_schema[idx].field_id == -1) { + exist_field_id = false; + return Status::OK(); + } else { + file_column_id_idx_map.emplace(parquet_fields_schema[idx].field_id, idx); + } + } + + for (const auto& table_field : table_schema.fields) { + const auto& table_column_name = table_field.field_ptr->name; + + if (file_column_id_idx_map.contains(table_field.field_ptr->id)) { + auto file_column_idx = file_column_id_idx_map[table_field.field_ptr->id]; + std::shared_ptr field_node = nullptr; + RETURN_IF_ERROR(by_parquet_field_id(*table_field.field_ptr, + parquet_fields_schema[file_column_idx], field_node, + exist_field_id)); + struct_node->add_children(table_column_name, + parquet_fields_schema[file_column_idx].name, field_node); + } else { + struct_node->add_not_exist_children(table_column_name); + } + } + + node = struct_node; + return Status::OK(); +} + +Status TableSchemaChangeHelper::BuildTableInfoUtil::by_parquet_field_id( + const schema::external::TField& table_schema, const FieldSchema& parquet_field, + std::shared_ptr& node, bool& exist_field_id) { + switch (table_schema.type.type) { + case TPrimitiveType::MAP: { + if (parquet_field.data_type->get_primitive_type() != TYPE_MAP) [[unlikely]] { + return SCHEMA_ERROR; + } + MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.map_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.__isset.key_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.__isset.value_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.key_field.field_ptr != nullptr)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.value_field.field_ptr != nullptr)); + + 
MOCK_REMOVE(DCHECK(parquet_field.children.size() == 2)); + + std::shared_ptr key_node = nullptr; + std::shared_ptr value_node = nullptr; + + RETURN_IF_ERROR(by_parquet_field_id(*table_schema.nestedField.map_field.key_field.field_ptr, + parquet_field.children[0], key_node, exist_field_id)); + + RETURN_IF_ERROR( + by_parquet_field_id(*table_schema.nestedField.map_field.value_field.field_ptr, + parquet_field.children[1], value_node, exist_field_id)); + + node = std::make_shared(key_node, value_node); + break; + } + case TPrimitiveType::ARRAY: { + if (parquet_field.data_type->get_primitive_type() != TYPE_ARRAY) [[unlikely]] { + return SCHEMA_ERROR; + } + MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.array_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.array_field.__isset.item_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.array_field.item_field.field_ptr != nullptr)); + + MOCK_REMOVE(DCHECK(parquet_field.children.size() == 1)); + + std::shared_ptr element_node = nullptr; + RETURN_IF_ERROR( + by_parquet_field_id(*table_schema.nestedField.array_field.item_field.field_ptr, + parquet_field.children[0], element_node, exist_field_id)); + + node = std::make_shared(element_node); + break; + } + case TPrimitiveType::STRUCT: { + if (parquet_field.data_type->get_primitive_type() != TYPE_STRUCT) [[unlikely]] { + return SCHEMA_ERROR; + } + MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.struct_field)); + + auto struct_node = std::make_shared(); + + std::map file_column_id_idx_map; + for (size_t idx = 0; idx < parquet_field.children.size(); idx++) { + if (parquet_field.children[idx].field_id == -1) { + exist_field_id = false; + return Status::OK(); + } else { + file_column_id_idx_map.emplace(parquet_field.children[idx].field_id, idx); + } + } + + for (const auto& table_field : table_schema.nestedField.struct_field.fields) { + const auto& 
table_column_name = table_field.field_ptr->name; + if (file_column_id_idx_map.contains(table_field.field_ptr->id)) { + const auto& file_field = parquet_field.children.at( + file_column_id_idx_map[table_field.field_ptr->id]); + std::shared_ptr field_node = nullptr; + RETURN_IF_ERROR(by_parquet_field_id(*table_field.field_ptr, file_field, field_node, + exist_field_id)); + struct_node->add_children(table_column_name, file_field.name, field_node); + } else { + struct_node->add_not_exist_children(table_column_name); + } + } + node = struct_node; + break; + } + default: { + node = std::make_shared(); + break; + } + } + return Status::OK(); +} + +Status TableSchemaChangeHelper::BuildTableInfoUtil::by_orc_field_id( + const schema::external::TStructField& table_schema, const orc::Type* orc_root, + const std::string& field_id_attribute_key, + std::shared_ptr& node, bool& exist_field_id) { + auto struct_node = std::make_shared(); + + std::map file_column_id_idx_map; + for (size_t idx = 0; idx < orc_root->getSubtypeCount(); idx++) { + if (orc_root->getSubtype(idx)->hasAttributeKey(field_id_attribute_key)) { + auto field_id = + std::stoi(orc_root->getSubtype(idx)->getAttributeValue(field_id_attribute_key)); + file_column_id_idx_map.emplace(field_id, idx); + } else { + exist_field_id = false; + return Status::OK(); + } + } + + for (const auto& table_field : table_schema.fields) { + const auto& table_column_name = table_field.field_ptr->name; + if (file_column_id_idx_map.contains(table_field.field_ptr->id)) { + auto file_field_idx = file_column_id_idx_map[table_field.field_ptr->id]; + const auto& file_field = orc_root->getSubtype(file_field_idx); + std::shared_ptr field_node = nullptr; + RETURN_IF_ERROR(by_orc_field_id(*table_field.field_ptr, file_field, + field_id_attribute_key, field_node, exist_field_id)); + struct_node->add_children(table_column_name, orc_root->getFieldName(file_field_idx), + field_node); + } else { + struct_node->add_not_exist_children(table_column_name); + 
} + } + node = struct_node; + return Status::OK(); +} + +Status TableSchemaChangeHelper::BuildTableInfoUtil::by_orc_field_id( + const schema::external::TField& table_schema, const orc::Type* orc_root, + const std::string& field_id_attribute_key, + std::shared_ptr& node, bool& exist_field_id) { + switch (table_schema.type.type) { + case TPrimitiveType::MAP: { + if (orc_root->getKind() != orc::TypeKind::MAP) [[unlikely]] { + return SCHEMA_ERROR; + } + MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.map_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.__isset.key_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.__isset.value_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.key_field.field_ptr != nullptr)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.map_field.value_field.field_ptr != nullptr)); + + MOCK_REMOVE(DCHECK(orc_root->getSubtypeCount() == 2)); + + std::shared_ptr key_node = nullptr; + std::shared_ptr value_node = nullptr; + + RETURN_IF_ERROR(by_orc_field_id(*table_schema.nestedField.map_field.key_field.field_ptr, + orc_root->getSubtype(0), field_id_attribute_key, key_node, + exist_field_id)); + + RETURN_IF_ERROR(by_orc_field_id(*table_schema.nestedField.map_field.value_field.field_ptr, + orc_root->getSubtype(1), field_id_attribute_key, value_node, + exist_field_id)); + + node = std::make_shared(key_node, value_node); + break; + } + case TPrimitiveType::ARRAY: { + if (orc_root->getKind() != orc::TypeKind::LIST) [[unlikely]] { + return SCHEMA_ERROR; + } + MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.array_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.array_field.__isset.item_field)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.array_field.item_field.field_ptr != nullptr)); + + MOCK_REMOVE(DCHECK(orc_root->getSubtypeCount() == 1)); + + std::shared_ptr element_node = nullptr; + 
RETURN_IF_ERROR(by_orc_field_id(*table_schema.nestedField.array_field.item_field.field_ptr, + orc_root->getSubtype(0), field_id_attribute_key, + element_node, exist_field_id)); + + node = std::make_shared(element_node); + break; + } + case TPrimitiveType::STRUCT: { + if (orc_root->getKind() != orc::TypeKind::STRUCT) [[unlikely]] { + return SCHEMA_ERROR; + } + MOCK_REMOVE(DCHECK(table_schema.__isset.nestedField)); + MOCK_REMOVE(DCHECK(table_schema.nestedField.__isset.struct_field)); + RETURN_IF_ERROR(by_orc_field_id(table_schema.nestedField.struct_field, orc_root, + field_id_attribute_key, node, exist_field_id)); + + break; + } + default: { + node = std::make_shared(); + break; + } + } + + return Status::OK(); +} + +std::string TableSchemaChangeHelper::debug(const std::shared_ptr& root, size_t level) { + std::string ans; + + auto indent = [](size_t level) { return std::string(level * 2, ' '); }; + + std::string prefix = indent(level); + + if (std::dynamic_pointer_cast(root)) { + ans += prefix + "ScalarNode\n"; + } else if (auto struct_node = std::dynamic_pointer_cast(root)) { + ans += prefix + "StructNode\n"; + for (const auto& [table_col_name, value] : struct_node->get_children()) { + ans += indent(level + 1) + table_col_name; + if (value.exists) { + ans += " (file: " + value.column_name + ")"; + } else { + ans += " (not exists)"; + } + ans += "\n"; + if (value.node) { + ans += debug(value.node, level + 2); + } + } + } else if (auto array_node = std::dynamic_pointer_cast(root)) { + ans += prefix + "ArrayNode\n"; + ans += indent(level + 1) + "Element:\n"; + ans += debug(array_node->get_element_node(), level + 2); + } else if (auto map_node = std::dynamic_pointer_cast(root)) { + ans += prefix + "MapNode\n"; + ans += indent(level + 1) + "Key:\n"; + ans += debug(map_node->get_key_node(), level + 2); + ans += indent(level + 1) + "Value:\n"; + ans += debug(map_node->get_value_node(), level + 2); + } else if (std::dynamic_pointer_cast(root)) { + ans += prefix + 
"ConstNode\n"; + } else { + ans += prefix + "UnknownNodeType\n"; + } + + return ans; +} +#include "common/compile_check_end.h" +} // namespace doris diff --git a/be/src/format/table/table_schema_change_helper.h b/be/src/format/table/table_schema_change_helper.h new file mode 100644 index 00000000000000..4e3425b676b0d4 --- /dev/null +++ b/be/src/format/table/table_schema_change_helper.h @@ -0,0 +1,350 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_map.h" +#include "core/data_type/data_type_struct.h" +#include "format/parquet/schema_desc.h" +#include "runtime/runtime_profile.h" +#include "runtime/runtime_state.h" +#include "storage/olap_common.h" +#include "util/string_util.h" + +namespace doris { +class TFileRangeDesc; +class Block; +} // namespace doris + +namespace doris { +#include "common/compile_check_begin.h" + +class TableSchemaChangeHelper { +public: + ~TableSchemaChangeHelper() = default; + + class Node { + public: + virtual ~Node() = default; + virtual std::shared_ptr get_children_node(std::string table_column_name) const { + throw std::logic_error("get_children_node should not be called on base TableInfoNode"); + }; + + virtual std::shared_ptr get_children_node_by_file_column_name( + std::string file_column_name) const { + throw std::logic_error( + "get_children_node_by_file_column_name should not be called on base " + "TableInfoNode"); + }; + + virtual std::string children_file_column_name(std::string table_column_name) const { + throw std::logic_error( + "children_file_column_name should not be called on base TableInfoNode"); + } + + virtual bool children_column_exists(std::string table_column_name) const { + throw std::logic_error( + "children_column_exists should not be called on base TableInfoNode"); + } + + virtual std::shared_ptr get_element_node() const { + throw std::logic_error("get_element_node should not be called on base TableInfoNode"); + } + + virtual std::shared_ptr get_key_node() const { + throw std::logic_error("get_key_node should not be called on base TableInfoNode"); + } + virtual std::shared_ptr get_value_node() const { + throw std::logic_error("get_value_node should not be called on base TableInfoNode"); + } + + virtual void add_not_exist_children(std::string table_column_name) { + throw 
std::logic_error( + "add_not_exist_children should not be called on base TableInfoNode"); + }; + + virtual void add_children(std::string table_column_name, std::string file_column_name, + std::shared_ptr children_node) { + throw std::logic_error("add_children should not be called on base TableInfoNode"); + } + }; + + class ConstNode : public Node { + // If you can be sure that there has been no schema change between the table and the file, + // you can use constNode (of course, you need to pay attention to case sensitivity). + public: + std::shared_ptr get_children_node(std::string table_column_name) const override { + return get_instance(); + }; + + std::shared_ptr get_children_node_by_file_column_name( + std::string file_column_name) const override { + return get_instance(); + }; + + std::string children_file_column_name(std::string table_column_name) const override { + return table_column_name; + } + + bool children_column_exists(std::string table_column_name) const override { return true; } + + std::shared_ptr get_element_node() const override { return get_instance(); } + + std::shared_ptr get_key_node() const override { return get_instance(); } + + std::shared_ptr get_value_node() const override { return get_instance(); } + + static const std::shared_ptr& get_instance() { + static const std::shared_ptr instance = std::make_shared(); + return instance; + } + }; + + // ScalarNode inherits from ConstNode so that unexpected calls to + // get_element_node / get_key_node / get_value_node (e.g. on schema + // mismatch where the file has a complex type but the table has a + // scalar) are handled safely instead of crashing. 
+ class ScalarNode : public ConstNode {}; + + class StructNode : public Node { + struct StructChild { + const std::shared_ptr node; + const std::string column_name; + const bool exists; + }; + + // table column name -> { node, file_column_name, exists_in_file} + std::map children; + + public: + std::shared_ptr get_children_node(std::string table_column_name) const override { + DCHECK(children.contains(table_column_name)); + DCHECK(children_column_exists(table_column_name)); + return children.at(table_column_name).node; + } + + std::shared_ptr get_children_node_by_file_column_name( + std::string file_column_name) const override { + // Search for the child by file column name + for (const auto& [table_name, child] : children) { + if (child.exists && child.column_name == file_column_name) { + return child.node; + } + } + // Not found - throw or return nullptr + throw std::runtime_error("File column name '" + file_column_name + + "' not found in struct children"); + } + + std::string children_file_column_name(std::string table_column_name) const override { + DCHECK(children.contains(table_column_name)); + DCHECK(children_column_exists(table_column_name)); + return children.at(table_column_name).column_name; + } + + bool children_column_exists(std::string table_column_name) const override { + DCHECK(children.contains(table_column_name)); + return children.at(table_column_name).exists; + } + + void add_not_exist_children(std::string table_column_name) override { + children.emplace(table_column_name, StructChild {nullptr, "", false}); + } + + void add_children(std::string table_column_name, std::string file_column_name, + std::shared_ptr children_node) override { + children.emplace(table_column_name, + StructChild {children_node, file_column_name, true}); + } + + const std::map& get_children() const { return children; } + }; + + class ArrayNode : public Node { + std::shared_ptr _element_node; + + public: + ArrayNode(const std::shared_ptr& element_node) : 
_element_node(element_node) {} + + std::shared_ptr get_element_node() const override { return _element_node; } + }; + + class MapNode : public Node { + std::shared_ptr _key_node; + std::shared_ptr _value_node; + + public: + MapNode(const std::shared_ptr& key_node, const std::shared_ptr& value_node) + : _key_node(key_node), _value_node(value_node) {} + + std::shared_ptr get_key_node() const override { return _key_node; } + + std::shared_ptr get_value_node() const override { return _value_node; } + }; + + static std::string debug(const std::shared_ptr& root, size_t level = 0); + +protected: + // Whenever external components invoke the Parquet/ORC reader (e.g., init_reader, get_next_block, set_fill_columns), + // the parameters passed in are based on `table column names`. + // The table_info_node_ptr assists the Parquet/ORC reader in mapping these to the actual + // `file columns name` to be read and enables min/max filtering. + std::shared_ptr table_info_node_ptr = std::make_shared(); + +protected: + Status gen_table_info_node_by_field_id(const TFileScanRangeParams& params, + int64_t split_schema_id, + const TupleDescriptor* tuple_descriptor, + const FieldDescriptor& parquet_field_desc) { + if (!params.__isset.history_schema_info) [[unlikely]] { + RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name( + tuple_descriptor, parquet_field_desc, table_info_node_ptr)); + return Status::OK(); + } + return gen_table_info_node_by_field_id(params, split_schema_id); + } + + Status gen_table_info_node_by_field_id(const TFileScanRangeParams& params, + int64_t split_schema_id, + const TupleDescriptor* tuple_descriptor, + const orc::Type* orc_type_ptr) { + if (!params.__isset.history_schema_info) [[unlikely]] { + RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(tuple_descriptor, orc_type_ptr, + table_info_node_ptr)); + return Status::OK(); + } + return gen_table_info_node_by_field_id(params, split_schema_id); + } + +private: + // The filed id of both the table and the file come from 
the pass from fe. (params.history_schema_info) + Status gen_table_info_node_by_field_id(const TFileScanRangeParams& params, + int64_t split_schema_id) { + if (params.current_schema_id == split_schema_id) { + table_info_node_ptr = ConstNode::get_instance(); + return Status::OK(); + } + + int32_t table_schema_idx = -1; + int32_t file_schema_idx = -1; + //todo : Perhaps this process can be optimized by pre-generating a map + for (int32_t idx = 0; idx < params.history_schema_info.size(); idx++) { + if (params.history_schema_info[idx].schema_id == params.current_schema_id) { + table_schema_idx = idx; + } else if (params.history_schema_info[idx].schema_id == split_schema_id) { + file_schema_idx = idx; + } + } + + if (table_schema_idx == -1 || file_schema_idx == -1) [[unlikely]] { + return Status::InternalError( + "miss table/file schema info, table_schema_idx:{} file_schema_idx:{}", + table_schema_idx, file_schema_idx); + } + RETURN_IF_ERROR(BuildTableInfoUtil::by_table_field_id( + params.history_schema_info.at(table_schema_idx).root_field, + params.history_schema_info.at(file_schema_idx).root_field, table_info_node_ptr)); + return Status::OK(); + } + +public: + /* Schema change Util. Used to generate `std::shared_ptr node`. + Passed node to parquet/orc reader to find file columns based on table columns, + */ + struct BuildTableInfoUtil { + static const Status SCHEMA_ERROR; + + // todo : Maybe I can use templates to implement this functionality. + + // for hive parquet : The table column names passed from fe are lowercase, so use lowercase file column names to match table column names. 
+ static Status by_parquet_name(const TupleDescriptor* table_tuple_descriptor, + const FieldDescriptor& parquet_field_desc, + std::shared_ptr& node, + const std::set* is_file_slot = nullptr); + + // for hive parquet + static Status by_parquet_name(const DataTypePtr& table_data_type, + const FieldSchema& file_field, + std::shared_ptr& node); + + // for hive orc: The table column names passed from fe are lowercase, so use lowercase file column names to match table column names. + static Status by_orc_name(const TupleDescriptor* table_tuple_descriptor, + const orc::Type* orc_type_ptr, + std::shared_ptr& node, + const std::set* is_file_slot = nullptr); + // for hive orc + static Status by_orc_name(const DataTypePtr& table_data_type, const orc::Type* orc_root, + std::shared_ptr& node); + + // for paimon hudi: Use the field id in the `table schema` and `history table schema` to match columns. + static Status by_table_field_id(const schema::external::TField table_schema, + const schema::external::TField file_schema, + std::shared_ptr& node); + + // for paimon hudi + static Status by_table_field_id(const schema::external::TStructField& table_schema, + const schema::external::TStructField& file_schema, + std::shared_ptr& node); + + //for iceberg parquet: Use the field id in the `table schema` and the parquet file to match columns. + static Status by_parquet_field_id(const schema::external::TStructField& table_schema, + const FieldDescriptor& parquet_field_desc, + std::shared_ptr& node, + bool& exist_field_id); + + // for iceberg parquet + static Status by_parquet_field_id(const schema::external::TField& table_schema, + const FieldSchema& parquet_field, + std::shared_ptr& node, + bool& exist_field_id); + + // for iceberg orc : Use the field id in the `table schema` and the orc file to match columns. 
+ static Status by_orc_field_id(const schema::external::TStructField& table_schema, + const orc::Type* orc_root, + const std::string& field_id_attribute_key, + std::shared_ptr& node, + bool& exist_field_id); + + // for iceberg orc + static Status by_orc_field_id(const schema::external::TField& table_schema, + const orc::Type* orc_root, + const std::string& field_id_attribute_key, + std::shared_ptr& node, + bool& exist_field_id); + }; +}; + +struct ColumnIdResult { + std::set column_ids; + std::set filter_column_ids; + + ColumnIdResult() = default; // Add default constructor + + ColumnIdResult(std::set column_ids_, std::set filter_column_ids_) + : column_ids(std::move(column_ids_)), + filter_column_ids(std::move(filter_column_ids_)) {} +}; + +#include "common/compile_check_end.h" +} // namespace doris diff --git a/be/src/format/table/transactional_hive_common.h b/be/src/format/table/transactional_hive_common.h index 4ec08c3254e3bb..f2cba0c660764c 100644 --- a/be/src/format/table/transactional_hive_common.h +++ b/be/src/format/table/transactional_hive_common.h @@ -17,11 +17,14 @@ #pragma once +#include +#include #include #include #include #include "core/data_type/define_primitive_type.h" +#include "exec/common/hash_table/phmap_fwd_decl.h" namespace doris { #include "common/compile_check_begin.h" @@ -55,5 +58,48 @@ struct TransactionalHive { static const std::unordered_map DELETE_COL_NAME_TO_BLOCK_IDX; }; + +// ACID row identifier for transactional Hive tables, used for delete row matching. +// Placed here (not in TransactionalHiveReader) to avoid circular dependency with OrcReader. 
+struct AcidRowID { + int64_t original_transaction; + int64_t bucket; + int64_t row_id; + + struct Hash { + size_t operator()(const AcidRowID& transactional_row_id) const { + size_t hash_value = 0; + hash_value ^= std::hash {}(transactional_row_id.original_transaction) + + 0x9e3779b9 + (hash_value << 6) + (hash_value >> 2); + hash_value ^= std::hash {}(transactional_row_id.bucket) + 0x9e3779b9 + + (hash_value << 6) + (hash_value >> 2); + hash_value ^= std::hash {}(transactional_row_id.row_id) + 0x9e3779b9 + + (hash_value << 6) + (hash_value >> 2); + return hash_value; + } + }; + + struct Eq { + bool operator()(const AcidRowID& lhs, const AcidRowID& rhs) const { + return lhs.original_transaction == rhs.original_transaction && + lhs.bucket == rhs.bucket && lhs.row_id == rhs.row_id; + } + }; +}; + +using AcidRowIDSet = flat_hash_set; + +inline bool operator<(const AcidRowID& lhs, const AcidRowID& rhs) { + if (lhs.original_transaction != rhs.original_transaction) { + return lhs.original_transaction < rhs.original_transaction; + } else if (lhs.bucket != rhs.bucket) { + return lhs.bucket < rhs.bucket; + } else if (lhs.row_id != rhs.row_id) { + return lhs.row_id < rhs.row_id; + } else { + return false; + } +} + #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/format/table/transactional_hive_reader.cpp b/be/src/format/table/transactional_hive_reader.cpp index de6227977734df..a9cca89baca6b6 100644 --- a/be/src/format/table/transactional_hive_reader.cpp +++ b/be/src/format/table/transactional_hive_reader.cpp @@ -21,8 +21,8 @@ #include "core/data_type/data_type_factory.hpp" #include "format/orc/vorc_reader.h" -#include "format/table/table_format_reader.h" -#include "format/table/transactional_hive_common.h" +#include "format/table/table_schema_change_helper.h" +#include "transactional_hive_common.h" namespace doris { #include "common/compile_check_begin.h" @@ -35,49 +35,57 @@ class VExprContext; namespace doris { 
-TransactionalHiveReader::TransactionalHiveReader(std::unique_ptr file_format_reader, - RuntimeProfile* profile, RuntimeState* state, +TransactionalHiveReader::TransactionalHiveReader(RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx, + const TFileRangeDesc& range, size_t batch_size, + const std::string& ctz, io::IOContext* io_ctx, FileMetaCache* meta_cache) - : TableFormatReader(std::move(file_format_reader), state, profile, params, range, io_ctx, - meta_cache) { + : OrcReader(profile, state, params, range, batch_size, ctz, io_ctx, meta_cache, false) { static const char* transactional_hive_profile = "TransactionalHiveProfile"; - ADD_TIMER(_profile, transactional_hive_profile); - _transactional_orc_profile.num_delete_files = - ADD_CHILD_COUNTER(_profile, "NumDeleteFiles", TUnit::UNIT, transactional_hive_profile); - _transactional_orc_profile.num_delete_rows = - ADD_CHILD_COUNTER(_profile, "NumDeleteRows", TUnit::UNIT, transactional_hive_profile); + ADD_TIMER(get_profile(), transactional_hive_profile); + _transactional_orc_profile.num_delete_files = ADD_CHILD_COUNTER( + get_profile(), "NumDeleteFiles", TUnit::UNIT, transactional_hive_profile); + _transactional_orc_profile.num_delete_rows = ADD_CHILD_COUNTER( + get_profile(), "NumDeleteRows", TUnit::UNIT, transactional_hive_profile); _transactional_orc_profile.delete_files_read_time = - ADD_CHILD_TIMER(_profile, "DeleteFileReadTime", transactional_hive_profile); + ADD_CHILD_TIMER(get_profile(), "DeleteFileReadTime", transactional_hive_profile); } -Status TransactionalHiveReader::init_reader( - const std::vector& column_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts) { - _col_name_to_block_idx = 
col_name_to_block_idx; - auto* orc_reader = static_cast(_file_format_reader.get()); - _col_names.insert(_col_names.end(), column_names.begin(), column_names.end()); +// ============================================================================ +// on_before_init_reader: ACID schema mapping +// ============================================================================ +Status TransactionalHiveReader::on_before_init_reader(ReaderInitContext* ctx) { + _column_descs = ctx->column_descs; + _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; + RETURN_IF_ERROR( + _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values)); + for (auto& desc : *ctx->column_descs) { + if (desc.category == ColumnCategory::REGULAR || + desc.category == ColumnCategory::GENERATED) { + _col_names.push_back(desc.name); + } + } + + _is_acid = true; + // Add ACID column names (originalTransaction, bucket, rowId, etc.) _col_names.insert(_col_names.end(), TransactionalHive::READ_ROW_COLUMN_NAMES_LOWER_CASE.begin(), TransactionalHive::READ_ROW_COLUMN_NAMES_LOWER_CASE.end()); + ctx->column_names = _col_names; - // https://issues.apache.org/jira/browse/HIVE-15190 + // Get ORC file type const orc::Type* orc_type_ptr = nullptr; - RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr)); + RETURN_IF_ERROR(get_file_type(&orc_type_ptr)); const auto& orc_type = *orc_type_ptr; + // Add ACID metadata columns to table_info_node for (auto idx = 0; idx < TransactionalHive::READ_ROW_COLUMN_NAMES_LOWER_CASE.size(); idx++) { table_info_node_ptr->add_children(TransactionalHive::READ_ROW_COLUMN_NAMES_LOWER_CASE[idx], TransactionalHive::READ_ROW_COLUMN_NAMES[idx], std::make_shared()); } + // https://issues.apache.org/jira/browse/HIVE-15190 auto row_orc_type = orc_type.getSubtype(TransactionalHive::ROW_OFFSET); - // struct> std::vector row_names; std::map row_names_map; for (uint64_t idx = 0; idx < row_orc_type->getSubtypeCount(); idx++) { @@ -86,8 +94,8 @@ Status 
TransactionalHiveReader::init_reader( row_names_map.emplace(file_column_name, idx); } - // use name for match. - for (const auto& slot : tuple_descriptor->slots()) { + // Match table columns to file columns by name + for (const auto& slot : ctx->tuple_descriptor->slots()) { const auto& slot_name = slot->col_name(); if (std::count(TransactionalHive::READ_ROW_COLUMN_NAMES_LOWER_CASE.begin(), @@ -103,58 +111,36 @@ Status TransactionalHiveReader::init_reader( "{}.{}", TransactionalHive::ACID_COLUMN_NAMES[TransactionalHive::ROW_OFFSET], slot_name); table_info_node_ptr->add_children(slot_name, file_column_name, child_node); - } else { table_info_node_ptr->add_not_exist_children(slot_name); } } - - Status status = orc_reader->init_reader( - &_col_names, col_name_to_block_idx, conjuncts, true, tuple_descriptor, row_descriptor, - not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts, table_info_node_ptr); - return status; -} - -Status TransactionalHiveReader::get_next_block_inner(Block* block, size_t* read_rows, bool* eof) { - for (const auto& i : TransactionalHive::READ_PARAMS) { - DataTypePtr data_type = get_data_type_with_default_argument( - DataTypeFactory::instance().create_data_type(i.type, false)); - MutableColumnPtr data_column = data_type->create_column(); - (*_col_name_to_block_idx)[i.column_lower_case] = static_cast(block->columns()); - block->insert( - ColumnWithTypeAndName(std::move(data_column), data_type, i.column_lower_case)); - } - auto res = _file_format_reader->get_next_block(block, read_rows, eof); - Block::erase_useless_column(block, block->columns() - TransactionalHive::READ_PARAMS.size()); - for (const auto& i : TransactionalHive::READ_PARAMS) { - _col_name_to_block_idx->erase(i.column_lower_case); - } - return res; + ctx->table_info_node = table_info_node_ptr; + return Status::OK(); } -Status TransactionalHiveReader::init_row_filters() { - std::string data_file_path = _range.path; - // the path in _range is remove the namenode prefix, +// 
============================================================================ +// on_after_init_reader: read delete delta files +// ============================================================================ +Status TransactionalHiveReader::on_after_init_reader(ReaderInitContext* /*ctx*/) { + std::string data_file_path = get_scan_range().path; + // the path in _range has the namenode prefix removed, // and the file_path in delete file is full path, so we should add it back. - if (_params.__isset.hdfs_params && _params.hdfs_params.__isset.fs_name) { - std::string fs_name = _params.hdfs_params.fs_name; + if (get_scan_params().__isset.hdfs_params && get_scan_params().hdfs_params.__isset.fs_name) { + std::string fs_name = get_scan_params().hdfs_params.fs_name; if (!starts_with(data_file_path, fs_name)) { data_file_path = fs_name + data_file_path; } } - auto* orc_reader = (OrcReader*)(_file_format_reader.get()); std::vector delete_file_col_names; int64_t num_delete_rows = 0; int64_t num_delete_files = 0; std::filesystem::path file_path(data_file_path); - //See https://github.com/apache/hive/commit/ffee30e6267e85f00a22767262192abb9681cfb7#diff-5fe26c36b4e029dcd344fc5d484e7347R165 // bucket_xxx_attemptId => bucket_xxx - // bucket_xxx => bucket_xxx auto remove_bucket_attemptId = [](const std::string& str) { re2::RE2 pattern("^bucket_\\d+_\\d+$"); - if (re2::RE2::FullMatch(str, pattern)) { size_t pos = str.rfind('_'); if (pos != std::string::npos) { @@ -166,10 +152,9 @@ Status TransactionalHiveReader::init_row_filters() { SCOPED_TIMER(_transactional_orc_profile.delete_files_read_time); for (const auto& delete_delta : - _range.table_format_params.transactional_hive_params.delete_deltas) { + get_scan_range().table_format_params.transactional_hive_params.delete_deltas) { const std::string file_name = file_path.filename().string(); - //need opt. 
std::vector delete_delta_file_names; for (const auto& x : delete_delta.file_names) { delete_delta_file_names.emplace_back(remove_bucket_attemptId(x)); @@ -184,15 +169,15 @@ Status TransactionalHiveReader::init_row_filters() { delete_delta.file_names[iter - delete_delta_file_names.begin()]); TFileRangeDesc delete_range; - // must use __set() method to make sure __isset is true - delete_range.__set_fs_name(_range.fs_name); + delete_range.__set_fs_name(get_scan_range().fs_name); delete_range.path = delete_file; delete_range.start_offset = 0; delete_range.size = -1; delete_range.file_size = -1; - OrcReader delete_reader(_profile, _state, _params, delete_range, _MIN_BATCH_SIZE, - _state->timezone(), _io_ctx, _meta_cache, false); + OrcReader delete_reader(get_profile(), get_state(), get_scan_params(), delete_range, + 256 /*batch_size*/, get_state()->timezone(), get_io_ctx(), + _meta_cache, false); auto acid_info_node = std::make_shared(); for (auto idx = 0; idx < TransactionalHive::DELETE_ROW_COLUMN_NAMES_LOWER_CASE.size(); @@ -204,16 +189,14 @@ Status TransactionalHiveReader::init_row_filters() { std::make_shared()); } - RETURN_IF_ERROR(delete_reader.init_reader( - &TransactionalHive::DELETE_ROW_COLUMN_NAMES_LOWER_CASE, - const_cast*>( - &TransactionalHive::DELETE_COL_NAME_TO_BLOCK_IDX), - {}, false, nullptr, nullptr, nullptr, nullptr, acid_info_node)); - - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - RETURN_IF_ERROR(delete_reader.set_fill_columns(partition_columns, missing_columns)); + OrcInitContext delete_ctx; + delete_ctx.column_names.assign( + TransactionalHive::DELETE_ROW_COLUMN_NAMES_LOWER_CASE.begin(), + TransactionalHive::DELETE_ROW_COLUMN_NAMES_LOWER_CASE.end()); + delete_ctx.col_name_to_block_idx = const_cast*>( + &TransactionalHive::DELETE_COL_NAME_TO_BLOCK_IDX); + delete_ctx.table_info_node = acid_info_node; + RETURN_IF_ERROR(delete_reader.init_reader(&delete_ctx)); bool eof = false; while (!eof) { @@ -247,7 +230,7 @@ 
Status TransactionalHiveReader::init_row_filters() { Int64 bucket_id = bucket_id_column.get_int(i); Int64 row_id = row_id_column.get_int(i); AcidRowID delete_row_id = {original_transaction, bucket_id, row_id}; - _delete_rows.insert(delete_row_id); + _acid_delete_rows.insert(delete_row_id); ++num_delete_rows; } } @@ -255,12 +238,41 @@ Status TransactionalHiveReader::init_row_filters() { ++num_delete_files; } if (num_delete_rows > 0) { - orc_reader->set_push_down_agg_type(TPushAggOp::NONE); - orc_reader->set_delete_rows(&_delete_rows); + set_push_down_agg_type(TPushAggOp::NONE); + lock_push_down_agg_type(); + set_delete_rows(&_acid_delete_rows); COUNTER_UPDATE(_transactional_orc_profile.num_delete_files, num_delete_files); COUNTER_UPDATE(_transactional_orc_profile.num_delete_rows, num_delete_rows); } return Status::OK(); } + +// ============================================================================ +// on_before_read_block: expand ACID columns into block +// ============================================================================ +Status TransactionalHiveReader::on_before_read_block(Block* block) { + for (const auto& i : TransactionalHive::READ_PARAMS) { + DataTypePtr data_type = get_data_type_with_default_argument( + DataTypeFactory::instance().create_data_type(i.type, false)); + MutableColumnPtr data_column = data_type->create_column(); + (*col_name_to_block_idx_ref())[i.column_lower_case] = + static_cast(block->columns()); + block->insert( + ColumnWithTypeAndName(std::move(data_column), data_type, i.column_lower_case)); + } + return Status::OK(); +} + +// ============================================================================ +// on_after_read_block: shrink ACID columns from block +// ============================================================================ +Status TransactionalHiveReader::on_after_read_block(Block* block, size_t* /*read_rows*/) { + Block::erase_useless_column(block, block->columns() - TransactionalHive::READ_PARAMS.size()); + 
for (const auto& i : TransactionalHive::READ_PARAMS) { + col_name_to_block_idx_ref()->erase(i.column_lower_case); + } + return Status::OK(); +} + #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/format/table/transactional_hive_reader.h b/be/src/format/table/transactional_hive_reader.h index 9c4603d4ce819c..02fd294621402e 100644 --- a/be/src/format/table/transactional_hive_reader.h +++ b/be/src/format/table/transactional_hive_reader.h @@ -21,19 +21,17 @@ #include #include #include -#include #include #include "common/factory_creator.h" #include "common/status.h" -#include "exec/common/hash_table/phmap_fwd_decl.h" -#include "format/table/table_format_reader.h" -#include "storage/olap_scan_common.h" +#include "format/orc/vorc_reader.h" +#include "format/table/table_schema_change_helper.h" +#include "format/table/transactional_hive_common.h" namespace doris { #include "common/compile_check_begin.h" class RuntimeState; -class SlotDescriptor; class TFileRangeDesc; class TFileScanRangeParams; @@ -42,63 +40,34 @@ struct IOContext; } // namespace io class Block; -class GenericReader; class ShardedKVCache; class VExprContext; -class TransactionalHiveReader : public TableFormatReader, public TableSchemaChangeHelper { +// TransactionalHiveReader: directly inherits OrcReader (no composition wrapping). +// ACID column expansion/shrinking done via on_before_read_block/on_after_read_block hooks. +// Delete delta reading done via on_after_init_reader hook. 
+class TransactionalHiveReader final : public OrcReader, public TableSchemaChangeHelper { ENABLE_FACTORY_CREATOR(TransactionalHiveReader); public: - struct AcidRowID { - int64_t original_transaction; - int64_t bucket; - int64_t row_id; - - struct Hash { - size_t operator()(const AcidRowID& transactional_row_id) const { - size_t hash_value = 0; - hash_value ^= std::hash {}(transactional_row_id.original_transaction) + - 0x9e3779b9 + (hash_value << 6) + (hash_value >> 2); - hash_value ^= std::hash {}(transactional_row_id.bucket) + 0x9e3779b9 + - (hash_value << 6) + (hash_value >> 2); - hash_value ^= std::hash {}(transactional_row_id.row_id) + 0x9e3779b9 + - (hash_value << 6) + (hash_value >> 2); - return hash_value; - } - }; - - struct Eq { - bool operator()(const AcidRowID& lhs, const AcidRowID& rhs) const { - return lhs.original_transaction == rhs.original_transaction && - lhs.bucket == rhs.bucket && lhs.row_id == rhs.row_id; - } - }; - }; - - using AcidRowIDSet = flat_hash_set; - - TransactionalHiveReader(std::unique_ptr file_format_reader, - RuntimeProfile* profile, RuntimeState* state, + TransactionalHiveReader(RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, - io::IOContext* io_ctx, FileMetaCache* meta_cache); - ~TransactionalHiveReader() override = default; + size_t batch_size, const std::string& ctz, io::IOContext* io_ctx, + FileMetaCache* meta_cache = nullptr); + ~TransactionalHiveReader() final = default; - Status init_row_filters() final; +protected: + // Hook: ACID schema mapping (add transactional columns, map row.* fields) + Status on_before_init_reader(ReaderInitContext* ctx) override; - Status get_next_block_inner(Block* block, size_t* read_rows, bool* eof) final; + // Hook: read delete delta files + Status on_after_init_reader(ReaderInitContext* /*ctx*/) override; - bool has_delete_operations() const override { - return !_delete_rows.empty() || TableFormatReader::has_delete_operations(); - 
} + // Hook: expand ACID columns into block before reading + Status on_before_read_block(Block* block) override; - Status init_reader( - const std::vector& column_names, - std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts); + // Hook: shrink ACID columns from block after reading + Status on_after_read_block(Block* block, size_t* read_rows) override; private: struct TransactionalHiveProfile { @@ -108,25 +77,9 @@ class TransactionalHiveReader : public TableFormatReader, public TableSchemaChan }; TransactionalHiveProfile _transactional_orc_profile; - AcidRowIDSet _delete_rows; - std::unique_ptr _delete_rows_filter_ptr; + AcidRowIDSet _acid_delete_rows; std::vector _col_names; - // Column name to block index map, passed from FileScanner - std::unordered_map* _col_name_to_block_idx = nullptr; }; -inline bool operator<(const TransactionalHiveReader::AcidRowID& lhs, - const TransactionalHiveReader::AcidRowID& rhs) { - if (lhs.original_transaction != rhs.original_transaction) { - return lhs.original_transaction < rhs.original_transaction; - } else if (lhs.bucket != rhs.bucket) { - return lhs.bucket < rhs.bucket; - } else if (lhs.row_id != rhs.row_id) { - return lhs.row_id < rhs.row_id; - } else { - return false; - } -} - #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/format/table/trino_connector_jni_reader.h b/be/src/format/table/trino_connector_jni_reader.h index d571c5cd5866ed..5a2482ee9e29da 100644 --- a/be/src/format/table/trino_connector_jni_reader.h +++ b/be/src/format/table/trino_connector_jni_reader.h @@ -49,6 +49,9 @@ class TrinoConnectorJniReader : public JniReader { Status init_reader(); +protected: + Status _do_init_reader(ReaderInitContext* /*ctx*/) override { return init_reader(); } + private: Status 
_set_spi_plugins_dir(); }; diff --git a/be/src/load/delta_writer/push_handler.cpp b/be/src/load/delta_writer/push_handler.cpp index 8996082adfcf38..cf32886f7b90ba 100644 --- a/be/src/load/delta_writer/push_handler.cpp +++ b/be/src/load/delta_writer/push_handler.cpp @@ -638,21 +638,24 @@ Status PushBrokerReader::_get_next_reader() { _runtime_profile, _file_params, range, _runtime_state->query_options().batch_size, &_runtime_state->timezone_obj(), _io_ctx.get(), _runtime_state.get()); - init_status = parquet_reader->init_reader( - _all_col_names, &_col_name_to_block_idx, _push_down_exprs, _slot_id_to_predicates, - _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, - &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts, - TableSchemaChangeHelper::ConstNode::get_instance(), false); + ParquetInitContext ctx; + ctx.column_names = _all_col_names; + ctx.col_name_to_block_idx = &_col_name_to_block_idx; + ctx.conjuncts = &_push_down_exprs; + ctx.slot_id_to_predicates = &_slot_id_to_predicates; + ctx.tuple_descriptor = _real_tuple_desc; + ctx.row_descriptor = _default_val_row_desc.get(); + ctx.colname_to_slot_id = _col_name_to_slot_id; + ctx.not_single_slot_filter_conjuncts = &_not_single_slot_filter_conjuncts; + ctx.slot_id_to_filter_conjuncts = &_slot_id_to_filter_conjuncts; + + init_status = parquet_reader->init_reader(&ctx); _cur_reader = std::move(parquet_reader); if (!init_status.ok()) { return Status::InternalError("failed to init reader for file {}, err: {}", range.path, init_status.to_string()); } - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - RETURN_IF_ERROR(_cur_reader->get_columns(&_name_to_col_type, &_missing_cols)); - RETURN_IF_ERROR(_cur_reader->set_fill_columns(partition_columns, missing_columns)); + RETURN_IF_ERROR(_cur_reader->get_columns(&_name_to_col_type)); break; } default: diff --git a/be/src/load/group_commit/wal/wal_reader.cpp b/be/src/load/group_commit/wal/wal_reader.cpp index 
610b27f9f8b545..1f2a45d262f8f0 100644 --- a/be/src/load/group_commit/wal/wal_reader.cpp +++ b/be/src/load/group_commit/wal/wal_reader.cpp @@ -40,7 +40,22 @@ Status WalReader::init_reader(const TupleDescriptor* tuple_descriptor) { return Status::OK(); } -Status WalReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { +// ---- Unified init_reader(ReaderInitContext*) overrides ---- + +Status WalReader::_open_file_reader(ReaderInitContext* /*ctx*/) { + RETURN_IF_ERROR(_state->exec_env()->wal_mgr()->get_wal_path(_wal_id, _wal_path)); + _wal_reader = std::make_shared(_wal_path); + RETURN_IF_ERROR(_wal_reader->init()); + return Status::OK(); +} + +Status WalReader::_do_init_reader(ReaderInitContext* base_ctx) { + auto* ctx = checked_context_cast(base_ctx); + _tuple_descriptor = ctx->output_tuple_descriptor; + return Status::OK(); +} + +Status WalReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) { //read src block PBlock pblock; auto st = _wal_reader->read_block(pblock); @@ -97,11 +112,11 @@ Status WalReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { block->swap(dst_block); *read_rows = block->rows(); VLOG_DEBUG << "read block rows:" << *read_rows; + return Status::OK(); } -Status WalReader::get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) { +Status WalReader::_get_columns_impl(std::unordered_map* name_to_type) { std::string col_ids; RETURN_IF_ERROR(_wal_reader->read_header(_version, col_ids)); std::vector column_id_vector = diff --git a/be/src/load/group_commit/wal/wal_reader.h b/be/src/load/group_commit/wal/wal_reader.h index 23579daff45343..db87397704d681 100644 --- a/be/src/load/group_commit/wal/wal_reader.h +++ b/be/src/load/group_commit/wal/wal_reader.h @@ -16,23 +16,29 @@ // under the License. 
#pragma once -#include "format/generic_reader.h" +#include "format/table/table_format_reader.h" #include "load/group_commit/wal/wal_file_reader.h" #include "runtime/descriptors.h" namespace doris { #include "common/compile_check_begin.h" struct ScannerCounter; -class WalReader : public GenericReader { + +/// WAL-specific initialization context. +/// Extends ReaderInitContext with output tuple descriptor (unique to WAL reader). +struct WalInitContext final : public ReaderInitContext { + const TupleDescriptor* output_tuple_descriptor = nullptr; +}; + +class WalReader : public TableFormatReader { ENABLE_FACTORY_CREATOR(WalReader); public: WalReader(RuntimeState* state); ~WalReader() override = default; Status init_reader(const TupleDescriptor* tuple_descriptor); - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; - Status get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) override; + Status _do_get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status _get_columns_impl(std::unordered_map* name_to_type) override; Status close() override { if (_wal_reader) { @@ -41,6 +47,11 @@ class WalReader : public GenericReader { return Status::OK(); } +protected: + // ---- Unified init_reader(ReaderInitContext*) overrides ---- + Status _open_file_reader(ReaderInitContext* ctx) override; + Status _do_init_reader(ReaderInitContext* ctx) override; + private: RuntimeState* _state = nullptr; int64_t _wal_id; diff --git a/be/test/format/condition_cache_test.cpp b/be/test/format/condition_cache_test.cpp index 50c5e8c55031df..ae89a67210b625 100644 --- a/be/test/format/condition_cache_test.cpp +++ b/be/test/format/condition_cache_test.cpp @@ -25,8 +25,9 @@ #include "common/status.h" #include "format/generic_reader.h" +#include "format/orc/vorc_reader.h" #include "format/parquet/vparquet_reader.h" -#include "format/table/iceberg_reader.h" +#include "format/table/transactional_hive_common.h" namespace 
doris::vectorized { @@ -365,42 +366,13 @@ TEST_F(CachePreAllocTest, ExtraElementDoesNotCauseIncorrectFiltering) { // ============================================================ // GenericReader whose has_delete_operations() result is configurable, -// used as the inner file-format reader for table-format readers. +// used to test condition cache skip logic for various delete scenarios. class MockFileFormatReader : public GenericReader { public: bool mock_has_deletes = false; - Status get_next_block(Block*, size_t*, bool*) override { return Status::OK(); } + Status _do_get_next_block(Block*, size_t*, bool*) override { return Status::OK(); } bool has_delete_operations() const override { return mock_has_deletes; } }; - -// Concrete IcebergTableReader (pure-virtual stubs filled in). -// Exposes the protected _equality_delete_impls for testing. -class TestableIcebergReader : public IcebergTableReader { -public: - using IcebergTableReader::IcebergTableReader; - void set_delete_rows() override {} - Status _process_equality_delete( - const std::vector& delete_files) override { - return Status::OK(); - } - void test_set_equality_delete(std::unique_ptr impl) { - _equality_delete_impls.push_back(std::move(impl)); - } -}; - -// Minimal EqualityDeleteBase (only needs to be non-null for the check). 
-class MockEqualityDelete : public EqualityDeleteBase { -public: - MockEqualityDelete() : EqualityDeleteBase(nullptr, {}) {} - Status _build_set() override { return Status::OK(); } - Status filter_data_block(Block* data_block, - const std::unordered_map* col_name_to_block_idx, - const std::unordered_map& id_to_block_column_name, - IColumn::Filter& filter) override { - return Status::OK(); - } -}; - // ============================================================ // These tests reproduce the logic from // FileScanner::_init_reader_condition_cache() (file_scanner.cpp) @@ -539,7 +511,7 @@ TEST_F(ConditionCacheDeleteOpsTest, OrcWithAcidDeletes_CacheSkipped) { TFileScanRangeParams params; TFileRangeDesc range; auto reader = OrcReader::create_unique(params, range, "", nullptr); - TransactionalHiveReader::AcidRowIDSet acid_deletes; + AcidRowIDSet acid_deletes; acid_deletes.insert({1, 0, 5}); reader->set_delete_rows(&acid_deletes); @@ -552,62 +524,37 @@ TEST_F(ConditionCacheDeleteOpsTest, OrcWithAcidDeletes_CacheSkipped) { EXPECT_EQ(cache, nullptr); } -// -- IcebergTableReader: with equality deletes -> cache skipped -- -TEST_F(ConditionCacheDeleteOpsTest, IcebergWithEqualityDeletes_CacheSkipped) { - TFileScanRangeParams params; - TFileRangeDesc range; - auto inner = std::make_unique(); - inner->mock_has_deletes = false; - RuntimeProfile profile("test"); - TestableIcebergReader reader(std::move(inner), &profile, nullptr, params, range, nullptr, - nullptr, nullptr); - reader.test_set_equality_delete(std::make_unique()); +// -- MockReader: with deletes (simulating Iceberg/Hive with inner deletes) -> cache skipped -- +// In the new architecture, Iceberg readers inherit ParquetReader/OrcReader directly (CRTP), +// so has_delete_operations() is resolved through the base reader. We use MockFileFormatReader +// to test the generic condition cache skip logic. 
+TEST_F(ConditionCacheDeleteOpsTest, ReaderWithDeletes_CacheSkipped) { + auto reader = std::make_unique(); + reader->mock_has_deletes = true; bool hit = false; std::shared_ptr> cache; std::shared_ptr ctx; - simulate_init_condition_cache(&reader, 42, "/data/iceberg.parquet", hit, cache, ctx); + simulate_init_condition_cache(reader.get(), 42, "/data/iceberg.parquet", hit, cache, ctx); EXPECT_EQ(ctx, nullptr); EXPECT_EQ(cache, nullptr); } -// -- IcebergTableReader: with position deletes in inner reader -> cache skipped -- -TEST_F(ConditionCacheDeleteOpsTest, IcebergWithPositionDeletes_CacheSkipped) { - TFileScanRangeParams params; - TFileRangeDesc range; - auto inner = std::make_unique(); - inner->mock_has_deletes = true; // inner reader has position deletes - RuntimeProfile profile("test"); - TestableIcebergReader reader(std::move(inner), &profile, nullptr, params, range, nullptr, - nullptr, nullptr); +// -- MockReader: no deletes -> cache populated -- +TEST_F(ConditionCacheDeleteOpsTest, ReaderWithoutDeletes_CachePopulated) { + auto reader = std::make_unique(); + reader->mock_has_deletes = false; bool hit = false; std::shared_ptr> cache; std::shared_ptr ctx; - simulate_init_condition_cache(&reader, 42, "/data/iceberg.parquet", hit, cache, ctx); + simulate_init_condition_cache(reader.get(), 42, "/data/iceberg.parquet", hit, cache, ctx); - EXPECT_EQ(ctx, nullptr); - EXPECT_EQ(cache, nullptr); -} - -// -- TransactionalHiveReader: inner reader has deletes -> cache skipped -- -TEST_F(ConditionCacheDeleteOpsTest, TransactionalHiveInnerDeletes_CacheSkipped) { - TFileScanRangeParams params; - TFileRangeDesc range; - auto inner = std::make_unique(); - inner->mock_has_deletes = true; - RuntimeProfile profile("test"); - auto reader = TransactionalHiveReader::create_unique(std::move(inner), &profile, nullptr, - params, range, nullptr, nullptr); - - bool hit = false; - std::shared_ptr> cache; - std::shared_ptr ctx; - simulate_init_condition_cache(reader.get(), 42, 
"/data/hive_acid.orc", hit, cache, ctx); - - EXPECT_EQ(ctx, nullptr); - EXPECT_EQ(cache, nullptr); + EXPECT_FALSE(hit); + EXPECT_NE(ctx, nullptr); + EXPECT_NE(cache, nullptr); + EXPECT_FALSE(ctx->is_hit); } // -- Pre-populated cache entry is NOT returned when deletes exist -- diff --git a/be/test/format/native/native_reader_writer_test.cpp b/be/test/format/native/native_reader_writer_test.cpp index 0b5bcf1bfa4bca..5d1d7dc207cef7 100644 --- a/be/test/format/native/native_reader_writer_test.cpp +++ b/be/test/format/native/native_reader_writer_test.cpp @@ -757,10 +757,9 @@ TEST_F(NativeReaderWriterTest, get_columns_and_parsed_schema) { NativeReader reader_impl(nullptr, scan_params, scan_range, nullptr, &state); std::unordered_map name_to_type; - std::unordered_set missing_cols; - st = reader_impl.get_columns(&name_to_type, &missing_cols); + st = reader_impl.get_columns(&name_to_type); ASSERT_TRUE(st.ok()) << st; - ASSERT_TRUE(missing_cols.empty()); + ASSERT_TRUE(reader_impl.missing_cols().empty()); // All columns from src_block should appear in name_to_type. for (size_t i = 0; i < src_block.columns(); ++i) { diff --git a/be/test/format/orc/orc_read_lines.cpp b/be/test/format/orc/orc_read_lines.cpp index 3e8803d5681458..7381be75719942 100644 --- a/be/test/format/orc/orc_read_lines.cpp +++ b/be/test/format/orc/orc_read_lines.cpp @@ -135,16 +135,20 @@ static void read_orc_line(int64_t line, std::string block_dump, tuple_desc->slots().size()); reader->set_row_id_column_iterator(iterator_pair); - auto status = reader->init_reader(&column_names, &col_name_to_block_idx, {}, false, tuple_desc, - &row_desc, nullptr, nullptr); + // Construct OrcInitContext for standalone reader (no column_descs). 
+ OrcInitContext orc_ctx; + orc_ctx.column_names = column_names; + orc_ctx.col_name_to_block_idx = &col_name_to_block_idx; + orc_ctx.tuple_descriptor = tuple_desc; + orc_ctx.row_descriptor = &row_desc; + orc_ctx.params = &params; + orc_ctx.range = &range; + auto status = reader->init_reader(&orc_ctx); EXPECT_TRUE(status.ok()); - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - auto st = reader->set_fill_columns(partition_columns, missing_columns); - EXPECT_TRUE(st.ok()) << st; + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. BlockUPtr block = Block::create_unique(); for (const auto& slot_desc : tuple_desc->slots()) { auto data_type = slot_desc->type(); @@ -159,7 +163,7 @@ static void read_orc_line(int64_t line, std::string block_dump, bool eof = false; size_t read_row = 0; - st = reader->get_next_block(block.get(), &read_row, &eof); + Status st = reader->get_next_block(block.get(), &read_row, &eof); EXPECT_TRUE(st.ok()) << st; auto row_id_string_column = static_cast( *block->get_by_position(block->get_position_by_name("row_id")).column.get()); diff --git a/be/test/format/orc/orc_reader_init_column_test.cpp b/be/test/format/orc/orc_reader_init_column_test.cpp index 4005edcf8fb7e6..e4d40c40935718 100644 --- a/be/test/format/orc/orc_reader_init_column_test.cpp +++ b/be/test/format/orc/orc_reader_init_column_test.cpp @@ -58,13 +58,13 @@ TEST_F(OrcReaderInitColumnTest, InitReadColumn) { std::vector tmp; tmp.emplace_back("col1"); - reader->_table_column_names = &tmp; + reader->_table_column_names = tmp; Status st = reader->_init_read_columns(); std::cout << "st =" << st << "\n"; - std::list ans; - ans.emplace_back("col1"); - ASSERT_EQ(ans, reader->_read_file_cols); - ASSERT_EQ(ans, reader->_read_table_cols); + // _init_read_columns builds _type_map; _read_file_cols is populated later + // in _do_init_reader's standalone path when _table_column_names is set.
+ ASSERT_TRUE(reader->_type_map.contains("col1")); + ASSERT_FALSE(reader->_type_map.contains("nonexistent")); } } diff --git a/be/test/format/orc/orc_reader_test.cpp b/be/test/format/orc/orc_reader_test.cpp index 6d44eeb1a36d72..3adbb000048c27 100644 --- a/be/test/format/orc/orc_reader_test.cpp +++ b/be/test/format/orc/orc_reader_test.cpp @@ -65,7 +65,7 @@ class OrcReaderTest : public testing::Test { "o_orderstatus") << std::make_tuple(DataTypeFactory::instance().create_data_type(TYPE_DOUBLE, false), "o_totalprice") - << std::make_tuple(DataTypeFactory::instance().create_data_type(TYPE_DATE, false), + << std::make_tuple(DataTypeFactory::instance().create_data_type(TYPE_DATEV2, false), "o_orderdate") << std::make_tuple(DataTypeFactory::instance().create_data_type(TYPE_STRING, false), "o_orderpriority") @@ -83,10 +83,16 @@ class OrcReaderTest : public testing::Test { range.path = "./be/test/exec/test_data/orc_scanner/orders.orc"; range.start_offset = 0; range.size = 1293; - auto reader = OrcReader::create_unique(params, range, "", nullptr, &cache, true); - auto status = reader->init_reader(&column_names, &col_name_to_block_idx, {}, false, - tuple_desc, &row_desc, nullptr, nullptr); - EXPECT_TRUE(status.ok()); + auto reader = OrcReader::create_unique(params, range, "UTC", nullptr, &cache, true); + OrcInitContext orc_ctx; + orc_ctx.column_names = column_names; + orc_ctx.col_name_to_block_idx = &col_name_to_block_idx; + orc_ctx.tuple_descriptor = tuple_desc; + orc_ctx.row_descriptor = &row_desc; + orc_ctx.params = &params; + orc_ctx.range = &range; + auto status = reader->init_reader(&orc_ctx); + EXPECT_TRUE(status.ok()) << "init_reader failed: " << status.to_string(); // deserialize expr auto exprx = apache::thrift::from_json_string(expr); @@ -155,7 +161,7 @@ TEST_F(OrcReaderTest, test_build_search_argument) { "<= 1200000), leaf-3 = (o_orderkey = 1100000), expr = (and (or leaf-0 (not leaf-1)) " "(or leaf-0 leaf-2) (or leaf-0 (not leaf-3)))", "leaf-0 = (o_orderkey in [1000000,
2000000, 3000000]), leaf-1 = (o_orderdate < " - "17121205), leaf-2 = (o_orderdate <= 17121205), expr = (and (or leaf-0 (not leaf-1)) " + "8766), leaf-2 = (o_orderdate <= 9130), expr = (and (or leaf-0 (not leaf-1)) " "(or leaf-0 leaf-2))", "leaf-0 = (o_orderkey < 2), leaf-1 = (o_orderpriority = 1-URGENT), expr = (or leaf-0 " "leaf-1)", diff --git a/be/test/format/parquet/parquet_expr_test.cpp b/be/test/format/parquet/parquet_expr_test.cpp index 73441901db7743..4db4bc03feebd6 100644 --- a/be/test/format/parquet/parquet_expr_test.cpp +++ b/be/test/format/parquet/parquet_expr_test.cpp @@ -279,10 +279,14 @@ class ParquetExprTest : public testing::Test { &ctz, nullptr, nullptr); p_reader->set_file_reader(local_file_reader); colname_to_slot_id.emplace("int64_col", 2); - phmap::flat_hash_map>> tmp; - static_cast(p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, - tuple_desc, nullptr, &colname_to_slot_id, nullptr, - nullptr)); + ParquetInitContext pq_ctx; + pq_ctx.column_names = column_names; + pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.tuple_descriptor = tuple_desc; + pq_ctx.colname_to_slot_id = &colname_to_slot_id; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + static_cast(p_reader->init_reader(&pq_ctx)); size_t meta_size; static_cast(parse_thrift_footer(p_reader->_file_reader, &doris_file_metadata, @@ -326,15 +330,16 @@ class ParquetExprTest : public testing::Test { auto local_reader = ParquetReader::create_unique( nullptr, scan_params, scan_range, scan_range.size, &local_ctz, nullptr, nullptr); local_reader->set_file_reader(local_file_reader); - phmap::flat_hash_map>> tmp; - static_cast(local_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, - tuple_desc, nullptr, nullptr, nullptr, - nullptr)); - - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - static_cast(local_reader->set_fill_columns(partition_columns, missing_columns)); + ParquetInitContext pq_ctx2; + 
pq_ctx2.column_names = column_names; + pq_ctx2.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx2.tuple_descriptor = tuple_desc; + pq_ctx2.params = &scan_params; + pq_ctx2.range = &scan_range; + static_cast(local_reader->init_reader(&pq_ctx2)); + + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. bool eof = false; std::string dump; diff --git a/be/test/format/parquet/parquet_read_lines.cpp b/be/test/format/parquet/parquet_read_lines.cpp index 443f5226775201..abcb62024e463a 100644 --- a/be/test/format/parquet/parquet_read_lines.cpp +++ b/be/test/format/parquet/parquet_read_lines.cpp @@ -151,13 +151,14 @@ static void read_parquet_lines(std::vector numeric_types, runtime_state.set_desc_tbl(desc_tbl); std::unordered_map colname_to_value_range; - phmap::flat_hash_map>> tmp; - static_cast(p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, - nullptr, nullptr, nullptr, nullptr)); - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - static_cast(p_reader->set_fill_columns(partition_columns, missing_columns)); + ParquetInitContext pq_ctx; + pq_ctx.column_names = column_names; + pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + static_cast(p_reader->init_reader(&pq_ctx)); + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. 
BlockUPtr block = Block::create_unique(); for (const auto& slot_desc : tuple_desc->slots()) { auto data_type = make_nullable(slot_desc->type()); diff --git a/be/test/format/parquet/parquet_reader_test.cpp b/be/test/format/parquet/parquet_reader_test.cpp index 1d6e4632842f89..e738d9c4de77ad 100644 --- a/be/test/format/parquet/parquet_reader_test.cpp +++ b/be/test/format/parquet/parquet_reader_test.cpp @@ -38,6 +38,7 @@ #include "core/data_type/data_type.h" #include "core/data_type/data_type_factory.hpp" #include "core/string_view.h" +#include "format/column_descriptor.h" #include "format/parquet/vparquet_reader.h" #include "gtest/gtest_pred_impl.h" #include "io/fs/file_meta_cache.h" @@ -51,6 +52,13 @@ namespace doris { class VExprContext; +static std::vector to_column_descs(const std::vector& names) { + std::vector descs; + for (const auto& name : names) { + descs.push_back({name, nullptr, ColumnCategory::REGULAR, nullptr}); + } + return descs; +} static VExprContextSPtrs create_predicates(DescriptorTbl* desc_tbl, RuntimeState* runtime_state); template static VExprContextSPtrs create_partition_predicates(DescriptorTbl* desc_tbl, @@ -109,27 +117,34 @@ class ParquetReaderTest : public testing::Test { nullptr, &runtime_state, &cache, enable_lazy); p_reader->set_file_reader(reader); runtime_state.set_desc_tbl(desc_tbl); - phmap::flat_hash_map>> tmp; auto conjuncts = create_predicates(desc_tbl, &runtime_state); std::unordered_map slot_id_to_expr_ctxs; slot_id_to_expr_ctxs[0].emplace_back(conjuncts[0]); slot_id_to_expr_ctxs[1].emplace_back(conjuncts[1]); + auto column_descs = to_column_descs(column_names); if constexpr (filter_all) { - st = p_reader->init_reader(column_names, &col_name_to_block_idx, conjuncts, tmp, - tuple_desc, nullptr, nullptr, nullptr, - &slot_id_to_expr_ctxs); + ParquetInitContext pq_ctx; + pq_ctx.column_descs = &column_descs; + pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.conjuncts = &conjuncts; + pq_ctx.tuple_descriptor = 
tuple_desc; + pq_ctx.slot_id_to_filter_conjuncts = &slot_id_to_expr_ctxs; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + st = p_reader->init_reader(&pq_ctx); } else { - st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, - nullptr, nullptr, nullptr, nullptr); + ParquetInitContext pq_ctx; + pq_ctx.column_descs = &column_descs; + pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + st = p_reader->init_reader(&pq_ctx); } EXPECT_TRUE(st.ok()) << st; - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - st = p_reader->set_fill_columns(partition_columns, missing_columns); - EXPECT_TRUE(st.ok()) << st; + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. bool eof = false; size_t total_rows = 0; bool all_null = true; @@ -195,6 +210,8 @@ class ParquetReaderTest : public testing::Test { TFileRangeDesc scan_range; scan_range.start_offset = 0; scan_range.size = 1000; + scan_range.__set_columns_from_path_keys({"part_col"}); + scan_range.__set_columns_from_path({"1"}); auto q_options = TQueryOptions(); q_options.__set_enable_adjust_conjunct_order_by_cost(true); RuntimeState runtime_state = RuntimeState(q_options, TQueryGlobals()); @@ -204,22 +221,25 @@ class ParquetReaderTest : public testing::Test { p_reader->set_file_reader(reader); runtime_state.set_desc_tbl(desc_tbl); - phmap::flat_hash_map>> tmp; auto conjuncts = create_partition_predicates(desc_tbl, &runtime_state); std::unordered_map slot_id_to_expr_ctxs; slot_id_to_expr_ctxs[1].emplace_back(conjuncts[0]); slot_id_to_expr_ctxs[2].emplace_back(conjuncts[1]); - st = p_reader->init_reader(column_names, &col_name_to_block_idx, conjuncts, tmp, tuple_desc, - nullptr, nullptr, nullptr, &slot_id_to_expr_ctxs); + auto column_descs = to_column_descs(column_names); + ParquetInitContext pq_ctx; + pq_ctx.column_descs = &column_descs; + 
pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.conjuncts = &conjuncts; + pq_ctx.tuple_descriptor = tuple_desc; + pq_ctx.slot_id_to_filter_conjuncts = &slot_id_to_expr_ctxs; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + st = p_reader->init_reader(&pq_ctx); EXPECT_TRUE(st.ok()) << st; - std::unordered_map> - partition_columns; - partition_columns.emplace("part_col", std::make_tuple("1", tuple_desc->slots()[2])); - std::unordered_map missing_columns; - st = p_reader->set_fill_columns(partition_columns, missing_columns); - EXPECT_TRUE(st.ok()) << st; + // Partition/missing column logic is now handled by on_before_init_reader + // via _extract_partition_values from scan_range. bool eof = false; size_t total_rows = 0; @@ -348,13 +368,14 @@ TEST_F(ParquetReaderTest, normal) { RuntimeState runtime_state((TQueryOptions()), TQueryGlobals()); runtime_state.set_desc_tbl(desc_tbl); - phmap::flat_hash_map>> tmp; - static_cast(p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, - nullptr, nullptr, nullptr, nullptr)); - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - static_cast(p_reader->set_fill_columns(partition_columns, missing_columns)); + ParquetInitContext pq_ctx; + pq_ctx.column_names = column_names; + pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + static_cast(p_reader->init_reader(&pq_ctx)); + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. 
BlockUPtr block = Block::create_unique(); for (const auto& slot_desc : tuple_desc->slots()) { auto data_type = make_nullable(slot_desc->type()); @@ -413,15 +434,15 @@ TEST_F(ParquetReaderTest, uuid_varbinary) { RuntimeState runtime_state = RuntimeState(TQueryOptions(), TQueryGlobals()); runtime_state.set_desc_tbl(desc_tbl); - phmap::flat_hash_map>> tmp; - st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, nullptr, - nullptr, nullptr, nullptr); - EXPECT_TRUE(st.ok()) << st; - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - st = p_reader->set_fill_columns(partition_columns, missing_columns); + ParquetInitContext pq_ctx; + pq_ctx.column_names = column_names; + pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + st = p_reader->init_reader(&pq_ctx); EXPECT_TRUE(st.ok()) << st; + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. 
BlockUPtr block = Block::create_unique(); for (const auto& slot_desc : tuple_desc->slots()) { auto data_type = make_nullable(slot_desc->type()); @@ -487,15 +508,15 @@ TEST_F(ParquetReaderTest, varbinary_varbinary) { RuntimeState runtime_state = RuntimeState(TQueryOptions(), TQueryGlobals()); runtime_state.set_desc_tbl(desc_tbl); - phmap::flat_hash_map>> tmp; - st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, nullptr, - nullptr, nullptr, nullptr); - EXPECT_TRUE(st.ok()) << st; - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - st = p_reader->set_fill_columns(partition_columns, missing_columns); + ParquetInitContext pq_ctx; + pq_ctx.column_names = column_names; + pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + st = p_reader->init_reader(&pq_ctx); EXPECT_TRUE(st.ok()) << st; + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. 
BlockUPtr block = Block::create_unique(); for (const auto& slot_desc : tuple_desc->slots()) { auto data_type = make_nullable(slot_desc->type()); @@ -563,15 +584,15 @@ TEST_F(ParquetReaderTest, varbinary_string) { RuntimeState runtime_state = RuntimeState(TQueryOptions(), TQueryGlobals()); runtime_state.set_desc_tbl(desc_tbl); - phmap::flat_hash_map>> tmp; - st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, nullptr, - nullptr, nullptr, nullptr); - EXPECT_TRUE(st.ok()) << st; - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - st = p_reader->set_fill_columns(partition_columns, missing_columns); + ParquetInitContext pq_ctx; + pq_ctx.column_names = column_names; + pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + st = p_reader->init_reader(&pq_ctx); EXPECT_TRUE(st.ok()) << st; + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. 
BlockUPtr block = Block::create_unique(); for (const auto& slot_desc : tuple_desc->slots()) { auto data_type = make_nullable(slot_desc->type()); @@ -639,15 +660,15 @@ TEST_F(ParquetReaderTest, varbinary_string2) { RuntimeState runtime_state = RuntimeState(TQueryOptions(), TQueryGlobals()); runtime_state.set_desc_tbl(desc_tbl); - phmap::flat_hash_map>> tmp; - st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, nullptr, - nullptr, nullptr, nullptr); - EXPECT_TRUE(st.ok()) << st; - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - st = p_reader->set_fill_columns(partition_columns, missing_columns); + ParquetInitContext pq_ctx; + pq_ctx.column_names = column_names; + pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + st = p_reader->init_reader(&pq_ctx); EXPECT_TRUE(st.ok()) << st; + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. 
BlockUPtr block = Block::create_unique(); for (const auto& slot_desc : tuple_desc->slots()) { auto data_type = make_nullable(slot_desc->type()); @@ -961,6 +982,8 @@ TEST_F(ParquetReaderTest, only_partition_column) { TFileRangeDesc scan_range; scan_range.start_offset = 0; scan_range.size = 1000; + scan_range.__set_columns_from_path_keys({"part_col"}); + scan_range.__set_columns_from_path({"1"}); auto q_options = TQueryOptions(); q_options.__set_enable_adjust_conjunct_order_by_cost(true); RuntimeState runtime_state = RuntimeState(q_options, TQueryGlobals()); @@ -969,21 +992,24 @@ TEST_F(ParquetReaderTest, only_partition_column) { p_reader->set_file_reader(reader); runtime_state.set_desc_tbl(desc_tbl); - phmap::flat_hash_map>> tmp; auto conjuncts = create_only_partition_predicates(desc_tbl, &runtime_state); std::unordered_map slot_id_to_expr_ctxs; slot_id_to_expr_ctxs[0].emplace_back(conjuncts[0]); - st = p_reader->init_reader(column_names, &col_name_to_block_idx, conjuncts, tmp, tuple_desc, - nullptr, nullptr, nullptr, &slot_id_to_expr_ctxs); + auto column_descs = to_column_descs(column_names); + ParquetInitContext pq_ctx; + pq_ctx.column_descs = &column_descs; + pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.conjuncts = &conjuncts; + pq_ctx.tuple_descriptor = tuple_desc; + pq_ctx.slot_id_to_filter_conjuncts = &slot_id_to_expr_ctxs; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + st = p_reader->init_reader(&pq_ctx); EXPECT_TRUE(st.ok()) << st; - std::unordered_map> - partition_columns; - partition_columns.emplace("part_col", std::make_tuple("1", tuple_desc->slots()[0])); - std::unordered_map missing_columns; - st = p_reader->set_fill_columns(partition_columns, missing_columns); - EXPECT_TRUE(st.ok()) << st; + // Partition/missing column logic is now handled by on_before_init_reader + // via _extract_partition_values from scan_range. 
bool eof = false; size_t total_rows = 0; diff --git a/be/test/format/table/hive/hive_reader_create_column_ids_test.cpp b/be/test/format/table/hive/hive_reader_create_column_ids_test.cpp index 845594d608faee..7a884359027d73 100644 --- a/be/test/format/table/hive/hive_reader_create_column_ids_test.cpp +++ b/be/test/format/table/hive/hive_reader_create_column_ids_test.cpp @@ -660,33 +660,28 @@ class HiveReaderCreateColumnIdsTest : public ::testing::Test { cctz::time_zone ctz; TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz); - auto generic_reader = - ParquetReader::create_unique(&profile, scan_params, scan_range, 1024, &ctz, nullptr, - &runtime_state, cache.get()); - if (!generic_reader) { + + auto hive_reader = + std::make_unique(&profile, scan_params, scan_range, 1024, &ctz, + nullptr, &runtime_state, nullptr, cache.get()); + if (!hive_reader) { return {nullptr, nullptr}; } - auto parquet_reader = static_cast(generic_reader.get()); - parquet_reader->set_file_reader(file_reader); + hive_reader->set_file_reader(file_reader); const FieldDescriptor* field_desc = nullptr; - st = parquet_reader->get_file_metadata_schema(&field_desc); + st = hive_reader->get_file_metadata_schema(&field_desc); if (!st.ok() || !field_desc) { return {nullptr, nullptr}; } - auto hive_reader = std::make_unique( - std::move(generic_reader), &profile, &runtime_state, scan_params, scan_range, - nullptr, nullptr, cache.get()); - return {std::move(hive_reader), field_desc}; } // Helper function: Create and setup OrcReader std::tuple, const orc::Type*> create_orc_reader( const std::string& test_file) { - // Open the Hive Orc test file auto local_fs = io::global_local_filesystem(); io::FileReaderSPtr file_reader; auto st = local_fs->open_file(test_file, &file_reader); @@ -694,45 +689,31 @@ class HiveReaderCreateColumnIdsTest : public ::testing::Test { return {nullptr, nullptr}; } - // Setup runtime state RuntimeState runtime_state = RuntimeState(TQueryOptions(), 
TQueryGlobals()); - - // Setup scan parameters TFileScanRangeParams scan_params; scan_params.format_type = TFileFormatType::FORMAT_ORC; - TFileRangeDesc scan_range; scan_range.start_offset = 0; - scan_range.size = file_reader->size(); // Read entire file + scan_range.size = file_reader->size(); scan_range.path = test_file; - - // Create mock profile RuntimeProfile profile("test_profile"); - // Create OrcReader as the underlying file format reader cctz::time_zone ctz; TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz); - auto generic_reader = - OrcReader::create_unique(&profile, &runtime_state, scan_params, scan_range, 1024, - "CST", nullptr, cache.get()); - if (!generic_reader) { + auto hive_reader = + std::make_unique(&profile, &runtime_state, scan_params, scan_range, + 1024, "CST", nullptr, nullptr, cache.get()); + if (!hive_reader) { return {nullptr, nullptr}; } - auto orc_reader = static_cast(generic_reader.get()); - // Get FieldDescriptor from Orc file const orc::Type* orc_type_ptr = nullptr; - st = orc_reader->get_file_type(&orc_type_ptr); + st = hive_reader->get_file_type(&orc_type_ptr); if (!st.ok() || !orc_type_ptr) { return {nullptr, nullptr}; } - // Create HiveOrcReader - auto hive_reader = std::make_unique(std::move(generic_reader), &profile, - &runtime_state, scan_params, scan_range, - nullptr, nullptr, cache.get()); - return {std::move(hive_reader), orc_type_ptr}; } diff --git a/be/test/format/table/hive/hive_reader_test.cpp b/be/test/format/table/hive/hive_reader_test.cpp index 6b3711ff9794d9..7746cbffa60bac 100644 --- a/be/test/format/table/hive/hive_reader_test.cpp +++ b/be/test/format/table/hive/hive_reader_test.cpp @@ -44,6 +44,7 @@ #include "core/data_type/data_type_number.h" #include "core/data_type/data_type_string.h" #include "core/data_type/data_type_struct.h" +#include "format/orc/vorc_reader.h" #include "format/parquet/vparquet_reader.h" #include "io/fs/file_meta_cache.h" #include "io/fs/file_reader_writer_fwd.h" @@ 
-525,22 +526,15 @@ TEST_F(HiveReaderTest, read_hive_parquet_file) { // Create mock profile RuntimeProfile profile("test_profile"); - // Create ParquetReader as the underlying file format reader + // Create HiveParquetReader (directly inherits ParquetReader) cctz::time_zone ctz; TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz); + auto hive_reader = + std::make_unique(&profile, scan_params, scan_range, 1024, &ctz, + nullptr, &runtime_state, nullptr, cache.get()); - auto generic_reader = ParquetReader::create_unique(&profile, scan_params, scan_range, 1024, - &ctz, nullptr, &runtime_state, cache.get()); - ASSERT_NE(generic_reader, nullptr); - - // Set file reader for the generic reader - auto parquet_reader = static_cast(generic_reader.get()); - parquet_reader->set_file_reader(file_reader); - - // Create HiveParquetReader - auto hive_reader = std::make_unique(std::move(generic_reader), &profile, - &runtime_state, scan_params, scan_range, - nullptr, nullptr, cache.get()); + // Set file reader for the hive reader (inherited from ParquetReader) + hive_reader->set_file_reader(file_reader); // Create complex struct types using helper function DataTypePtr coordinates_struct_type, address_struct_type, phone_struct_type; @@ -564,24 +558,21 @@ TEST_F(HiveReaderTest, read_hive_parquet_file) { create_tuple_descriptor(&desc_tbl, obj_pool, t_desc_table, t_table_desc, table_column_names, table_column_positions, table_column_types); - VExprContextSPtrs conjuncts; // Empty conjuncts for this test - std::vector table_col_names = {"name", "profile"}; std::unordered_map col_name_to_block_idx = {{"name", 0}, {"profile", 1}}; - const RowDescriptor* row_descriptor = nullptr; - const std::unordered_map* colname_to_slot_id = nullptr; - const VExprContextSPtrs* not_single_slot_filter_conjuncts = nullptr; - const std::unordered_map* slot_id_to_filter_conjuncts = nullptr; - - phmap::flat_hash_map>> tmp; - st = hive_reader->init_reader(table_col_names, 
&col_name_to_block_idx, conjuncts, tmp, - tuple_descriptor, row_descriptor, colname_to_slot_id, - not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts); + + // Use the template method init_reader (inherited from ParquetReader) + // on_before_init_columns hook in HiveParquetReader will do schema matching + ParquetInitContext pq_ctx; + pq_ctx.column_names = table_column_names; + pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.tuple_descriptor = tuple_descriptor; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + st = hive_reader->init_reader(&pq_ctx); ASSERT_TRUE(st.ok()) << st; - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - ASSERT_TRUE(hive_reader->set_fill_columns(partition_columns, missing_columns).ok()); + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. // Create block for reading nested structure (not flattened) Block block; @@ -667,18 +658,10 @@ TEST_F(HiveReaderTest, read_hive_rrc_file) { // Create mock profile RuntimeProfile profile("test_profile"); - // Create OrcReader as the underlying file format reader - cctz::time_zone ctz; - TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz); - - auto generic_reader = OrcReader::create_unique(&profile, &runtime_state, scan_params, - scan_range, 1024, "CST", nullptr, cache.get()); - ASSERT_NE(generic_reader, nullptr); - - // Create HiveOrcReader + // Create HiveOrcReader (directly inherits OrcReader) auto hive_reader = - std::make_unique(std::move(generic_reader), &profile, &runtime_state, - scan_params, scan_range, nullptr, nullptr, cache.get()); + std::make_unique(&profile, &runtime_state, scan_params, scan_range, 1024, + "CST", nullptr, nullptr, cache.get()); // Create complex struct types using helper function DataTypePtr coordinates_struct_type, address_struct_type, phone_struct_type; @@ -702,22 +685,19 @@ TEST_F(HiveReaderTest, read_hive_rrc_file) { 
create_tuple_descriptor(&desc_tbl, obj_pool, t_desc_table, t_table_desc, table_column_names, table_column_positions, table_column_types); - VExprContextSPtrs conjuncts; // Empty conjuncts for this test - std::vector table_col_names = {"name", "profile"}; std::unordered_map col_name_to_block_idx = {{"name", 0}, {"profile", 1}}; - const RowDescriptor* row_descriptor = nullptr; - const VExprContextSPtrs* not_single_slot_filter_conjuncts = nullptr; - const std::unordered_map* slot_id_to_filter_conjuncts = nullptr; - st = hive_reader->init_reader(table_col_names, &col_name_to_block_idx, conjuncts, - tuple_descriptor, row_descriptor, - not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts); + OrcInitContext orc_ctx; + orc_ctx.column_names = table_column_names; + orc_ctx.col_name_to_block_idx = &col_name_to_block_idx; + orc_ctx.tuple_descriptor = tuple_descriptor; + orc_ctx.params = &scan_params; + orc_ctx.range = &scan_range; + st = hive_reader->init_reader(&orc_ctx); ASSERT_TRUE(st.ok()) << st; - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - ASSERT_TRUE(hive_reader->set_fill_columns(partition_columns, missing_columns).ok()); + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. 
// Create block for reading nested structure (not flattened) Block block; diff --git a/be/test/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp b/be/test/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp index d2de833c494fcd..e32153d1ef7f74 100644 --- a/be/test/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp +++ b/be/test/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp @@ -686,32 +686,26 @@ class IcebergReaderCreateColumnIdsTest : public ::testing::Test { // Create mock profile RuntimeProfile profile("test_profile"); - // Create ParquetReader as the underlying file format reader + // Create IcebergParquetReader (IS-A ParquetReader via CRTP mixin) cctz::time_zone ctz; TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz); - auto generic_reader = - ParquetReader::create_unique(&profile, scan_params, scan_range, 1024, &ctz, nullptr, - &runtime_state, cache.get()); - if (!generic_reader) { + auto iceberg_reader = std::make_unique( + nullptr /* kv_cache */, &profile, scan_params, scan_range, 1024, &ctz, + nullptr /* io_ctx */, &runtime_state, cache.get()); + if (!iceberg_reader) { return {nullptr, nullptr}; } - // Set file reader for the generic reader - auto parquet_reader = static_cast(generic_reader.get()); - parquet_reader->set_file_reader(file_reader); + // Set file reader directly on the iceberg reader (it IS the ParquetReader) + iceberg_reader->set_file_reader(file_reader); const FieldDescriptor* field_desc = nullptr; - st = parquet_reader->get_file_metadata_schema(&field_desc); + st = iceberg_reader->get_file_metadata_schema(&field_desc); if (!st.ok() || !field_desc) { return {nullptr, nullptr}; } - // Create IcebergParquetReader - auto iceberg_reader = std::make_unique( - std::move(generic_reader), &profile, &runtime_state, scan_params, scan_range, - nullptr, nullptr, cache.get()); - return {std::move(iceberg_reader), field_desc}; } @@ -741,30 +735,21 @@ class 
IcebergReaderCreateColumnIdsTest : public ::testing::Test { // Create mock profile RuntimeProfile profile("test_profile"); - // Create OrcReader as the underlying file format reader - cctz::time_zone ctz; - TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz); - - auto generic_reader = - OrcReader::create_unique(&profile, &runtime_state, scan_params, scan_range, 1024, - "CST", nullptr, cache.get()); - if (!generic_reader) { + // Create IcebergOrcReader (IS-A OrcReader via CRTP mixin) + auto iceberg_reader = std::make_unique( + nullptr /* kv_cache */, &profile, &runtime_state, scan_params, scan_range, 1024, + "CST", nullptr /* io_ctx */, cache.get()); + if (!iceberg_reader) { return {nullptr, nullptr}; } - auto orc_reader = static_cast(generic_reader.get()); - // Get FieldDescriptor from Orc file + // Get ORC type from the iceberg reader (it IS the OrcReader) const orc::Type* orc_type_ptr = nullptr; - st = orc_reader->get_file_type(&orc_type_ptr); + st = iceberg_reader->get_file_type(&orc_type_ptr); if (!st.ok() || !orc_type_ptr) { return {nullptr, nullptr}; } - // Create IcebergOrcReader - auto iceberg_reader = std::make_unique( - std::move(generic_reader), &profile, &runtime_state, scan_params, scan_range, - nullptr, nullptr, cache.get()); - return {std::move(iceberg_reader), orc_type_ptr}; } diff --git a/be/test/format/table/iceberg/iceberg_reader_test.cpp b/be/test/format/table/iceberg/iceberg_reader_test.cpp index ba387b8c0bb8a4..fb35b78c6174b0 100644 --- a/be/test/format/table/iceberg/iceberg_reader_test.cpp +++ b/be/test/format/table/iceberg/iceberg_reader_test.cpp @@ -45,6 +45,8 @@ #include "core/data_type/data_type_number.h" #include "core/data_type/data_type_string.h" #include "core/data_type/data_type_struct.h" +#include "format/column_descriptor.h" +#include "format/orc/vorc_reader.h" #include "format/parquet/vparquet_column_chunk_reader.h" #include "format/parquet/vparquet_reader.h" #include "io/fs/file_meta_cache.h" @@ -107,23 
+109,18 @@ class IcebergReaderTest : public ::testing::Test { parquet_reader->set_file_reader(*file_reader); - phmap::flat_hash_map>> predicates; - st = parquet_reader->init_reader(delete_file_column_names, - &delete_file_col_name_to_block_idx, {}, predicates, - nullptr, nullptr, nullptr, nullptr, nullptr); + ParquetInitContext pq_ctx; + pq_ctx.column_names = delete_file_column_names; + pq_ctx.col_name_to_block_idx = &delete_file_col_name_to_block_idx; + pq_ctx.params = scan_params; + pq_ctx.range = scan_range; + st = parquet_reader->init_reader(&pq_ctx); EXPECT_TRUE(st.ok()) << st; if (!st.ok()) { return nullptr; } - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - st = parquet_reader->set_fill_columns(partition_columns, missing_columns); - EXPECT_TRUE(st.ok()) << st; - if (!st.ok()) { - return nullptr; - } + // Partition/missing column logic is now inlined in _do_init_reader. *file_meta_data = parquet_reader->get_meta_data(); return parquet_reader; @@ -705,22 +702,17 @@ TEST_F(IcebergReaderTest, read_iceberg_parquet_file) { // Create mock profile RuntimeProfile profile("test_profile"); - // Create ParquetReader as the underlying file format reader + // Create IcebergParquetReader (IS-A ParquetReader via CRTP mixin) cctz::time_zone ctz; TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz); - auto generic_reader = ParquetReader::create_unique(&profile, scan_params, scan_range, 1024, - &ctz, nullptr, &runtime_state, cache.get()); - ASSERT_NE(generic_reader, nullptr); - - // Set file reader for the generic reader - auto parquet_reader = static_cast(generic_reader.get()); - parquet_reader->set_file_reader(file_reader); - - // Create IcebergParquetReader auto iceberg_reader = std::make_unique( - std::move(generic_reader), &profile, &runtime_state, scan_params, scan_range, nullptr, - nullptr, cache.get()); + nullptr /* kv_cache */, &profile, scan_params, scan_range, 1024, &ctz, + nullptr /* io_ctx */, &runtime_state, 
cache.get()); + ASSERT_NE(iceberg_reader, nullptr); + + // Set file reader for the iceberg reader (it IS the ParquetReader) + iceberg_reader->set_file_reader(file_reader); // Create complex struct types using helper function DataTypePtr coordinates_struct_type, address_struct_type, phone_struct_type; @@ -738,27 +730,29 @@ TEST_F(IcebergReaderTest, read_iceberg_parquet_file) { const TupleDescriptor* tuple_descriptor = create_tuple_descriptor(&desc_tbl, obj_pool, t_desc_table, t_table_desc); - VExprContextSPtrs conjuncts; // Empty conjuncts for this test std::vector table_col_names = {"name", "profile"}; std::unordered_map col_name_to_block_idx = { {"name", 0}, {"profile", 1}, }; - const RowDescriptor* row_descriptor = nullptr; - const std::unordered_map* colname_to_slot_id = nullptr; - const VExprContextSPtrs* not_single_slot_filter_conjuncts = nullptr; - const std::unordered_map* slot_id_to_filter_conjuncts = nullptr; - - phmap::flat_hash_map>> tmp; - st = iceberg_reader->init_reader(table_col_names, &col_name_to_block_idx, conjuncts, tmp, - tuple_descriptor, row_descriptor, colname_to_slot_id, - not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts); + + std::vector column_descs; + for (const auto& name : table_col_names) { + ColumnDescriptor desc; + desc.name = name; + column_descs.push_back(desc); + } + ParquetInitContext pq_ctx; + pq_ctx.column_descs = &column_descs; + pq_ctx.col_name_to_block_idx = &col_name_to_block_idx; + pq_ctx.tuple_descriptor = tuple_descriptor; + pq_ctx.params = &scan_params; + pq_ctx.range = &scan_range; + st = iceberg_reader->init_reader(&pq_ctx); ASSERT_TRUE(st.ok()) << st; - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - ASSERT_TRUE(iceberg_reader->set_fill_columns(partition_columns, missing_columns).ok()); + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. 
// Create block for reading nested structure (not flattened) Block block; @@ -845,18 +839,11 @@ TEST_F(IcebergReaderTest, read_iceberg_orc_file) { // Create mock profile RuntimeProfile profile("test_profile"); - // Create OrcReader as the underlying file format reader - cctz::time_zone ctz; - TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz); - - auto generic_reader = OrcReader::create_unique(&profile, &runtime_state, scan_params, - scan_range, 1024, "CST", nullptr, cache.get()); - ASSERT_NE(generic_reader, nullptr); - - // Create IcebergOrcReader + // Create IcebergOrcReader (IS-A OrcReader via CRTP mixin) auto iceberg_reader = std::make_unique( - std::move(generic_reader), &profile, &runtime_state, scan_params, scan_range, nullptr, - nullptr, cache.get()); + nullptr /* kv_cache */, &profile, &runtime_state, scan_params, scan_range, 1024, "CST", + nullptr /* io_ctx */, cache.get()); + ASSERT_NE(iceberg_reader, nullptr); // Create complex struct types using helper function DataTypePtr coordinates_struct_type, address_struct_type, phone_struct_type; @@ -874,26 +861,31 @@ TEST_F(IcebergReaderTest, read_iceberg_orc_file) { const TupleDescriptor* tuple_descriptor = create_tuple_descriptor(&desc_tbl, obj_pool, t_desc_table, t_table_desc); - VExprContextSPtrs conjuncts; // Empty conjuncts for this test std::vector table_col_names = {"name", "profile"}; const RowDescriptor* row_descriptor = nullptr; - const std::unordered_map* colname_to_slot_id = nullptr; std::unordered_map col_name_to_block_idx = { {"name", 0}, {"profile", 1}, }; - const VExprContextSPtrs* not_single_slot_filter_conjuncts = nullptr; - const std::unordered_map* slot_id_to_filter_conjuncts = nullptr; - st = iceberg_reader->init_reader(table_col_names, &col_name_to_block_idx, conjuncts, - tuple_descriptor, row_descriptor, colname_to_slot_id, - not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts); + std::vector column_descs; + for (const auto& name : table_col_names) { + 
ColumnDescriptor desc; + desc.name = name; + column_descs.push_back(desc); + } + OrcInitContext orc_ctx; + orc_ctx.column_descs = &column_descs; + orc_ctx.col_name_to_block_idx = &col_name_to_block_idx; + orc_ctx.tuple_descriptor = tuple_descriptor; + orc_ctx.row_descriptor = row_descriptor; + orc_ctx.params = &scan_params; + orc_ctx.range = &scan_range; + st = iceberg_reader->init_reader(&orc_ctx); ASSERT_TRUE(st.ok()) << st; - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - ASSERT_TRUE(iceberg_reader->set_fill_columns(partition_columns, missing_columns).ok()); + // set_fill_columns logic is now inlined in _do_init_reader, + // so no separate call is needed. // Create block for reading nested structure (not flattened) Block block; diff --git a/be/test/format/table/table_schema_change_helper_test.cpp b/be/test/format/table/table_schema_change_helper_test.cpp index ba1d96e4d6d3c9..de653c1486a5f0 100644 --- a/be/test/format/table/table_schema_change_helper_test.cpp +++ b/be/test/format/table/table_schema_change_helper_test.cpp @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
+#include "format/table/table_schema_change_helper.h" + #include #include @@ -24,7 +26,6 @@ #include "core/column/column_string.h" #include "core/data_type/data_type_factory.hpp" #include "format/table/iceberg_reader.h" -#include "format/table/table_format_reader.h" #include "testutil/desc_tbl_builder.h" namespace doris { @@ -337,7 +338,7 @@ TEST(MockTableSchemaChangeHelper, IcebergParquetSchemaChange) { bool exist_field_id = true; std::shared_ptr ans_node = nullptr; ASSERT_TRUE(TableSchemaChangeHelper::BuildTableInfoUtil::by_parquet_field_id( - test_field, parquet_field, exist_field_id, ans_node) + test_field, parquet_field, ans_node, exist_field_id) .ok()); ASSERT_TRUE(exist_field_id); std::cout << TableSchemaChangeHelper::debug(ans_node) << "\n"; @@ -420,7 +421,7 @@ TEST(MockTableSchemaChangeHelper, IcebergOrcSchemaChange) { bool exist_field_id = true; std::shared_ptr ans_node = nullptr; ASSERT_TRUE(TableSchemaChangeHelper::BuildTableInfoUtil::by_orc_field_id( - test_field, orc_type.get(), attribute, exist_field_id, ans_node) + test_field, orc_type.get(), attribute, ans_node, exist_field_id) .ok()); ASSERT_TRUE(exist_field_id); @@ -806,7 +807,7 @@ TEST(MockTableSchemaChangeHelper, OrcFieldIdNestedStructMap) { bool exist_field_id = true; std::shared_ptr ans_node = nullptr; ASSERT_TRUE(TableSchemaChangeHelper::BuildTableInfoUtil::by_orc_field_id( - test_field, orc_type.get(), attribute, exist_field_id, ans_node) + test_field, orc_type.get(), attribute, ans_node, exist_field_id) .ok()); ASSERT_TRUE(exist_field_id); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java index 9f3bbe2376ab77..c6a07c9cdcd5cb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java @@ -45,6 +45,7 @@ import org.apache.doris.spi.Split; import 
org.apache.doris.system.Backend; import org.apache.doris.tablefunction.ExternalFileTableValuedFunction; +import org.apache.doris.thrift.TColumnCategory; import org.apache.doris.thrift.TExternalScanRange; import org.apache.doris.thrift.TFileAttributes; import org.apache.doris.thrift.TFileCompressType; @@ -169,11 +170,9 @@ protected void initSchemaParams() throws UserException { for (SlotDescriptor slot : desc.getSlots()) { TFileScanSlotInfo slotInfo = new TFileScanSlotInfo(); slotInfo.setSlotId(slot.getId().asInt()); - boolean isFileSlot = !partitionKeys.contains(slot.getColumn().getName()); - if (isIcebergRowIdColumn(slot)) { - isFileSlot = false; - } - slotInfo.setIsFileSlot(isFileSlot); + TColumnCategory category = classifyColumn(slot, partitionKeys); + slotInfo.setCategory(category); + slotInfo.setIsFileSlot(category == TColumnCategory.REGULAR || category == TColumnCategory.GENERATED); params.addToRequiredSlots(slotInfo); } setDefaultValueExprs(getTargetTable(), destSlotDescByName, null, params, false); @@ -190,19 +189,27 @@ private void updateRequiredSlots() throws UserException { for (SlotDescriptor slot : desc.getSlots()) { TFileScanSlotInfo slotInfo = new TFileScanSlotInfo(); slotInfo.setSlotId(slot.getId().asInt()); - boolean isFileSlot = !getPathPartitionKeys().contains(slot.getColumn().getName()); - if (isIcebergRowIdColumn(slot)) { - isFileSlot = false; - } - slotInfo.setIsFileSlot(isFileSlot); + TColumnCategory category = classifyColumn(slot, getPathPartitionKeys()); + slotInfo.setCategory(category); + slotInfo.setIsFileSlot(category == TColumnCategory.REGULAR || category == TColumnCategory.GENERATED); params.addToRequiredSlots(slotInfo); } // Update required slots and column_idxs in scanRangeLocations. setColumnPositionMapping(); } - private boolean isIcebergRowIdColumn(SlotDescriptor slot) { - return Column.ICEBERG_ROWID_COL.equalsIgnoreCase(slot.getColumn().getName()); + /** + * Classify a column's category for the BE reader. 
+ * Subclasses override this for format-specific classification. + */ + protected TColumnCategory classifyColumn(SlotDescriptor slot, List partitionKeys) { + if (Column.ICEBERG_ROWID_COL.equalsIgnoreCase(slot.getColumn().getName())) { + return TColumnCategory.SYNTHESIZED; + } + if (partitionKeys.contains(slot.getColumn().getName())) { + return TColumnCategory.PARTITION_KEY; + } + return TColumnCategory.REGULAR; } public void setTableSample(TableSample tSample) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java index c8aa0baab0ec88..e00463f16d3052 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java @@ -42,6 +42,7 @@ import org.apache.doris.thrift.TFileRangeDesc; import org.apache.doris.thrift.TFileScanNode; import org.apache.doris.thrift.TFileScanRangeParams; +import org.apache.doris.thrift.TFileScanSlotInfo; import org.apache.doris.thrift.TPlanNode; import org.apache.doris.thrift.TPlanNodeType; import org.apache.doris.thrift.TPushAggOp; @@ -252,6 +253,15 @@ protected void setDefaultValueExprs(TableIf tbl, nameToSlotDesc.put(slot.getColumn().getName(), slot); } + // Build slot_id -> index map for required_slots to set default_value_expr inline. + Map slotIdToRequiredIdx = Maps.newHashMap(); + if (params.getRequiredSlots() != null) { + for (int i = 0; i < params.getRequiredSlots().size(); i++) { + TFileScanSlotInfo slotInfo = params.getRequiredSlots().get(i); + slotIdToRequiredIdx.put(slotInfo.getSlotId(), i); + } + } + for (Column column : desc.getTable().getFullSchema()) { Expr expr; Expression expression; @@ -293,19 +303,28 @@ protected void setDefaultValueExprs(TableIf tbl, // default value. // and if z is not nullable, the load will fail. 
if (slotDesc != null) { + TExpr defaultExpr; if (expression != null) { expression = TypeCoercionUtils.castIfNotSameType(expression, DataType.fromCatalogType(slotDesc.getType())); expr = ExpressionTranslator.translate(expression, new PlanTranslatorContext(CascadesContext.initTempContext())); - params.putToDefaultValueOfSrcSlot(slotDesc.getId().asInt(), ExprToThriftVisitor.treeToThrift(expr)); + defaultExpr = ExprToThriftVisitor.treeToThrift(expr); } else { - params.putToDefaultValueOfSrcSlot(slotDesc.getId().asInt(), tExpr); + defaultExpr = tExpr; + } + // Populate legacy map (for backward compatibility with old BE) + params.putToDefaultValueOfSrcSlot(slotDesc.getId().asInt(), defaultExpr); + // Also embed default expr directly in the TFileScanSlotInfo + Integer idx = slotIdToRequiredIdx.get(slotDesc.getId().asInt()); + if (idx != null) { + params.getRequiredSlots().get(idx).setDefaultValueExpr(defaultExpr); } } } } + protected void addFileCacheAdmissionLog(String userIdentity, Boolean admitted, String reason, double durationMs) { String admissionStatus = admitted ? "ADMITTED" : "DENIED"; String admissionLog = String.format("file cache request %s: user_identity:%s, reason:%s, cost:%.6f ms", diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/load/NereidsLoadPlanInfoCollector.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/load/NereidsLoadPlanInfoCollector.java index 74fbab4590d216..f5a2f9397f3b39 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/load/NereidsLoadPlanInfoCollector.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/load/NereidsLoadPlanInfoCollector.java @@ -187,14 +187,26 @@ public TFileScanRangeParams toFileScanRangeParams(TUniqueId loadId, NereidsFileG params.putToExprOfDestSlot(entry.getKey().asInt(), ExprToThriftVisitor.treeToThrift(entry.getValue())); } + // Build slot_id -> index map for required_slots to set default_value_expr inline. 
+ Map slotIdToRequiredIdx = Maps.newHashMap(); + for (int i = 0; i < params.getRequiredSlots().size(); i++) { + slotIdToRequiredIdx.put(params.getRequiredSlots().get(i).getSlotId(), i); + } + for (Map.Entry entry : srcSlotIdToDefaultValueMap.entrySet()) { + TExpr defaultExpr; if (entry.getValue() != null) { - params.putToDefaultValueOfSrcSlot(entry.getKey().asInt(), - ExprToThriftVisitor.treeToThrift(entry.getValue())); + defaultExpr = ExprToThriftVisitor.treeToThrift(entry.getValue()); } else { - TExpr tExpr = new TExpr(); - tExpr.setNodes(Lists.newArrayList()); - params.putToDefaultValueOfSrcSlot(entry.getKey().asInt(), tExpr); + defaultExpr = new TExpr(); + defaultExpr.setNodes(Lists.newArrayList()); + } + // Populate legacy map (for backward compatibility with old BE) + params.putToDefaultValueOfSrcSlot(entry.getKey().asInt(), defaultExpr); + // Also embed default expr directly in the TFileScanSlotInfo + Integer idx = slotIdToRequiredIdx.get(entry.getKey().asInt()); + if (idx != null) { + params.getRequiredSlots().get(idx).setDefaultValueExpr(defaultExpr); } } diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 60c5e01ecf8fdd..ae7427e3327c55 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -253,9 +253,21 @@ struct TFileTextScanRangeParams { 8: optional bool empty_field_as_null } +enum TColumnCategory { + REGULAR = 0, + PARTITION_KEY = 1, + SYNTHESIZED = 2, + GENERATED = 3, +} + struct TFileScanSlotInfo { 1: optional Types.TSlotId slot_id; 2: optional bool is_file_slot; + 3: optional TColumnCategory category; + // Default value expression for this column when it is missing from the data file. + // Populated by FE from Column.getDefaultValue() or NULL literal. + // This replaces the separate default_value_of_src_slot map in TFileScanRangeParams. 
+ 4: optional Exprs.TExpr default_value_expr; } // descirbe how to read file diff --git a/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_v3_row_lineage_rewrite_data_files.groovy b/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_v3_row_lineage_rewrite_data_files.groovy deleted file mode 100644 index 438276c6946950..00000000000000 --- a/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_v3_row_lineage_rewrite_data_files.groovy +++ /dev/null @@ -1,244 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -suite("test_iceberg_v3_row_lineage_rewrite_data_files", "p0,external,iceberg,external_docker,external_docker_iceberg") { - String enabled = context.config.otherConfigs.get("enableIcebergTest") - if (enabled == null || !enabled.equalsIgnoreCase("true")) { - logger.info("Iceberg test is disabled") - return - } - - String catalogName = "test_iceberg_v3_row_lineage_rewrite_data_files" - String dbName = "test_row_lineage_rewrite_db" - String restPort = context.config.otherConfigs.get("iceberg_rest_uri_port") - String minioPort = context.config.otherConfigs.get("iceberg_minio_port") - String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") - String endpoint = "http://${externalEnvIp}:${minioPort}" - - def formats = ["parquet", "orc"] - - def schemaContainsField = { schemaRows, fieldName -> - String target = fieldName.toLowerCase() - return schemaRows.any { row -> row.toString().toLowerCase().contains(target) } - } - - def fileSchemaRows = { filePath, format -> - return sql(""" - desc function s3( - "uri" = "${filePath}", - "format" = "${format}", - "s3.access_key" = "admin", - "s3.secret_key" = "password", - "s3.endpoint" = "${endpoint}", - "s3.region" = "us-east-1" - ) - """) - } - - def assertCurrentFilesContainRowLineageColumns = { tableName, format -> - def files = sql("""select file_path, lower(file_format) from ${tableName}\$files order by file_path""") - log.info("Checking rewritten files for physical row lineage columns in ${tableName}: ${files}") - assertTrue(files.size() > 0, "Current files should exist for ${tableName}") - files.each { row -> - assertEquals(format, row[1].toString()) - assertTrue(row[0].toString().endsWith(format == "parquet" ? 
".parquet" : ".orc"), - "Current data file should match ${format} for ${tableName}, file=${row[0]}") - def schemaRows = fileSchemaRows(row[0].toString(), format) - log.info("Rewritten ${format} schema for ${tableName}, file=${row[0]} -> ${schemaRows}") - assertTrue(schemaContainsField(schemaRows, "_row_id"), - "Rewritten file should physically contain _row_id for ${tableName}, schema=${schemaRows}") - assertTrue(schemaContainsField(schemaRows, "_last_updated_sequence_number"), - "Rewritten file should physically contain _last_updated_sequence_number for ${tableName}, schema=${schemaRows}") - } - } - - def assertCurrentFilesDoNotContainRowLineageColumns = { tableName, format -> - def files = sql("""select file_path, lower(file_format) from ${tableName}\$files order by file_path""") - log.info("Checking regular INSERT files for absence of physical row lineage columns in ${tableName}: ${files}") - assertTrue(files.size() > 0, "Current files should exist for ${tableName}") - files.each { row -> - assertEquals(format, row[1].toString()) - assertTrue(row[0].toString().endsWith(format == "parquet" ? 
".parquet" : ".orc"), - "Current data file should match ${format} for ${tableName}, file=${row[0]}") - def schemaRows = fileSchemaRows(row[0].toString(), format) - log.info("Regular INSERT ${format} schema for ${tableName}, file=${row[0]} -> ${schemaRows}") - assertTrue(!schemaContainsField(schemaRows, "_row_id"), - "Normal INSERT file should not contain _row_id for ${tableName}, schema=${schemaRows}") - assertTrue(!schemaContainsField(schemaRows, "_last_updated_sequence_number"), - "Normal INSERT file should not contain _last_updated_sequence_number for ${tableName}, schema=${schemaRows}") - } - } - - def lineageMap = { tableName -> - def rows = sql(""" - select id, _row_id, _last_updated_sequence_number - from ${tableName} - order by id - """) - Map> result = [:] - rows.each { row -> - result[row[0].toString().toInteger()] = [row[1].toString(), row[2].toString()] - } - log.info("Built lineage map for ${tableName}: ${result}") - return result - } - - def assertLineageMapEquals = { expected, actual, tableName -> - log.info("Comparing lineage maps for ${tableName}: expected=${expected}, actual=${actual}") - assertEquals(expected.size(), actual.size()) - expected.each { key, value -> - assertTrue(actual.containsKey(key), "Missing id=${key} after rewrite for ${tableName}") - assertEquals(value[0], actual[key][0]) - assertEquals(value[1], actual[key][1]) - } - } - - def runRewriteAndAssert = { tableName, format, expectedCount -> - def filesBefore = sql("""select file_path from ${tableName}\$files order by file_path""") - def snapshotsBefore = sql("""select snapshot_id from ${tableName}\$snapshots order by committed_at""") - log.info("Checking rewrite preconditions for ${tableName}: filesBefore=${filesBefore}, snapshotsBefore=${snapshotsBefore}") - assertTrue(filesBefore.size() >= 2, - "Rewrite test requires at least 2 input files for ${tableName}, but got ${filesBefore.size()}") - - def visibleBefore = sql("""select * from ${tableName} order by id""") - def 
rowLineageBefore = lineageMap(tableName) - log.info("Visible rows before rewrite for ${tableName}: ${visibleBefore}") - - assertCurrentFilesDoNotContainRowLineageColumns(tableName, format) - - def rewriteResult = sql(""" - alter table ${catalogName}.${dbName}.${tableName} - execute rewrite_data_files( - "target-file-size-bytes" = "10485760", - "min-input-files" = "1" - ) - """) - log.info("rewrite_data_files result for ${tableName}: ${rewriteResult}") - assertTrue(rewriteResult.size() > 0, "rewrite_data_files should return summary rows for ${tableName}") - int rewrittenFiles = rewriteResult[0][0] as int - assertTrue(rewrittenFiles > 0, "rewrite_data_files should rewrite at least one file for ${tableName}") - - def visibleAfter = sql("""select * from ${tableName} order by id""") - log.info("Visible rows after rewrite for ${tableName}: ${visibleAfter}") - assertEquals(visibleBefore, visibleAfter) - - def rowLineageAfter = lineageMap(tableName) - assertLineageMapEquals(rowLineageBefore, rowLineageAfter, tableName) - - def countAfter = sql("""select count(*) from ${tableName}""") - log.info("Checking row count after rewrite for ${tableName}: ${countAfter}") - assertEquals(expectedCount, countAfter[0][0].toString().toInteger()) - - def snapshotsAfter = sql("""select snapshot_id from ${tableName}\$snapshots order by committed_at""") - log.info("Snapshots after rewrite for ${tableName}: ${snapshotsAfter}") - assertTrue(snapshotsAfter.size() > snapshotsBefore.size(), - "rewrite_data_files should create a new snapshot for ${tableName}") - - assertCurrentFilesContainRowLineageColumns(tableName, format) - - def sampleRowId = rowLineageAfter.entrySet().iterator().next().value[0] - def sampleQuery = sql("""select count(*) from ${tableName} where _row_id = ${sampleRowId}""") - log.info("Checking sample _row_id predicate after rewrite for ${tableName}: sampleRowId=${sampleRowId}, result=${sampleQuery}") - assertEquals(1, sampleQuery[0][0].toString().toInteger()) - } - - sql 
"""drop catalog if exists ${catalogName}""" - sql """ - create catalog if not exists ${catalogName} properties ( - "type" = "iceberg", - "iceberg.catalog.type" = "rest", - "uri" = "http://${externalEnvIp}:${restPort}", - "s3.access_key" = "admin", - "s3.secret_key" = "password", - "s3.endpoint" = "${endpoint}", - "s3.region" = "us-east-1" - ) - """ - - sql """switch ${catalogName}""" - sql """create database if not exists ${dbName}""" - sql """use ${dbName}""" - sql """set enable_fallback_to_original_planner = false""" - sql """set show_hidden_columns = false""" - - try { - formats.each { format -> - String rewriteTable = "test_row_lineage_rewrite_unpartitioned_${format}" - String rewritePartitionTable = "test_row_lineage_rewrite_partitioned_${format}" - log.info("Run rewrite_data_files row lineage test with format ${format}") - - try { - sql """drop table if exists ${rewriteTable}""" - sql """ - create table ${rewriteTable} ( - id int, - name string, - score int - ) engine=iceberg - properties ( - "format-version" = "3", - "write.format.default" = "${format}" - ) - """ - - sql """insert into ${rewriteTable} values (1, 'A', 10), (2, 'B', 20)""" - sql """insert into ${rewriteTable} values (3, 'C', 30), (4, 'D', 40)""" - sql """insert into ${rewriteTable} values (5, 'E', 50), (6, 'F', 60)""" - log.info("Inserted three batches into ${rewriteTable} to prepare rewrite_data_files input files") - - // Assert baseline: - // 1. Data files from regular INSERT do not physically contain the two row lineage columns. - // 2. After rewrite_data_files, every current data file should contain both row lineage columns. - // 3. Visible query results stay unchanged before and after rewrite. - // 4. _row_id and _last_updated_sequence_number stay stable for every row across rewrite. 
- runRewriteAndAssert(rewriteTable, format, 6) - - sql """drop table if exists ${rewritePartitionTable}""" - sql """ - create table ${rewritePartitionTable} ( - id int, - name string, - score int, - dt date - ) engine=iceberg - partition by list (day(dt)) () - properties ( - "format-version" = "3", - "write.format.default" = "${format}" - ) - """ - - sql """insert into ${rewritePartitionTable} values (11, 'P1', 10, '2024-01-01'), (12, 'P2', 20, '2024-01-01')""" - sql """insert into ${rewritePartitionTable} values (13, 'P3', 30, '2024-01-01'), (14, 'P4', 40, '2024-02-01')""" - sql """insert into ${rewritePartitionTable} values (15, 'P5', 50, '2024-02-01'), (16, 'P6', 60, '2024-01-01')""" - log.info("Inserted three partitioned batches into ${rewritePartitionTable} to prepare rewrite_data_files input files") - - // Assert baseline: - // 1. Partitioned tables also write row lineage columns physically only during rewrite. - // 2. Business data and row lineage values stay stable before and after rewrite. - // 3. _row_id predicate queries remain available after rewrite. - runRewriteAndAssert(rewritePartitionTable, format, 6) - } finally { - sql """drop table if exists ${rewritePartitionTable}""" - sql """drop table if exists ${rewriteTable}""" - } - } - } finally { - sql """drop database if exists ${dbName} force""" - sql """drop catalog if exists ${catalogName}""" - } -} diff --git a/regression-test/suites/external_table_p0/iceberg/test_iceberg_v2_to_v3_doris_spark_compare.groovy b/regression-test/suites/external_table_p0/iceberg/test_iceberg_v2_to_v3_doris_spark_compare.groovy deleted file mode 100644 index df6d1bbea20087..00000000000000 --- a/regression-test/suites/external_table_p0/iceberg/test_iceberg_v2_to_v3_doris_spark_compare.groovy +++ /dev/null @@ -1,223 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -suite("test_iceberg_v2_to_v3_doris_spark_compare", "p0,external,iceberg,external_docker,external_docker_iceberg") { - def enabled = context.config.otherConfigs.get("enableIcebergTest") - if (enabled == null || !enabled.equalsIgnoreCase("true")) { - logger.info("Iceberg test is disabled") - return - } - - def catalogName = "test_iceberg_v2_to_v3_doris_spark_compare" - def dbName = "test_v2_to_v3_doris_spark_compare_db" - def restPort = context.config.otherConfigs.get("iceberg_rest_uri_port") - def minioPort = context.config.otherConfigs.get("iceberg_minio_port") - def externalEnvIp = context.config.otherConfigs.get("externalEnvIp") - - def formats = ["parquet", "orc"] - - def tableNameForFormat = { baseName, format -> - return format == "parquet" ? 
baseName : "${baseName}_orc" - } - - sql """drop catalog if exists ${catalogName}""" - sql """ - create catalog if not exists ${catalogName} properties ( - "type" = "iceberg", - "iceberg.catalog.type" = "rest", - "uri" = "http://${externalEnvIp}:${restPort}", - "s3.access_key" = "admin", - "s3.secret_key" = "password", - "s3.endpoint" = "http://${externalEnvIp}:${minioPort}", - "s3.region" = "us-east-1" - ) - """ - - sql """switch ${catalogName}""" - sql """use ${dbName}""" - sql """set enable_fallback_to_original_planner = false""" - - try { - def assertV2RowsAreNullAfterUpgrade = { tableName -> - def rows = sql """ - select id, _row_id, _last_updated_sequence_number - from ${tableName} - order by id - """ - assertEquals(2, rows.size()) - rows.each { row -> - assertTrue(row[1] == null, - "_row_id should be null for v2 rows after upgrade in ${tableName}, row=${row}") - assertTrue(row[2] == null, - "_last_updated_sequence_number should be null for v2 rows after upgrade in ${tableName}, row=${row}") - } - } - - def assertV23RowsNotNullAfterUpd = { tableName -> - def rows = sql """ - select id, _row_id, _last_updated_sequence_number - from ${tableName} - order by id - """ - rows.each { row -> - assertTrue(row[1] != null, - "_row_id should be non-null after Doris operator for ${tableName}") - assertTrue(row[2] != null, - "_last_updated_sequence_number should be non-null after Doris operator for ${tableName}") - - } - } - - def upgradeV3DorisOperationInsert = { tableName -> - assertV2RowsAreNullAfterUpgrade(tableName) - - sql """ - insert into ${tableName} values - (4, 'post_v3_i', 400, date '2024-01-04') - """ - - def rows = sql """ - select id, tag, score, _row_id, _last_updated_sequence_number - from ${tableName} - order by id - """ - assertEquals(3, rows.size()) - assertEquals(4, rows[2][0].toString().toInteger()) - assertEquals("post_v3_i", rows[2][1]) - assertV23RowsNotNullAfterUpd(tableName) - } - - def upgradeV3DorisOperationDelete = { tableName -> - 
assertV2RowsAreNullAfterUpgrade(tableName) - - sql """ - delete from ${tableName} - where id = 3 - """ - - def rows = sql """ - select id, tag, score - from ${tableName} - order by id - """ - assertEquals(1, rows.size()) - assertEquals(1, rows[0][0].toString().toInteger()) - assertV23RowsNotNullAfterUpd(tableName) - - } - - def upgradeV3DorisOperationUpdate = { tableName -> - assertV2RowsAreNullAfterUpgrade(tableName) - - sql """ - update ${tableName} - set tag = 'post_v3_u', score = score + 20 - where id = 1 - """ - - def rows = sql """ - select id, tag, score - from ${tableName} - order by id - """ - assertEquals(2, rows.size()) - assertEquals(1, rows[0][0].toString().toInteger()) - assertEquals("post_v3_u", rows[0][1]) - assertV23RowsNotNullAfterUpd(tableName) - } - - def upgradeV3DorisOperationRewrite = { tableName -> - assertV2RowsAreNullAfterUpgrade(tableName) - - def rewriteResult = sql(""" - alter table ${catalogName}.${dbName}.${tableName} - execute rewrite_data_files( - "target-file-size-bytes" = "10485760", - "min-input-files" = "1" - ) - """) - assertTrue(rewriteResult.size() > 0, - "rewrite_data_files should return summary rows for ${tableName}") - - def rowCount = sql """ - select count(*) - from ${tableName} - """ - assertEquals(2, rowCount[0][0].toString().toInteger()) - assertV23RowsNotNullAfterUpd(tableName) - } - - formats.each { format -> - def rowLineageNullTable = tableNameForFormat("v2v3_row_lineage_null_after_upgrade", format) - def sparkReferenceTable = tableNameForFormat("v2v3_spark_ops_reference", format) - def dorisTargetTable = tableNameForFormat("v2v3_doris_ops_target", format) - log.info("Run v2-to-v3 Doris/Spark compare test with format ${format}") - - def scenario1Rows = sql """ - select id, _row_id, _last_updated_sequence_number - from ${rowLineageNullTable} - order by id - """ - assertEquals(3, scenario1Rows.size()) - scenario1Rows.each { row -> - assertTrue(row[1] == null, - "_row_id should be null for rows written before v3 
upgrade, row=${row}") - assertTrue(row[2] == null, - "_last_updated_sequence_number should be null for rows written before v3 upgrade, row=${row}") - } - - sql """ - update ${dorisTargetTable} - set tag = 'post_v3_u', score = score + 20 - where id = 2 - """ - - sql """ - insert into ${dorisTargetTable} values - (4, 'post_v3_i', 400, date '2024-02-04') - """ - - def dorisRewriteResult = sql(""" - alter table ${catalogName}.${dbName}.${dorisTargetTable} - execute rewrite_data_files( - "target-file-size-bytes" = "10485760", - "min-input-files" = "1" - ) - """) - assertTrue(dorisRewriteResult.size() > 0, - "Doris rewrite_data_files should return summary rows") - - check_sqls_result_equal """ - select * - from ${dorisTargetTable} - order by id - """, """ - select * - from ${sparkReferenceTable} - order by id - """ - - upgradeV3DorisOperationInsert(tableNameForFormat("v2v3_doris_upd_case1", format)) - upgradeV3DorisOperationDelete(tableNameForFormat("v2v3_doris_upd_case2", format)) - upgradeV3DorisOperationUpdate(tableNameForFormat("v2v3_doris_upd_case3", format)) - upgradeV3DorisOperationRewrite(tableNameForFormat("v2v3_doris_upd_case4", format)) - } - - } finally { - sql """drop catalog if exists ${catalogName}""" - } -} diff --git a/regression-test/suites/external_table_p0/iceberg/test_iceberg_v3_row_lineage_query_insert.groovy b/regression-test/suites/external_table_p0/iceberg/test_iceberg_v3_row_lineage_query_insert.groovy deleted file mode 100644 index 7276fadba76b2c..00000000000000 --- a/regression-test/suites/external_table_p0/iceberg/test_iceberg_v3_row_lineage_query_insert.groovy +++ /dev/null @@ -1,304 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -suite("test_iceberg_v3_row_lineage_query_insert", "p0,external,iceberg,external_docker,external_docker_iceberg") { - String enabled = context.config.otherConfigs.get("enableIcebergTest") - if (enabled == null || !enabled.equalsIgnoreCase("true")) { - logger.info("Iceberg test is disabled") - return - } - - String catalogName = "test_iceberg_v3_row_lineage_query_insert" - String dbName = "test_row_lineage_query_insert_db" - String restPort = context.config.otherConfigs.get("iceberg_rest_uri_port") - String minioPort = context.config.otherConfigs.get("iceberg_minio_port") - String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") - String endpoint = "http://${externalEnvIp}:${minioPort}" - - def formats = ["parquet", "orc"] - - def collectDescColumns = { rows -> - return rows.collect { row -> row[0].toString().toLowerCase() } - } - - def schemaContainsField = { schemaRows, fieldName -> - String target = fieldName.toLowerCase() - return schemaRows.any { row -> row.toString().toLowerCase().contains(target) } - } - - def fileSchemaRows = { filePath, format -> - return sql(""" - desc function s3( - "uri" = "${filePath}", - "format" = "${format}", - "s3.access_key" = "admin", - "s3.secret_key" = "password", - "s3.endpoint" = "${endpoint}", - "s3.region" = "us-east-1" - ) - """) - } - - def assertCurrentFilesDoNotContainRowLineageColumns = { tableName, format, messagePrefix 
-> - def files = sql("""select file_path, lower(file_format) from ${tableName}\$files order by file_path""") - log.info("${messagePrefix}: checking ${files.size()} current data files for ${tableName}: ${files}") - assertTrue(files.size() > 0, "Current data files should exist for ${tableName}") - files.each { row -> - assertEquals(format, row[1].toString()) - assertTrue(row[0].toString().endsWith(format == "parquet" ? ".parquet" : ".orc"), - "${messagePrefix} should write ${format} files for ${tableName}, file=${row[0]}") - def schemaRows = fileSchemaRows(row[0].toString(), format) - log.info("${messagePrefix}: ${format} schema for ${tableName}, file=${row[0]} -> ${schemaRows}") - assertTrue(!schemaContainsField(schemaRows, "_row_id"), - "${messagePrefix} should not physically write _row_id, schema=${schemaRows}") - assertTrue(!schemaContainsField(schemaRows, "_last_updated_sequence_number"), - "${messagePrefix} should not physically write _last_updated_sequence_number, schema=${schemaRows}") - } - } - - def assertRowLineageHiddenColumns = { tableName, visibleColumnCount -> - sql("""set show_hidden_columns = false""") - def descDefault = sql("""desc ${tableName}""") - def defaultColumns = collectDescColumns(descDefault) - log.info("Checking hidden-column default visibility for ${tableName}: desc=${descDefault}") - assertTrue(!defaultColumns.contains("_row_id"), - "DESC default should hide _row_id for ${tableName}, got ${defaultColumns}") - assertTrue(!defaultColumns.contains("_last_updated_sequence_number"), - "DESC default should hide _last_updated_sequence_number for ${tableName}, got ${defaultColumns}") - - def selectVisible = sql("""select * from ${tableName} order by id""") - log.info("Checking visible SELECT * layout for ${tableName}: rowCount=${selectVisible.size()}, firstRow=${selectVisible ? 
selectVisible[0] : 'EMPTY'}") - assertTrue(selectVisible.size() > 0, "SELECT * should return rows for ${tableName}") - assertEquals(visibleColumnCount, selectVisible[0].size()) - - sql("""set show_hidden_columns = true""") - def descHidden = sql("""desc ${tableName}""") - def hiddenColumns = collectDescColumns(descHidden) - log.info("Checking hidden-column enabled visibility for ${tableName}: desc=${descHidden}") - assertTrue(hiddenColumns.contains("_row_id"), - "DESC with show_hidden_columns=true should expose _row_id for ${tableName}, got ${hiddenColumns}") - assertTrue(hiddenColumns.contains("_last_updated_sequence_number"), - "DESC with show_hidden_columns=true should expose _last_updated_sequence_number for ${tableName}, got ${hiddenColumns}") - - def selectHidden = sql("""select * from ${tableName} order by id""") - log.info("Checking hidden SELECT * layout for ${tableName}: rowCount=${selectHidden.size()}, firstRow=${selectHidden ? selectHidden[0] : 'EMPTY'}") - assertTrue(selectHidden.size() > 0, "SELECT * with hidden columns should return rows for ${tableName}") - assertEquals(visibleColumnCount + 2 + 1, selectHidden[0].size()) // _row_id + _last_updated_sequence_number + __DORIS_ICEBERG_ROWID_COL__ - - sql("""set show_hidden_columns = false""") - } - - def assertExplicitRowLineageReadable = { tableName, expectedIds -> - def rowLineageRows = sql(""" - select id, _row_id, _last_updated_sequence_number - from ${tableName} - order by id - """) - log.info("Checking explicit row lineage projection for ${tableName}: rows=${rowLineageRows}") - assertEquals(expectedIds.size(), rowLineageRows.size()) - for (int i = 0; i < expectedIds.size(); i++) { - assertEquals(expectedIds[i], rowLineageRows[i][0].toString().toInteger()) - assertTrue(rowLineageRows[i][1] != null, - "_row_id should be non-null for ${tableName}, row=${rowLineageRows[i]}") - assertTrue(rowLineageRows[i][2] != null, - "_last_updated_sequence_number should be non-null for ${tableName}, 
row=${rowLineageRows[i]}") - } - - long firstRowId = rowLineageRows[0][1].toString().toLong() - long secondRowId = rowLineageRows[1][1].toString().toLong() - assertTrue(firstRowId < secondRowId, - "Row lineage ids should increase with row position for ${tableName}, rows=${rowLineageRows}") - - def byRowId = sql("""select id from ${tableName} where _row_id = ${firstRowId} order by id""") - log.info("Checking single _row_id predicate for ${tableName}: rowId=${firstRowId}, result=${byRowId}") - assertEquals(1, byRowId.size()) - assertEquals(expectedIds[0], byRowId[0][0].toString().toInteger()) - - def combinedPredicate = sql(""" - select id - from ${tableName} - where id >= ${expectedIds[1]} and _row_id in (${rowLineageRows[1][1]}, ${rowLineageRows[2][1]}) - order by id - """) - log.info("Checking combined business + _row_id predicate for ${tableName}: result=${combinedPredicate}") - assertEquals(2, combinedPredicate.size()) - assertEquals(expectedIds[1], combinedPredicate[0][0].toString().toInteger()) - assertEquals(expectedIds[2], combinedPredicate[1][0].toString().toInteger()) - } - - sql """drop catalog if exists ${catalogName}""" - sql """ - create catalog if not exists ${catalogName} properties ( - "type" = "iceberg", - "iceberg.catalog.type" = "rest", - "uri" = "http://${externalEnvIp}:${restPort}", - "s3.access_key" = "admin", - "s3.secret_key" = "password", - "s3.endpoint" = "${endpoint}", - "s3.region" = "us-east-1" - ) - """ - - sql """switch ${catalogName}""" - sql """create database if not exists ${dbName}""" - sql """use ${dbName}""" - sql """set enable_fallback_to_original_planner = false""" - sql """set show_hidden_columns = false""" - - try { - formats.each { format -> - String unpartitionedTable = "test_row_lineage_query_insert_unpartitioned_${format}" - String partitionedTable = "test_row_lineage_query_insert_partitioned_${format}" - log.info("Run row lineage query/insert test with format ${format}") - - try { - sql """drop table if exists 
${unpartitionedTable}""" - sql """ - create table ${unpartitionedTable} ( - id int, - name string, - age int - ) engine=iceberg - properties ( - "format-version" = "3", - "write.format.default" = "${format}" - ) - """ - - sql """ - insert into ${unpartitionedTable} values(1, 'Alice', 25); - """ - sql """ insert into ${unpartitionedTable} values(2, 'Bob', 30) """ - sql """ insert into ${unpartitionedTable} values(3, 'Charlie', 35) """ - - log.info("Inserted initial rows into ${unpartitionedTable}") - - // Assert baseline: - // 1. DESC and SELECT * hide row lineage columns by default. - // 2. show_hidden_columns=true exposes both hidden columns in DESC and SELECT *. - // 3. Explicit SELECT on row lineage columns returns non-null values. - assertRowLineageHiddenColumns(unpartitionedTable, 3) - assertExplicitRowLineageReadable(unpartitionedTable, [1, 2, 3]) - - test { - sql """insert into ${unpartitionedTable}(_row_id, id, name, age) values (1, 9, 'BadRow', 99)""" - exception "Cannot specify row lineage column '_row_id' in INSERT statement" - } - - test { - sql """ - insert into ${unpartitionedTable}(_last_updated_sequence_number, id, name, age) - values (1, 10, 'BadSeq', 100) - """ - exception "Cannot specify row lineage column '_last_updated_sequence_number' in INSERT statement" - } - - sql """insert into ${unpartitionedTable}(id, name, age) values (4, 'Doris', 40)""" - def unpartitionedCount = sql """select count(*) from ${unpartitionedTable}""" - log.info("Checking row count after regular INSERT for ${unpartitionedTable}: result=${unpartitionedCount}") - assertEquals(4, unpartitionedCount[0][0].toString().toInteger()) - - assertCurrentFilesDoNotContainRowLineageColumns( - unpartitionedTable, - format, - "Unpartitioned normal INSERT") - - sql """drop table if exists ${partitionedTable}""" - sql """ - create table ${partitionedTable} ( - id int, - name string, - age int, - dt date - ) engine=iceberg - partition by list (day(dt)) () - properties ( - "format-version" = 
"3", - "write.format.default" = "${format}" - ) - """ - - sql """ insert into ${partitionedTable} values(11, 'Penny', 21, '2024-01-01')""" - sql """ insert into ${partitionedTable} values(12, 'Quinn', 22, '2024-01-02')""" - sql """ insert into ${partitionedTable} values(13, 'Rita', 23, '2024-01-03')""" - - log.info("Inserted initial rows into ${partitionedTable}") - - // Assert baseline: - // 1. Partitioned tables follow the same row lineage semantics as unpartitioned tables. - // 2. Explicit SELECT on _row_id remains readable under partition predicates. - // 3. Regular INSERT still rejects hidden columns and does not write them physically. - assertRowLineageHiddenColumns(partitionedTable, 4) - - def partitionLineageRows = sql """ - select id, _row_id, _last_updated_sequence_number - from ${partitionedTable} - where dt >= '2024-01-01' - order by id - """ - log.info("Checking partitioned row lineage projection for ${partitionedTable}: rows=${partitionLineageRows}") - assertEquals(3, partitionLineageRows.size()) - partitionLineageRows.each { row -> - assertTrue(row[1] != null, "_row_id should be non-null for partitioned table row=${row}") - assertTrue(row[2] != null, "_last_updated_sequence_number should be non-null for partitioned table row=${row}") - } - - def exactPartitionPredicate = sql """ - select id - from ${partitionedTable} - where dt = '2024-01-02' and _row_id = ${partitionLineageRows[1][1]} - """ - log.info("Checking exact partition + _row_id predicate for ${partitionedTable}: result=${exactPartitionPredicate}") - assertEquals(1, exactPartitionPredicate.size()) - assertEquals(12, exactPartitionPredicate[0][0].toString().toInteger()) - - test { - sql """ - insert into ${partitionedTable}(_row_id, id, name, age, dt) - values (1, 14, 'BadPartitionRow', 24, '2024-01-04') - """ - exception "Cannot specify row lineage column '_row_id' in INSERT statement" - } - - test { - sql """ - insert into ${partitionedTable}(_last_updated_sequence_number, id, name, age, 
dt) - values (1, 15, 'BadPartitionSeq', 25, '2024-01-05') - """ - exception "Cannot specify row lineage column '_last_updated_sequence_number' in INSERT statement" - } - - sql """insert into ${partitionedTable}(id, name, age, dt) values (14, 'Sara', 24, '2024-01-04')""" - def partitionedCount = sql """select count(*) from ${partitionedTable}""" - log.info("Checking row count after regular INSERT for ${partitionedTable}: result=${partitionedCount}") - assertEquals(4, partitionedCount[0][0].toString().toInteger()) - - assertCurrentFilesDoNotContainRowLineageColumns( - partitionedTable, - format, - "Partitioned normal INSERT") - } finally { - sql """drop table if exists ${partitionedTable}""" - sql """drop table if exists ${unpartitionedTable}""" - } - } - } finally { - sql """set show_hidden_columns = false""" - sql """drop database if exists ${dbName} force""" - sql """drop catalog if exists ${catalogName}""" - } -} diff --git a/regression-test/suites/external_table_p0/iceberg/test_iceberg_v3_row_lineage_update_delete_merge.groovy b/regression-test/suites/external_table_p0/iceberg/test_iceberg_v3_row_lineage_update_delete_merge.groovy deleted file mode 100644 index 4bce7387f864bb..00000000000000 --- a/regression-test/suites/external_table_p0/iceberg/test_iceberg_v3_row_lineage_update_delete_merge.groovy +++ /dev/null @@ -1,292 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -suite("test_iceberg_v3_row_lineage_update_delete_merge", "p0,external,iceberg,external_docker,external_docker_iceberg") { - String enabled = context.config.otherConfigs.get("enableIcebergTest") - if (enabled == null || !enabled.equalsIgnoreCase("true")) { - logger.info("Iceberg test is disabled") - return - } - - String catalogName = "test_iceberg_v3_row_lineage_update_delete_merge" - String dbName = "test_row_lineage_update_delete_merge_db" - String restPort = context.config.otherConfigs.get("iceberg_rest_uri_port") - String minioPort = context.config.otherConfigs.get("iceberg_minio_port") - String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") - String endpoint = "http://${externalEnvIp}:${minioPort}" - - def formats = ["parquet", "orc"] - - def schemaContainsField = { schemaRows, fieldName -> - String target = fieldName.toLowerCase() - return schemaRows.any { row -> row.toString().toLowerCase().contains(target) } - } - - def fileSchemaRows = { filePath, format -> - return sql(""" - desc function s3( - "uri" = "${filePath}", - "format" = "${format}", - "s3.access_key" = "admin", - "s3.secret_key" = "password", - "s3.endpoint" = "${endpoint}", - "s3.region" = "us-east-1" - ) - """) - } - - def assertDeleteFilesArePuffin = { tableName -> - def deleteFiles = sql(""" - select file_path, lower(file_format) - from ${tableName}\$delete_files - order by file_path - """) - log.info("Checking delete files for ${tableName}: ${deleteFiles}") - assertTrue(deleteFiles.size() > 0, "V3 table ${tableName} should produce delete files") - 
deleteFiles.each { row -> - assertTrue(row[0].toString().endsWith(".puffin"), - "V3 delete file should be Puffin: ${row}") - assertEquals("puffin", row[1].toString()) - } - } - - def assertAtLeastOneCurrentDataFileHasRowLineageColumns = { tableName, format -> - def currentFiles = sql("""select file_path, lower(file_format) from ${tableName}\$data_files order by file_path""") - log.info("Checking current data files for physical row lineage columns in ${tableName}: ${currentFiles}") - assertTrue(currentFiles.size() > 0, "Current data files should exist for ${tableName}") - - boolean found = false - currentFiles.each { row -> - assertEquals(format, row[1].toString()) - assertTrue(row[0].toString().endsWith(format == "parquet" ? ".parquet" : ".orc"), - "Current data file should match ${format} for ${tableName}, file=${row[0]}") - def schemaRows = fileSchemaRows(row[0].toString(), format) - log.info("${format} schema for ${tableName}, file=${row[0]} -> ${schemaRows}") - if (schemaContainsField(schemaRows, "_row_id") - && schemaContainsField(schemaRows, "_last_updated_sequence_number")) { - found = true - } - } - assertTrue(found, "At least one current data file should physically contain row lineage columns for ${tableName}") - } - - def assertExplicitRowLineageNonNull = { tableName, expectedRowCount -> - def rows = sql(""" - select id, _row_id, _last_updated_sequence_number - from ${tableName} - order by id - """) - log.info("Checking explicit row lineage projection for ${tableName}: rows=${rows}") - assertEquals(expectedRowCount, rows.size()) - rows.each { row -> - assertTrue(row[1] != null, "_row_id should be non-null for ${tableName}, row=${row}") - assertTrue(row[2] != null, "_last_updated_sequence_number should be non-null for ${tableName}, row=${row}") - } - } - - def lineageMap = { tableName -> - def rows = sql(""" - select id, _row_id, _last_updated_sequence_number - from ${tableName} - order by id - """) - Map> result = [:] - rows.each { row -> - 
result[row[0].toString().toInteger()] = [row[1].toString(), row[2].toString()] - } - log.info("Built lineage map for ${tableName}: ${result}") - return result - } - - sql """drop catalog if exists ${catalogName}""" - sql """ - create catalog if not exists ${catalogName} properties ( - "type" = "iceberg", - "iceberg.catalog.type" = "rest", - "uri" = "http://${externalEnvIp}:${restPort}", - "s3.access_key" = "admin", - "s3.secret_key" = "password", - "s3.endpoint" = "${endpoint}", - "s3.region" = "us-east-1" - ) - """ - - sql """switch ${catalogName}""" - sql """create database if not exists ${dbName}""" - sql """use ${dbName}""" - sql """set enable_fallback_to_original_planner = false""" - sql """set show_hidden_columns = false""" - - try { - formats.each { format -> - String updateDeleteTable = "test_row_lineage_v3_update_delete_${format}" - String mergeTable = "test_row_lineage_v3_merge_${format}" - log.info("Run row lineage update/delete/merge test with format ${format}") - - try { - sql """drop table if exists ${updateDeleteTable}""" - sql """ - create table ${updateDeleteTable} ( - id int, - name string, - age int - ) engine=iceberg - properties ( - "format-version" = "3", - "write.format.default" = "${format}" - ) - """ - - sql """insert into ${updateDeleteTable} values (1, 'Alice', 25) """ - sql """insert into ${updateDeleteTable} values (2, 'Bob', 30) """ - sql """insert into ${updateDeleteTable} values (3, 'Charlie', 35)""" - - def updateDeleteLineageBefore = lineageMap(updateDeleteTable) - log.info("Lineage before UPDATE/DELETE on ${updateDeleteTable}: ${updateDeleteLineageBefore}") - sql """update ${updateDeleteTable} set name = 'Alice_u', age = 26 where id = 1""" - sql """delete from ${updateDeleteTable} where id = 2""" - - // Assert baseline: - // 1. UPDATE keeps rows readable and applies the new values. - // 2. DELETE removes the target row. - // 3. V3 delete files use Puffin deletion vectors instead of delete_pos parquet/orc files. - // 4. 
Explicit row lineage reads remain non-null after DML. - def updateDeleteRows = sql """select * from ${updateDeleteTable} order by id""" - log.info("Checking table rows after UPDATE/DELETE on ${updateDeleteTable}: ${updateDeleteRows}") - assertEquals(2, updateDeleteRows.size()) - assertEquals(1, updateDeleteRows[0][0].toString().toInteger()) - assertEquals("Alice_u", updateDeleteRows[0][1]) - assertEquals(26, updateDeleteRows[0][2].toString().toInteger()) - assertEquals(3, updateDeleteRows[1][0].toString().toInteger()) - assertEquals("Charlie", updateDeleteRows[1][1]) - assertEquals(35, updateDeleteRows[1][2].toString().toInteger()) - - assertExplicitRowLineageNonNull(updateDeleteTable, 2) - def updateDeleteLineageAfter = lineageMap(updateDeleteTable) - log.info("Lineage after UPDATE/DELETE on ${updateDeleteTable}: ${updateDeleteLineageAfter}") - assertEquals(updateDeleteLineageBefore[1][0], updateDeleteLineageAfter[1][0]) - assertTrue(updateDeleteLineageBefore[1][1] != updateDeleteLineageAfter[1][1], - "UPDATE should change _last_updated_sequence_number for id=1") - assertTrue(updateDeleteLineageAfter[1][1].toLong() > updateDeleteLineageBefore[1][1].toLong(), - "UPDATE should advance _last_updated_sequence_number for id=1") - assertEquals(updateDeleteLineageBefore[3][0], updateDeleteLineageAfter[3][0]) - assertEquals(updateDeleteLineageBefore[3][1], updateDeleteLineageAfter[3][1]) - assertTrue(!updateDeleteLineageAfter.containsKey(2), "Deleted row id=2 should not remain after DELETE") - assertDeleteFilesArePuffin(updateDeleteTable) - assertAtLeastOneCurrentDataFileHasRowLineageColumns(updateDeleteTable, format) - - def minRowIdAfterUpdate = sql """ - select min(_row_id) - from ${updateDeleteTable} - """ - def rowIdFilterResult = sql """ - select count(*) - from ${updateDeleteTable} - where _row_id = ${minRowIdAfterUpdate[0][0]} - """ - log.info("Checking _row_id filter after UPDATE/DELETE on ${updateDeleteTable}: minRowId=${minRowIdAfterUpdate}, 
result=${rowIdFilterResult}") - assertEquals(1, rowIdFilterResult[0][0].toString().toInteger()) - - sql """drop table if exists ${mergeTable}""" - sql """ - create table ${mergeTable} ( - id int, - name string, - age int, - dt date - ) engine=iceberg - partition by list (day(dt)) () - properties ( - "format-version" = "3", - "write.format.default" = "${format}" - ) - """ - - sql """ insert into ${mergeTable} values (1, 'Penny', 21, '2024-01-01') """ - sql """ insert into ${mergeTable} values (2, 'Quinn', 22, '2024-01-02') """ - sql """ insert into ${mergeTable} values (3, 'Rita', 23, '2024-01-03') """ - - def mergeLineageBefore = lineageMap(mergeTable) - log.info("Lineage before MERGE on ${mergeTable}: ${mergeLineageBefore}") - sql """ - merge into ${mergeTable} t - using ( - select 1 as id, 'Penny_u' as name, 31 as age, date '2024-01-01' as dt, 'U' as flag - union all - select 2, 'Quinn', 22, date '2024-01-02', 'D' - union all - select 4, 'Sara', 24, date '2024-01-04', 'I' - ) s - on t.id = s.id - when matched and s.flag = 'D' then delete - when matched then update set - name = s.name, - age = s.age - when not matched then insert (id, name, age, dt) - values (s.id, s.name, s.age, s.dt) - """ - - // Assert baseline: - // 1. MERGE applies DELETE, UPDATE, and INSERT actions in one statement. - // 2. The partitioned MERGE still writes Puffin deletion vectors. - // 3. At least one current data file written by MERGE contains physical row lineage columns. 
- def mergeRows = sql """select * from ${mergeTable} order by id""" - log.info("Checking table rows after MERGE on ${mergeTable}: ${mergeRows}") - assertEquals(3, mergeRows.size()) - assertEquals(1, mergeRows[0][0].toString().toInteger()) - assertEquals("Penny_u", mergeRows[0][1]) - assertEquals(31, mergeRows[0][2].toString().toInteger()) - assertEquals(3, mergeRows[1][0].toString().toInteger()) - assertEquals("Rita", mergeRows[1][1]) - assertEquals(23, mergeRows[1][2].toString().toInteger()) - assertEquals(4, mergeRows[2][0].toString().toInteger()) - assertEquals("Sara", mergeRows[2][1]) - assertEquals(24, mergeRows[2][2].toString().toInteger()) - - assertExplicitRowLineageNonNull(mergeTable, 3) - def mergeLineageAfter = lineageMap(mergeTable) - log.info("Lineage after MERGE on ${mergeTable}: ${mergeLineageAfter}") - assertEquals(mergeLineageBefore[1][0], mergeLineageAfter[1][0]) - assertTrue(mergeLineageBefore[1][1] != mergeLineageAfter[1][1], - "MERGE UPDATE should change _last_updated_sequence_number for id=1") - assertTrue(mergeLineageAfter[1][1].toLong() > mergeLineageBefore[1][1].toLong(), - "MERGE UPDATE should advance _last_updated_sequence_number for id=1") - assertEquals(mergeLineageBefore[3][0], mergeLineageAfter[3][0]) - assertEquals(mergeLineageBefore[3][1], mergeLineageAfter[3][1]) - assertTrue(!mergeLineageAfter.containsKey(2), "MERGE DELETE should remove id=2") - assertDeleteFilesArePuffin(mergeTable) - assertAtLeastOneCurrentDataFileHasRowLineageColumns(mergeTable, format) - - def insertedRowLineage = sql """ - select _row_id, _last_updated_sequence_number - from ${mergeTable} - where id = 4 - """ - log.info("Checking inserted MERGE row lineage for ${mergeTable}: ${insertedRowLineage}") - assertEquals(1, insertedRowLineage.size()) - assertTrue(insertedRowLineage[0][0] != null, "Inserted MERGE row should get generated _row_id") - assertTrue(insertedRowLineage[0][1] != null, "Inserted MERGE row should get generated _last_updated_sequence_number") - 
} finally { - sql """drop table if exists ${mergeTable}""" - sql """drop table if exists ${updateDeleteTable}""" - } - } - } finally { - sql """drop database if exists ${dbName} force""" - sql """drop catalog if exists ${catalogName}""" - } -}