diff --git a/CMakeLists.txt b/CMakeLists.txt index 0dd40437f..0eec8fe90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -353,52 +353,52 @@ set(DUCKDB_SRC_FILES src/duckdb/third_party/zstd/dict/divsufsort.cpp src/duckdb/third_party/zstd/dict/fastcover.cpp src/duckdb/third_party/zstd/dict/zdict.cpp - src/duckdb/extension/core_functions/function_list.cpp - src/duckdb/extension/core_functions/lambda_functions.cpp src/duckdb/extension/core_functions/core_functions_extension.cpp - src/duckdb/ub_extension_core_functions_scalar_blob.cpp + src/duckdb/extension/core_functions/lambda_functions.cpp + src/duckdb/extension/core_functions/function_list.cpp + src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp + src/duckdb/ub_extension_core_functions_aggregate_nested.cpp + src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp + src/duckdb/ub_extension_core_functions_aggregate_algebraic.cpp + src/duckdb/ub_extension_core_functions_aggregate_regression.cpp + src/duckdb/extension/core_functions/scalar/bit/bitstring.cpp + src/duckdb/ub_extension_core_functions_scalar_generic.cpp src/duckdb/extension/core_functions/scalar/enum/enum_functions.cpp - src/duckdb/ub_extension_core_functions_scalar_date.cpp + src/duckdb/ub_extension_core_functions_scalar_debug.cpp + src/duckdb/extension/core_functions/scalar/math/numeric.cpp src/duckdb/ub_extension_core_functions_scalar_array.cpp - src/duckdb/ub_extension_core_functions_scalar_struct.cpp - src/duckdb/ub_extension_core_functions_scalar_list.cpp src/duckdb/ub_extension_core_functions_scalar_union.cpp - src/duckdb/extension/core_functions/scalar/bit/bitstring.cpp - src/duckdb/ub_extension_core_functions_scalar_debug.cpp + src/duckdb/ub_extension_core_functions_scalar_blob.cpp + src/duckdb/ub_extension_core_functions_scalar_list.cpp src/duckdb/ub_extension_core_functions_scalar_map.cpp - src/duckdb/ub_extension_core_functions_scalar_generic.cpp + src/duckdb/ub_extension_core_functions_scalar_date.cpp 
src/duckdb/extension/core_functions/scalar/operators/bitwise.cpp src/duckdb/ub_extension_core_functions_scalar_random.cpp - src/duckdb/extension/core_functions/scalar/math/numeric.cpp src/duckdb/ub_extension_core_functions_scalar_string.cpp - src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp - src/duckdb/ub_extension_core_functions_aggregate_nested.cpp - src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp - src/duckdb/ub_extension_core_functions_aggregate_regression.cpp - src/duckdb/ub_extension_core_functions_aggregate_algebraic.cpp + src/duckdb/ub_extension_core_functions_scalar_struct.cpp src/duckdb/extension/parquet/parquet_reader.cpp - src/duckdb/extension/parquet/parquet_geometry.cpp - src/duckdb/extension/parquet/parquet_extension.cpp - src/duckdb/extension/parquet/parquet_timestamp.cpp - src/duckdb/extension/parquet/serialize_parquet.cpp src/duckdb/extension/parquet/parquet_statistics.cpp + src/duckdb/extension/parquet/zstd_file_system.cpp src/duckdb/extension/parquet/parquet_file_metadata_cache.cpp - src/duckdb/extension/parquet/parquet_multi_file_info.cpp src/duckdb/extension/parquet/parquet_shredding.cpp + src/duckdb/extension/parquet/parquet_multi_file_info.cpp + src/duckdb/extension/parquet/parquet_metadata.cpp + src/duckdb/extension/parquet/parquet_timestamp.cpp + src/duckdb/extension/parquet/parquet_crypto.cpp + src/duckdb/extension/parquet/serialize_parquet.cpp src/duckdb/extension/parquet/parquet_column_schema.cpp - src/duckdb/extension/parquet/zstd_file_system.cpp + src/duckdb/extension/parquet/parquet_extension.cpp src/duckdb/extension/parquet/parquet_field_id.cpp + src/duckdb/extension/parquet/column_writer.cpp src/duckdb/extension/parquet/column_reader.cpp - src/duckdb/extension/parquet/parquet_float16.cpp src/duckdb/extension/parquet/parquet_writer.cpp - src/duckdb/extension/parquet/column_writer.cpp - src/duckdb/extension/parquet/parquet_crypto.cpp - src/duckdb/extension/parquet/parquet_metadata.cpp - 
src/duckdb/ub_extension_parquet_writer.cpp - src/duckdb/ub_extension_parquet_writer_variant.cpp + src/duckdb/extension/parquet/parquet_float16.cpp + src/duckdb/extension/parquet/parquet_geometry.cpp src/duckdb/ub_extension_parquet_reader.cpp src/duckdb/ub_extension_parquet_reader_variant.cpp src/duckdb/ub_extension_parquet_decoder.cpp + src/duckdb/ub_extension_parquet_writer.cpp + src/duckdb/ub_extension_parquet_writer_variant.cpp src/duckdb/third_party/parquet/parquet_types.cpp src/duckdb/third_party/thrift/thrift/protocol/TProtocol.cpp src/duckdb/third_party/thrift/thrift/transport/TTransportException.cpp @@ -437,32 +437,32 @@ set(DUCKDB_SRC_FILES src/duckdb/third_party/brotli/enc/metablock.cpp src/duckdb/third_party/brotli/enc/static_dict.cpp src/duckdb/third_party/brotli/enc/utf8_util.cpp - src/duckdb/extension/icu/./icu-current.cpp - src/duckdb/extension/icu/./icu-timezone.cpp - src/duckdb/extension/icu/./icu-datetrunc.cpp - src/duckdb/extension/icu/./icu-dateadd.cpp - src/duckdb/extension/icu/./icu-list-range.cpp + src/duckdb/extension/icu/./icu-timebucket.cpp src/duckdb/extension/icu/./icu-datepart.cpp + src/duckdb/extension/icu/./icu-datetrunc.cpp + src/duckdb/extension/icu/./icu-current.cpp + src/duckdb/extension/icu/./icu_extension.cpp src/duckdb/extension/icu/./icu-table-range.cpp - src/duckdb/extension/icu/./icu-timebucket.cpp - src/duckdb/extension/icu/./icu-datesub.cpp - src/duckdb/extension/icu/./icu-makedate.cpp src/duckdb/extension/icu/./icu-datefunc.cpp - src/duckdb/extension/icu/./icu_extension.cpp + src/duckdb/extension/icu/./icu-makedate.cpp + src/duckdb/extension/icu/./icu-dateadd.cpp + src/duckdb/extension/icu/./icu-timezone.cpp src/duckdb/extension/icu/./icu-strptime.cpp + src/duckdb/extension/icu/./icu-datesub.cpp + src/duckdb/extension/icu/./icu-list-range.cpp src/duckdb/ub_extension_icu_third_party_icu_common.cpp src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp + 
src/duckdb/extension/json/json_reader.cpp + src/duckdb/extension/json/json_functions.cpp + src/duckdb/extension/json/json_deserializer.cpp src/duckdb/extension/json/json_serializer.cpp + src/duckdb/extension/json/json_scan.cpp src/duckdb/extension/json/serialize_json.cpp - src/duckdb/extension/json/json_extension.cpp src/duckdb/extension/json/json_enums.cpp - src/duckdb/extension/json/json_scan.cpp - src/duckdb/extension/json/json_common.cpp - src/duckdb/extension/json/json_functions.cpp + src/duckdb/extension/json/json_extension.cpp src/duckdb/extension/json/json_multi_file_info.cpp - src/duckdb/extension/json/json_deserializer.cpp - src/duckdb/extension/json/json_reader.cpp + src/duckdb/extension/json/json_common.cpp src/duckdb/ub_extension_json_json_functions.cpp src/duckdb/generated_extension_loader_package_build.cpp) diff --git a/src/duckdb/extension/core_functions/aggregate/nested/binned_histogram.cpp b/src/duckdb/extension/core_functions/aggregate/nested/binned_histogram.cpp index d830d4bcf..267dba5ae 100644 --- a/src/duckdb/extension/core_functions/aggregate/nested/binned_histogram.cpp +++ b/src/duckdb/extension/core_functions/aggregate/nested/binned_histogram.cpp @@ -266,6 +266,19 @@ void IsHistogramOtherBinFunction(DataChunk &args, ExpressionState &state, Vector auto v = OtherBucketValue(input_type); Vector ref(v); VectorOperations::NotDistinctFrom(args.data[0], ref, result, args.size()); + + // Set NULL if input is NULL. 
+ UnifiedVectorFormat input_data; + args.data[0].ToUnifiedFormat(args.size(), input_data); + if (!input_data.validity.AllValid()) { + auto &result_validity = FlatVector::Validity(result); + for (idx_t idx = 0; idx < args.size(); ++idx) { + auto input_idx = input_data.sel->get_index(idx); + if (!input_data.validity.RowIsValid(input_idx)) { + result_validity.SetInvalid(idx); + } + } + } } template diff --git a/src/duckdb/src/common/types/row/tuple_data_collection.cpp b/src/duckdb/src/common/types/row/tuple_data_collection.cpp index 6301d3170..92c631283 100644 --- a/src/duckdb/src/common/types/row/tuple_data_collection.cpp +++ b/src/duckdb/src/common/types/row/tuple_data_collection.cpp @@ -40,6 +40,10 @@ TupleDataCollection::~TupleDataCollection() { void TupleDataCollection::Initialize() { D_ASSERT(!layout.GetTypes().empty()); + if (TuplesPerBlock() == 0) { + throw NotImplementedException("Too many columns: tuple width exceeds block size of %llu", + allocator->GetBufferManager().GetBlockSize()); + } this->count = 0; this->data_size = 0; if (layout.IsSortKeyLayout()) { diff --git a/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 078d04b77..3e824fd4c 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -1510,24 +1510,27 @@ void StringValueScanner::ProcessOverBufferValue() { } else { value = string_t(over_buffer_string.c_str(), UnsafeNumericCast(over_buffer_string.size())); if (result.escaped) { - if (result.parse_chunk.data[result.chunk_col_id].GetType() != LogicalType::VARCHAR) { - // We cant have escapes on non varchar columns - result.current_errors.Insert(CAST_ERROR, result.cur_col_id, result.chunk_col_id, - result.last_position); - if (!result.state_machine.options.IgnoreErrors()) { - // We have to write the cast error 
message. - std::ostringstream error; - // Casting Error Message - error << "Could not convert string \"" - << std::string(over_buffer_string.c_str(), over_buffer_string.size()) << "\" to \'" - << LogicalTypeIdToString(result.parse_types[result.chunk_col_id].type_id) << "\'"; - auto error_string = error.str(); - FullLinePosition::SanitizeError(error_string); - result.current_errors.ModifyErrorMessageOfLastError(error_string); + if (result.cur_col_id >= result.number_of_columns && + !result.state_machine.state_machine_options.strict_mode.GetValue()) { + result.used_unstrictness = true; + } else if (!result.HandleTooManyColumnsError(over_buffer_string.c_str(), over_buffer_string.size())) { + if (result.parse_chunk.data[result.chunk_col_id].GetType() != LogicalType::VARCHAR) { + // We cant have escapes on non varchar columns + result.current_errors.Insert(CAST_ERROR, result.cur_col_id, result.chunk_col_id, + result.last_position); + if (!result.state_machine.options.IgnoreErrors()) { + // We have to write the cast error message. 
+ std::ostringstream error; + // Casting Error Message + error << "Could not convert string \"" + << std::string(over_buffer_string.c_str(), over_buffer_string.size()) << "\" to \'" + << LogicalTypeIdToString(result.parse_types[result.chunk_col_id].type_id) << "\'"; + auto error_string = error.str(); + FullLinePosition::SanitizeError(error_string); + result.current_errors.ModifyErrorMessageOfLastError(error_string); + } + return; } - return; - } - if (!result.HandleTooManyColumnsError(over_buffer_string.c_str(), over_buffer_string.size())) { value = RemoveEscape(over_buffer_string.c_str(), over_buffer_string.size(), state_machine->dialect_options.state_machine_options.escape.GetValue(), diff --git a/src/duckdb/src/execution/operator/join/physical_hash_join.cpp b/src/duckdb/src/execution/operator/join/physical_hash_join.cpp index ebef663af..9c0942669 100644 --- a/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +++ b/src/duckdb/src/execution/operator/join/physical_hash_join.cpp @@ -418,13 +418,10 @@ static bool FinalizeSingleThreaded(const HashJoinGlobalSinkState &sink, const bo } static idx_t GetTupleWidth(const vector &types, bool &all_constant) { - idx_t tuple_width = 0; - all_constant = true; - for (auto &type : types) { - tuple_width += GetTypeIdSize(type.InternalType()); - all_constant &= TypeIsConstantSize(type.InternalType()); - } - return tuple_width + AlignValue(types.size()) / 8 + GetTypeIdSize(PhysicalType::UINT64); + TupleDataLayout layout; + layout.Initialize(types, TupleDataValidityType::CAN_HAVE_NULL_VALUES); + all_constant = layout.AllConstant(); + return layout.GetRowWidth(); } static idx_t GetPartitioningSpaceRequirement(ClientContext &context, const vector &types, @@ -433,7 +430,11 @@ static idx_t GetPartitioningSpaceRequirement(ClientContext &context, const vecto bool all_constant; idx_t tuple_width = GetTupleWidth(types, all_constant); - auto tuples_per_block = buffer_manager.GetBlockSize() / tuple_width + 1; + if (tuple_width == 
0) { + throw InternalException("GetPartitioningSpaceRequirement: tuple width should not be 0"); + } + + auto tuples_per_block = MaxValue(buffer_manager.GetBlockSize() / tuple_width, 1); auto blocks_per_chunk = (STANDARD_VECTOR_SIZE + tuples_per_block) / tuples_per_block + 1; if (!all_constant) { blocks_per_chunk += 2; diff --git a/src/duckdb/src/function/table/table_scan.cpp b/src/duckdb/src/function/table/table_scan.cpp index 4d24e2110..83bab5aef 100644 --- a/src/duckdb/src/function/table/table_scan.cpp +++ b/src/duckdb/src/function/table/table_scan.cpp @@ -13,6 +13,7 @@ #include "duckdb/function/table_function.hpp" #include "duckdb/main/attached_database.hpp" #include "duckdb/main/client_config.hpp" +#include "duckdb/main/database.hpp" #include "duckdb/planner/expression.hpp" #include "duckdb/planner/expression/bound_columnref_expression.hpp" #include "duckdb/planner/operator/logical_get.hpp" @@ -29,6 +30,7 @@ #include "duckdb/planner/filter/conjunction_filter.hpp" #include "duckdb/common/types/value_map.hpp" #include "duckdb/main/settings.hpp" +#include "duckdb/transaction/duck_transaction_manager.hpp" namespace duckdb { @@ -115,6 +117,9 @@ class DuckIndexScanState : public TableScanGlobalState { bool started_last_phase; //! Synchronize changes to the global index scan state. mutex index_scan_lock; + //! Synchronize > when vacuum_rebuild_indexes is enabled (since + //! ART indexes are rebuilt during vacuuming with this setting). 
+ unique_ptr vacuum_lock; public: unique_ptr InitLocalState(ExecutionContext &context, @@ -407,8 +412,10 @@ unique_ptr DuckTableScanInitGlobal(ClientContext &cont } unique_ptr DuckIndexScanInitGlobal(ClientContext &context, TableFunctionInitInput &input, - const TableScanBindData &bind_data, set &row_ids) { + const TableScanBindData &bind_data, set &row_ids, + unique_ptr vacuum_lock) { auto g_state = make_uniq(context, input.bind_data.get()); + g_state->vacuum_lock = std::move(vacuum_lock); g_state->finished_first_phase = row_ids.empty() ? true : false; g_state->started_last_phase = false; @@ -693,6 +700,17 @@ unique_ptr TableScanInitGlobal(ClientContext &context, bool index_scan = false; set row_ids; + // If vacuum_rebuild_indexes is enabled, grab a shared vacuum lock before + // scanning the index. This prevents the checkpoint from rebuilding the index and swapping + // row groups while we hold row IDs from the ART, ensuring we always see a consistent + // pairing. + unique_ptr vacuum_lock; + auto &db = DatabaseInstance::GetDatabase(context); + if (Settings::Get(db) > 0) { + auto &transaction_manager = DuckTransactionManager::Get(storage.GetAttached()); + vacuum_lock = transaction_manager.SharedVacuumLock(); + } + info->BindIndexes(context, ART::TYPE_NAME); for (auto &entry : indexes.IndexEntries()) { auto &index = *entry.index; @@ -711,7 +729,7 @@ unique_ptr TableScanInitGlobal(ClientContext &context, if (!index_scan) { return DuckTableScanInitGlobal(context, input, storage, bind_data); } - return DuckIndexScanInitGlobal(context, input, bind_data, row_ids); + return DuckIndexScanInitGlobal(context, input, bind_data, row_ids, std::move(vacuum_lock)); } static unique_ptr TableScanStatistics(ClientContext &context, TableFunctionGetStatisticsInput &input) { diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index bf7fe6b04..79ca65f3c 100644 --- 
a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "2-dev419" +#define DUCKDB_PATCH_VERSION "2-dev458" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 5 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.5.2-dev419" +#define DUCKDB_VERSION "v1.5.2-dev458" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "c241da23ae" +#define DUCKDB_SOURCE_ID "17491eb887" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp b/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp index 9bc40582c..9b10cf228 100644 --- a/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp @@ -123,7 +123,9 @@ class BoundIndex : public Index { //! Deletes all data from the index. The lock obtained from InitializeLock must be held virtual void CommitDrop(IndexLock &index_lock) = 0; + //! Deletes all data from the index + // FIXME: we can rename this to ResetStorage(). void CommitDrop() override; //! Delete a chunk of entries from the index. The lock obtained from InitializeLock must be held. //! Returns the amount of rows successfully deleted from the index. 
diff --git a/src/duckdb/src/include/duckdb/main/extension_entries.hpp b/src/duckdb/src/include/duckdb/main/extension_entries.hpp index 6ad14a255..5503716d7 100644 --- a/src/duckdb/src/include/duckdb/main/extension_entries.hpp +++ b/src/duckdb/src/include/duckdb/main/extension_entries.hpp @@ -477,7 +477,9 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"mismatches", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"mode", "core_functions", CatalogType::AGGREGATE_FUNCTION_ENTRY}, {"mysql_clear_cache", "mysql_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, + {"mysql_debug_execution_plan", "mysql_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, {"mysql_execute", "mysql_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, + {"mysql_explain_federated", "mysql_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, {"mysql_query", "mysql_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, {"nanosecond", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"netmask", "inet", CatalogType::SCALAR_FUNCTION_ENTRY}, @@ -514,6 +516,7 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"pi", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"position", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"postgres_attach", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, + {"postgres_configure_pool", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, {"postgres_execute", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, {"postgres_query", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, {"postgres_scan", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, @@ -538,6 +541,7 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"read_ndjson_auto", "json", CatalogType::TABLE_FUNCTION_ENTRY}, {"read_ndjson_objects", "json", CatalogType::TABLE_FUNCTION_ENTRY}, {"read_parquet", "parquet", CatalogType::TABLE_FUNCTION_ENTRY}, + {"read_postgres_binary", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, 
{"read_xlsx", "excel", CatalogType::TABLE_FUNCTION_ENTRY}, {"reduce", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"reduce_sql_statement", "sqlsmith", CatalogType::TABLE_FUNCTION_ENTRY}, @@ -1080,20 +1084,33 @@ static constexpr ExtensionEntry EXTENSION_SETTINGS[] = { {"iceberg_test_force_token_expiry", "iceberg"}, {"iceberg_via_aws_sdk_for_catalog_interactions", "iceberg"}, {"merge_http_secret_into_s3_request", "httpfs"}, + {"mysql_adaptive_replan_enabled", "mysql_scanner"}, + {"mysql_aggregate_pushdown_enabled", "mysql_scanner"}, {"mysql_bit1_as_boolean", "mysql_scanner"}, {"mysql_compression_aware_costs", "mysql_scanner"}, {"mysql_compression_ratio", "mysql_scanner"}, {"mysql_debug_show_queries", "mysql_scanner"}, {"mysql_enable_transactions", "mysql_scanner"}, {"mysql_experimental_filter_pushdown", "mysql_scanner"}, + {"mysql_explain_validation_enabled", "mysql_scanner"}, + {"mysql_hint_injection_enabled", "mysql_scanner"}, + {"mysql_hint_staleness_threshold", "mysql_scanner"}, {"mysql_incomplete_dates_as_nulls", "mysql_scanner"}, + {"mysql_order_pushdown_enabled", "mysql_scanner"}, {"mysql_pool_acquire_mode", "mysql_scanner"}, + {"mysql_pool_connection_idle_timeout_millis", "mysql_scanner"}, + {"mysql_pool_connection_max_lifetime_millis", "mysql_scanner"}, + {"mysql_pool_enable_reaper_thread", "mysql_scanner"}, + {"mysql_pool_enable_thread_local_cache", "mysql_scanner"}, {"mysql_pool_size", "mysql_scanner"}, - {"mysql_pool_timeout_ms", "mysql_scanner"}, + {"mysql_pool_wait_timeout_millis", "mysql_scanner"}, {"mysql_push_threshold_no_index", "mysql_scanner"}, {"mysql_push_threshold_with_index", "mysql_scanner"}, + {"mysql_query_timeout_enabled", "mysql_scanner"}, + {"mysql_query_timeout_max_ms", "mysql_scanner"}, + {"mysql_query_timeout_min_ms", "mysql_scanner"}, {"mysql_session_time_zone", "mysql_scanner"}, - {"mysql_thread_local_cache", "mysql_scanner"}, + {"mysql_sql_buffer_result", "mysql_scanner"}, {"mysql_time_as_time", "mysql_scanner"}, 
{"mysql_tinyint1_as_boolean", "mysql_scanner"}, {"parquet_metadata_cache", "parquet"}, @@ -1102,8 +1119,17 @@ static constexpr ExtensionEntry EXTENSION_SETTINGS[] = { {"pg_connection_limit", "postgres_scanner"}, {"pg_debug_show_queries", "postgres_scanner"}, {"pg_experimental_filter_pushdown", "postgres_scanner"}, + {"pg_idle_in_transaction_timeout_millis", "postgres_scanner"}, {"pg_null_byte_replacement", "postgres_scanner"}, {"pg_pages_per_task", "postgres_scanner"}, + {"pg_pool_enable_reaper_thread", "postgres_scanner"}, + {"pg_pool_enable_thread_local_cache", "postgres_scanner"}, + {"pg_pool_health_check_query", "postgres_scanner"}, + {"pg_pool_idle_timeout_millis", "postgres_scanner"}, + {"pg_pool_max_connections", "postgres_scanner"}, + {"pg_pool_max_lifetime_millis", "postgres_scanner"}, + {"pg_pool_wait_timeout_millis", "postgres_scanner"}, + {"pg_statement_timeout_millis", "postgres_scanner"}, {"pg_use_binary_copy", "postgres_scanner"}, {"pg_use_ctid_scan", "postgres_scanner"}, {"pg_use_text_protocol", "postgres_scanner"}, diff --git a/src/duckdb/src/include/duckdb/main/settings.hpp b/src/duckdb/src/include/duckdb/main/settings.hpp index d1277a9e1..cb339df2e 100644 --- a/src/duckdb/src/include/duckdb/main/settings.hpp +++ b/src/duckdb/src/include/duckdb/main/settings.hpp @@ -1547,6 +1547,19 @@ struct UsernameSetting { static constexpr idx_t SettingIndex = 89; }; +struct VacuumRebuildIndexesSetting { + using RETURN_TYPE = idx_t; + static constexpr const char *Name = "vacuum_rebuild_indexes"; + static constexpr const char *Description = + "(Experimental) Allow vacuum to compact row groups on tables with bound ART indexes, rebuilding the indexes " + "afterward. Tables with a row count exceeding this threshold are skipped. 
0 = disabled."; + static constexpr const char *InputType = "UBIGINT"; + static constexpr const char *DefaultValue = "0"; + static constexpr SettingScopeTarget Scope = SettingScopeTarget::GLOBAL_DEFAULT; + static constexpr idx_t SettingIndex = 90; + static void OnSet(SettingCallbackInfo &info, Value &input); +}; + struct ValidateExternalFileCacheSetting { using RETURN_TYPE = CacheValidationMode; static constexpr const char *Name = "validate_external_file_cache"; @@ -1556,7 +1569,7 @@ struct ValidateExternalFileCacheSetting { static constexpr const char *InputType = "VARCHAR"; static constexpr const char *DefaultValue = "VALIDATE_ALL"; static constexpr SettingScopeTarget Scope = SettingScopeTarget::GLOBAL_DEFAULT; - static constexpr idx_t SettingIndex = 90; + static constexpr idx_t SettingIndex = 91; static void OnSet(SettingCallbackInfo &info, Value &input); }; @@ -1568,7 +1581,7 @@ struct VariantMinimumShreddingSizeSetting { static constexpr const char *InputType = "BIGINT"; static constexpr const char *DefaultValue = "30000"; static constexpr SettingScopeTarget Scope = SettingScopeTarget::GLOBAL_ONLY; - static constexpr idx_t SettingIndex = 91; + static constexpr idx_t SettingIndex = 92; }; struct WalAutocheckpointEntriesSetting { @@ -1579,7 +1592,7 @@ struct WalAutocheckpointEntriesSetting { static constexpr const char *InputType = "UBIGINT"; static constexpr const char *DefaultValue = "0"; static constexpr SettingScopeTarget Scope = SettingScopeTarget::GLOBAL_DEFAULT; - static constexpr idx_t SettingIndex = 92; + static constexpr idx_t SettingIndex = 93; }; struct WarningsAsErrorsSetting { @@ -1589,7 +1602,7 @@ struct WarningsAsErrorsSetting { static constexpr const char *InputType = "BOOLEAN"; static constexpr const char *DefaultValue = "false"; static constexpr SettingScopeTarget Scope = SettingScopeTarget::GLOBAL_ONLY; - static constexpr idx_t SettingIndex = 93; + static constexpr idx_t SettingIndex = 94; static void OnSet(SettingCallbackInfo &info, Value 
&input); }; @@ -1601,7 +1614,7 @@ struct WriteBufferRowGroupCountSetting { static constexpr const char *InputType = "UBIGINT"; static constexpr const char *DefaultValue = "5"; static constexpr SettingScopeTarget Scope = SettingScopeTarget::GLOBAL_DEFAULT; - static constexpr idx_t SettingIndex = 94; + static constexpr idx_t SettingIndex = 95; }; struct ZstdMinStringLengthSetting { @@ -1612,11 +1625,11 @@ struct ZstdMinStringLengthSetting { static constexpr const char *InputType = "UBIGINT"; static constexpr const char *DefaultValue = "4096"; static constexpr SettingScopeTarget Scope = SettingScopeTarget::GLOBAL_ONLY; - static constexpr idx_t SettingIndex = 95; + static constexpr idx_t SettingIndex = 96; }; struct GeneratedSettingInfo { - static constexpr idx_t MaxSettingIndex = 96; + static constexpr idx_t MaxSettingIndex = 97; }; //===----------------------------------------------------------------------===// diff --git a/src/duckdb/src/include/duckdb/storage/checkpoint/table_data_writer.hpp b/src/duckdb/src/include/duckdb/storage/checkpoint/table_data_writer.hpp index 0dc897b42..d457692c7 100644 --- a/src/duckdb/src/include/duckdb/storage/checkpoint/table_data_writer.hpp +++ b/src/duckdb/src/include/duckdb/storage/checkpoint/table_data_writer.hpp @@ -45,6 +45,12 @@ class TableDataWriter { void SetRowGroupCount(optional_idx row_group_count_p) { row_group_count = row_group_count_p; } + bool GetRebuildIndexes() const { + return rebuild_indexes; + } + void SetRebuildIndexes() { + rebuild_indexes = true; + } DatabaseInstance &GetDatabase(); unique_ptr CreateTaskExecutor(); @@ -56,6 +62,7 @@ class TableDataWriter { vector row_group_pointers; optional_idx row_group_count; + bool rebuild_indexes = false; }; class SingleFileTableDataWriter : public TableDataWriter { diff --git a/src/duckdb/src/include/duckdb/storage/data_table.hpp b/src/duckdb/src/include/duckdb/storage/data_table.hpp index 11902c01a..0fa52bd8e 100644 --- 
a/src/duckdb/src/include/duckdb/storage/data_table.hpp +++ b/src/duckdb/src/include/duckdb/storage/data_table.hpp @@ -297,6 +297,9 @@ class DataTable : public enable_shared_from_this { void InitializeScanWithOffset(DuckTransaction &transaction, TableScanState &state, const vector &column_ids, idx_t start_row, idx_t end_row); + //! Rebuild all indexes after vacuuming changed rowid's (used with vacuum_rebuild_indexes setting). + void RebuildIndexes(); + void VerifyForeignKeyConstraint(optional_ptr storage, const BoundForeignKeyConstraint &bound_foreign_key, ClientContext &context, DataChunk &chunk, VerifyExistenceType type); diff --git a/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp b/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp index 3e4354d9a..b0d512bef 100644 --- a/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp @@ -91,6 +91,8 @@ class TableIndexList { lock_guard lock(index_entries_lock); return unbound_count != 0; } + //! Returns the set of distinct index types across all bound indexes. + unordered_set DistinctIndexTypes() const; //! Overwrite this list with the other list. 
void Move(TableIndexList &other) { D_ASSERT(index_entries.empty()); diff --git a/src/duckdb/src/main/config.cpp b/src/duckdb/src/main/config.cpp index 479be82c4..f5f56660b 100644 --- a/src/duckdb/src/main/config.cpp +++ b/src/duckdb/src/main/config.cpp @@ -199,6 +199,7 @@ static const ConfigurationOption internal_options[] = { DUCKDB_SETTING_CALLBACK(TempFileEncryptionSetting), DUCKDB_GLOBAL(ThreadsSetting), DUCKDB_SETTING(UsernameSetting), + DUCKDB_SETTING_CALLBACK(VacuumRebuildIndexesSetting), DUCKDB_SETTING_CALLBACK(ValidateExternalFileCacheSetting), DUCKDB_SETTING(VariantMinimumShreddingSizeSetting), DUCKDB_SETTING(WalAutocheckpointEntriesSetting), diff --git a/src/duckdb/src/main/settings/custom_settings.cpp b/src/duckdb/src/main/settings/custom_settings.cpp index e762e54e2..ba82c71d5 100644 --- a/src/duckdb/src/main/settings/custom_settings.cpp +++ b/src/duckdb/src/main/settings/custom_settings.cpp @@ -728,6 +728,15 @@ void DuckDBAPISetting::OnSet(SettingCallbackInfo &info, Value &input) { } } +//===----------------------------------------------------------------------===// +// Vacuum Rebuild Indexes +//===----------------------------------------------------------------------===// +void VacuumRebuildIndexesSetting::OnSet(SettingCallbackInfo &info, Value &input) { + if (info.db || info.context) { + throw InvalidInputException("Cannot change vacuum_rebuild_indexes setting while database is running"); + } +} + //===----------------------------------------------------------------------===// // Enable External Access //===----------------------------------------------------------------------===// diff --git a/src/duckdb/src/optimizer/topn_window_elimination.cpp b/src/duckdb/src/optimizer/topn_window_elimination.cpp index 8828893a3..56954e9c5 100644 --- a/src/duckdb/src/optimizer/topn_window_elimination.cpp +++ b/src/duckdb/src/optimizer/topn_window_elimination.cpp @@ -1078,7 +1078,17 @@ unique_ptr TopNWindowElimination::TryPrepareLateMaterialization if 
(op.HasProjectionMap()) { auto &filter = op.Cast(); for (const auto rowid_idx : rhs_rowid_idxs) { - filter.projection_map.push_back(rowid_idx); + // The rowid_idx is the index into the rhs_get.column_ids, + // not the index of the rhs_get schema. + auto schema_idx = rowid_idx; + if (last_table_idx == rhs_get.table_index && !rhs_get.projection_ids.empty()) { + for (schema_idx = 0; schema_idx < rhs_get.projection_ids.size(); ++schema_idx) { + if (rhs_get.projection_ids[schema_idx] == rowid_idx) { + break; + } + } + } + filter.projection_map.push_back(schema_idx); } } break; diff --git a/src/duckdb/src/parallel/executor.cpp b/src/duckdb/src/parallel/executor.cpp index 13de3b403..bc8b75723 100644 --- a/src/duckdb/src/parallel/executor.cpp +++ b/src/duckdb/src/parallel/executor.cpp @@ -542,7 +542,9 @@ void Executor::AddToBeRescheduled(shared_ptr &task_p) { if (to_be_rescheduled_tasks.find(task_p.get()) != to_be_rescheduled_tasks.end()) { return; } - to_be_rescheduled_tasks[task_p.get()] = std::move(task_p); + // Save raw pointer before move — evaluation order of operator[] key and assignment value is unspecified pre-C++17 + auto raw_ptr = task_p.get(); + to_be_rescheduled_tasks[raw_ptr] = std::move(task_p); } bool Executor::ExecutionIsFinished() { diff --git a/src/duckdb/src/planner/operator/logical_update.cpp b/src/duckdb/src/planner/operator/logical_update.cpp index 36ceaad5f..c63bf7019 100644 --- a/src/duckdb/src/planner/operator/logical_update.cpp +++ b/src/duckdb/src/planner/operator/logical_update.cpp @@ -56,8 +56,9 @@ void LogicalUpdate::RewriteInPlaceUpdates(LogicalOperator &update_op) { return; } auto needs_reinsert = false; - for (auto &col : update.table.GetColumns().GetColumnTypes()) { - if (!col.SupportsRegularUpdate()) { + for (auto &col_idx : update.columns) { + auto &column = update.table.GetColumns().GetColumn(col_idx); + if (!column.Type().SupportsRegularUpdate()) { needs_reinsert = true; break; } diff --git a/src/duckdb/src/storage/data_table.cpp 
b/src/duckdb/src/storage/data_table.cpp index 230573941..6c6c3d8bc 100644 --- a/src/duckdb/src/storage/data_table.cpp +++ b/src/duckdb/src/storage/data_table.cpp @@ -358,6 +358,63 @@ void DataTable::VacuumIndexes() { } } +void DataTable::RebuildIndexes() { + auto &indexes = info->indexes; + auto &types = row_groups->GetTypes(); + + for (auto &index : indexes.Indexes()) { + if (!index.IsBound()) { + throw InternalException("RebuildIndexes expects all indexes to be bound during checkpoint"); + } + auto &bound_index = index.Cast(); + bound_index.CommitDrop(); + + auto &col_ids = bound_index.GetColumnIds(); + + vector scan_column_ids; + vector scan_types; + for (auto col_id : col_ids) { + scan_column_ids.emplace_back(col_id); + scan_types.push_back(types[col_id]); + } + scan_column_ids.emplace_back(COLUMN_IDENTIFIER_ROW_ID); + scan_types.push_back(LogicalType::ROW_TYPE); + + DataChunk scan_chunk; + scan_chunk.Initialize(Allocator::Get(db), scan_types); + + CreateIndexScanState state; + auto scan_type = TableScanType::TABLE_SCAN_COMMITTED_ROWS; + state.Initialize(scan_column_ids, nullptr); + QueryContext context; + row_groups->InitializeScan(context, state.table_state, scan_column_ids, nullptr); + row_groups->InitializeCreateIndexScan(state); + + DataChunk table_chunk; + table_chunk.InitializeEmpty(types); + + while (true) { + scan_chunk.Reset(); + state.table_state.Scan(scan_chunk, scan_type, state.segment_lock); + if (scan_chunk.size() == 0) { + break; + } + for (idx_t i = 0; i < col_ids.size(); i++) { + table_chunk.data[col_ids[i]].Reference(scan_chunk.data[i]); + } + table_chunk.SetCardinality(scan_chunk); + Vector &row_ids = scan_chunk.data[col_ids.size()]; + + auto error = bound_index.Append(table_chunk, row_ids); + if (error.HasError()) { + throw InternalException("Failed to rebuild index '%s' after vacuum: %s", bound_index.GetIndexName(), + error.Message()); + } + } + bound_index.Verify(); + } +} + void DataTable::VerifyIndexBuffers() { 
info->VerifyIndexBuffers(); } @@ -1760,6 +1817,9 @@ void DataTable::Checkpoint(TableDataWriter &writer, Serializer &serializer) { TableStatistics global_stats; row_groups->Checkpoint(writer, global_stats); row_groups->SetAppendRequiresNewRowGroup(); + if (writer.GetRebuildIndexes()) { + RebuildIndexes(); + } // The row group payload data has been written. Now write: // sample // column stats diff --git a/src/duckdb/src/storage/table/row_group_collection.cpp b/src/duckdb/src/storage/table/row_group_collection.cpp index abf4935a8..6f235eb00 100644 --- a/src/duckdb/src/storage/table/row_group_collection.cpp +++ b/src/duckdb/src/storage/table/row_group_collection.cpp @@ -1155,6 +1155,11 @@ class CheckpointTask : public BaseCheckpointTask { struct VacuumState { bool can_vacuum_deletes = true; bool can_change_row_ids = false; + //! Whether we are allowed to rebuild indexes after a vacuum (only true when vacuum_rebuild_indexes + //! threshold is set, the table's row count is within the threshold, and all indexes are bound ART indexes). + bool can_rebuild_indexes = false; + //! Whether any operation (empty group drop or vacuum merge) actually remapped row IDs + bool row_ids_changed = false; idx_t row_start = 0; idx_t next_vacuum_idx = 0; vector row_group_counts; @@ -1298,7 +1303,20 @@ void RowGroupCollection::InitializeVacuumState(CollectionCheckpointState &checkp // if there are indexes - we cannot change row-ids // this limits what kind of vacuuming we can do - state.can_change_row_ids = info->GetIndexes().Empty(); + bool has_indexes = !info->GetIndexes().Empty(); + + // *unless* vacuum_rebuild_indexes threshold is set, the table's row count + // is within the threshold, and all indexes are bound ART indexes, + // in which case we allow vacuuming and rebuild the indexes afterward. 
+ auto vacuum_rebuild_threshold = Settings::Get(checkpoint_state.writer.GetDatabase()); + auto index_types = info->GetIndexes().DistinctIndexTypes(); + state.can_rebuild_indexes = has_indexes && !info->GetIndexes().HasUnbound() && index_types.size() == 1 && + index_types.count(ART::TYPE_NAME) && vacuum_rebuild_threshold > 0 && + GetTotalRows() <= vacuum_rebuild_threshold; + + // We can move around rowids if we either 1) don't have any indexes at all or 2) can_rebuild_indexes is true (in + // which case indexes are entirely rebuilt after vacuuming). + state.can_change_row_ids = !has_indexes || state.can_rebuild_indexes; // obtain the set of committed row counts for each row group auto row_group_count = checkpoint_state.SegmentCount(); vector committed_counts; @@ -1310,13 +1328,14 @@ void RowGroupCollection::InitializeVacuumState(CollectionCheckpointState &checkp state.can_vacuum_deletes = false; return; } + bool dropped_any_rowgroups = false; for (auto &entry : checkpoint_state.row_groups.SegmentNodes()) { auto &row_group = entry.GetNode(); auto row_group_count = row_group.GetCommittedRowCount(); if (!state.can_change_row_ids) { idx_t total_count = row_group.count; committed_counts.emplace_back(row_group_count); - // we cannot change row ids and this row group has deletes + // we cannot change row ids, and this row group has deletes // vacuuming here would alter row ids - so skip it if (total_count != row_group_count) { state.row_group_counts.emplace_back(); @@ -1324,15 +1343,23 @@ void RowGroupCollection::InitializeVacuumState(CollectionCheckpointState &checkp } } if (row_group_count == 0) { - // empty row group - we can drop it entirely + // empty row group - we can drop it entirely. 
row_group.CommitDrop(); checkpoint_state.DropSegment(entry.GetIndex()); + dropped_any_rowgroups = true; + state.row_group_counts.push_back(row_group_count); + continue; + } + if (dropped_any_rowgroups) { + // if there are any dropped row groups before a live row group, the row groups following + // the dropped row group will have their row ids shifted forward (to keep row ids contiguous). + state.row_ids_changed = true; } state.row_group_counts.push_back(row_group_count); } if (!state.can_change_row_ids && options.type != CheckpointType::CONCURRENT_CHECKPOINT) { // if we cannot change row ids we might still be able to vacuum trailing deletions - // since that would not change the row-ids of any non-deleted rows + // since that would not change the row ids of any non-deleted rows auto segment_count = state.row_group_counts.size(); for (idx_t i = segment_count; i > 0; i--) { auto segment_idx = i - 1; @@ -1475,6 +1502,7 @@ void RowGroupCollection::Checkpoint(TableDataWriter &writer, TableStatistics &gl if (vacuum_tasks) { // vacuum tasks were scheduled - don't schedule a checkpoint task yet total_vacuum_tasks++; + vacuum_state.row_ids_changed = true; continue; } if (checkpoint_state.SegmentIsDropped(segment_idx)) { @@ -1739,6 +1767,15 @@ void RowGroupCollection::Checkpoint(TableDataWriter &writer, TableStatistics &gl total_rows = new_total_rows; SetRowGroups(std::move(new_row_groups)); Verify(); + // Rebuild indexes if: + // 1) can_rebuild_indexes is set (it is set when the vacuum_rebuild_indexes + // threshold is set, the table's row count is within the threshold, + // and all the indexes are bound ART indexes), + // and + // 2) we have changed row ids. 
+ if (vacuum_state.can_rebuild_indexes && vacuum_state.row_ids_changed) { + writer.SetRebuildIndexes(); + } } //===--------------------------------------------------------------------===// diff --git a/src/duckdb/src/storage/table_index_list.cpp b/src/duckdb/src/storage/table_index_list.cpp index 0f6a97175..641341107 100644 --- a/src/duckdb/src/storage/table_index_list.cpp +++ b/src/duckdb/src/storage/table_index_list.cpp @@ -120,6 +120,15 @@ void TableIndexList::CommitDrop(const string &name) { } } +unordered_set TableIndexList::DistinctIndexTypes() const { + lock_guard lock(index_entries_lock); + unordered_set result; + for (auto &entry : index_entries) { + result.insert(entry->index->GetIndexType()); + } + return result; +} + bool TableIndexList::NameIsUnique(const string &name) { // Only covers PK, FK, and UNIQUE indexes. lock_guard lock(index_entries_lock); diff --git a/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp b/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp index 657a4987e..58605c3ad 100644 --- a/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +++ b/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp @@ -348,17 +348,17 @@ #include "extension/icu/third_party/icu/i18n/wintzimpl.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-string-to-double.cpp" - #include "extension/icu/third_party/icu/i18n/double-conversion-cached-powers.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-bignum-dtoa.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-bignum.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-fast-dtoa.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-string-to-double.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-strtod.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-bignum-dtoa.cpp" #include "extension/icu/third_party/icu/i18n/double-conversion-double-to-string.cpp" -#include 
"extension/icu/third_party/icu/i18n/double-conversion-bignum.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-strtod.cpp" + +#include "extension/icu/third_party/icu/i18n/double-conversion-fast-dtoa.cpp"