Skip to content

Commit f13ec22

Browse files
Update vendored DuckDB sources to 535b7a6a14
1 parent 57442c5 commit f13ec22

212 files changed

Lines changed: 23273 additions & 18570 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

src/duckdb/extension/json/include/json_structure.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ struct JSONStructureDescription {
8080

8181
//! Candidate types (if auto-detecting and type == LogicalTypeId::VARCHAR)
8282
vector<LogicalTypeId> candidate_types;
83+
84+
//! Whether any UBIGINT value exceeds the BIGINT range (i.e., >= 2^63)
85+
bool has_large_ubigint = false;
8386
};
8487

8588
struct JSONStructure {

src/duckdb/extension/json/json_functions/copy_json.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
5555
csv_copy_options["suffix"] = {"\n]\n"};
5656
csv_copy_options["new_line"] = {",\n\t"};
5757
}
58+
} else if (loption == "file_extension") {
59+
// Since we set the file extension to "json" above, we need to override it
60+
csv_copy_options["file_extension"] = {StringValue::Get(kv.second.back())};
5861
} else if (SUPPORTED_BASE_OPTIONS.find(loption) != SUPPORTED_BASE_OPTIONS.end()) {
5962
// We support these base options
6063
csv_copy_options.insert(kv);

src/duckdb/extension/json/json_functions/json_structure.cpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,23 @@
99
namespace duckdb {
1010

1111
static bool IsNumeric(LogicalTypeId type) {
12-
return type == LogicalTypeId::DOUBLE || type == LogicalTypeId::UBIGINT || type == LogicalTypeId::BIGINT;
12+
return type == LogicalTypeId::DOUBLE || type == LogicalTypeId::UBIGINT || type == LogicalTypeId::BIGINT ||
13+
type == LogicalTypeId::HUGEINT;
1314
}
1415

1516
static LogicalTypeId MaxNumericType(const LogicalTypeId &a, const LogicalTypeId &b) {
1617
D_ASSERT(a != b);
1718
if (a == LogicalTypeId::DOUBLE || b == LogicalTypeId::DOUBLE) {
1819
return LogicalTypeId::DOUBLE;
1920
}
21+
if (a == LogicalTypeId::HUGEINT || b == LogicalTypeId::HUGEINT) {
22+
return LogicalTypeId::HUGEINT;
23+
}
24+
// One is BIGINT and the other is UBIGINT - need HUGEINT to represent both ranges
25+
if ((a == LogicalTypeId::BIGINT && b == LogicalTypeId::UBIGINT) ||
26+
(a == LogicalTypeId::UBIGINT && b == LogicalTypeId::BIGINT)) {
27+
return LogicalTypeId::HUGEINT;
28+
}
2029
return LogicalTypeId::BIGINT;
2130
}
2231

@@ -343,6 +352,7 @@ static void SwapJSONStructureDescription(JSONStructureDescription &a, JSONStruct
343352
std::swap(a.key_map, b.key_map);
344353
std::swap(a.children, b.children);
345354
std::swap(a.candidate_types, b.candidate_types);
355+
std::swap(a.has_large_ubigint, b.has_large_ubigint);
346356
}
347357

348358
JSONStructureDescription::JSONStructureDescription(JSONStructureDescription &&other) noexcept {
@@ -427,7 +437,11 @@ static void ExtractStructureObject(yyjson_val *obj, JSONStructureNode &node, con
427437

428438
static void ExtractStructureVal(yyjson_val *val, JSONStructureNode &node) {
429439
D_ASSERT(!yyjson_is_arr(val) && !yyjson_is_obj(val));
430-
node.GetOrCreateDescription(JSONCommon::ValTypeToLogicalTypeId(val));
440+
auto &desc = node.GetOrCreateDescription(JSONCommon::ValTypeToLogicalTypeId(val));
441+
if (desc.type == LogicalTypeId::UBIGINT &&
442+
unsafe_yyjson_get_uint(val) > static_cast<uint64_t>(NumericLimits<int64_t>::Maximum())) {
443+
desc.has_large_ubigint = true;
444+
}
431445
}
432446

433447
void JSONStructure::ExtractStructure(yyjson_val *val, JSONStructureNode &node, const bool ignore_errors) {
@@ -558,6 +572,9 @@ static void MergeNodeVal(JSONStructureNode &merged, const JSONStructureDescripti
558572
const bool node_initialized) {
559573
D_ASSERT(child_desc.type != LogicalTypeId::LIST && child_desc.type != LogicalTypeId::STRUCT);
560574
auto &merged_desc = merged.GetOrCreateDescription(child_desc.type);
575+
if (child_desc.has_large_ubigint) {
576+
merged_desc.has_large_ubigint = true;
577+
}
561578
if (merged_desc.type != LogicalTypeId::VARCHAR || !node_initialized || merged.descriptions.size() != 1) {
562579
return;
563580
}
@@ -798,7 +815,10 @@ LogicalType JSONStructure::StructureToType(ClientContext &context, const JSONStr
798815
case LogicalTypeId::VARCHAR:
799816
return StructureToTypeString(node);
800817
case LogicalTypeId::UBIGINT:
801-
return LogicalTypeId::BIGINT; // We prefer not to return UBIGINT in our type auto-detection
818+
if (desc.has_large_ubigint) {
819+
return LogicalTypeId::HUGEINT;
820+
}
821+
return LogicalTypeId::BIGINT;
802822
case LogicalTypeId::SQLNULL:
803823
return null_type;
804824
default:

src/duckdb/extension/json/json_reader.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
#include "json_reader.hpp"
22

3+
#include <utility>
4+
35
#include "duckdb/common/file_opener.hpp"
6+
#include "duckdb/common/file_open_flags.hpp"
47
#include "duckdb/common/serializer/deserializer.hpp"
58
#include "duckdb/common/serializer/serializer.hpp"
9+
#include "duckdb/storage/caching_mode.hpp"
610
#include "json_scan.hpp"
7-
#include <utility>
811

912
namespace duckdb {
1013

@@ -183,7 +186,9 @@ void JSONReader::OpenJSONFile() {
183186
lock_guard<mutex> guard(lock);
184187
if (!IsOpen()) {
185188
auto &fs = FileSystem::GetFileSystem(context);
186-
auto regular_file_handle = fs.OpenFile(file, FileFlags::FILE_FLAGS_READ | options.compression);
189+
FileOpenFlags flags = FileFlags::FILE_FLAGS_READ | options.compression;
190+
flags.SetCachingMode(CachingMode::CACHE_REMOTE_ONLY);
191+
auto regular_file_handle = fs.OpenFile(file, flags);
187192
file_handle = make_uniq<JSONFileHandle>(context, std::move(regular_file_handle), BufferAllocator::Get(context));
188193
}
189194
Reset();
@@ -1052,8 +1057,9 @@ void JSONReader::ReadNextBufferSeek(JSONReaderScanState &scan_state) {
10521057
if (!raw_handle.OnDiskFile() && raw_handle.CanSeek()) {
10531058
if (!scan_state.thread_local_filehandle ||
10541059
scan_state.thread_local_filehandle->GetPath() != raw_handle.GetPath()) {
1055-
scan_state.thread_local_filehandle = scan_state.fs.OpenFile(
1056-
raw_handle.GetPath(), FileFlags::FILE_FLAGS_READ | FileFlags::FILE_FLAGS_DIRECT_IO);
1060+
FileOpenFlags flags = FileFlags::FILE_FLAGS_READ | FileFlags::FILE_FLAGS_DIRECT_IO;
1061+
flags.SetCachingMode(CachingMode::CACHE_REMOTE_ONLY);
1062+
scan_state.thread_local_filehandle = scan_state.fs.OpenFile(raw_handle.GetPath(), flags);
10571063
}
10581064
} else if (scan_state.thread_local_filehandle) {
10591065
scan_state.thread_local_filehandle = nullptr;

src/duckdb/extension/parquet/include/parquet_geometry.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,14 +85,14 @@ class GeoParquetFileMetadata {
8585
public:
8686
explicit GeoParquetFileMetadata(GeoParquetVersion geo_parquet_version) : version(geo_parquet_version) {
8787
}
88-
void AddGeoParquetStats(const string &column_name, const LogicalType &type, const GeometryStatsData &stats,
89-
GeoParquetVersion version);
88+
void AddGeoParquetStats(ClientContext &context, const string &column_name, const LogicalType &type,
89+
const GeometryStatsData &stats, GeoParquetVersion version);
9090
void Write(duckdb_parquet::FileMetaData &file_meta_data);
9191

9292
// Try to read GeoParquet metadata. Returns nullptr if not found, invalid or the required spatial extension is not
9393
// available.
9494
static unique_ptr<GeoParquetFileMetadata> TryRead(const duckdb_parquet::FileMetaData &file_meta_data,
95-
const ClientContext &context);
95+
ClientContext &context);
9696
const unordered_map<string, GeoParquetColumnMetadata> &GetColumnMeta() const;
9797
optional_ptr<const GeoParquetColumnMetadata> GetColumnMeta(const string &column_name) const;
9898

src/duckdb/extension/parquet/include/parquet_reader.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ struct ParquetOptions {
109109
bool binary_as_string = false;
110110
bool file_row_number = false;
111111
shared_ptr<ParquetEncryptionConfig> encryption_config;
112-
bool debug_use_openssl = true;
113112

114113
vector<ParquetColumnDefinition> schema;
115114
idx_t explicit_cardinality = 0;

src/duckdb/extension/parquet/include/parquet_writer.hpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,8 @@ class ParquetWriter {
110110
ShreddingType shredding_types, const vector<pair<string, string>> &kv_metadata,
111111
shared_ptr<ParquetEncryptionConfig> encryption_config, optional_idx dictionary_size_limit,
112112
idx_t string_dictionary_page_size_limit, bool enable_bloom_filters,
113-
double bloom_filter_false_positive_ratio, int64_t compression_level, bool debug_use_openssl,
114-
ParquetVersion parquet_version, GeoParquetVersion geoparquet_version);
113+
double bloom_filter_false_positive_ratio, int64_t compression_level, ParquetVersion parquet_version,
114+
GeoParquetVersion geoparquet_version);
115115
~ParquetWriter();
116116

117117
public:
@@ -123,7 +123,7 @@ class ParquetWriter {
123123

124124
static duckdb_parquet::Type::type DuckDBTypeToParquetType(const LogicalType &duckdb_type);
125125
static void SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele,
126-
bool allow_geometry);
126+
bool allow_geometry, ClientContext &context);
127127

128128
ClientContext &GetContext() {
129129
return context;
@@ -207,7 +207,6 @@ class ParquetWriter {
207207
bool enable_bloom_filters;
208208
double bloom_filter_false_positive_ratio;
209209
int64_t compression_level;
210-
bool debug_use_openssl;
211210
shared_ptr<EncryptionUtil> encryption_util;
212211
ParquetVersion parquet_version;
213212
GeoParquetVersion geoparquet_version;

src/duckdb/extension/parquet/parquet_crypto.cpp

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,11 @@ class EncryptionTransport : public TTransport {
163163
public:
164164
EncryptionTransport(TProtocol &prot_p, const string &key, const EncryptionUtil &encryption_util_p)
165165
: prot(prot_p), trans(*prot.getTransport()),
166-
aes(encryption_util_p.CreateEncryptionState(EncryptionTypes::GCM, key.size())),
167166
allocator(Allocator::DefaultAllocator(), ParquetCrypto::CRYPTO_BLOCK_SIZE) {
167+
auto metadata = make_uniq<EncryptionStateMetadata>(EncryptionTypes::GCM, key.size(),
168+
EncryptionTypes::EncryptionVersion::NONE);
169+
aes = encryption_util_p.CreateEncryptionState(std::move(metadata));
170+
168171
Initialize(key);
169172
}
170173

@@ -190,8 +193,8 @@ class EncryptionTransport : public TTransport {
190193
const uint32_t total_length = ParquetCrypto::NONCE_BYTES + ciphertext_length + ParquetCrypto::TAG_BYTES;
191194

192195
trans.write(const_data_ptr_cast(&total_length), ParquetCrypto::LENGTH_BYTES);
193-
// Write nonce at beginning of encrypted chunk
194-
trans.write(nonce, ParquetCrypto::NONCE_BYTES);
196+
// Write nonce at the start of encrypted chunk
197+
trans.write(nonce.data(), ParquetCrypto::NONCE_BYTES);
195198

196199
data_t aes_buffer[ParquetCrypto::CRYPTO_BLOCK_SIZE];
197200
auto current = allocator.GetTail();
@@ -220,10 +223,9 @@ class EncryptionTransport : public TTransport {
220223
private:
221224
void Initialize(const string &key) {
222225
// Generate Nonce
223-
aes->GenerateRandomData(nonce, ParquetCrypto::NONCE_BYTES);
226+
aes->GenerateRandomData(nonce.data(), nonce.size());
224227
// Initialize Encryption
225-
aes->InitializeEncryption(nonce, ParquetCrypto::NONCE_BYTES, reinterpret_cast<const_data_ptr_t>(key.data()),
226-
key.size());
228+
aes->InitializeEncryption(nonce, reinterpret_cast<const_data_ptr_t>(key.data()));
227229
}
228230

229231
private:
@@ -235,7 +237,7 @@ class EncryptionTransport : public TTransport {
235237
shared_ptr<EncryptionState> aes;
236238

237239
//! Nonce created by Initialize()
238-
data_t nonce[ParquetCrypto::NONCE_BYTES];
240+
EncryptionNonce nonce;
239241

240242
//! Arena Allocator to fully materialize in memory before encrypting
241243
ArenaAllocator allocator;
@@ -246,9 +248,10 @@ class DecryptionTransport : public TTransport {
246248
public:
247249
DecryptionTransport(TProtocol &prot_p, const string &key, const EncryptionUtil &encryption_util_p,
248250
const CryptoMetaData &crypto_meta_data)
249-
: prot(prot_p), trans(*prot.getTransport()),
250-
aes(encryption_util_p.CreateEncryptionState(EncryptionTypes::GCM, key.size())), read_buffer_size(0),
251-
read_buffer_offset(0) {
251+
: prot(prot_p), trans(*prot.getTransport()), read_buffer_size(0), read_buffer_offset(0) {
252+
auto metadata = make_uniq<EncryptionStateMetadata>(EncryptionTypes::GCM, key.size(),
253+
EncryptionTypes::EncryptionVersion::NONE);
254+
aes = encryption_util_p.CreateEncryptionState(std::move(metadata));
252255
Initialize(key, crypto_meta_data);
253256
}
254257

@@ -306,16 +309,15 @@ class DecryptionTransport : public TTransport {
306309
total_bytes = Load<uint32_t>(length_buf);
307310
transport_remaining = total_bytes;
308311
// Read nonce and initialize AES
309-
transport_remaining -= trans.read(nonce, ParquetCrypto::NONCE_BYTES);
312+
transport_remaining -= trans.read(nonce.data(), nonce.total_size());
310313
// check whether context is initialized
311314
if (!crypto_meta_data.IsEmpty()) {
312-
aes->InitializeDecryption(nonce, ParquetCrypto::NONCE_BYTES, reinterpret_cast<const_data_ptr_t>(key.data()),
313-
key.size(), crypto_meta_data.additional_authenticated_data->data(),
315+
aes->InitializeDecryption(nonce, reinterpret_cast<const_data_ptr_t>(key.data()),
316+
crypto_meta_data.additional_authenticated_data->data(),
314317
crypto_meta_data.additional_authenticated_data->size());
315318
crypto_meta_data.additional_authenticated_data->Rewind();
316319
} else {
317-
aes->InitializeDecryption(nonce, ParquetCrypto::NONCE_BYTES, reinterpret_cast<const_data_ptr_t>(key.data()),
318-
key.size());
320+
aes->InitializeDecryption(nonce, reinterpret_cast<const_data_ptr_t>(key.data()));
319321
}
320322
}
321323

@@ -353,7 +355,7 @@ class DecryptionTransport : public TTransport {
353355
uint32_t total_bytes;
354356
uint32_t transport_remaining;
355357
//! Nonce read by Initialize()
356-
data_t nonce[ParquetCrypto::NONCE_BYTES];
358+
EncryptionNonce nonce;
357359
};
358360

359361
class SimpleReadTransport : public TTransport {

src/duckdb/extension/parquet/parquet_extension.cpp

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,8 @@ struct ParquetWriteBindData : public TableFunctionData {
6767
idx_t row_group_size = DEFAULT_ROW_GROUP_SIZE;
6868
idx_t row_group_size_bytes = NumericLimits<idx_t>::Maximum();
6969

70-
//! How/Whether to encrypt the data
70+
//! Encryption configuration
7171
shared_ptr<ParquetEncryptionConfig> encryption_config;
72-
bool debug_use_openssl = true;
7372

7473
//! After how many distinct values should we abandon dictionary compression and bloom filters?
7574
//! Defaults to 1/5th of the row group size if unset (in templated_column_writer.hpp)
@@ -131,8 +130,8 @@ static void ParquetListCopyOptions(ClientContext &context, CopyOptionsInput &inp
131130
copy_options["dictionary_size_limit"] = CopyOption(LogicalType::BIGINT, CopyOptionMode::WRITE_ONLY);
132131
copy_options["string_dictionary_page_size_limit"] = CopyOption(LogicalType::UBIGINT, CopyOptionMode::WRITE_ONLY);
133132
copy_options["bloom_filter_false_positive_ratio"] = CopyOption(LogicalType::DOUBLE, CopyOptionMode::WRITE_ONLY);
134-
copy_options["write_bloom_filter"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::WRITE_ONLY);
135133
copy_options["debug_use_openssl"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::READ_WRITE);
134+
copy_options["write_bloom_filter"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::WRITE_ONLY);
136135
copy_options["compression_level"] = CopyOption(LogicalType::BIGINT, CopyOptionMode::WRITE_ONLY);
137136
copy_options["parquet_version"] = CopyOption(LogicalType::VARCHAR, CopyOptionMode::WRITE_ONLY);
138137
copy_options["binary_as_string"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::READ_ONLY);
@@ -272,7 +271,7 @@ static unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyFun
272271
}
273272
} else if (loption == "encryption_config") {
274273
bind_data->encryption_config = ParquetEncryptionConfig::Create(context, option.second[0]);
275-
} else if (loption == "dictionary_compression_ratio_threshold") {
274+
} else if (loption == "dictionary_compression_ratio_threshold" || loption == "debug_use_openssl") {
276275
// deprecated, ignore setting
277276
} else if (loption == "dictionary_size_limit") {
278277
auto val = option.second[0].GetValue<int64_t>();
@@ -296,15 +295,6 @@ static unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyFun
296295
throw BinderException("bloom_filter_false_positive_ratio must be greater than 0");
297296
}
298297
bind_data->bloom_filter_false_positive_ratio = val;
299-
} else if (loption == "debug_use_openssl") {
300-
auto val = StringUtil::Lower(option.second[0].GetValue<std::string>());
301-
if (val == "false") {
302-
bind_data->debug_use_openssl = false;
303-
} else if (val == "true") {
304-
bind_data->debug_use_openssl = true;
305-
} else {
306-
throw BinderException("Expected debug_use_openssl to be a BOOLEAN");
307-
}
308298
} else if (loption == "compression_level") {
309299
const auto val = option.second[0].GetValue<int64_t>();
310300
if (val < ZStdFileSystem::MinimumCompressionLevel() || val > ZStdFileSystem::MaximumCompressionLevel()) {
@@ -367,8 +357,8 @@ static unique_ptr<GlobalFunctionData> ParquetWriteInitializeGlobal(ClientContext
367357
parquet_bind.field_ids.Copy(), parquet_bind.shredding_types.Copy(), parquet_bind.kv_metadata,
368358
parquet_bind.encryption_config, parquet_bind.dictionary_size_limit,
369359
parquet_bind.string_dictionary_page_size_limit, parquet_bind.enable_bloom_filters,
370-
parquet_bind.bloom_filter_false_positive_ratio, parquet_bind.compression_level, parquet_bind.debug_use_openssl,
371-
parquet_bind.parquet_version, parquet_bind.geoparquet_version);
360+
parquet_bind.bloom_filter_false_positive_ratio, parquet_bind.compression_level, parquet_bind.parquet_version,
361+
parquet_bind.geoparquet_version);
372362
return std::move(global_state);
373363
}
374364

@@ -611,8 +601,6 @@ static void ParquetCopySerialize(Serializer &serializer, const FunctionData &bin
611601
serializer.WritePropertyWithDefault(109, "compression_level", compression_level);
612602
serializer.WritePropertyWithDefault(110, "row_groups_per_file", bind_data.row_groups_per_file,
613603
default_value.row_groups_per_file);
614-
serializer.WritePropertyWithDefault(111, "debug_use_openssl", bind_data.debug_use_openssl,
615-
default_value.debug_use_openssl);
616604
serializer.WritePropertyWithDefault(112, "dictionary_size_limit", bind_data.dictionary_size_limit,
617605
default_value.dictionary_size_limit);
618606
serializer.WritePropertyWithDefault(113, "bloom_filter_false_positive_ratio",
@@ -648,8 +636,7 @@ static unique_ptr<FunctionData> ParquetCopyDeserialize(Deserializer &deserialize
648636
ParquetWriteBindData default_value;
649637
data->row_groups_per_file = deserializer.ReadPropertyWithExplicitDefault<optional_idx>(
650638
110, "row_groups_per_file", default_value.row_groups_per_file);
651-
data->debug_use_openssl =
652-
deserializer.ReadPropertyWithExplicitDefault<bool>(111, "debug_use_openssl", default_value.debug_use_openssl);
639+
deserializer.ReadDeletedProperty<bool>(111, "debug_use_openssl");
653640
data->dictionary_size_limit =
654641
deserializer.ReadPropertyWithExplicitDefault<optional_idx>(112, "dictionary_size_limit", optional_idx());
655642
data->bloom_filter_false_positive_ratio = deserializer.ReadPropertyWithExplicitDefault<double>(
@@ -660,8 +647,7 @@ static unique_ptr<FunctionData> ParquetCopyDeserialize(Deserializer &deserialize
660647
115, "string_dictionary_page_size_limit", default_value.string_dictionary_page_size_limit);
661648
data->geoparquet_version =
662649
deserializer.ReadPropertyWithExplicitDefault(116, "geoparquet_version", default_value.geoparquet_version);
663-
data->shredding_types =
664-
deserializer.ReadPropertyWithExplicitDefault<ShreddingType>(117, "shredding_types", ShreddingType());
650+
data->shredding_types = deserializer.ReadProperty<ShreddingType>(117, "shredding_types");
665651

666652
return std::move(data);
667653
}

0 commit comments

Comments (0)