From 7403979bf6190e0bcf17f04478063313177529db Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Fri, 30 Jan 2026 13:34:00 +0900 Subject: [PATCH 1/2] GH-49069: [C++] Share Trie instances across CSV value decoders --- cpp/src/arrow/csv/converter.cc | 107 +++++++++++++++++++-------------- cpp/src/arrow/csv/converter.h | 1 + 2 files changed, 64 insertions(+), 44 deletions(-) diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index ec31d4b1ceb4..7fc72251c129 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -105,31 +105,45 @@ Status PresizeBuilder(const BlockParser& parser, BuilderType* builder) { } } +///////////////////////////////////////////////////////////////////////// +// Shared Tries cache to avoid rebuilding them for each decoder instance + +struct TrieCache { + Trie null_trie; + Trie true_trie; + Trie false_trie; + + static Result> Make(const ConvertOptions& options) { + auto cache = std::make_shared(); + RETURN_NOT_OK(InitializeTrie(options.null_values, &cache->null_trie)); + RETURN_NOT_OK(InitializeTrie(options.true_values, &cache->true_trie)); + RETURN_NOT_OK(InitializeTrie(options.false_values, &cache->false_trie)); + return cache; + } +}; + ///////////////////////////////////////////////////////////////////////// // Per-type value decoders struct ValueDecoder { explicit ValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : type_(type), options_(options) {} + const ConvertOptions& options, const TrieCache* trie_cache) + : type_(type), options_(options), trie_cache_(*trie_cache) {} - Status Initialize() { - // TODO no need to build a separate Trie for each instance - return InitializeTrie(options_.null_values, &null_trie_); - } + Status Initialize() { return Status::OK(); } bool IsNull(const uint8_t* data, uint32_t size, bool quoted) { if (quoted && !options_.quoted_strings_can_be_null) { return false; } - return null_trie_.Find(std::string_view(reinterpret_cast(data), size)) >= - 0; + return trie_cache_.null_trie.Find( + std::string_view(reinterpret_cast(data), size)) >= 0; } protected: - Trie null_trie_; const std::shared_ptr type_; const ConvertOptions& options_; + const TrieCache& trie_cache_; }; // @@ -140,8 +154,9 @@ struct FixedSizeBinaryValueDecoder : public ValueDecoder { using value_type = const uint8_t*; explicit FixedSizeBinaryValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), byte_width_(checked_cast(*type).byte_width()) {} Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { @@ -207,8 +222,8 @@ struct NumericValueDecoder : public ValueDecoder { using value_type = typename T::c_type; NumericValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), concrete_type_(checked_cast(*type)), string_converter_(MakeStringConverter(options)) {} @@ -236,31 +251,20 @@ struct BooleanValueDecoder : public ValueDecoder { using ValueDecoder::ValueDecoder; - Status Initialize() { - // TODO no need to build separate Tries for each instance - RETURN_NOT_OK(InitializeTrie(options_.true_values, &true_trie_)); - RETURN_NOT_OK(InitializeTrie(options_.false_values, &false_trie_)); - return ValueDecoder::Initialize(); - } - Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { // XXX should quoted values be allowed at all? - if (false_trie_.Find(std::string_view(reinterpret_cast(data), size)) >= - 0) { + if (trie_cache_.false_trie.Find( + std::string_view(reinterpret_cast(data), size)) >= 0) { *out = false; return Status::OK(); } - if (ARROW_PREDICT_TRUE(true_trie_.Find(std::string_view( + if (ARROW_PREDICT_TRUE(trie_cache_.true_trie.Find(std::string_view( reinterpret_cast(data), size)) >= 0)) { *out = true; return Status::OK(); } return GenericConversionError(type_, data, size); } - - protected: - Trie true_trie_; - Trie false_trie_; }; // @@ -271,8 +275,8 @@ struct DecimalValueDecoder : public ValueDecoder { using value_type = Decimal128; explicit DecimalValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), decimal_type_(internal::checked_cast(*type_)), type_precision_(decimal_type_.precision()), type_scale_(decimal_type_.scale()) {} @@ -310,8 +314,10 @@ struct CustomDecimalPointValueDecoder : public ValueDecoder { using value_type = typename WrappedDecoder::value_type; explicit CustomDecimalPointValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), wrapped_decoder_(type, options) {} + const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), + wrapped_decoder_(type, options, trie_cache) {} Status Initialize() { RETURN_NOT_OK(wrapped_decoder_.Initialize()); @@ -321,7 +327,7 @@ struct CustomDecimalPointValueDecoder : public ValueDecoder { mapping_[options_.decimal_point] = '.'; mapping_['.'] = options_.decimal_point; // error out on standard decimal point temp_.resize(30); - return Status::OK(); + return ValueDecoder::Initialize(); } Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { @@ -357,8 +363,9 @@ struct InlineISO8601ValueDecoder : public ValueDecoder { using value_type = int64_t; explicit InlineISO8601ValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), unit_(checked_cast(*type_).unit()), expect_timezone_(!checked_cast(*type_).timezone().empty()) { } @@ -396,8 +403,9 @@ struct SingleParserTimestampValueDecoder : public ValueDecoder { using value_type = int64_t; explicit SingleParserTimestampValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), unit_(checked_cast(*type_).unit()), expect_timezone_(!checked_cast(*type_).timezone().empty()), parser_(*options_.timestamp_parsers[0]) {} @@ -436,8 +444,9 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder { using value_type = int64_t; explicit MultipleParsersTimestampValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), unit_(checked_cast(*type_).unit()), expect_timezone_(!checked_cast(*type_).timezone().empty()), parsers_(GetParsers(options_)) {} @@ -477,8 +486,9 @@ struct DurationValueDecoder : public ValueDecoder { using value_type = int64_t; explicit DurationValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), concrete_type_(checked_cast(*type)), string_converter_() {} @@ -517,7 +527,8 @@ class NullConverter : public ConcreteConverter { public: NullConverter(const std::shared_ptr& type, const ConvertOptions& options, MemoryPool* pool) - : ConcreteConverter(type, options, pool), decoder_(type_, options_) {} + : ConcreteConverter(type, options, pool), + decoder_(type_, options_, static_cast(trie_cache_.get())) {} Result> Convert(const BlockParser& parser, int32_t col_index) override { @@ -551,7 +562,8 @@ class PrimitiveConverter : public ConcreteConverter { public: PrimitiveConverter(const std::shared_ptr& type, const ConvertOptions& options, MemoryPool* pool) - : ConcreteConverter(type, options, pool), decoder_(type_, options_) {} + : ConcreteConverter(type, options, pool), + decoder_(type_, options_, static_cast(trie_cache_.get())) {} Result> Convert(const BlockParser& parser, int32_t col_index) override { @@ -593,7 +605,8 @@ class TypedDictionaryConverter : public ConcreteDictionaryConverter { TypedDictionaryConverter(const std::shared_ptr& value_type, const ConvertOptions& options, MemoryPool* pool) : ConcreteDictionaryConverter(value_type, options, pool), - decoder_(value_type, options_) {} + decoder_(value_type, options_, static_cast(trie_cache_.get())) { + } Result> Convert(const BlockParser& parser, int32_t col_index) override { @@ -684,7 +697,13 @@ std::shared_ptr MakeRealConverter(const std::shared_ptr Converter::Converter(const std::shared_ptr& type, const ConvertOptions& options, MemoryPool* pool) - : options_(options), pool_(pool), type_(type) {} + : options_(options), pool_(pool), type_(type) { + // Build shared Trie cache (errors handled in Initialize()) + auto maybe_cache = TrieCache::Make(options); + if (maybe_cache.ok()) { + trie_cache_ = std::static_pointer_cast(*std::move(maybe_cache)); + } +} DictionaryConverter::DictionaryConverter(const std::shared_ptr& value_type, const ConvertOptions& options, MemoryPool* pool) diff --git a/cpp/src/arrow/csv/converter.h b/cpp/src/arrow/csv/converter.h index 639f692f26a1..23df9f4fe09f 100644 --- a/cpp/src/arrow/csv/converter.h +++ b/cpp/src/arrow/csv/converter.h @@ -57,6 +57,7 @@ class ARROW_EXPORT Converter { const ConvertOptions& options_; MemoryPool* pool_; std::shared_ptr type_; + std::shared_ptr trie_cache_; // Opaque pointer to TrieCache }; class ARROW_EXPORT DictionaryConverter : public Converter { From ed65f92f9bbed69e473fc575b1865504383e3cc3 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Fri, 30 Jan 2026 19:20:41 +0900 Subject: [PATCH 2/2] Address a review comment --- cpp/src/arrow/csv/converter.cc | 10 +++++----- cpp/src/arrow/csv/converter.h | 4 +++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 7fc72251c129..bb59d02cd206 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -128,7 +128,7 @@ struct TrieCache { struct ValueDecoder { explicit ValueDecoder(const std::shared_ptr& type, const ConvertOptions& options, const TrieCache* trie_cache) - : type_(type), options_(options), trie_cache_(*trie_cache) {} + : type_(type), options_(options), trie_cache_(trie_cache) {} Status Initialize() { return Status::OK(); } @@ -136,14 +136,14 @@ struct ValueDecoder { if (quoted && !options_.quoted_strings_can_be_null) { return false; } - return trie_cache_.null_trie.Find( + return trie_cache_->null_trie.Find( std::string_view(reinterpret_cast(data), size)) >= 0; } protected: const std::shared_ptr type_; const ConvertOptions& options_; - const TrieCache& trie_cache_; + const TrieCache* trie_cache_; }; // @@ -253,12 +253,12 @@ struct BooleanValueDecoder : public ValueDecoder { Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { // XXX should quoted values be allowed at all? - if (trie_cache_.false_trie.Find( + if (trie_cache_->false_trie.Find( std::string_view(reinterpret_cast(data), size)) >= 0) { *out = false; return Status::OK(); } - if (ARROW_PREDICT_TRUE(trie_cache_.true_trie.Find(std::string_view( + if (ARROW_PREDICT_TRUE(trie_cache_->true_trie.Find(std::string_view( reinterpret_cast(data), size)) >= 0)) { *out = true; return Status::OK(); diff --git a/cpp/src/arrow/csv/converter.h b/cpp/src/arrow/csv/converter.h index 23df9f4fe09f..c6254bd7ca1f 100644 --- a/cpp/src/arrow/csv/converter.h +++ b/cpp/src/arrow/csv/converter.h @@ -57,7 +57,9 @@ class ARROW_EXPORT Converter { const ConvertOptions& options_; MemoryPool* pool_; std::shared_ptr type_; - std::shared_ptr trie_cache_; // Opaque pointer to TrieCache + // Opaque TrieCache pointer. TrieCache destructor is called via control block. + // https://en.cppreference.com/w/cpp/memory/shared_ptr + std::shared_ptr trie_cache_; }; class ARROW_EXPORT DictionaryConverter : public Converter {