From 99eabffb0872d05817aaac3a684fea45e2cb2207 Mon Sep 17 00:00:00 2001
From: "TF.Text Team"
Date: Tue, 24 Feb 2026 17:02:03 -0800
Subject: [PATCH] Fixed a memory safety bug in FastWordpieceTokenizer
 concerning StringVocab lifetime.

This prevents temporary copies that were previously invalidating
std::string_view references to internal vocabulary data, ensuring memory
stability during tokenization.

PiperOrigin-RevId: 874855389
---
 .../core/kernels/fast_wordpiece_tokenizer_model_builder.cc | 6 +++---
 .../core/kernels/phrase_tokenizer_model_builder.cc         | 4 ++--
 tensorflow_text/core/kernels/string_vocab.cc               | 1 +
 tensorflow_text/core/kernels/string_vocab.h                | 2 ++
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc
index 584ae560b..9467c4d6e 100644
--- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc
+++ b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc
@@ -206,7 +206,7 @@ class FastWordpieceBuilder {
     trie_array_[node_id] &= 0xFFFFFEFF;
   }
 
-  absl::optional<StringVocab> vocab_;
+  std::unique_ptr<StringVocab> vocab_;
 
   int max_bytes_per_token_ = -1;
 
@@ -264,7 +264,7 @@ absl::Status FastWordpieceBuilder::BuildModel(
   no_pretokenization_ = no_pretokenization;
   support_detokenization_ = support_detokenization;
 
-  vocab_.emplace(vocab);
+  vocab_ = std::make_unique<StringVocab>(vocab);
   if (vocab_->Size() != vocab.size()) {
     return absl::FailedPreconditionError(
         "Tokens in the vocabulary must be unique.");
@@ -830,7 +830,7 @@ absl::Status FastWordpieceBuilder::PrecomputeResultForSuffixIndicator() {
   LookupStatus status = WordpieceTokenize(
       suffix_indicator_, max_bytes_per_token_, /*max_chars_per_subtoken=*/-1,
       suffix_indicator_, /*use_unknown_token=*/true, unk_token_,
-      /*split_unknown_characters=*/false, &vocab_.value(), &subwords,
+      /*split_unknown_characters=*/false, vocab_.get(), &subwords,
       &begin_offset, &end_offset, &num_word_pieces);
   precomputed_result_for_suffix_indicator_.reserve(subwords.size());
   if (!status.success) {
diff --git a/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc b/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc
index 07ec2c85f..268aeb32c 100644
--- a/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc
+++ b/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc
@@ -45,7 +45,7 @@ class PhraseBuilder {
   absl::StatusOr<std::string> ExportToFlatBuffer() const;
 
  private:
-  absl::optional<StringVocab> vocab_;
+  std::unique_ptr<StringVocab> vocab_;
   std::vector<uint32_t> trie_data_;
   std::string unk_token_;
   int unk_token_id_;
@@ -64,7 +64,7 @@ absl::Status PhraseBuilder::BuildModel(const std::vector<std::string>& vocab,
   prob_ = prob;
   split_end_punctuation_ = split_end_punctuation;
 
-  vocab_.emplace(vocab);
+  vocab_ = std::make_unique<StringVocab>(vocab);
   if (vocab_->Size() != vocab.size()) {
     return absl::FailedPreconditionError(
         "Tokens in the vocabulary must be unique.");
diff --git a/tensorflow_text/core/kernels/string_vocab.cc b/tensorflow_text/core/kernels/string_vocab.cc
index 2dcc1bc94..a2c239a93 100644
--- a/tensorflow_text/core/kernels/string_vocab.cc
+++ b/tensorflow_text/core/kernels/string_vocab.cc
@@ -19,6 +19,7 @@ namespace text {
 
 StringVocab::StringVocab(const std::vector<std::string>& vocab)
     : vocab_(vocab) {
+  index_map_.reserve(vocab.size());
   for (int i = 0; i < vocab.size(); ++i) {
     index_map_[vocab_[i]] = i;
   }
diff --git a/tensorflow_text/core/kernels/string_vocab.h b/tensorflow_text/core/kernels/string_vocab.h
index 0bcf95187..4590f2775 100644
--- a/tensorflow_text/core/kernels/string_vocab.h
+++ b/tensorflow_text/core/kernels/string_vocab.h
@@ -30,6 +30,8 @@ namespace text {
 class StringVocab : public WordpieceVocab {
  public:
   explicit StringVocab(const std::vector<std::string>& vocab);
+  StringVocab(const StringVocab&) = delete;
+  StringVocab& operator=(const StringVocab&) = delete;
   LookupStatus Contains(absl::string_view key, bool* value) const override;
   absl::optional<int> LookupId(absl::string_view key) const;
   // Returns the key of `vocab_id` or empty if `vocab_id` is not valid.