From 99eabffb0872d05817aaac3a684fea45e2cb2207 Mon Sep 17 00:00:00 2001
From: "TF.Text Team"
Date: Tue, 24 Feb 2026 17:02:03 -0800
Subject: [PATCH] Fixed a memory safety bug in FastWordpieceTokenizer
 concerning StringVocab lifetime.

This prevents temporary copies that were previously invalidating
std::string_view references to internal vocabulary data, ensuring memory
stability during tokenization.

PiperOrigin-RevId: 874855389
---
 .../core/kernels/fast_wordpiece_tokenizer_model_builder.cc | 6 +++---
 .../core/kernels/phrase_tokenizer_model_builder.cc         | 4 ++--
 tensorflow_text/core/kernels/string_vocab.cc               | 1 +
 tensorflow_text/core/kernels/string_vocab.h                | 2 ++
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc
index 584ae560b..9467c4d6e 100644
--- a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc
+++ b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc
@@ -206,7 +206,7 @@ class FastWordpieceBuilder {
     trie_array_[node_id] &= 0xFFFFFEFF;
   }
 
-  absl::optional<StringVocab> vocab_;
+  std::unique_ptr<StringVocab> vocab_;
 
   int max_bytes_per_token_ = -1;
 
@@ -264,7 +264,7 @@ absl::Status FastWordpieceBuilder::BuildModel(
   no_pretokenization_ = no_pretokenization;
   support_detokenization_ = support_detokenization;
 
-  vocab_.emplace(vocab);
+  vocab_ = std::make_unique<StringVocab>(vocab);
   if (vocab_->Size() != vocab.size()) {
     return absl::FailedPreconditionError(
         "Tokens in the vocabulary must be unique.");
@@ -830,7 +830,7 @@ absl::Status FastWordpieceBuilder::PrecomputeResultForSuffixIndicator() {
   LookupStatus status = WordpieceTokenize(
       suffix_indicator_, max_bytes_per_token_, /*max_chars_per_subtoken=*/-1,
       suffix_indicator_, /*use_unknown_token=*/true, unk_token_,
-      /*split_unknown_characters=*/false, &vocab_.value(), &subwords,
+      /*split_unknown_characters=*/false, vocab_.get(), &subwords,
       &begin_offset, &end_offset, &num_word_pieces);
   precomputed_result_for_suffix_indicator_.reserve(subwords.size());
   if (!status.success) {
diff --git a/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc b/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc
index 07ec2c85f..268aeb32c 100644
--- a/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc
+++ b/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc
@@ -45,7 +45,7 @@ class PhraseBuilder {
   absl::StatusOr<std::string> ExportToFlatBuffer() const;
 
  private:
-  absl::optional<StringVocab> vocab_;
+  std::unique_ptr<StringVocab> vocab_;
   std::vector<uint32_t> trie_data_;
   std::string unk_token_;
   int unk_token_id_;
@@ -64,7 +64,7 @@ absl::Status PhraseBuilder::BuildModel(const std::vector<std::string>& vocab,
   prob_ = prob;
   split_end_punctuation_ = split_end_punctuation;
 
-  vocab_.emplace(vocab);
+  vocab_ = std::make_unique<StringVocab>(vocab);
   if (vocab_->Size() != vocab.size()) {
     return absl::FailedPreconditionError(
         "Tokens in the vocabulary must be unique.");
diff --git a/tensorflow_text/core/kernels/string_vocab.cc b/tensorflow_text/core/kernels/string_vocab.cc
index 2dcc1bc94..a2c239a93 100644
--- a/tensorflow_text/core/kernels/string_vocab.cc
+++ b/tensorflow_text/core/kernels/string_vocab.cc
@@ -19,6 +19,7 @@ namespace text {
 
 StringVocab::StringVocab(const std::vector<std::string>& vocab)
     : vocab_(vocab) {
+  index_map_.reserve(vocab.size());
   for (int i = 0; i < vocab.size(); ++i) {
     index_map_[vocab_[i]] = i;
   }
diff --git a/tensorflow_text/core/kernels/string_vocab.h b/tensorflow_text/core/kernels/string_vocab.h
index 0bcf95187..4590f2775 100644
--- a/tensorflow_text/core/kernels/string_vocab.h
+++ b/tensorflow_text/core/kernels/string_vocab.h
@@ -30,6 +30,8 @@ namespace text {
 class StringVocab : public WordpieceVocab {
  public:
   explicit StringVocab(const std::vector<std::string>& vocab);
+  StringVocab(const StringVocab&) = delete;
+  StringVocab& operator=(const StringVocab&) = delete;
   LookupStatus Contains(absl::string_view key, bool* value) const override;
   absl::optional<int> LookupId(absl::string_view key) const;
   // Returns the key of `vocab_id` or empty if `vocab_id` is not valid.