From ff40b116526a3ef14f5425703bbb187bd0ccf2f4 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 10 Jun 2026 23:15:43 +0000
Subject: [PATCH 1/4] Initial plan


From 862be328ec68313ce82c18c3e0671f310dd894c1 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 10 Jun 2026 23:41:51 +0000
Subject: [PATCH 2/4] Add public SentencePieceTokenizer.Create(vocab) and
 CreateFromTokenizerJson APIs

Co-authored-by: ericstj <8918108+ericstj@users.noreply.github.com>
---
 .../Model/SentencePieceBaseModel.cs           |  44 ++++
 .../Model/SentencePieceTokenizer.cs           | 238 ++++++++++++++++++
 .../Model/SentencePieceUnigramModel.cs        | 138 ++++++++++
 .../UnigramTests.cs                           | 103 ++++++++
 4 files changed, 523 insertions(+)
diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs
index 5bd204f501..885f541c5b 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs
@@ -59,6 +59,50 @@ internal SentencePieceBaseModel(ModelProto modelProto, bool addBos = false, bool
                                 specialTokens);
         }
 
+        internal SentencePieceBaseModel(
+            bool addBos, bool addEos,
+            string bosToken, int bosId,
+            string eosToken, int eosId,
+            string unkToken, int unkId,
+            bool addDummyPrefix, bool escapeWhiteSpaces,
+            bool treatWhitespaceAsSuffix, bool byteFallback,
+            ReadOnlySpan<byte> precompiledCharsmap, bool removeExtraWhitespaces,
+            IReadOnlyDictionary<string, int>? specialTokens)
+        {
+            AddBeginningOfSentence = addBos;
+            AddEndOfSentence = addEos;
+            BeginningOfSentenceToken = bosToken;
+            BeginningOfSentenceId = Math.Max(0, bosId);
+            EndOfSentenceToken = eosToken;
+            EndOfSentenceId = Math.Max(0, eosId);
+            UnknownToken = unkToken;
+            UnknownId = Math.Max(0, unkId);
+            AddDummyPrefix = addDummyPrefix;
+            EscapeWhiteSpaces = escapeWhiteSpaces;
+            TreatWhitespaceAsSuffix = treatWhitespaceAsSuffix;
+            ByteFallback = byteFallback;
+            SpecialTokens = specialTokens;
+
+            if (specialTokens is not null && specialTokens.Count > 0)
+            {
+                InternalSpecialTokens = new Dictionary<StringSpanOrdinalKey, int>();
+                SpecialTokensReverse = new Dictionary<int, string>();
+
+                foreach (var item in specialTokens)
+                {
+                    InternalSpecialTokens.Add(new StringSpanOrdinalKey(item.Key), item.Value);
+                    SpecialTokensReverse.Add(item.Value, item.Key);
+                }
+
+                SpecialTokensRegex = new Regex(string.Join("|", specialTokens.Keys.Select(s => Regex.Escape(s))), RegexOptions.Compiled);
+            }
+
+            Normalizer = new SentencePieceNormalizer(
+                                precompiledCharsmap, removeExtraWhitespaces,
+                                addDummyPrefix, escapeWhiteSpaces,
+                                treatWhitespaceAsSuffix, specialTokens);
+        }
+
         internal Regex? SpecialTokensRegex { get; }
 
         internal Dictionary<StringSpanOrdinalKey, int>? InternalSpecialTokens { get; }
diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
index cb945d24fa..2176fba0fa 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
@@ -7,6 +7,7 @@
 using System.Buffers;
 using System.Collections.Generic;
 using System.IO;
+using System.Text.Json;
 
 namespace Microsoft.ML.Tokenizers
 {
@@ -30,6 +31,11 @@ internal SentencePieceTokenizer(ModelProto modelProto, bool addBos, bool addEos,
             };
         }
 
+        private SentencePieceTokenizer(SentencePieceBaseModel model)
+        {
+            _model = model;
+        }
+
         /// <summary>
         /// The special tokens.
         /// </summary>
@@ -457,5 +463,237 @@ public static SentencePieceTokenizer Create(
 
             return new SentencePieceTokenizer(modelProto, addBeginningOfSentence, addEndOfSentence, specialTokens);
         }
+
+        /// <summary>
+        /// Creates a Unigram <see cref="SentencePieceTokenizer"/> from an in-memory vocabulary of (piece, score) pairs.
+        /// </summary>
+        /// <param name="vocab">
+        /// The vocabulary as an ordered sequence of (piece, score) pairs. The position of each pair
+        /// in the sequence determines its token ID.
+        /// </param>
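+        /// <param name="unkId">The index (token ID) of the unknown token in <paramref name="vocab"/>. It must be a valid index into <paramref name="vocab"/>; otherwise an <see cref="ArgumentOutOfRangeException"/> is thrown.</param>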
+        /// <param name="addBeginningOfSentence">Whether to emit the beginning-of-sentence token during encoding.</param>
+        /// <param name="addEndOfSentence">Whether to emit the end-of-sentence token during encoding.</param>
+        /// <param name="precompiledCharsMap">
+        /// Optional precompiled character normalization map (as found in the SentencePiece <c>normalizer_spec.precompiled_charsmap</c>
+        /// field or in the Hugging Face <c>tokenizer.json</c> <c>normalizer.precompiled_charsmap</c> property).
+        /// Pass <see langword="default"/> to skip precompiled normalization.
+        /// </param>
+        /// <param name="addDummyPrefix">Whether to prepend the dummy whitespace prefix character (U+2581) at the start of the input.</param>
+        /// <param name="escapeWhiteSpaces">Whether to replace spaces with the dummy whitespace character (U+2581) during normalization.</param>
+        /// <param name="treatWhitespaceAsSuffix">Whether to emit the U+2581 character at the end of the last token rather than the beginning of the first token.</param>
+        /// <param name="specialTokens">Additional special tokens to recognize, supplied as a mapping of token string to token ID.</param>
+        /// <returns>A new <see cref="SentencePieceTokenizer"/> instance.</returns>
+        /// <remarks>
+        /// The beginning-of-sentence and end-of-sentence token IDs are auto-detected by looking for pieces
+        /// named <c>&lt;s&gt;</c> and <c>&lt;/s&gt;</c> in <paramref name="vocab"/>. If not found, positions 1 and 2
+        /// are used as fallbacks (the SentencePiece convention). Similarly, a <c>&lt;pad&gt;</c> piece is
+        /// detected automatically if present.
+        /// <para>
+        /// When creating the tokenizer, ensure that the vocabulary is sourced from a trusted provider.
+        /// </para>
+        /// </remarks>
+        public static SentencePieceTokenizer Create(
+            IEnumerable<(string Piece, float Score)> vocab,
+            int unkId,
+            bool addBeginningOfSentence = true,
+            bool addEndOfSentence = false,
+            ReadOnlySpan<byte> precompiledCharsMap = default,
+            bool addDummyPrefix = true,
+            bool escapeWhiteSpaces = true,
+            bool treatWhitespaceAsSuffix = false,
+            IReadOnlyDictionary<string, int>? specialTokens = null)
+        {
+            if (vocab is null)
+            {
+                throw new ArgumentNullException(nameof(vocab));
+            }
+
+            IReadOnlyList<(string Piece, float Score)> pieces = vocab as IReadOnlyList<(string Piece, float Score)>
+                ?? new List<(string Piece, float Score)>(vocab);
+
+            SentencePieceUnigramModel model = new SentencePieceUnigramModel(
+                pieces, unkId, addBeginningOfSentence, addEndOfSentence,
+                precompiledCharsMap, addDummyPrefix, escapeWhiteSpaces,
+                treatWhitespaceAsSuffix, removeExtraWhitespaces: true, specialTokens);
+
+            return new SentencePieceTokenizer(model);
+        }
+
+        /// <summary>
+        /// Creates a Unigram <see cref="SentencePieceTokenizer"/> by parsing a Hugging Face <c>tokenizer.json</c>
+        /// that contains a Unigram model (<c>model.type == "Unigram"</c>).
+        /// </summary>
+        /// <param name="tokenizerJsonStream">A stream containing the UTF-8-encoded <c>tokenizer.json</c> content.</param>
+        /// <param name="addBeginningOfSentence">Whether to emit the beginning-of-sentence token during encoding.</param>
+        /// <param name="addEndOfSentence">Whether to emit the end-of-sentence token during encoding.</param>
+        /// <param name="specialTokens">Additional special tokens to recognize, supplied as a mapping of token string to token ID.</param>
+        /// <returns>A new <see cref="SentencePieceTokenizer"/> instance.</returns>
+        /// <remarks>
+        /// The following fields are read from the JSON:
+        /// <list type="bullet">
+        ///   <item><description><c>model.vocab</c> — array of <c>[piece, score]</c> pairs (required).</description></item>
+        ///   <item><description><c>model.unk_id</c> — index of the unknown token (required).</description></item>
+        ///   <item><description><c>normalizer.precompiled_charsmap</c> (base64) — normalization map; also searched inside a <c>Sequence</c> normalizer.</description></item>
+        ///   <item><description><c>pre_tokenizer</c> of type <c>Metaspace</c> — <c>add_prefix_space</c>, <c>replacement</c>, and <c>prepend_scheme</c>; also searched inside a <c>Sequence</c> pre-tokenizer.</description></item>
+        /// </list>
+        /// <para>
+        /// When creating the tokenizer, ensure that the JSON stream is sourced from a trusted provider.
+        /// </para>
+        /// </remarks>
+        public static SentencePieceTokenizer CreateFromTokenizerJson(
+            Stream tokenizerJsonStream,
+            bool addBeginningOfSentence = true,
+            bool addEndOfSentence = false,
+            IReadOnlyDictionary<string, int>? specialTokens = null)
+        {
+            if (tokenizerJsonStream is null)
+            {
+                throw new ArgumentNullException(nameof(tokenizerJsonStream));
+            }
+
+            using JsonDocument doc = JsonDocument.Parse(tokenizerJsonStream);
+            JsonElement root = doc.RootElement;
+
+            // Validate model type
+            if (!root.TryGetProperty("model", out JsonElement modelElement))
+            {
+                throw new InvalidDataException("The tokenizer.json does not contain a 'model' property.");
+            }
+
+            if (modelElement.TryGetProperty("type", out JsonElement modelTypeElement) &&
+                !string.Equals(modelTypeElement.GetString(), "Unigram", StringComparison.OrdinalIgnoreCase))
+            {
+                throw new InvalidDataException($"Expected model type 'Unigram' but found '{modelTypeElement.GetString()}'.");
+            }
+
+            if (!modelElement.TryGetProperty("unk_id", out JsonElement unkIdElement))
+            {
+                throw new InvalidDataException("The tokenizer.json model does not contain an 'unk_id' property.");
+            }
+
+            int unkId = unkIdElement.GetInt32();
+
+            if (!modelElement.TryGetProperty("vocab", out JsonElement vocabElement) ||
+                vocabElement.ValueKind != JsonValueKind.Array)
+            {
+                throw new InvalidDataException("The tokenizer.json model does not contain a valid 'vocab' array.");
+            }
+
+            List<(string Piece, float Score)> vocab = new List<(string Piece, float Score)>(vocabElement.GetArrayLength());
+            foreach (JsonElement entry in vocabElement.EnumerateArray())
+            {
+                if (entry.ValueKind != JsonValueKind.Array || entry.GetArrayLength() < 2)
+                {
+                    throw new InvalidDataException("Each entry in 'model.vocab' must be a [piece, score] array.");
+                }
+
+                string? piece = entry[0].GetString();
+                if (piece is null)
+                {
+                    throw new InvalidDataException("A piece string in 'model.vocab' is null.");
+                }
+
+                vocab.Add((piece, entry[1].GetSingle()));
+            }
+
+            // Extract normalizer settings
+            byte[]? precompiledCharsMap = null;
+            bool addDummyPrefix = true;
+            bool removeExtraWhitespaces = true;
+            if (root.TryGetProperty("normalizer", out JsonElement normalizerElement))
+            {
+                precompiledCharsMap = ExtractPrecompiledCharsMap(normalizerElement);
+            }
+
+            // Extract pre_tokenizer settings
+            bool escapeWhiteSpaces = true;
+            bool treatWhitespaceAsSuffix = false;
+            if (root.TryGetProperty("pre_tokenizer", out JsonElement preTokenizerElement))
+            {
+                ExtractMetaspaceSettings(preTokenizerElement, ref addDummyPrefix, ref escapeWhiteSpaces, ref treatWhitespaceAsSuffix);
+            }
+
+            SentencePieceUnigramModel model = new SentencePieceUnigramModel(
+                vocab, unkId, addBeginningOfSentence, addEndOfSentence,
+                precompiledCharsMap is not null ? precompiledCharsMap.AsSpan() : default,
+                addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens);
+
+            return new SentencePieceTokenizer(model);
+        }
+
+        private static byte[]? ExtractPrecompiledCharsMap(JsonElement normalizer)
+        {
+            if (!normalizer.TryGetProperty("type", out JsonElement typeEl))
+            {
+                return null;
+            }
+
+            string? type = typeEl.GetString();
+            if (string.Equals(type, "Precompiled", StringComparison.OrdinalIgnoreCase))
+            {
+                if (normalizer.TryGetProperty("precompiled_charsmap", out JsonElement mapEl))
+                {
+                    string? base64 = mapEl.GetString();
+                    if (base64 is not null)
+                    {
+                        return Convert.FromBase64String(base64);
+                    }
+                }
+            }
+            else if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) &&
+                     normalizer.TryGetProperty("normalizers", out JsonElement normalizersEl) &&
+                     normalizersEl.ValueKind == JsonValueKind.Array)
+            {
+                foreach (JsonElement inner in normalizersEl.EnumerateArray())
+                {
+                    byte[]? result = ExtractPrecompiledCharsMap(inner);
+                    if (result is not null)
+                    {
+                        return result;
+                    }
+                }
+            }
+
+            return null;
+        }
+
+        private static void ExtractMetaspaceSettings(JsonElement preTokenizer, ref bool addDummyPrefix, ref bool escapeWhiteSpaces, ref bool treatWhitespaceAsSuffix)
+        {
+            if (!preTokenizer.TryGetProperty("type", out JsonElement typeEl))
+            {
+                return;
+            }
+
+            string? type = typeEl.GetString();
+            if (string.Equals(type, "Metaspace", StringComparison.OrdinalIgnoreCase))
+            {
+                if (preTokenizer.TryGetProperty("add_prefix_space", out JsonElement addPrefixEl))
+                {
+                    addDummyPrefix = addPrefixEl.GetBoolean();
+                }
+
+                if (preTokenizer.TryGetProperty("replacement", out JsonElement replacementEl))
+                {
+                    string? replacement = replacementEl.GetString();
+                    escapeWhiteSpaces = replacement == "\u2581"; // U+2581 LOWER ONE EIGHTH BLOCK (▁)
+                }
+
+                if (preTokenizer.TryGetProperty("prepend_scheme", out JsonElement prependSchemeEl))
+                {
+                    string? scheme = prependSchemeEl.GetString();
+                    treatWhitespaceAsSuffix = string.Equals(scheme, "last", StringComparison.OrdinalIgnoreCase);
+                }
+            }
+            else if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) &&
+                     preTokenizer.TryGetProperty("pretokenizers", out JsonElement preTokenizersEl) &&
+                     preTokenizersEl.ValueKind == JsonValueKind.Array)
+            {
+                foreach (JsonElement inner in preTokenizersEl.EnumerateArray())
+                {
+                    ExtractMetaspaceSettings(inner, ref addDummyPrefix, ref escapeWhiteSpaces, ref treatWhitespaceAsSuffix);
+                }
+            }
+        }
     }
 }
diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
index 3714206cf0..adc9f09f13 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
@@ -93,6 +93,144 @@ public SentencePieceUnigramModel(ModelProto modelProto, bool addBos, bool addEos
             }
         }
 
+        // Constructor that builds a Unigram model directly from a list of (piece, score) pairs.
+        // BOS, EOS, and PAD tokens are identified by their names ("<s>", "</s>", "<pad>") in the vocab;
+        // if not found by name, SentencePiece-conventional positions (1, 2, none) are used as fallbacks.
+        internal SentencePieceUnigramModel(
+            IReadOnlyList<(string Piece, float Score)> pieces,
+            int unkId,
+            bool addBos,
+            bool addEos,
+            ReadOnlySpan<byte> precompiledCharsmap,
+            bool addDummyPrefix,
+            bool escapeWhiteSpaces,
+            bool treatWhitespaceAsSuffix,
+            bool removeExtraWhitespaces,
+            IReadOnlyDictionary<string, int>? specialTokens)
+            : this(pieces, unkId, addBos, addEos, precompiledCharsmap, addDummyPrefix, escapeWhiteSpaces,
+                   treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens,
+                   FindSpecialTokenId(pieces, "<s>", 1),
+                   FindSpecialTokenId(pieces, "</s>", 2),
+                   FindSpecialTokenId(pieces, "<pad>", -1))
+        {
+        }
+
+        private SentencePieceUnigramModel(
+            IReadOnlyList<(string Piece, float Score)> pieces,
+            int unkId,
+            bool addBos,
+            bool addEos,
+            ReadOnlySpan<byte> precompiledCharsmap,
+            bool addDummyPrefix,
+            bool escapeWhiteSpaces,
+            bool treatWhitespaceAsSuffix,
+            bool removeExtraWhitespaces,
+            IReadOnlyDictionary<string, int>? specialTokens,
+            int bosId, int eosId, int padId)
+            : base(addBos, addEos,
+                   bosId >= 0 && bosId < GetPieceCount(pieces) ? pieces[bosId].Piece : "<s>", bosId,
+                   eosId >= 0 && eosId < GetPieceCount(pieces) ? pieces[eosId].Piece : "</s>", eosId,
+                   GetPieceAtIndex(pieces, unkId), unkId,
+                   addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, byteFallback: false,
+                   precompiledCharsmap, removeExtraWhitespaces, specialTokens)
+        {
+            Debug.Assert(pieces is not null);
+
+            _vocab = new SortedDictionary<string, int>(OrdinalUtf8StringComparer.Instance);
+            _vocabReverse = new (string Piece, float Score, ModelProto.Types.SentencePiece.Types.Type Type)[pieces!.Count];
+            _minScore = float.MaxValue;
+            _maxScore = float.MinValue;
+
+            for (int i = 0; i < pieces.Count; i++)
+            {
+                var (piece, score) = pieces[i];
+                if (i == unkId)
+                {
+                    _vocabReverse[i] = (piece, score, ModelProto.Types.SentencePiece.Types.Type.Unknown);
+                }
+                else if (i == bosId || i == eosId || (padId >= 0 && i == padId))
+                {
+                    _vocabReverse[i] = (piece, score, ModelProto.Types.SentencePiece.Types.Type.Control);
+                }
+                else
+                {
+                    _vocabReverse[i] = (piece, score, ModelProto.Types.SentencePiece.Types.Type.Normal);
+                    _vocab.Add(piece, i);
+                    _minScore = Math.Min(_minScore, score);
+                    _maxScore = Math.Max(_maxScore, score);
+                }
+            }
+
+            ByteCodeToIdOffset = _vocab.TryGetValue("<0x00>", out int id) ? id : MaxByteId;
+            OneByteUtf8EncodingMaxId = ByteCodeToIdOffset + 0x7F;
+            MaxIdByteFallbackId = ByteCodeToIdOffset + 0xFF;
+
+            _trie = new DoubleArrayTrie(_vocab);
+
+            // Add special tokens to vocab after trie is built.
+            string unkToken = pieces[unkId].Piece;
+            _vocab[unkToken] = unkId;
+            _vocabReverse[unkId] = (unkToken, 0f, ModelProto.Types.SentencePiece.Types.Type.Unknown);
+
+            if (bosId >= 0 && bosId < pieces.Count)
+            {
+                string bos = pieces[bosId].Piece;
+                _vocab[bos] = bosId;
+                _vocabReverse[bosId] = (bos, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
+            }
+
+            if (eosId >= 0 && eosId < pieces.Count)
+            {
+                string eos = pieces[eosId].Piece;
+                _vocab[eos] = eosId;
+                _vocabReverse[eosId] = (eos, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
+            }
+
+            if (padId >= 0 && padId < pieces.Count)
+            {
+                string pad = pieces[padId].Piece;
+                _vocab[pad] = padId;
+                _vocabReverse[padId] = (pad, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
+            }
+        }
+
+        private static int GetPieceCount(IReadOnlyList<(string Piece, float Score)>? pieces)
+            => pieces?.Count ?? 0;
+
+        private static string GetPieceAtIndex(IReadOnlyList<(string Piece, float Score)>? pieces, int index)
+        {
+            if (pieces is null)
+            {
+                throw new ArgumentNullException("vocab");
+            }
+
+            if ((uint)index >= (uint)pieces.Count)
+            {
+                throw new ArgumentOutOfRangeException("unkId", "unkId must be a valid index in the vocabulary.");
+            }
+
+            return pieces[index].Piece;
+        }
+
+        // Finds a special token by name; falls back to defaultId if not found (returns -1 if defaultId is out of range).
+        private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>? pieces, string tokenName, int defaultId)
+        {
+            if (pieces is null)
+            {
+                return defaultId;
+            }
+
+            for (int i = 0; i < pieces.Count; i++)
+            {
+                if (pieces[i].Piece == tokenName)
+                {
+                    return i;
+                }
+            }
+
+            return defaultId >= 0 && defaultId < pieces.Count ? defaultId : -1;
+        }
+
         public override IReadOnlyDictionary<string, int> Vocabulary => new ReadOnlyDictionary<string, int>(_vocab);
 
         public int MaxIdByteFallbackId { get; }
diff --git a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
index ca671ddebe..5e0d6765b2 100644
--- a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
+++ b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
@@ -562,5 +562,108 @@ public void SpecialTokensTest()
             Assert.Equal("</s>", _unigramTokenizer.EndOfSentenceToken);
             Assert.Equal(2, _unigramTokenizer.EndOfSentenceId);
         }
+
+        [Fact]
+        public void CreateFromVocabTest()
+        {
+            // Build a minimal synthetic Unigram vocab: <unk>=0, <s>=1, </s>=2, then normal tokens
+            var vocab = new List<(string Piece, float Score)>
+            {
+                ("<unk>", 0f),
+                ("<s>",   0f),
+                ("</s>",  0f),
+                ("▁Hello", -1f),
+                (",",      -2f),
+                ("▁world", -3f),
+                ("!",      -4f),
+            };
+
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(
+                vocab, unkId: 0, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            Assert.Equal("<unk>", tokenizer.UnknownToken);
+            Assert.Equal(0, tokenizer.UnknownId);
+            Assert.Equal("<s>", tokenizer.BeginningOfSentenceToken);
+            Assert.Equal(1, tokenizer.BeginningOfSentenceId);
+            Assert.Equal("</s>", tokenizer.EndOfSentenceToken);
+            Assert.Equal(2, tokenizer.EndOfSentenceId);
+
+            IReadOnlyList<int> ids = tokenizer.EncodeToIds("Hello, world!", addBeginningOfSentence: false, addEndOfSentence: false);
+            Assert.Equal(new[] { 3, 4, 5, 6 }, ids);
+
+            string decoded = tokenizer.Decode(ids, considerSpecialTokens: false);
+            Assert.Equal("Hello, world!", decoded);
+        }
+
+        [Fact]
+        public void CreateFromVocabNullTest()
+        {
+            Assert.Throws<ArgumentNullException>(() =>
+                SentencePieceTokenizer.Create((IEnumerable<(string Piece, float Score)>)null!, unkId: 0));
+        }
+
+        [Fact]
+        public void CreateFromVocabInvalidUnkIdTest()
+        {
+            var vocab = new List<(string Piece, float Score)> { ("a", 0f) };
+            Assert.Throws<ArgumentOutOfRangeException>(() =>
+                SentencePieceTokenizer.Create(vocab, unkId: 5));
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonTest()
+        {
+            using Stream jsonStream = File.OpenRead(Path.Combine("Paraphrase-multilingual-MiniLM-L12-v2", "tokenizer.json"));
+            SentencePieceTokenizer jsonTokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(
+                jsonStream, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            // The tokenizer.json vocab has <s>=0, <pad>=1, </s>=2, <unk>=3, then normal tokens
+            // (shifted +1 relative to .model which has <unk>=0, <s>=1, </s>=2)
+            Assert.Equal("<unk>", jsonTokenizer.UnknownToken);
+            Assert.Equal(3, jsonTokenizer.UnknownId);
+            Assert.Equal("<s>", jsonTokenizer.BeginningOfSentenceToken);
+            Assert.Equal(0, jsonTokenizer.BeginningOfSentenceId);
+            Assert.Equal("</s>", jsonTokenizer.EndOfSentenceToken);
+            Assert.Equal(2, jsonTokenizer.EndOfSentenceId);
+
+            // Pieces produced should match the .model tokenizer; IDs are shifted by +1
+            IReadOnlyList<EncodedToken> jsonTokens = jsonTokenizer.EncodeToTokens("Hello, world!", out _, addBeginningOfSentence: false, addEndOfSentence: false);
+            IReadOnlyList<EncodedToken> modelTokens = _unigramTokenizer.EncodeToTokens("Hello, world!", out _, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            Assert.Equal(modelTokens.Count, jsonTokens.Count);
+            for (int i = 0; i < modelTokens.Count; i++)
+            {
+                Assert.Equal(modelTokens[i].Value, jsonTokens[i].Value);
+                // JSON IDs are offset by 1 from the .model IDs for normal tokens
+                Assert.Equal(modelTokens[i].Id + 1, jsonTokens[i].Id);
+            }
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonNullStreamTest()
+        {
+            Assert.Throws<ArgumentNullException>(() =>
+                SentencePieceTokenizer.CreateFromTokenizerJson(null!));
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonNormalizationTest()
+        {
+            // Verify that the JSON tokenizer applies the precompiled charsmap normalization
+            // (same normalization as the .model tokenizer)
+            using Stream jsonStream = File.OpenRead(Path.Combine("Paraphrase-multilingual-MiniLM-L12-v2", "tokenizer.json"));
+            SentencePieceTokenizer jsonTokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(
+                jsonStream, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            // "㍻" normalizes to "平成" via the precompiled charsmap (NFKC normalization)
+            IReadOnlyList<int> jsonIds = jsonTokenizer.EncodeToIds("㍻", addBeginningOfSentence: false, addEndOfSentence: false);
+            IReadOnlyList<int> modelIds = _unigramTokenizer.EncodeToIds("㍻", addBeginningOfSentence: false, addEndOfSentence: false);
+
+            Assert.Equal(modelIds.Count, jsonIds.Count);
+            for (int i = 0; i < modelIds.Count; i++)
+            {
+                Assert.Equal(modelIds[i] + 1, jsonIds[i]);
+            }
+        }
     }
 }

From 112ee2b87f5efde8a09fe487bca1c862e46b4ba2 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 11 Jun 2026 00:50:01 +0000
Subject: [PATCH 3/4] Fix BOS/EOS positional fallback, normalizer type
 validation, and prepend_scheme handling

Co-authored-by: ericstj <8918108+ericstj@users.noreply.github.com>
---
 .../Model/SentencePieceTokenizer.cs           | 30 +++++++++++++++----
 .../Model/SentencePieceUnigramModel.cs        | 16 +++++-----
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
index 2176fba0fa..e570f75378 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
@@ -640,22 +640,36 @@ public static SentencePieceTokenizer CreateFromTokenizerJson(
                         return Convert.FromBase64String(base64);
                     }
                 }
+                return null;
             }
             else if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) &&
                      normalizer.TryGetProperty("normalizers", out JsonElement normalizersEl) &&
                      normalizersEl.ValueKind == JsonValueKind.Array)
             {
+                byte[]? result = null;
                 foreach (JsonElement inner in normalizersEl.EnumerateArray())
                 {
-                    byte[]? result = ExtractPrecompiledCharsMap(inner);
-                    if (result is not null)
+                    if (!inner.TryGetProperty("type", out JsonElement innerTypeEl))
+                    {
+                        continue;
+                    }
+
+                    string? innerType = innerTypeEl.GetString();
+                    if (string.Equals(innerType, "Precompiled", StringComparison.OrdinalIgnoreCase))
+                    {
+                        result = ExtractPrecompiledCharsMap(inner);
+                    }
+                    else
                     {
-                        return result;
+                        throw new NotSupportedException($"Normalizer type '{innerType}' in Sequence is not supported. Only 'Precompiled' normalizer is supported.");
                     }
                 }
+                return result;
+            }
+            else
+            {
+                throw new NotSupportedException($"Normalizer type '{type}' is not supported. Only 'Precompiled' and 'Sequence' normalizers are supported.");
             }
-
-            return null;
         }
 
         private static void ExtractMetaspaceSettings(JsonElement preTokenizer, ref bool addDummyPrefix, ref bool escapeWhiteSpaces, ref bool treatWhitespaceAsSuffix)
@@ -682,7 +696,11 @@ private static void ExtractMetaspaceSettings(JsonElement preTokenizer, ref bool
                 if (preTokenizer.TryGetProperty("prepend_scheme", out JsonElement prependSchemeEl))
                 {
                     string? scheme = prependSchemeEl.GetString();
-                    treatWhitespaceAsSuffix = string.Equals(scheme, "last", StringComparison.OrdinalIgnoreCase);
+                    // "never" suppresses the dummy prefix; "always"/"first" keep the default (true)
+                    if (string.Equals(scheme, "never", StringComparison.OrdinalIgnoreCase))
+                    {
+                        addDummyPrefix = false;
+                    }
                 }
             }
             else if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) &&
diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
index adc9f09f13..4e865504be 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
@@ -95,7 +95,7 @@ public SentencePieceUnigramModel(ModelProto modelProto, bool addBos, bool addEos
 
         // Constructor that builds a Unigram model directly from a list of (piece, score) pairs.
         // BOS, EOS, and PAD tokens are identified by their names ("<s>", "</s>", "<pad>") in the vocab;
-        // if not found by name, SentencePiece-conventional positions (1, 2, none) are used as fallbacks.
+        // if not found by name, they are treated as absent (id = -1) to avoid misidentifying real pieces.
         internal SentencePieceUnigramModel(
             IReadOnlyList<(string Piece, float Score)> pieces,
             int unkId,
@@ -109,9 +109,9 @@ internal SentencePieceUnigramModel(
             IReadOnlyDictionary<string, int>? specialTokens)
             : this(pieces, unkId, addBos, addEos, precompiledCharsmap, addDummyPrefix, escapeWhiteSpaces,
                    treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens,
-                   FindSpecialTokenId(pieces, "<s>", 1),
-                   FindSpecialTokenId(pieces, "</s>", 2),
-                   FindSpecialTokenId(pieces, "<pad>", -1))
+                   FindSpecialTokenId(pieces, "<s>"),
+                   FindSpecialTokenId(pieces, "</s>"),
+                   FindSpecialTokenId(pieces, "<pad>"))
         {
         }
 
@@ -212,12 +212,12 @@ private static string GetPieceAtIndex(IReadOnlyList<(string Piece, float Score)>
             return pieces[index].Piece;
         }
 
-        // Finds a special token by name; falls back to defaultId if not found (returns -1 if defaultId is out of range).
-        private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>? pieces, string tokenName, int defaultId)
+        // Finds a special token by name; returns -1 if not found.
+        private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>? pieces, string tokenName)
         {
             if (pieces is null)
             {
-                return defaultId;
+                return -1;
             }
 
             for (int i = 0; i < pieces.Count; i++)
@@ -228,7 +228,7 @@ private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>
                 }
             }
 
-            return defaultId >= 0 && defaultId < pieces.Count ? defaultId : -1;
+            return -1;
         }
 
         public override IReadOnlyDictionary<string, int> Vocabulary => new ReadOnlyDictionary<string, int>(_vocab);

From 7e32c05ec38457e08554ffccc193627bcaa40fb5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 11 Jun 2026 01:22:50 +0000
Subject: [PATCH 4/4] Fix null normalizer guard, recursive Sequence support,
 BOS/EOS validation, and add tests

Co-authored-by: ericstj <8918108+ericstj@users.noreply.github.com>
---
 .../Model/SentencePieceTokenizer.cs           | 15 ++--
 .../Model/SentencePieceUnigramModel.cs        | 32 ++++++-
 .../UnigramTests.cs                           | 84 +++++++++++++++++++
 3 files changed, 119 insertions(+), 12 deletions(-)

diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
index e570f75378..9768826425 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
@@ -601,7 +601,8 @@ public static SentencePieceTokenizer CreateFromTokenizerJson(
             byte[]? precompiledCharsMap = null;
             bool addDummyPrefix = true;
             bool removeExtraWhitespaces = true;
-            if (root.TryGetProperty("normalizer", out JsonElement normalizerElement))
+            if (root.TryGetProperty("normalizer", out JsonElement normalizerElement) &&
+                normalizerElement.ValueKind == JsonValueKind.Object)
             {
                 precompiledCharsMap = ExtractPrecompiledCharsMap(normalizerElement);
             }
@@ -649,19 +650,15 @@ public static SentencePieceTokenizer CreateFromTokenizerJson(
                 byte[]? result = null;
                 foreach (JsonElement inner in normalizersEl.EnumerateArray())
                 {
-                    if (!inner.TryGetProperty("type", out JsonElement innerTypeEl))
+                    if (inner.ValueKind != JsonValueKind.Object)
                     {
                         continue;
                     }
 
-                    string? innerType = innerTypeEl.GetString();
-                    if (string.Equals(innerType, "Precompiled", StringComparison.OrdinalIgnoreCase))
+                    byte[]? innerResult = ExtractPrecompiledCharsMap(inner);
+                    if (innerResult is not null)
                     {
-                        result = ExtractPrecompiledCharsMap(inner);
-                    }
-                    else
-                    {
-                        throw new NotSupportedException($"Normalizer type '{innerType}' in Sequence is not supported. Only 'Precompiled' normalizer is supported.");
+                        result = innerResult;
                     }
                 }
                 return result;
diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
index 4e865504be..c876327009 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
@@ -109,7 +109,7 @@ internal SentencePieceUnigramModel(
             IReadOnlyDictionary<string, int>? specialTokens)
             : this(pieces, unkId, addBos, addEos, precompiledCharsmap, addDummyPrefix, escapeWhiteSpaces,
                    treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens,
-                   FindSpecialTokenId(pieces, "<s>"),
+                   FindSpecialTokenId(ValidateVocab(pieces, unkId), "<s>"),
                    FindSpecialTokenId(pieces, "</s>"),
                    FindSpecialTokenId(pieces, "<pad>"))
         {
@@ -128,8 +128,8 @@ private SentencePieceUnigramModel(
             IReadOnlyDictionary<string, int>? specialTokens,
             int bosId, int eosId, int padId)
             : base(addBos, addEos,
-                   bosId >= 0 && bosId < GetPieceCount(pieces) ? pieces[bosId].Piece : "<s>", bosId,
-                   eosId >= 0 && eosId < GetPieceCount(pieces) ? pieces[eosId].Piece : "</s>", eosId,
+                   bosId >= 0 && bosId < GetPieceCount(pieces) ? pieces[bosId].Piece : "<s>", CheckSpecialId(addBos, bosId, "addBeginningOfSentence"),
+                   eosId >= 0 && eosId < GetPieceCount(pieces) ? pieces[eosId].Piece : "</s>", CheckSpecialId(addEos, eosId, "addEndOfSentence"),
                    GetPieceAtIndex(pieces, unkId), unkId,
                    addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, byteFallback: false,
                    precompiledCharsmap, removeExtraWhitespaces, specialTokens)
@@ -212,6 +212,23 @@ private static string GetPieceAtIndex(IReadOnlyList<(string Piece, float Score)>
             return pieces[index].Piece;
         }
 
+        // Constructor-argument guard: throws when pieces is null or when unkId is not a valid index into pieces;
+        // otherwise hands back the very same list so the call can be nested inside a constructor initializer.
+        private static IReadOnlyList<(string Piece, float Score)> ValidateVocab(
+            IReadOnlyList<(string Piece, float Score)>? pieces, int unkId)
+        {
+            // The exception names "vocab" rather than "pieces" (presumably the caller-facing parameter name; confirm against Create).
+            IReadOnlyList<(string Piece, float Score)> vocab = pieces ?? throw new ArgumentNullException("vocab");
+
+            bool unkInRange = unkId >= 0 && unkId < vocab.Count;
+            if (!unkInRange)
+            {
+                throw new ArgumentOutOfRangeException("unkId", "unkId must be a valid index in the vocabulary.");
+            }
+
+            return vocab;
+        }
+
         // Finds a special token by name; returns -1 if not found.
         private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>? pieces, string tokenName)
         {
@@ -231,6 +248,15 @@ private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>
             return -1;
         }
 
+        // Pass-through guard for the BOS/EOS ids: returns id unchanged unless the token is required and id is negative,
+        // in which case an ArgumentException naming paramName is thrown (FindSpecialTokenId reports a missing token as -1).
+        private static int CheckSpecialId(bool required, int id, string paramName)
+        {
+            return required && id < 0
+                ? throw new ArgumentException("The vocabulary does not contain the required special token.", paramName)
+                : id;
+        }
+
         public override IReadOnlyDictionary<string, int> Vocabulary => new ReadOnlyDictionary<string, int>(_vocab);
 
         public int MaxIdByteFallbackId { get; }
diff --git a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
index 5e0d6765b2..2d7a63373a 100644
--- a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
+++ b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
@@ -665,5 +665,89 @@ public void CreateFromTokenizerJsonNormalizationTest()
                 Assert.Equal(modelIds[i] + 1, jsonIds[i]);
             }
         }
+
+        [Fact]
+        public void CreateFromVocabNoSpecialTokensTest()
+        {
+            // None of "<s>", "</s>", "<pad>" appear below (bge-m3/potion-style layout, per the original note), so no
+            // piece may be claimed as BOS/EOS/PAD by position; id 2 used to be the positional fallback for "</s>".
+            List<(string Piece, float Score)> pieces = new()
+            {
+                ("[PAD]", 0f),   // id 0
+                ("[UNK]", 0f),   // id 1
+                (",",     -1f),  // id 2
+                ("▁Hello", -2f), // id 3
+                ("▁world", -3f), // id 4
+                ("!",     -4f),  // id 5
+            };
+
+            SentencePieceTokenizer sut = SentencePieceTokenizer.Create(
+                pieces, unkId: 1, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            // The comma must survive encoding as its own id instead of being swallowed as a special token.
+            IReadOnlyList<int> encoded = sut.EncodeToIds("Hello, world!", addBeginningOfSentence: false, addEndOfSentence: false);
+            Assert.Contains(2, encoded); // 2 == ","
+        }
+
+        [Fact]
+        public void CreateFromVocabBosRequiredButAbsentTest()
+        {
+            // No "<s>" piece exists, so asking for a BOS token must fail loudly instead of falling back to index 0.
+            List<(string Piece, float Score)> pieces = new()
+            {
+                ("[UNK]", 0f),
+                ("▁Hello", -1f),
+            };
+
+            Func<object> create = () => SentencePieceTokenizer.Create(pieces, unkId: 0, addBeginningOfSentence: true);
+            Assert.Throws<ArgumentException>(create);
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonUnsupportedNormalizerTest()
+        {
+            // The Sequence holds a "Replace" step next to the Precompiled one; loading must fail with NotSupportedException.
+            string tokenizerJson = """
+                {
+                  "model": {
+                    "type": "Unigram",
+                    "unk_id": 0,
+                    "vocab": [["<unk>", 0.0], ["a", -1.0]]
+                  },
+                  "normalizer": {
+                    "type": "Sequence",
+                    "normalizers": [
+                      { "type": "Precompiled", "precompiled_charsmap": "" },
+                      { "type": "Replace", "pattern": " ", "content": "_" }
+                    ]
+                  }
+                }
+                """;
+
+            byte[] utf8 = System.Text.Encoding.UTF8.GetBytes(tokenizerJson);
+            using Stream input = new System.IO.MemoryStream(utf8);
+            Assert.Throws<NotSupportedException>(() => SentencePieceTokenizer.CreateFromTokenizerJson(input, addBeginningOfSentence: false));
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonNullNormalizerTest()
+        {
+            // "normalizer": null is a JSON null rather than an object; loading must succeed and yield a tokenizer.
+            string tokenizerJson = """
+                {
+                  "model": {
+                    "type": "Unigram",
+                    "unk_id": 0,
+                    "vocab": [["<unk>", 0.0], ["a", -1.0]]
+                  },
+                  "normalizer": null
+                }
+                """;
+
+            byte[] utf8 = System.Text.Encoding.UTF8.GetBytes(tokenizerJson);
+            using Stream input = new System.IO.MemoryStream(utf8);
+            SentencePieceTokenizer created = SentencePieceTokenizer.CreateFromTokenizerJson(input, addBeginningOfSentence: false);
+            Assert.NotNull(created);
+        }
     }
 }