From ff40b116526a3ef14f5425703bbb187bd0ccf2f4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 10 Jun 2026 23:15:43 +0000 Subject: [PATCH 1/4] Initial plan From 862be328ec68313ce82c18c3e0671f310dd894c1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 10 Jun 2026 23:41:51 +0000 Subject: [PATCH 2/4] Add public SentencePieceTokenizer.Create(vocab) and CreateFromTokenizerJson APIs Co-authored-by: ericstj <8918108+ericstj@users.noreply.github.com> --- .../Model/SentencePieceBaseModel.cs | 44 ++++ .../Model/SentencePieceTokenizer.cs | 238 ++++++++++++++++++ .../Model/SentencePieceUnigramModel.cs | 138 ++++++++++ .../UnigramTests.cs | 103 ++++++++ 4 files changed, 523 insertions(+) diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs index 5bd204f501..885f541c5b 100644 --- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs +++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs @@ -59,6 +59,50 @@ internal SentencePieceBaseModel(ModelProto modelProto, bool addBos = false, bool specialTokens); } + internal SentencePieceBaseModel( + bool addBos, bool addEos, + string bosToken, int bosId, + string eosToken, int eosId, + string unkToken, int unkId, + bool addDummyPrefix, bool escapeWhiteSpaces, + bool treatWhitespaceAsSuffix, bool byteFallback, + ReadOnlySpan precompiledCharsmap, bool removeExtraWhitespaces, + IReadOnlyDictionary? specialTokens) + { + AddBeginningOfSentence = addBos; + AddEndOfSentence = addEos; + BeginningOfSentenceToken = bosToken; + BeginningOfSentenceId = Math.Max(0, bosId); + EndOfSentenceToken = eosToken; + EndOfSentenceId = Math.Max(0, eosId); + UnknownToken = unkToken; + UnknownId = Math.Max(0, unkId); + AddDummyPrefix = addDummyPrefix; + EscapeWhiteSpaces = escapeWhiteSpaces; + TreatWhitespaceAsSuffix = treatWhitespaceAsSuffix; + ByteFallback = byteFallback; + SpecialTokens = specialTokens; + + if (specialTokens is not null && specialTokens.Count > 0) + { + InternalSpecialTokens = new Dictionary(); + SpecialTokensReverse = new Dictionary(); + + foreach (var item in specialTokens) + { + InternalSpecialTokens.Add(new StringSpanOrdinalKey(item.Key), item.Value); + SpecialTokensReverse.Add(item.Value, item.Key); + } + + SpecialTokensRegex = new Regex(string.Join("|", specialTokens.Keys.Select(s => Regex.Escape(s))), RegexOptions.Compiled); + } + + Normalizer = new SentencePieceNormalizer( + precompiledCharsmap, removeExtraWhitespaces, + addDummyPrefix, escapeWhiteSpaces, + treatWhitespaceAsSuffix, specialTokens); + } + internal Regex? SpecialTokensRegex { get; } internal Dictionary? InternalSpecialTokens { get; } diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs index cb945d24fa..2176fba0fa 100644 --- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs @@ -7,6 +7,7 @@ using System.Buffers; using System.Collections.Generic; using System.IO; +using System.Text.Json; namespace Microsoft.ML.Tokenizers { @@ -30,6 +31,11 @@ internal SentencePieceTokenizer(ModelProto modelProto, bool addBos, bool addEos, }; } + private SentencePieceTokenizer(SentencePieceBaseModel model) + { + _model = model; + } + /// /// The special tokens. /// @@ -457,5 +463,237 @@ public static SentencePieceTokenizer Create( return new SentencePieceTokenizer(modelProto, addBeginningOfSentence, addEndOfSentence, specialTokens); } + + /// + /// Creates a Unigram from an in-memory vocabulary of (piece, score) pairs. + /// + /// + /// The vocabulary as an ordered sequence of (piece, score) pairs. The position of each pair + /// in the sequence determines its token ID. + /// + /// The index (token ID) of the unknown token in . + /// Whether to emit the beginning-of-sentence token during encoding. + /// Whether to emit the end-of-sentence token during encoding. + /// + /// Optional precompiled character normalization map (as found in the SentencePiece normalizer_spec.precompiled_charsmap + /// field or in the Hugging Face tokenizer.json normalizer.precompiled_charsmap property). + /// Pass to skip precompiled normalization. + /// + /// Whether to prepend the dummy whitespace prefix character (U+2581) at the start of the input. + /// Whether to replace spaces with the dummy whitespace character (U+2581) during normalization. + /// Whether to emit the U+2581 character at the end of the last token rather than the beginning of the first token. + /// Additional special tokens to recognize, supplied as a mapping of token string to token ID. + /// A new instance. + /// + /// The beginning-of-sentence and end-of-sentence token IDs are auto-detected by looking for pieces + /// named <s> and </s> in . If not found, positions 1 and 2 + /// are used as fallbacks (the SentencePiece convention). Similarly, a <pad> piece is + /// detected automatically if present. + /// + /// When creating the tokenizer, ensure that the vocabulary is sourced from a trusted provider. + /// + /// + public static SentencePieceTokenizer Create( + IEnumerable<(string Piece, float Score)> vocab, + int unkId, + bool addBeginningOfSentence = true, + bool addEndOfSentence = false, + ReadOnlySpan precompiledCharsMap = default, + bool addDummyPrefix = true, + bool escapeWhiteSpaces = true, + bool treatWhitespaceAsSuffix = false, + IReadOnlyDictionary? specialTokens = null) + { + if (vocab is null) + { + throw new ArgumentNullException(nameof(vocab)); + } + + IReadOnlyList<(string Piece, float Score)> pieces = vocab as IReadOnlyList<(string Piece, float Score)> + ?? new List<(string Piece, float Score)>(vocab); + + SentencePieceUnigramModel model = new SentencePieceUnigramModel( + pieces, unkId, addBeginningOfSentence, addEndOfSentence, + precompiledCharsMap, addDummyPrefix, escapeWhiteSpaces, + treatWhitespaceAsSuffix, removeExtraWhitespaces: true, specialTokens); + + return new SentencePieceTokenizer(model); + } + + /// + /// Creates a Unigram by parsing a Hugging Face tokenizer.json + /// that contains a Unigram model (model.type == "Unigram"). + /// + /// A stream containing the UTF-8-encoded tokenizer.json content. + /// Whether to emit the beginning-of-sentence token during encoding. + /// Whether to emit the end-of-sentence token during encoding. + /// Additional special tokens to recognize, supplied as a mapping of token string to token ID. + /// A new instance. + /// + /// The following fields are read from the JSON: + /// + /// model.vocab — array of [piece, score] pairs (required). + /// model.unk_id — index of the unknown token (required). + /// normalizer.precompiled_charsmap (base64) — normalization map; also searched inside a Sequence normalizer. + /// pre_tokenizer of type Metaspaceadd_prefix_space and replacement; also searched inside a Sequence pre-tokenizer. + /// + /// + /// When creating the tokenizer, ensure that the JSON stream is sourced from a trusted provider. + /// + /// + public static SentencePieceTokenizer CreateFromTokenizerJson( + Stream tokenizerJsonStream, + bool addBeginningOfSentence = true, + bool addEndOfSentence = false, + IReadOnlyDictionary? specialTokens = null) + { + if (tokenizerJsonStream is null) + { + throw new ArgumentNullException(nameof(tokenizerJsonStream)); + } + + using JsonDocument doc = JsonDocument.Parse(tokenizerJsonStream); + JsonElement root = doc.RootElement; + + // Validate model type + if (!root.TryGetProperty("model", out JsonElement modelElement)) + { + throw new InvalidDataException("The tokenizer.json does not contain a 'model' property."); + } + + if (modelElement.TryGetProperty("type", out JsonElement modelTypeElement) && + !string.Equals(modelTypeElement.GetString(), "Unigram", StringComparison.OrdinalIgnoreCase)) + { + throw new InvalidDataException($"Expected model type 'Unigram' but found '{modelTypeElement.GetString()}'."); + } + + if (!modelElement.TryGetProperty("unk_id", out JsonElement unkIdElement)) + { + throw new InvalidDataException("The tokenizer.json model does not contain an 'unk_id' property."); + } + + int unkId = unkIdElement.GetInt32(); + + if (!modelElement.TryGetProperty("vocab", out JsonElement vocabElement) || + vocabElement.ValueKind != JsonValueKind.Array) + { + throw new InvalidDataException("The tokenizer.json model does not contain a valid 'vocab' array."); + } + + List<(string Piece, float Score)> vocab = new List<(string Piece, float Score)>(vocabElement.GetArrayLength()); + foreach (JsonElement entry in vocabElement.EnumerateArray()) + { + if (entry.ValueKind != JsonValueKind.Array || entry.GetArrayLength() < 2) + { + throw new InvalidDataException("Each entry in 'model.vocab' must be a [piece, score] array."); + } + + string? piece = entry[0].GetString(); + if (piece is null) + { + throw new InvalidDataException("A piece string in 'model.vocab' is null."); + } + + vocab.Add((piece, entry[1].GetSingle())); + } + + // Extract normalizer settings + byte[]? precompiledCharsMap = null; + bool addDummyPrefix = true; + bool removeExtraWhitespaces = true; + if (root.TryGetProperty("normalizer", out JsonElement normalizerElement)) + { + precompiledCharsMap = ExtractPrecompiledCharsMap(normalizerElement); + } + + // Extract pre_tokenizer settings + bool escapeWhiteSpaces = true; + bool treatWhitespaceAsSuffix = false; + if (root.TryGetProperty("pre_tokenizer", out JsonElement preTokenizerElement)) + { + ExtractMetaspaceSettings(preTokenizerElement, ref addDummyPrefix, ref escapeWhiteSpaces, ref treatWhitespaceAsSuffix); + } + + SentencePieceUnigramModel model = new SentencePieceUnigramModel( + vocab, unkId, addBeginningOfSentence, addEndOfSentence, + precompiledCharsMap is not null ? precompiledCharsMap.AsSpan() : default, + addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens); + + return new SentencePieceTokenizer(model); + } + + private static byte[]? ExtractPrecompiledCharsMap(JsonElement normalizer) + { + if (!normalizer.TryGetProperty("type", out JsonElement typeEl)) + { + return null; + } + + string? type = typeEl.GetString(); + if (string.Equals(type, "Precompiled", StringComparison.OrdinalIgnoreCase)) + { + if (normalizer.TryGetProperty("precompiled_charsmap", out JsonElement mapEl)) + { + string? base64 = mapEl.GetString(); + if (base64 is not null) + { + return Convert.FromBase64String(base64); + } + } + } + else if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) && + normalizer.TryGetProperty("normalizers", out JsonElement normalizersEl) && + normalizersEl.ValueKind == JsonValueKind.Array) + { + foreach (JsonElement inner in normalizersEl.EnumerateArray()) + { + byte[]? result = ExtractPrecompiledCharsMap(inner); + if (result is not null) + { + return result; + } + } + } + + return null; + } + + private static void ExtractMetaspaceSettings(JsonElement preTokenizer, ref bool addDummyPrefix, ref bool escapeWhiteSpaces, ref bool treatWhitespaceAsSuffix) + { + if (!preTokenizer.TryGetProperty("type", out JsonElement typeEl)) + { + return; + } + + string? type = typeEl.GetString(); + if (string.Equals(type, "Metaspace", StringComparison.OrdinalIgnoreCase)) + { + if (preTokenizer.TryGetProperty("add_prefix_space", out JsonElement addPrefixEl)) + { + addDummyPrefix = addPrefixEl.GetBoolean(); + } + + if (preTokenizer.TryGetProperty("replacement", out JsonElement replacementEl)) + { + string? replacement = replacementEl.GetString(); + escapeWhiteSpaces = replacement == "\u2581"; // U+2581 LOWER ONE EIGHTH BLOCK (▁) + } + + if (preTokenizer.TryGetProperty("prepend_scheme", out JsonElement prependSchemeEl)) + { + string? scheme = prependSchemeEl.GetString(); + treatWhitespaceAsSuffix = string.Equals(scheme, "last", StringComparison.OrdinalIgnoreCase); + } + } + else if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) && + preTokenizer.TryGetProperty("pretokenizers", out JsonElement preTokenizersEl) && + preTokenizersEl.ValueKind == JsonValueKind.Array) + { + foreach (JsonElement inner in preTokenizersEl.EnumerateArray()) + { + ExtractMetaspaceSettings(inner, ref addDummyPrefix, ref escapeWhiteSpaces, ref treatWhitespaceAsSuffix); + } + } + } } } diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs index 3714206cf0..adc9f09f13 100644 --- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs +++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs @@ -93,6 +93,144 @@ public SentencePieceUnigramModel(ModelProto modelProto, bool addBos, bool addEos } } + // Constructor that builds a Unigram model directly from a list of (piece, score) pairs. + // BOS, EOS, and PAD tokens are identified by their names ("", "", "") in the vocab; + // if not found by name, SentencePiece-conventional positions (1, 2, none) are used as fallbacks. + internal SentencePieceUnigramModel( + IReadOnlyList<(string Piece, float Score)> pieces, + int unkId, + bool addBos, + bool addEos, + ReadOnlySpan precompiledCharsmap, + bool addDummyPrefix, + bool escapeWhiteSpaces, + bool treatWhitespaceAsSuffix, + bool removeExtraWhitespaces, + IReadOnlyDictionary? specialTokens) + : this(pieces, unkId, addBos, addEos, precompiledCharsmap, addDummyPrefix, escapeWhiteSpaces, + treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens, + FindSpecialTokenId(pieces, "", 1), + FindSpecialTokenId(pieces, "", 2), + FindSpecialTokenId(pieces, "", -1)) + { + } + + private SentencePieceUnigramModel( + IReadOnlyList<(string Piece, float Score)> pieces, + int unkId, + bool addBos, + bool addEos, + ReadOnlySpan precompiledCharsmap, + bool addDummyPrefix, + bool escapeWhiteSpaces, + bool treatWhitespaceAsSuffix, + bool removeExtraWhitespaces, + IReadOnlyDictionary? specialTokens, + int bosId, int eosId, int padId) + : base(addBos, addEos, + bosId >= 0 && bosId < GetPieceCount(pieces) ? pieces[bosId].Piece : "", bosId, + eosId >= 0 && eosId < GetPieceCount(pieces) ? pieces[eosId].Piece : "", eosId, + GetPieceAtIndex(pieces, unkId), unkId, + addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, byteFallback: false, + precompiledCharsmap, removeExtraWhitespaces, specialTokens) + { + Debug.Assert(pieces is not null); + + _vocab = new SortedDictionary(OrdinalUtf8StringComparer.Instance); + _vocabReverse = new (string Piece, float Score, ModelProto.Types.SentencePiece.Types.Type Type)[pieces!.Count]; + _minScore = float.MaxValue; + _maxScore = float.MinValue; + + for (int i = 0; i < pieces.Count; i++) + { + var (piece, score) = pieces[i]; + if (i == unkId) + { + _vocabReverse[i] = (piece, score, ModelProto.Types.SentencePiece.Types.Type.Unknown); + } + else if (i == bosId || i == eosId || (padId >= 0 && i == padId)) + { + _vocabReverse[i] = (piece, score, ModelProto.Types.SentencePiece.Types.Type.Control); + } + else + { + _vocabReverse[i] = (piece, score, ModelProto.Types.SentencePiece.Types.Type.Normal); + _vocab.Add(piece, i); + _minScore = Math.Min(_minScore, score); + _maxScore = Math.Max(_maxScore, score); + } + } + + ByteCodeToIdOffset = _vocab.TryGetValue("<0x00>", out int id) ? id : MaxByteId; + OneByteUtf8EncodingMaxId = ByteCodeToIdOffset + 0x7F; + MaxIdByteFallbackId = ByteCodeToIdOffset + 0xFF; + + _trie = new DoubleArrayTrie(_vocab); + + // Add special tokens to vocab after trie is built. + string unkToken = pieces[unkId].Piece; + _vocab[unkToken] = unkId; + _vocabReverse[unkId] = (unkToken, 0f, ModelProto.Types.SentencePiece.Types.Type.Unknown); + + if (bosId >= 0 && bosId < pieces.Count) + { + string bos = pieces[bosId].Piece; + _vocab[bos] = bosId; + _vocabReverse[bosId] = (bos, 0f, ModelProto.Types.SentencePiece.Types.Type.Control); + } + + if (eosId >= 0 && eosId < pieces.Count) + { + string eos = pieces[eosId].Piece; + _vocab[eos] = eosId; + _vocabReverse[eosId] = (eos, 0f, ModelProto.Types.SentencePiece.Types.Type.Control); + } + + if (padId >= 0 && padId < pieces.Count) + { + string pad = pieces[padId].Piece; + _vocab[pad] = padId; + _vocabReverse[padId] = (pad, 0f, ModelProto.Types.SentencePiece.Types.Type.Control); + } + } + + private static int GetPieceCount(IReadOnlyList<(string Piece, float Score)>? pieces) + => pieces?.Count ?? 0; + + private static string GetPieceAtIndex(IReadOnlyList<(string Piece, float Score)>? pieces, int index) + { + if (pieces is null) + { + throw new ArgumentNullException("vocab"); + } + + if ((uint)index >= (uint)pieces.Count) + { + throw new ArgumentOutOfRangeException("unkId", "unkId must be a valid index in the vocabulary."); + } + + return pieces[index].Piece; + } + + // Finds a special token by name; falls back to defaultId if not found (returns -1 if defaultId is out of range). + private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>? pieces, string tokenName, int defaultId) + { + if (pieces is null) + { + return defaultId; + } + + for (int i = 0; i < pieces.Count; i++) + { + if (pieces[i].Piece == tokenName) + { + return i; + } + } + + return defaultId >= 0 && defaultId < pieces.Count ? defaultId : -1; + } + public override IReadOnlyDictionary Vocabulary => new ReadOnlyDictionary(_vocab); public int MaxIdByteFallbackId { get; } diff --git a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs index ca671ddebe..5e0d6765b2 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs @@ -562,5 +562,108 @@ public void SpecialTokensTest() Assert.Equal("", _unigramTokenizer.EndOfSentenceToken); Assert.Equal(2, _unigramTokenizer.EndOfSentenceId); } + + [Fact] + public void CreateFromVocabTest() + { + // Build a minimal synthetic Unigram vocab: =0, =1, =2, then normal tokens + var vocab = new List<(string Piece, float Score)> + { + ("", 0f), + ("", 0f), + ("", 0f), + ("▁Hello", -1f), + (",", -2f), + ("▁world", -3f), + ("!", -4f), + }; + + SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create( + vocab, unkId: 0, addBeginningOfSentence: false, addEndOfSentence: false); + + Assert.Equal("", tokenizer.UnknownToken); + Assert.Equal(0, tokenizer.UnknownId); + Assert.Equal("", tokenizer.BeginningOfSentenceToken); + Assert.Equal(1, tokenizer.BeginningOfSentenceId); + Assert.Equal("", tokenizer.EndOfSentenceToken); + Assert.Equal(2, tokenizer.EndOfSentenceId); + + IReadOnlyList ids = tokenizer.EncodeToIds("Hello, world!", addBeginningOfSentence: false, addEndOfSentence: false); + Assert.Equal(new[] { 3, 4, 5, 6 }, ids); + + string decoded = tokenizer.Decode(ids, considerSpecialTokens: false); + Assert.Equal("Hello, world!", decoded); + } + + [Fact] + public void CreateFromVocabNullTest() + { + Assert.Throws(() => + SentencePieceTokenizer.Create((IEnumerable<(string Piece, float Score)>)null!, unkId: 0)); + } + + [Fact] + public void CreateFromVocabInvalidUnkIdTest() + { + var vocab = new List<(string Piece, float Score)> { ("a", 0f) }; + Assert.Throws(() => + SentencePieceTokenizer.Create(vocab, unkId: 5)); + } + + [Fact] + public void CreateFromTokenizerJsonTest() + { + using Stream jsonStream = File.OpenRead(Path.Combine("Paraphrase-multilingual-MiniLM-L12-v2", "tokenizer.json")); + SentencePieceTokenizer jsonTokenizer = SentencePieceTokenizer.CreateFromTokenizerJson( + jsonStream, addBeginningOfSentence: false, addEndOfSentence: false); + + // The tokenizer.json vocab has =0, =1, =2, =3, then normal tokens + // (shifted +1 relative to .model which has =0, =1, =2) + Assert.Equal("", jsonTokenizer.UnknownToken); + Assert.Equal(3, jsonTokenizer.UnknownId); + Assert.Equal("", jsonTokenizer.BeginningOfSentenceToken); + Assert.Equal(0, jsonTokenizer.BeginningOfSentenceId); + Assert.Equal("", jsonTokenizer.EndOfSentenceToken); + Assert.Equal(2, jsonTokenizer.EndOfSentenceId); + + // Pieces produced should match the .model tokenizer; IDs are shifted by +1 + IReadOnlyList jsonTokens = jsonTokenizer.EncodeToTokens("Hello, world!", out _, addBeginningOfSentence: false, addEndOfSentence: false); + IReadOnlyList modelTokens = _unigramTokenizer.EncodeToTokens("Hello, world!", out _, addBeginningOfSentence: false, addEndOfSentence: false); + + Assert.Equal(modelTokens.Count, jsonTokens.Count); + for (int i = 0; i < modelTokens.Count; i++) + { + Assert.Equal(modelTokens[i].Value, jsonTokens[i].Value); + // JSON IDs are offset by 1 from the .model IDs for normal tokens + Assert.Equal(modelTokens[i].Id + 1, jsonTokens[i].Id); + } + } + + [Fact] + public void CreateFromTokenizerJsonNullStreamTest() + { + Assert.Throws(() => + SentencePieceTokenizer.CreateFromTokenizerJson(null!)); + } + + [Fact] + public void CreateFromTokenizerJsonNormalizationTest() + { + // Verify that the JSON tokenizer applies the precompiled charsmap normalization + // (same normalization as the .model tokenizer) + using Stream jsonStream = File.OpenRead(Path.Combine("Paraphrase-multilingual-MiniLM-L12-v2", "tokenizer.json")); + SentencePieceTokenizer jsonTokenizer = SentencePieceTokenizer.CreateFromTokenizerJson( + jsonStream, addBeginningOfSentence: false, addEndOfSentence: false); + + // "㍻" normalizes to "平成" via the precompiled charsmap (NFKC normalization) + IReadOnlyList jsonIds = jsonTokenizer.EncodeToIds("㍻", addBeginningOfSentence: false, addEndOfSentence: false); + IReadOnlyList modelIds = _unigramTokenizer.EncodeToIds("㍻", addBeginningOfSentence: false, addEndOfSentence: false); + + Assert.Equal(modelIds.Count, jsonIds.Count); + for (int i = 0; i < modelIds.Count; i++) + { + Assert.Equal(modelIds[i] + 1, jsonIds[i]); + } + } } } From 112ee2b87f5efde8a09fe487bca1c862e46b4ba2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 11 Jun 2026 00:50:01 +0000 Subject: [PATCH 3/4] Fix BOS/EOS positional fallback, normalizer type validation, and prepend_scheme handling Co-authored-by: ericstj <8918108+ericstj@users.noreply.github.com> --- .../Model/SentencePieceTokenizer.cs | 30 +++++++++++++++---- .../Model/SentencePieceUnigramModel.cs | 16 +++++----- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs index 2176fba0fa..e570f75378 100644 --- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs @@ -640,22 +640,36 @@ public static SentencePieceTokenizer CreateFromTokenizerJson( return Convert.FromBase64String(base64); } } + return null; } else if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) && normalizer.TryGetProperty("normalizers", out JsonElement normalizersEl) && normalizersEl.ValueKind == JsonValueKind.Array) { + byte[]? result = null; foreach (JsonElement inner in normalizersEl.EnumerateArray()) { - byte[]? result = ExtractPrecompiledCharsMap(inner); - if (result is not null) + if (!inner.TryGetProperty("type", out JsonElement innerTypeEl)) + { + continue; + } + + string? innerType = innerTypeEl.GetString(); + if (string.Equals(innerType, "Precompiled", StringComparison.OrdinalIgnoreCase)) + { + result = ExtractPrecompiledCharsMap(inner); + } + else { - return result; + throw new NotSupportedException($"Normalizer type '{innerType}' in Sequence is not supported. Only 'Precompiled' normalizer is supported."); } } + return result; + } + else + { + throw new NotSupportedException($"Normalizer type '{type}' is not supported. Only 'Precompiled' and 'Sequence' normalizers are supported."); } - - return null; } private static void ExtractMetaspaceSettings(JsonElement preTokenizer, ref bool addDummyPrefix, ref bool escapeWhiteSpaces, ref bool treatWhitespaceAsSuffix) @@ -682,7 +696,11 @@ private static void ExtractMetaspaceSettings(JsonElement preTokenizer, ref bool if (preTokenizer.TryGetProperty("prepend_scheme", out JsonElement prependSchemeEl)) { string? scheme = prependSchemeEl.GetString(); - treatWhitespaceAsSuffix = string.Equals(scheme, "last", StringComparison.OrdinalIgnoreCase); + // "never" suppresses the dummy prefix; "always"/"first" keep the default (true) + if (string.Equals(scheme, "never", StringComparison.OrdinalIgnoreCase)) + { + addDummyPrefix = false; + } } } else if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) && diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs index adc9f09f13..4e865504be 100644 --- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs +++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs @@ -95,7 +95,7 @@ public SentencePieceUnigramModel(ModelProto modelProto, bool addBos, bool addEos // Constructor that builds a Unigram model directly from a list of (piece, score) pairs. // BOS, EOS, and PAD tokens are identified by their names ("", "", "") in the vocab; - // if not found by name, SentencePiece-conventional positions (1, 2, none) are used as fallbacks. + // if not found by name, they are treated as absent (id = -1) to avoid misidentifying real pieces. internal SentencePieceUnigramModel( IReadOnlyList<(string Piece, float Score)> pieces, int unkId, @@ -109,9 +109,9 @@ internal SentencePieceUnigramModel( IReadOnlyDictionary? specialTokens) : this(pieces, unkId, addBos, addEos, precompiledCharsmap, addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens, - FindSpecialTokenId(pieces, "", 1), - FindSpecialTokenId(pieces, "", 2), - FindSpecialTokenId(pieces, "", -1)) + FindSpecialTokenId(pieces, ""), + FindSpecialTokenId(pieces, ""), + FindSpecialTokenId(pieces, "")) { } @@ -212,12 +212,12 @@ private static string GetPieceAtIndex(IReadOnlyList<(string Piece, float Score)> return pieces[index].Piece; } - // Finds a special token by name; falls back to defaultId if not found (returns -1 if defaultId is out of range). - private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>? pieces, string tokenName, int defaultId) + // Finds a special token by name; returns -1 if not found. + private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>? pieces, string tokenName) { if (pieces is null) { - return defaultId; + return -1; } for (int i = 0; i < pieces.Count; i++) @@ -228,7 +228,7 @@ private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)> } } - return defaultId >= 0 && defaultId < pieces.Count ? defaultId : -1; + return -1; } public override IReadOnlyDictionary Vocabulary => new ReadOnlyDictionary(_vocab); From 7e32c05ec38457e08554ffccc193627bcaa40fb5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 11 Jun 2026 01:22:50 +0000 Subject: [PATCH 4/4] Fix null normalizer guard, recursive Sequence support, BOS/EOS validation, and add tests Co-authored-by: ericstj <8918108+ericstj@users.noreply.github.com> --- .../Model/SentencePieceTokenizer.cs | 15 ++-- .../Model/SentencePieceUnigramModel.cs | 32 ++++++- .../UnigramTests.cs | 84 +++++++++++++++++++ 3 files changed, 119 insertions(+), 12 deletions(-) diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs index e570f75378..9768826425 100644 --- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs @@ -601,7 +601,8 @@ public static SentencePieceTokenizer CreateFromTokenizerJson( byte[]? precompiledCharsMap = null; bool addDummyPrefix = true; bool removeExtraWhitespaces = true; - if (root.TryGetProperty("normalizer", out JsonElement normalizerElement)) + if (root.TryGetProperty("normalizer", out JsonElement normalizerElement) && + normalizerElement.ValueKind == JsonValueKind.Object) { precompiledCharsMap = ExtractPrecompiledCharsMap(normalizerElement); } @@ -649,19 +650,15 @@ public static SentencePieceTokenizer CreateFromTokenizerJson( byte[]? result = null; foreach (JsonElement inner in normalizersEl.EnumerateArray()) { - if (!inner.TryGetProperty("type", out JsonElement innerTypeEl)) + if (inner.ValueKind != JsonValueKind.Object) { continue; } - string? innerType = innerTypeEl.GetString(); - if (string.Equals(innerType, "Precompiled", StringComparison.OrdinalIgnoreCase)) + byte[]? innerResult = ExtractPrecompiledCharsMap(inner); + if (innerResult is not null) { - result = ExtractPrecompiledCharsMap(inner); - } - else - { - throw new NotSupportedException($"Normalizer type '{innerType}' in Sequence is not supported. Only 'Precompiled' normalizer is supported."); + result = innerResult; } } return result; diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs index 4e865504be..c876327009 100644 --- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs +++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs @@ -109,7 +109,7 @@ internal SentencePieceUnigramModel( IReadOnlyDictionary? specialTokens) : this(pieces, unkId, addBos, addEos, precompiledCharsmap, addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens, - FindSpecialTokenId(pieces, ""), + FindSpecialTokenId(ValidateVocab(pieces, unkId), ""), FindSpecialTokenId(pieces, ""), FindSpecialTokenId(pieces, "")) { @@ -128,8 +128,8 @@ private SentencePieceUnigramModel( IReadOnlyDictionary? specialTokens, int bosId, int eosId, int padId) : base(addBos, addEos, - bosId >= 0 && bosId < GetPieceCount(pieces) ? pieces[bosId].Piece : "", bosId, - eosId >= 0 && eosId < GetPieceCount(pieces) ? pieces[eosId].Piece : "", eosId, + bosId >= 0 && bosId < GetPieceCount(pieces) ? pieces[bosId].Piece : "", CheckSpecialId(addBos, bosId, "addBeginningOfSentence"), + eosId >= 0 && eosId < GetPieceCount(pieces) ? pieces[eosId].Piece : "", CheckSpecialId(addEos, eosId, "addEndOfSentence"), GetPieceAtIndex(pieces, unkId), unkId, addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, byteFallback: false, precompiledCharsmap, removeExtraWhitespaces, specialTokens) @@ -212,6 +212,23 @@ private static string GetPieceAtIndex(IReadOnlyList<(string Piece, float Score)> return pieces[index].Piece; } + // Validates pieces is not null and unkId is in range; returns pieces unchanged. + private static IReadOnlyList<(string Piece, float Score)> ValidateVocab( + IReadOnlyList<(string Piece, float Score)>? pieces, int unkId) + { + if (pieces is null) + { + throw new ArgumentNullException("vocab"); + } + + if ((uint)unkId >= (uint)pieces.Count) + { + throw new ArgumentOutOfRangeException("unkId", "unkId must be a valid index in the vocabulary."); + } + + return pieces; + } + // Finds a special token by name; returns -1 if not found. private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>? pieces, string tokenName) { @@ -231,6 +248,15 @@ private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)> return -1; } + private static int CheckSpecialId(bool required, int id, string paramName) + { + if (required && id < 0) + { + throw new ArgumentException($"The vocabulary does not contain the required special token.", paramName); + } + return id; + } + public override IReadOnlyDictionary Vocabulary => new ReadOnlyDictionary(_vocab); public int MaxIdByteFallbackId { get; } diff --git a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs index 5e0d6765b2..2d7a63373a 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs @@ -665,5 +665,89 @@ public void CreateFromTokenizerJsonNormalizationTest() Assert.Equal(modelIds[i] + 1, jsonIds[i]); } } + + [Fact] + public void CreateFromVocabNoSpecialTokensTest() + { + // Vocab without // — resembles bge-m3/potion layout. + // Verify that real pieces (e.g. ",") are not marked Control and remain encodable. + var vocab = new List<(string Piece, float Score)> + { + ("[PAD]", 0f), // 0 + ("[UNK]", 0f), // 1 + (",", -1f), // 2 + ("▁Hello", -2f), // 3 + ("▁world", -3f), // 4 + ("!", -4f), // 5 + }; + + SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create( + vocab, unkId: 1, addBeginningOfSentence: false, addEndOfSentence: false); + + // "," must be in the vocabulary and encodable (not silently dropped as Control) + IReadOnlyList ids = tokenizer.EncodeToIds("Hello, world!", addBeginningOfSentence: false, addEndOfSentence: false); + Assert.Contains(2, ids); // id 2 is "," + } + + [Fact] + public void CreateFromVocabBosRequiredButAbsentTest() + { + // Vocab without : addBeginningOfSentence:true should throw rather than emit index 0. + var vocab = new List<(string Piece, float Score)> + { + ("[UNK]", 0f), + ("▁Hello", -1f), + }; + + Assert.Throws(() => + SentencePieceTokenizer.Create(vocab, unkId: 0, addBeginningOfSentence: true)); + } + + [Fact] + public void CreateFromTokenizerJsonUnsupportedNormalizerTest() + { + // A Sequence normalizer containing a non-Precompiled step should throw NotSupportedException. + string json = """ + { + "model": { + "type": "Unigram", + "unk_id": 0, + "vocab": [["", 0.0], ["a", -1.0]] + }, + "normalizer": { + "type": "Sequence", + "normalizers": [ + { "type": "Precompiled", "precompiled_charsmap": "" }, + { "type": "Replace", "pattern": " ", "content": "_" } + ] + } + } + """; + + using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json)); + Assert.Throws(() => + SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false)); + } + + [Fact] + public void CreateFromTokenizerJsonNullNormalizerTest() + { + // A null normalizer value in JSON should not throw. + string json = """ + { + "model": { + "type": "Unigram", + "unk_id": 0, + "vocab": [["", 0.0], ["a", -1.0]] + }, + "normalizer": null + } + """; + + using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json)); + SentencePieceTokenizer tokenizer = SentencePieceTokenizer.CreateFromTokenizerJson( + stream, addBeginningOfSentence: false); + Assert.NotNull(tokenizer); + } } }