66
77from tokenizers import Tokenizer
88
9+ from model2vec .distill .utils import Token
10+
911logger = logging .getLogger (__name__ )
1012
1113
1719}
1820
1921
20- def _pre_tokenize_vocabulary (tokenizer : Tokenizer , tokens : list [str ]) -> list [str ]:
22+ def _pre_tokenize_vocabulary (tokenizer : Tokenizer , tokens : list [Token ]) -> list [str ]:
2123 """
2224 Apply pre-tokenization to vocabulary tokens if a pre-tokenizer is present.
2325
@@ -33,14 +35,14 @@ def _pre_tokenize_vocabulary(tokenizer: Tokenizer, tokens: list[str]) -> list[st
3335
3436 if tokenizer .pre_tokenizer is not None :
3537 for token in tokens :
36- if token in current_tokenizer_vocab :
37- pre_tokenized_tokens .append (token )
38+ if token . is_subword :
39+ pre_tokenized_tokens .append (token . form )
3840 else :
3941 # We are 100% sure that all pretokenized tokens will have length 1.
40- pretokenized_tokens , _ = zip (* tokenizer .pre_tokenizer .pre_tokenize_str (f" { token } " ))
42+ pretokenized_tokens , _ = zip (* tokenizer .pre_tokenizer .pre_tokenize_str (f" { token . form } " ))
4143 pre_tokenized_tokens .append (pretokenized_tokens [- 1 ])
4244 else :
43- pre_tokenized_tokens = tokens
45+ pre_tokenized_tokens = [ token . form for token in tokens ]
4446
4547 return pre_tokenized_tokens
4648
@@ -106,7 +108,7 @@ def _make_new_merges_from_vocab(
106108
107109
108110def replace_vocabulary (
109- tokenizer : Tokenizer , new_vocabulary : list [str ], unk_token : str | None , pad_token : str | None
111+ tokenizer : Tokenizer , new_vocabulary : list [Token ], unk_token : str | None , pad_token : str | None
110112) -> Tokenizer :
111113 """Replace the vocabulary of a tokenizer with a new one."""
112114 tokenizer_json : dict [str , Any ] = json .loads (tokenizer .to_str ())
@@ -139,8 +141,8 @@ def replace_vocabulary(
139141 vocab = tokenizer_json ["model" ]["vocab" ]
140142 unk_token = vocab [unk_id ][0 ] if unk_id is not None else None
141143 current_probas = dict (tokenizer_json ["model" ]["vocab" ])
142- lowest_proba = min (current_probas .values ())
143- new_probas = {word : current_probas .get (word , lowest_proba ) for word in pre_tokenized_tokens }
144+ avg_proba = sum (current_probas .values ()) / len ( current_probas )
145+ new_probas = {word : current_probas .get (word , avg_proba ) for word in pre_tokenized_tokens }
144146 tokenizer_json ["model" ]["vocab" ] = sorted (new_probas .items (), key = lambda x : x [1 ], reverse = True )
145147
146148 tokens , _ = zip (* tokenizer_json ["model" ]["vocab" ])
0 commit comments