From 5b7972f66ab445e85b0c0e66341121a7273e4568 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Mon, 23 Feb 2026 23:34:50 +0000 Subject: [PATCH 1/6] Add backwards compatibility for transformers v4.57 --- requirements.txt | 2 +- src/instructlab/training/data_process.py | 6 +++--- src/instructlab/training/tokenizer_utils.py | 23 ++++++++++++++++----- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8769aafe..8276d45b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ wheel>=0.43 pyyaml py-cpuinfo torch>=2.6.0 -transformers>=5.0.0 +transformers>=4.57.0 datasets>=2.15.0 numba>=0.62.0 diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py index 01266639..ba1903cd 100644 --- a/src/instructlab/training/data_process.py +++ b/src/instructlab/training/data_process.py @@ -23,7 +23,7 @@ # First Party from instructlab.training.config import DataProcessArgs from instructlab.training.logger import setup_root_logger -from instructlab.training.tokenizer_utils import get_sp_token, setup_tokenizer +from instructlab.training.tokenizer_utils import get_sp_token, setup_tokenizer, SPECIAL_TOKENS_KEY from instructlab.training.type_definitions import Message, ProcessedMessagesData from instructlab.training.utils import log_rank_0, retrieve_chat_template @@ -393,7 +393,7 @@ def process_messages_into_input_ids_with_chat_template(args: DataProcessArgs): # Adding after tokenizer setup as these are temp tokens, not to be saved tokenizer.add_special_tokens( - {"extra_special_tokens": ["<|pretrain|>", "<|/pretrain|>", "<|MASK|>"]} + {SPECIAL_TOKENS_KEY: ["<|pretrain|>", "<|/pretrain|>", "<|MASK|>"]} ) try: @@ -1300,7 +1300,7 @@ def configure_tokenizer(model_path: str) -> PreTrainedTokenizer: # Add special tokens for masking tokenizer.add_special_tokens( { - "extra_special_tokens": [ + SPECIAL_TOKENS_KEY: [ UNMASK_BEGIN_TOKEN, UNMASK_END_TOKEN, UNMASK_REASONING_BEGIN_TOKEN, diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index ef9a92c7..02600f25 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -1,11 +1,23 @@ # SPDX-License-Identifier: Apache-2.0 # Third Party +import transformers from transformers import AutoTokenizer, PreTrainedTokenizer # First Party from instructlab.training.utils import log_rank_0, retrieve_chat_template +# Transformers v5 renamed 'additional_special_tokens' to 'extra_special_tokens' +_TRANSFORMERS_V5 = int(transformers.__version__.split(".")[0]) >= 5 +SPECIAL_TOKENS_KEY = "extra_special_tokens" if _TRANSFORMERS_V5 else "additional_special_tokens" + + +def get_extra_special_tokens(tokenizer: PreTrainedTokenizer) -> list[str]: + """Get extra/additional special tokens, compatible with both transformers v4 and v5.""" + if _TRANSFORMERS_V5: + return tokenizer.extra_special_tokens + return tokenizer.additional_special_tokens + def setup_tokenizer_with_existing_chat_template( tokenizer: PreTrainedTokenizer, @@ -19,16 +31,17 @@ def setup_tokenizer_with_existing_chat_template( tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token}) # ensure the pad token is in the extra special tokens without duplicating anything else + current_special = get_extra_special_tokens(tokenizer) new_tokens = [] - if tokenizer.pad_token not in tokenizer.extra_special_tokens: + if tokenizer.pad_token not in current_special: new_tokens.append(tokenizer.pad_token) - if tokenizer.eos_token not in tokenizer.extra_special_tokens: + if tokenizer.eos_token not in current_special: new_tokens.append(tokenizer.eos_token) # ensure the tokens are being sorted to prevent any issues new_tokens = sorted(new_tokens) - extra_special_tokens = tokenizer.extra_special_tokens + new_tokens - tokenizer.add_special_tokens({"extra_special_tokens": extra_special_tokens}) + extra_special_tokens = current_special + new_tokens + tokenizer.add_special_tokens({SPECIAL_TOKENS_KEY: extra_special_tokens}) # ensure the necessary tokens exist assert len(get_sp_token(tokenizer, tokenizer.pad_token)) == 1, ( @@ -55,7 +68,7 @@ def setup_tokenizer_from_new_chat_template( } ) tokenizer.add_special_tokens( - {"extra_special_tokens": SPECIAL_TOKENS.get_tokens_to_add()} + {SPECIAL_TOKENS_KEY: SPECIAL_TOKENS.get_tokens_to_add()} ) if getattr(tokenizer, "add_bos_token", False) or getattr( tokenizer, "add_eos_token", False From 257805c482162decbae49b09800d912b5fa2d859 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Mon, 23 Feb 2026 23:45:23 +0000 Subject: [PATCH 2/6] Run ruff formatting --- src/instructlab/training/data_process.py | 6 +++++- src/instructlab/training/tokenizer_utils.py | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py index ba1903cd..9395b4e0 100644 --- a/src/instructlab/training/data_process.py +++ b/src/instructlab/training/data_process.py @@ -23,7 +23,11 @@ # First Party from instructlab.training.config import DataProcessArgs from instructlab.training.logger import setup_root_logger -from instructlab.training.tokenizer_utils import get_sp_token, setup_tokenizer, SPECIAL_TOKENS_KEY +from instructlab.training.tokenizer_utils import ( + get_sp_token, + setup_tokenizer, + SPECIAL_TOKENS_KEY, +) from instructlab.training.type_definitions import Message, ProcessedMessagesData from instructlab.training.utils import log_rank_0, retrieve_chat_template diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index 02600f25..0c272984 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -9,7 +9,9 @@ # Transformers v5 renamed 'additional_special_tokens' to 'extra_special_tokens' _TRANSFORMERS_V5 = int(transformers.__version__.split(".")[0]) >= 5 -SPECIAL_TOKENS_KEY = "extra_special_tokens" if _TRANSFORMERS_V5 else "additional_special_tokens" +SPECIAL_TOKENS_KEY = ( + "extra_special_tokens" if _TRANSFORMERS_V5 else "additional_special_tokens" +) def get_extra_special_tokens(tokenizer: PreTrainedTokenizer) -> list[str]: From 2ce52b9d01fa103badafe83dd1d96295b9fc2e73 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Mon, 23 Feb 2026 23:51:57 +0000 Subject: [PATCH 3/6] Fix isort import ordering --- src/instructlab/training/data_process.py | 2 +- src/instructlab/training/tokenizer_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py index 9395b4e0..8d68a725 100644 --- a/src/instructlab/training/data_process.py +++ b/src/instructlab/training/data_process.py @@ -24,9 +24,9 @@ from instructlab.training.config import DataProcessArgs from instructlab.training.logger import setup_root_logger from instructlab.training.tokenizer_utils import ( + SPECIAL_TOKENS_KEY, get_sp_token, setup_tokenizer, - SPECIAL_TOKENS_KEY, ) from instructlab.training.type_definitions import Message, ProcessedMessagesData from instructlab.training.utils import log_rank_0, retrieve_chat_template diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index 0c272984..1bda9d45 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # Third Party -import transformers from transformers import AutoTokenizer, PreTrainedTokenizer +import transformers # First Party from instructlab.training.utils import log_rank_0, retrieve_chat_template From 08027e3e66cf77646ad911efbd8ad86e814a3b62 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Tue, 24 Feb 2026 19:51:40 +0000 Subject: [PATCH 4/6] Fix pylint use-maxsplit-arg warning --- src/instructlab/training/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index 1bda9d45..96b8767c 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -8,7 +8,7 @@ from instructlab.training.utils import log_rank_0, retrieve_chat_template # Transformers v5 renamed 'additional_special_tokens' to 'extra_special_tokens' -_TRANSFORMERS_V5 = int(transformers.__version__.split(".")[0]) >= 5 +_TRANSFORMERS_V5 = int(transformers.__version__.split(".", maxsplit=1)[0]) >= 5 SPECIAL_TOKENS_KEY = ( "extra_special_tokens" if _TRANSFORMERS_V5 else "additional_special_tokens" ) From 8c44b13f7781f37e50d8a78845cd3d9447bc9625 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 25 Feb 2026 16:06:21 +0000 Subject: [PATCH 5/6] Address CodeRabbit review: fix yanked version floor and update test files --- requirements.txt | 2 +- tests/unit/test_data_process.py | 13 ++++++------- tests/unit/test_unmask_messages.py | 5 +++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8276d45b..fe7c0ea8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ wheel>=0.43 pyyaml py-cpuinfo torch>=2.6.0 -transformers>=4.57.0 +transformers>=4.57.1 datasets>=2.15.0 numba>=0.62.0 diff --git a/tests/unit/test_data_process.py b/tests/unit/test_data_process.py index c69cc9f8..00120501 100644 --- a/tests/unit/test_data_process.py +++ b/tests/unit/test_data_process.py @@ -19,6 +19,7 @@ unmask_sample, wrap_masked_messages, ) +from instructlab.training.tokenizer_utils import SPECIAL_TOKENS_KEY from instructlab.training.type_definitions import Message, ProcessedMessagesData @@ -520,11 +521,9 @@ def setUp(self): # Mock tokenizer for basic tests self.mock_tokenizer = MagicMock(spec=PreTrainedTokenizerBase) self.mock_tokenizer.name_or_path = "test-model" - self.mock_tokenizer.encode.side_effect = ( - lambda text, add_special_tokens=False: [ - hash(text) % 1000 for _ in text.split() - ] - ) + self.mock_tokenizer.encode.side_effect = lambda text, add_special_tokens=False: [ + hash(text) % 1000 for _ in text.split() + ] self.mock_tokenizer.decode.side_effect = lambda tokens: " ".join( [f"token_{t}" for t in tokens] ) @@ -853,7 +852,7 @@ def test_with_qwen_tokenizer(self): # Add the unmask tokens to the tokenizer tokenizer.add_special_tokens( { - "additional_special_tokens": [ + SPECIAL_TOKENS_KEY: [ UNMASK_BEGIN_TOKEN, UNMASK_END_TOKEN, UNMASK_REASONING_BEGIN_TOKEN, @@ -909,7 +908,7 @@ def test_with_phi_tokenizer(self): # Add the unmask tokens to the tokenizer tokenizer.add_special_tokens( { - "additional_special_tokens": [ + SPECIAL_TOKENS_KEY: [ UNMASK_BEGIN_TOKEN, UNMASK_END_TOKEN, UNMASK_REASONING_BEGIN_TOKEN, diff --git a/tests/unit/test_unmask_messages.py b/tests/unit/test_unmask_messages.py index 7b2de057..722fa557 100644 --- a/tests/unit/test_unmask_messages.py +++ b/tests/unit/test_unmask_messages.py @@ -24,6 +24,7 @@ unmask_sample, wrap_masked_messages, ) +from instructlab.training.tokenizer_utils import SPECIAL_TOKENS_KEY from instructlab.training.type_definitions import Message @@ -342,7 +343,7 @@ def test_tokenizer(self): # Add the special tokens tokenizer.add_special_tokens( { - "additional_special_tokens": [ + SPECIAL_TOKENS_KEY: [ UNMASK_BEGIN_TOKEN, UNMASK_END_TOKEN, UNMASK_REASONING_BEGIN_TOKEN, @@ -492,7 +493,7 @@ def real_tokenizer(self, request): # Add the special unmask tokens tokenizer.add_special_tokens( { - "additional_special_tokens": [ + SPECIAL_TOKENS_KEY: [ UNMASK_BEGIN_TOKEN, UNMASK_END_TOKEN, UNMASK_REASONING_BEGIN_TOKEN, From 69f067ad9fad7f7541f94c88c38072b726b22a9e Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 25 Feb 2026 16:10:26 +0000 Subject: [PATCH 6/6] Fix ruff formatting to match CI ruff version --- tests/unit/test_data_process.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_data_process.py b/tests/unit/test_data_process.py index 00120501..8c59c25a 100644 --- a/tests/unit/test_data_process.py +++ b/tests/unit/test_data_process.py @@ -521,9 +521,11 @@ def setUp(self): # Mock tokenizer for basic tests self.mock_tokenizer = MagicMock(spec=PreTrainedTokenizerBase) self.mock_tokenizer.name_or_path = "test-model" - self.mock_tokenizer.encode.side_effect = lambda text, add_special_tokens=False: [ - hash(text) % 1000 for _ in text.split() - ] + self.mock_tokenizer.encode.side_effect = ( + lambda text, add_special_tokens=False: [ + hash(text) % 1000 for _ in text.split() + ] + ) self.mock_tokenizer.decode.side_effect = lambda tokens: " ".join( [f"token_{t}" for t in tokens] )