From 5b7972f66ab445e85b0c0e66341121a7273e4568 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Mon, 23 Feb 2026 23:34:50 +0000
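Subject: [PATCH 1/6] Add backwards compatibility for transformers v4.57

Transformers v5 renamed 'additional_special_tokens' to
'extra_special_tokens'. Select the key once at import time from the
installed transformers major version (SPECIAL_TOKENS_KEY) and read the
existing tokens through get_extra_special_tokens(), so that tokenizer
setup and data processing no longer hard-code the v5 name. Lower the
requirements floor to transformers>=4.57.0 accordingly.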

---
 requirements.txt                            |  2 +-
 src/instructlab/training/data_process.py    |  6 +++---
 src/instructlab/training/tokenizer_utils.py | 23 ++++++++++++++++-----
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 8769aafe..8276d45b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ wheel>=0.43
 pyyaml
 py-cpuinfo
 torch>=2.6.0
-transformers>=5.0.0
+transformers>=4.57.0
 
 datasets>=2.15.0
 numba>=0.62.0
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index 01266639..ba1903cd 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -23,7 +23,7 @@
 # First Party
 from instructlab.training.config import DataProcessArgs
 from instructlab.training.logger import setup_root_logger
-from instructlab.training.tokenizer_utils import get_sp_token, setup_tokenizer
+from instructlab.training.tokenizer_utils import get_sp_token, setup_tokenizer, SPECIAL_TOKENS_KEY
 from instructlab.training.type_definitions import Message, ProcessedMessagesData
 from instructlab.training.utils import log_rank_0, retrieve_chat_template
 
@@ -393,7 +393,7 @@ def process_messages_into_input_ids_with_chat_template(args: DataProcessArgs):
 
     # Adding after tokenizer setup as these are temp tokens, not to be saved
     tokenizer.add_special_tokens(
-        {"extra_special_tokens": ["<|pretrain|>", "<|/pretrain|>", "<|MASK|>"]}
+        {SPECIAL_TOKENS_KEY: ["<|pretrain|>", "<|/pretrain|>", "<|MASK|>"]}
     )
 
     try:
@@ -1300,7 +1300,7 @@ def configure_tokenizer(model_path: str) -> PreTrainedTokenizer:
     # Add special tokens for masking
     tokenizer.add_special_tokens(
         {
-            "extra_special_tokens": [
+            SPECIAL_TOKENS_KEY: [
                 UNMASK_BEGIN_TOKEN,
                 UNMASK_END_TOKEN,
                 UNMASK_REASONING_BEGIN_TOKEN,
diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py
index ef9a92c7..02600f25 100644
--- a/src/instructlab/training/tokenizer_utils.py
+++ b/src/instructlab/training/tokenizer_utils.py
@@ -1,11 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Third Party
+import transformers
 from transformers import AutoTokenizer, PreTrainedTokenizer
 
 # First Party
 from instructlab.training.utils import log_rank_0, retrieve_chat_template
 
+# Transformers v5 renamed 'additional_special_tokens' to 'extra_special_tokens'
+_TRANSFORMERS_V5 = int(transformers.__version__.split(".")[0]) >= 5
+SPECIAL_TOKENS_KEY = "extra_special_tokens" if _TRANSFORMERS_V5 else "additional_special_tokens"
+
+
+def get_extra_special_tokens(tokenizer: PreTrainedTokenizer) -> list[str]:
+    """Get extra/additional special tokens, compatible with both transformers v4 and v5."""
+    if _TRANSFORMERS_V5:
+        return tokenizer.extra_special_tokens
+    return tokenizer.additional_special_tokens
+
 
 def setup_tokenizer_with_existing_chat_template(
     tokenizer: PreTrainedTokenizer,
@@ -19,16 +31,17 @@ def setup_tokenizer_with_existing_chat_template(
         tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
 
     # ensure the pad token is in the extra special tokens without duplicating anything else
+    current_special = get_extra_special_tokens(tokenizer)
     new_tokens = []
-    if tokenizer.pad_token not in tokenizer.extra_special_tokens:
+    if tokenizer.pad_token not in current_special:
         new_tokens.append(tokenizer.pad_token)
-    if tokenizer.eos_token not in tokenizer.extra_special_tokens:
+    if tokenizer.eos_token not in current_special:
         new_tokens.append(tokenizer.eos_token)
 
     # ensure the tokens are being sorted to prevent any issues
     new_tokens = sorted(new_tokens)
-    extra_special_tokens = tokenizer.extra_special_tokens + new_tokens
-    tokenizer.add_special_tokens({"extra_special_tokens": extra_special_tokens})
+    extra_special_tokens = current_special + new_tokens
+    tokenizer.add_special_tokens({SPECIAL_TOKENS_KEY: extra_special_tokens})
 
     # ensure the necessary tokens exist
     assert len(get_sp_token(tokenizer, tokenizer.pad_token)) == 1, (
@@ -55,7 +68,7 @@ def setup_tokenizer_from_new_chat_template(
         }
     )
     tokenizer.add_special_tokens(
-        {"extra_special_tokens": SPECIAL_TOKENS.get_tokens_to_add()}
+        {SPECIAL_TOKENS_KEY: SPECIAL_TOKENS.get_tokens_to_add()}
     )
     if getattr(tokenizer, "add_bos_token", False) or getattr(
         tokenizer, "add_eos_token", False

From 257805c482162decbae49b09800d912b5fa2d859 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Mon, 23 Feb 2026 23:45:23 +0000
Subject: [PATCH 2/6] Run ruff formatting

---
 src/instructlab/training/data_process.py    | 6 +++++-
 src/instructlab/training/tokenizer_utils.py | 4 +++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index ba1903cd..9395b4e0 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -23,7 +23,11 @@
 # First Party
 from instructlab.training.config import DataProcessArgs
 from instructlab.training.logger import setup_root_logger
-from instructlab.training.tokenizer_utils import get_sp_token, setup_tokenizer, SPECIAL_TOKENS_KEY
+from instructlab.training.tokenizer_utils import (
+    get_sp_token,
+    setup_tokenizer,
+    SPECIAL_TOKENS_KEY,
+)
 from instructlab.training.type_definitions import Message, ProcessedMessagesData
 from instructlab.training.utils import log_rank_0, retrieve_chat_template
 
diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py
index 02600f25..0c272984 100644
--- a/src/instructlab/training/tokenizer_utils.py
+++ b/src/instructlab/training/tokenizer_utils.py
@@ -9,7 +9,9 @@
 
 # Transformers v5 renamed 'additional_special_tokens' to 'extra_special_tokens'
 _TRANSFORMERS_V5 = int(transformers.__version__.split(".")[0]) >= 5
-SPECIAL_TOKENS_KEY = "extra_special_tokens" if _TRANSFORMERS_V5 else "additional_special_tokens"
+SPECIAL_TOKENS_KEY = (
+    "extra_special_tokens" if _TRANSFORMERS_V5 else "additional_special_tokens"
+)
 
 
 def get_extra_special_tokens(tokenizer: PreTrainedTokenizer) -> list[str]:

From 2ce52b9d01fa103badafe83dd1d96295b9fc2e73 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Mon, 23 Feb 2026 23:51:57 +0000
Subject: [PATCH 3/6] Fix isort import ordering

---
 src/instructlab/training/data_process.py    | 2 +-
 src/instructlab/training/tokenizer_utils.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index 9395b4e0..8d68a725 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -24,9 +24,9 @@
 from instructlab.training.config import DataProcessArgs
 from instructlab.training.logger import setup_root_logger
 from instructlab.training.tokenizer_utils import (
+    SPECIAL_TOKENS_KEY,
     get_sp_token,
     setup_tokenizer,
-    SPECIAL_TOKENS_KEY,
 )
 from instructlab.training.type_definitions import Message, ProcessedMessagesData
 from instructlab.training.utils import log_rank_0, retrieve_chat_template
diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py
index 0c272984..1bda9d45 100644
--- a/src/instructlab/training/tokenizer_utils.py
+++ b/src/instructlab/training/tokenizer_utils.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Third Party
-import transformers
 from transformers import AutoTokenizer, PreTrainedTokenizer
+import transformers
 
 # First Party
 from instructlab.training.utils import log_rank_0, retrieve_chat_template

From 08027e3e66cf77646ad911efbd8ad86e814a3b62 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Tue, 24 Feb 2026 19:51:40 +0000
Subject: [PATCH 4/6] Fix pylint use-maxsplit-arg warning

---
 src/instructlab/training/tokenizer_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py
index 1bda9d45..96b8767c 100644
--- a/src/instructlab/training/tokenizer_utils.py
+++ b/src/instructlab/training/tokenizer_utils.py
@@ -8,7 +8,7 @@
 from instructlab.training.utils import log_rank_0, retrieve_chat_template
 
 # Transformers v5 renamed 'additional_special_tokens' to 'extra_special_tokens'
-_TRANSFORMERS_V5 = int(transformers.__version__.split(".")[0]) >= 5
+_TRANSFORMERS_V5 = int(transformers.__version__.split(".", maxsplit=1)[0]) >= 5
 SPECIAL_TOKENS_KEY = (
     "extra_special_tokens" if _TRANSFORMERS_V5 else "additional_special_tokens"
 )

From 8c44b13f7781f37e50d8a78845cd3d9447bc9625 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Wed, 25 Feb 2026 16:06:21 +0000
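Subject: [PATCH 5/6] Address CodeRabbit review: fix yanked version floor and
 update test files

Raise the transformers floor from 4.57.0 to 4.57.1, and have the unit
tests pass SPECIAL_TOKENS_KEY to add_special_tokens() instead of the
hard-coded "additional_special_tokens" key, so the tests follow the
installed transformers version.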

---
 requirements.txt                   |  2 +-
 tests/unit/test_data_process.py    | 13 ++++++-------
 tests/unit/test_unmask_messages.py |  5 +++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 8276d45b..fe7c0ea8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ wheel>=0.43
 pyyaml
 py-cpuinfo
 torch>=2.6.0
-transformers>=4.57.0
+transformers>=4.57.1
 
 datasets>=2.15.0
 numba>=0.62.0
diff --git a/tests/unit/test_data_process.py b/tests/unit/test_data_process.py
index c69cc9f8..00120501 100644
--- a/tests/unit/test_data_process.py
+++ b/tests/unit/test_data_process.py
@@ -19,6 +19,7 @@
     unmask_sample,
     wrap_masked_messages,
 )
+from instructlab.training.tokenizer_utils import SPECIAL_TOKENS_KEY
 from instructlab.training.type_definitions import Message, ProcessedMessagesData
 
 
@@ -520,11 +521,9 @@ def setUp(self):
         # Mock tokenizer for basic tests
         self.mock_tokenizer = MagicMock(spec=PreTrainedTokenizerBase)
         self.mock_tokenizer.name_or_path = "test-model"
-        self.mock_tokenizer.encode.side_effect = (
-            lambda text, add_special_tokens=False: [
-                hash(text) % 1000 for _ in text.split()
-            ]
-        )
+        self.mock_tokenizer.encode.side_effect = lambda text, add_special_tokens=False: [
+            hash(text) % 1000 for _ in text.split()
+        ]
         self.mock_tokenizer.decode.side_effect = lambda tokens: " ".join(
             [f"token_{t}" for t in tokens]
         )
@@ -853,7 +852,7 @@ def test_with_qwen_tokenizer(self):
         # Add the unmask tokens to the tokenizer
         tokenizer.add_special_tokens(
             {
-                "additional_special_tokens": [
+                SPECIAL_TOKENS_KEY: [
                     UNMASK_BEGIN_TOKEN,
                     UNMASK_END_TOKEN,
                     UNMASK_REASONING_BEGIN_TOKEN,
@@ -909,7 +908,7 @@ def test_with_phi_tokenizer(self):
         # Add the unmask tokens to the tokenizer
         tokenizer.add_special_tokens(
             {
-                "additional_special_tokens": [
+                SPECIAL_TOKENS_KEY: [
                     UNMASK_BEGIN_TOKEN,
                     UNMASK_END_TOKEN,
                     UNMASK_REASONING_BEGIN_TOKEN,
diff --git a/tests/unit/test_unmask_messages.py b/tests/unit/test_unmask_messages.py
index 7b2de057..722fa557 100644
--- a/tests/unit/test_unmask_messages.py
+++ b/tests/unit/test_unmask_messages.py
@@ -24,6 +24,7 @@
     unmask_sample,
     wrap_masked_messages,
 )
+from instructlab.training.tokenizer_utils import SPECIAL_TOKENS_KEY
 from instructlab.training.type_definitions import Message
 
 
@@ -342,7 +343,7 @@ def test_tokenizer(self):
             # Add the special tokens
             tokenizer.add_special_tokens(
                 {
-                    "additional_special_tokens": [
+                    SPECIAL_TOKENS_KEY: [
                         UNMASK_BEGIN_TOKEN,
                         UNMASK_END_TOKEN,
                         UNMASK_REASONING_BEGIN_TOKEN,
@@ -492,7 +493,7 @@ def real_tokenizer(self, request):
             # Add the special unmask tokens
             tokenizer.add_special_tokens(
                 {
-                    "additional_special_tokens": [
+                    SPECIAL_TOKENS_KEY: [
                         UNMASK_BEGIN_TOKEN,
                         UNMASK_END_TOKEN,
                         UNMASK_REASONING_BEGIN_TOKEN,

From 69f067ad9fad7f7541f94c88c38072b726b22a9e Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Wed, 25 Feb 2026 16:10:26 +0000
Subject: [PATCH 6/6] Fix ruff formatting to match CI ruff version

---
 tests/unit/test_data_process.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/unit/test_data_process.py b/tests/unit/test_data_process.py
index 00120501..8c59c25a 100644
--- a/tests/unit/test_data_process.py
+++ b/tests/unit/test_data_process.py
@@ -521,9 +521,11 @@ def setUp(self):
         # Mock tokenizer for basic tests
         self.mock_tokenizer = MagicMock(spec=PreTrainedTokenizerBase)
         self.mock_tokenizer.name_or_path = "test-model"
-        self.mock_tokenizer.encode.side_effect = lambda text, add_special_tokens=False: [
-            hash(text) % 1000 for _ in text.split()
-        ]
+        self.mock_tokenizer.encode.side_effect = (
+            lambda text, add_special_tokens=False: [
+                hash(text) % 1000 for _ in text.split()
+            ]
+        )
         self.mock_tokenizer.decode.side_effect = lambda tokens: " ".join(
             [f"token_{t}" for t in tokens]
         )