From 57f0e5f1bcd4e9f00e37f8803e59396cf53b1d57 Mon Sep 17 00:00:00 2001 From: Orion-zhen <128988082+Orion-zhen@users.noreply.github.com> Date: Sat, 1 Mar 2025 13:02:29 +0800 Subject: [PATCH 1/8] :sparkles: reasoning parser support --- common/config_models.py | 21 +++++++++++++ config_sample.yml | 10 +++++++ endpoints/OAI/types/chat_completion.py | 1 + endpoints/OAI/utils/chat_completion.py | 41 +++++++++++++++++++++++--- 4 files changed, 69 insertions(+), 4 deletions(-) diff --git a/common/config_models.py b/common/config_models.py index f78408f4..5b91a476 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -99,6 +99,27 @@ class NetworkConfig(BaseConfigModel): "Possible values: OAI, Kobold." ), ) + reasoning_parser: bool = Field( + False, + description=( + "Enable the reasoning parser (default: False).\n" + "This will split response message into reasoning_content and content fields." + ) + ) + reasoning_start_token: str = Field( + "", + description=( + "Start token for the reasoning parser (default: ).\n" + "This token is used to split the response message into reasoning_content and content fields." + ) + ) + reasoning_end_token: str = Field( + "", + description=( + "End token for the reasoning parser (default: ).\n" + "This token is used to split the response message into reasoning_content and content fields." + ) + ) # Converts all strings in the api_servers list to lowercase # NOTE: Expand if more models need this validator diff --git a/config_sample.yml b/config_sample.yml index 745433bd..fcf23683 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -31,6 +31,16 @@ network: # Possible values: OAI, Kobold. api_servers: ["OAI"] + # Enable reasoning parser (default: False). + # Do NOT enable this if the model is not a reasoning model (e.g. deepseek-r1 series) + reasoning_parser: false + + # The start token for reasoning conetnt (default: "") + reasoning_start_token: "" + + # The end token for reasoning conetnt (default: "") + reasoning_end_token: "" + # Options for logging logging: # Enable prompt logging (default: False). diff --git a/endpoints/OAI/types/chat_completion.py b/endpoints/OAI/types/chat_completion.py index 86a22477..f6d74d80 100644 --- a/endpoints/OAI/types/chat_completion.py +++ b/endpoints/OAI/types/chat_completion.py @@ -31,6 +31,7 @@ class ChatCompletionMessagePart(BaseModel): class ChatCompletionMessage(BaseModel): role: str = "user" content: Optional[Union[str, List[ChatCompletionMessagePart]]] = None + reasoning_content: Optional[str] = None tool_calls: Optional[List[ToolCall]] = None tool_calls_json: SkipJsonSchema[Optional[str]] = None diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index a646924c..93da1a80 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -17,6 +17,7 @@ handle_request_error, request_disconnect_loop, ) +from common.tabby_config import config from common.utils import unwrap from endpoints.OAI.types.chat_completion import ( ChatCompletionLogprobs, @@ -33,6 +34,20 @@ from endpoints.OAI.types.tools import ToolCall +def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: + """Extract content between tags and the remaining content. 
Only available in none-streaming mode.""" + if config.network.reasoning_start_token not in text and config.network.reasoning_end_token not in text: + return None, text + elif config.network.reasoning_start_token in text: + start_reasoning = text.split(config.network.reasoning_start_token)[1] + reasoning_content = start_reasoning.split(config.network.reasoning_end_token)[0] + content = start_reasoning.split(config.network.reasoning_end_token)[1] + return reasoning_content, content + else: + reasoning_content = text.split(config.network.reasoning_end_token)[0] + content = text.split(config.network.reasoning_end_token)[1] + return reasoning_content, content + def _create_response( request_id: str, generations: List[dict], model_name: Optional[str] ): @@ -43,9 +58,16 @@ def _create_response( choices = [] for index, generation in enumerate(generations): - message = ChatCompletionMessage( - role="assistant", content=unwrap(generation.get("text"), "") - ) + if config.network.reasoning_parser: + raw_content = unwrap(generation.get("text"), "") + reasoning_content, content = _extract_think_content(raw_content) + message = ChatCompletionMessage( + role="assistant", reasoning_content=reasoning_content, content=content + ) + else: + message = ChatCompletionMessage( + role="assistant", content=unwrap(generation.get("text"), "") + ) tool_calls = generation["tool_calls"] if tool_calls: @@ -103,6 +125,7 @@ def _create_stream_chunk( generation: Optional[dict] = None, model_name: Optional[str] = None, is_usage_chunk: bool = False, + is_reasoning_chunk: bool = False, ): """Create a chat completion stream chunk from the provided text.""" @@ -137,6 +160,8 @@ def _create_stream_chunk( else: message = ChatCompletionMessage( + role="assistant", reasoning_content=unwrap(generation.get("text"), "") + ) if is_reasoning_chunk else ChatCompletionMessage( role="assistant", content=unwrap(generation.get("text"), "") ) @@ -328,6 +353,8 @@ async def stream_generate_chat_completion( # We need to keep track of the text generated so we can resume the tool calls current_generation_text = "" + + is_reasoning_chunk = config.network.reasoning_parser # Consumer loop while True: @@ -356,8 +383,14 @@ async def stream_generate_chat_completion( if isinstance(generation, Exception): raise generation + if unwrap(generation.get("text"), "") == config.network.reasoning_end_token: + # Update reasoning chunk flag + is_reasoning_chunk = False + # And skip this token + continue + response = _create_stream_chunk( - request.state.id, generation, model_path.name + request.state.id, generation, model_path.name, is_reasoning_chunk=is_reasoning_chunk ) yield response.model_dump_json() From dfded6525403b629c3a41d28236f8b855e82326e Mon Sep 17 00:00:00 2001 From: Orion <128988082+Orion-zhen@users.noreply.github.com> Date: Sat, 1 Mar 2025 15:31:23 +0800 Subject: [PATCH 2/8] :art: format code according to ruff --- common/config_models.py | 6 +++--- endpoints/OAI/utils/chat_completion.py | 27 ++++++++++++++++++-------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/common/config_models.py b/common/config_models.py index 5b91a476..8d264041 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -104,21 +104,21 @@ class NetworkConfig(BaseConfigModel): description=( "Enable the reasoning parser (default: False).\n" "This will split response message into reasoning_content and content fields." 
- ) + ), ) reasoning_start_token: str = Field( "", description=( "Start token for the reasoning parser (default: ).\n" "This token is used to split the response message into reasoning_content and content fields." - ) + ), ) reasoning_end_token: str = Field( "", description=( "End token for the reasoning parser (default: ).\n" "This token is used to split the response message into reasoning_content and content fields." - ) + ), ) # Converts all strings in the api_servers list to lowercase diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index 93da1a80..3b0d6959 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -36,7 +36,10 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: """Extract content between tags and the remaining content. Only available in none-streaming mode.""" - if config.network.reasoning_start_token not in text and config.network.reasoning_end_token not in text: + if ( + config.network.reasoning_start_token not in text + and config.network.reasoning_end_token not in text + ): return None, text elif config.network.reasoning_start_token in text: start_reasoning = text.split(config.network.reasoning_start_token)[1] @@ -48,6 +51,7 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: content = text.split(config.network.reasoning_end_token)[1] return reasoning_content, content + def _create_response( request_id: str, generations: List[dict], model_name: Optional[str] ): @@ -159,10 +163,14 @@ def _create_stream_chunk( choices.append(choice) else: - message = ChatCompletionMessage( - role="assistant", reasoning_content=unwrap(generation.get("text"), "") - ) if is_reasoning_chunk else ChatCompletionMessage( - role="assistant", content=unwrap(generation.get("text"), "") + message = ( + ChatCompletionMessage( + role="assistant", reasoning_content=unwrap(generation.get("text"), "") + ) + if is_reasoning_chunk + else ChatCompletionMessage( + role="assistant", content=unwrap(generation.get("text"), "") + ) ) logprob_response = None @@ -353,7 +361,7 @@ async def stream_generate_chat_completion( # We need to keep track of the text generated so we can resume the tool calls current_generation_text = "" - + is_reasoning_chunk = config.network.reasoning_parser # Consumer loop @@ -388,9 +396,12 @@ async def stream_generate_chat_completion( is_reasoning_chunk = False # And skip this token continue - + response = _create_stream_chunk( - request.state.id, generation, model_path.name, is_reasoning_chunk=is_reasoning_chunk + request.state.id, + generation, + model_path.name, + is_reasoning_chunk=is_reasoning_chunk, ) yield response.model_dump_json() From 1bd3968e25d676fe93be218968ebfda3b695280c Mon Sep 17 00:00:00 2001 From: Orion <128988082+Orion-zhen@users.noreply.github.com> Date: Sat, 1 Mar 2025 15:36:28 +0800 Subject: [PATCH 3/8] :art: shorten lines --- common/config_models.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/common/config_models.py b/common/config_models.py index 8d264041..46f784af 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -103,21 +103,19 @@ class NetworkConfig(BaseConfigModel): False, description=( "Enable the reasoning parser (default: False).\n" - "This will split response message into reasoning_content and content fields." + "Split response message into reasoning_content and content fields." 
), ) reasoning_start_token: str = Field( "", description=( - "Start token for the reasoning parser (default: ).\n" - "This token is used to split the response message into reasoning_content and content fields." + "Start token for the reasoning parser (default: )." ), ) reasoning_end_token: str = Field( "", description=( - "End token for the reasoning parser (default: ).\n" - "This token is used to split the response message into reasoning_content and content fields." + "End token for the reasoning parser (default: )." ), ) From 41a8aa639f41ae2b8dfb8af0958095f3940a03c2 Mon Sep 17 00:00:00 2001 From: Orion <128988082+Orion-zhen@users.noreply.github.com> Date: Sat, 1 Mar 2025 15:38:06 +0800 Subject: [PATCH 4/8] :art: god, please let me pass format check --- common/config_models.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/common/config_models.py b/common/config_models.py index 46f784af..bbcfdda5 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -108,15 +108,11 @@ class NetworkConfig(BaseConfigModel): ) reasoning_start_token: str = Field( "", - description=( - "Start token for the reasoning parser (default: )." - ), + description=("Start token for the reasoning parser (default: )."), ) reasoning_end_token: str = Field( "", - description=( - "End token for the reasoning parser (default: )." - ), + description=("End token for the reasoning parser (default: )."), ) # Converts all strings in the api_servers list to lowercase From f0888437b1fb14fb061dc71ddcc231cf79eda39b Mon Sep 17 00:00:00 2001 From: Orion <128988082+Orion-zhen@users.noreply.github.com> Date: Sat, 1 Mar 2025 15:42:12 +0800 Subject: [PATCH 5/8] :art: TAT --- endpoints/OAI/utils/chat_completion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index 3b0d6959..c4b51050 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -35,7 +35,8 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: - """Extract content between tags and the remaining content. Only available in none-streaming mode.""" + """Extract content between tags and the remaining content. 
+ Only available in none-streaming mode.""" if ( config.network.reasoning_start_token not in text and config.network.reasoning_end_token not in text From 9efb7aab391fba0a62d32d20439bda113e56256a Mon Sep 17 00:00:00 2001 From: Orion-zhen <128988082+Orion-zhen@users.noreply.github.com> Date: Thu, 6 Mar 2025 11:42:56 +0800 Subject: [PATCH 6/8] :sparkles: handle reasoning start token --- endpoints/OAI/utils/chat_completion.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index c4b51050..ffbd751a 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -392,7 +392,19 @@ async def stream_generate_chat_completion( if isinstance(generation, Exception): raise generation - if unwrap(generation.get("text"), "") == config.network.reasoning_end_token: + if ( + unwrap(generation.get("text"), "") + == config.network.reasoning_start_token + and config.network.reasoning_parser + ): + # Update reasoning chunk flag + is_reasoning_chunk = True + # And skip this token + continue + if ( + unwrap(generation.get("text"), "") == config.network.reasoning_end_token + and config.network.reasoning_parser + ): # Update reasoning chunk flag is_reasoning_chunk = False # And skip this token From 45190004cfd723b045a7853cd4ad466d006fd6d7 Mon Sep 17 00:00:00 2001 From: Orion-zhen <128988082+Orion-zhen@users.noreply.github.com> Date: Tue, 18 Mar 2025 23:15:38 +0800 Subject: [PATCH 7/8] :wrench: move reasoning config to model section --- common/config_models.py | 30 +++++++++++++------------- config_sample.yml | 20 ++++++++--------- endpoints/OAI/utils/chat_completion.py | 29 ++++++++++++------------- 3 files changed, 39 insertions(+), 40 deletions(-) diff --git a/common/config_models.py b/common/config_models.py index bbcfdda5..d32f7e95 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -99,21 +99,6 @@ class NetworkConfig(BaseConfigModel): "Possible values: OAI, Kobold." ), ) - reasoning_parser: bool = Field( - False, - description=( - "Enable the reasoning parser (default: False).\n" - "Split response message into reasoning_content and content fields." - ), - ) - reasoning_start_token: str = Field( - "", - description=("Start token for the reasoning parser (default: )."), - ) - reasoning_end_token: str = Field( - "", - description=("End token for the reasoning parser (default: )."), - ) # Converts all strings in the api_servers list to lowercase # NOTE: Expand if more models need this validator @@ -309,6 +294,21 @@ class ModelConfig(BaseConfigModel): "Enables vision support if the model supports it. (default: False)" ), ) + reasoning: bool = Field( + False, + description=( + "Enable the reasoning parser (default: False).\n" + "Split response message into reasoning_content and content fields." + ), + ) + reasoning_start_token: str = Field( + "", + description=("Start token for the reasoning parser (default: )."), + ) + reasoning_end_token: str = Field( + "", + description=("End token for the reasoning parser (default: )."), + ) num_experts_per_token: Optional[int] = Field( None, description=( diff --git a/config_sample.yml b/config_sample.yml index fcf23683..0d16a4ab 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -31,16 +31,6 @@ network: # Possible values: OAI, Kobold. api_servers: ["OAI"] - # Enable reasoning parser (default: False). - # Do NOT enable this if the model is not a reasoning model (e.g. 
deepseek-r1 series) - reasoning_parser: false - - # The start token for reasoning conetnt (default: "") - reasoning_start_token: "" - - # The end token for reasoning conetnt (default: "") - reasoning_end_token: "" - # Options for logging logging: # Enable prompt logging (default: False). @@ -146,6 +136,16 @@ model: # Enables vision support if the model supports it. (default: False) vision: false + # Enable reasoning parser (default: False). + # Do NOT enable this if the model is not a reasoning model (e.g. deepseek-r1 series) + reasoning: false + + # The start token for reasoning conetnt (default: "") + reasoning_start_token: "" + + # The end token for reasoning conetnt (default: "") + reasoning_end_token: "" + # Number of experts to use per token. # Fetched from the model's config.json if empty. # NOTE: For MoE models only. diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index ffbd751a..3c639cdc 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -38,18 +38,18 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: """Extract content between tags and the remaining content. Only available in none-streaming mode.""" if ( - config.network.reasoning_start_token not in text - and config.network.reasoning_end_token not in text + config.model.reasoning_start_token not in text + and config.model.reasoning_end_token not in text ): return None, text - elif config.network.reasoning_start_token in text: - start_reasoning = text.split(config.network.reasoning_start_token)[1] - reasoning_content = start_reasoning.split(config.network.reasoning_end_token)[0] - content = start_reasoning.split(config.network.reasoning_end_token)[1] + elif config.model.reasoning_start_token in text: + start_reasoning = text.split(config.model.reasoning_start_token)[1] + reasoning_content = start_reasoning.split(config.model.reasoning_end_token)[0] + content = start_reasoning.split(config.model.reasoning_end_token)[1] return reasoning_content, content else: - reasoning_content = text.split(config.network.reasoning_end_token)[0] - content = text.split(config.network.reasoning_end_token)[1] + reasoning_content = text.split(config.model.reasoning_end_token)[0] + content = text.split(config.model.reasoning_end_token)[1] return reasoning_content, content @@ -63,7 +63,7 @@ def _create_response( choices = [] for index, generation in enumerate(generations): - if config.network.reasoning_parser: + if config.model.reasoning: raw_content = unwrap(generation.get("text"), "") reasoning_content, content = _extract_think_content(raw_content) message = ChatCompletionMessage( @@ -363,7 +363,7 @@ async def stream_generate_chat_completion( # We need to keep track of the text generated so we can resume the tool calls current_generation_text = "" - is_reasoning_chunk = config.network.reasoning_parser + is_reasoning_chunk = config.model.reasoning # Consumer loop while True: @@ -393,17 +393,16 @@ async def stream_generate_chat_completion( raise generation if ( - unwrap(generation.get("text"), "") - == config.network.reasoning_start_token - and config.network.reasoning_parser + unwrap(generation.get("text"), "") == config.model.reasoning_start_token + and config.model.reasoning ): # Update reasoning chunk flag is_reasoning_chunk = True # And skip this token continue if ( - unwrap(generation.get("text"), "") == config.network.reasoning_end_token - and config.network.reasoning_parser + unwrap(generation.get("text"), "") == 
config.model.reasoning_end_token + and config.model.reasoning ): # Update reasoning chunk flag is_reasoning_chunk = False From beef2d081f63edab0f5f72c83ed3940c99ba63bb Mon Sep 17 00:00:00 2001 From: Orion-zhen <128988082+Orion-zhen@users.noreply.github.com> Date: Fri, 21 Mar 2025 09:31:32 +0800 Subject: [PATCH 8/8] :sparkles: strip contents --- endpoints/OAI/utils/chat_completion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index a728f29b..56c1d3b6 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -46,11 +46,11 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: start_reasoning = text.split(config.model.reasoning_start_token)[1] reasoning_content = start_reasoning.split(config.model.reasoning_end_token)[0] content = start_reasoning.split(config.model.reasoning_end_token)[1] - return reasoning_content, content + return reasoning_content.strip(), content.strip() else: reasoning_content = text.split(config.model.reasoning_end_token)[0] content = text.split(config.model.reasoning_end_token)[1] - return reasoning_content, content + return reasoning_content.strip(), content.strip() def _create_response(
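
Usage notes (illustrative sketches, with assumptions noted inline; none of the values below are defined by the diffs above).

After the last patch, _extract_think_content splits a finished completion on the configured reasoning tokens and strips both halves. The following is a minimal standalone sketch of that split. It hard-codes "<think>"/"</think>" as assumed stand-ins for config.model.reasoning_start_token and config.model.reasoning_end_token (the sample config leaves those values to the user), and it uses str.partition, which matches the patch's split() behaviour when each token appears exactly once.

# Standalone sketch of the start/end-token split introduced by this series.
# "<think>"/"</think>" are assumed stand-ins for the configurable
# model.reasoning_start_token / model.reasoning_end_token values.
from typing import Optional, Tuple

REASONING_START = "<think>"
REASONING_END = "</think>"


def extract_think_content(text: str) -> Tuple[Optional[str], Optional[str]]:
    """Split raw completion text into (reasoning_content, content)."""
    if REASONING_START not in text and REASONING_END not in text:
        # No reasoning markers: the whole message is ordinary content.
        return None, text
    if REASONING_START in text:
        after_start = text.split(REASONING_START, 1)[1]
        reasoning, _, content = after_start.partition(REASONING_END)
    else:
        # Opening tag missing (some runtimes drop it): the prefix is reasoning.
        reasoning, _, content = text.partition(REASONING_END)
    return reasoning.strip(), content.strip()


raw = "<think>2 + 2 is basic arithmetic; the sum is 4.</think>The answer is 4."
print(extract_think_content(raw))
# -> ('2 + 2 is basic arithmetic; the sum is 4.', 'The answer is 4.')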
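
For a quick end-to-end check of the non-streaming path, a request like the one below can be sent to a local TabbyAPI instance with a reasoning model loaded and model.reasoning enabled. The host, port, API key, and prompt are placeholders for illustration, not values defined by this series.

# Placeholder request against a local OpenAI-compatible /v1/chat/completions
# endpoint; host, port, and key are assumptions, not part of the patches.
import requests

resp = requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    headers={"Authorization": "Bearer sk-placeholder"},
    json={
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "max_tokens": 256,
    },
    timeout=120,
)
message = resp.json()["choices"][0]["message"]
print("reasoning_content:", message.get("reasoning_content"))
print("content:", message.get("content"))

With model.reasoning left at its default of false, reasoning_content simply comes back null and the full text stays in content, matching the unchanged code path.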
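
In the streaming path the server skips the start and end token chunks themselves and flags everything in between as reasoning, so deltas carry reasoning_content first and plain content afterwards. A consumer sketch follows; the endpoint, key, "data: " SSE framing, [DONE] sentinel, and delta field name are all assumed to follow the usual OpenAI-compatible conventions rather than being spelled out in these patches.

# Placeholder streaming consumer; connection details and SSE framing are
# assumptions for illustration.
import json

import requests

with requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    headers={"Authorization": "Bearer sk-placeholder"},
    json={
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "stream": True,
    },
    stream=True,
    timeout=120,
) as resp:
    for line in resp.iter_lines():
        if not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):].strip()
        if payload == b"[DONE]":
            break
        delta = json.loads(payload)["choices"][0]["delta"]
        if delta.get("reasoning_content"):
            print("[think]", delta["reasoning_content"], end="", flush=True)
        elif delta.get("content"):
            print(delta["content"], end="", flush=True)
print()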