From 57f0e5f1bcd4e9f00e37f8803e59396cf53b1d57 Mon Sep 17 00:00:00 2001
From: Orion-zhen <128988082+Orion-zhen@users.noreply.github.com>
Date: Sat, 1 Mar 2025 13:02:29 +0800
Subject: [PATCH 1/8] :sparkles: reasoning parser support
---
common/config_models.py | 21 +++++++++++++
config_sample.yml | 10 +++++++
endpoints/OAI/types/chat_completion.py | 1 +
endpoints/OAI/utils/chat_completion.py | 41 +++++++++++++++++++++++---
4 files changed, 69 insertions(+), 4 deletions(-)
diff --git a/common/config_models.py b/common/config_models.py
index f78408f4..5b91a476 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -99,6 +99,27 @@ class NetworkConfig(BaseConfigModel):
"Possible values: OAI, Kobold."
),
)
+ reasoning_parser: bool = Field(
+ False,
+ description=(
+ "Enable the reasoning parser (default: False).\n"
+ "This will split response message into reasoning_content and content fields."
+ )
+ )
+ reasoning_start_token: str = Field(
+ "",
+ description=(
+ "Start token for the reasoning parser (default: ).\n"
+ "This token is used to split the response message into reasoning_content and content fields."
+ )
+ )
+ reasoning_end_token: str = Field(
+ "",
+ description=(
+ "End token for the reasoning parser (default: ).\n"
+ "This token is used to split the response message into reasoning_content and content fields."
+ )
+ )
# Converts all strings in the api_servers list to lowercase
# NOTE: Expand if more models need this validator
diff --git a/config_sample.yml b/config_sample.yml
index 745433bd..fcf23683 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -31,6 +31,16 @@ network:
# Possible values: OAI, Kobold.
api_servers: ["OAI"]
+ # Enable reasoning parser (default: False).
+ # Do NOT enable this unless the model is a reasoning model (e.g. the deepseek-r1 series)
+ reasoning_parser: false
+
+ # The start token for reasoning content (default: "")
+ reasoning_start_token: ""
+
+ # The end token for reasoning content (default: "")
+ reasoning_end_token: ""
+
# Options for logging
logging:
# Enable prompt logging (default: False).
diff --git a/endpoints/OAI/types/chat_completion.py b/endpoints/OAI/types/chat_completion.py
index 86a22477..f6d74d80 100644
--- a/endpoints/OAI/types/chat_completion.py
+++ b/endpoints/OAI/types/chat_completion.py
@@ -31,6 +31,7 @@ class ChatCompletionMessagePart(BaseModel):
class ChatCompletionMessage(BaseModel):
role: str = "user"
content: Optional[Union[str, List[ChatCompletionMessagePart]]] = None
+ reasoning_content: Optional[str] = None
tool_calls: Optional[List[ToolCall]] = None
tool_calls_json: SkipJsonSchema[Optional[str]] = None
diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index a646924c..93da1a80 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -17,6 +17,7 @@
handle_request_error,
request_disconnect_loop,
)
+from common.tabby_config import config
from common.utils import unwrap
from endpoints.OAI.types.chat_completion import (
ChatCompletionLogprobs,
@@ -33,6 +34,20 @@
from endpoints.OAI.types.tools import ToolCall
+def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]:
+ """Extract content between tags and the remaining content. Only available in none-streaming mode."""
+ if config.network.reasoning_start_token not in text and config.network.reasoning_end_token not in text:
+ return None, text
+ elif config.network.reasoning_start_token in text:
+ start_reasoning = text.split(config.network.reasoning_start_token)[1]
+ reasoning_content = start_reasoning.split(config.network.reasoning_end_token)[0]
+ content = start_reasoning.split(config.network.reasoning_end_token)[1]
+ return reasoning_content, content
+ else:
+ reasoning_content = text.split(config.network.reasoning_end_token)[0]
+ content = text.split(config.network.reasoning_end_token)[1]
+ return reasoning_content, content
+
def _create_response(
request_id: str, generations: List[dict], model_name: Optional[str]
):
@@ -43,9 +58,16 @@ def _create_response(
choices = []
for index, generation in enumerate(generations):
- message = ChatCompletionMessage(
- role="assistant", content=unwrap(generation.get("text"), "")
- )
+ if config.network.reasoning_parser:
+ raw_content = unwrap(generation.get("text"), "")
+ reasoning_content, content = _extract_think_content(raw_content)
+ message = ChatCompletionMessage(
+ role="assistant", reasoning_content=reasoning_content, content=content
+ )
+ else:
+ message = ChatCompletionMessage(
+ role="assistant", content=unwrap(generation.get("text"), "")
+ )
tool_calls = generation["tool_calls"]
if tool_calls:
@@ -103,6 +125,7 @@ def _create_stream_chunk(
generation: Optional[dict] = None,
model_name: Optional[str] = None,
is_usage_chunk: bool = False,
+ is_reasoning_chunk: bool = False,
):
"""Create a chat completion stream chunk from the provided text."""
@@ -137,6 +160,8 @@ def _create_stream_chunk(
else:
message = ChatCompletionMessage(
+ role="assistant", reasoning_content=unwrap(generation.get("text"), "")
+ ) if is_reasoning_chunk else ChatCompletionMessage(
role="assistant", content=unwrap(generation.get("text"), "")
)
@@ -328,6 +353,8 @@ async def stream_generate_chat_completion(
# We need to keep track of the text generated so we can resume the tool calls
current_generation_text = ""
+
+ is_reasoning_chunk = config.network.reasoning_parser
# Consumer loop
while True:
@@ -356,8 +383,14 @@ async def stream_generate_chat_completion(
if isinstance(generation, Exception):
raise generation
+ if unwrap(generation.get("text"), "") == config.network.reasoning_end_token:
+ # Update reasoning chunk flag
+ is_reasoning_chunk = False
+ # And skip this token
+ continue
+
response = _create_stream_chunk(
- request.state.id, generation, model_path.name
+ request.state.id, generation, model_path.name, is_reasoning_chunk=is_reasoning_chunk
)
yield response.model_dump_json()
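For illustration only, here is a condensed, standalone sketch of the non-streaming split that patch 1 wires into _create_response. It is not the patch's code verbatim, and the <think> / </think> marker strings are assumed example values; the real markers come from the reasoning_start_token and reasoning_end_token settings.

from typing import Optional, Tuple

REASONING_START = "<think>"   # assumed example value; the patch reads reasoning_start_token
REASONING_END = "</think>"    # assumed example value; the patch reads reasoning_end_token

def split_reasoning(text: str) -> Tuple[Optional[str], str]:
    """Split a raw completion into (reasoning_content, content)."""
    if REASONING_START not in text and REASONING_END not in text:
        # No markers at all: the whole message is ordinary content.
        return None, text
    if REASONING_START in text:
        # Drop anything before the start marker.
        text = text.split(REASONING_START, 1)[1]
    # Everything up to the end marker is reasoning; the rest is the answer.
    reasoning, _, content = text.partition(REASONING_END)
    return reasoning, content

print(split_reasoning("<think>Check the sum first.</think>2 + 2 = 4"))
# ('Check the sum first.', '2 + 2 = 4')

Later in the series, patch 8 additionally strips whitespace from both halves so newlines around the markers do not leak into the response.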
From dfded6525403b629c3a41d28236f8b855e82326e Mon Sep 17 00:00:00 2001
From: Orion <128988082+Orion-zhen@users.noreply.github.com>
Date: Sat, 1 Mar 2025 15:31:23 +0800
Subject: [PATCH 2/8] :art: format code according to ruff
---
common/config_models.py | 6 +++---
endpoints/OAI/utils/chat_completion.py | 27 ++++++++++++++++++--------
2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/common/config_models.py b/common/config_models.py
index 5b91a476..8d264041 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -104,21 +104,21 @@ class NetworkConfig(BaseConfigModel):
description=(
"Enable the reasoning parser (default: False).\n"
"This will split response message into reasoning_content and content fields."
- )
+ ),
)
reasoning_start_token: str = Field(
"",
description=(
"Start token for the reasoning parser (default: ).\n"
"This token is used to split the response message into reasoning_content and content fields."
- )
+ ),
)
reasoning_end_token: str = Field(
"",
description=(
"End token for the reasoning parser (default: ).\n"
"This token is used to split the response message into reasoning_content and content fields."
- )
+ ),
)
# Converts all strings in the api_servers list to lowercase
diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index 93da1a80..3b0d6959 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -36,7 +36,10 @@
def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]:
"""Extract content between tags and the remaining content. Only available in none-streaming mode."""
- if config.network.reasoning_start_token not in text and config.network.reasoning_end_token not in text:
+ if (
+ config.network.reasoning_start_token not in text
+ and config.network.reasoning_end_token not in text
+ ):
return None, text
elif config.network.reasoning_start_token in text:
start_reasoning = text.split(config.network.reasoning_start_token)[1]
@@ -48,6 +51,7 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]:
content = text.split(config.network.reasoning_end_token)[1]
return reasoning_content, content
+
def _create_response(
request_id: str, generations: List[dict], model_name: Optional[str]
):
@@ -159,10 +163,14 @@ def _create_stream_chunk(
choices.append(choice)
else:
- message = ChatCompletionMessage(
- role="assistant", reasoning_content=unwrap(generation.get("text"), "")
- ) if is_reasoning_chunk else ChatCompletionMessage(
- role="assistant", content=unwrap(generation.get("text"), "")
+ message = (
+ ChatCompletionMessage(
+ role="assistant", reasoning_content=unwrap(generation.get("text"), "")
+ )
+ if is_reasoning_chunk
+ else ChatCompletionMessage(
+ role="assistant", content=unwrap(generation.get("text"), "")
+ )
)
logprob_response = None
@@ -353,7 +361,7 @@ async def stream_generate_chat_completion(
# We need to keep track of the text generated so we can resume the tool calls
current_generation_text = ""
-
+
is_reasoning_chunk = config.network.reasoning_parser
# Consumer loop
@@ -388,9 +396,12 @@ async def stream_generate_chat_completion(
is_reasoning_chunk = False
# And skip this token
continue
-
+
response = _create_stream_chunk(
- request.state.id, generation, model_path.name, is_reasoning_chunk=is_reasoning_chunk
+ request.state.id,
+ generation,
+ model_path.name,
+ is_reasoning_chunk=is_reasoning_chunk,
)
yield response.model_dump_json()
From 1bd3968e25d676fe93be218968ebfda3b695280c Mon Sep 17 00:00:00 2001
From: Orion <128988082+Orion-zhen@users.noreply.github.com>
Date: Sat, 1 Mar 2025 15:36:28 +0800
Subject: [PATCH 3/8] :art: shorten lines
---
common/config_models.py | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/common/config_models.py b/common/config_models.py
index 8d264041..46f784af 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -103,21 +103,19 @@ class NetworkConfig(BaseConfigModel):
False,
description=(
"Enable the reasoning parser (default: False).\n"
-            "This will split the response message into reasoning_content and content fields."
+            "Split the response message into reasoning_content and content fields."
),
)
reasoning_start_token: str = Field(
"",
description=(
- "Start token for the reasoning parser (default: ).\n"
- "This token is used to split the response message into reasoning_content and content fields."
+ "Start token for the reasoning parser (default: )."
),
)
reasoning_end_token: str = Field(
"",
description=(
- "End token for the reasoning parser (default: ).\n"
- "This token is used to split the response message into reasoning_content and content fields."
+ "End token for the reasoning parser (default: )."
),
)
From 41a8aa639f41ae2b8dfb8af0958095f3940a03c2 Mon Sep 17 00:00:00 2001
From: Orion <128988082+Orion-zhen@users.noreply.github.com>
Date: Sat, 1 Mar 2025 15:38:06 +0800
Subject: [PATCH 4/8] :art: god, please let me pass format check
---
common/config_models.py | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/common/config_models.py b/common/config_models.py
index 46f784af..bbcfdda5 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -108,15 +108,11 @@ class NetworkConfig(BaseConfigModel):
)
reasoning_start_token: str = Field(
"",
- description=(
- "Start token for the reasoning parser (default: )."
- ),
+ description=("Start token for the reasoning parser (default: )."),
)
reasoning_end_token: str = Field(
"",
- description=(
- "End token for the reasoning parser (default: )."
- ),
+ description=("End token for the reasoning parser (default: )."),
)
# Converts all strings in the api_servers list to lowercase
From f0888437b1fb14fb061dc71ddcc231cf79eda39b Mon Sep 17 00:00:00 2001
From: Orion <128988082+Orion-zhen@users.noreply.github.com>
Date: Sat, 1 Mar 2025 15:42:12 +0800
Subject: [PATCH 5/8] :art: TAT
---
endpoints/OAI/utils/chat_completion.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index 3b0d6959..c4b51050 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -35,7 +35,8 @@
def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]:
-    """Extract content between the reasoning tokens and the remaining content. Only available in non-streaming mode."""
+    """Extract content between the reasoning tokens and the remaining content.
+    Only available in non-streaming mode."""
if (
config.network.reasoning_start_token not in text
and config.network.reasoning_end_token not in text
From 9efb7aab391fba0a62d32d20439bda113e56256a Mon Sep 17 00:00:00 2001
From: Orion-zhen <128988082+Orion-zhen@users.noreply.github.com>
Date: Thu, 6 Mar 2025 11:42:56 +0800
Subject: [PATCH 6/8] :sparkles: handle reasoning start token
---
endpoints/OAI/utils/chat_completion.py | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index c4b51050..ffbd751a 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -392,7 +392,19 @@ async def stream_generate_chat_completion(
if isinstance(generation, Exception):
raise generation
- if unwrap(generation.get("text"), "") == config.network.reasoning_end_token:
+ if (
+ unwrap(generation.get("text"), "")
+ == config.network.reasoning_start_token
+ and config.network.reasoning_parser
+ ):
+ # Update reasoning chunk flag
+ is_reasoning_chunk = True
+ # And skip this token
+ continue
+ if (
+ unwrap(generation.get("text"), "") == config.network.reasoning_end_token
+ and config.network.reasoning_parser
+ ):
# Update reasoning chunk flag
is_reasoning_chunk = False
# And skip this token
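To make the streaming path easier to follow, here is a rough standalone sketch of the flag handling that patch 1 introduced and this patch extends. It is an approximation of the consumer loop's behavior when the parser is enabled, not the loop itself, and the marker strings are assumed example values.

REASONING_START = "<think>"   # assumed example; configured via reasoning_start_token
REASONING_END = "</think>"    # assumed example; configured via reasoning_end_token

def route_stream(tokens):
    """Yield (field, text) pairs, mimicking how each chunk picks a message field."""
    is_reasoning = True  # with the parser enabled, every stream starts in reasoning mode
    for tok in tokens:
        if tok == REASONING_START:
            is_reasoning = True    # an explicit start marker (re)enters reasoning mode
            continue               # marker tokens themselves are never emitted
        if tok == REASONING_END:
            is_reasoning = False   # leave reasoning mode
            continue
        yield ("reasoning_content" if is_reasoning else "content", tok)

print(list(route_stream(["<think>", "Hmm", "...", "</think>", "4", "."])))
# [('reasoning_content', 'Hmm'), ('reasoning_content', '...'),
#  ('content', '4'), ('content', '.')]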
From 45190004cfd723b045a7853cd4ad466d006fd6d7 Mon Sep 17 00:00:00 2001
From: Orion-zhen <128988082+Orion-zhen@users.noreply.github.com>
Date: Tue, 18 Mar 2025 23:15:38 +0800
Subject: [PATCH 7/8] :wrench: move reasoning config to model section
---
common/config_models.py | 30 +++++++++++++-------------
config_sample.yml | 20 ++++++++---------
endpoints/OAI/utils/chat_completion.py | 29 ++++++++++++-------------
3 files changed, 39 insertions(+), 40 deletions(-)
diff --git a/common/config_models.py b/common/config_models.py
index bbcfdda5..d32f7e95 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -99,21 +99,6 @@ class NetworkConfig(BaseConfigModel):
"Possible values: OAI, Kobold."
),
)
- reasoning_parser: bool = Field(
- False,
- description=(
- "Enable the reasoning parser (default: False).\n"
-            "Split the response message into reasoning_content and content fields."
- ),
- )
- reasoning_start_token: str = Field(
- "",
- description=("Start token for the reasoning parser (default: )."),
- )
- reasoning_end_token: str = Field(
- "",
- description=("End token for the reasoning parser (default: )."),
- )
# Converts all strings in the api_servers list to lowercase
# NOTE: Expand if more models need this validator
@@ -309,6 +294,21 @@ class ModelConfig(BaseConfigModel):
"Enables vision support if the model supports it. (default: False)"
),
)
+ reasoning: bool = Field(
+ False,
+ description=(
+ "Enable the reasoning parser (default: False).\n"
+            "Split the response message into reasoning_content and content fields."
+ ),
+ )
+ reasoning_start_token: str = Field(
+ "",
+ description=("Start token for the reasoning parser (default: )."),
+ )
+ reasoning_end_token: str = Field(
+ "",
+ description=("End token for the reasoning parser (default: )."),
+ )
num_experts_per_token: Optional[int] = Field(
None,
description=(
diff --git a/config_sample.yml b/config_sample.yml
index fcf23683..0d16a4ab 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -31,16 +31,6 @@ network:
# Possible values: OAI, Kobold.
api_servers: ["OAI"]
- # Enable reasoning parser (default: False).
- # Do NOT enable this unless the model is a reasoning model (e.g. the deepseek-r1 series)
- reasoning_parser: false
-
- # The start token for reasoning content (default: "")
- reasoning_start_token: ""
-
- # The end token for reasoning content (default: "")
- reasoning_end_token: ""
-
# Options for logging
logging:
# Enable prompt logging (default: False).
@@ -146,6 +136,16 @@ model:
# Enables vision support if the model supports it. (default: False)
vision: false
+ # Enable reasoning parser (default: False).
+ # Do NOT enable this unless the model is a reasoning model (e.g. the deepseek-r1 series)
+ reasoning: false
+
+ # The start token for reasoning content (default: "")
+ reasoning_start_token: ""
+
+ # The end token for reasoning content (default: "")
+ reasoning_end_token: ""
+
# Number of experts to use per token.
# Fetched from the model's config.json if empty.
# NOTE: For MoE models only.
diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index ffbd751a..3c639cdc 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -38,18 +38,18 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]:
"""Extract content between tags and the remaining content.
Only available in none-streaming mode."""
if (
- config.network.reasoning_start_token not in text
- and config.network.reasoning_end_token not in text
+ config.model.reasoning_start_token not in text
+ and config.model.reasoning_end_token not in text
):
return None, text
- elif config.network.reasoning_start_token in text:
- start_reasoning = text.split(config.network.reasoning_start_token)[1]
- reasoning_content = start_reasoning.split(config.network.reasoning_end_token)[0]
- content = start_reasoning.split(config.network.reasoning_end_token)[1]
+ elif config.model.reasoning_start_token in text:
+ start_reasoning = text.split(config.model.reasoning_start_token)[1]
+ reasoning_content = start_reasoning.split(config.model.reasoning_end_token)[0]
+ content = start_reasoning.split(config.model.reasoning_end_token)[1]
return reasoning_content, content
else:
- reasoning_content = text.split(config.network.reasoning_end_token)[0]
- content = text.split(config.network.reasoning_end_token)[1]
+ reasoning_content = text.split(config.model.reasoning_end_token)[0]
+ content = text.split(config.model.reasoning_end_token)[1]
return reasoning_content, content
@@ -63,7 +63,7 @@ def _create_response(
choices = []
for index, generation in enumerate(generations):
- if config.network.reasoning_parser:
+ if config.model.reasoning:
raw_content = unwrap(generation.get("text"), "")
reasoning_content, content = _extract_think_content(raw_content)
message = ChatCompletionMessage(
@@ -363,7 +363,7 @@ async def stream_generate_chat_completion(
# We need to keep track of the text generated so we can resume the tool calls
current_generation_text = ""
- is_reasoning_chunk = config.network.reasoning_parser
+ is_reasoning_chunk = config.model.reasoning
# Consumer loop
while True:
@@ -393,17 +393,16 @@ async def stream_generate_chat_completion(
raise generation
if (
- unwrap(generation.get("text"), "")
- == config.network.reasoning_start_token
- and config.network.reasoning_parser
+ unwrap(generation.get("text"), "") == config.model.reasoning_start_token
+ and config.model.reasoning
):
# Update reasoning chunk flag
is_reasoning_chunk = True
# And skip this token
continue
if (
- unwrap(generation.get("text"), "") == config.network.reasoning_end_token
- and config.network.reasoning_parser
+ unwrap(generation.get("text"), "") == config.model.reasoning_end_token
+ and config.model.reasoning
):
# Update reasoning chunk flag
is_reasoning_chunk = False
From beef2d081f63edab0f5f72c83ed3940c99ba63bb Mon Sep 17 00:00:00 2001
From: Orion-zhen <128988082+Orion-zhen@users.noreply.github.com>
Date: Fri, 21 Mar 2025 09:31:32 +0800
Subject: [PATCH 8/8] :sparkles: strip contents
---
endpoints/OAI/utils/chat_completion.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index a728f29b..56c1d3b6 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -46,11 +46,11 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]:
start_reasoning = text.split(config.model.reasoning_start_token)[1]
reasoning_content = start_reasoning.split(config.model.reasoning_end_token)[0]
content = start_reasoning.split(config.model.reasoning_end_token)[1]
- return reasoning_content, content
+ return reasoning_content.strip(), content.strip()
else:
reasoning_content = text.split(config.model.reasoning_end_token)[0]
content = text.split(config.model.reasoning_end_token)[1]
- return reasoning_content, content
+ return reasoning_content.strip(), content.strip()
def _create_response(
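With the full series applied and the model-section options enabled, a non-streaming client simply reads the new field off the returned message. Below is a hypothetical request against a local instance; the URL, port, and model name are assumptions for illustration, not part of the patches.

import requests

resp = requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",  # assumed local default; adjust to your instance
    # add the instance's API key header here if authentication is enabled
    json={
        "model": "my-reasoning-model",            # hypothetical model name
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "stream": False,
    },
)
message = resp.json()["choices"][0]["message"]
print(message.get("reasoning_content"))  # chain-of-thought text, without the marker tokens
print(message["content"])                # just the final answer

In streaming mode the reasoning text arrives through the same field on each chunk's message instead, with the marker tokens themselves skipped.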