From 57f0e5f1bcd4e9f00e37f8803e59396cf53b1d57 Mon Sep 17 00:00:00 2001 From: Orion-zhen <128988082+Orion-zhen@users.noreply.github.com> Date: Sat, 1 Mar 2025 13:02:29 +0800 Subject: [PATCH 1/8] :sparkles: reasoning parser support --- common/config_models.py | 21 +++++++++++++ config_sample.yml | 10 +++++++ endpoints/OAI/types/chat_completion.py | 1 + endpoints/OAI/utils/chat_completion.py | 41 +++++++++++++++++++++++--- 4 files changed, 69 insertions(+), 4 deletions(-) diff --git a/common/config_models.py b/common/config_models.py index f78408f4..5b91a476 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -99,6 +99,27 @@ class NetworkConfig(BaseConfigModel): "Possible values: OAI, Kobold." ), ) + reasoning_parser: bool = Field( + False, + description=( + "Enable the reasoning parser (default: False).\n" + "This will split response message into reasoning_content and content fields." + ) + ) + reasoning_start_token: str = Field( + "", + description=( + "Start token for the reasoning parser (default: ).\n" + "This token is used to split the response message into reasoning_content and content fields." + ) + ) + reasoning_end_token: str = Field( + "", + description=( + "End token for the reasoning parser (default: ).\n" + "This token is used to split the response message into reasoning_content and content fields." + ) + ) # Converts all strings in the api_servers list to lowercase # NOTE: Expand if more models need this validator diff --git a/config_sample.yml b/config_sample.yml index 745433bd..fcf23683 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -31,6 +31,16 @@ network: # Possible values: OAI, Kobold. api_servers: ["OAI"] + # Enable reasoning parser (default: False). + # Do NOT enable this if the model is not a reasoning model (e.g. deepseek-r1 series) + reasoning_parser: false + + # The start token for reasoning conetnt (default: "") + reasoning_start_token: "" + + # The end token for reasoning conetnt (default: "") + reasoning_end_token: "" + # Options for logging logging: # Enable prompt logging (default: False). diff --git a/endpoints/OAI/types/chat_completion.py b/endpoints/OAI/types/chat_completion.py index 86a22477..f6d74d80 100644 --- a/endpoints/OAI/types/chat_completion.py +++ b/endpoints/OAI/types/chat_completion.py @@ -31,6 +31,7 @@ class ChatCompletionMessagePart(BaseModel): class ChatCompletionMessage(BaseModel): role: str = "user" content: Optional[Union[str, List[ChatCompletionMessagePart]]] = None + reasoning_content: Optional[str] = None tool_calls: Optional[List[ToolCall]] = None tool_calls_json: SkipJsonSchema[Optional[str]] = None diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index a646924c..93da1a80 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -17,6 +17,7 @@ handle_request_error, request_disconnect_loop, ) +from common.tabby_config import config from common.utils import unwrap from endpoints.OAI.types.chat_completion import ( ChatCompletionLogprobs, @@ -33,6 +34,20 @@ from endpoints.OAI.types.tools import ToolCall +def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: + """Extract content between tags and the remaining content. 
Only available in none-streaming mode.""" + if config.network.reasoning_start_token not in text and config.network.reasoning_end_token not in text: + return None, text + elif config.network.reasoning_start_token in text: + start_reasoning = text.split(config.network.reasoning_start_token)[1] + reasoning_content = start_reasoning.split(config.network.reasoning_end_token)[0] + content = start_reasoning.split(config.network.reasoning_end_token)[1] + return reasoning_content, content + else: + reasoning_content = text.split(config.network.reasoning_end_token)[0] + content = text.split(config.network.reasoning_end_token)[1] + return reasoning_content, content + def _create_response( request_id: str, generations: List[dict], model_name: Optional[str] ): @@ -43,9 +58,16 @@ def _create_response( choices = [] for index, generation in enumerate(generations): - message = ChatCompletionMessage( - role="assistant", content=unwrap(generation.get("text"), "") - ) + if config.network.reasoning_parser: + raw_content = unwrap(generation.get("text"), "") + reasoning_content, content = _extract_think_content(raw_content) + message = ChatCompletionMessage( + role="assistant", reasoning_content=reasoning_content, content=content + ) + else: + message = ChatCompletionMessage( + role="assistant", content=unwrap(generation.get("text"), "") + ) tool_calls = generation["tool_calls"] if tool_calls: @@ -103,6 +125,7 @@ def _create_stream_chunk( generation: Optional[dict] = None, model_name: Optional[str] = None, is_usage_chunk: bool = False, + is_reasoning_chunk: bool = False, ): """Create a chat completion stream chunk from the provided text.""" @@ -137,6 +160,8 @@ def _create_stream_chunk( else: message = ChatCompletionMessage( + role="assistant", reasoning_content=unwrap(generation.get("text"), "") + ) if is_reasoning_chunk else ChatCompletionMessage( role="assistant", content=unwrap(generation.get("text"), "") ) @@ -328,6 +353,8 @@ async def stream_generate_chat_completion( # We need to keep track of the text generated so we can resume the tool calls current_generation_text = "" + + is_reasoning_chunk = config.network.reasoning_parser # Consumer loop while True: @@ -356,8 +383,14 @@ async def stream_generate_chat_completion( if isinstance(generation, Exception): raise generation + if unwrap(generation.get("text"), "") == config.network.reasoning_end_token: + # Update reasoning chunk flag + is_reasoning_chunk = False + # And skip this token + continue + response = _create_stream_chunk( - request.state.id, generation, model_path.name + request.state.id, generation, model_path.name, is_reasoning_chunk=is_reasoning_chunk ) yield response.model_dump_json() From dfded6525403b629c3a41d28236f8b855e82326e Mon Sep 17 00:00:00 2001 From: Orion <128988082+Orion-zhen@users.noreply.github.com> Date: Sat, 1 Mar 2025 15:31:23 +0800 Subject: [PATCH 2/8] :art: format code according to ruff --- common/config_models.py | 6 +++--- endpoints/OAI/utils/chat_completion.py | 27 ++++++++++++++++++-------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/common/config_models.py b/common/config_models.py index 5b91a476..8d264041 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -104,21 +104,21 @@ class NetworkConfig(BaseConfigModel): description=( "Enable the reasoning parser (default: False).\n" "This will split response message into reasoning_content and content fields." 
- ) + ), ) reasoning_start_token: str = Field( "", description=( "Start token for the reasoning parser (default: ).\n" "This token is used to split the response message into reasoning_content and content fields." - ) + ), ) reasoning_end_token: str = Field( "", description=( "End token for the reasoning parser (default: ).\n" "This token is used to split the response message into reasoning_content and content fields." - ) + ), ) # Converts all strings in the api_servers list to lowercase diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index 93da1a80..3b0d6959 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -36,7 +36,10 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: """Extract content between tags and the remaining content. Only available in none-streaming mode.""" - if config.network.reasoning_start_token not in text and config.network.reasoning_end_token not in text: + if ( + config.network.reasoning_start_token not in text + and config.network.reasoning_end_token not in text + ): return None, text elif config.network.reasoning_start_token in text: start_reasoning = text.split(config.network.reasoning_start_token)[1] @@ -48,6 +51,7 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: content = text.split(config.network.reasoning_end_token)[1] return reasoning_content, content + def _create_response( request_id: str, generations: List[dict], model_name: Optional[str] ): @@ -159,10 +163,14 @@ def _create_stream_chunk( choices.append(choice) else: - message = ChatCompletionMessage( - role="assistant", reasoning_content=unwrap(generation.get("text"), "") - ) if is_reasoning_chunk else ChatCompletionMessage( - role="assistant", content=unwrap(generation.get("text"), "") + message = ( + ChatCompletionMessage( + role="assistant", reasoning_content=unwrap(generation.get("text"), "") + ) + if is_reasoning_chunk + else ChatCompletionMessage( + role="assistant", content=unwrap(generation.get("text"), "") + ) ) logprob_response = None @@ -353,7 +361,7 @@ async def stream_generate_chat_completion( # We need to keep track of the text generated so we can resume the tool calls current_generation_text = "" - + is_reasoning_chunk = config.network.reasoning_parser # Consumer loop @@ -388,9 +396,12 @@ async def stream_generate_chat_completion( is_reasoning_chunk = False # And skip this token continue - + response = _create_stream_chunk( - request.state.id, generation, model_path.name, is_reasoning_chunk=is_reasoning_chunk + request.state.id, + generation, + model_path.name, + is_reasoning_chunk=is_reasoning_chunk, ) yield response.model_dump_json() From 1bd3968e25d676fe93be218968ebfda3b695280c Mon Sep 17 00:00:00 2001 From: Orion <128988082+Orion-zhen@users.noreply.github.com> Date: Sat, 1 Mar 2025 15:36:28 +0800 Subject: [PATCH 3/8] :art: shorten lines --- common/config_models.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/common/config_models.py b/common/config_models.py index 8d264041..46f784af 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -103,21 +103,19 @@ class NetworkConfig(BaseConfigModel): False, description=( "Enable the reasoning parser (default: False).\n" - "This will split response message into reasoning_content and content fields." + "Split response message into reasoning_content and content fields." 
), ) reasoning_start_token: str = Field( "", description=( - "Start token for the reasoning parser (default: ).\n" - "This token is used to split the response message into reasoning_content and content fields." + "Start token for the reasoning parser (default: )." ), ) reasoning_end_token: str = Field( "", description=( - "End token for the reasoning parser (default: ).\n" - "This token is used to split the response message into reasoning_content and content fields." + "End token for the reasoning parser (default: )." ), ) From 41a8aa639f41ae2b8dfb8af0958095f3940a03c2 Mon Sep 17 00:00:00 2001 From: Orion <128988082+Orion-zhen@users.noreply.github.com> Date: Sat, 1 Mar 2025 15:38:06 +0800 Subject: [PATCH 4/8] :art: god, please let me pass format check --- common/config_models.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/common/config_models.py b/common/config_models.py index 46f784af..bbcfdda5 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -108,15 +108,11 @@ class NetworkConfig(BaseConfigModel): ) reasoning_start_token: str = Field( "", - description=( - "Start token for the reasoning parser (default: )." - ), + description=("Start token for the reasoning parser (default: )."), ) reasoning_end_token: str = Field( "", - description=( - "End token for the reasoning parser (default: )." - ), + description=("End token for the reasoning parser (default: )."), ) # Converts all strings in the api_servers list to lowercase From f0888437b1fb14fb061dc71ddcc231cf79eda39b Mon Sep 17 00:00:00 2001 From: Orion <128988082+Orion-zhen@users.noreply.github.com> Date: Sat, 1 Mar 2025 15:42:12 +0800 Subject: [PATCH 5/8] :art: TAT --- endpoints/OAI/utils/chat_completion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index 3b0d6959..c4b51050 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -35,7 +35,8 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: - """Extract content between tags and the remaining content. Only available in none-streaming mode.""" + """Extract content between tags and the remaining content. 
+ Only available in none-streaming mode.""" if ( config.network.reasoning_start_token not in text and config.network.reasoning_end_token not in text From 9efb7aab391fba0a62d32d20439bda113e56256a Mon Sep 17 00:00:00 2001 From: Orion-zhen <128988082+Orion-zhen@users.noreply.github.com> Date: Thu, 6 Mar 2025 11:42:56 +0800 Subject: [PATCH 6/8] :sparkles: handle reasoning start token --- endpoints/OAI/utils/chat_completion.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index c4b51050..ffbd751a 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -392,7 +392,19 @@ async def stream_generate_chat_completion( if isinstance(generation, Exception): raise generation - if unwrap(generation.get("text"), "") == config.network.reasoning_end_token: + if ( + unwrap(generation.get("text"), "") + == config.network.reasoning_start_token + and config.network.reasoning_parser + ): + # Update reasoning chunk flag + is_reasoning_chunk = True + # And skip this token + continue + if ( + unwrap(generation.get("text"), "") == config.network.reasoning_end_token + and config.network.reasoning_parser + ): # Update reasoning chunk flag is_reasoning_chunk = False # And skip this token From 45190004cfd723b045a7853cd4ad466d006fd6d7 Mon Sep 17 00:00:00 2001 From: Orion-zhen <128988082+Orion-zhen@users.noreply.github.com> Date: Tue, 18 Mar 2025 23:15:38 +0800 Subject: [PATCH 7/8] :wrench: move reasoning config to model section --- common/config_models.py | 30 +++++++++++++------------- config_sample.yml | 20 ++++++++--------- endpoints/OAI/utils/chat_completion.py | 29 ++++++++++++------------- 3 files changed, 39 insertions(+), 40 deletions(-) diff --git a/common/config_models.py b/common/config_models.py index bbcfdda5..d32f7e95 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -99,21 +99,6 @@ class NetworkConfig(BaseConfigModel): "Possible values: OAI, Kobold." ), ) - reasoning_parser: bool = Field( - False, - description=( - "Enable the reasoning parser (default: False).\n" - "Split response message into reasoning_content and content fields." - ), - ) - reasoning_start_token: str = Field( - "", - description=("Start token for the reasoning parser (default: )."), - ) - reasoning_end_token: str = Field( - "", - description=("End token for the reasoning parser (default: )."), - ) # Converts all strings in the api_servers list to lowercase # NOTE: Expand if more models need this validator @@ -309,6 +294,21 @@ class ModelConfig(BaseConfigModel): "Enables vision support if the model supports it. (default: False)" ), ) + reasoning: bool = Field( + False, + description=( + "Enable the reasoning parser (default: False).\n" + "Split response message into reasoning_content and content fields." + ), + ) + reasoning_start_token: str = Field( + "", + description=("Start token for the reasoning parser (default: )."), + ) + reasoning_end_token: str = Field( + "", + description=("End token for the reasoning parser (default: )."), + ) num_experts_per_token: Optional[int] = Field( None, description=( diff --git a/config_sample.yml b/config_sample.yml index fcf23683..0d16a4ab 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -31,16 +31,6 @@ network: # Possible values: OAI, Kobold. api_servers: ["OAI"] - # Enable reasoning parser (default: False). - # Do NOT enable this if the model is not a reasoning model (e.g. 
deepseek-r1 series) - reasoning_parser: false - - # The start token for reasoning conetnt (default: "") - reasoning_start_token: "" - - # The end token for reasoning conetnt (default: "") - reasoning_end_token: "" - # Options for logging logging: # Enable prompt logging (default: False). @@ -146,6 +136,16 @@ model: # Enables vision support if the model supports it. (default: False) vision: false + # Enable reasoning parser (default: False). + # Do NOT enable this if the model is not a reasoning model (e.g. deepseek-r1 series) + reasoning: false + + # The start token for reasoning conetnt (default: "") + reasoning_start_token: "" + + # The end token for reasoning conetnt (default: "") + reasoning_end_token: "" + # Number of experts to use per token. # Fetched from the model's config.json if empty. # NOTE: For MoE models only. diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index ffbd751a..3c639cdc 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -38,18 +38,18 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: """Extract content between tags and the remaining content. Only available in none-streaming mode.""" if ( - config.network.reasoning_start_token not in text - and config.network.reasoning_end_token not in text + config.model.reasoning_start_token not in text + and config.model.reasoning_end_token not in text ): return None, text - elif config.network.reasoning_start_token in text: - start_reasoning = text.split(config.network.reasoning_start_token)[1] - reasoning_content = start_reasoning.split(config.network.reasoning_end_token)[0] - content = start_reasoning.split(config.network.reasoning_end_token)[1] + elif config.model.reasoning_start_token in text: + start_reasoning = text.split(config.model.reasoning_start_token)[1] + reasoning_content = start_reasoning.split(config.model.reasoning_end_token)[0] + content = start_reasoning.split(config.model.reasoning_end_token)[1] return reasoning_content, content else: - reasoning_content = text.split(config.network.reasoning_end_token)[0] - content = text.split(config.network.reasoning_end_token)[1] + reasoning_content = text.split(config.model.reasoning_end_token)[0] + content = text.split(config.model.reasoning_end_token)[1] return reasoning_content, content @@ -63,7 +63,7 @@ def _create_response( choices = [] for index, generation in enumerate(generations): - if config.network.reasoning_parser: + if config.model.reasoning: raw_content = unwrap(generation.get("text"), "") reasoning_content, content = _extract_think_content(raw_content) message = ChatCompletionMessage( @@ -363,7 +363,7 @@ async def stream_generate_chat_completion( # We need to keep track of the text generated so we can resume the tool calls current_generation_text = "" - is_reasoning_chunk = config.network.reasoning_parser + is_reasoning_chunk = config.model.reasoning # Consumer loop while True: @@ -393,17 +393,16 @@ async def stream_generate_chat_completion( raise generation if ( - unwrap(generation.get("text"), "") - == config.network.reasoning_start_token - and config.network.reasoning_parser + unwrap(generation.get("text"), "") == config.model.reasoning_start_token + and config.model.reasoning ): # Update reasoning chunk flag is_reasoning_chunk = True # And skip this token continue if ( - unwrap(generation.get("text"), "") == config.network.reasoning_end_token - and config.network.reasoning_parser + unwrap(generation.get("text"), "") == 
config.model.reasoning_end_token + and config.model.reasoning ): # Update reasoning chunk flag is_reasoning_chunk = False From beef2d081f63edab0f5f72c83ed3940c99ba63bb Mon Sep 17 00:00:00 2001 From: Orion-zhen <128988082+Orion-zhen@users.noreply.github.com> Date: Fri, 21 Mar 2025 09:31:32 +0800 Subject: [PATCH 8/8] :sparkles: strip contents --- endpoints/OAI/utils/chat_completion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index a728f29b..56c1d3b6 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -46,11 +46,11 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: start_reasoning = text.split(config.model.reasoning_start_token)[1] reasoning_content = start_reasoning.split(config.model.reasoning_end_token)[0] content = start_reasoning.split(config.model.reasoning_end_token)[1] - return reasoning_content, content + return reasoning_content.strip(), content.strip() else: reasoning_content = text.split(config.model.reasoning_end_token)[0] content = text.split(config.model.reasoning_end_token)[1] - return reasoning_content, content + return reasoning_content.strip(), content.strip() def _create_response(
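
Usage notes (illustrative sketches, with assumptions noted inline; none of the values below are defined by the diffs above).

After the last patch, _extract_think_content splits a finished completion on the configured reasoning tokens and strips both halves. The following is a minimal standalone sketch of that split. It hard-codes "<think>"/"</think>" as assumed stand-ins for config.model.reasoning_start_token and config.model.reasoning_end_token (the sample config leaves those values to the user), and it uses str.partition, which matches the patch's split() behaviour when each token appears exactly once.

# Standalone sketch of the start/end-token split introduced by this series.
# "<think>"/"</think>" are assumed stand-ins for the configurable
# model.reasoning_start_token / model.reasoning_end_token values.
from typing import Optional, Tuple

REASONING_START = "<think>"
REASONING_END = "</think>"


def extract_think_content(text: str) -> Tuple[Optional[str], Optional[str]]:
    """Split raw completion text into (reasoning_content, content)."""
    if REASONING_START not in text and REASONING_END not in text:
        # No reasoning markers: the whole message is ordinary content.
        return None, text
    if REASONING_START in text:
        after_start = text.split(REASONING_START, 1)[1]
        reasoning, _, content = after_start.partition(REASONING_END)
    else:
        # Opening tag missing (some runtimes drop it): the prefix is reasoning.
        reasoning, _, content = text.partition(REASONING_END)
    return reasoning.strip(), content.strip()


raw = "<think>2 + 2 is basic arithmetic; the sum is 4.</think>The answer is 4."
print(extract_think_content(raw))
# -> ('2 + 2 is basic arithmetic; the sum is 4.', 'The answer is 4.')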
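
For a quick end-to-end check of the non-streaming path, a request like the one below can be sent to a local TabbyAPI instance with a reasoning model loaded and model.reasoning enabled. The host, port, API key, and prompt are placeholders for illustration, not values defined by this series.

# Placeholder request against a local OpenAI-compatible /v1/chat/completions
# endpoint; host, port, and key are assumptions, not part of the patches.
import requests

resp = requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    headers={"Authorization": "Bearer sk-placeholder"},
    json={
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "max_tokens": 256,
    },
    timeout=120,
)
message = resp.json()["choices"][0]["message"]
print("reasoning_content:", message.get("reasoning_content"))
print("content:", message.get("content"))

With model.reasoning left at its default of false, reasoning_content simply comes back null and the full text stays in content, matching the unchanged code path.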
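
In the streaming path the server skips the start and end token chunks themselves and flags everything in between as reasoning, so deltas carry reasoning_content first and plain content afterwards. A consumer sketch follows; the endpoint, key, "data: " SSE framing, [DONE] sentinel, and delta field name are all assumed to follow the usual OpenAI-compatible conventions rather than being spelled out in these patches.

# Placeholder streaming consumer; connection details and SSE framing are
# assumptions for illustration.
import json

import requests

with requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    headers={"Authorization": "Bearer sk-placeholder"},
    json={
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "stream": True,
    },
    stream=True,
    timeout=120,
) as resp:
    for line in resp.iter_lines():
        if not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):].strip()
        if payload == b"[DONE]":
            break
        delta = json.loads(payload)["choices"][0]["delta"]
        if delta.get("reasoning_content"):
            print("[think]", delta["reasoning_content"], end="", flush=True)
        elif delta.get("content"):
            print(delta["content"], end="", flush=True)
print()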