diff --git a/prepare_llm_models.sh b/prepare_llm_models.sh
index 43a9cc1df2..3ab5dbc562 100755
--- a/prepare_llm_models.sh
+++ b/prepare_llm_models.sh
@@ -34,6 +34,7 @@ HERMES3_MODEL="NousResearch/Hermes-3-Llama-3.1-8B"
 PHI4_MODEL="microsoft/Phi-4-mini-instruct"
 MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
 GPT_OSS="openai/gpt-oss-20b"
+DEVSTRAL_MODEL="unsloth/Devstral-Small-2507"
 
 if [ "$(python3 -c 'import sys; print(sys.version_info[1])')" -le "8" ]; then echo "Prepare models with python > 3.8."; exit 1 ; fi
@@ -173,3 +174,14 @@ if [ ! -f "$1/$GPT_OSS/$TOKENIZER_FILE" ]; then
     echo "[ERROR] Models file $1/$GPT_OSS/$TOKENIZER_FILE does not exist."
     exit 1
 fi
+
+if [ -f "$1/$DEVSTRAL_MODEL/$TOKENIZER_FILE" ]; then
+    echo "Models file $1/$DEVSTRAL_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
+else
+    mkdir -p $1/$DEVSTRAL_MODEL
+    convert_tokenizer $DEVSTRAL_MODEL --with_detokenizer -o $1/$DEVSTRAL_MODEL
+fi
+if [ ! -f "$1/$DEVSTRAL_MODEL/$TOKENIZER_FILE" ]; then
+    echo "[ERROR] Models file $1/$DEVSTRAL_MODEL/$TOKENIZER_FILE does not exist."
+    exit 1
+fi
diff --git a/src/llm/BUILD b/src/llm/BUILD
index bfe45b3036..ae37d936ca 100644
--- a/src/llm/BUILD
+++ b/src/llm/BUILD
@@ -137,6 +137,7 @@ ovms_cc_library(  # TODO split further so we don't have to recompile everything w
         "io_processing/hermes3/tool_parser.hpp",
         "io_processing/llama3/tool_parser.hpp",
         "io_processing/phi4/tool_parser.hpp",
+        "io_processing/devstral/tool_parser.hpp",
         "io_processing/mistral/tool_parser.hpp",
         "io_processing/qwen3/reasoning_parser.hpp",
         "io_processing/gptoss/reasoning_parser.hpp",
@@ -148,6 +149,7 @@ ovms_cc_library(  # TODO split further so we don't have to recompile everything w
         "io_processing/hermes3/tool_parser.cpp",
         "io_processing/llama3/tool_parser.cpp",
         "io_processing/phi4/tool_parser.cpp",
+        "io_processing/devstral/tool_parser.cpp",
         "io_processing/mistral/tool_parser.cpp",
         "io_processing/qwen3/reasoning_parser.cpp",
         "io_processing/gptoss/reasoning_parser.cpp",
@@ -176,11 +178,13 @@ ovms_cc_library(
         "io_processing/phi4/generation_config_builder.hpp",
         "io_processing/llama3/generation_config_builder.hpp",
         "io_processing/hermes3/generation_config_builder.hpp",
+        "io_processing/devstral/generation_config_builder.hpp",
         "io_processing/generation_config_builder.hpp"],
     srcs = ["io_processing/base_generation_config_builder.cpp",
         "io_processing/phi4/generation_config_builder.cpp",
         "io_processing/llama3/generation_config_builder.cpp",
-        "io_processing/hermes3/generation_config_builder.cpp"],
+        "io_processing/hermes3/generation_config_builder.cpp",
+        "io_processing/devstral/generation_config_builder.cpp"],
     deps = [
         ":openai_request",
         "//src:libovmslogging",
diff --git a/src/llm/io_processing/devstral/generation_config_builder.cpp b/src/llm/io_processing/devstral/generation_config_builder.cpp
new file mode 100644
index 0000000000..5b097c5aa4
--- /dev/null
+++ b/src/llm/io_processing/devstral/generation_config_builder.cpp
@@ -0,0 +1,57 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "generation_config_builder.hpp"
+
+namespace ovms {
+
+void DevstralGenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatCompletionsRequest& request) {
+    // Call the base class method to fill in common configuration
+    BaseGenerationConfigBuilder::parseConfigFromRequest(request);
+
+    // For now the only model-specific part is related to tools, so if there are no tools provided in the request
+    // we can exit early
+    if (request.toolNameSchemaMap.empty()) {
+        return;
+    }
+
+    if (enableToolGuidedGeneration || request.toolChoice == "required") {
+        // Set tool guided generation config specific to the Devstral model
+        auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
+        triggeredTags->triggers.push_back("[TOOL_CALLS]");
+
+        for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) {
+            const auto& toolSchema = toolSchemaWrapper.stringRepr;
+            ov::genai::StructuredOutputConfig::Tag tagItem;
+            tagItem.begin = "[TOOL_CALLS]" + toolName + "[ARGS]";
+            // tagItem.end = "</s>";
+            tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(toolSchema);
+            triggeredTags->tags.push_back(tagItem);
+        }
+        if (request.toolChoice == "required") {
+            triggeredTags->at_least_one = true;
+        }
+        ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
+        setStructuralTagsConfig(structuralTag);
+    }
+}
+
+}  // namespace ovms
diff --git a/src/llm/io_processing/devstral/generation_config_builder.hpp b/src/llm/io_processing/devstral/generation_config_builder.hpp
new file mode 100644
index 0000000000..ec69a054fe
--- /dev/null
+++ b/src/llm/io_processing/devstral/generation_config_builder.hpp
@@ -0,0 +1,33 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#pragma once
+#include "../base_generation_config_builder.hpp"
+
+namespace ovms {
+
+/*
+ * DevstralGenerationConfigBuilder extends BaseGenerationConfigBuilder to provide configuration specific to the Devstral model.
+ * It overrides the parseConfigFromRequest method to set the tool guided generation config.
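+ * With tool guided generation enabled, decoding is constrained so that a tool call is emitted as
+ * [TOOL_CALLS]tool_name[ARGS]{...}, with the arguments forced to match the JSON schema registered for tool_name.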
+ */
+class DevstralGenerationConfigBuilder : public BaseGenerationConfigBuilder {
+public:
+    DevstralGenerationConfigBuilder() = delete;
+    explicit DevstralGenerationConfigBuilder(const ov::genai::GenerationConfig& baseConfig, bool enableToolGuidedGeneration, DecodingMethod decodingMethod) :
+        BaseGenerationConfigBuilder(baseConfig, enableToolGuidedGeneration, decodingMethod) {}
+
+    void parseConfigFromRequest(const OpenAIChatCompletionsRequest& request) override;
+};
+}  // namespace ovms
diff --git a/src/llm/io_processing/devstral/tool_parser.cpp b/src/llm/io_processing/devstral/tool_parser.cpp
new file mode 100644
index 0000000000..3512736499
--- /dev/null
+++ b/src/llm/io_processing/devstral/tool_parser.cpp
@@ -0,0 +1,154 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+
+#include <algorithm>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "src/port/rapidjson_document.hpp"
+
+#include "../../../logging.hpp"
+#include "tool_parser.hpp"
+#include "../utils.hpp"
+#include "src/stringutils.hpp"
+
+namespace ovms {
+
+void DevstralToolParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) {
+    // expected format: [TOOL_CALLS]tool_name[ARGS]{"arg1": "value1", ...}
+    if (parsedOutput.content.empty() || generatedTokens.empty()) {
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "No content to parse for tool calls");
+        return;
+    }
+    size_t firstToolTokenIndex;
+    auto it = std::find(generatedTokens.begin(), generatedTokens.end(), this->botTokenId);
+    if (it != generatedTokens.end()) {
+        firstToolTokenIndex = std::distance(generatedTokens.begin(), it);
+    } else {
+        return;
+    }
+
+    size_t firstArgsTokenIndex;
+    auto itArgs = std::find(generatedTokens.begin() + firstToolTokenIndex, generatedTokens.end(), this->argsTokenId);
+    if (itArgs != generatedTokens.end()) {
+        firstArgsTokenIndex = std::distance(generatedTokens.begin(), itArgs);
+    } else {
+        return;
+    }
+    if (firstToolTokenIndex > firstArgsTokenIndex) {
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "First tool token index is greater than first args token index.");
+        return;
+    }
+    std::vector<int64_t> toolNameTokens(generatedTokens.begin() + (firstToolTokenIndex + 1), generatedTokens.begin() + firstArgsTokenIndex);
+    std::vector<int64_t> argumentsTokens(generatedTokens.begin() + (firstArgsTokenIndex + 1), generatedTokens.end());
+
+    ToolCall toolCall;
+    std::string toolName = tokenizer.decode(toolNameTokens, ov::AnyMap{ov::genai::skip_special_tokens(true)});
+    std::string arguments = tokenizer.decode(argumentsTokens, ov::AnyMap{ov::genai::skip_special_tokens(true)});
+    toolCall.name = toolName;
+    toolCall.arguments = arguments;
+    toolCall.id = generateRandomId();  // Generate a random ID for the tool call
+    parsedOutput.toolCalls.push_back(toolCall);
+
+    // Keep only the content that precedes the tool call, i.e. the subset of generatedTokens from begin() to firstToolTokenIndex
+    std::vector<int64_t> contentTokens;
+    if (firstToolTokenIndex > 0) {
+        contentTokens = std::vector<int64_t>(generatedTokens.begin(), generatedTokens.begin() + firstToolTokenIndex);
+        parsedOutput.content = tokenizer.decode(contentTokens, ov::AnyMap{ov::genai::skip_special_tokens(true)});  // Return only the content up to the tool call
+    } else {
+        parsedOutput.content = "";
+    }
+    return;
+}
+
+std::optional<rapidjson::Document> DevstralToolParser::sendFullDelta(ToolCall& toolCall) {
+    rapidjson::Document argsDelta;
+    argsDelta.Parse(toolCall.arguments.c_str());
+    rapidjson::Document argumentsWrapper;
+    argumentsWrapper.SetObject();
+    rapidjson::Document::AllocatorType& allocator = argumentsWrapper.GetAllocator();
+    // Add the toolCall.arguments string to argumentsWrapper under the "arguments" key
+    rapidjson::Value toolCallsString(rapidjson::kStringType);
+    toolCallsString.SetString(toolCall.arguments.c_str(), allocator);
+    argumentsWrapper.AddMember("arguments", toolCallsString, allocator);
+    auto currentDelta = wrapDelta(argumentsWrapper, this->toolCallIndex);
+    return currentDelta;
+}
+
+std::optional<rapidjson::Document> DevstralToolParser::parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) {
+    /*
+    Devstral output format: [TOOL_CALLS]tool_name[ARGS]arguments</s>
+    The model does not support parallel tool calls, so tool calls always arrive in sequence.
+
+    We have three processing states:
+        AWAITING_START_TAG,
+        AWAITING_ARGS_TAG,
+        PROCESSING_ARGS
+
+    We store the history of chunks in the streamContent string. When a state change is detected, we clear streamContent so it keeps only the unprocessed part.
+    */
+
+    this->streamContent += chunk;
+    SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Chunk content: '{}'", chunk);
+    if (this->internalState == AWAITING_START_TAG) {
+        size_t pos = chunk.find(this->streamingParsingToolCallsStartTag);
+        if (pos != std::string::npos) {
+            this->internalState = AWAITING_ARGS_TAG;
+            this->toolCallIndex++;
+            if (pos == 0) {
+                this->streamContent.clear();
+            } else {
+                this->streamContent = this->streamContent.substr(pos + this->streamingParsingToolCallsStartTag.length());  // skip past "[TOOL_CALLS]" (12 characters)
+            }
+        } else {
+            return std::nullopt;
+        }
+    }
+    if (this->internalState == AWAITING_ARGS_TAG) {
+        // check if the [ARGS] tag is present in the accumulated content and update state accordingly
+        size_t pos = this->streamContent.find(this->streamingParsingArgsStartTag);
+        if (pos != std::string::npos) {
+            this->internalState = PROCESSING_ARGS;
+            this->toolName = this->streamContent.substr(0, pos);
+            this->streamContent = this->streamContent.substr(pos + this->streamingParsingArgsStartTag.length());  // skip past "[ARGS]" (6 characters)
+            return wrapFirstDelta(this->toolName, this->toolCallIndex);
+        } else {
+            return std::nullopt;
+        }
+    }
+    if (finishReason != ov::genai::GenerationFinishReason::NONE) {
+        size_t endPos = this->streamContent.find(this->streamingEndTag);
+        std::string arguments;
+        if (endPos != std::string::npos) {
+            arguments = this->streamContent.substr(0, endPos);
+        } else {
+            arguments = this->streamContent;
+        }
+        if (!arguments.empty()) {
+            ToolCall toolCall;
+            toolCall.arguments = arguments;
+            toolCall.name = this->toolName;
+            return sendFullDelta(toolCall);
+        } else {
+            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "No valid arguments found in streamContent.");
+            return std::nullopt;
+        }
+    }
+    return std::nullopt;
+}
+}  // namespace ovms
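For reviewers, the chunk-to-delta contract that parseChunk implements can be summarized as follows; the chunks are illustrative and mirror the HolisticStreaming test added below (get_weather is only an example name):

    // chunk                                  state after the call   returned delta
    // "[TOOL_CALLS]"                         AWAITING_ARGS_TAG      std::nullopt
    // "get", "_", "weather"                  AWAITING_ARGS_TAG      std::nullopt (name accumulates in streamContent)
    // "[ARGS]"                               PROCESSING_ARGS        first delta with {"name":"get_weather"} and a generated id
    // "{\"city\": \"Paris\"}"                PROCESSING_ARGS        std::nullopt (arguments accumulate)
    // any chunk with finishReason != NONE    PROCESSING_ARGS        one delta carrying the full accumulated arguments string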
diff --git a/src/llm/io_processing/devstral/tool_parser.hpp b/src/llm/io_processing/devstral/tool_parser.hpp
new file mode 100644
index 0000000000..ea839f06d4
--- /dev/null
+++ b/src/llm/io_processing/devstral/tool_parser.hpp
@@ -0,0 +1,95 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#pragma once
+
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "src/port/rapidjson_document.hpp"
+
+#include "src/llm/io_processing/base_output_parser.hpp"
+#include "src/llm/io_processing/partial_json_builder.hpp"
+#include "src/llm/apis/tool_schema_wrapper.hpp"
+
+namespace ovms {
+class DevstralToolParser : public BaseOutputParser {
+    const int64_t argsTokenId;  // [ARGS]
+    const int64_t botTokenId;   // [TOOL_CALLS]
+
+    // in streaming mode we rely on tags in string form, as tokens are not available
+    const std::string streamingParsingArgsStartTag = "[ARGS]";
+    const std::string streamingParsingToolCallsStartTag = "[TOOL_CALLS]";
+    const std::string streamingEndTag = "</s>";
+
+    enum InternalState {
+        AWAITING_START_TAG,
+        AWAITING_ARGS_TAG,
+        PROCESSING_ARGS
+    };
+
+    InternalState internalState = AWAITING_START_TAG;
+    const ToolsSchemas_t& toolSchemas;
+    // Index to track the current tool call being processed (-1 means no tool call has been started yet)
+    int toolCallIndex = -1;
+    std::string streamContent = "";  // content accumulated from stream chunks
+    std::string toolName = "";
+
+    std::optional<rapidjson::Document> sendFullDelta(ToolCall& toolCall);
+
+public:
+    DevstralToolParser() = delete;
+    DevstralToolParser(ov::genai::Tokenizer& tokenizer, const ToolsSchemas_t& toolSchemas) :
+        BaseOutputParser(tokenizer),
+        argsTokenId([&tokenizer, this]() {
+            // cannot use streamingParsingArgsStartTag because the object is not initialized yet
+            auto encoded = tokenizer.encode("[ARGS]", {{"add_special_tokens", false}}).input_ids;
+            if (encoded.get_size() != 1) {
+                throw std::runtime_error("[ARGS] must be a single token in the tokenizer vocabulary.");
+            }
+            return encoded.data<int64_t>()[0];
+        }()),
+        botTokenId([&tokenizer, this]() {
+            // cannot use streamingParsingToolCallsStartTag because the object is not initialized yet
+            auto encoded = tokenizer.encode("[TOOL_CALLS]", {{"add_special_tokens", false}}).input_ids;
+            if (encoded.get_size() != 1) {
+                throw std::runtime_error("[TOOL_CALLS] must be a single token in the tokenizer vocabulary.");
+            }
+            return encoded.data<int64_t>()[0];
+        }()),
+        toolSchemas(toolSchemas) {}
+
+    void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override;
+    std::optional<rapidjson::Document> parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) override;
+    const std::vector<std::string>& getParsingStartTags() const override {
+        static const std::vector<std::string> toolCallStartTags{streamingParsingToolCallsStartTag};
+        return toolCallStartTags;
+    }
+    const std::vector<std::string>& getSpecialParsingStartTags() const override {
+        static const std::vector<std::string> specialParsingStartTags{};
+        return specialParsingStartTags;
+    }
+    // Tool calls are expected to be the last part of the content, so we do not specify an end tag.
+    const std::string& getParsingEndTag() const override {
+        static const std::string toolCallEndTag = "";
+        return toolCallEndTag;
+    }
+
+    bool requiresStreamingWithSpecialTokens() const override {
+        return true;
+    }
+};
+}  // namespace ovms
diff --git a/src/llm/io_processing/generation_config_builder.hpp b/src/llm/io_processing/generation_config_builder.hpp
index 663d4a9b1a..2423cd074d 100644
--- a/src/llm/io_processing/generation_config_builder.hpp
+++ b/src/llm/io_processing/generation_config_builder.hpp
@@ -24,6 +24,7 @@
 #include "phi4/generation_config_builder.hpp"
 #include "llama3/generation_config_builder.hpp"
 #include "hermes3/generation_config_builder.hpp"
+#include "devstral/generation_config_builder.hpp"
 #include "../apis/openai_request.hpp"
 #include "../../logging.hpp"
 
@@ -44,6 +45,8 @@ class GenerationConfigBuilder {
             builder_impl = std::make_unique<Hermes3GenerationConfigBuilder>(baseConfig, enableToolGuidedGeneration, decodingMethod);
         } else if (toolParserName == "phi4") {
             builder_impl = std::make_unique<Phi4GenerationConfigBuilder>(baseConfig, enableToolGuidedGeneration, decodingMethod);
+        } else if (toolParserName == "devstral") {
+            builder_impl = std::make_unique<DevstralGenerationConfigBuilder>(baseConfig, enableToolGuidedGeneration, decodingMethod);
         } else {
             if (enableToolGuidedGeneration) {
                 SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Option enable_tool_guided_generation is set, but will not be effective since no valid tool parser has been provided.");
diff --git a/src/llm/io_processing/output_parser.cpp b/src/llm/io_processing/output_parser.cpp
index cf0a805f59..1c060375df 100644
--- a/src/llm/io_processing/output_parser.cpp
+++ b/src/llm/io_processing/output_parser.cpp
@@ -27,6 +27,7 @@
 #include "gptoss/tool_parser.hpp"
 #include "qwen3/reasoning_parser.hpp"
 #include "qwen3coder/qwen3coder_tool_parser.hpp"
+#include "devstral/tool_parser.hpp"
 #include "gptoss/reasoning_parser.hpp"
 
 namespace ovms {
@@ -168,6 +169,8 @@ OutputParser::OutputParser(ov::genai::Tokenizer& tokenizer, const std::string to
         toolParser = std::make_unique<GptOssToolParser>(tokenizer);
     } else if (toolParserName == "qwen3coder") {
         toolParser = std::make_unique<Qwen3CoderToolParser>(tokenizer, toolNameSchemaMap);
+    } else if (toolParserName == "devstral") {
+        toolParser = std::make_unique<DevstralToolParser>(tokenizer, toolNameSchemaMap);
     } else if (!toolParserName.empty()) {
         throw std::runtime_error("Unsupported tool parser: " + toolParserName);
     }
diff --git a/src/llm/io_processing/output_parser.hpp b/src/llm/io_processing/output_parser.hpp
index 613e0a993e..5aa5e74570 100644
--- a/src/llm/io_processing/output_parser.hpp
+++ b/src/llm/io_processing/output_parser.hpp
@@ -87,8 +87,13 @@ class OutputParser {
     std::optional<rapidjson::Document> parseChunk(const std::string& chunkResponse, const bool toolsAvailable, ov::genai::GenerationFinishReason finishReason);
 
     bool requiresStreamingWithSpecialTokens() const {
-        return (reasoningParser && reasoningParser->requiresStreamingWithSpecialTokens()) &&
-               (toolParser && toolParser->requiresStreamingWithSpecialTokens());
+        if (!reasoningParser) {
+            return toolParser && toolParser->requiresStreamingWithSpecialTokens();
+        } else if (!toolParser) {
+            return reasoningParser->requiresStreamingWithSpecialTokens();
+        } else {
+            return reasoningParser->requiresStreamingWithSpecialTokens() && toolParser->requiresStreamingWithSpecialTokens();
+        }
     }
 };
 }  // namespace ovms
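Note on the output_parser.hpp change: the old conjunction returned false whenever either parser was missing, so a lone tool parser that needs special tokens in the stream (as the Devstral parser does) never received them. The new logic lets a lone parser decide for itself and applies the conjunction only when both parsers are configured:

    // reasoningParser     toolParser            requiresStreamingWithSpecialTokens()
    // absent              requires special      true   (previously false)
    // requires special    absent                true   (previously false)
    // requires special    requires special      true
    // absent              absent                false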
diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp
index 345d1c362b..c3f390d62a 100644
--- a/src/llm/servable.cpp
+++ b/src/llm/servable.cpp
@@ -103,12 +103,17 @@ absl::Status GenAiServable::processTokenizeRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
+    try {
     executionContext->apiHandler = std::make_shared<OpenAIChatCompletionsHandler>(*executionContext->payload.parsedJson,
         executionContext->endpoint,
         std::chrono::system_clock::now(),
         getProperties()->tokenizer,
         getProperties()->toolParserName,
         getProperties()->reasoningParserName);
+    } catch (const std::exception& e) {
+        SPDLOG_LOGGER_ERROR(llm_calculator_logger, "Failed to create API handler: {}", e.what());
+        return absl::InvalidArgumentError(std::string("Failed to create API handler: ") + e.what());
+    }
 
     auto& config = ovms::Config::instance();
     auto status = executionContext->apiHandler->parseRequest(getProperties()->maxTokensLimit, getProperties()->bestOfLimit, getProperties()->maxModelLength, config.getServerSettings().allowedLocalMediaPath);
diff --git a/src/test/llm/output_parsers/devstral_output_parser_test.cpp b/src/test/llm/output_parsers/devstral_output_parser_test.cpp
new file mode 100644
index 0000000000..eae604cf0d
--- /dev/null
+++ b/src/test/llm/output_parsers/devstral_output_parser_test.cpp
@@ -0,0 +1,303 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#include <gtest/gtest.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "../../../llm/io_processing/base_output_parser.hpp"
+#include "../../../llm/io_processing/output_parser.hpp"
+#include "../../platform_utils.hpp"
+
+using namespace ovms;
+
+#ifdef _WIN32
+const std::string tokenizerPath = getWindowsRepoRootPath() + "\\src\\test\\llm_testing\\unsloth\\Devstral-Small-2507";
+#else
+// Hardcoded for usage in docker container
+const std::string tokenizerPath = "/ovms/src/test/llm_testing/unsloth/Devstral-Small-2507/";
+#endif
+
+static ovms::ToolsSchemas_t EMPTY_TOOLS_SCHEMA = {};  // not used for devstral
+static std::unique_ptr<ov::genai::Tokenizer> devstralTokenizer;
+
+class DevstralOutputParserTest : public ::testing::Test {
+protected:
+    std::unique_ptr<OutputParser> outputParserWithRegularToolParsing;
+
+    static void SetUpTestSuite() {
+        try {
+            devstralTokenizer = std::make_unique<ov::genai::Tokenizer>(tokenizerPath);
+        } catch (const std::exception& e) {
+            FAIL() << "Failed to initialize devstral tokenizer: " << e.what();
+        } catch (...) {
+            FAIL() << "Failed to initialize devstral tokenizer due to unknown error.";
+        }
+    }
+
+    static void TearDownTestSuite() {
+        devstralTokenizer.reset();
+    }
+
+    void SetUp() override {
+        // declare tools schema
+        static std::map<std::string, std::string> toolSchemasInput = {
+            {"example_tool", R"({"properties": {"arg1": {"type": "string", "description": "A string argument."}}, "required": ["arg1"]})"},
+        };
+
+        static std::vector<std::unique_ptr<rapidjson::Document>> schemaDocsStorage;
+
+        auto convertStringToolSchemasStringToToolsSchemas = [](
+                                                                const std::map<std::string, std::string>& input) -> ToolsSchemas_t {
+            ToolsSchemas_t result;
+            schemaDocsStorage.clear();
+            for (const auto& [name, schemaStr] : input) {
+                auto schemaDoc = std::make_unique<rapidjson::Document>();
+                if (schemaDoc->Parse(schemaStr.c_str()).HasParseError()) {
+                    throw std::runtime_error("Failed to parse schema for tool: " + name);
+                }
+                result[name] = {schemaDoc.get(), schemaStr};
+                schemaDocsStorage.push_back(std::move(schemaDoc));
+            }
+            return result;
+        };
+
+        static ovms::ToolsSchemas_t toolsSchemas = convertStringToolSchemasStringToToolsSchemas(toolSchemasInput);
+        outputParserWithRegularToolParsing = std::make_unique<OutputParser>(*devstralTokenizer, "devstral", "", toolsSchemas);
+    }
+};
+
+TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithSingleToolCall) {
+    std::string testInput = "[TOOL_CALLS]example_tool[ARGS]{\"arg1\":\"value1\",\"arg2\":42}</s>";
+    auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids;
+    std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
+    ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true);
+    EXPECT_EQ(parsedOutput.content, "");
+    EXPECT_EQ(parsedOutput.reasoning, "");
+    ASSERT_EQ(parsedOutput.toolCalls.size(), 1);
+    EXPECT_EQ(parsedOutput.toolCalls[0].name, "example_tool");
+    EXPECT_EQ(parsedOutput.toolCalls[0].arguments, "{\"arg1\":\"value1\",\"arg2\":42}");
+    EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false);
+}
+
+TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithSingleToolCall_MissingEndTag) {
+    std::string testInput = "Reasoning before tool call [TOOL_CALLS]example_tool[ARGS]{\"arg1\":\"value1\",\"arg2\":42}";
+    auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids;
+    std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
+    ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true);
+    EXPECT_EQ(parsedOutput.content, "Reasoning before tool call ");
+    EXPECT_EQ(parsedOutput.reasoning, "");
+    ASSERT_EQ(parsedOutput.toolCalls.size(), 1);
+    EXPECT_EQ(parsedOutput.toolCalls[0].name, "example_tool");
+    EXPECT_EQ(parsedOutput.toolCalls[0].arguments, "{\"arg1\":\"value1\",\"arg2\":42}");
+    EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false);
+}
+
+TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithContentAndNoToolCalls) {
+    std::string testInput = "This is a regular model response without tool calls.";
+    auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids;
+    std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
+    ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true);
+    EXPECT_EQ(parsedOutput.content, "This is a regular model response without tool calls.");
+    ASSERT_EQ(parsedOutput.toolCalls.size(), 0);
""); +} + +TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithContentAndSingleToolCall) { + std::string testInput = "Reasoninig before tool call [TOOL_CALLS]example_tool[ARGS]{\"arg1\":\"value1\",\"arg2\":42}"; + auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true); + EXPECT_EQ(parsedOutput.content, "Reasoninig before tool call "); + EXPECT_EQ(parsedOutput.reasoning, ""); + ASSERT_EQ(parsedOutput.toolCalls.size(), 1); + EXPECT_EQ(parsedOutput.toolCalls[0].name, "example_tool"); + EXPECT_EQ(parsedOutput.toolCalls[0].arguments, "{\"arg1\":\"value1\",\"arg2\":42}"); + EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false); +} + +TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithInvalidOrder) { + std::string testInput = "Reasoninig before tool call [ARGS]example_tool[TOOL_CALLS]{\"arg1\":\"value1\",\"arg2\":42}"; + auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true); + EXPECT_EQ(parsedOutput.content, "Reasoninig before tool call example_tool{\"arg1\":\"value1\",\"arg2\":42}"); + EXPECT_EQ(parsedOutput.reasoning, ""); + ASSERT_EQ(parsedOutput.toolCalls.size(), 0); +} + +TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithMissingArgsTag) { + std::string input = "Some content [TOOL_CALLS]example_tool{\"arg1\":\"value1\",\"arg2\":42}"; + std::string testInput = input; + auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true); + // Same expected content as tokenizer does not add special tokens + EXPECT_EQ(parsedOutput.content, "Some content example_tool{\"arg1\":\"value1\",\"arg2\":42}"); + EXPECT_EQ(parsedOutput.reasoning, ""); + ASSERT_EQ(parsedOutput.toolCalls.size(), 0); +} + +TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithArrayArguments) { + std::string input = "[TOOL_CALLS]example_tool[ARGS]{\"filepath\":\"/var/log/db.log\",\"status\":[\"completed\",\"failed\"],\"encoding\":\"utf-8\",\"processFunction\":\"processFunction\"}"; + std::string testInput = input; + auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true); + EXPECT_EQ(parsedOutput.content, ""); + EXPECT_EQ(parsedOutput.reasoning, ""); + ASSERT_EQ(parsedOutput.toolCalls.size(), 1); + EXPECT_EQ(parsedOutput.toolCalls[0].name, "example_tool"); + EXPECT_EQ(parsedOutput.toolCalls[0].arguments, "{\"filepath\":\"/var/log/db.log\",\"status\":[\"completed\",\"failed\"],\"encoding\":\"utf-8\",\"processFunction\":\"processFunction\"}"); + EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false); +} + +TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithInvalidArguments) { + std::string input = 
"[TOOL_CALLS]example_tool[ARGS]{ \"filepath\": \"/var/log/db.log\", \"status\": "; + std::string testInput = input; + auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true); + EXPECT_EQ(parsedOutput.content, ""); + EXPECT_EQ(parsedOutput.reasoning, ""); + ASSERT_EQ(parsedOutput.toolCalls.size(), 1); + EXPECT_EQ(parsedOutput.toolCalls[0].name, "example_tool"); + EXPECT_EQ(parsedOutput.toolCalls[0].arguments, "{ \"filepath\": \"/var/log/db.log\", \"status\": "); + EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false); +} + +TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithMissingTool_name) { + std::string input = "[TOOL_CALLS]wrong_name[ARGS]{ \"filepath\": \"/var/log/db.log\"}"; + std::string testInput = input; + auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true); + EXPECT_EQ(parsedOutput.content, "wrong_name{ \"filepath\": \"/var/log/db.log\"}"); + EXPECT_EQ(parsedOutput.reasoning, ""); + ASSERT_EQ(parsedOutput.toolCalls.size(), 0); +} + +TEST_F(DevstralOutputParserTest, HolisticStreaming) { + std::vector>> chunkToDeltaVec{ + // Tool call phase + // Starting first tool. Collecting chunk until full name is received. Don't return until then. + {"Reasoning", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"content":"Reasoning"}})"}, + {"example", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"content":"example"}})"}, + {"[TOOL_CALLS]", ov::genai::GenerationFinishReason::NONE, std::nullopt}, + {"get", ov::genai::GenerationFinishReason::NONE, std::nullopt}, + {"_", ov::genai::GenerationFinishReason::NONE, std::nullopt}, + {"weather", ov::genai::GenerationFinishReason::NONE, std::nullopt}, + {"[ARGS]", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"tool_calls":[{"id":"XXXXXXXXX","type":"function","index":0,"function":{"name":"get_weather"}}]}})"}, + {"{\"", ov::genai::GenerationFinishReason::NONE, std::nullopt}, + {"city\":", ov::genai::GenerationFinishReason::NONE, std::nullopt}, + {" \"Paris", ov::genai::GenerationFinishReason::NONE, std::nullopt}, + // Last chunk is added in the for loop below + }; + ToolsSchemas_t tools_schemas = { + {"get_weather", ToolSchemaWrapper{}}}; + for (auto lastFinishReason : {ov::genai::GenerationFinishReason::STOP, ov::genai::GenerationFinishReason::LENGTH}) { + // Need to have new output parser per case to simulate separate request processing + outputParserWithRegularToolParsing = std::make_unique(*devstralTokenizer, "devstral", "", tools_schemas); + auto chunkToDeltaVecCopy = chunkToDeltaVec; + if (lastFinishReason == ov::genai::GenerationFinishReason::STOP) { + chunkToDeltaVecCopy.push_back({"\"}", ov::genai::GenerationFinishReason::STOP, R"({"delta":{"tool_calls":[{"index":0,"function":{"arguments":"{\"city\": \"Paris\"}"}}]}})"}); + } else { + chunkToDeltaVecCopy.push_back({"\"", ov::genai::GenerationFinishReason::LENGTH, R"({"delta":{"tool_calls":[{"index":0,"function":{"arguments":"{\"city\": \"Paris\""}}]}})"}); + } + int64_t chunkIteration = -1; + for (const auto& [chunk, finishReason, expectedDelta] : 
+        for (const auto& [chunk, finishReason, expectedDelta] : chunkToDeltaVecCopy) {
+            chunkIteration++;
+            std::optional<rapidjson::Document> doc = outputParserWithRegularToolParsing->parseChunk(chunk, true, finishReason);
+            if (!expectedDelta.has_value() && !doc.has_value()) {
+                continue;  // Both are nullopt, OK
+            }
+            if (expectedDelta.has_value() && doc.has_value()) {
+                rapidjson::StringBuffer buffer;
+                rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+                doc->Accept(writer);
+                std::string docStr = buffer.GetString();
+                // If both strings contain "id":"...", compare the id values by length and alphanumeric characters; otherwise compare whole strings
+                std::string expected = expectedDelta.value();
+                std::string idKey = "\"id\":\"";
+                auto docIdPos = docStr.find(idKey);
+                auto expectedIdPos = expected.find(idKey);
+                if (docIdPos != std::string::npos && expectedIdPos != std::string::npos) {
+                    auto docIdStart = docIdPos + idKey.size();
+                    auto docIdEnd = docStr.find("\"", docIdStart);
+                    auto expectedIdStart = expectedIdPos + idKey.size();
+                    auto expectedIdEnd = expected.find("\"", expectedIdStart);
+                    ASSERT_NE(docIdEnd, std::string::npos);
+                    ASSERT_NE(expectedIdEnd, std::string::npos);
+                    std::string docId = docStr.substr(docIdStart, docIdEnd - docIdStart);
+                    std::string expectedId = expected.substr(expectedIdStart, expectedIdEnd - expectedIdStart);
+                    EXPECT_EQ(docId.size(), expectedId.size()) << "ID length mismatch for chunk: " << chunk;
+                    EXPECT_TRUE(std::all_of(docId.begin(), docId.end(), ::isalnum)) << "ID not alphanumeric for chunk: " << chunk;
+                    // Compare everything except the id value
+                    std::string docStrNoId = docStr;
+                    std::string expectedNoId = expected;
+                    docStrNoId.replace(docIdStart, docId.size(), std::string(docId.size(), '*'));
+                    expectedNoId.replace(expectedIdStart, expectedId.size(), std::string(expectedId.size(), '*'));
+                    EXPECT_EQ(docStrNoId, expectedNoId) << "Mismatch for chunk (ignoring id value): " << chunk;
+                } else {
+                    EXPECT_EQ(docStr, expected) << "Mismatch for chunk: [" << chunk << "] got [" << docStr << "] but expected [" << expected << "] at iteration " << chunkIteration;
+                }
+            } else if (expectedDelta.has_value()) {
+                FAIL() << "Mismatch for chunk: [" << chunk << "] got nothing but expected [" << expectedDelta.value() << "] at iteration " << chunkIteration;
+            } else if (doc.has_value()) {
+                rapidjson::StringBuffer buffer;
+                rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+                doc->Accept(writer);
+                std::string docStr = buffer.GetString();
+                FAIL() << "Mismatch for chunk: [" << chunk << "] expected nothing but got [" << docStr << "] at iteration " << chunkIteration;
+            }
+        }
+    }
+}
+
+TEST_F(DevstralOutputParserTest, ToolCallsWithoutToolsInTheRequestStreaming) {
+    std::vector<std::pair<std::string, std::optional<std::string>>> chunkToDeltaVec{
+        // The tool parser is available, but tools are not in the request, so every chunk is just regular content
+        {"[TOOL_CALLS]", "{\"delta\":{\"content\":\"[TOOL_CALLS]\"}}"},
+        {"get_", "{\"delta\":{\"content\":\"get_\"}}"},
+        {"weather", "{\"delta\":{\"content\":\"weather\"}}"},
+        {"[ARGS]", "{\"delta\":{\"content\":\"[ARGS]\"}}"},
+        {"{\"", "{\"delta\":{\"content\":\"{\\\"\"}}"},
+        {"city\":", "{\"delta\":{\"content\":\"city\\\":\"}}"},
+        {"\"Paris\"", "{\"delta\":{\"content\":\"\\\"Paris\\\"\"}}"},
+        {"}", "{\"delta\":{\"content\":\"}\"}}"},
+    };
+
+    for (const auto& [chunk, expectedDelta] : chunkToDeltaVec) {
+        // Second argument is false as we simulate the case where tools have not been provided in the request
+        std::optional<rapidjson::Document> doc = outputParserWithRegularToolParsing->parseChunk(chunk, false, ov::genai::GenerationFinishReason::NONE);
+        if (!expectedDelta.has_value() && !doc.has_value()) {
+            continue;  // Both are nullopt, OK
+        }
+        if (expectedDelta.has_value() && doc.has_value()) {
+            rapidjson::StringBuffer buffer;
+            rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+            doc->Accept(writer);
+            std::string docStr = buffer.GetString();
+            std::string expected = expectedDelta.value();
+            EXPECT_EQ(docStr, expected) << "Mismatch for chunk: " << chunk;
+        } else {
+            FAIL() << "Mismatch between expectedDelta and doc for chunk: " << chunk;
+        }
+    }
+}
diff --git a/windows_prepare_llm_models.bat b/windows_prepare_llm_models.bat
index be18265bf8..57a634c046 100644
--- a/windows_prepare_llm_models.bat
+++ b/windows_prepare_llm_models.bat
@@ -42,6 +42,7 @@ set "HERMES3_MODEL=NousResearch/Hermes-3-Llama-3.1-8B"
 set "PHI4_MODEL=microsoft/Phi-4-mini-instruct"
 set "MISTRAL_MODEL=mistralai/Mistral-7B-Instruct-v0.3"
 set "GPTOSS_MODEL=openai/gpt-oss-20b"
+set "DEVSTRAL_MODEL=unsloth/Devstral-Small-2507"
 
 echo Downloading LLM testing models to directory %~1
 set "PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly"
@@ -202,4 +203,17 @@ if not exist "%~1\%GPTOSS_MODEL%\%TOKENIZER_FILE%" (
     exit /b 1
 )
 
+if exist "%~1\%DEVSTRAL_MODEL%\%TOKENIZER_FILE%" (
+    echo Models file %~1\%DEVSTRAL_MODEL%\%TOKENIZER_FILE% exists. Skipping downloading models.
+) else (
+    echo Downloading tokenizer and detokenizer for Devstral model to %~1\%DEVSTRAL_MODEL% directory.
+    mkdir "%~1\%DEVSTRAL_MODEL%"
+    convert_tokenizer "%DEVSTRAL_MODEL%" --with_detokenizer -o "%~1\%DEVSTRAL_MODEL%"
+    if !errorlevel! neq 0 exit /b !errorlevel!
+)
+if not exist "%~1\%DEVSTRAL_MODEL%\%TOKENIZER_FILE%" (
+    echo Models file %~1\%DEVSTRAL_MODEL%\%TOKENIZER_FILE% does not exist.
+    exit /b 1
+)
+
 endlocal
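For reference, a minimal non-streaming usage sketch of the new parser, mirroring the unit tests above (the tokenizer path and the model output string are illustrative, and the schemas map must contain the called tool):

    ov::genai::Tokenizer tokenizer("/models/unsloth/Devstral-Small-2507");
    ovms::ToolsSchemas_t schemas = {/* "get_weather" -> schema wrapper, built as in the test fixture */};
    ovms::OutputParser parser(tokenizer, "devstral", /*reasoningParserName=*/"", schemas);

    auto ids = tokenizer.encode("[TOOL_CALLS]get_weather[ARGS]{\"city\":\"Paris\"}", ov::genai::add_special_tokens(false)).input_ids;
    std::vector<int64_t> tokens(ids.data<int64_t>(), ids.data<int64_t>() + ids.get_size());
    ovms::ParsedOutput out = parser.parse(tokens, /*toolsAvailable=*/true);
    // out.toolCalls[0].name == "get_weather", out.toolCalls[0].arguments == "{\"city\":\"Paris\"}", out.content == ""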