Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/http_rest_api_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ static Status createV3HttpPayload(
return Status(StatusCode::JSON_INVALID, "model field is not a string");
}

bool isTextGenerationEndpoint = uri.find("completions") != std::string_view::npos;
bool isTextGenerationEndpoint = (uri.find("completions") != std::string_view::npos) || (uri.find("responses") != std::string_view::npos);
if (isTextGenerationEndpoint) {
auto streamIt = parsedJson->FindMember("stream");
if (streamIt != parsedJson->MemberEnd()) {
Expand Down
862 changes: 831 additions & 31 deletions src/llm/apis/openai_completions.cpp

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions src/llm/apis/openai_completions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ namespace ovms {
/// Identifies which REST endpoint a request was sent to.
/// The servables' loadRequest() selects the value from the request URI.
/// Enumerator values are spelled out and equal the implicit numbering.
enum class Endpoint {
    /// "/v3/chat/completions" or "/v3/v1/chat/completions".
    CHAT_COMPLETIONS = 0,
    /// "/v3/completions" or "/v3/v1/completions".
    COMPLETIONS = 1,
    /// "/v3/responses" or "/v3/v1/responses".
    RESPONSES = 2,
    /// Any URI accepted by TokenizeParser::isTokenizeEndpoint().
    TOKENIZE = 3,
};

Expand All @@ -69,12 +70,17 @@ class OpenAIChatCompletionsHandler {
std::chrono::time_point<std::chrono::system_clock> created;
ov::genai::Tokenizer tokenizer;
size_t processedTokens = 0; // tracks overall number of tokens processed by the pipeline
size_t responsesStreamingSequenceNumber = 0;
bool responsesStreamingInitialized = false;
std::string responsesStreamingOutputText;

// Output parser is used to parse chat completions response to extract specific fields like tool calls and reasoning.
std::unique_ptr<OutputParser> outputParser = nullptr;

absl::Status parseCompletionsPart();
absl::Status parseChatCompletionsPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
absl::Status parseResponsesPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
absl::Status parseResponsesInputDirectly(std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
absl::Status parseCommonPart(std::optional<uint32_t> maxTokensLimit, uint32_t bestOfLimit, std::optional<uint32_t> maxModelLength);

ParsedOutput parseOutputIfNeeded(const std::vector<int64_t>& generatedIds);
Expand Down
56 changes: 53 additions & 3 deletions src/llm/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,12 @@ absl::Status GenAiServable::loadRequest(std::shared_ptr<GenAiServableExecutionCo
executionContext->endpoint = Endpoint::CHAT_COMPLETIONS;
} else if (payload.uri == "/v3/completions" || payload.uri == "/v3/v1/completions") {
executionContext->endpoint = Endpoint::COMPLETIONS;
} else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") {
executionContext->endpoint = Endpoint::RESPONSES;
} else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
executionContext->endpoint = Endpoint::TOKENIZE;
} else {
return absl::InvalidArgumentError("Wrong endpoint. Allowed endpoints: /v3/chat/completions, /v3/completions");
return absl::InvalidArgumentError("Wrong endpoint. Allowed endpoints: /v3/chat/completions, /v3/completions, /v3/responses, /v3/tokenize");
}
executionContext->payload = payload;
return absl::OkStatus();
Expand Down Expand Up @@ -204,6 +206,50 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
}
break;
}
case Endpoint::RESPONSES: {
if (executionContext->apiHandler->getChatHistory().size() > 0) {
#if (PYTHON_DISABLE == 0)
bool success;
if (executionContext->apiHandler->getProcessedJson().size() > 0) {
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText);
} else {
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->payload.body, inputText);
}
if (!success) {
return absl::Status(absl::StatusCode::kInvalidArgument, inputText);
}
#else
ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
constexpr bool add_generation_prompt = true;
auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer();
if (!toolsStatus.ok()) {
return toolsStatus.status();
}
const auto& tools = toolsStatus.value();
auto chatTemplateKwargsStatus = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer();
if (!chatTemplateKwargsStatus.ok()) {
return chatTemplateKwargsStatus.status();
}
const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value();
try {
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools, chatTemplateKwargs);
} catch (const std::exception& e) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what());
return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one.");
}
#endif
if (inputText.size() == 0) {
return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty");
}
} else {
auto prompt = executionContext->apiHandler->getPrompt();
if (!prompt.has_value()) {
return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing");
}
inputText = prompt.value();
}
break;
}
case Endpoint::COMPLETIONS: {
inputText = executionContext->apiHandler->getPrompt().value();
break;
Expand Down Expand Up @@ -277,8 +323,12 @@ absl::Status GenAiServable::preparePartialResponse(std::shared_ptr<GenAiServable
if (!serializedChunk.empty()) {
executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
}
if (executionContext->apiHandler->getStreamOptions().includeUsage)
executionContext->response += wrapTextInServerSideEventMessage(executionContext->apiHandler->serializeStreamingUsageChunk());
if (executionContext->apiHandler->getStreamOptions().includeUsage) {
std::string usageChunk = executionContext->apiHandler->serializeStreamingUsageChunk();
if (!usageChunk.empty()) {
executionContext->response += wrapTextInServerSideEventMessage(usageChunk);
}
}

executionContext->response += wrapTextInServerSideEventMessage("[DONE]");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,12 @@ absl::Status VisualLanguageModelServable::loadRequest(std::shared_ptr<GenAiServa
}
if (payload.uri == "/v3/chat/completions" || payload.uri == "/v3/v1/chat/completions") {
executionContext->endpoint = Endpoint::CHAT_COMPLETIONS;
} else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") {
executionContext->endpoint = Endpoint::RESPONSES;
} else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
executionContext->endpoint = Endpoint::TOKENIZE;
} else {
return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions endpoint or /v3/tokenize");
return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions, /v3/responses endpoint or /v3/tokenize");
}
executionContext->payload = payload;
return absl::OkStatus();
Expand All @@ -67,7 +69,7 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer
if (vlmExecutionContext->apiHandler == nullptr) {
return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized");
}
if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS) {
if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) {
ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory();

for (size_t i = 0; i < chatHistory.size(); i++) {
Expand Down
6 changes: 4 additions & 2 deletions src/llm/visual_language_model/legacy/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,12 @@ absl::Status VisualLanguageModelLegacyServable::loadRequest(std::shared_ptr<GenA
}
if (payload.uri == "/v3/chat/completions" || payload.uri == "/v3/v1/chat/completions") {
executionContext->endpoint = Endpoint::CHAT_COMPLETIONS;
} else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") {
executionContext->endpoint = Endpoint::RESPONSES;
} else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
executionContext->endpoint = Endpoint::TOKENIZE;
} else {
return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions endpoint or /v3/tokenize");
return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions, /v3/responses endpoint or /v3/tokenize");
}
executionContext->payload = payload;
return absl::OkStatus();
Expand Down Expand Up @@ -237,7 +239,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge
if (vlmExecutionContext->apiHandler == nullptr) {
return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized");
}
if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS) {
if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) {
ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory();

for (size_t i = 0; i < chatHistory.size(); i++) {
Expand Down
Loading