From 716e391cea4e34b52eb6382a894fe4f274647a6d Mon Sep 17 00:00:00 2001 From: Pawel Date: Mon, 15 Dec 2025 12:47:29 +0100 Subject: [PATCH 1/7] save --- demos/rerank/README.md | 32 ++++ demos/rerank/config.json | 22 --- demos/rerank/models/graph.pbtxt | 29 ---- src/rerank/BUILD | 1 + src/rerank/rerank_calculator_ov.cc | 20 +++ src/test/reranknode_test.cpp | 238 +++++++++++++++++++++++++++++ 6 files changed, 291 insertions(+), 51 deletions(-) delete mode 100644 demos/rerank/config.json delete mode 100644 demos/rerank/models/graph.pbtxt diff --git a/demos/rerank/README.md b/demos/rerank/README.md index 04f4ae55eb..a6b1b4b82b 100644 --- a/demos/rerank/README.md +++ b/demos/rerank/README.md @@ -260,4 +260,36 @@ tomaarsen/Qwen3-Reranker-0.6B-seq-cls Check [RAG demo](../continuous_batching/rag/README.md) which employs `rerank` endpoint together with `chat/completions` and `embeddings`. +# Usage of tokenize endpoint (release 2025.6 or weekly) + +The `tokenize` endpoint provides a simple API for tokenizing input text using the same tokenizer as the deployed rerank model. This allows you to see how your text will be split into tokens before feature extraction or inference. The endpoint accepts a string or list of strings and returns the corresponding token IDs. + +Example usage: +```console +curl http://localhost:8000/v3/tokenize -H "Content-Type: application/json" -d "{ \"model\": \"BAAI/bge-reranker-large\", \"text\": \"hello world\" }" +``` +Response: +```json +{ + "tokens": [33600,31,8999] +} +``` + +It's possible to use additional parameters: + - `pad_to_max_length` - whether to pad the sequence to the maximum length. Default is False. + - `max_length` - maximum length of the sequence. If specified, it truncates the tokens to the provided number. + - `padding_side` - side to pad the sequence, can be `left` or `right`. Default is `right`. + - `add_special_tokens` - whether to add special tokens like BOS, EOS, PAD. Default is True. 
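For programmatic access, here is a minimal Python sketch of the same request (an illustrative assumption, not part of the demo: it presumes the `requests` package and a server listening on `localhost:8000`; the parameter names follow the list above, and the example token IDs match the curl responses shown in this README):
```python
import requests

# Hedged sketch: tokenize a single string with the optional parameters
# described above. Endpoint, field names and example token IDs are taken
# from the curl examples in this README; the server address is assumed.
payload = {
    "model": "BAAI/bge-reranker-large",
    "text": "hello world",
    "max_length": 10,           # truncate to at most 10 tokens
    "pad_to_max_length": True,  # pad the sequence up to max_length
    "padding_side": "left",     # pad on the left instead of the default right
}
response = requests.post("http://localhost:8000/v3/tokenize", json=payload)
response.raise_for_status()
print(response.json()["tokens"])  # e.g. [1, 1, 1, 1, 1, 1, 1, 33600, 31, 8999]
```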
+ + Example usage: +```console +curl http://localhost:8000/v3/tokenize -H "Content-Type: application/json" -d "{ \"model\": \"BAAI/bge-reranker-large\", \"text\": \"hello world\", \"max_length\": 10, \"pad_to_max_length\": true, \"padding_side\": \"left\", \"add_special_tokens\": true }" +``` + +Response: +```json +{ + "tokens": [1,1,1,1,1,1,1,33600,31,8999] +} +``` diff --git a/demos/rerank/config.json b/demos/rerank/config.json deleted file mode 100644 index 6510ff1f1e..0000000000 --- a/demos/rerank/config.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "model_config_list": [ - { - "config": { - "name": "tokenizer", - "base_path": "/workspace/models/BAAI/bge-reranker-large-tokenizer" - } - }, - { - "config": { - "name": "rerank_model", - "base_path": "/workspace/models/BAAI/bge-reranker-large-rerank" - } - } - ], - "mediapipe_config_list": [ - { - "name": "rerank", - "graph_path": "/workspace/models/graph.pbtxt" - } - ] -} diff --git a/demos/rerank/models/graph.pbtxt b/demos/rerank/models/graph.pbtxt deleted file mode 100644 index d68fa2c511..0000000000 --- a/demos/rerank/models/graph.pbtxt +++ /dev/null @@ -1,29 +0,0 @@ -input_stream: "REQUEST_PAYLOAD:input" -output_stream: "RESPONSE_PAYLOAD:output" -node { - calculator: "OpenVINOModelServerSessionCalculator" - output_side_packet: "SESSION:tokenizer" - node_options: { - [type.googleapis.com / mediapipe.OpenVINOModelServerSessionCalculatorOptions]: { - servable_name: "tokenizer" - servable_version: "1" - } - } -} -node { - calculator: "OpenVINOModelServerSessionCalculator" - output_side_packet: "SESSION:rerank" - node_options: { - [type.googleapis.com / mediapipe.OpenVINOModelServerSessionCalculatorOptions]: { - servable_name: "rerank_model" - servable_version: "1" - } - } -} -node { - input_side_packet: "TOKENIZER_SESSION:tokenizer" - input_side_packet: "RERANK_SESSION:rerank" - calculator: "RerankCalculator" - input_stream: "REQUEST_PAYLOAD:input" - output_stream: "RESPONSE_PAYLOAD:output" -} diff --git a/src/rerank/BUILD b/src/rerank/BUILD index 9d3f88605a..7f3b1a6ec9 100644 --- a/src/rerank/BUILD +++ b/src/rerank/BUILD @@ -80,6 +80,7 @@ ovms_cc_library( "//src:model_metric_reporter", "//src:executingstreamidguard", "//src:libovms_execution_context", + "//src/tokenize:tokenize_parser", ], visibility = ["//visibility:public"], alwayslink = 1, diff --git a/src/rerank/rerank_calculator_ov.cc b/src/rerank/rerank_calculator_ov.cc index 5cefcaddce..d3e151e855 100644 --- a/src/rerank/rerank_calculator_ov.cc +++ b/src/rerank/rerank_calculator_ov.cc @@ -42,6 +42,7 @@ #include "rerank_servable.hpp" #include "../model_metric_reporter.hpp" #include "../executingstreamidguard.hpp" +#include "../tokenize/tokenize_parser.hpp" using namespace rapidjson; using namespace ovms; @@ -289,6 +290,25 @@ class RerankCalculatorOV : public CalculatorBase { InputDataType payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Request body: {}", payload.body); SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Request uri: {}", payload.uri); + + if(TokenizeParser::isTokenizeEndpoint(payload.uri)) { + TokenizeRequest tokenizeRequest; + absl::Status status = TokenizeParser::parseTokenizeRequest(*payload.parsedJson, tokenizeRequest); + if (!status.ok()) { + return status; + } + if (auto strings = std::get_if>(&tokenizeRequest.input)) { + auto tokens = rerank_session->getTokenizer().encode(*strings, tokenizeRequest.parameters); + StringBuffer buffer; + status = TokenizeParser::parseTokenizeResponse(buffer, tokens, 
tokenizeRequest.parameters); + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(new std::string(buffer.GetString()), timestamp); + return absl::OkStatus(); + } else { + SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Rerank tokenize input is of not supported type"); + return absl::InvalidArgumentError("Input should be string or array of strings"); + } + } + RerankHandler handler(*payload.parsedJson); absl::Status status = handler.parseRequest(); if (!status.ok()) { diff --git a/src/test/reranknode_test.cpp b/src/test/reranknode_test.cpp index 89cdba5422..09434fbddd 100644 --- a/src/test/reranknode_test.cpp +++ b/src/test/reranknode_test.cpp @@ -383,3 +383,241 @@ INSTANTIATE_TEST_SUITE_P( RerankWithInvalidParamsHttpTestInstances, RerankWithInvalidParamsHttpTest, graphs); + +class RerankTokenizeHttpTest : public V3HttpTest { +protected: + static std::unique_ptr t; + +public: + const std::string endpointTokenize = "/v3/tokenize"; + static void SetUpTestSuite() { + std::string port = "9173"; + std::string configPath = getGenericFullPathForSrcTest("/ovms/src/test/rerank/with_params/invalid_config.json"); + SetUpSuite(port, configPath, t); + } + + static void TearDownTestSuite() { + TearDownSuite(t); + } + + static void AssertTokenizationResult(const std::string& response, const std::vector& expectedTokens) { + rapidjson::Document d; + rapidjson::ParseResult ok = d.Parse(response.c_str()); + ASSERT_EQ(ok.Code(), 0); + ASSERT_TRUE(d.HasMember("tokens")); + ASSERT_TRUE(d["tokens"].IsArray()); + ASSERT_EQ(d["tokens"].Size(), expectedTokens.size()); + for (size_t i = 0; i < expectedTokens.size(); ++i) { + ASSERT_EQ(d["tokens"][(rapidjson::SizeType)i].GetInt(), expectedTokens[i]); + } + } + + static void AssertTokenizationResult(const std::string& response, const std::vector>& expectedTokensBatch) { + rapidjson::Document d; + rapidjson::ParseResult ok = d.Parse(response.c_str()); + ASSERT_EQ(ok.Code(), 0); + ASSERT_TRUE(d.HasMember("tokens")); + ASSERT_TRUE(d["tokens"].IsArray()); + ASSERT_EQ(d["tokens"].Size(), expectedTokensBatch.size()); + for (size_t i = 0; i < expectedTokensBatch.size(); ++i) { + const auto& expectedTokens = expectedTokensBatch[i]; + ASSERT_TRUE(d["tokens"][(rapidjson::SizeType)i].IsArray()); + ASSERT_EQ(d["tokens"][(rapidjson::SizeType)i].Size(), expectedTokens.size()); + for (size_t j = 0; j < expectedTokens.size(); ++j) { + ASSERT_EQ(d["tokens"][(rapidjson::SizeType)i][(rapidjson::SizeType)j].GetInt(), expectedTokens[j]); + } + } + } +}; + +std::unique_ptr RerankTokenizeHttpTest::t; + +TEST_F(RerankTokenizeHttpTest, tokenizePositive) { + std::string requestBody = R"( + { + "model": "rerank_ov", + "text": "hello world" + } + )"; + std::vector expectedTokens = {33600,31,8999}; + ASSERT_EQ( + handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + AssertTokenizationResult(response, expectedTokens); +} + +TEST_F(RerankTokenizeHttpTest, tokenizeNegativeMissingText) { + std::string requestBody = R"( + { + "model": "rerank_ov" + } + )"; + Status status = handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser); + ASSERT_EQ(status, ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR) << status.string(); +} + +TEST_F(RerankTokenizeHttpTest, tokenizeNegativeInvalidModel) { + std::string requestBody = R"( + { + "model": "non_existing_model", + "text": "hello world" + } + )"; + Status status = handler->dispatchToProcessor(endpointTokenize, 
requestBody, &response, comp, responseComponents, writer, multiPartParser); + ASSERT_EQ(status, ovms::StatusCode::MEDIAPIPE_DEFINITION_NAME_MISSING) << status.string(); +} + +TEST_F(RerankTokenizeHttpTest, tokenizePositiveMaxLenParam) { + std::string requestBody = R"( + { + "model": "rerank_ov", + "text": "hello world", + "max_length": 3 + } + )"; + std::vector expectedTokens = {101, 7592, 102}; + ASSERT_EQ( + handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + AssertTokenizationResult(response, expectedTokens); +} + +TEST_F(RerankTokenizeHttpTest, tokenizePositivePadToMaxLenParam) { + std::string requestBody = R"( + { + "model": "rerank_ov", + "text": "hello world", + "max_length": 100, + "pad_to_max_length": true + } + )"; + std::vector expectedTokens(96, 0); + expectedTokens.insert(expectedTokens.begin(), {101, 7592, 2088, 102}); + ASSERT_EQ( + handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + AssertTokenizationResult(response, expectedTokens); +} + +TEST_F(RerankTokenizeHttpTest, tokenizePositivePaddingSideLeft) { + std::string requestBody = R"( + { + "model": "rerank_ov", + "text": "hello world", + "max_length": 100, + "pad_to_max_length": true, + "padding_side": "left" + } + )"; + std::vector expectedTokens(96, 0); + expectedTokens.insert(expectedTokens.end(), {101, 7592, 2088, 102}); + ASSERT_EQ( + handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + AssertTokenizationResult(response, expectedTokens); +} + +TEST_F(RerankTokenizeHttpTest, tokenizePositivePaddingSideRight) { + std::string requestBody = R"( + { + "model": "rerank_ov", + "text": "hello world", + "max_length": 100, + "pad_to_max_length": true, + "padding_side": "right" + } + )"; + std::vector expectedTokens(96, 0); + expectedTokens.insert(expectedTokens.begin(), {101, 7592, 2088, 102}); + ASSERT_EQ( + handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + AssertTokenizationResult(response, expectedTokens); +} + +TEST_F(RerankTokenizeHttpTest, tokenizeNegativeInvalidPaddingSide) { + std::string requestBody = R"( + { + "model": "rerank_ov", + "text": "hello world", + "padding_side": "invalid_value" + } + )"; + Status status = handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser); + ASSERT_EQ(status, ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR) << status.string(); +} + +TEST_F(RerankTokenizeHttpTest, tokenizePositiveAddSpecialTokensFalse) { + std::string requestBody = R"( + { + "model": "rerank_ov", + "text": "hello world", + "add_special_tokens": false + } + )"; + std::vector expectedTokens = {7592, 2088}; + ASSERT_EQ( + handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + AssertTokenizationResult(response, expectedTokens); +} + +TEST_F(RerankTokenizeHttpTest, tokenizePositiveMaxLengthIgnored) { + std::string requestBody = R"( + { + "model": "rerank_ov", + "text": "hello world", + "max_length": 513, + "pad_to_max_length": true + } + )"; + std::vector expectedTokens(509, 0); + expectedTokens.insert(expectedTokens.begin(), {101, 7592, 2088, 102}); + 
ASSERT_EQ(handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + AssertTokenizationResult(response, expectedTokens); +} + +TEST_F(RerankTokenizeHttpTest, tokenizePositiveBatch) { + std::string requestBody = R"( + { + "model": "rerank_ov", + "text": ["hello", "hello world", "hello hello hello world"] + } + )"; + Status status = handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser); + std::vector> expectedTokens = { + {101, 7592, 102}, + {101, 7592, 2088, 102}, + {101, 7592, 7592, 7592, 2088, 102}}; + rapidjson::Document d; + rapidjson::ParseResult ok = d.Parse(response.c_str()); + ASSERT_EQ(ok.Code(), 0); + ASSERT_EQ( + handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + AssertTokenizationResult(response, expectedTokens); +} + +TEST_F(RerankTokenizeHttpTest, tokenizeBatchWithPadToMaxLen) { + std::string requestBody = R"( + { + "model": "rerank_ov", + "text": ["hello", "hello world", "hello hello hello world"], + "max_length": 6, + "pad_to_max_length": true + } + )"; + Status status = handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser); + std::vector> expectedTokens = { + {101, 7592, 102, 0, 0, 0}, + {101, 7592, 2088, 102, 0, 0}, + {101, 7592, 7592, 7592, 2088, 102}}; + rapidjson::Document d; + rapidjson::ParseResult ok = d.Parse(response.c_str()); + ASSERT_EQ(ok.Code(), 0); + ASSERT_EQ( + handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + AssertTokenizationResult(response, expectedTokens); +} From 4cc3b160b3a053d90521e4c8456f4f7b29e9ed31 Mon Sep 17 00:00:00 2001 From: Pawel Date: Wed, 17 Dec 2025 12:14:25 +0100 Subject: [PATCH 2/7] updated README and tessts --- demos/rerank/README.md | 3 +-- src/rerank/rerank_calculator_ov.cc | 1 + src/test/reranknode_test.cpp | 39 +++++++++++++++--------------- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/demos/rerank/README.md b/demos/rerank/README.md index a6b1b4b82b..44ffe0c7c9 100644 --- a/demos/rerank/README.md +++ b/demos/rerank/README.md @@ -279,11 +279,10 @@ It's possible to use additional parameters: - `pad_to_max_length` - whether to pad the sequence to the maximum length. Default is False. - `max_length` - maximum length of the sequence. If specified, it truncates the tokens to the provided number. - `padding_side` - side to pad the sequence, can be `left` or `right`. Default is `right`. - - `add_special_tokens` - whether to add special tokens like BOS, EOS, PAD. Default is True. 
Example usage: ```console -curl http://localhost:8000/v3/tokenize -H "Content-Type: application/json" -d "{ \"model\": \"BAAI/bge-reranker-large\", \"text\": \"hello world\", \"max_length\": 10, \"pad_to_max_length\": true, \"padding_side\": \"left\", \"add_special_tokens\": true }" +curl http://localhost:8000/v3/tokenize -H "Content-Type: application/json" -d "{ \"model\": \"BAAI/bge-reranker-large\", \"text\": \"hello world\", \"max_length\": 10, \"pad_to_max_length\": true, \"padding_side\": \"left\"}" ``` Response: diff --git a/src/rerank/rerank_calculator_ov.cc b/src/rerank/rerank_calculator_ov.cc index d3e151e855..c911c26532 100644 --- a/src/rerank/rerank_calculator_ov.cc +++ b/src/rerank/rerank_calculator_ov.cc @@ -294,6 +294,7 @@ class RerankCalculatorOV : public CalculatorBase { if(TokenizeParser::isTokenizeEndpoint(payload.uri)) { TokenizeRequest tokenizeRequest; absl::Status status = TokenizeParser::parseTokenizeRequest(*payload.parsedJson, tokenizeRequest); + tokenizeRequest.parameters["add_special_tokens"] = false; // Rerank model tokenizer should not add special tokens if (!status.ok()) { return status; } diff --git a/src/test/reranknode_test.cpp b/src/test/reranknode_test.cpp index 09434fbddd..fa114b0c9c 100644 --- a/src/test/reranknode_test.cpp +++ b/src/test/reranknode_test.cpp @@ -392,7 +392,7 @@ class RerankTokenizeHttpTest : public V3HttpTest { const std::string endpointTokenize = "/v3/tokenize"; static void SetUpTestSuite() { std::string port = "9173"; - std::string configPath = getGenericFullPathForSrcTest("/ovms/src/test/rerank/with_params/invalid_config.json"); + std::string configPath = getGenericFullPathForSrcTest("/ovms/src/test/rerank/config.json"); SetUpSuite(port, configPath, t); } @@ -471,11 +471,11 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositiveMaxLenParam) { std::string requestBody = R"( { "model": "rerank_ov", - "text": "hello world", + "text": "hello world hello world", "max_length": 3 } )"; - std::vector expectedTokens = {101, 7592, 102}; + std::vector expectedTokens = {33600,31,8999}; ASSERT_EQ( handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), ovms::StatusCode::OK); @@ -491,8 +491,8 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositivePadToMaxLenParam) { "pad_to_max_length": true } )"; - std::vector expectedTokens(96, 0); - expectedTokens.insert(expectedTokens.begin(), {101, 7592, 2088, 102}); + std::vector expectedTokens(97, 1); + expectedTokens.insert(expectedTokens.begin(), {33600,31,8999}); ASSERT_EQ( handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), ovms::StatusCode::OK); @@ -509,8 +509,8 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositivePaddingSideLeft) { "padding_side": "left" } )"; - std::vector expectedTokens(96, 0); - expectedTokens.insert(expectedTokens.end(), {101, 7592, 2088, 102}); + std::vector expectedTokens(97, 1); + expectedTokens.insert(expectedTokens.end(), {33600,31,8999}); ASSERT_EQ( handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), ovms::StatusCode::OK); @@ -527,8 +527,8 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositivePaddingSideRight) { "padding_side": "right" } )"; - std::vector expectedTokens(96, 0); - expectedTokens.insert(expectedTokens.begin(), {101, 7592, 2088, 102}); + std::vector expectedTokens(97, 1); + expectedTokens.insert(expectedTokens.begin(), {33600,31,8999}); ASSERT_EQ( 
handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), ovms::StatusCode::OK); @@ -551,11 +551,10 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositiveAddSpecialTokensFalse) { std::string requestBody = R"( { "model": "rerank_ov", - "text": "hello world", - "add_special_tokens": false + "text": "hello world" } )"; - std::vector expectedTokens = {7592, 2088}; + std::vector expectedTokens = {33600, 31, 8999}; ASSERT_EQ( handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), ovms::StatusCode::OK); @@ -571,8 +570,8 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositiveMaxLengthIgnored) { "pad_to_max_length": true } )"; - std::vector expectedTokens(509, 0); - expectedTokens.insert(expectedTokens.begin(), {101, 7592, 2088, 102}); + std::vector expectedTokens(510, 1); + expectedTokens.insert(expectedTokens.begin(), {33600,31,8999}); ASSERT_EQ(handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), ovms::StatusCode::OK); AssertTokenizationResult(response, expectedTokens); @@ -587,9 +586,9 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositiveBatch) { )"; Status status = handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser); std::vector> expectedTokens = { - {101, 7592, 102}, - {101, 7592, 2088, 102}, - {101, 7592, 7592, 7592, 2088, 102}}; + {33600, 31}, + {33600, 31, 8999}, + {33600,31,33600,31,33600,31,8999}}; rapidjson::Document d; rapidjson::ParseResult ok = d.Parse(response.c_str()); ASSERT_EQ(ok.Code(), 0); @@ -610,9 +609,9 @@ TEST_F(RerankTokenizeHttpTest, tokenizeBatchWithPadToMaxLen) { )"; Status status = handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser); std::vector> expectedTokens = { - {101, 7592, 102, 0, 0, 0}, - {101, 7592, 2088, 102, 0, 0}, - {101, 7592, 7592, 7592, 2088, 102}}; + {33600, 31, 1, 1, 1, 1}, + {33600, 31, 8999, 1, 1, 1}, + {33600,31,33600,31,33600,31}}; rapidjson::Document d; rapidjson::ParseResult ok = d.Parse(response.c_str()); ASSERT_EQ(ok.Code(), 0); From 0ed3199d5df437ced45e61780b9906b74dbeb346 Mon Sep 17 00:00:00 2001 From: Pawel Date: Wed, 17 Dec 2025 13:15:23 +0100 Subject: [PATCH 3/7] corrected tests --- src/test/reranknode_test.cpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/test/reranknode_test.cpp b/src/test/reranknode_test.cpp index fa114b0c9c..69047c733e 100644 --- a/src/test/reranknode_test.cpp +++ b/src/test/reranknode_test.cpp @@ -547,20 +547,6 @@ TEST_F(RerankTokenizeHttpTest, tokenizeNegativeInvalidPaddingSide) { ASSERT_EQ(status, ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR) << status.string(); } -TEST_F(RerankTokenizeHttpTest, tokenizePositiveAddSpecialTokensFalse) { - std::string requestBody = R"( - { - "model": "rerank_ov", - "text": "hello world" - } - )"; - std::vector expectedTokens = {33600, 31, 8999}; - ASSERT_EQ( - handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), - ovms::StatusCode::OK); - AssertTokenizationResult(response, expectedTokens); -} - TEST_F(RerankTokenizeHttpTest, tokenizePositiveMaxLengthIgnored) { std::string requestBody = R"( { @@ -620,3 +606,19 @@ TEST_F(RerankTokenizeHttpTest, tokenizeBatchWithPadToMaxLen) { ovms::StatusCode::OK); AssertTokenizationResult(response, 
expectedTokens); } + +TEST_F(RerankTokenizeHttpTest, tokenizeIgnooreAddSpecialTokensParameter) { + std::string requestBody = R"( + { + "model": "rerank_ov", + "text": "hello world", + "max_length": 3, + "add_special_tokens": true + } + )"; + std::vector expectedTokens = {33600, 31, 8999}; + ASSERT_EQ( + handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + AssertTokenizationResult(response, expectedTokens); +} From b64c90c7a93e28c4b8eed0cef5c5ada10d950551 Mon Sep 17 00:00:00 2001 From: Pawel Date: Wed, 17 Dec 2025 14:21:58 +0100 Subject: [PATCH 4/7] README changes --- demos/rerank/README.md | 134 +++++++++++++++++--------------- demos/rerank/compare_results.py | 2 +- 2 files changed, 71 insertions(+), 65 deletions(-) diff --git a/demos/rerank/README.md b/demos/rerank/README.md index 44ffe0c7c9..6c9550b69e 100644 --- a/demos/rerank/README.md +++ b/demos/rerank/README.md @@ -2,60 +2,56 @@ ## Prerequisites -**Model preparation**: Python 3.9 or higher with pip +**Model preparation and Model Server deployment**: Installed Docker Engine or OVMS binary package according to the [baremetal deployment guide](../../docs/deploying_server_baremetal.md) -**Model Server deployment**: Installed Docker Engine or OVMS binary package according to the [baremetal deployment guide](../../docs/deploying_server_baremetal.md) - -**(Optional) Client**: Python with pip +**(Optional) Client**: Python 3.9 or higher with pip ## Model preparation -Here, the original Pytorch LLM model and the tokenizer will be converted to IR format and optionally quantized. -That ensures faster initialization time, better performance and lower memory consumption. +You can pull rerank models directly from HuggingFace using OVMS. -Download export script, install it's dependencies and create directory for the models: -```console -curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py -pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt -mkdir models +:::{dropdown} **Preparation with Docker** +**CPU** +```bash +mkdir models +docker run -d --rm -u $(id -u):$(id -g) -v $(pwd)/models:/workspace openvino/model_server:latest --pull --source_model OpenVINO/bge-reranker-base-int8-ov --model_repository_path /workspace --task rerank +docker run --rm -u $(id -u):$(id -g) -v $(pwd)/models:/workspace openvino/model_server:latest --add_to_config --model_name OpenVINO/bge-reranker-base-int8-ov --model_repository_path /workspace --config_path /workspace/config.json ``` +**GPU** +```bash +mkdir models +docker run -d --rm -u $(id -u):$(id -g) -v $(pwd)/models:/workspace openvino/model_server:latest --pull --source_model OpenVINO/bge-reranker-base-int8-ov --model_repository_path /workspace --task rerank --target_device GPU +docker run -d --rm -u $(id -u):$(id -g) -v $(pwd)/models:/workspace openvino/model_server:latest --add_to_config --model_repository_path /workspace/models --config_path /workspace/config.json +``` +> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. 
-Run `export_model.py` script to download and quantize the model: +::: -**CPU** +:::{dropdown} **Preparation On Bare Metal** -```console -python export_model.py rerank_ov --source_model BAAI/bge-reranker-large --weight-format int8 --config_file_path models/config.json --model_repository_path models -``` +Assuming you have unpacked model server package, make sure to: -**GPU**: -```console -python export_model.py rerank_ov --source_model BAAI/bge-reranker-large --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models -``` -> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. +- **On Windows**: run `setupvars` script +- **On Linux**: set `LD_LIBRARY_PATH` and `PATH` environment variables -You should have a model folder like below: -``` -tree models -models -├── BAAI -│   └── bge-reranker-large -│ ├── config.json -│ ├── graph.pbtxt -│ ├── openvino_model.bin -│ |── openvino_model.xml -│ ├── openvino_tokenizer.bin -│ ├── openvino_tokenizer.xml -│ ├── sentencepiece.bpe.model -│ ├── special_tokens_map.json -│ ├── tokenizer_config.json -│ ├── tokenizer.json -└── config.json +as mentioned in [deployment guide](../../docs/deploying_server_baremetal.md), in every new shell that will start OpenVINO Model Server. +**CPU** +```bat +mkdir models +ovms --pull --source_model OpenVINO/bge-reranker-base-int8-ov --model_repository_path models --task rerank --target_device CPU +ovms --add_to_config --model_name OpenVINO/bge-reranker-base-int8-ov --model_repository_path models --config_path models/config.json +``` +**GPU** +```bat +mkdir models +ovms --pull --source_model OpenVINO/bge-reranker-base-int8-ov --model_repository_path models --task rerank --target_device GPU +ovms --add_to_config --model_name OpenVINO/bge-reranker-base-int8-ov --model_repository_path models --config_path models/config.json ``` -> **Note** The actual models support version management and can be automatically swapped to newer version when new model is uploaded in newer version folder. +> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. +::: ## Server Deployment :::{dropdown} **Deploying with Docker** @@ -76,15 +72,6 @@ docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /de :::{dropdown} **Deploying On Bare Metal** -Assuming you have unpacked model server package, make sure to: - -- **On Windows**: run `setupvars` script -- **On Linux**: set `LD_LIBRARY_PATH` and `PATH` environment variables - -as mentioned in [deployment guide](../../docs/deploying_server_baremetal.md), in every new shell that will start OpenVINO Model Server. - -Depending on how you prepared models in the first step of this demo, they are deployed to either CPU or GPU (it's defined in `config.json`). If you run on GPU make sure to have appropriate drivers installed, so the device is accessible for the model server. - ```bat ovms --rest_port 8000 --config_path ./models/config.json ``` @@ -94,7 +81,7 @@ ovms --rest_port 8000 --config_path ./models/config.json Readiness of the model can be reported with a simple curl command. 
```bash -curl -i http://localhost:8000/v2/models/BAAI%2Fbge-reranker-large/ready +curl -i http://localhost:8000/v2/models/OpenVINO%2Fbge-reranker-base-int8-ov/ready HTTP/1.1 200 OK content-length: 0 content-type: application/json; charset=utf-8 @@ -105,18 +92,18 @@ content-type: application/json; charset=utf-8 :::{dropdown} **Requesting rerank score with cURL** ```bash -curl http://localhost:8000/v3/rerank -H "Content-Type: application/json" -d "{ \"model\": \"BAAI/bge-reranker-large\", \"query\": \"welcome\", \"documents\":[\"good morning\",\"farewell\"]}" +curl http://localhost:8000/v3/rerank -H "Content-Type: application/json" -d "{ \"model\": \"OpenVINO/bge-reranker-base-int8-ov\", \"query\": \"welcome\", \"documents\":[\"good morning\",\"farewell\"]}" ``` ```json { "results": [ { "index": 0, - "relevance_score": 0.3886180520057678 + "relevance_score": 0.9410624504089355 }, { "index": 1, - "relevance_score": 0.0055549247190356255 + "relevance_score": 0.9381139278411865 } ] } @@ -131,7 +118,7 @@ pip3 install cohere echo ' import cohere client = cohere.Client(base_url="http://127.0.0.1:8000/v3", api_key="not_used") -responses = client.rerank(query="hello",documents=["welcome","farewell"], model="BAAI/bge-reranker-large") +responses = client.rerank(query="hello",documents=["welcome","farewell"], model="OpenVINO/bge-reranker-base-int8-ov") for response in responses.results: print(f"index {response.index}, relevance_score {response.relevance_score}")' > rerank_client.py @@ -139,14 +126,14 @@ python rerank_client.py ``` It will return response similar to: ``` -index 0, relevance_score 0.9968273043632507 -index 1, relevance_score 0.09138210117816925 +index 0, relevance_score 0.9388121962547302 +index 1, relevance_score 0.9375657439231873 ``` ::: :::{dropdown} **Requesting rerank score with model that requires template applying on query and documents** -tomaarsen/Qwen3-Reranker-0.6B-seq-cls is a copy of the Qwen3-Reranker-0.6B model (original model is not supported in OVMS) modified as a sequence classification model instead. It requires applying template on input, here is example client that does it: +OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov is a copy of the Qwen3-Reranker-0.6B model (original model is not supported in OVMS) modified as a sequence classification model instead. 
It requires applying template on input, here is example client that does it: ```bash pip3 install requests @@ -180,7 +167,7 @@ documents = [ response = requests.post("http://127.0.0.1:8000/v3/rerank", json={ - "model": "tomaarsen/Qwen3-Reranker-0.6B-seq-cls", + "model": "OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov", "query": query, "documents": documents, }).json() @@ -191,21 +178,40 @@ python rerank_client.py ``` It will return response similar to: ``` -{'results': [{'index': 0, 'relevance_score': 0.024518223479390144}, {'index': 1, 'relevance_score': 0.0026006349362432957}]} +{'results': [{'index': 0, 'relevance_score': 0.0216273982077837}, {'index': 1, 'relevance_score': 0.018804751336574554}]} ``` ::: ## Comparison with Hugging Faces +Download export script, install it's dependencies: +```console +curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py +pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt +``` + +Run `export_model.py` script to download and quantize the model: + +**CPU** +```console +python export_model.py rerank_ov --source_model BAAI/bge-reranker-large --weight-format int8 --config_file_path models/config.json --model_repository_path models +``` + +**GPU**: +```console +python export_model.py rerank_ov --source_model BAAI/bge-reranker-large --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models +``` +> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. + ```bash git clone https://github.com/openvinotoolkit/model_server python model_server/demos/rerank/compare_results.py --query "hello" --document "welcome" --document "farewell" --base_url http://127.0.0.1:8000/v3/ query hello documents ['welcome', 'farewell'] -HF Duration: 145.731 ms -OVMS Duration: 23.227 ms -HF reranking: [0.99640983 0.08154089] -OVMS reranking: [0.9968273 0.0913821] +HF Duration: 126.124 ms +OVMS Duration: 89.483 ms +HF reranking: [0.99640983 0.08154131] +OVMS reranking: [0.99646771 0.07699008] ``` ## Performance benchmarking @@ -266,7 +272,7 @@ The `tokenize` endpoint provides a simple API for tokenizing input text using th Example usage: ```console -curl http://localhost:8000/v3/tokenize -H "Content-Type: application/json" -d "{ \"model\": \"BAAI/bge-reranker-large\", \"text\": \"hello world\" }" +curl http://localhost:8000/v3/tokenize -H "Content-Type: application/json" -d "{ \"model\": \"OpenVINO/bge-reranker-base-int8-ov\", \"text\": \"hello world\" }" ``` Response: ```json @@ -282,7 +288,7 @@ It's possible to use additional parameters: Example usage: ```console -curl http://localhost:8000/v3/tokenize -H "Content-Type: application/json" -d "{ \"model\": \"BAAI/bge-reranker-large\", \"text\": \"hello world\", \"max_length\": 10, \"pad_to_max_length\": true, \"padding_side\": \"left\"}" +curl http://localhost:8000/v3/tokenize -H "Content-Type: application/json" -d "{ \"model\": \"OpenVINO/bge-reranker-base-int8-ov\", \"text\": \"hello world\", \"max_length\": 10, \"pad_to_max_length\": true, \"padding_side\": \"left\"}" ``` Response: diff --git a/demos/rerank/compare_results.py b/demos/rerank/compare_results.py index 22a5ba54dd..4f0e682a72 100644 --- a/demos/rerank/compare_results.py +++ b/demos/rerank/compare_results.py @@ -25,7 +25,7 @@ parser = 
argparse.ArgumentParser(description='Compare rerank responses from HF transformers OVMS') parser.add_argument('--base_url', required=False, default='http://localhost:8000/v3/', help='Specify url to embeddings endpoint. default:http://localhost:8000/v3', dest='base_url') -parser.add_argument('--model_name', default='BAAI/bge-reranker-large', help='Model name to query. default: Alibaba-NLP/gte-large-en-v1.5', +parser.add_argument('--model_name', default='BAAI/bge-reranker-large', help='Model name to query. default: BAAI/bge-reranker-large', dest='model_name') parser.add_argument('--query', default='', help='Query string to rerank.', dest='query') From 1ae78bee538558560f4ef1b64808d9f8cb6aee54 Mon Sep 17 00:00:00 2001 From: Pawel Date: Thu, 18 Dec 2025 08:38:51 +0100 Subject: [PATCH 5/7] clang format --- src/rerank/rerank_calculator_ov.cc | 2 +- src/test/reranknode_test.cpp | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/rerank/rerank_calculator_ov.cc b/src/rerank/rerank_calculator_ov.cc index c911c26532..48036362e8 100644 --- a/src/rerank/rerank_calculator_ov.cc +++ b/src/rerank/rerank_calculator_ov.cc @@ -291,7 +291,7 @@ class RerankCalculatorOV : public CalculatorBase { SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Request body: {}", payload.body); SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Request uri: {}", payload.uri); - if(TokenizeParser::isTokenizeEndpoint(payload.uri)) { + if (TokenizeParser::isTokenizeEndpoint(payload.uri)) { TokenizeRequest tokenizeRequest; absl::Status status = TokenizeParser::parseTokenizeRequest(*payload.parsedJson, tokenizeRequest); tokenizeRequest.parameters["add_special_tokens"] = false; // Rerank model tokenizer should not add special tokens diff --git a/src/test/reranknode_test.cpp b/src/test/reranknode_test.cpp index 69047c733e..e3f8dea020 100644 --- a/src/test/reranknode_test.cpp +++ b/src/test/reranknode_test.cpp @@ -439,7 +439,7 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositive) { "text": "hello world" } )"; - std::vector expectedTokens = {33600,31,8999}; + std::vector expectedTokens = {33600, 31, 8999}; ASSERT_EQ( handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), ovms::StatusCode::OK); @@ -475,7 +475,7 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositiveMaxLenParam) { "max_length": 3 } )"; - std::vector expectedTokens = {33600,31,8999}; + std::vector expectedTokens = {33600, 31, 8999}; ASSERT_EQ( handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), ovms::StatusCode::OK); @@ -492,7 +492,7 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositivePadToMaxLenParam) { } )"; std::vector expectedTokens(97, 1); - expectedTokens.insert(expectedTokens.begin(), {33600,31,8999}); + expectedTokens.insert(expectedTokens.begin(), {33600, 31, 8999}); ASSERT_EQ( handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), ovms::StatusCode::OK); @@ -510,7 +510,7 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositivePaddingSideLeft) { } )"; std::vector expectedTokens(97, 1); - expectedTokens.insert(expectedTokens.end(), {33600,31,8999}); + expectedTokens.insert(expectedTokens.end(), {33600, 31, 8999}); ASSERT_EQ( handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), ovms::StatusCode::OK); @@ -528,7 +528,7 @@ TEST_F(RerankTokenizeHttpTest, 
tokenizePositivePaddingSideRight) { } )"; std::vector expectedTokens(97, 1); - expectedTokens.insert(expectedTokens.begin(), {33600,31,8999}); + expectedTokens.insert(expectedTokens.begin(), {33600, 31, 8999}); ASSERT_EQ( handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), ovms::StatusCode::OK); @@ -557,7 +557,7 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositiveMaxLengthIgnored) { } )"; std::vector expectedTokens(510, 1); - expectedTokens.insert(expectedTokens.begin(), {33600,31,8999}); + expectedTokens.insert(expectedTokens.begin(), {33600, 31, 8999}); ASSERT_EQ(handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser), ovms::StatusCode::OK); AssertTokenizationResult(response, expectedTokens); @@ -574,7 +574,7 @@ TEST_F(RerankTokenizeHttpTest, tokenizePositiveBatch) { std::vector> expectedTokens = { {33600, 31}, {33600, 31, 8999}, - {33600,31,33600,31,33600,31,8999}}; + {33600, 31, 33600, 31, 33600, 31, 8999}}; rapidjson::Document d; rapidjson::ParseResult ok = d.Parse(response.c_str()); ASSERT_EQ(ok.Code(), 0); @@ -597,7 +597,7 @@ TEST_F(RerankTokenizeHttpTest, tokenizeBatchWithPadToMaxLen) { std::vector> expectedTokens = { {33600, 31, 1, 1, 1, 1}, {33600, 31, 8999, 1, 1, 1}, - {33600,31,33600,31,33600,31}}; + {33600, 31, 33600, 31, 33600, 31}}; rapidjson::Document d; rapidjson::ParseResult ok = d.Parse(response.c_str()); ASSERT_EQ(ok.Code(), 0); From 5216d018a8aaf860ca8cec6a0db5858cc9f324b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Thu, 18 Dec 2025 11:20:48 +0100 Subject: [PATCH 6/7] Update README.md --- demos/rerank/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/rerank/README.md b/demos/rerank/README.md index 6c9550b69e..e8e22cf3b6 100644 --- a/demos/rerank/README.md +++ b/demos/rerank/README.md @@ -266,7 +266,7 @@ tomaarsen/Qwen3-Reranker-0.6B-seq-cls Check [RAG demo](../continuous_batching/rag/README.md) which employs `rerank` endpoint together with `chat/completions` and `embeddings`. -# Usage of tokenize endpoint (release 2025.6 or weekly) +# Usage of tokenize endpoint (release 2026.0 or weekly) The `tokenize` endpoint provides a simple API for tokenizing input text using the same tokenizer as the deployed rerank model. This allows you to see how your text will be split into tokens before feature extraction or inference. The endpoint accepts a string or list of strings and returns the corresponding token IDs. 
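Because the endpoint accepts either a single string or a list of strings, batch tokenization can be sketched in Python as well (an illustrative assumption, not part of the demo: it presumes the `requests` package, a server on `localhost:8000`, and the model name used in the tokenize curl examples; the per-string token IDs mirror the batch tests in `reranknode_test.cpp`):
```python
import requests

# Hedged sketch: "text" may be a list of strings, in which case the
# response carries one token-ID list per input string.
payload = {
    "model": "OpenVINO/bge-reranker-base-int8-ov",
    "text": ["hello", "hello world"],
}
resp = requests.post("http://localhost:8000/v3/tokenize", json=payload, timeout=30)
resp.raise_for_status()
for text, tokens in zip(payload["text"], resp.json()["tokens"]):
    print(f"{text!r} -> {tokens}")  # e.g. 'hello' -> [33600, 31]
```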
From 6e64072372ed16de83acbac2d48452ed14eafc66 Mon Sep 17 00:00:00 2001 From: Pawel Date: Thu, 18 Dec 2025 12:11:22 +0100 Subject: [PATCH 7/7] reverting changed model --- demos/rerank/README.md | 124 ++++++++++++++++++++--------------------- 1 file changed, 59 insertions(+), 65 deletions(-) diff --git a/demos/rerank/README.md b/demos/rerank/README.md index e8e22cf3b6..189c3cc9e8 100644 --- a/demos/rerank/README.md +++ b/demos/rerank/README.md @@ -2,56 +2,60 @@ ## Prerequisites -**Model preparation and Model Server deployment**: Installed Docker Engine or OVMS binary package according to the [baremetal deployment guide](../../docs/deploying_server_baremetal.md) +**Model preparation**: Python 3.9 or higher with pip -**(Optional) Client**: Python 3.9 or higher with pip +**Model Server deployment**: Installed Docker Engine or OVMS binary package according to the [baremetal deployment guide](../../docs/deploying_server_baremetal.md) + +**(Optional) Client**: Python with pip ## Model preparation -You can pull rerank models directly from HuggingFace using OVMS. +Here, the original Pytorch LLM model and the tokenizer will be converted to IR format and optionally quantized. +That ensures faster initialization time, better performance and lower memory consumption. -:::{dropdown} **Preparation with Docker** -**CPU** -```bash -mkdir models -docker run -d --rm -u $(id -u):$(id -g) -v $(pwd)/models:/workspace openvino/model_server:latest --pull --source_model OpenVINO/bge-reranker-base-int8-ov --model_repository_path /workspace --task rerank -docker run --rm -u $(id -u):$(id -g) -v $(pwd)/models:/workspace openvino/model_server:latest --add_to_config --model_name OpenVINO/bge-reranker-base-int8-ov --model_repository_path /workspace --config_path /workspace/config.json -``` -**GPU** -```bash -mkdir models -docker run -d --rm -u $(id -u):$(id -g) -v $(pwd)/models:/workspace openvino/model_server:latest --pull --source_model OpenVINO/bge-reranker-base-int8-ov --model_repository_path /workspace --task rerank --target_device GPU -docker run -d --rm -u $(id -u):$(id -g) -v $(pwd)/models:/workspace openvino/model_server:latest --add_to_config --model_repository_path /workspace/models --config_path /workspace/config.json +Download export script, install it's dependencies and create directory for the models: +```console +curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py +pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt +mkdir models ``` -> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. - -::: -:::{dropdown} **Preparation On Bare Metal** +Run `export_model.py` script to download and quantize the model: -Assuming you have unpacked model server package, make sure to: +**CPU** -- **On Windows**: run `setupvars` script -- **On Linux**: set `LD_LIBRARY_PATH` and `PATH` environment variables +```console +python export_model.py rerank_ov --source_model BAAI/bge-reranker-large --weight-format int8 --config_file_path models/config.json --model_repository_path models +``` -as mentioned in [deployment guide](../../docs/deploying_server_baremetal.md), in every new shell that will start OpenVINO Model Server. 
+**GPU**: +```console +python export_model.py rerank_ov --source_model BAAI/bge-reranker-large --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models +``` +> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. -**CPU** -```bat -mkdir models -ovms --pull --source_model OpenVINO/bge-reranker-base-int8-ov --model_repository_path models --task rerank --target_device CPU -ovms --add_to_config --model_name OpenVINO/bge-reranker-base-int8-ov --model_repository_path models --config_path models/config.json +You should have a model folder like below: ``` -**GPU** -```bat -mkdir models -ovms --pull --source_model OpenVINO/bge-reranker-base-int8-ov --model_repository_path models --task rerank --target_device GPU -ovms --add_to_config --model_name OpenVINO/bge-reranker-base-int8-ov --model_repository_path models --config_path models/config.json +tree models +models +├── BAAI +│   └── bge-reranker-large +│ ├── config.json +│ ├── graph.pbtxt +│ ├── openvino_model.bin +│ |── openvino_model.xml +│ ├── openvino_tokenizer.bin +│ ├── openvino_tokenizer.xml +│ ├── sentencepiece.bpe.model +│ ├── special_tokens_map.json +│ ├── tokenizer_config.json +│ ├── tokenizer.json +└── config.json + ``` +> **Note** The actual models support version management and can be automatically swapped to newer version when new model is uploaded in newer version folder. -> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. -::: ## Server Deployment :::{dropdown} **Deploying with Docker** @@ -72,6 +76,15 @@ docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /de :::{dropdown} **Deploying On Bare Metal** +Assuming you have unpacked model server package, make sure to: + +- **On Windows**: run `setupvars` script +- **On Linux**: set `LD_LIBRARY_PATH` and `PATH` environment variables + +as mentioned in [deployment guide](../../docs/deploying_server_baremetal.md), in every new shell that will start OpenVINO Model Server. + +Depending on how you prepared models in the first step of this demo, they are deployed to either CPU or GPU (it's defined in `config.json`). If you run on GPU make sure to have appropriate drivers installed, so the device is accessible for the model server. + ```bat ovms --rest_port 8000 --config_path ./models/config.json ``` @@ -81,7 +94,7 @@ ovms --rest_port 8000 --config_path ./models/config.json Readiness of the model can be reported with a simple curl command. 
```bash -curl -i http://localhost:8000/v2/models/OpenVINO%2Fbge-reranker-base-int8-ov/ready +curl -i http://localhost:8000/v2/models/BAAI%2Fbge-reranker-large/ready HTTP/1.1 200 OK content-length: 0 content-type: application/json; charset=utf-8 @@ -92,18 +105,18 @@ content-type: application/json; charset=utf-8 :::{dropdown} **Requesting rerank score with cURL** ```bash -curl http://localhost:8000/v3/rerank -H "Content-Type: application/json" -d "{ \"model\": \"OpenVINO/bge-reranker-base-int8-ov\", \"query\": \"welcome\", \"documents\":[\"good morning\",\"farewell\"]}" +curl http://localhost:8000/v3/rerank -H "Content-Type: application/json" -d "{ \"model\": \"BAAI/bge-reranker-large\", \"query\": \"welcome\", \"documents\":[\"good morning\",\"farewell\"]}" ``` ```json { "results": [ { "index": 0, - "relevance_score": 0.9410624504089355 + "relevance_score": 0.3886180520057678 }, { "index": 1, - "relevance_score": 0.9381139278411865 + "relevance_score": 0.0055549247190356255 } ] } @@ -118,7 +131,7 @@ pip3 install cohere echo ' import cohere client = cohere.Client(base_url="http://127.0.0.1:8000/v3", api_key="not_used") -responses = client.rerank(query="hello",documents=["welcome","farewell"], model="OpenVINO/bge-reranker-base-int8-ov") +responses = client.rerank(query="hello",documents=["welcome","farewell"], model="BAAI/bge-reranker-large") for response in responses.results: print(f"index {response.index}, relevance_score {response.relevance_score}")' > rerank_client.py @@ -126,8 +139,8 @@ python rerank_client.py ``` It will return response similar to: ``` -index 0, relevance_score 0.9388121962547302 -index 1, relevance_score 0.9375657439231873 +index 0, relevance_score 0.9968273043632507 +index 1, relevance_score 0.09138210117816925 ``` ::: @@ -184,34 +197,15 @@ It will return response similar to: ## Comparison with Hugging Faces -Download export script, install it's dependencies: -```console -curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py -pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt -``` - -Run `export_model.py` script to download and quantize the model: - -**CPU** -```console -python export_model.py rerank_ov --source_model BAAI/bge-reranker-large --weight-format int8 --config_file_path models/config.json --model_repository_path models -``` - -**GPU**: -```console -python export_model.py rerank_ov --source_model BAAI/bge-reranker-large --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models -``` -> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. - ```bash git clone https://github.com/openvinotoolkit/model_server python model_server/demos/rerank/compare_results.py --query "hello" --document "welcome" --document "farewell" --base_url http://127.0.0.1:8000/v3/ query hello documents ['welcome', 'farewell'] -HF Duration: 126.124 ms -OVMS Duration: 89.483 ms -HF reranking: [0.99640983 0.08154131] -OVMS reranking: [0.99646771 0.07699008] +HF Duration: 145.731 ms +OVMS Duration: 23.227 ms +HF reranking: [0.99640983 0.08154089] +OVMS reranking: [0.9968273 0.0913821] ``` ## Performance benchmarking