diff --git a/Dockerfile.redhat b/Dockerfile.redhat
index d867be5638..928cf83345 100644
--- a/Dockerfile.redhat
+++ b/Dockerfile.redhat
@@ -111,8 +111,8 @@ ARG VERBOSE_LOGS=OFF
 ARG LTO_ENABLE=OFF

 # hadolint ignore=DL3041
-RUN dnf install -y https://rpmfind.net/linux/almalinux/8.10/PowerTools/x86_64/os/Packages/opencl-headers-2.2-1.20180306gite986688.el8.noarch.rpm && \
-    dnf update -d6 -y && dnf install -d6 -y \
+RUN dnf install -y -d6 \
+    https://vault.centos.org/centos/8-stream/PowerTools/x86_64/os/Packages/opencl-headers-2.2-1.20180306gite986688.el8.noarch.rpm \
     gdb \
     java-11-openjdk-devel \
     tzdata-java \
@@ -221,22 +221,16 @@ WORKDIR /openvino_tokenizers/
 ARG ov_tokenizers_branch=85be884a69f10270703f81f970a5ee596a4c8df7
 ARG ov_tokenizers_org=openvinotoolkit
 ARG SDL_OPS="-fpic -O2 -U_FORTIFY_SOURCE -fstack-protector -fno-omit-frame-pointer -D_FORTIFY_SOURCE=1 -fno-strict-overflow -Wall -Wno-unknown-pragmas -Wno-error=sign-compare -fno-delete-null-pointer-checks -fwrapv -fstack-clash-protection -Wformat -Wformat-security -Werror=format-security -s -D_GLIBCXX_USE_CXX11_ABI=1 -Wno-error=deprecated-declarations -Wuninitialized"
-# hadolint ignore=DL3003
-RUN git clone https://github.com/$ov_tokenizers_org/openvino_tokenizers.git /openvino_tokenizers && cd /openvino_tokenizers && git checkout $ov_tokenizers_branch && git submodule update --init --recursive
-RUN if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then \
-    mkdir -p /opt/intel/openvino/python/openvino_tokenizers/lib ; \
-    cp -r python/* /opt/intel/openvino/python/ ; \
-    mkdir -p /opt/intel/openvino/python/openvino_tokenizers-2025.4.dist-info ; \
-    echo $'Metadata-Version: 1.0\nName: openvino-tokenizers\nVersion: 2025.4\nRequires-Python: >=3.9\nRequires-Dist: openvino~=2025.4.1' > /opt/intel/openvino/python/openvino_tokenizers-2025.4.dist-info/METADATA ; \
-    ln -s /ovms/lib/libopenvino_tokenizers.so /opt/intel/openvino/python/openvino_tokenizers/lib/libopenvino_tokenizers.so ; \
-    fi

 # hadolint ignore=DL3003
-RUN if [ "$ov_use_binary" == "0" ]; then true ; else exit 0 ; fi ; \
-    cd /openvino_tokenizers && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE="${VERBOSE_LOGS}" -DCMAKE_CXX_FLAGS=" ${SDL_OPS} ${LTO_CXX_FLAGS} " -DCMAKE_SHARED_LINKER_FLAGS="${LTO_LD_FLAGS}" -S ./ -B ./build/ && cmake --build ./build/ --parallel $JOBS && cp /openvino_tokenizers/build/src/lib*.so /opt/intel/openvino/runtime/lib/intel64/ ; \
-    # Install the openvino_tokenizers python bindings and copy to OpenVINO location
-    if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then \
-    cp build/python/* /opt/intel/openvino/python/openvino_tokenizers/ ; \
+RUN if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then \
+# python tokenizers built always from source because it is not in binary package
+    git clone https://github.com/$ov_tokenizers_org/openvino_tokenizers.git /openvino_tokenizers && cd /openvino_tokenizers && git checkout $ov_tokenizers_branch && git submodule update --init --recursive && \
+    sed -i '/openvino~=/d' /openvino_tokenizers/pyproject.toml && \
+    sed -i '/requires-python/d' /openvino_tokenizers/pyproject.toml && \
+    cd /openvino_tokenizers && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE="${VERBOSE_LOGS}" -DCMAKE_CXX_FLAGS=" ${SDL_OPS} ${LTO_CXX_FLAGS} " -DCMAKE_SHARED_LINKER_FLAGS="${LTO_LD_FLAGS}" -S ./ -B ./build/ && cmake --build ./build/ --parallel $JOBS && cp /openvino_tokenizers/build/src/lib*.so /opt/intel/openvino/runtime/lib/intel64/ && \
+    python3 -m pip wheel -v --no-deps --wheel-dir wheel /openvino_tokenizers && \
+    python3 -m pip install --no-cache-dir "$(find wheel -name 'openvino_tokenizers*.whl')" --target /opt/intel/openvino/python ; \
 fi

 WORKDIR /openvino_genai/
@@ -289,8 +283,8 @@ WORKDIR /ovms/src/example/SampleCpuExtension/
 RUN make

 RUN if ! [[ $debug_bazel_flags == *"py_off"* ]]; then true ; else exit 0 ; fi ; \
-    mkdir -p /opt/intel/openvino/python/openvino-4.dist-info && \
-    echo $'Metadata-Version: 1.0\nName: openvino\nVersion: 2025.4' > /opt/intel/openvino/python/openvino-4.dist-info/METADATA
+    mkdir -p /opt/intel/openvino/python/openvino-2025.4.1.dist-info && \
+    echo $'Metadata-Version: 1.0\nName: openvino\nVersion: 2025.4.1' > /opt/intel/openvino/python/openvino-2025.4.1.dist-info/METADATA

 ENV PYTHONPATH=/opt/intel/openvino/python:/ovms/bazel-bin/src/python/binding
 WORKDIR /patchelf
diff --git a/ci/build_test_OnCommit.groovy b/ci/build_test_OnCommit.groovy
index dedc01cdf4..b35d8c5c8d 100644
--- a/ci/build_test_OnCommit.groovy
+++ b/ci/build_test_OnCommit.groovy
@@ -166,7 +166,7 @@ pipeline {
                 label "${agent_name_linux}"
             }
             steps {
-                sh "make release_image RUN_TESTS=0 OV_USE_BINARY=0 BASE_OS=redhat OVMS_CPP_IMAGE_TAG=${shortCommit} BUILD_IMAGE=openvino/model_server-build:${shortCommit}"
+                sh "make release_image RUN_TESTS=1 OV_USE_BINARY=0 BASE_OS=redhat OVMS_CPP_IMAGE_TAG=${shortCommit} BUILD_IMAGE=openvino/model_server-build:${shortCommit}"
                 sh "make run_lib_files_test BASE_OS=redhat OVMS_CPP_IMAGE_TAG=${shortCommit}"
                 script {
                     dir ('internal_tests'){
diff --git a/create_package.sh b/create_package.sh
index 75a692506c..39df2d8ff7 100755
--- a/create_package.sh
+++ b/create_package.sh
@@ -67,10 +67,10 @@ fi
 # Add Python bindings for pyovms, openvino, openvino_tokenizers and openvino_genai, so they are all available for OVMS Python servables
 if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then cp -r /opt/intel/openvino/python /ovms_release/lib/python ; fi
 if ! [[ $debug_bazel_flags == *"_py_off"* ]] && [ "$FUZZER_BUILD" == "0" ]; then mv /ovms_release/lib/pyovms.so /ovms_release/lib/python ; fi
-if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then echo $'#!/bin/bash\npython3 -m openvino_tokenizers.cli "$@"' > /ovms_release/bin/convert_tokenizer ; \
+if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then mv /ovms_release/lib/python/bin/convert_tokenizer /ovms_release/bin/convert_tokenizer ; \
    chmod +x /ovms_release/bin/convert_tokenizer ; fi
-if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then mkdir -p /ovms_release/lib/python/openvino_genai-2025.4.dist-info ; \
-    echo $'Metadata-Version: 1.0\nName: openvino-genai\nVersion: 2025.4\nRequires-Python: >=3.9\nRequires-Dist: openvino-genai~=2025.4.1' > /ovms_release/lib/python/openvino_genai-2025.4.dist-info/METADATA; fi
+if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then mkdir -p /ovms_release/lib/python/openvino_genai-2025.4.1.dist-info ; \
+    echo $'Metadata-Version: 1.0\nName: openvino-genai\nVersion: 2025.4.1\nRequires-Python: >=3.9\nRequires-Dist: openvino-genai~=2025.4.1' > /ovms_release/lib/python/openvino_genai-2025.4.1.dist-info/METADATA; fi

 if [ -f /opt/intel/openvino/runtime/lib/intel64/plugins.xml ]; then cp /opt/intel/openvino/runtime/lib/intel64/plugins.xml /ovms_release/lib/ ; fi
 find /opt/intel/openvino/runtime/lib/intel64/ -iname '*.mvcmd*' -exec cp -v {} /ovms_release/lib/ \;
diff --git a/demos/common/export_models/requirements.txt b/demos/common/export_models/requirements.txt
index 4d8be6ab95..f71ecff2e7 100644
--- a/demos/common/export_models/requirements.txt
+++ b/demos/common/export_models/requirements.txt
@@ -1,12 +1,12 @@
 --extra-index-url "https://download.pytorch.org/whl/cpu"
-optimum-intel@git+https://github.com/huggingface/optimum-intel.git@aed07975d817c124fd5d45375ac131d4a068b557
+optimum-intel@git+https://github.com/huggingface/optimum-intel.git@a484bc6ee1175bbe8868bb53d2c42ab4c4802aa6
 accelerate==1.11.0
 diffusers==0.35.2 # for image generation
 einops==0.8.1
 nncf==2.19.0
 numpy==2.2.6
-openvino-tokenizers==2025.4.0.0
-openvino==2025.4.0
+openvino-tokenizers==2025.4.1.0
+openvino==2025.4.1
 #optimum is in dependency list of optimum-intel
 pillow==12.0.0
 sentence_transformers==5.1.2
diff --git a/demos/python_demos/Dockerfile.redhat b/demos/python_demos/Dockerfile.redhat
index 31b79b0ecb..cacd66da79 100644
--- a/demos/python_demos/Dockerfile.redhat
+++ b/demos/python_demos/Dockerfile.redhat
@@ -21,6 +21,6 @@ ENV PYTHONPATH=/ovms/lib/python
 RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; else export DNF_TOOL=microdnf ; fi ; \
     $DNF_TOOL install -y python3-pip git
 COPY requirements.txt .
-RUN BUILD_CUDA_EXT=0 pip3 install -r requirements.txt
+RUN pip3 install -r requirements.txt
 USER ovms
 ENTRYPOINT [ "/ovms/bin/ovms" ]
diff --git a/demos/python_demos/requirements.txt b/demos/python_demos/requirements.txt
index c6124f16d9..8a12d95d3b 100644
--- a/demos/python_demos/requirements.txt
+++ b/demos/python_demos/requirements.txt
@@ -1,5 +1,5 @@
 --extra-index-url "https://download.pytorch.org/whl/cpu"
-optimum-intel@git+https://github.com/huggingface/optimum-intel.git
+optimum-intel@git+https://github.com/huggingface/optimum-intel.git@a484bc6ee1175bbe8868bb53d2c42ab4c4802aa6
 pillow==10.3.0
 tritonclient[grpc]==2.57.0 # Required to use batch string serialization/deserialization (4byte length prepend)
 numpy<2.0
@@ -7,7 +7,7 @@ huggingface_hub==0.32.0
 nncf>=2.11.0
 sentence_transformers
 sentencepiece==0.2.0
-transformers<=4.53
+transformers<4.56
 einops
 torchvision
 timm==1.0.15
diff --git a/docs/deploying_server_baremetal.md b/docs/deploying_server_baremetal.md
index 7a444ac59b..5f0e301250 100644
--- a/docs/deploying_server_baremetal.md
+++ b/docs/deploying_server_baremetal.md
@@ -15,12 +15,12 @@ You can download model server package in two configurations. One with Python sup
 :sync: ubuntu-22-04
 Download precompiled package (without python):
 ```{code} sh
-wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.4/ovms_ubuntu22.tar.gz
+wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.4.1/ovms_ubuntu22.tar.gz
 tar -xzvf ovms_ubuntu22.tar.gz
 ```
 or precompiled package (with python):
 ```{code} sh
-wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.4/ovms_ubuntu22_python_on.tar.gz
+wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.4.1/ovms_ubuntu22_python_on.tar.gz
 tar -xzvf ovms_ubuntu22_python_on.tar.gz
 ```
 Install required libraries:
@@ -50,12 +50,12 @@ Model server version with Python is shipped with those packages and new installa
 :sync: ubuntu-24-04
 Download precompiled package (without python):
 ```{code} sh
-wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.4/ovms_ubuntu24.tar.gz
+wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.4.1/ovms_ubuntu24.tar.gz
 tar -xzvf ovms_ubuntu24.tar.gz
 ```
 or precompiled package (with python):
 ```{code} sh
-wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.4/ovms_ubuntu24_python_on.tar.gz
+wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.4.1/ovms_ubuntu24_python_on.tar.gz
 tar -xzvf ovms_ubuntu24_python_on.tar.gz
 ```
 Install required libraries:
@@ -85,12 +85,12 @@ Model server version with Python is shipped with those packages and new installa
 :sync: rhel-9.6
 Download precompiled package (without python):
 ```{code} sh
-wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.4/ovms_redhat.tar.gz
+wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.4.1/ovms_redhat.tar.gz
 tar -xzvf ovms_redhat.tar.gz
 ```
 or precompiled package (with python):
 ```{code} sh
-wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.4/ovms_redhat_python_on.tar.gz
+wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.4.1/ovms_redhat_python_on.tar.gz
 tar -xzvf ovms_redhat_python_on.tar.gz
 ```
 Install required libraries:
@@ -124,14 +124,14 @@ Make sure you have [Microsoft Visual C++ Redistributable](https://aka.ms/vs/17/r

 Download and unpack model server archive for Windows(with python):
 ```bat
-curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2025.4/ovms_windows_python_on.zip -o ovms.zip
+curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2025.4.1/ovms_windows_python_on.zip -o ovms.zip
 tar -xf ovms.zip
 ```

 or archive without python:

 ```bat
-curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2025.4/ovms_windows_python_off.zip -o ovms.zip
+curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2025.4.1/ovms_windows_python_off.zip -o ovms.zip
 tar -xf ovms.zip
 ```

diff --git a/prepare_llm_models.sh b/prepare_llm_models.sh
index 017cfa48be..43ac64c1e8 100755
--- a/prepare_llm_models.sh
+++ b/prepare_llm_models.sh
@@ -14,13 +14,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+set -e

 if [ -z "$1" ]; then
     echo "Error: No directory specified."
     exit 1
 fi

-CB_MODEL="facebook/opt-125m"
+CB_MODEL="HuggingFaceTB/SmolLM2-360M-Instruct"
+FACEBOOK="facebook/opt-125m"
 TOKENIZER_FILE="openvino_tokenizer.bin"
 LEGACY_MODEL_FILE="1/model.bin"
 EMBEDDING_MODEL="thenlper/gte-small"
@@ -29,7 +31,7 @@ VLM_MODEL="OpenGVLab/InternVL2-1B"

 # Models for tools testing. Only tokenizers are downloaded.
 QWEN3_MODEL="Qwen/Qwen3-8B"
-LLAMA3_MODEL="meta-llama/Llama-3.1-8B-Instruct"
+LLAMA3_MODEL="unsloth/Llama-3.1-8B-Instruct"
 HERMES3_MODEL="NousResearch/Hermes-3-Llama-3.1-8B"
 PHI4_MODEL="microsoft/Phi-4-mini-instruct"
 MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
@@ -57,12 +59,10 @@ if [ "$(python3 -c 'import sys; print(sys.version_info[1])')" -le "8" ]; then ec
 echo "Downloading LLM testing models to directory $1"
 export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly"
 if [ "$2" = "docker" ]; then
-    sed -i '/openvino~=/d' /openvino_tokenizers/pyproject.toml
-    python3 -m pip wheel -v --no-deps --wheel-dir wheel /openvino_tokenizers
-    python3 -m pip install $(find wheel -name 'openvino_tokenizers*.whl')
-    python3 -m pip install "optimum-intel"@git+https://github.com/huggingface/optimum-intel.git nncf sentence_transformers==3.1.1
+    export PATH=$PATH:/opt/intel/openvino/python/bin
+    python3 -m pip install "optimum-intel"@git+https://github.com/huggingface/optimum-intel.git@75d6b7d3bc9544487e2111a610b59f8d62e0ef89 nncf sentence_transformers einops timm sentencepiece
 else
-    python3.10 -m venv .venv
+    python3 -m venv .venv
     . .venv/bin/activate
     pip3 install -U pip
     pip3 install -U -r demos/common/export_models/requirements.txt
@@ -79,6 +79,16 @@ if [ ! -f "$1/$CB_MODEL/$TOKENIZER_FILE" ]; then
     exit 1
 fi

+if [ -f "$1/$FACEBOOK/$TOKENIZER_FILE" ]; then
+    echo "Models file $1/$FACEBOOK/$TOKENIZER_FILE exists. Skipping downloading models."
+else
+    python3 demos/common/export_models/export_model.py text_generation --source_model "$FACEBOOK" --weight-format int8 --model_repository_path $1
+fi
+if [ ! -f "$1/$FACEBOOK/$TOKENIZER_FILE" ]; then
+    echo "[ERROR] Models file $1/$FACEBOOK/$TOKENIZER_FILE does not exist."
+    exit 1
+fi
+
 if [ -f "$1/$VLM_MODEL/$TOKENIZER_FILE" ]; then
     echo "Model file $1/$VLM_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
 else
diff --git a/run_unit_tests.sh b/run_unit_tests.sh
index 43ff00558c..a7786f4e08 100755
--- a/run_unit_tests.sh
+++ b/run_unit_tests.sh
@@ -35,6 +35,12 @@ ${debug_bazel_flags} \
 LD_LIBRARY_PATH=/opt/opencv/lib/:/opt/intel/openvino/runtime/lib/intel64/:/opt/intel/openvino/runtime/3rdparty/tbb/lib/
 PYTHONPATH=/opt/intel/openvino/python:/ovms/bazel-bin/src/python/binding

+# if https proxy is set in the environment and file .user.bazelrc doesn't exist yet, add proxy env for bazel test
+if [ -n "${HTTPS_PROXY}" ] && [ ! -f .user.bazelrc ] ; then
+    echo test:linux --test_env https_proxy=${HTTPS_PROXY} >> .user.bazelrc
+    echo test:linux --test_env http_proxy=${HTTP_PROXY} >> .user.bazelrc
+fi
+
 # Check if RUN_GPU_TESTS is set and add it to SHARED_OPTIONS
 if [ "$RUN_GPU_TESTS" == "1" ]; then
     if grep -q "ID=ubuntu" /etc/os-release; then
diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp
index 11e6b4c0b5..7cead86c94 100644
--- a/src/test/llm/llmnode_test.cpp
+++ b/src/test/llm/llmnode_test.cpp
@@ -99,7 +99,7 @@ class LLMFlowHttpTest : public ::testing::Test {
         plugin_config_t pluginConfig;
         // Setting precision to f32 fails on SPR hosts - to be investigated
         // JsonParser::parsePluginConfig("{\"INFERENCE_PRECISION_HINT\":\"f32\"}", pluginConfig);
-        cbPipe = std::make_shared<ov::genai::ContinuousBatchingPipeline>(getGenericFullPathForSrcTest("/ovms/src/test/llm_testing/facebook/opt-125m"), schedulerConfig, device, pluginConfig, tokenizerPluginConfig);
+        cbPipe = std::make_shared<ov::genai::ContinuousBatchingPipeline>(getGenericFullPathForSrcTest("/ovms/src/test/llm_testing/HuggingFaceTB/SmolLM2-360M-Instruct"), schedulerConfig, device, pluginConfig, tokenizerPluginConfig);
         llmExecutorWrapper = std::make_shared<LLMExecutorWrapper>(cbPipe);
     } catch (const std::exception& e) {
         SPDLOG_ERROR("Error during llm node initialization for models_path exception: {}", e.what());
@@ -598,6 +598,7 @@ TEST_P(LLMFlowHttpTestParameterized, unaryCompletionsJsonSpaceStopString) {
            "stream": false,
            "ignore_eos": false,
            "max_tokens": 1000,
+            "temperature": 0,
            "stop": " ",
            "include_stop_str_in_output": true,
            "prompt": " | | | "
@@ -1419,8 +1420,8 @@ TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsPromptTokensWithMaxToke
         GTEST_SKIP();
     }
     std::string prompt;
-    // creating prompt that will be tokenized to 2048 tokens when model max length is 2048
-    for (int i = 0; i < 2044; i++) {
+    // creating prompt that will be tokenized to 8189 tokens when model max length is 8192; 29 are tokens from chat template.
+    for (int i = 0; i < 8192 - 29 - 3; i++) {
         prompt += "hello ";
     }
     std::string requestBody = R"(
@@ -1429,7 +1430,7 @@ TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsPromptTokensWithMaxToke
        R"(",
            "stream": false,
            "seed" : 1,
-            "max_tokens" : 5,
+            "max_tokens" : 10,
            "messages": [
            {
                "role": "user",
@@ -1451,8 +1452,8 @@ TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsPromptTokensWithMaxComp
         GTEST_SKIP();
     }
     std::string prompt;
-    // creating prompt that will be tokenized to 2048 tokens when model max length is 2048
-    for (int i = 0; i < 2044; i++) {
+    // creating prompt that will be tokenized to 8189 tokens when model max length is 8192; 25 are tokens from chat template.
+    for (int i = 0; i < 8191 - 25 - 3; i++) {
         prompt += "hello ";
     }
     std::string requestBody = R"(
@@ -1461,7 +1462,7 @@ TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsPromptTokensWithMaxComp
        R"(",
            "stream": false,
            "seed" : 1,
-            "max_completion_tokens": 5,
+            "max_completion_tokens": 10,
            "messages": [
            {
                "role": "user",
@@ -1483,8 +1484,8 @@ TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsPromptTokensEqualToMaxM
         GTEST_SKIP();
     }
     std::string prompt;
-    // creating prompt that will be tokenized to 2048 tokens when model max length is 2048
-    for (int i = 0; i < 2048; i++) {
+    // creating prompt that will be tokenized to 8194 tokens when model max length is 8192.
+    for (int i = 0; i < 8192 - 29; i++) {
         prompt += "hello ";
     }
     std::string requestBody = R"(
@@ -1514,8 +1515,8 @@ TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsStoppedByMaxModelLength
         GTEST_SKIP();
     }
     std::string prompt;
-    // creating prompt that will be tokenized to 2044 tokens when model max length is 2048
-    for (int i = 0; i < 2044; i++) {
+    // creating prompt that will be tokenized to 8189 tokens when model max length is 8192; 29 are tokens from chat template.
+    for (int i = 0; i < 8192 - 29 - 3; i++) {
         prompt += "hello ";
     }
     std::string requestBody = R"(
diff --git a/src/test/llm/lm_cb_regular.pbtxt b/src/test/llm/lm_cb_regular.pbtxt
index 9d2343a142..5c7ea6b775 100644
--- a/src/test/llm/lm_cb_regular.pbtxt
+++ b/src/test/llm/lm_cb_regular.pbtxt
@@ -28,7 +28,7 @@ node {
     }
     node_options: {
         [type.googleapis.com/mediapipe.LLMCalculatorOptions]: {
-            models_path: "/ovms/src/test/llm_testing/facebook/opt-125m"
+            models_path: "/ovms/src/test/llm_testing/HuggingFaceTB/SmolLM2-360M-Instruct"
             cache_size: 1
         }
     }
diff --git a/src/test/llm/lm_legacy_regular.pbtxt b/src/test/llm/lm_legacy_regular.pbtxt
index 76da452c24..74816dbd41 100644
--- a/src/test/llm/lm_legacy_regular.pbtxt
+++ b/src/test/llm/lm_legacy_regular.pbtxt
@@ -28,7 +28,7 @@ node {
     }
     node_options: {
         [type.googleapis.com/mediapipe.LLMCalculatorOptions]: {
-            models_path: "/ovms/src/test/llm_testing/facebook/opt-125m"
+            models_path: "/ovms/src/test/llm_testing/HuggingFaceTB/SmolLM2-360M-Instruct"
             cache_size: 1
             pipeline_type: LM
         }
diff --git a/src/test/llm/output_parsers/llama3_output_parser_test.cpp b/src/test/llm/output_parsers/llama3_output_parser_test.cpp
index c1022aed47..a26da4703f 100644
--- a/src/test/llm/output_parsers/llama3_output_parser_test.cpp
+++ b/src/test/llm/output_parsers/llama3_output_parser_test.cpp
@@ -25,10 +25,10 @@
 using namespace ovms;

 #ifdef _WIN32
-const std::string tokenizerPath = getWindowsRepoRootPath() + "\\src\\test\\llm_testing\\meta-llama\\Llama-3.1-8B-Instruct";
+const std::string tokenizerPath = getWindowsRepoRootPath() + "\\src\\test\\llm_testing\\unsloth\\Llama-3.1-8B-Instruct";
 #else
 // Hardcoded for usage in docker container
-const std::string tokenizerPath = "/ovms/src/test/llm_testing/meta-llama/Llama-3.1-8B-Instruct";
+const std::string tokenizerPath = "/ovms/src/test/llm_testing/unsloth/Llama-3.1-8B-Instruct";
 #endif

 static const ovms::ToolsSchemas_t EMPTY_TOOLS_SCHEMA = {};  // not used for llama3
diff --git a/src/test/llm/output_parsers/mistral_output_parser_test.cpp b/src/test/llm/output_parsers/mistral_output_parser_test.cpp
index 891598b7c0..bdc0a6f887 100644
--- a/src/test/llm/output_parsers/mistral_output_parser_test.cpp
+++ b/src/test/llm/output_parsers/mistral_output_parser_test.cpp
@@ -143,7 +143,7 @@ TEST_F(MistralOutputParserTest, ParseToolCallOutputWithContentAndSingleToolCall)
     auto generatedTensor = mistralTokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
     std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
     ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true);
-    EXPECT_EQ(parsedOutput.content, "This is a content part and next will be a tool call.\n\n [{\"name\": \"example_tool\", \"arguments\": {\"arg1\": \"value1\", \"arg2\": 42}}]");
+    EXPECT_EQ(parsedOutput.content, "This is a content part and next will be a tool call.\n\n[{\"name\": \"example_tool\", \"arguments\": {\"arg1\": \"value1\", \"arg2\": 42}}]");
     EXPECT_EQ(parsedOutput.reasoning, "");
     ASSERT_EQ(parsedOutput.toolCalls.size(), 0);
 }
@@ -153,7 +153,7 @@ TEST_F(MistralOutputParserTest, ParseToolCallOutputWithContentOnBothSidesAndSing
     auto generatedTensor = mistralTokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
     std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
     ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true);
-    EXPECT_EQ(parsedOutput.content, "This is a content part and next will be a tool call.\n\n [{\"name\": \"example_tool\", \"arguments\": {\"arg1\": \"value1\", \"arg2\": 42}}] This is a content part after tool call.");
+    EXPECT_EQ(parsedOutput.content, "This is a content part and next will be a tool call.\n\n[{\"name\": \"example_tool\", \"arguments\": {\"arg1\": \"value1\", \"arg2\": 42}}] This is a content part after tool call.");
     EXPECT_EQ(parsedOutput.reasoning, "");
     ASSERT_EQ(parsedOutput.toolCalls.size(), 0);
 }
@@ -165,7 +165,7 @@ TEST_F(MistralOutputParserTest, ParseToolCallOutputWithMultipleToolCallsReturnsC
     std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
     ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true);
     // Same expected content as tokenizer does not add special tokens
-    EXPECT_EQ(parsedOutput.content, "[{\"name\": \"tool1\", \"arguments\": {\"a\": 1}}] \n\nThis is some content\n\n [{\"name\": \"tool2\", \"arguments\": {\"b\": 2}}]");
+    EXPECT_EQ(parsedOutput.content, "[{\"name\": \"tool1\", \"arguments\": {\"a\": 1}}] \n\nThis is some content\n\n[{\"name\": \"tool2\", \"arguments\": {\"b\": 2}}]");
     EXPECT_EQ(parsedOutput.reasoning, "");
     ASSERT_EQ(parsedOutput.toolCalls.size(), 0);
 }
diff --git a/src/test/pull_gguf_hf_model_test.cpp b/src/test/pull_gguf_hf_model_test.cpp
index e1c35f3fe2..267e1caf3b 100644
--- a/src/test/pull_gguf_hf_model_test.cpp
+++ b/src/test/pull_gguf_hf_model_test.cpp
@@ -57,7 +57,7 @@ class GGUFDownloaderPullHfModelWithServer : public TestWithTempDir {
     ovms::Server& server = ovms::Server::instance();
     std::unique_ptr<std::thread> t;

-    void SetUpServerForDownloadAndStartGGUF(std::string& ggufFile, std::string& sourceModel, std::string& downloadPath, std::string& task, int timeoutSeconds = 120) {
+    void SetUpServerForDownloadAndStartGGUF(std::string& ggufFile, std::string& sourceModel, std::string& downloadPath, std::string& task, int timeoutSeconds = 300) {
         ::SetUpServerForDownloadAndStartGGUF(this->t, this->server, ggufFile, sourceModel, downloadPath, task, timeoutSeconds);
     }
     void TearDown() {
@@ -465,7 +465,6 @@ TEST_P(GGUFDownloaderPullHfModelGGUFFilenameParameterizedNegative, NonMatchingPa
 }

 std::vector<std::tuple<std::string, std::string>> ggufPartsParams = {
-    std::make_tuple("qwen2.5-7b-instruct-q4_k_m-000001-of-00002.gguf", "https://modelscope.cn/"),
     std::make_tuple("qwen2.5-7b-instruct-q4_k_m-000001-of-00002.gguf", "https://huggingface.co/"),
     std::make_tuple("qwen2.5-7b-instruct-q4_k_m-00001-of-000002.gguf", "https://huggingface.co/"),
     std::make_tuple("qwen2.5-7b-instruct-q4_k_m-0001-of-00002.gguf", "https://huggingface.co/"),
diff --git a/windows_prepare_llm_models.bat b/windows_prepare_llm_models.bat
index 9d100bb00f..671b169375 100644
--- a/windows_prepare_llm_models.bat
+++ b/windows_prepare_llm_models.bat
@@ -31,13 +31,14 @@ IF /I EXIST c:\opt\llm_testing (
 set "EMBEDDING_MODEL=thenlper/gte-small"
 set "RERANK_MODEL=BAAI/bge-reranker-base"
 set "TEXT_GENERATION_MODEL=facebook/opt-125m"
+set "LLM_MODEL=HuggingFaceTB/SmolLM2-360M-Instruct"
 set "VLM_MODEL=OpenGVLab/InternVL2-1B"
"VLM_MODEL=OpenGVLab/InternVL2-1B" set "TOKENIZER_FILE=openvino_tokenizer.bin" set "LEGACY_MODEL_FILE=1\model.bin" :: Models for tools testing. Only tokenizers are downloaded. set "QWEN3_MODEL=Qwen/Qwen3-8B" -set "LLAMA3_MODEL=meta-llama/Llama-3.1-8B-Instruct" +set "LLAMA3_MODEL=unsloth/Llama-3.1-8B-Instruct" set "HERMES3_MODEL=NousResearch/Hermes-3-Llama-3.1-8B" set "PHI4_MODEL=microsoft/Phi-4-mini-instruct" set "MISTRAL_MODEL=mistralai/Mistral-7B-Instruct-v0.3" @@ -84,7 +85,19 @@ if exist "%~1\%TEXT_GENERATION_MODEL%\%TOKENIZER_FILE%" ( if not exist "%~1\%TEXT_GENERATION_MODEL%\%TOKENIZER_FILE%" ( echo Models file %~1\%TEXT_GENERATION_MODEL%\%TOKENIZER_FILE% does not exists. exit /b 1 -) +) + +if exist "%~1\%LLM_MODEL%\%TOKENIZER_FILE%" ( + echo Models file %~1\%LLM_MODEL%\%TOKENIZER_FILE% exists. Skipping downloading models. +) else ( + echo Downloading text generation model to %~1\%LLM_MODEL% directory. + python demos\common\export_models\export_model.py text_generation --source_model "%LLM_MODEL%" --weight-format int8 --model_repository_path %~1 + if !errorlevel! neq 0 exit /b !errorlevel! +) +if not exist "%~1\%LLM_MODEL%\%TOKENIZER_FILE%" ( + echo Models file %~1\%LLM_MODEL%\%TOKENIZER_FILE% does not exists. + exit /b 1 +) if exist "%~1\%EMBEDDING_MODEL%\ov\%TOKENIZER_FILE%" ( echo Models file %~1\%EMBEDDING_MODEL%\ov\%TOKENIZER_FILE% exists. Skipping downloading models.