From 5d5b25bbf8ca923ee027451e2e3658e1c45e5296 Mon Sep 17 00:00:00 2001 From: dkalinowski Date: Thu, 28 May 2026 16:33:08 +0200 Subject: [PATCH 1/3] Add Qwen3.6 to documentation (#4247) --- .../continuous_batching/agentic_ai/.gitignore | 1 + .../continuous_batching/agentic_ai/README.md | 70 +++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 demos/continuous_batching/agentic_ai/.gitignore diff --git a/demos/continuous_batching/agentic_ai/.gitignore b/demos/continuous_batching/agentic_ai/.gitignore new file mode 100644 index 0000000000..38d1f85679 --- /dev/null +++ b/demos/continuous_batching/agentic_ai/.gitignore @@ -0,0 +1 @@ +mcp_weather_server diff --git a/demos/continuous_batching/agentic_ai/README.md b/demos/continuous_batching/agentic_ai/README.md index 8427d7d3b8..1b76928aeb 100644 --- a/demos/continuous_batching/agentic_ai/README.md +++ b/demos/continuous_batching/agentic_ai/README.md @@ -123,6 +123,28 @@ Exemplary output: The current weather in Tokyo is Overcast with a temperature of 9.4°C (feels like 6.4°C), relative humidity at 42%, and dew point at -2.9°C. The wind is blowing from the northeast at 3.6 km/h with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa with 84% cloud cover. Visibility is 24.1 km. ``` ::: +:::{tab-item} Qwen3.6-35B-A3B +:sync: Qwen3.6-35B-A3B +Vision Language MoE model (35B total / 3B active parameters). Requires OpenVINO 2026.2 or newer and a GPU with sufficient memory to fit the INT4 weights. Tested on PantherLake iGPU with 32GB RAM with iGPU allocation increase and B70 dGPU. + +Pull and start OVMS: +```bat +ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3.6-35B-A3B-int4-ov --model_repository_path c:\models --reasoning_parser qwen3 --tool_parser qwen3coder --target_device GPU --task text_generation --cache_dir .cache --allowed_media_domains raw.githubusercontent.com +``` + +Use MCP server, with additional image of Gdańsk old town. 
VLM model deduces location and calls `get_weather` tool to summarize the weather conditions in the city. + +```{image} https://images.pexels.com/photos/20015887/pexels-photo-20015887.jpeg +:alt: poland +:width: 360px +``` + +> **Note**: Image source: [Link](https://images.pexels.com/photos/20015887/pexels-photo-20015887.jpeg) + +```bat +python openai_agent.py --query "What is the current weather in location depicted in the image?" --image https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2026/1/demos/continuous_batching/agentic_ai/photo.jpeg --model OpenVINO/Qwen3.6-35B-A3B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather +``` +::: :::{tab-item} gpt-oss-20b :sync: gpt-oss-20b Pull and start OVMS: @@ -283,6 +305,30 @@ Exemplary output: The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is 42%, and the dew point is -2.9°C. Wind is blowing from the northeast at 3.6 km/h, with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa, and there is 84% cloud cover. Visibility is 24.1 km. ``` ::: +:::{tab-item} Qwen3.6-35B-A3B +:sync: Qwen3.6-35B-A3B +Vision Language MoE model (35B total / 3B active parameters). Requires OpenVINO 2026.2 or newer and enough host memory to fit the INT4 weights. Tested on PantherLake iGPU with 32GB RAM with iGPU allocation increase and B70 dGPU. + +Pull and start OVMS: +```bash +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +--rest_port 8000 --source_model OpenVINO/Qwen3.6-35B-A3B-int4-ov --model_repository_path /models --reasoning_parser qwen3 --tool_parser qwen3coder --task text_generation --allowed_media_domains raw.githubusercontent.com +``` + +Use MCP server, with additional image of Gdańsk old town. 
VLM model deduces location and calls `get_weather` tool to summarize the weather conditions in the city. + +```{image} https://images.pexels.com/photos/20015887/pexels-photo-20015887.jpeg +:alt: poland +:width: 360px +``` + +> **Note**: Image source: [Link](https://images.pexels.com/photos/20015887/pexels-photo-20015887.jpeg) + +```bash +python openai_agent.py --query "What is the current weather in location depicted in the image?" --image https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2026/1/demos/continuous_batching/agentic_ai/photo.jpeg --model OpenVINO/Qwen3.6-35B-A3B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather +``` +::: :::{tab-item} gpt-oss-20b :sync: gpt-oss-20b Pull and start OVMS: @@ -408,6 +454,30 @@ Exemplary output: The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is 42%, and the dew point is -2.9°C. Wind is blowing from the northeast at 3.6 km/h, with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa, and there is 84% cloud cover. Visibility is 24.1 km. ``` ::: +:::{tab-item} Qwen3.6-35B-A3B +:sync: Qwen3.6-35B-A3B +Vision Language MoE model (35B total / 3B active parameters). Requires OpenVINO 2026.2 or newer and a GPU with sufficient memory to fit the INT4 weights. Tested on PantherLake iGPU with 32GB RAM with iGPU allocation increase and B70 dGPU. 
+ +Pull and start OVMS: +```bash +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +--rest_port 8000 --source_model OpenVINO/Qwen3.6-35B-A3B-int4-ov --model_repository_path /models --reasoning_parser qwen3 --tool_parser qwen3coder --target_device GPU --task text_generation --allowed_media_domains raw.githubusercontent.com +``` + +Use MCP server, with additional image of Gdańsk old town. VLM model deduces location and calls `get_weather` tool to summarize the weather conditions in the city. + +```{image} https://images.pexels.com/photos/20015887/pexels-photo-20015887.jpeg +:alt: poland +:width: 360px +``` + +> **Note**: Image source: [Link](https://images.pexels.com/photos/20015887/pexels-photo-20015887.jpeg) + +```bash +python openai_agent.py --query "What is the current weather in location depicted in the image?" --image https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2026/1/demos/continuous_batching/agentic_ai/photo.jpeg --model OpenVINO/Qwen3.6-35B-A3B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather +``` +::: :::{tab-item} gpt-oss-20b :sync: gpt-oss-20b Pull and start OVMS: From c92651e52f6075494d6038ed1042516c69d864ff Mon Sep 17 00:00:00 2001 From: dkalinowski Date: Thu, 28 May 2026 16:59:35 +0200 Subject: [PATCH 2/3] Package links 2026.2 / image tag change (#4256) --- demos/c_api_minimal_app/Makefile | 4 +-- demos/code_local_assistant/README.md | 10 +++---- demos/continuous_batching/README.md | 2 +- .../continuous_batching/agentic_ai/README.md | 30 +++++++++---------- .../speculative_decoding/README.md | 2 +- demos/gguf/README.md | 2 +- demos/integration_with_OpenWebUI/README.md | 26 ++++++++-------- docs/deploying_server_baremetal.md | 28 ++++++++--------- docs/deploying_server_docker.md | 2 +- 
docs/pull_hf_models.md | 4 +-- docs/pull_optimum_cli.md | 2 +- extras/openshift_AI/ServingRuntime.yaml | 2 +- 12 files changed, 57 insertions(+), 57 deletions(-) diff --git a/demos/c_api_minimal_app/Makefile b/demos/c_api_minimal_app/Makefile index 86e28febd5..8fd0473b75 100644 --- a/demos/c_api_minimal_app/Makefile +++ b/demos/c_api_minimal_app/Makefile @@ -25,13 +25,13 @@ BASE_OS ?= ubuntu24 ifeq ($(BASE_OS),ubuntu24) BASE_OS_TAG_UBUNTU ?= 24.04 - PACKAGE_URL ?="https://github.com/openvinotoolkit/model_server/releases/download/v2026.1/ovms_ubuntu24_2026.1.0_python_off.tar.gz" + PACKAGE_URL ?="https://github.com/openvinotoolkit/model_server/releases/download/v2026.2/ovms_ubuntu24_2026.2.0_python_off.tar.gz" BASE_IMAGE ?= ubuntu:$(BASE_OS_TAG_UBUNTU) DIST_OS=ubuntu endif ifeq ($(BASE_OS),redhat) BASE_OS_TAG_REDHAT ?= 9.6 - PACKAGE_URL ="https://github.com/openvinotoolkit/model_server/releases/download/v2026.1/ovms_redhat_2026.1.0_python_off.tar.gz" + PACKAGE_URL ="https://github.com/openvinotoolkit/model_server/releases/download/v2026.2/ovms_redhat_2026.2.0_python_off.tar.gz" BASE_IMAGE ?= registry.access.redhat.com/ubi9/ubi:$(BASE_OS_TAG_REDHAT) DIST_OS=redhat endif diff --git a/demos/code_local_assistant/README.md b/demos/code_local_assistant/README.md index c55160e602..32e24e0610 100644 --- a/demos/code_local_assistant/README.md +++ b/demos/code_local_assistant/README.md @@ -66,7 +66,7 @@ ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-VL-8B-Instr ```bash mkdir -p models docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ - openvino/model_server:weekly \ + openvino/model_server:latest-gpu \ --model_repository_path /models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --model_name Qwen3-Coder-30B-A3B-Instruct ``` > **Note:** For 
deployment, the model requires ~16GB disk space and recommended 19GB+ of VRAM on the GPU. @@ -79,7 +79,7 @@ docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/model ```bash mkdir -p models docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ - openvino/model_server:weekly \ + openvino/model_server:latest-gpu \ --model_repository_path /models --source_model OpenVINO/gpt-oss-20b-int4-ov --task text_generation --target_device GPU --tool_parser gptoss --reasoning_parser gptoss --rest_port 8000 --model_name gpt-oss-20b ``` > **Note:** For deployment, the model requires ~12GB disk space and recommended 16GB+ of VRAM on the GPU. @@ -90,7 +90,7 @@ docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/model ```bash mkdir c:\models docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ - openvino/model_server:weekly \ + openvino/model_server:latest-gpu \ --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --task text_generation --target_device GPU --tool_parser hermes3 --reasoning_parser qwen3 --rest_port 8000 --model_name Qwen3-8B ``` > **Note:** For deployment, the model requires ~4GB disk space and recommended 6GB+ of VRAM on the GPU. 
@@ -100,7 +100,7 @@ docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/model ```bash mkdir -p models docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ - openvino/model_server:weekly \ + openvino/model_server:latest-gpu \ --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --task text_generation --target_device NPU --tool_parser hermes3 --rest_port 8000 --max_prompt_len 16384 --plugin_config '{"NPUW_LLM_PREFILL_ATTENTION_HINT":"PYRAMID"}' --model_name Qwen3-8B ``` > **Note:** First model initialization might be long. With the compilation cache, sequential model loading will be fast. @@ -110,7 +110,7 @@ docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/model ```bash mkdir -p models docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ - openvino/model_server:weekly \ + openvino/model_server:latest-gpu \ --model_repository_path /models --source_model OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --task text_generation --target_device GPU --pipeline_type VLM_CB --rest_port 8000 --model_name Qwen3-VL-8B-Instruct ``` > **Note:** This is a Vision Language Model (VLM) that supports image inputs. For deployment, recommended 7GB+ of VRAM on the GPU. 
diff --git a/demos/continuous_batching/README.md b/demos/continuous_batching/README.md index dfc6fef039..7d6ed25ddd 100644 --- a/demos/continuous_batching/README.md +++ b/demos/continuous_batching/README.md @@ -35,7 +35,7 @@ That makes it easy to use and efficient especially on on Intel® Xeon® processo Running this command starts the container with CPU only target device: ```bash mkdir -p ${HOME}/models -docker run -it -p 8000:8000 --rm --user $(id -u):$(id -g) -v ${HOME}/models:/models/:rw openvino/model_server:weekly --model_repository_path /models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device CPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov +docker run -it -p 8000:8000 --rm --user $(id -u):$(id -g) -v ${HOME}/models:/models/:rw openvino/model_server:latest-gpu --model_repository_path /models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device CPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov ``` > **Note:** In case you want to use GPU target device, add extra docker parameters `--device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` to `docker run` command. The parameter `--target_device` should be also updated to `GPU`. 
diff --git a/demos/continuous_batching/agentic_ai/README.md b/demos/continuous_batching/agentic_ai/README.md index 1b76928aeb..b31251ab15 100644 --- a/demos/continuous_batching/agentic_ai/README.md +++ b/demos/continuous_batching/agentic_ai/README.md @@ -226,7 +226,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --tool_parser hermes3 --task text_generation --pipeline_type VLM_CB --allowed_media_domains raw.githubusercontent.com ``` @@ -253,7 +253,7 @@ The current weather in Gdańsk is overcast with a temperature of 8.8°C (feels l Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-4B-int4-ov --tool_parser hermes3 --task text_generation ``` @@ -272,7 +272,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-4-mini-instruct-int4-ov --tool_parser phi4 --task text_generation --max_num_batched_tokens 99999 ``` @@ -291,7 +291,7 @@ The current weather in 
Tokyo is as follows: The sky is mostly covered with cloud Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --task text_generation ``` @@ -312,7 +312,7 @@ Vision Language MoE model (35B total / 3B active parameters). Requires OpenVINO Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ --rest_port 8000 --source_model OpenVINO/Qwen3.6-35B-A3B-int4-ov --model_repository_path /models --reasoning_parser qwen3 --tool_parser qwen3coder --task text_generation --allowed_media_domains raw.githubusercontent.com ``` @@ -334,7 +334,7 @@ python openai_agent.py --query "What is the current weather in location depicted Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ --rest_port 8000 --source_model OpenVINO/gpt-oss-20b-int4-ov --model_repository_path /models \ --tool_parser gptoss --reasoning_parser gptoss --task text_generation ``` @@ -375,7 +375,7 @@ It can be applied using the commands below: Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +docker run -d --user 
$(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --tool_parser hermes3 --target_device GPU --task text_generation --pipeline_type VLM_CB --allowed_media_domains raw.githubusercontent.com ``` @@ -402,7 +402,7 @@ The current weather in Gdańsk is overcast with a temperature of 8.8°C (feels l Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-4B-int4-ov --tool_parser hermes3 --target_device GPU --task text_generation ``` @@ -421,7 +421,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-4-mini-instruct-int4-ov --tool_parser phi4 --task text_generation --target_device GPU --max_num_batched_tokens 99999 ``` @@ -440,7 +440,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models 
-docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --target_device GPU --task text_generation --enable_tool_guided_generation true ``` @@ -461,7 +461,7 @@ Vision Language MoE model (35B total / 3B active parameters). Requires OpenVINO Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ --rest_port 8000 --source_model OpenVINO/Qwen3.6-35B-A3B-int4-ov --model_repository_path /models --reasoning_parser qwen3 --tool_parser qwen3coder --target_device GPU --task text_generation --allowed_media_domains raw.githubusercontent.com ``` @@ -483,7 +483,7 @@ python openai_agent.py --query "What is the current weather in location depicted Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ --rest_port 8000 --source_model OpenVINO/gpt-oss-20b-int4-ov 
--model_repository_path /models \ --tool_parser gptoss --reasoning_parser gptoss --target_device GPU --task text_generation ``` @@ -524,7 +524,7 @@ It can be applied using the commands below: Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -1) openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -1) openvino/model_server:latest-gpu \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --tool_parser hermes3 --target_device NPU --task text_generation --max_prompt_len 8000 ``` @@ -543,7 +543,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ --rest_port 8000 --model_repository_path /models --source_model FluidInference/qwen3-4b-int4-ov-npu --tool_parser hermes3 --target_device NPU --task text_generation --max_prompt_len 8000 ``` @@ -568,7 +568,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ --rest_port 8000 --model_repository_path /models --source_model 
OpenVINO/Qwen3-8B-int4-ov --tool_parser hermes3 --task text_generation ``` diff --git a/demos/continuous_batching/speculative_decoding/README.md b/demos/continuous_batching/speculative_decoding/README.md index ee1541d8a0..bc74e44e50 100644 --- a/demos/continuous_batching/speculative_decoding/README.md +++ b/demos/continuous_batching/speculative_decoding/README.md @@ -81,7 +81,7 @@ models :::{dropdown} **Deploying with Docker** ```bash -docker run -d --rm -p 8000:8000 -v $(pwd)/models:/workspace:ro openvino/model_server:weekly --rest_port 8000 --rest_workers 2 --config_path /workspace/config.json +docker run -d --rm -p 8000:8000 -v $(pwd)/models:/workspace:ro openvino/model_server:latest-gpu --rest_port 8000 --rest_workers 2 --config_path /workspace/config.json ``` Running above command starts the container with no accelerators support. diff --git a/demos/gguf/README.md b/demos/gguf/README.md index a1ed003de8..aa15f2ea6b 100644 --- a/demos/gguf/README.md +++ b/demos/gguf/README.md @@ -20,7 +20,7 @@ Start docker container: mkdir models docker run -d --rm --user $(id -u):$(id -g) -p 8000:8000 -v $(pwd)/models:/models/:rw \ -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy \ - openvino/model_server:weekly \ + openvino/model_server:latest-gpu \ --rest_port 8000 \ --model_repository_path /models/ \ --task text_generation \ diff --git a/demos/integration_with_OpenWebUI/README.md b/demos/integration_with_OpenWebUI/README.md index f077c98de9..e6aa161646 100644 --- a/demos/integration_with_OpenWebUI/README.md +++ b/demos/integration_with_OpenWebUI/README.md @@ -40,9 +40,9 @@ ovms.exe --rest_port 8000 --config_path models\config.json --allowed_media_domai :sync: Linux ```bash mkdir models -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly --pull --source_model OpenVINO/gpt-oss-20b-int4-ov --model_repository_path /models --task 
text_generation --tool_parser gptoss --reasoning_parser gptoss --target_device GPU -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:weekly --add_to_config --config_path /models/config.json --model_path OpenVINO/gpt-oss-20b-int4-ov --model_name ovms-model -docker run -d -u $(id -u):$(id -g) -v $PWD/models:/models -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly --rest_port 8000 --config_path /models/config.json --allowed_media_domains raw.githubusercontent.com +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu --pull --source_model OpenVINO/gpt-oss-20b-int4-ov --model_repository_path /models --task text_generation --tool_parser gptoss --reasoning_parser gptoss --target_device GPU +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:latest-gpu --add_to_config --config_path /models/config.json --model_path OpenVINO/gpt-oss-20b-int4-ov --model_name ovms-model +docker run -d -u $(id -u):$(id -g) -v $PWD/models:/models -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu --rest_port 8000 --config_path /models/config.json --allowed_media_domains raw.githubusercontent.com ``` ::: :::: @@ -133,10 +133,10 @@ ovms.exe --add_to_config --config_path models\config.json --model_path OpenVINO\ :::{tab-item} Linux (using Docker) :sync: Linux ```bash -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly --pull --source_model OpenVINO/Qwen3-Embedding-0.6B-fp16-ov --model_repository_path models --task embeddings --target_device GPU -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:weekly --add_to_config --config_path 
/models/config.json --model_path OpenVINO/Qwen3-Embedding-0.6B-fp16-ov --model_name OpenVINO/Qwen3-Embedding-0.6B-fp16-ov -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly --pull --source_model OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov --model_repository_path models --task rerank --target_device GPU -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:weekly --add_to_config --config_path /models/config.json --model_path OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov --model_name OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu --pull --source_model OpenVINO/Qwen3-Embedding-0.6B-fp16-ov --model_repository_path models --task embeddings --target_device GPU +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:latest-gpu --add_to_config --config_path /models/config.json --model_path OpenVINO/Qwen3-Embedding-0.6B-fp16-ov --model_name OpenVINO/Qwen3-Embedding-0.6B-fp16-ov +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu --pull --source_model OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov --model_repository_path models --task rerank --target_device GPU +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:latest-gpu --add_to_config --config_path /models/config.json --model_path OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov --model_name OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov ``` ::: :::: @@ -228,8 +228,8 @@ ovms.exe --add_to_config --config_path models\config.json --model_path OpenVINO\ :::{tab-item} Linux (using Docker) :sync: Linux ```bash -docker run --rm -u $(id -u):$(id -g) -v 
$PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly --pull --source_model OpenVINO/FLUX.1-schnell-int4-ov --model_repository_path models --model_name OpenVINO/FLUX.1-schnell-int4-ov --task image_generation --default_num_inference_steps 3 --target_device GPU -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:weekly --add_to_config --config_path /models/config.json --model_path OpenVINO/FLUX.1-schnell-int4-ov --model_name OpenVINO/FLUX.1-schnell-int4-ov +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu --pull --source_model OpenVINO/FLUX.1-schnell-int4-ov --model_repository_path models --model_name OpenVINO/FLUX.1-schnell-int4-ov --task image_generation --default_num_inference_steps 3 --target_device GPU +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:latest-gpu --add_to_config --config_path /models/config.json --model_path OpenVINO/FLUX.1-schnell-int4-ov --model_name OpenVINO/FLUX.1-schnell-int4-ov ``` ::: :::: @@ -298,8 +298,8 @@ ovms.exe --add_to_config --config_path models\config.json --model_path OpenVINO/ :::{tab-item} Linux (using Docker) :sync: Linux ```bash -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly --pull --source_model OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --model_repository_path /models --model_name ovms-model-vl --task text_generation --pipeline_type VLM_CB --target_device GPU -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:weekly --add_to_config --config_path /models/config.json --model_path OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --model_name ovms-model-vl +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri 
--group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu --pull --source_model OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --model_repository_path /models --model_name ovms-model-vl --task text_generation --pipeline_type VLM_CB --target_device GPU +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:latest-gpu --add_to_config --config_path /models/config.json --model_path OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --model_name ovms-model-vl ``` ::: :::: @@ -481,8 +481,8 @@ ovms.exe --add_to_config --config_path models\config.json --model_path OpenVINO :::{tab-item} Linux (using Docker) :sync: Linux ```bash -docker run --rm -u $(id -u):$(id -g) --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $PWD/models:/models openvino/model_server:weekly --pull --source_model OpenVINO/whisper-base-fp16-ov --model_repository_path /models --task speech2text --target_device GPU -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:weekly --add_to_config --config_path /models/config.json --model_path OpenVINO/whisper-base-fp16-ov --model_name OpenVINO/whisper-base-fp16-ov +docker run --rm -u $(id -u):$(id -g) --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $PWD/models:/models openvino/model_server:latest-gpu --pull --source_model OpenVINO/whisper-base-fp16-ov --model_repository_path /models --task speech2text --target_device GPU +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:latest-gpu --add_to_config --config_path /models/config.json --model_path OpenVINO/whisper-base-fp16-ov --model_name OpenVINO/whisper-base-fp16-ov ``` ::: :::: diff --git a/docs/deploying_server_baremetal.md b/docs/deploying_server_baremetal.md index 7302d373fb..6f56df1e6d 100644 --- a/docs/deploying_server_baremetal.md +++ b/docs/deploying_server_baremetal.md @@ -15,13 +15,13 @@ You can download model server package in two configurations. 
One with Python sup :sync: ubuntu-22-04 Download precompiled package (without python): ```{code} sh -wget https://github.com/openvinotoolkit/model_server/releases/download/v2026.1/ovms_ubuntu22_2026.1.0_python_off.tar.gz -tar -xzvf ovms_ubuntu22_2026.1.0_python_off.tar.gz +wget https://github.com/openvinotoolkit/model_server/releases/download/v2026.2/ovms_ubuntu22_2026.2.0_python_off.tar.gz +tar -xzvf ovms_ubuntu22_2026.2.0_python_off.tar.gz ``` or precompiled package (with python): ```{code} sh -wget https://github.com/openvinotoolkit/model_server/releases/download/v2026.1/ovms_ubuntu22_2026.1.0_python_on.tar.gz -tar -xzvf ovms_ubuntu22_2026.1.0_python_on.tar.gz +wget https://github.com/openvinotoolkit/model_server/releases/download/v2026.2/ovms_ubuntu22_2026.2.0_python_on.tar.gz +tar -xzvf ovms_ubuntu22_2026.2.0_python_on.tar.gz ``` Install required libraries: ```{code} sh @@ -50,13 +50,13 @@ Model server version with Python is shipped with those packages and new installa :sync: ubuntu-24-04 Download precompiled package (without python): ```{code} sh -wget https://github.com/openvinotoolkit/model_server/releases/download/v2026.1/ovms_ubuntu24_2026.1.0_python_off.tar.gz -tar -xzvf ovms_ubuntu24_2026.1.0_python_off.tar.gz +wget https://github.com/openvinotoolkit/model_server/releases/download/v2026.2/ovms_ubuntu24_2026.2.0_python_off.tar.gz +tar -xzvf ovms_ubuntu24_2026.2.0_python_off.tar.gz ``` or precompiled package (with python): ```{code} sh -wget https://github.com/openvinotoolkit/model_server/releases/download/v2026.1/ovms_ubuntu24_2026.1.0_python_on.tar.gz -tar -xzvf ovms_ubuntu24_2026.1.0_python_on.tar.gz +wget https://github.com/openvinotoolkit/model_server/releases/download/v2026.2/ovms_ubuntu24_2026.2.0_python_on.tar.gz +tar -xzvf ovms_ubuntu24_2026.2.0_python_on.tar.gz ``` Install required libraries: ```{code} sh @@ -85,13 +85,13 @@ Model server version with Python is shipped with those packages and new installa :sync: rhel-9.6 Download precompiled 
package (without python): ```{code} sh -wget https://github.com/openvinotoolkit/model_server/releases/download/v2026.1/ovms_redhat_2026.1.0_python_off.tar.gz -tar -xzvf ovms_redhat_2026.1.0_python_off.tar.gz +wget https://github.com/openvinotoolkit/model_server/releases/download/v2026.2/ovms_redhat_2026.2.0_python_off.tar.gz +tar -xzvf ovms_redhat_2026.2.0_python_off.tar.gz ``` or precompiled package (with python): ```{code} sh -wget https://github.com/openvinotoolkit/model_server/releases/download/v2026.1/ovms_redhat_2026.1.0_python_on.tar.gz -tar -xzvf ovms_redhat_2026.1.0_python_on.tar.gz +wget https://github.com/openvinotoolkit/model_server/releases/download/v2026.2/ovms_redhat_2026.2.0_python_on.tar.gz +tar -xzvf ovms_redhat_2026.2.0_python_on.tar.gz ``` Install required libraries: ```{code} sh @@ -124,14 +124,14 @@ Make sure you have [Microsoft Visual C++ Redistributable](https://aka.ms/vs/17/r Download and unpack model server archive for Windows(with python): ```bat -curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2026.1/ovms_windows_2026.1.0_python_on.zip -o ovms.zip +curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2026.2/ovms_windows_2026.2.0_python_on.zip -o ovms.zip tar -xf ovms.zip ``` or archive without python: ```bat -curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2026.1/ovms_windows_2026.1.0_python_off.zip -o ovms.zip +curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2026.2/ovms_windows_2026.2.0_python_off.zip -o ovms.zip tar -xf ovms.zip ``` diff --git a/docs/deploying_server_docker.md b/docs/deploying_server_docker.md index e2c51d4d79..086e6d0a12 100644 --- a/docs/deploying_server_docker.md +++ b/docs/deploying_server_docker.md @@ -27,7 +27,7 @@ or [RedHat Ecosystem Catalog](https://catalog.redhat.com/software/containers/int docker pull registry.connect.redhat.com/intel/openvino-model-server:latest ``` -> **NOTE**: You can also pull public 
image `openvino/model_server:weekly` with development version of the model server, which is built from the main branch. It allow you to evaluate the latest features ahead of official releases. +> **NOTE**: You can also pull public image `openvino/model_server:latest-gpu` with development version of the model server, which is built from the main branch. It allow you to evaluate the latest features ahead of official releases. #### Step 2. Prepare Data for Serving diff --git a/docs/pull_hf_models.md b/docs/pull_hf_models.md index 77ab0dfb60..bdf057fdde 100644 --- a/docs/pull_hf_models.md +++ b/docs/pull_hf_models.md @@ -17,7 +17,7 @@ There is a special OVMS mode to pull the model from Hugging Face without startin :sync: docker **Required:** Docker Engine installed ```text -docker run $(id -u):$(id -g) --rm -v :/models:rw openvino/model_server:weekly --pull --source_model --model_repository_path /models --model_name --target_device [--gguf_filename SPECIFIC_QUANTIZATION_FILENAME.gguf] --task [TASK_SPECIFIC_PARAMETERS] +docker run $(id -u):$(id -g) --rm -v :/models:rw openvino/model_server:latest-gpu --pull --source_model --model_repository_path /models --model_name --target_device [--gguf_filename SPECIFIC_QUANTIZATION_FILENAME.gguf] --task [TASK_SPECIFIC_PARAMETERS] ``` ::: @@ -63,7 +63,7 @@ Example for pulling GGUF model `unsloth/Llama-3.2-1B-Instruct-GGUF` with Q4_K_M **Required:** Docker Engine installed ```text -docker run $(id -u):$(id -g) --rm -v :/models:rw openvino/model_server:weekly --pull --source_model "unsloth/Llama-3.2-1B-Instruct-GGUF" --model_repository_path /models --model_name unsloth/Llama-3.2-1B-Instruct-GGUF --task text_generation --gguf_filename Llama-3.2-1B-Instruct-Q4_K_M.gguf +docker run $(id -u):$(id -g) --rm -v :/models:rw openvino/model_server:latest-gpu --pull --source_model "unsloth/Llama-3.2-1B-Instruct-GGUF" --model_repository_path /models --model_name unsloth/Llama-3.2-1B-Instruct-GGUF --task text_generation --gguf_filename 
Llama-3.2-1B-Instruct-Q4_K_M.gguf ``` ::: diff --git a/docs/pull_optimum_cli.md b/docs/pull_optimum_cli.md index bcf3159132..8177a7946e 100644 --- a/docs/pull_optimum_cli.md +++ b/docs/pull_optimum_cli.md @@ -15,7 +15,7 @@ mkdir models ## Add optimum-cli to OVMS installation on windows ```bat -curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2026.1/ovms_windows_2026.1.0_python_on.zip -o ovms.zip +curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2026.2/ovms_windows_2026.2.0_python_on.zip -o ovms.zip tar -xf ovms.zip ovms\setupvars.bat ovms\python\python -m pip install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt diff --git a/extras/openshift_AI/ServingRuntime.yaml b/extras/openshift_AI/ServingRuntime.yaml index c7d01e3889..bd83ba6efb 100644 --- a/extras/openshift_AI/ServingRuntime.yaml +++ b/extras/openshift_AI/ServingRuntime.yaml @@ -21,7 +21,7 @@ spec: - --model_path=/mnt/models - --file_system_poll_wait_seconds=0 - --metrics_enable - image: docker.io/openvino/model_server:weekly + image: docker.io/openvino/model_server:latest-gpu startupProbe: periodSeconds: 5 failureThreshold: 9999 From 0813110c684e10dbc99ddd3a3fe060f02d291fd4 Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Fri, 29 May 2026 09:18:35 +0200 Subject: [PATCH 3/3] fix --- demos/code_local_assistant/README.md | 10 +++---- demos/continuous_batching/README.md | 2 +- .../continuous_batching/agentic_ai/README.md | 30 +++++++++---------- .../speculative_decoding/README.md | 2 +- demos/gguf/README.md | 2 +- demos/integration_with_OpenWebUI/README.md | 26 ++++++++-------- docs/deploying_server_docker.md | 2 +- docs/pull_hf_models.md | 4 +-- extras/openshift_AI/ServingRuntime.yaml | 2 +- 9 files changed, 40 insertions(+), 40 deletions(-) diff --git a/demos/code_local_assistant/README.md b/demos/code_local_assistant/README.md index 32e24e0610..c55160e602 100644 --- 
a/demos/code_local_assistant/README.md +++ b/demos/code_local_assistant/README.md @@ -66,7 +66,7 @@ ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-VL-8B-Instr ```bash mkdir -p models docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ - openvino/model_server:latest-gpu \ + openvino/model_server:weekly \ --model_repository_path /models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --model_name Qwen3-Coder-30B-A3B-Instruct ``` > **Note:** For deployment, the model requires ~16GB disk space and recommended 19GB+ of VRAM on the GPU. @@ -79,7 +79,7 @@ docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/model ```bash mkdir -p models docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ - openvino/model_server:latest-gpu \ + openvino/model_server:weekly \ --model_repository_path /models --source_model OpenVINO/gpt-oss-20b-int4-ov --task text_generation --target_device GPU --tool_parser gptoss --reasoning_parser gptoss --rest_port 8000 --model_name gpt-oss-20b ``` > **Note:** For deployment, the model requires ~12GB disk space and recommended 16GB+ of VRAM on the GPU. 
@@ -90,7 +90,7 @@ docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/model ```bash mkdir c:\models docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ - openvino/model_server:latest-gpu \ + openvino/model_server:weekly \ --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --task text_generation --target_device GPU --tool_parser hermes3 --reasoning_parser qwen3 --rest_port 8000 --model_name Qwen3-8B ``` > **Note:** For deployment, the model requires ~4GB disk space and recommended 6GB+ of VRAM on the GPU. @@ -100,7 +100,7 @@ docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/model ```bash mkdir -p models docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ - openvino/model_server:latest-gpu \ + openvino/model_server:weekly \ --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --task text_generation --target_device NPU --tool_parser hermes3 --rest_port 8000 --max_prompt_len 16384 --plugin_config '{"NPUW_LLM_PREFILL_ATTENTION_HINT":"PYRAMID"}' --model_name Qwen3-8B ``` > **Note:** First model initialization might be long. With the compilation cache, sequential model loading will be fast. 
@@ -110,7 +110,7 @@ docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/model ```bash mkdir -p models docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ - openvino/model_server:latest-gpu \ + openvino/model_server:weekly \ --model_repository_path /models --source_model OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --task text_generation --target_device GPU --pipeline_type VLM_CB --rest_port 8000 --model_name Qwen3-VL-8B-Instruct ``` > **Note:** This is a Vision Language Model (VLM) that supports image inputs. For deployment, recommended 7GB+ of VRAM on the GPU. diff --git a/demos/continuous_batching/README.md b/demos/continuous_batching/README.md index 7d6ed25ddd..dfc6fef039 100644 --- a/demos/continuous_batching/README.md +++ b/demos/continuous_batching/README.md @@ -35,7 +35,7 @@ That makes it easy to use and efficient especially on on Intel® Xeon® processo Running this command starts the container with CPU only target device: ```bash mkdir -p ${HOME}/models -docker run -it -p 8000:8000 --rm --user $(id -u):$(id -g) -v ${HOME}/models:/models/:rw openvino/model_server:latest-gpu --model_repository_path /models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device CPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov +docker run -it -p 8000:8000 --rm --user $(id -u):$(id -g) -v ${HOME}/models:/models/:rw openvino/model_server:weekly --model_repository_path /models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device CPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov ``` > **Note:** In case you want to use GPU target device, add extra docker parameters `--device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` to `docker run` command. 
The parameter `--target_device` should be also updated to `GPU`. diff --git a/demos/continuous_batching/agentic_ai/README.md b/demos/continuous_batching/agentic_ai/README.md index b31251ab15..1b76928aeb 100644 --- a/demos/continuous_batching/agentic_ai/README.md +++ b/demos/continuous_batching/agentic_ai/README.md @@ -226,7 +226,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --tool_parser hermes3 --task text_generation --pipeline_type VLM_CB --allowed_media_domains raw.githubusercontent.com ``` @@ -253,7 +253,7 @@ The current weather in Gdańsk is overcast with a temperature of 8.8°C (feels l Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-4B-int4-ov --tool_parser hermes3 --task text_generation ``` @@ -272,7 +272,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-4-mini-instruct-int4-ov --tool_parser phi4 --task text_generation 
--max_num_batched_tokens 99999 ``` @@ -291,7 +291,7 @@ The current weather in Tokyo is as follows: The sky is mostly covered with cloud Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --task text_generation ``` @@ -312,7 +312,7 @@ Vision Language MoE model (35B total / 3B active parameters). Requires OpenVINO Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ --rest_port 8000 --source_model OpenVINO/Qwen3.6-35B-A3B-int4-ov --model_repository_path /models --reasoning_parser qwen3 --tool_parser qwen3coder --task text_generation --allowed_media_domains raw.githubusercontent.com ``` @@ -334,7 +334,7 @@ python openai_agent.py --query "What is the current weather in location depicted Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ --rest_port 8000 --source_model OpenVINO/gpt-oss-20b-int4-ov --model_repository_path /models \ --tool_parser gptoss --reasoning_parser gptoss --task text_generation ``` @@ -375,7 +375,7 @@ It can be applied using the commands below: Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" 
/dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --tool_parser hermes3 --target_device GPU --task text_generation --pipeline_type VLM_CB --allowed_media_domains raw.githubusercontent.com ``` @@ -402,7 +402,7 @@ The current weather in Gdańsk is overcast with a temperature of 8.8°C (feels l Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-4B-int4-ov --tool_parser hermes3 --target_device GPU --task text_generation ``` @@ -421,7 +421,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-4-mini-instruct-int4-ov --tool_parser phi4 --task text_generation --target_device GPU --max_num_batched_tokens 99999 ``` @@ -440,7 +440,7 @@ The current weather in Tokyo is overcast with a 
temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --target_device GPU --task text_generation --enable_tool_guided_generation true ``` @@ -461,7 +461,7 @@ Vision Language MoE model (35B total / 3B active parameters). Requires OpenVINO Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ --rest_port 8000 --source_model OpenVINO/Qwen3.6-35B-A3B-int4-ov --model_repository_path /models --reasoning_parser qwen3 --tool_parser qwen3coder --target_device GPU --task text_generation --allowed_media_domains raw.githubusercontent.com ``` @@ -483,7 +483,7 @@ python openai_agent.py --query "What is the current weather in location depicted Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) 
openvino/model_server:weekly \ --rest_port 8000 --source_model OpenVINO/gpt-oss-20b-int4-ov --model_repository_path /models \ --tool_parser gptoss --reasoning_parser gptoss --target_device GPU --task text_generation ``` @@ -524,7 +524,7 @@ It can be applied using the commands below: Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -1) openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -1) openvino/model_server:weekly \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --tool_parser hermes3 --target_device NPU --task text_generation --max_prompt_len 8000 ``` @@ -543,7 +543,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ --rest_port 8000 --model_repository_path /models --source_model FluidInference/qwen3-4b-int4-ov-npu --tool_parser hermes3 --target_device NPU --task text_generation --max_prompt_len 8000 ``` @@ -568,7 +568,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:latest-gpu \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models 
openvino/model_server:weekly \ --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --tool_parser hermes3 --task text_generation ``` diff --git a/demos/continuous_batching/speculative_decoding/README.md b/demos/continuous_batching/speculative_decoding/README.md index bc74e44e50..ee1541d8a0 100644 --- a/demos/continuous_batching/speculative_decoding/README.md +++ b/demos/continuous_batching/speculative_decoding/README.md @@ -81,7 +81,7 @@ models :::{dropdown} **Deploying with Docker** ```bash -docker run -d --rm -p 8000:8000 -v $(pwd)/models:/workspace:ro openvino/model_server:latest-gpu --rest_port 8000 --rest_workers 2 --config_path /workspace/config.json +docker run -d --rm -p 8000:8000 -v $(pwd)/models:/workspace:ro openvino/model_server:weekly --rest_port 8000 --rest_workers 2 --config_path /workspace/config.json ``` Running above command starts the container with no accelerators support. diff --git a/demos/gguf/README.md b/demos/gguf/README.md index aa15f2ea6b..a1ed003de8 100644 --- a/demos/gguf/README.md +++ b/demos/gguf/README.md @@ -20,7 +20,7 @@ Start docker container: mkdir models docker run -d --rm --user $(id -u):$(id -g) -p 8000:8000 -v $(pwd)/models:/models/:rw \ -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy \ - openvino/model_server:latest-gpu \ + openvino/model_server:weekly \ --rest_port 8000 \ --model_repository_path /models/ \ --task text_generation \ diff --git a/demos/integration_with_OpenWebUI/README.md b/demos/integration_with_OpenWebUI/README.md index e6aa161646..f077c98de9 100644 --- a/demos/integration_with_OpenWebUI/README.md +++ b/demos/integration_with_OpenWebUI/README.md @@ -40,9 +40,9 @@ ovms.exe --rest_port 8000 --config_path models\config.json --allowed_media_domai :sync: Linux ```bash mkdir models -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu 
--pull --source_model OpenVINO/gpt-oss-20b-int4-ov --model_repository_path /models --task text_generation --tool_parser gptoss --reasoning_parser gptoss --target_device GPU -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:latest-gpu --add_to_config --config_path /models/config.json --model_path OpenVINO/gpt-oss-20b-int4-ov --model_name ovms-model -docker run -d -u $(id -u):$(id -g) -v $PWD/models:/models -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu --rest_port 8000 --config_path /models/config.json --allowed_media_domains raw.githubusercontent.com +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly --pull --source_model OpenVINO/gpt-oss-20b-int4-ov --model_repository_path /models --task text_generation --tool_parser gptoss --reasoning_parser gptoss --target_device GPU +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:weekly --add_to_config --config_path /models/config.json --model_path OpenVINO/gpt-oss-20b-int4-ov --model_name ovms-model +docker run -d -u $(id -u):$(id -g) -v $PWD/models:/models -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly --rest_port 8000 --config_path /models/config.json --allowed_media_domains raw.githubusercontent.com ``` ::: :::: @@ -133,10 +133,10 @@ ovms.exe --add_to_config --config_path models\config.json --model_path OpenVINO\ :::{tab-item} Linux (using Docker) :sync: Linux ```bash -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu --pull --source_model OpenVINO/Qwen3-Embedding-0.6B-fp16-ov --model_repository_path models --task embeddings --target_device GPU -docker run --rm -u $(id -u):$(id -g) -v 
$PWD/models:/models openvino/model_server:latest-gpu --add_to_config --config_path /models/config.json --model_path OpenVINO/Qwen3-Embedding-0.6B-fp16-ov --model_name OpenVINO/Qwen3-Embedding-0.6B-fp16-ov -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu --pull --source_model OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov --model_repository_path models --task rerank --target_device GPU -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:latest-gpu --add_to_config --config_path /models/config.json --model_path OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov --model_name OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly --pull --source_model OpenVINO/Qwen3-Embedding-0.6B-fp16-ov --model_repository_path /models --task embeddings --target_device GPU +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:weekly --add_to_config --config_path /models/config.json --model_path OpenVINO/Qwen3-Embedding-0.6B-fp16-ov --model_name OpenVINO/Qwen3-Embedding-0.6B-fp16-ov +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly --pull --source_model OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov --model_repository_path /models --task rerank --target_device GPU +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:weekly --add_to_config --config_path /models/config.json --model_path OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov --model_name OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov ``` ::: :::: @@ -228,8 +228,8 @@ ovms.exe --add_to_config --config_path models\config.json --model_path OpenVINO\ :::{tab-item} Linux (using
Docker) :sync: Linux ```bash -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu --pull --source_model OpenVINO/FLUX.1-schnell-int4-ov --model_repository_path models --model_name OpenVINO/FLUX.1-schnell-int4-ov --task image_generation --default_num_inference_steps 3 --target_device GPU -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:latest-gpu --add_to_config --config_path /models/config.json --model_path OpenVINO/FLUX.1-schnell-int4-ov --model_name OpenVINO/FLUX.1-schnell-int4-ov +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly --pull --source_model OpenVINO/FLUX.1-schnell-int4-ov --model_repository_path /models --model_name OpenVINO/FLUX.1-schnell-int4-ov --task image_generation --default_num_inference_steps 3 --target_device GPU +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:weekly --add_to_config --config_path /models/config.json --model_path OpenVINO/FLUX.1-schnell-int4-ov --model_name OpenVINO/FLUX.1-schnell-int4-ov ``` ::: :::: @@ -298,8 +298,8 @@ ovms.exe --add_to_config --config_path models\config.json --model_path OpenVINO/ :::{tab-item} Linux (using Docker) :sync: Linux ```bash -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu --pull --source_model OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --model_repository_path /models --model_name ovms-model-vl --task text_generation --pipeline_type VLM_CB --target_device GPU -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:latest-gpu --add_to_config --config_path /models/config.json --model_path OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --model_name ovms-model-vl +docker run
--rm -u $(id -u):$(id -g) -v $PWD/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly --pull --source_model OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --model_repository_path /models --model_name ovms-model-vl --task text_generation --pipeline_type VLM_CB --target_device GPU +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:weekly --add_to_config --config_path /models/config.json --model_path OpenVINO/Qwen3-VL-8B-Instruct-int4-ov --model_name ovms-model-vl ``` ::: :::: @@ -481,8 +481,8 @@ ovms.exe --add_to_config --config_path models\config.json --model_path OpenVINO :::{tab-item} Linux (using Docker) :sync: Linux ```bash -docker run --rm -u $(id -u):$(id -g) --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $PWD/models:/models openvino/model_server:latest-gpu --pull --source_model OpenVINO/whisper-base-fp16-ov --model_repository_path /models --task speech2text --target_device GPU -docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:latest-gpu --add_to_config --config_path /models/config.json --model_path OpenVINO/whisper-base-fp16-ov --model_name OpenVINO/whisper-base-fp16-ov +docker run --rm -u $(id -u):$(id -g) --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $PWD/models:/models openvino/model_server:weekly --pull --source_model OpenVINO/whisper-base-fp16-ov --model_repository_path /models --task speech2text --target_device GPU +docker run --rm -u $(id -u):$(id -g) -v $PWD/models:/models openvino/model_server:weekly --add_to_config --config_path /models/config.json --model_path OpenVINO/whisper-base-fp16-ov --model_name OpenVINO/whisper-base-fp16-ov ``` ::: :::: diff --git a/docs/deploying_server_docker.md b/docs/deploying_server_docker.md index 086e6d0a12..e2c51d4d79 100644 --- a/docs/deploying_server_docker.md +++ b/docs/deploying_server_docker.md @@ -27,7 +27,7 @@ or [RedHat 
Ecosystem Catalog](https://catalog.redhat.com/software/containers/int docker pull registry.connect.redhat.com/intel/openvino-model-server:latest ``` -> **NOTE**: You can also pull public image `openvino/model_server:latest-gpu` with development version of the model server, which is built from the main branch. It allow you to evaluate the latest features ahead of official releases. +> **NOTE**: You can also pull the public image `openvino/model_server:weekly` with a development version of the model server, which is built from the main branch. It allows you to evaluate the latest features ahead of official releases. #### Step 2. Prepare Data for Serving diff --git a/docs/pull_hf_models.md b/docs/pull_hf_models.md index bdf057fdde..77ab0dfb60 100644 --- a/docs/pull_hf_models.md +++ b/docs/pull_hf_models.md @@ -17,7 +17,7 @@ There is a special OVMS mode to pull the model from Hugging Face without startin :sync: docker **Required:** Docker Engine installed ```text -docker run $(id -u):$(id -g) --rm -v :/models:rw openvino/model_server:latest-gpu --pull --source_model --model_repository_path /models --model_name --target_device [--gguf_filename SPECIFIC_QUANTIZATION_FILENAME.gguf] --task [TASK_SPECIFIC_PARAMETERS] +docker run --user $(id -u):$(id -g) --rm -v <model_repository_path>:/models:rw openvino/model_server:weekly --pull --source_model <model_name_in_HF> --model_repository_path /models --model_name <external_model_name> --target_device <DEVICE> [--gguf_filename SPECIFIC_QUANTIZATION_FILENAME.gguf] --task <task> [TASK_SPECIFIC_PARAMETERS] ``` ::: @@ -63,7 +63,7 @@ Example for pulling GGUF model `unsloth/Llama-3.2-1B-Instruct-GGUF` with Q4_K_M **Required:** Docker Engine installed ```text -docker run $(id -u):$(id -g) --rm -v :/models:rw openvino/model_server:latest-gpu --pull --source_model "unsloth/Llama-3.2-1B-Instruct-GGUF" --model_repository_path /models --model_name unsloth/Llama-3.2-1B-Instruct-GGUF --task text_generation --gguf_filename Llama-3.2-1B-Instruct-Q4_K_M.gguf +docker run --user $(id -u):$(id -g) --rm -v <model_repository_path>:/models:rw openvino/model_server:weekly
--pull --source_model "unsloth/Llama-3.2-1B-Instruct-GGUF" --model_repository_path /models --model_name unsloth/Llama-3.2-1B-Instruct-GGUF --task text_generation --gguf_filename Llama-3.2-1B-Instruct-Q4_K_M.gguf ``` ::: diff --git a/extras/openshift_AI/ServingRuntime.yaml b/extras/openshift_AI/ServingRuntime.yaml index bd83ba6efb..c7d01e3889 100644 --- a/extras/openshift_AI/ServingRuntime.yaml +++ b/extras/openshift_AI/ServingRuntime.yaml @@ -21,7 +21,7 @@ spec: - --model_path=/mnt/models - --file_system_poll_wait_seconds=0 - --metrics_enable - image: docker.io/openvino/model_server:latest-gpu + image: docker.io/openvino/model_server:weekly startupProbe: periodSeconds: 5 failureThreshold: 9999