61 | 61 | from model_engine_server.common.dtos.tasks import SyncEndpointPredictV1Request, TaskStatus |
62 | 62 | from model_engine_server.common.resource_limits import validate_resource_requests |
63 | 63 | from model_engine_server.core.auth.authentication_repository import User |
| 64 | +from model_engine_server.core.config import infra_config |
64 | 65 | from model_engine_server.core.configmap import read_config_map |
65 | 66 | from model_engine_server.core.loggers import ( |
66 | 67 | LoggerTagKey, |
@@ -369,6 +370,10 @@ def __init__( |
369 | 370 | def check_docker_image_exists_for_image_tag( |
370 | 371 | self, framework_image_tag: str, repository_name: str |
371 | 372 | ): |
| 373 | + # Skip ECR validation for on-prem deployments - images are in local registry |
| 374 | + if infra_config().cloud_provider == "onprem": |
| 375 | + return |
| 376 | + |
372 | 377 | if not self.docker_repository.image_exists( |
373 | 378 | image_tag=framework_image_tag, |
374 | 379 | repository_name=repository_name, |
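
Note: a minimal sketch of how this on-prem skip could be exercised in a test. The patch target module path (llm_model_endpoint_use_cases) and the use_case fixture are assumptions based on the file being edited, not part of this PR.

from unittest import mock

def test_image_check_skipped_for_onprem(use_case):
    # With cloud_provider == "onprem", the ECR lookup should never run,
    # so the use case returns without touching the docker repository.
    with mock.patch(
        "model_engine_server.domain.use_cases.llm_model_endpoint_use_cases.infra_config"
    ) as mock_infra:
        mock_infra.return_value.cloud_provider = "onprem"
        use_case.docker_repository.image_exists = mock.Mock(return_value=False)
        use_case.check_docker_image_exists_for_image_tag("fake-tag", "fake-repo")
        use_case.docker_repository.image_exists.assert_not_called()
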
@@ -640,8 +645,13 @@ def load_model_weights_sub_commands_s3( |
640 | 645 | file_selection_str = '--include "*.model" --include "*.model.v*" --include "*.json" --include "*.safetensors" --include "*.txt" --exclude "optimizer*"' |
641 | 646 | if trust_remote_code: |
642 | 647 | file_selection_str += ' --include "*.py"' |
| 648 | + |
| 649 | + # Support for MinIO/on-prem S3-compatible storage via S3_ENDPOINT_URL env var |
| 650 | + endpoint_flag = ( |
| 651 | + '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)' |
| 652 | + ) |
643 | 653 | subcommands.append( |
644 | | - f"{s5cmd} --numworkers 512 cp --concurrency 10 {file_selection_str} {os.path.join(checkpoint_path, '*')} {final_weights_folder}" |
| 654 | + f"{s5cmd} {endpoint_flag} --numworkers 512 cp --concurrency 10 {file_selection_str} {os.path.join(checkpoint_path, '*')} {final_weights_folder}" |
645 | 655 | ) |
646 | 656 | return subcommands |
647 | 657 |
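
For illustration, roughly what the generated subcommand looks like with the new flag; the bucket and folder names here are made up. The $(if ...) substitution injects --endpoint-url only when S3_ENDPOINT_URL is set in the container, so behavior against AWS S3 is unchanged when the variable is absent.

import os

s5cmd = "s5cmd"
checkpoint_path = "s3://my-bucket/llama-7b"       # hypothetical checkpoint location
final_weights_folder = "model_files"
file_selection_str = '--include "*.safetensors"'  # abbreviated for the example
endpoint_flag = '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)'

# e.g. S3_ENDPOINT_URL=http://minio.minio.svc.cluster.local:9000 makes the flag expand to
# "--endpoint-url http://minio.minio.svc.cluster.local:9000"; otherwise it expands to nothing.
print(
    f"{s5cmd} {endpoint_flag} --numworkers 512 cp --concurrency 10 "
    f"{file_selection_str} {os.path.join(checkpoint_path, '*')} {final_weights_folder}"
)
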
@@ -693,8 +703,12 @@ def load_model_files_sub_commands_trt_llm( |
693 | 703 | and llm-engine/model-engine/model_engine_server/inference/tensorrt-llm/triton_model_repo/postprocessing/config.pbtxt |
694 | 704 | """ |
695 | 705 | if checkpoint_path.startswith("s3://"): |
| 706 | + # Support for MinIO/on-prem S3-compatible storage via S3_ENDPOINT_URL env var |
| 707 | + endpoint_flag = ( |
| 708 | + '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)' |
| 709 | + ) |
696 | 710 | subcommands = [ |
697 | | - f"./s5cmd --numworkers 512 cp --concurrency 50 {os.path.join(checkpoint_path, '*')} ./" |
| 711 | + f"./s5cmd {endpoint_flag} --numworkers 512 cp --concurrency 50 {os.path.join(checkpoint_path, '*')} ./" |
698 | 712 | ] |
699 | 713 | else: |
700 | 714 | subcommands.extend( |
@@ -1053,8 +1067,9 @@ async def create_vllm_bundle( |
1053 | 1067 | protocol="http", |
1054 | 1068 | readiness_initial_delay_seconds=10, |
1055 | 1069 | healthcheck_route="/health", |
1056 | | - predict_route="/predict", |
1057 | | - streaming_predict_route="/stream", |
| 1070 | + # vLLM 0.5+ uses OpenAI-compatible endpoints |
| 1071 | + predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" |
| 1072 | + streaming_predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" (streaming via same endpoint) |
1058 | 1073 | routes=[ |
1059 | 1074 | OPENAI_CHAT_COMPLETION_PATH, |
1060 | 1075 | OPENAI_COMPLETION_PATH, |
@@ -1135,8 +1150,9 @@ async def create_vllm_multinode_bundle( |
1135 | 1150 | protocol="http", |
1136 | 1151 | readiness_initial_delay_seconds=10, |
1137 | 1152 | healthcheck_route="/health", |
1138 | | - predict_route="/predict", |
1139 | | - streaming_predict_route="/stream", |
| 1153 | + # vLLM 0.5+ uses OpenAI-compatible endpoints |
| 1154 | + predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" |
| 1155 | + streaming_predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" (streaming via same endpoint) |
1140 | 1156 | routes=[OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH], |
1141 | 1157 | env=common_vllm_envs, |
1142 | 1158 | worker_command=worker_command, |
@@ -1937,18 +1953,42 @@ def model_output_to_completion_output( |
1937 | 1953 |
1938 | 1954 | elif model_content.inference_framework == LLMInferenceFramework.VLLM: |
1939 | 1955 | tokens = None |
1940 | | - if with_token_probs: |
1941 | | - tokens = [ |
1942 | | - TokenOutput( |
1943 | | - token=model_output["tokens"][index], |
1944 | | - log_prob=list(t.values())[0], |
1945 | | - ) |
1946 | | - for index, t in enumerate(model_output["log_probs"]) |
1947 | | - ] |
| 1956 | + # Handle OpenAI-compatible format (vLLM 0.5+) vs legacy format |
| 1957 | + if "choices" in model_output and model_output["choices"]: |
| 1958 | + # OpenAI-compatible format: {"choices": [{"text": "...", ...}], "usage": {...}} |
| 1959 | + choice = model_output["choices"][0] |
| 1960 | + text = choice.get("text", "") |
| 1961 | + usage = model_output.get("usage", {}) |
| 1962 | + num_prompt_tokens = usage.get("prompt_tokens", 0) |
| 1963 | + num_completion_tokens = usage.get("completion_tokens", 0) |
| 1964 | + # OpenAI format logprobs are in choice.logprobs |
| 1965 | + if with_token_probs and choice.get("logprobs"): |
| 1966 | + logprobs = choice["logprobs"] |
| 1967 | + if logprobs.get("tokens") and logprobs.get("token_logprobs"): |
| 1968 | + tokens = [ |
| 1969 | + TokenOutput( |
| 1970 | + token=logprobs["tokens"][i], |
| 1971 | + log_prob=logprobs["token_logprobs"][i] or 0.0, |
| 1972 | + ) |
| 1973 | + for i in range(len(logprobs["tokens"])) |
| 1974 | + ] |
| 1975 | + else: |
| 1976 | + # Legacy format: {"text": "...", "count_prompt_tokens": ..., ...} |
| 1977 | + text = model_output["text"] |
| 1978 | + num_prompt_tokens = model_output["count_prompt_tokens"] |
| 1979 | + num_completion_tokens = model_output["count_output_tokens"] |
| 1980 | + if with_token_probs and model_output.get("log_probs"): |
| 1981 | + tokens = [ |
| 1982 | + TokenOutput( |
| 1983 | + token=model_output["tokens"][index], |
| 1984 | + log_prob=list(t.values())[0], |
| 1985 | + ) |
| 1986 | + for index, t in enumerate(model_output["log_probs"]) |
| 1987 | + ] |
1948 | 1988 | return CompletionOutput( |
1949 | | - text=model_output["text"], |
1950 | | - num_prompt_tokens=model_output["count_prompt_tokens"], |
1951 | | - num_completion_tokens=model_output["count_output_tokens"], |
| 1989 | + text=text, |
| 1990 | + num_prompt_tokens=num_prompt_tokens, |
| 1991 | + num_completion_tokens=num_completion_tokens, |
1952 | 1992 | tokens=tokens, |
1953 | 1993 | ) |
1954 | 1994 | elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM: |
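
For reference, the two payload shapes this branch now distinguishes, with illustrative values (field names follow the OpenAI completions schema for the new path and the pre-existing vLLM server output for the legacy path):

# OpenAI-compatible (vLLM 0.5+) completion payload handled by the first branch:
openai_style_output = {
    "choices": [
        {
            "text": " world",
            "logprobs": {"tokens": [" world"], "token_logprobs": [-0.12]},
            "finish_reason": "stop",
        }
    ],
    "usage": {"prompt_tokens": 5, "completion_tokens": 1},
}

# Legacy payload handled by the else branch:
legacy_output = {
    "text": " world",
    "count_prompt_tokens": 5,
    "count_output_tokens": 1,
    "tokens": [" world"],
    "log_probs": [{" world": -0.12}],
}
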
@@ -2688,20 +2728,43 @@ async def _response_chunk_generator( |
2688 | 2728 | # VLLM |
2689 | 2729 | elif model_content.inference_framework == LLMInferenceFramework.VLLM: |
2690 | 2730 | token = None |
2691 | | - if request.return_token_log_probs: |
2692 | | - token = TokenOutput( |
2693 | | - token=result["result"]["text"], |
2694 | | - log_prob=list(result["result"]["log_probs"].values())[0], |
2695 | | - ) |
2696 | | - finished = result["result"]["finished"] |
2697 | | - num_prompt_tokens = result["result"]["count_prompt_tokens"] |
| 2731 | + vllm_output: dict = result["result"] |
| 2732 | + # Handle OpenAI-compatible streaming format (vLLM 0.5+) vs legacy format |
| 2733 | + if "choices" in vllm_output and vllm_output["choices"]: |
| 2734 | + # OpenAI streaming format: {"choices": [{"text": "...", "finish_reason": ...}], ...} |
| 2735 | + choice = vllm_output["choices"][0] |
| 2736 | + text = choice.get("text", "") |
| 2737 | + finished = choice.get("finish_reason") is not None |
| 2738 | + usage = vllm_output.get("usage", {}) |
| 2739 | + num_prompt_tokens = usage.get("prompt_tokens", 0) |
| 2740 | + num_completion_tokens = usage.get("completion_tokens", 0) |
| 2741 | + if request.return_token_log_probs and choice.get("logprobs"): |
| 2742 | + logprobs = choice["logprobs"] |
| 2743 | + if logprobs.get("tokens") and logprobs.get("token_logprobs"): |
| 2744 | + # Get the last token from the logprobs |
| 2745 | + idx = len(logprobs["tokens"]) - 1 |
| 2746 | + token = TokenOutput( |
| 2747 | + token=logprobs["tokens"][idx], |
| 2748 | + log_prob=logprobs["token_logprobs"][idx] or 0.0, |
| 2749 | + ) |
| 2750 | + else: |
| 2751 | + # Legacy format: {"text": "...", "finished": ..., ...} |
| 2752 | + text = vllm_output["text"] |
| 2753 | + finished = vllm_output["finished"] |
| 2754 | + num_prompt_tokens = vllm_output["count_prompt_tokens"] |
| 2755 | + num_completion_tokens = vllm_output["count_output_tokens"] |
| 2756 | + if request.return_token_log_probs and vllm_output.get("log_probs"): |
| 2757 | + token = TokenOutput( |
| 2758 | + token=vllm_output["text"], |
| 2759 | + log_prob=list(vllm_output["log_probs"].values())[0], |
| 2760 | + ) |
2698 | 2761 | yield CompletionStreamV1Response( |
2699 | 2762 | request_id=request_id, |
2700 | 2763 | output=CompletionStreamOutput( |
2701 | | - text=result["result"]["text"], |
| 2764 | + text=text, |
2702 | 2765 | finished=finished, |
2703 | 2766 | num_prompt_tokens=num_prompt_tokens if finished else None, |
2704 | | - num_completion_tokens=result["result"]["count_output_tokens"], |
| 2767 | + num_completion_tokens=num_completion_tokens, |
2705 | 2768 | token=token, |
2706 | 2769 | ), |
2707 | 2770 | ) |
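
Similarly, the streaming branch now accepts either chunk shape; the values below are illustrative. In the OpenAI-style stream, usage typically appears only on the final chunk, which is why the code falls back to 0 via .get():

# OpenAI-style streaming chunk handled by the first branch:
openai_stream_chunk = {
    "choices": [
        {
            "text": " world",
            "logprobs": {"tokens": [" world"], "token_logprobs": [-0.12]},
            "finish_reason": None,  # e.g. "stop" on the last chunk
        }
    ]
}

# Legacy streaming chunk handled by the else branch:
legacy_stream_chunk = {
    "text": " world",
    "finished": False,
    "count_prompt_tokens": 5,
    "count_output_tokens": 1,
    "log_probs": {" world": -0.12},
}
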
@@ -2750,12 +2813,14 @@ def validate_endpoint_supports_openai_completion( |
2750 | 2813 | f"The endpoint's inference framework ({endpoint_content.inference_framework}) does not support openai compatible completion." |
2751 | 2814 | ) |
2752 | 2815 |
2753 | | - if not isinstance( |
2754 | | - endpoint.record.current_model_bundle.flavor, RunnableImageLike |
2755 | | - ) or OPENAI_COMPLETION_PATH not in ( |
2756 | | - endpoint.record.current_model_bundle.flavor.extra_routes |
2757 | | - + endpoint.record.current_model_bundle.flavor.routes |
2758 | | - ): |
| 2816 | + if not isinstance(endpoint.record.current_model_bundle.flavor, RunnableImageLike): |
| 2817 | + raise EndpointUnsupportedRequestException( |
| 2818 | + "Endpoint does not support v2 openai compatible completion" |
| 2819 | + ) |
| 2820 | + |
| 2821 | + flavor = endpoint.record.current_model_bundle.flavor |
| 2822 | + all_routes = flavor.extra_routes + flavor.routes |
| 2823 | + if OPENAI_COMPLETION_PATH not in all_routes: |
2759 | 2824 | raise EndpointUnsupportedRequestException( |
2760 | 2825 | "Endpoint does not support v2 openai compatible completion" |
2761 | 2826 | ) |
@@ -3042,12 +3107,12 @@ def validate_endpoint_supports_chat_completion( |
3042 | 3107 | f"The endpoint's inference framework ({endpoint_content.inference_framework}) does not support chat completion." |
3043 | 3108 | ) |
3044 | 3109 |
3045 | | - if not isinstance( |
3046 | | - endpoint.record.current_model_bundle.flavor, RunnableImageLike |
3047 | | - ) or OPENAI_CHAT_COMPLETION_PATH not in ( |
3048 | | - endpoint.record.current_model_bundle.flavor.extra_routes |
3049 | | - + endpoint.record.current_model_bundle.flavor.routes |
3050 | | - ): |
| 3110 | + if not isinstance(endpoint.record.current_model_bundle.flavor, RunnableImageLike): |
| 3111 | + raise EndpointUnsupportedRequestException("Endpoint does not support chat completion") |
| 3112 | + |
| 3113 | + flavor = endpoint.record.current_model_bundle.flavor |
| 3114 | + all_routes = flavor.extra_routes + flavor.routes |
| 3115 | + if OPENAI_CHAT_COMPLETION_PATH not in all_routes: |
3051 | 3116 | raise EndpointUnsupportedRequestException("Endpoint does not support chat completion") |
3052 | 3117 |
3053 | 3118 |