61 | 61 | from model_engine_server.common.dtos.tasks import SyncEndpointPredictV1Request, TaskStatus |
62 | 62 | from model_engine_server.common.resource_limits import validate_resource_requests |
63 | 63 | from model_engine_server.core.auth.authentication_repository import User |
| 64 | +from model_engine_server.core.config import infra_config |
64 | 65 | from model_engine_server.core.configmap import read_config_map |
65 | 66 | from model_engine_server.core.loggers import ( |
66 | 67 | LoggerTagKey, |
@@ -369,6 +370,10 @@ def __init__( |
369 | 370 | def check_docker_image_exists_for_image_tag( |
370 | 371 | self, framework_image_tag: str, repository_name: str |
371 | 372 | ): |
| 373 | + # Skip ECR validation for on-prem deployments - images are in local registry |
| 374 | + if infra_config().cloud_provider == "onprem": |
| 375 | + return |
| 376 | + |
372 | 377 | if not self.docker_repository.image_exists( |
373 | 378 | image_tag=framework_image_tag, |
374 | 379 | repository_name=repository_name, |
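
Note: a minimal sketch of how this on-prem skip could be exercised in a test. The patch target module path (llm_model_endpoint_use_cases) and the use_case fixture are assumptions based on the file being edited, not part of this PR.

from unittest import mock

def test_image_check_skipped_for_onprem(use_case):
    # With cloud_provider == "onprem", the ECR lookup should never run,
    # so the use case returns without touching the docker repository.
    with mock.patch(
        "model_engine_server.domain.use_cases.llm_model_endpoint_use_cases.infra_config"
    ) as mock_infra:
        mock_infra.return_value.cloud_provider = "onprem"
        use_case.docker_repository.image_exists = mock.Mock(return_value=False)
        use_case.check_docker_image_exists_for_image_tag("fake-tag", "fake-repo")
        use_case.docker_repository.image_exists.assert_not_called()
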
@@ -640,8 +645,13 @@ def load_model_weights_sub_commands_s3( |
640 | 645 | file_selection_str = '--include "*.model" --include "*.model.v*" --include "*.json" --include "*.safetensors" --include "*.txt" --exclude "optimizer*"' |
641 | 646 | if trust_remote_code: |
642 | 647 | file_selection_str += ' --include "*.py"' |
| 648 | + |
| 649 | + # Support for MinIO/on-prem S3-compatible storage via S3_ENDPOINT_URL env var |
| 650 | + endpoint_flag = ( |
| 651 | + '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)' |
| 652 | + ) |
643 | 653 | subcommands.append( |
644 | | - f"{s5cmd} --numworkers 512 cp --concurrency 10 {file_selection_str} {os.path.join(checkpoint_path, '*')} {final_weights_folder}" |
| 654 | + f"{s5cmd} {endpoint_flag} --numworkers 512 cp --concurrency 10 {file_selection_str} {os.path.join(checkpoint_path, '*')} {final_weights_folder}" |
645 | 655 | ) |
646 | 656 | return subcommands |
647 | 657 |
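
For illustration, roughly what the generated subcommand looks like with the new flag; the bucket and folder names here are made up. The $(if ...) substitution injects --endpoint-url only when S3_ENDPOINT_URL is set in the container, so behavior against AWS S3 is unchanged when the variable is absent.

import os

s5cmd = "s5cmd"
checkpoint_path = "s3://my-bucket/llama-7b"       # hypothetical checkpoint location
final_weights_folder = "model_files"
file_selection_str = '--include "*.safetensors"'  # abbreviated for the example
endpoint_flag = '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)'

# e.g. S3_ENDPOINT_URL=http://minio.minio.svc.cluster.local:9000 makes the flag expand to
# "--endpoint-url http://minio.minio.svc.cluster.local:9000"; otherwise it expands to nothing.
print(
    f"{s5cmd} {endpoint_flag} --numworkers 512 cp --concurrency 10 "
    f"{file_selection_str} {os.path.join(checkpoint_path, '*')} {final_weights_folder}"
)
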
@@ -693,8 +703,12 @@ def load_model_files_sub_commands_trt_llm( |
693 | 703 | and llm-engine/model-engine/model_engine_server/inference/tensorrt-llm/triton_model_repo/postprocessing/config.pbtxt |
694 | 704 | """ |
695 | 705 | if checkpoint_path.startswith("s3://"): |
| 706 | + # Support for MinIO/on-prem S3-compatible storage via S3_ENDPOINT_URL env var |
| 707 | + endpoint_flag = ( |
| 708 | + '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)' |
| 709 | + ) |
696 | 710 | subcommands = [ |
697 | | - f"./s5cmd --numworkers 512 cp --concurrency 50 {os.path.join(checkpoint_path, '*')} ./" |
| 711 | + f"./s5cmd {endpoint_flag} --numworkers 512 cp --concurrency 50 {os.path.join(checkpoint_path, '*')} ./" |
698 | 712 | ] |
699 | 713 | else: |
700 | 714 | subcommands.extend( |
@@ -1053,8 +1067,9 @@ async def create_vllm_bundle( |
1053 | 1067 | protocol="http", |
1054 | 1068 | readiness_initial_delay_seconds=10, |
1055 | 1069 | healthcheck_route="/health", |
1056 | | - predict_route="/predict", |
1057 | | - streaming_predict_route="/stream", |
| 1070 | + # vLLM 0.5+ uses OpenAI-compatible endpoints |
| 1071 | + predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" |
| 1072 | + streaming_predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" (streaming via same endpoint) |
1058 | 1073 | routes=[ |
1059 | 1074 | OPENAI_CHAT_COMPLETION_PATH, |
1060 | 1075 | OPENAI_COMPLETION_PATH, |
@@ -1135,8 +1150,9 @@ async def create_vllm_multinode_bundle( |
1135 | 1150 | protocol="http", |
1136 | 1151 | readiness_initial_delay_seconds=10, |
1137 | 1152 | healthcheck_route="/health", |
1138 | | - predict_route="/predict", |
1139 | | - streaming_predict_route="/stream", |
| 1153 | + # vLLM 0.5+ uses OpenAI-compatible endpoints |
| 1154 | + predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" |
| 1155 | + streaming_predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" (streaming via same endpoint) |
1140 | 1156 | routes=[OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH], |
1141 | 1157 | env=common_vllm_envs, |
1142 | 1158 | worker_command=worker_command, |
@@ -1937,18 +1953,42 @@ def model_output_to_completion_output( |
1937 | 1953 |
1938 | 1954 | elif model_content.inference_framework == LLMInferenceFramework.VLLM: |
1939 | 1955 | tokens = None |
1940 | | - if with_token_probs: |
1941 | | - tokens = [ |
1942 | | - TokenOutput( |
1943 | | - token=model_output["tokens"][index], |
1944 | | - log_prob=list(t.values())[0], |
1945 | | - ) |
1946 | | - for index, t in enumerate(model_output["log_probs"]) |
1947 | | - ] |
| 1956 | + # Handle OpenAI-compatible format (vLLM 0.5+) vs legacy format |
| 1957 | + if "choices" in model_output and model_output["choices"]: |
| 1958 | + # OpenAI-compatible format: {"choices": [{"text": "...", ...}], "usage": {...}} |
| 1959 | + choice = model_output["choices"][0] |
| 1960 | + text = choice.get("text", "") |
| 1961 | + usage = model_output.get("usage", {}) |
| 1962 | + num_prompt_tokens = usage.get("prompt_tokens", 0) |
| 1963 | + num_completion_tokens = usage.get("completion_tokens", 0) |
| 1964 | + # OpenAI format logprobs are in choice.logprobs |
| 1965 | + if with_token_probs and choice.get("logprobs"): |
| 1966 | + logprobs = choice["logprobs"] |
| 1967 | + if logprobs.get("tokens") and logprobs.get("token_logprobs"): |
| 1968 | + tokens = [ |
| 1969 | + TokenOutput( |
| 1970 | + token=logprobs["tokens"][i], |
| 1971 | + log_prob=logprobs["token_logprobs"][i] or 0.0, |
| 1972 | + ) |
| 1973 | + for i in range(len(logprobs["tokens"])) |
| 1974 | + ] |
| 1975 | + else: |
| 1976 | + # Legacy format: {"text": "...", "count_prompt_tokens": ..., ...} |
| 1977 | + text = model_output["text"] |
| 1978 | + num_prompt_tokens = model_output["count_prompt_tokens"] |
| 1979 | + num_completion_tokens = model_output["count_output_tokens"] |
| 1980 | + if with_token_probs and model_output.get("log_probs"): |
| 1981 | + tokens = [ |
| 1982 | + TokenOutput( |
| 1983 | + token=model_output["tokens"][index], |
| 1984 | + log_prob=list(t.values())[0], |
| 1985 | + ) |
| 1986 | + for index, t in enumerate(model_output["log_probs"]) |
| 1987 | + ] |
1948 | 1988 | return CompletionOutput( |
1949 | | - text=model_output["text"], |
1950 | | - num_prompt_tokens=model_output["count_prompt_tokens"], |
1951 | | - num_completion_tokens=model_output["count_output_tokens"], |
| 1989 | + text=text, |
| 1990 | + num_prompt_tokens=num_prompt_tokens, |
| 1991 | + num_completion_tokens=num_completion_tokens, |
1952 | 1992 | tokens=tokens, |
1953 | 1993 | ) |
1954 | 1994 | elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM: |
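
For reference, the two payload shapes this branch now distinguishes, with illustrative values (field names follow the OpenAI completions schema for the new path and the pre-existing vLLM server output for the legacy path):

# OpenAI-compatible (vLLM 0.5+) completion payload handled by the first branch:
openai_style_output = {
    "choices": [
        {
            "text": " world",
            "logprobs": {"tokens": [" world"], "token_logprobs": [-0.12]},
            "finish_reason": "stop",
        }
    ],
    "usage": {"prompt_tokens": 5, "completion_tokens": 1},
}

# Legacy payload handled by the else branch:
legacy_output = {
    "text": " world",
    "count_prompt_tokens": 5,
    "count_output_tokens": 1,
    "tokens": [" world"],
    "log_probs": [{" world": -0.12}],
}
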
@@ -2688,20 +2728,43 @@ async def _response_chunk_generator( |
2688 | 2728 | # VLLM |
2689 | 2729 | elif model_content.inference_framework == LLMInferenceFramework.VLLM: |
2690 | 2730 | token = None |
2691 | | - if request.return_token_log_probs: |
2692 | | - token = TokenOutput( |
2693 | | - token=result["result"]["text"], |
2694 | | - log_prob=list(result["result"]["log_probs"].values())[0], |
2695 | | - ) |
2696 | | - finished = result["result"]["finished"] |
2697 | | - num_prompt_tokens = result["result"]["count_prompt_tokens"] |
| 2731 | + vllm_output: dict = result["result"] |
| 2732 | + # Handle OpenAI-compatible streaming format (vLLM 0.5+) vs legacy format |
| 2733 | + if "choices" in vllm_output and vllm_output["choices"]: |
| 2734 | + # OpenAI streaming format: {"choices": [{"text": "...", "finish_reason": ...}], ...} |
| 2735 | + choice = vllm_output["choices"][0] |
| 2736 | + text = choice.get("text", "") |
| 2737 | + finished = choice.get("finish_reason") is not None |
| 2738 | + usage = vllm_output.get("usage", {}) |
| 2739 | + num_prompt_tokens = usage.get("prompt_tokens", 0) |
| 2740 | + num_completion_tokens = usage.get("completion_tokens", 0) |
| 2741 | + if request.return_token_log_probs and choice.get("logprobs"): |
| 2742 | + logprobs = choice["logprobs"] |
| 2743 | + if logprobs.get("tokens") and logprobs.get("token_logprobs"): |
| 2744 | + # Get the last token from the logprobs |
| 2745 | + idx = len(logprobs["tokens"]) - 1 |
| 2746 | + token = TokenOutput( |
| 2747 | + token=logprobs["tokens"][idx], |
| 2748 | + log_prob=logprobs["token_logprobs"][idx] or 0.0, |
| 2749 | + ) |
| 2750 | + else: |
| 2751 | + # Legacy format: {"text": "...", "finished": ..., ...} |
| 2752 | + text = vllm_output["text"] |
| 2753 | + finished = vllm_output["finished"] |
| 2754 | + num_prompt_tokens = vllm_output["count_prompt_tokens"] |
| 2755 | + num_completion_tokens = vllm_output["count_output_tokens"] |
| 2756 | + if request.return_token_log_probs and vllm_output.get("log_probs"): |
| 2757 | + token = TokenOutput( |
| 2758 | + token=vllm_output["text"], |
| 2759 | + log_prob=list(vllm_output["log_probs"].values())[0], |
| 2760 | + ) |
2698 | 2761 | yield CompletionStreamV1Response( |
2699 | 2762 | request_id=request_id, |
2700 | 2763 | output=CompletionStreamOutput( |
2701 | | - text=result["result"]["text"], |
| 2764 | + text=text, |
2702 | 2765 | finished=finished, |
2703 | 2766 | num_prompt_tokens=num_prompt_tokens if finished else None, |
2704 | | - num_completion_tokens=result["result"]["count_output_tokens"], |
| 2767 | + num_completion_tokens=num_completion_tokens, |
2705 | 2768 | token=token, |
2706 | 2769 | ), |
2707 | 2770 | ) |
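
Similarly, the streaming branch now accepts either chunk shape; the values below are illustrative. In the OpenAI-style stream, usage typically appears only on the final chunk, which is why the code falls back to 0 via .get():

# OpenAI-style streaming chunk handled by the first branch:
openai_stream_chunk = {
    "choices": [
        {
            "text": " world",
            "logprobs": {"tokens": [" world"], "token_logprobs": [-0.12]},
            "finish_reason": None,  # e.g. "stop" on the last chunk
        }
    ]
}

# Legacy streaming chunk handled by the else branch:
legacy_stream_chunk = {
    "text": " world",
    "finished": False,
    "count_prompt_tokens": 5,
    "count_output_tokens": 1,
    "log_probs": {" world": -0.12},
}
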
@@ -2750,12 +2813,14 @@ def validate_endpoint_supports_openai_completion( |
2750 | 2813 | f"The endpoint's inference framework ({endpoint_content.inference_framework}) does not support openai compatible completion." |
2751 | 2814 | ) |
2752 | 2815 |
2753 | | - if not isinstance( |
2754 | | - endpoint.record.current_model_bundle.flavor, RunnableImageLike |
2755 | | - ) or OPENAI_COMPLETION_PATH not in ( |
2756 | | - endpoint.record.current_model_bundle.flavor.extra_routes |
2757 | | - + endpoint.record.current_model_bundle.flavor.routes |
2758 | | - ): |
| 2816 | + if not isinstance(endpoint.record.current_model_bundle.flavor, RunnableImageLike): |
| 2817 | + raise EndpointUnsupportedRequestException( |
| 2818 | + "Endpoint does not support v2 openai compatible completion" |
| 2819 | + ) |
| 2820 | + |
| 2821 | + flavor = endpoint.record.current_model_bundle.flavor |
| 2822 | + all_routes = flavor.extra_routes + flavor.routes |
| 2823 | + if OPENAI_COMPLETION_PATH not in all_routes: |
2759 | 2824 | raise EndpointUnsupportedRequestException( |
2760 | 2825 | "Endpoint does not support v2 openai compatible completion" |
2761 | 2826 | ) |
@@ -3042,12 +3107,12 @@ def validate_endpoint_supports_chat_completion( |
3042 | 3107 | f"The endpoint's inference framework ({endpoint_content.inference_framework}) does not support chat completion." |
3043 | 3108 | ) |
3044 | 3109 |
3045 | | - if not isinstance( |
3046 | | - endpoint.record.current_model_bundle.flavor, RunnableImageLike |
3047 | | - ) or OPENAI_CHAT_COMPLETION_PATH not in ( |
3048 | | - endpoint.record.current_model_bundle.flavor.extra_routes |
3049 | | - + endpoint.record.current_model_bundle.flavor.routes |
3050 | | - ): |
| 3110 | + if not isinstance(endpoint.record.current_model_bundle.flavor, RunnableImageLike): |
| 3111 | + raise EndpointUnsupportedRequestException("Endpoint does not support chat completion") |
| 3112 | + |
| 3113 | + flavor = endpoint.record.current_model_bundle.flavor |
| 3114 | + all_routes = flavor.extra_routes + flavor.routes |
| 3115 | + if OPENAI_CHAT_COMPLETION_PATH not in all_routes: |
3051 | 3116 | raise EndpointUnsupportedRequestException("Endpoint does not support chat completion") |
3052 | 3117 |
3053 | 3118 |