8 changes: 6 additions & 2 deletions dockerfiles/pytorch/Dockerfile
@@ -32,8 +32,7 @@ RUN apt-get update && \
&& apt-get clean autoremove --yes \
&& rm -rf /var/lib/{apt,dpkg,cache,log}

# Copying only necessary files as filtered by .dockerignore
COPY . .
RUN mkdir -p /var/lib/dpkg && touch /var/lib/dpkg/status

# Set Python 3.11 as the default python version
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
@@ -47,6 +46,11 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
# Upgrade pip
RUN pip install --no-cache-dir --upgrade pip

COPY requirements.txt .
RUN pip install -r requirements.txt && rm -rf /root/.cache

# Copying only necessary files as filtered by .dockerignore
COPY . .
# Install wheel and setuptools
RUN pip install --no-cache-dir --upgrade pip ".[torch,st,diffusers]"

23 changes: 23 additions & 0 deletions requirements.txt
@@ -0,0 +1,23 @@
kenlm@ git+https://github.com/kpu/kenlm@ba83eafdce6553addd885ed3da461bb0d60f8df7
transformers[audio,sentencepiece,sklearn,vision]==4.51.3
huggingface_hub[hf_transfer,hf_xet]==0.31.1
Pillow
librosa
pyctcdecode>=0.3.0
phonemizer
ffmpeg
starlette
uvicorn
gunicorn
pandas
orjson
einops
timm
sentence_transformers==4.0.2
diffusers==0.33.1
accelerate==1.6.0
torch==2.5.1
torchvision
torchaudio
peft==0.15.1
psutil>=6.0.0
2 changes: 1 addition & 1 deletion scripts/entrypoint.sh
@@ -59,4 +59,4 @@ if [[ ! -z "${HF_MODEL_DIR}" ]]; then
fi

# Start the server
exec uvicorn webservice_starlette:app --host 0.0.0.0 --port ${PORT}
exec gunicorn webservice_starlette:app -k uvicorn.workers.UvicornWorker --workers ${WORKERS:-1} --bind 0.0.0.0:${PORT} --timeout 30
49 changes: 14 additions & 35 deletions setup.py
@@ -1,7 +1,19 @@
from __future__ import absolute_import

import os
from setuptools import find_packages, setup

lib_folder = os.path.dirname(os.path.realpath(__file__))
requirements_path = f"{lib_folder}/requirements.txt"
install_requires = []  # populated below from requirements.txt when the file is present
if os.path.isfile(requirements_path):
with open(requirements_path) as f:
install_requires = f.read().splitlines()

test_requirements = []
test_requirements_path = f"{lib_folder}/test-requirements.txt"
if os.path.isfile(test_requirements_path):
with open(test_requirements_path) as f:
test_requirements = f.read().splitlines()

# We don't declare our dependency on transformers here because we build with
# different packages for different variants

@@ -12,47 +24,14 @@
# ffmpeg: ffmpeg is required for audio processing. On Ubuntu it can be installed as follows: apt install ffmpeg
# libavcodec-extra : libavcodec-extra includes additional codecs for ffmpeg

install_requires = [
# Due to an error affecting kenlm and cmake (see https://github.com/kpu/kenlm/pull/464)
# Also see the transformers patch for it https://github.com/huggingface/transformers/pull/37091
"kenlm@git+https://github.com/kpu/kenlm@ba83eafdce6553addd885ed3da461bb0d60f8df7",
"transformers[sklearn,sentencepiece,audio,vision]==4.51.3",
"huggingface_hub[hf_transfer]==0.30.2",
# vision
"Pillow",
"librosa",
# speech + torchaudio
"pyctcdecode>=0.3.0",
"phonemizer",
"ffmpeg",
# web api
"starlette",
"uvicorn",
"pandas",
"orjson",
"einops",
]

extras = {}

extras["st"] = ["sentence_transformers==4.0.2"]
extras["diffusers"] = ["diffusers==0.33.1", "accelerate==1.6.0"]
# `peft` is listed in this extra rather than as a core dependency: PEFT requires `torch`,
# so having `peft` in core would install `torch` even when the `torch` extra is not specified.
extras["torch"] = ["torch==2.5.1", "torchvision", "torchaudio", "peft==0.15.1"]
extras["test"] = [
"pytest==7.2.1",
"pytest-xdist",
"parameterized",
"psutil",
"datasets",
"pytest-sugar",
"mock==2.0.0",
"docker",
"requests",
"tenacity",
]
extras["quality"] = ["isort", "ruff"]
extras["test"] = test_requirements
extras["inf2"] = ["optimum-neuron"]
extras["google"] = ["google-cloud-storage", "crcmod==1.7"]

8 changes: 6 additions & 2 deletions src/huggingface_inference_toolkit/async_utils.py
@@ -5,6 +5,8 @@
from anyio import Semaphore
from typing_extensions import ParamSpec

from huggingface_inference_toolkit.logging import logger

# To not have too many threads running (which could happen on too many concurrent
# requests), we limit it with a semaphore.
MAX_CONCURRENT_THREADS = 1
@@ -15,6 +17,8 @@

# moves blocking call to asyncio threadpool limited to 1 to not overload the system
# REF: https://stackoverflow.com/a/70929141
async def async_handler_call(handler: Callable[P, T], body: Dict[str, Any]) -> T:
async def async_call(handler: Callable[P, T], *args, **kwargs) -> T:
logger.info("Setting blocking call to async handler")
async with MAX_THREADS_GUARD:
return await anyio.to_thread.run_sync(functools.partial(handler, body))
logger.info("Async call semaphore passed")
return await anyio.to_thread.run_sync(handler, *args, **kwargs)
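As a usage sketch (not part of the diff; the route, the stand-in handler, and the body handling are assumptions), the renamed async_call helper would be awaited from a Starlette route: it acquires the one-slot semaphore and then runs the blocking handler on the anyio thread pool.

# Usage sketch only: assumes a Starlette app and a synchronous handler callable.
from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.routing import Route

from huggingface_inference_toolkit.async_utils import async_call

def handler(body):  # stand-in for HuggingFaceHandler.__call__
    return {"echo": body}

async def predict(request):
    body = await request.json()
    # waits on MAX_THREADS_GUARD, then runs handler(body) on the anyio thread pool
    prediction = await async_call(handler, body)
    return JSONResponse(prediction)

app = Starlette(routes=[Route("/predict", predict, methods=["POST"])])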
11 changes: 11 additions & 0 deletions src/huggingface_inference_toolkit/diffusers_utils.py
@@ -1,4 +1,5 @@
import importlib.util
import os
from typing import Union

from transformers.utils.import_utils import is_torch_bf16_gpu_available
@@ -63,6 +64,16 @@ def __call__(
kwargs.pop("num_images_per_prompt")
logger.warning("Sending num_images_per_prompt > 1 to pipeline is not supported. Using default value 1.")

if "num_inference_steps" not in kwargs:
default_num_steps = os.environ.get("DEFAULT_NUM_INFERENCE_STEPS")
if default_num_steps:
kwargs["num_inference_steps"] = int(default_num_steps)

if "guidance_scale" not in kwargs:
Contributor Author (inline comment): useful for SD 3.5 Turbo -> we want a guidance scale of 0 by default (i.e. when the user does not specify one) because the number of inference steps is too low; that way the generated images come out fine. (A sketch of the resulting defaults follows this file's diff.)

guidance_scale = os.environ.get("DEFAULT_GUIDANCE_SCALE")
if guidance_scale is not None:
kwargs["guidance_scale"] = float(guidance_scale)

if "target_size" in kwargs:
kwargs["height"] = kwargs["target_size"].pop("height")
kwargs["width"] = kwargs["target_size"].pop("width")
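To make the two new environment variables concrete, here is a minimal, standalone sketch of the defaulting logic added above (the env values and prompt are illustrative, not taken from the PR):

# Sketch of the env-var defaults added to the diffusers pipeline call.
import os

os.environ["DEFAULT_NUM_INFERENCE_STEPS"] = "4"  # illustrative value for a turbo model
os.environ["DEFAULT_GUIDANCE_SCALE"] = "0"       # per the author's comment above

kwargs = {"prompt": "an astronaut riding a horse"}  # user set no steps/guidance

if "num_inference_steps" not in kwargs:
    default_num_steps = os.environ.get("DEFAULT_NUM_INFERENCE_STEPS")
    if default_num_steps:
        kwargs["num_inference_steps"] = int(default_num_steps)

if "guidance_scale" not in kwargs:
    guidance_scale = os.environ.get("DEFAULT_GUIDANCE_SCALE")
    if guidance_scale is not None:
        kwargs["guidance_scale"] = float(guidance_scale)

print(kwargs)  # {'prompt': ..., 'num_inference_steps': 4, 'guidance_scale': 0.0}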
11 changes: 11 additions & 0 deletions src/huggingface_inference_toolkit/env_utils.py
@@ -1,3 +1,6 @@
import os


def strtobool(val: str) -> bool:
"""Convert a string representation of truth to True or False booleans.
True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
@@ -20,3 +23,11 @@ def strtobool(val: str) -> bool:
raise ValueError(
f"Invalid truth value, it should be a string but {val} was provided instead."
)


def api_inference_compat():
Contributor Author (inline comment): with this env var we intend to handle the small response differences between the API Inference widgets on the Hub and on the Endpoints UI. TODO: we should probably unify both widgets instead. (A usage sketch follows this file's diff.)

return strtobool(os.getenv("API_INFERENCE_COMPAT", "false"))


def ignore_custom_handler():
return strtobool(os.getenv("IGNORE_CUSTOM_HANDLER", "false"))
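A small usage sketch for the two helpers added here (example values only): both go through strtobool, so values such as "true" or "1" enable a flag, and the "false" default leaves it off.

# Example only: exercising the new env-var helpers.
import os

from huggingface_inference_toolkit.env_utils import api_inference_compat, ignore_custom_handler

os.environ["API_INFERENCE_COMPAT"] = "true"   # align responses with the Hub widgets
os.environ["IGNORE_CUSTOM_HANDLER"] = "1"     # skip any custom handler bundled with the model

assert api_inference_compat() is True
assert ignore_custom_handler() is True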
119 changes: 111 additions & 8 deletions src/huggingface_inference_toolkit/handler.py
@@ -1,12 +1,16 @@
import os
from pathlib import Path
from time import perf_counter
from typing import Any, Dict, Literal, Optional, Union

from huggingface_inference_toolkit import logging
from huggingface_inference_toolkit.const import HF_TRUST_REMOTE_CODE
from huggingface_inference_toolkit.sentence_transformers_utils import SENTENCE_TRANSFORMERS_TASKS
from huggingface_inference_toolkit.env_utils import api_inference_compat, ignore_custom_handler
from huggingface_inference_toolkit.logging import logger
from huggingface_inference_toolkit.utils import (
already_left,
check_and_register_custom_pipeline_from_directory,
get_pipeline,
should_discard_left,
)


@@ -19,24 +23,46 @@ class HuggingFaceHandler:
def __init__(
self, model_dir: Union[str, Path], task: Union[str, None] = None, framework: Literal["pt"] = "pt"
) -> None:
from huggingface_inference_toolkit.heavy_utils import get_pipeline
self.pipeline = get_pipeline(
model_dir=model_dir, # type: ignore
task=task, # type: ignore
framework=framework,
trust_remote_code=HF_TRUST_REMOTE_CODE,
)

def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
def __call__(self, data: Dict[str, Any]):
"""
Handles an inference request with input data and makes a prediction.
Args:
:data: (obj): the raw request body data.
:return: prediction output
"""
start = perf_counter()
pred = self._timed_call(data)
end = perf_counter()
logger.info("Inference duration: %.2f ms", (end - start) * 1000)
return pred

def _timed_call(self, data: Dict[str, Any]):
logger.debug("Calling HF default handler")
# import as late as possible to reduce the footprint
from huggingface_inference_toolkit.sentence_transformers_utils import SENTENCE_TRANSFORMERS_TASKS

inputs = data.pop("inputs", data)
parameters = data.pop("parameters", {})

# diffusers and sentence transformers pipelines do not have the `task` arg
if "handler_params" in data:
handler_params = data.pop("handler_params")
if should_discard_left():
request = handler_params.get("request")
if not request:
logger.warning("Cannot know if request caller already left, missing request handler param")
elif already_left(request):
logger.info("Discarding request as the caller already left")
return None

# diffusers and sentence transformers pipelines do not have the `task` arg
if not hasattr(self.pipeline, "task"):
# sentence transformers parameters not supported yet
if any(isinstance(self.pipeline, v) for v in SENTENCE_TRANSFORMERS_TASKS.values()):
@@ -101,9 +127,83 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
"or `candidateLabels`."
)

return (
self.pipeline(**inputs, **parameters) if isinstance(inputs, dict) else self.pipeline(inputs, **parameters) # type: ignore
)
if api_inference_compat():
if self.pipeline.task == "text-classification" and isinstance(inputs, str):
inputs = [inputs]
parameters.setdefault("top_k", os.environ.get("DEFAULT_TOP_K", 5))
if self.pipeline.task == "token-classification":
parameters.setdefault("aggregation_strategy", os.environ.get("DEFAULT_AGGREGATION_STRATEGY", "simple"))

logger.debug("Performing inference")
resp = self.pipeline(**inputs, **parameters) if isinstance(inputs, dict) else \
self.pipeline(inputs, **parameters)

if api_inference_compat():
if self.pipeline.task == "text-classification":
# We don't want to return {} but [{}] in any case
if isinstance(resp, list) and len(resp) > 0:
if not isinstance(resp[0], list):
return [resp]
return resp
if self.pipeline.task == "feature-extraction":
# If the library used is Transformers then the feature-extraction is returning the headless encoder
# outputs as embeddings. The shape is a 3D or 4D array
# [n_inputs, batch_size = 1, n_sentence_tokens, num_hidden_dim].
# Let's just discard the batch size dim that always seems to be 1 and return a 2D/3D array
# https://github.com/huggingface/transformers/blob/5c47d08b0d6835b8d8fc1c06d9a1bc71f6e78ace/src/transformers/pipelines/feature_extraction.py#L27
# for api inference (reason: mainly display)
new_resp = []
if isinstance(inputs, list):
if isinstance(resp, list) and len(resp) == len(inputs):
for it in resp:
# Batch size dim is the first it level, discard it
if isinstance(it, list) and len(it) == 1:
new_resp.append(it[0])
else:
logging.logger.warning("One of the output batch size differs from 1: %d", len(it))
return resp
return new_resp
else:
logging.logger.warning("Inputs and resp len differ (or resp is not a list, type %s)",
type(resp))
return resp
elif isinstance(inputs, str):
if isinstance(resp, list) and len(resp) == 1:
return resp[0]
else:
logging.logger.warning("The output batch size differs from 1: %d", len(resp))
return resp
else:
logging.logger.warning("Output unexpected type %s", type(resp))
return resp
if self.pipeline.task == "image-segmentation":
if isinstance(resp, list):
new_resp = []
for el in resp:
if isinstance(el, dict) and el.get("score") is None:
el["score"] = 1
new_resp.append(el)
resp = new_resp
if self.pipeline.task == "zero-shot-classification":
try:
if isinstance(resp, dict):
if 'labels' in resp and 'scores' in resp:
labels = resp['labels']
scores = resp['scores']
if len(labels) == len(scores):
new_resp = []
for label, score in zip(labels, scores, strict=True):
new_resp.append({"label": label, "score": score})
resp = new_resp
else:
raise Exception("labels and scores do not have the same len, {} != {}".format(
len(labels), len(scores)))
else:
raise Exception("Missing labels or scores key in response dict {}".format(resp))
except Exception as e:
logging.logger.warning("Unable to remap response for api inference compat")
logging.logger.exception(e)
return resp


class VertexAIHandler(HuggingFaceHandler):
@@ -149,7 +249,10 @@ def get_inference_handler_either_custom_or_default_handler(model_dir: Path, task
Returns:
InferenceHandler: The appropriate inference handler based on the given model directory and task.
"""
custom_pipeline = check_and_register_custom_pipeline_from_directory(model_dir)
if ignore_custom_handler():
custom_pipeline = None
else:
custom_pipeline = check_and_register_custom_pipeline_from_directory(model_dir)
if custom_pipeline is not None:
return custom_pipeline

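To illustrate the API_INFERENCE_COMPAT remapping above, a hedged request/response sketch (the model directory, task, and scores are assumptions, not taken from the PR): the handler receives the usual {"inputs": ..., "parameters": ...} body, and with the compat flag set the zero-shot-classification dict returned by transformers is flattened into a list of {"label", "score"} entries.

# Illustrative only: model_dir, task and scores are assumptions.
import os

from huggingface_inference_toolkit.handler import HuggingFaceHandler

os.environ["API_INFERENCE_COMPAT"] = "true"

handler = HuggingFaceHandler(model_dir="/repository", task="zero-shot-classification")

body = {
    "inputs": "I love this keyboard",
    "parameters": {"candidate_labels": ["positive", "negative"]},
}
resp = handler(body)
# transformers normally returns a dict for a single input:
#   {"sequence": "...", "labels": [...], "scores": [...]}
# with API_INFERENCE_COMPAT=true the handler remaps it to e.g.:
#   [{"label": "positive", "score": 0.98}, {"label": "negative", "score": 0.02}]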