8 changes: 6 additions & 2 deletions dockerfiles/pytorch/Dockerfile
@@ -32,8 +32,7 @@ RUN apt-get update && \
&& apt-get clean autoremove --yes \
&& rm -rf /var/lib/{apt,dpkg,cache,log}

# Copying only necessary files as filtered by .dockerignore
COPY . .
RUN mkdir -p /var/lib/dpkg && touch /var/lib/dpkg/status

# Set Python 3.11 as the default python version
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
@@ -47,6 +46,11 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
# Upgrade pip
RUN pip install --no-cache-dir --upgrade pip

COPY requirements.txt .
RUN pip install -r requirements.txt && rm -rf /root/.cache

# Copying only necessary files as filtered by .dockerignore
COPY . .
# Install wheel and setuptools
RUN pip install --no-cache-dir --upgrade pip ".[torch,st,diffusers]"

23 changes: 23 additions & 0 deletions requirements.txt
@@ -0,0 +1,23 @@
kenlm@ git+https://github.com/kpu/kenlm@ba83eafdce6553addd885ed3da461bb0d60f8df7
transformers[audio,sentencepiece,sklearn,vision]==4.51.3
huggingface_hub[hf_transfer,hf_xet]==0.31.1
Pillow
librosa
pyctcdecode>=0.3.0
phonemizer
ffmpeg
starlette
uvicorn
gunicorn
pandas
orjson
einops
timm
sentence_transformers==4.0.2
diffusers==0.33.1
accelerate==1.6.0
torch==2.5.1
torchvision
torchaudio
peft==0.15.1
psutil>=6.0.0
2 changes: 1 addition & 1 deletion scripts/entrypoint.sh
@@ -59,4 +59,4 @@ if [[ ! -z "${HF_MODEL_DIR}" ]]; then
fi

# Start the server
exec uvicorn webservice_starlette:app --host 0.0.0.0 --port ${PORT}
exec gunicorn webservice_starlette:app -k uvicorn.workers.UvicornWorker --workers ${WORKERS:-1} --bind 0.0.0.0:${PORT} --timeout 30
49 changes: 14 additions & 35 deletions setup.py
@@ -1,7 +1,19 @@
from __future__ import absolute_import

import os
from setuptools import find_packages, setup

lib_folder = os.path.dirname(os.path.realpath(__file__))
requirements_path = f"{lib_folder}/requirements.txt"
install_requires = []  # populated below from requirements.txt when the file is present
if os.path.isfile(requirements_path):
with open(requirements_path) as f:
install_requires = f.read().splitlines()

test_requirements = []
test_requirements_path = f"{lib_folder}/test-requirements.txt"
if os.path.isfile(test_requirements_path):
with open(test_requirements_path) as f:
test_requirements = f.read().splitlines()

# We don't declare our dependency on transformers here because we build with
# different packages for different variants

@@ -12,47 +24,14 @@
# ffmpeg: ffmpeg is required for audio processing. On Ubuntu it can be installed as follows: apt install ffmpeg
# libavcodec-extra : libavcodec-extra includes additional codecs for ffmpeg

install_requires = [
# Due to an error affecting kenlm and cmake (see https://github.com/kpu/kenlm/pull/464)
# Also see the transformers patch for it https://github.com/huggingface/transformers/pull/37091
"kenlm@git+https://github.com/kpu/kenlm@ba83eafdce6553addd885ed3da461bb0d60f8df7",
"transformers[sklearn,sentencepiece,audio,vision]==4.51.3",
"huggingface_hub[hf_transfer]==0.30.2",
# vision
"Pillow",
"librosa",
# speech + torchaudio
"pyctcdecode>=0.3.0",
"phonemizer",
"ffmpeg",
# web api
"starlette",
"uvicorn",
"pandas",
"orjson",
"einops",
]

extras = {}

extras["st"] = ["sentence_transformers==4.0.2"]
extras["diffusers"] = ["diffusers==0.33.1", "accelerate==1.6.0"]
# `peft` is listed in this extra rather than as a core dependency: PEFT requires `torch`,
# so having `peft` in core would install `torch` even when the `torch` extra is not specified.
extras["torch"] = ["torch==2.5.1", "torchvision", "torchaudio", "peft==0.15.1"]
extras["test"] = [
"pytest==7.2.1",
"pytest-xdist",
"parameterized",
"psutil",
"datasets",
"pytest-sugar",
"mock==2.0.0",
"docker",
"requests",
"tenacity",
]
extras["quality"] = ["isort", "ruff"]
extras["test"] = test_requirements
extras["inf2"] = ["optimum-neuron"]
extras["google"] = ["google-cloud-storage", "crcmod==1.7"]

8 changes: 6 additions & 2 deletions src/huggingface_inference_toolkit/async_utils.py
@@ -5,6 +5,8 @@
from anyio import Semaphore
from typing_extensions import ParamSpec

from huggingface_inference_toolkit.logging import logger

# To not have too many threads running (which could happen on too many concurrent
# requests), we limit it with a semaphore.
MAX_CONCURRENT_THREADS = 1
@@ -15,6 +17,8 @@

# moves blocking call to asyncio threadpool limited to 1 to not overload the system
# REF: https://stackoverflow.com/a/70929141
async def async_handler_call(handler: Callable[P, T], body: Dict[str, Any]) -> T:
async def async_call(handler: Callable[P, T], *args, **kwargs) -> T:
logger.info("Setting blocking call to async handler")
async with MAX_THREADS_GUARD:
return await anyio.to_thread.run_sync(functools.partial(handler, body))
logger.info("Async call semaphore passed")
return await anyio.to_thread.run_sync(handler, *args, **kwargs)
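As a usage sketch (not part of the diff; the route, the stand-in handler, and the body handling are assumptions), the renamed async_call helper would be awaited from a Starlette route: it acquires the one-slot semaphore and then runs the blocking handler on the anyio thread pool.

# Usage sketch only: assumes a Starlette app and a synchronous handler callable.
from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.routing import Route

from huggingface_inference_toolkit.async_utils import async_call

def handler(body):  # stand-in for HuggingFaceHandler.__call__
    return {"echo": body}

async def predict(request):
    body = await request.json()
    # waits on MAX_THREADS_GUARD, then runs handler(body) on the anyio thread pool
    prediction = await async_call(handler, body)
    return JSONResponse(prediction)

app = Starlette(routes=[Route("/predict", predict, methods=["POST"])])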
11 changes: 11 additions & 0 deletions src/huggingface_inference_toolkit/diffusers_utils.py
@@ -1,4 +1,5 @@
import importlib.util
import os
from typing import Union

from transformers.utils.import_utils import is_torch_bf16_gpu_available
@@ -63,6 +64,16 @@ def __call__(
kwargs.pop("num_images_per_prompt")
logger.warning("Sending num_images_per_prompt > 1 to pipeline is not supported. Using default value 1.")

if "num_inference_steps" not in kwargs:
default_num_steps = os.environ.get("DEFAULT_NUM_INFERENCE_STEPS")
if default_num_steps:
kwargs["num_inference_steps"] = int(default_num_steps)

if "guidance_scale" not in kwargs:
Contributor Author (inline comment): useful for SD 3.5 Turbo -> we want a guidance scale of 0 by default (i.e. when the user does not specify one) because the number of inference steps is too low; that way the generated images come out fine. (A sketch of the resulting defaults follows this file's diff.)

guidance_scale = os.environ.get("DEFAULT_GUIDANCE_SCALE")
if guidance_scale is not None:
kwargs["guidance_scale"] = float(guidance_scale)

if "target_size" in kwargs:
kwargs["height"] = kwargs["target_size"].pop("height")
kwargs["width"] = kwargs["target_size"].pop("width")
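To make the two new environment variables concrete, here is a minimal, standalone sketch of the defaulting logic added above (the env values and prompt are illustrative, not taken from the PR):

# Sketch of the env-var defaults added to the diffusers pipeline call.
import os

os.environ["DEFAULT_NUM_INFERENCE_STEPS"] = "4"  # illustrative value for a turbo model
os.environ["DEFAULT_GUIDANCE_SCALE"] = "0"       # per the author's comment above

kwargs = {"prompt": "an astronaut riding a horse"}  # user set no steps/guidance

if "num_inference_steps" not in kwargs:
    default_num_steps = os.environ.get("DEFAULT_NUM_INFERENCE_STEPS")
    if default_num_steps:
        kwargs["num_inference_steps"] = int(default_num_steps)

if "guidance_scale" not in kwargs:
    guidance_scale = os.environ.get("DEFAULT_GUIDANCE_SCALE")
    if guidance_scale is not None:
        kwargs["guidance_scale"] = float(guidance_scale)

print(kwargs)  # {'prompt': ..., 'num_inference_steps': 4, 'guidance_scale': 0.0}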
11 changes: 11 additions & 0 deletions src/huggingface_inference_toolkit/env_utils.py
@@ -1,3 +1,6 @@
import os


def strtobool(val: str) -> bool:
"""Convert a string representation of truth to True or False booleans.
True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
@@ -20,3 +23,11 @@ def strtobool(val: str) -> bool:
raise ValueError(
f"Invalid truth value, it should be a string but {val} was provided instead."
)


def api_inference_compat():
Contributor Author (inline comment): with this env var we intend to handle the small response differences between the API Inference widgets on the Hub and on the Endpoints UI. TODO: we should probably unify both widgets instead. (A usage sketch follows this file's diff.)

return strtobool(os.getenv("API_INFERENCE_COMPAT", "false"))


def ignore_custom_handler():
return strtobool(os.getenv("IGNORE_CUSTOM_HANDLER", "false"))
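A small usage sketch for the two helpers added here (example values only): both go through strtobool, so values such as "true" or "1" enable a flag, and the "false" default leaves it off.

# Example only: exercising the new env-var helpers.
import os

from huggingface_inference_toolkit.env_utils import api_inference_compat, ignore_custom_handler

os.environ["API_INFERENCE_COMPAT"] = "true"   # align responses with the Hub widgets
os.environ["IGNORE_CUSTOM_HANDLER"] = "1"     # skip any custom handler bundled with the model

assert api_inference_compat() is True
assert ignore_custom_handler() is True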
119 changes: 111 additions & 8 deletions src/huggingface_inference_toolkit/handler.py
@@ -1,12 +1,16 @@
import os
from pathlib import Path
from time import perf_counter
from typing import Any, Dict, Literal, Optional, Union

from huggingface_inference_toolkit import logging
from huggingface_inference_toolkit.const import HF_TRUST_REMOTE_CODE
from huggingface_inference_toolkit.sentence_transformers_utils import SENTENCE_TRANSFORMERS_TASKS
from huggingface_inference_toolkit.env_utils import api_inference_compat, ignore_custom_handler
from huggingface_inference_toolkit.logging import logger
from huggingface_inference_toolkit.utils import (
already_left,
check_and_register_custom_pipeline_from_directory,
get_pipeline,
should_discard_left,
)


@@ -19,24 +23,46 @@ class HuggingFaceHandler:
def __init__(
self, model_dir: Union[str, Path], task: Union[str, None] = None, framework: Literal["pt"] = "pt"
) -> None:
from huggingface_inference_toolkit.heavy_utils import get_pipeline
self.pipeline = get_pipeline(
model_dir=model_dir, # type: ignore
task=task, # type: ignore
framework=framework,
trust_remote_code=HF_TRUST_REMOTE_CODE,
)

def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
def __call__(self, data: Dict[str, Any]):
"""
Handles an inference request with input data and makes a prediction.
Args:
:data: (obj): the raw request body data.
:return: prediction output
"""
start = perf_counter()
pred = self._timed_call(data)
end = perf_counter()
logger.info("Inference duration: %.2f ms", (end - start) * 1000)
return pred

def _timed_call(self, data: Dict[str, Any]):
logger.debug("Calling HF default handler")
# import as late as possible to reduce the footprint
from huggingface_inference_toolkit.sentence_transformers_utils import SENTENCE_TRANSFORMERS_TASKS

inputs = data.pop("inputs", data)
parameters = data.pop("parameters", {})

# diffusers and sentence transformers pipelines do not have the `task` arg
if "handler_params" in data:
handler_params = data.pop("handler_params")
if should_discard_left():
request = handler_params.get("request")
if not request:
logger.warning("Cannot know if request caller already left, missing request handler param")
elif already_left(request):
logger.info("Discarding request as the caller already left")
return None

# diffusers and sentence transformers pipelines do not have the `task` arg
if not hasattr(self.pipeline, "task"):
# sentence transformers parameters not supported yet
if any(isinstance(self.pipeline, v) for v in SENTENCE_TRANSFORMERS_TASKS.values()):
@@ -101,9 +127,83 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
"or `candidateLabels`."
)

return (
self.pipeline(**inputs, **parameters) if isinstance(inputs, dict) else self.pipeline(inputs, **parameters) # type: ignore
)
if api_inference_compat():
if self.pipeline.task == "text-classification" and isinstance(inputs, str):
inputs = [inputs]
parameters.setdefault("top_k", os.environ.get("DEFAULT_TOP_K", 5))
if self.pipeline.task == "token-classification":
parameters.setdefault("aggregation_strategy", os.environ.get("DEFAULT_AGGREGATION_STRATEGY", "simple"))

logger.debug("Performing inference")
resp = self.pipeline(**inputs, **parameters) if isinstance(inputs, dict) else \
self.pipeline(inputs, **parameters)

if api_inference_compat():
if self.pipeline.task == "text-classification":
# We don't want to return {} but [{}] in any case
if isinstance(resp, list) and len(resp) > 0:
if not isinstance(resp[0], list):
return [resp]
return resp
if self.pipeline.task == "feature-extraction":
# If the library used is Transformers then the feature-extraction is returning the headless encoder
# outputs as embeddings. The shape is a 3D or 4D array
# [n_inputs, batch_size = 1, n_sentence_tokens, num_hidden_dim].
# Let's just discard the batch size dim that always seems to be 1 and return a 2D/3D array
# https://github.com/huggingface/transformers/blob/5c47d08b0d6835b8d8fc1c06d9a1bc71f6e78ace/src/transformers/pipelines/feature_extraction.py#L27
# for api inference (reason: mainly display)
new_resp = []
if isinstance(inputs, list):
if isinstance(resp, list) and len(resp) == len(inputs):
for it in resp:
# Batch size dim is the first it level, discard it
if isinstance(it, list) and len(it) == 1:
new_resp.append(it[0])
else:
logging.logger.warning("One of the output batch size differs from 1: %d", len(it))
return resp
return new_resp
else:
logging.logger.warning("Inputs and resp len differ (or resp is not a list, type %s)",
type(resp))
return resp
elif isinstance(inputs, str):
if isinstance(resp, list) and len(resp) == 1:
return resp[0]
else:
logging.logger.warning("The output batch size differs from 1: %d", len(resp))
return resp
else:
logging.logger.warning("Output unexpected type %s", type(resp))
return resp
if self.pipeline.task == "image-segmentation":
if isinstance(resp, list):
new_resp = []
for el in resp:
if isinstance(el, dict) and el.get("score") is None:
el["score"] = 1
new_resp.append(el)
resp = new_resp
if self.pipeline.task == "zero-shot-classification":
try:
if isinstance(resp, dict):
if 'labels' in resp and 'scores' in resp:
labels = resp['labels']
scores = resp['scores']
if len(labels) == len(scores):
new_resp = []
for label, score in zip(labels, scores, strict=True):
new_resp.append({"label": label, "score": score})
resp = new_resp
else:
raise Exception("labels and scores do not have the same len, {} != {}".format(
len(labels), len(scores)))
else:
raise Exception("Missing labels or scores key in response dict {}".format(resp))
except Exception as e:
logging.logger.warning("Unable to remap response for api inference compat")
logging.logger.exception(e)
return resp


class VertexAIHandler(HuggingFaceHandler):
@@ -149,7 +249,10 @@ def get_inference_handler_either_custom_or_default_handler(model_dir: Path, task
Returns:
InferenceHandler: The appropriate inference handler based on the given model directory and task.
"""
custom_pipeline = check_and_register_custom_pipeline_from_directory(model_dir)
if ignore_custom_handler():
custom_pipeline = None
else:
custom_pipeline = check_and_register_custom_pipeline_from_directory(model_dir)
if custom_pipeline is not None:
return custom_pipeline

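To illustrate the API_INFERENCE_COMPAT remapping above, a hedged request/response sketch (the model directory, task, and scores are assumptions, not taken from the PR): the handler receives the usual {"inputs": ..., "parameters": ...} body, and with the compat flag set the zero-shot-classification dict returned by transformers is flattened into a list of {"label", "score"} entries.

# Illustrative only: model_dir, task and scores are assumptions.
import os

from huggingface_inference_toolkit.handler import HuggingFaceHandler

os.environ["API_INFERENCE_COMPAT"] = "true"

handler = HuggingFaceHandler(model_dir="/repository", task="zero-shot-classification")

body = {
    "inputs": "I love this keyboard",
    "parameters": {"candidate_labels": ["positive", "negative"]},
}
resp = handler(body)
# transformers normally returns a dict for a single input:
#   {"sequence": "...", "labels": [...], "scores": [...]}
# with API_INFERENCE_COMPAT=true the handler remaps it to e.g.:
#   [{"label": "positive", "score": 0.98}, {"label": "negative", "score": 0.02}]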