From 17524538897e994a8ded41d5d0efc8c7b9ac47a0 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 7 Mar 2024 10:50:54 +0100
Subject: [PATCH 01/23] Build neuron image

Signed-off-by: Raphael Glon
---
 dockerfiles/pytorch/Dockerfile | 13 +++++++++----
 dockerfiles/pytorch/neuronx.sh | 30 ++++++++++++++++++++++++++++++
 makefile                       |  5 ++++-
 setup.py                       |  2 ++
 4 files changed, 45 insertions(+), 5 deletions(-)
 create mode 100755 dockerfiles/pytorch/neuronx.sh

diff --git a/dockerfiles/pytorch/Dockerfile b/dockerfiles/pytorch/Dockerfile
index 8e4c4d35..d2f53ae2 100644
--- a/dockerfiles/pytorch/Dockerfile
+++ b/dockerfiles/pytorch/Dockerfile
@@ -1,6 +1,9 @@
 ARG BASE_IMAGE=nvidia/cuda:12.1.0-devel-ubuntu22.04
 
 FROM $BASE_IMAGE
+
+ARG NEURONX=0
+
 SHELL ["/bin/bash", "-c"]
 
 LABEL maintainer="Hugging Face"
@@ -31,12 +34,12 @@ RUN apt-get update && \
     libsndfile1-dev \
     ffmpeg \
     && apt-get clean autoremove --yes \
-    && rm -rf /var/lib/{apt,dpkg,cache,log}
+    && rm -rf /var/lib/{apt,cache,log}
+
 # Copying only necessary files as filtered by .dockerignore
 COPY . .
 
-# install wheel and setuptools
-RUN pip install --no-cache-dir -U pip ".[torch, st, diffusers]"
+RUN if [[ "$NEURONX" == "1" ]];then /bin/bash -c "./dockerfiles/pytorch/neuronx.sh";else pip install --no-cache-dir -U pip ".[torch, st, diffusers]";fi
 
 # copy application
 COPY src/huggingface_inference_toolkit huggingface_inference_toolkit
@@ -45,4 +48,6 @@ COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starle
 # copy entrypoint and change permissions
 COPY --chmod=0755 scripts/entrypoint.sh entrypoint.sh
 
-ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
\ No newline at end of file
+ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
+
+RUN apt-get update && apt-get install -y vim
diff --git a/dockerfiles/pytorch/neuronx.sh b/dockerfiles/pytorch/neuronx.sh
new file mode 100755
index 00000000..dd954804
--- /dev/null
+++ b/dockerfiles/pytorch/neuronx.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+set -e
+
+# Install system prerequisites
+apt-get update -y \
+  && apt-get install -y --no-install-recommends \
+    gnupg2 \
+    wget
+
+echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
+wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
+
+apt-get update -y \
+  && apt-get install -y --no-install-recommends \
+    aws-neuronx-dkms=2.* \
+    aws-neuronx-collectives=2.* \
+    aws-neuronx-runtime-lib=2.* \
+    aws-neuronx-tools=2.*
+
+pip install -U pip
+
+pip3 install neuronx-cc==2.12.68.0 \
+    torch-neuronx==1.13.1.1.13.1 \
+    transformers-neuronx==0.9.474 \
+    --extra-index-url=https://pip.repos.neuron.amazonaws.com
+
+pip3 install optimum[neuronx,diffusers]
+
+pip install ".[st,torch1]"
diff --git a/makefile b/makefile
index a9490428..d6c8a53e 100644
--- a/makefile
+++ b/makefile
@@ -26,5 +26,8 @@ inference-pytorch-gpu:
 inference-pytorch-cpu:
 	docker build --build-arg="BASE_IMAGE=ubuntu:22.04" -f dockerfiles/pytorch/Dockerfile -t integration-test-pytorch:cpu .
 
+inference-pytorch-neuron:
+	docker build --build-arg=BASE_IMAGE=ubuntu:22.04 --build-arg=NEURONX=1 -f dockerfiles/pytorch/Dockerfile -t integration-test-pytorch:neuron .
+
 stop-all:
-	docker stop $$(docker ps -a -q) && docker container prune --force
\ No newline at end of file
+	docker stop $$(docker ps -a -q) && docker container prune --force
diff --git a/setup.py b/setup.py
index 5e99df02..f8d669c0 100644
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,8 @@
 extras["st"] = ["sentence_transformers==2.4.0"]
 extras["diffusers"] = ["diffusers==0.26.3", "accelerate==0.27.2"]
 extras["torch"] = ["torch==2.2.0", "torchvision", "torchaudio"]
+# For neuronx
+extras["torch1"] = ["torch==1.13.1", "torchvision", "torchaudio"]
 extras["tensorflow"] = ["tensorflow"]
 extras["test"] = [
     "pytest==7.2.1",
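A quick way to sanity-check the image this first patch produces is to probe for the Neuron stack that neuronx.sh installs. The check below is illustrative only and not part of the series; the module names correspond to the pip packages installed above:

import importlib.util

# Probe the packages pulled in by dockerfiles/pytorch/neuronx.sh; for a
# top-level name, find_spec returns None instead of raising when it is absent.
for mod in ("torch_neuronx", "transformers_neuronx", "optimum"):
    spec = importlib.util.find_spec(mod)
    print(f"{mod}: {'found' if spec else 'MISSING'}")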
From ce71d1ec3474f709b1ac1425c797b24048e6699a Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 7 Mar 2024 11:44:00 +0100
Subject: [PATCH 02/23] minor: typo

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/webservice_starlette.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/huggingface_inference_toolkit/webservice_starlette.py b/src/huggingface_inference_toolkit/webservice_starlette.py
index 8bc68b2e..00b6967d 100644
--- a/src/huggingface_inference_toolkit/webservice_starlette.py
+++ b/src/huggingface_inference_toolkit/webservice_starlette.py
@@ -50,7 +50,7 @@ async def some_startup_task():
     else:
         raise ValueError(
             f"""Can't initialize model.
-            Please set env HF_MODEL_DIR or provider a HF_MODEL_ID.
+            Please set env HF_MODEL_DIR or provide a HF_MODEL_ID.
             Provided values are:
             HF_MODEL_DIR: {HF_MODEL_DIR} and HF_MODEL_ID:{HF_MODEL_ID}"""
         )

From a21ed5bb4c2721618ca5de8d313ddff4158b2010 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 7 Mar 2024 11:44:21 +0100
Subject: [PATCH 03/23] Build neuron image

Signed-off-by: Raphael Glon
---
 .github/workflows/build-container.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/build-container.yaml b/.github/workflows/build-container.yaml
index fe12fbf6..3ad8ad5f 100644
--- a/.github/workflows/build-container.yaml
+++ b/.github/workflows/build-container.yaml
@@ -34,6 +34,16 @@ jobs:
       TAILSCALE_AUTHKEY: ${{ secrets.TAILSCALE_AUTHKEY }}
       REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
       REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
+  starlette-pytorch-neuron:
+    uses: ./.github/workflows/docker-build-action.yaml
+    with:
+      image: inference-pytorch-neuron
+      dockerfile: dockerfiles/pytorch/Dockerfile
+      build_args: "BASE_IMAGE=ubuntu:22.04,NEURONX=1"
+    secrets:
+      TAILSCALE_AUTHKEY: ${{ secrets.TAILSCALE_AUTHKEY }}
+      REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
+      REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
   starlette-tensorflow-cpu:
     uses: ./.github/workflows/docker-build-action.yaml
     with:

From cfe316e67cba17b6733bd7a0985d776c18899331 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 7 Mar 2024 11:48:13 +0100
Subject: [PATCH 04/23] ci: tmp change build tag for tests

Signed-off-by: Raphael Glon
---
 .github/workflows/docker-build-action.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker-build-action.yaml b/.github/workflows/docker-build-action.yaml
index 62cba961..37889934 100644
--- a/.github/workflows/docker-build-action.yaml
+++ b/.github/workflows/docker-build-action.yaml
@@ -64,8 +64,8 @@ jobs:
           context: ${{ inputs.context }}
           build-args: ${{ inputs.build_args }}
           file: ${{ inputs.context }}/${{ inputs.dockerfile }}
-          tags: ${{ inputs.repository }}/${{ inputs.image }}:sha-${{ env.GITHUB_SHA_SHORT }},${{ inputs.repository }}/${{ inputs.image }}:latest
-
+          # tags: ${{ inputs.repository }}/${{ inputs.image }}:sha-${{ env.GITHUB_SHA_SHORT }},${{ inputs.repository }}/${{ inputs.image }}:latest
+          tags: ${{ inputs.repository }}/${{ inputs.image }}:testraph
       - name: Tailscale Wait
         if: ${{ failure() || runner.debug == '1' }}
        uses: huggingface/tailscale-action@v1
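For context, the build_args input used by the new starlette-pytorch-neuron job is a comma-separated list of KEY=VALUE pairs that the reusable workflow hands to the docker build action. A hypothetical helper showing the mapping (the real splitting is presumably done inside the build action, not by code like this):

def to_build_flags(build_args: str) -> list:
    # Expand "K1=V1,K2=V2" into docker CLI flags (illustrative only).
    flags = []
    for pair in build_args.split(","):
        flags += ["--build-arg", pair.strip()]
    return flags

print(to_build_flags("BASE_IMAGE=ubuntu:22.04,NEURONX=1"))
# ['--build-arg', 'BASE_IMAGE=ubuntu:22.04', '--build-arg', 'NEURONX=1']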
From 1756a6df7a24008fc8ba168795e868af3986cef9 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 11:04:17 +0200
Subject: [PATCH 05/23] wip stable diffusion for neuron

Signed-off-by: Raphael Glon
---
 .../diffusers_utils.py                     | 70 ++++++++++++++++---
 src/huggingface_inference_toolkit/utils.py | 25 +++++--
 2 files changed, 82 insertions(+), 13 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 521a85df..75227cf1 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -1,7 +1,9 @@
 import importlib.util
 import logging
+import os
 
 from transformers.utils.import_utils import is_torch_bf16_gpu_available
+from optimum import neuron
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
@@ -49,14 +51,64 @@ def __call__(
         out = self.pipeline(prompt, num_images_per_prompt=1, **kwargs)
         return out.images[0]
 
-
-DIFFUSERS_TASKS = {
-    "text-to-image": IEAutoPipelineForText2Image,
-}
-
-
-def get_diffusers_pipeline(task=None, model_dir=None, device=-1, **kwargs):
+#
+# DIFFUSERS_TASKS = {
+#     "text-to-image": [NeuronStableDiffusionXLPipeline],
+# }
+
+
+def load_optimum_diffusion_pipeline(task, model_dir):
+
+    # Step 1: load config and look for _class_name
+    try:
+        config = StableDiffusionPipeline.load_config(pretrained_model_name_or_path=model_dir)
+    except OSError as e:
+        logger.error("Unable to load config file for repository %s", model_dir)
+        logger.exception(e)
+        raise
+
+    pipeline_class_name = config['_class_name']
+
+    logger.debug("Repository pipeline class name %s", pipeline_class_name)
+    if pipeline_class_name.contains("Diffusion") and pipeline_class_name.contains("XL"):
+        if task == "image-to-image":
+            pipeline_class = neuron.NeuronStableDiffusionXLImg2ImgPipeline
+        else:
+            pipeline_class = neuron.NeuronStableDiffusionXLPipeline
+    else:
+        if task == "image-to-image":
+            pipeline_class = neuron.NeuronStableDiffusionImg2ImgPipeline
+        else:
+            pipeline_class = neuron.NeuronStableDiffusionPipeline
+
+    logger.debug("Pipeline class %s", pipeline_class.__class__)
+
+    # if is neuron model, no need for additional kwargs
+    if pipeline_class_name.contains("Neuron"):
+        kwargs = {}
+    else:
+        # Model will be compiled and exported on the flight as the cached models cause a performance drop
+        # for diffusion models, unless otherwise specified through an explicit env variable
+
+        # Image shapes need to be frozen at loading/compilation time
+        compiler_args = {
+            "auto_cast": "matmul",
+            "auto_cast_type": "bf16",
+            "inline_weights_to_neff": os.environ.get("INLINE_WEIGHTS_TO_NEFF",
+                                                     "false").lower() in ["false", "no", "0"],
+            "data_parallel_mode": os.environ.get("DATA_PARALLEL_MODE", "unet")
+        }
+        input_shapes = {"batch_size": 1,
+                        "height": int(os.environ("IMAGE_HEIGHT", 512)),
+                        "width": int(os.environ("IMAGE_WIDTH", 512))}
+        kwargs = {**compiler_args, **input_shapes, "export": True}
+
+    # In the second case, exporting can take a huge amount of time, which makes endpoints not a really suited solution
+    # at least as long as the cache is not really an option for diffusion
+    return pipeline_class(kwargs)
+
+
+def get_diffusers_pipeline(task=None, model_dir=None, **kwargs):
     """Get a pipeline for Diffusers models."""
-    device = "cuda" if device == 0 else "cpu"
-    pipeline = DIFFUSERS_TASKS[task](model_dir=model_dir, device=device)
+    pipeline = load_optimum_diffusion_pipeline(task=task, model_dir=model_dir)
     return pipeline
diff --git a/src/huggingface_inference_toolkit/utils.py b/src/huggingface_inference_toolkit/utils.py
index 1570317b..4cab2a39 100644
--- a/src/huggingface_inference_toolkit/utils.py
+++ b/src/huggingface_inference_toolkit/utils.py
@@ -33,7 +33,7 @@
 
 
 def is_optimum_available():
-    return False
+    return True
     # TODO: change when supported
     # return _optimum_available
 
@@ -229,7 +229,7 @@ def get_pipeline(
     create pipeline class for a specific task based on local saved model
     """
     device = get_device()
-    logger.info(f"Using device { 'GPU' if device == 0 else 'CPU'}")
+    logger.info(f"Using device { 'GPU' if device == 0 else 'CPU/TPU/Neuron...)'}")
 
     if task is None:
         raise EnvironmentError(
@@ -265,11 +265,10 @@ def get_pipeline(
             device=device,
             **kwargs
         )
-    elif is_diffusers_available() and task == "text-to-image":
+    elif is_diffusers_available() and task in ["text-to-image", "image-to-image"]:
         hf_pipeline = get_diffusers_pipeline(
             task=task,
             model_dir=model_dir,
-            device=device,
             **kwargs
         )
     else:
@@ -308,3 +307,21 @@ def convert_params_to_int_or_bool(params):
         if v == "true":
             params[k] = True
     return params
+
+
+# def local_model_card(model_dir: str) -> Optional[ModelCard]:
+#
+#     logger.debug("Rebuilding offline model info for repo %s", model_dir)
+#
+#     # Let's rebuild some partial model info from what we see in cache, info extracted should be enough
+#     # for most use cases
+#
+#     card_path = Path(model_dir) / "README.md"
+#     if not card_path.exists():
+#         logger.debug("Unable to build model info for directory %s", model_dir)
+#         return None
+#
+#     logger.debug("Loading model card from model readme %s", card_path)
+#     model_card = ModelCard.load(card_path)
+#     logger.info("Local repo %s, model card data %s", model_dir, model_card.data.to_dict())
+#     return model_card
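The routing in load_optimum_diffusion_pipeline hinges on the _class_name field that diffusers serializes into a repository's model_index.json. A minimal sketch of that probe (the repo id is only an example):

from diffusers import StableDiffusionPipeline

# load_config works for any diffusers pipeline repo, local path or Hub id,
# and returns the dict stored in model_index.json.
config = StableDiffusionPipeline.load_config("stabilityai/stable-diffusion-xl-base-1.0")
print(config["_class_name"])  # e.g. "StableDiffusionXLPipeline" -> routed to the XL Neuron classes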
From ca5f79c9d2609b43ff8688b03e272a3d0523b574 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 15:43:55 +0200
Subject: [PATCH 06/23] wip

Signed-off-by: Raphael Glon
---
 dockerfiles/pytorch/neuronx.sh | 15 +++++++++------
 setup.py                       |  6 +++---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/dockerfiles/pytorch/neuronx.sh b/dockerfiles/pytorch/neuronx.sh
index dd954804..38951756 100755
--- a/dockerfiles/pytorch/neuronx.sh
+++ b/dockerfiles/pytorch/neuronx.sh
@@ -8,7 +8,10 @@ apt-get update -y \
     gnupg2 \
     wget
 
-echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
+. /etc/os-release
+tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
+deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
+EOF
 wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -17,4 +17,5 @@
     "transformers[sklearn,sentencepiece, audio, vision]>=4.36.0",
+    "huggingface_hub==0.23.0",
     "orjson",
     # vision
     "Pillow",
@@ -40,7 +40,7 @@
 extras["diffusers"] = ["diffusers==0.26.3", "accelerate==0.27.2"]
 extras["torch"] = ["torch==2.2.0", "torchvision", "torchaudio"]
 # For neuronx
-extras["torch1"] = ["torch==1.13.1", "torchvision", "torchaudio"]
+extras["torch-neuronx"] = ["torch-neuronx", "torchvision", "torchaudio"]
 extras["tensorflow"] = ["tensorflow"]
 extras["test"] = [
     "pytest==7.2.1",
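The os-release sourcing added above keeps the apt entry in sync with the base image's Ubuntu codename instead of hard-coding jammy. A rough Python equivalent of that shell snippet, for illustration only:

def neuron_apt_line(path="/etc/os-release"):
    # Rebuild the sources.list line the script writes (illustrative sketch).
    fields = {}
    with open(path) as fh:
        for line in fh:
            if "=" in line:
                key, value = line.rstrip("\n").split("=", 1)
                fields[key] = value.strip('"')
    return f"deb https://apt.repos.neuron.amazonaws.com {fields['VERSION_CODENAME']} main"

print(neuron_apt_line())  # e.g. 'deb https://apt.repos.neuron.amazonaws.com jammy main'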
From 8b6364000d678a0d94b21c029a07f9f1e1586848 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 16:07:36 +0200
Subject: [PATCH 07/23] wip

Signed-off-by: Raphael Glon
---
 dockerfiles/pytorch/neuronx.sh | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/dockerfiles/pytorch/neuronx.sh b/dockerfiles/pytorch/neuronx.sh
index 38951756..fdefa001 100755
--- a/dockerfiles/pytorch/neuronx.sh
+++ b/dockerfiles/pytorch/neuronx.sh
@@ -23,11 +23,17 @@
 
 pip install -U pip
 
-pip3 install neuronx-cc==2.* \
-    torch-neuronx==2.* \
-    transformers-neuronx\
+# Taken from optimum neuron, tgi dockerfile
+pip3 install \
+    neuronx-cc==2.13.66.0 \
+    torch-neuronx==2.1.2.2.1.0 \
+    transformers-neuronx==0.10.0.21 \
     --extra-index-url=https://pip.repos.neuron.amazonaws.com
 
 pip3 install --extra-index-url=https://pip.repos.neuron.amazonaws.com optimum[neuronx,diffusers]
 
 pip install ".[st,torch-neuronx]"
+
+apt-get clean autoremove --yes
+
+rm -rf /var/lib/{apt,cache,log}
 fi

From 1c76d87af186cb8d6000e4e13951532b8db84f88 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 17:01:43 +0200
Subject: [PATCH 08/23] rm vim

Signed-off-by: Raphael Glon
---
 dockerfiles/pytorch/Dockerfile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dockerfiles/pytorch/Dockerfile b/dockerfiles/pytorch/Dockerfile
index d2f53ae2..d01f0df2 100644
--- a/dockerfiles/pytorch/Dockerfile
+++ b/dockerfiles/pytorch/Dockerfile
@@ -49,5 +49,3 @@ COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starle
 # copy entrypoint and change permissions
 COPY --chmod=0755 scripts/entrypoint.sh entrypoint.sh
 
 ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
-
-RUN apt-get update && apt-get install -y vim

From 0f0d1802ba6a97481b6011d5a653cd87420da579 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 17:13:08 +0200
Subject: [PATCH 09/23] wip update path

Signed-off-by: Raphael Glon
---
 dockerfiles/pytorch/neuronx.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dockerfiles/pytorch/neuronx.sh b/dockerfiles/pytorch/neuronx.sh
index fdefa001..bc3c6e84 100755
--- a/dockerfiles/pytorch/neuronx.sh
+++ b/dockerfiles/pytorch/neuronx.sh
@@ -37,3 +37,5 @@ apt-get clean autoremove --yes
 
 rm -rf /var/lib/{apt,cache,log}
 fi
+
+echo "PATH=\"$PATH:/opt/aws/neuron/bin\"" > /etc/environment

From 4749c57db4122c52fda3dec2bef137382d72a696 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 17:39:06 +0200
Subject: [PATCH 10/23] Fix

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/huggingface_inference_toolkit/utils.py b/src/huggingface_inference_toolkit/utils.py
index 4cab2a39..8309cdea 100644
--- a/src/huggingface_inference_toolkit/utils.py
+++ b/src/huggingface_inference_toolkit/utils.py
@@ -33,7 +33,7 @@
 
 
 def is_optimum_available():
-    return True
+    return False
     # TODO: change when supported
     # return _optimum_available
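The PATH line added by patch 9 matters because the aws-neuronx-tools package installs its CLI utilities under /opt/aws/neuron/bin. A standalone check (hypothetical, not part of the series) that the running process actually sees them:

import shutil

# neuron-ls and neuron-top ship with aws-neuronx-tools; which() returns None
# when /opt/aws/neuron/bin is missing from the process PATH.
for tool in ("neuron-ls", "neuron-top"):
    print(tool, "->", shutil.which(tool) or "not on PATH")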
From 72787a5763e64bf3bc47909a473b5fd086958e8c Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 17:39:34 +0200
Subject: [PATCH 11/23] PATH

Signed-off-by: Raphael Glon
---
 dockerfiles/pytorch/neuronx.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockerfiles/pytorch/neuronx.sh b/dockerfiles/pytorch/neuronx.sh
index bc3c6e84..9b8f35f4 100755
--- a/dockerfiles/pytorch/neuronx.sh
+++ b/dockerfiles/pytorch/neuronx.sh
@@ -38,4 +38,4 @@ apt-get clean autoremove --yes
 rm -rf /var/lib/{apt,cache,log}
 fi
 
-echo "PATH=\"$PATH:/opt/aws/neuron/bin\"" > /etc/environment
+echo "export PATH=\"$PATH:/opt/aws/neuron/bin\"" >> /root/.bashrc

From 6ba2b76c0a5c171c1225b1d196351c263facdabb Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 17:47:22 +0200
Subject: [PATCH 12/23] fix

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/diffusers_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 75227cf1..748ac65d 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -70,7 +70,7 @@ def load_optimum_diffusion_pipeline(task, model_dir):
     pipeline_class_name = config['_class_name']
 
     logger.debug("Repository pipeline class name %s", pipeline_class_name)
-    if pipeline_class_name.contains("Diffusion") and pipeline_class_name.contains("XL"):
+    if "Diffusion" in pipeline_class_name and "XL" in pipeline_class_name:
         if task == "image-to-image":
             pipeline_class = neuron.NeuronStableDiffusionXLImg2ImgPipeline
         else:
@@ -84,7 +84,7 @@ def load_optimum_diffusion_pipeline(task, model_dir):
     logger.debug("Pipeline class %s", pipeline_class.__class__)
 
     # if is neuron model, no need for additional kwargs
-    if pipeline_class_name.contains("Neuron"):
+    if "Neuron" in pipeline_class_name:
         kwargs = {}
     else:
         # Model will be compiled and exported on the flight as the cached models cause a performance drop
@@ -99,8 +99,8 @@ def load_optimum_diffusion_pipeline(task, model_dir):
             "data_parallel_mode": os.environ.get("DATA_PARALLEL_MODE", "unet")
         }
         input_shapes = {"batch_size": 1,
-                        "height": int(os.environ("IMAGE_HEIGHT", 512)),
-                        "width": int(os.environ("IMAGE_WIDTH", 512))}
+                        "height": int(os.environ.get("IMAGE_HEIGHT", 512)),
+                        "width": int(os.environ.get("IMAGE_WIDTH", 512))}
         kwargs = {**compiler_args, **input_shapes, "export": True}

From d0c37d6e91afff33e2466ed88ba99b63de7a8be7 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 17:58:41 +0200
Subject: [PATCH 13/23] fix

Signed-off-by: Raphael Glon
---
 .../diffusers_utils.py | 31 +++++++------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 748ac65d..50e43cae 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -84,28 +84,21 @@ def load_optimum_diffusion_pipeline(task, model_dir):
     logger.debug("Pipeline class %s", pipeline_class.__class__)
 
     # if is neuron model, no need for additional kwargs
-    if "Neuron" in pipeline_class_name:
-        kwargs = {}
-    else:
-        # Model will be compiled and exported on the flight as the cached models cause a performance drop
-        # for diffusion models, unless otherwise specified through an explicit env variable
-
-        # Image shapes need to be frozen at loading/compilation time
-        compiler_args = {
-            "auto_cast": "matmul",
-            "auto_cast_type": "bf16",
-            "inline_weights_to_neff": os.environ.get("INLINE_WEIGHTS_TO_NEFF",
-                                                     "false").lower() in ["false", "no", "0"],
-            "data_parallel_mode": os.environ.get("DATA_PARALLEL_MODE", "unet")
-        }
-        input_shapes = {"batch_size": 1,
-                        "height": int(os.environ.get("IMAGE_HEIGHT", 512)),
-                        "width": int(os.environ.get("IMAGE_WIDTH", 512))}
-        kwargs = {**compiler_args, **input_shapes, "export": True}
+    compiler_args = {
+        "auto_cast": "matmul",
+        "auto_cast_type": "bf16",
+        "inline_weights_to_neff": os.environ.get("INLINE_WEIGHTS_TO_NEFF",
+                                                 "false").lower() in ["false", "no", "0"],
+        "data_parallel_mode": os.environ.get("DATA_PARALLEL_MODE", "unet")
+    }
+    input_shapes = {"batch_size": 1,
+                    "height": int(os.environ.get("IMAGE_HEIGHT", 512)),
+                    "width": int(os.environ.get("IMAGE_WIDTH", 512))}
+    kwargs = {**compiler_args, **input_shapes, "export": True}
 
     # In the second case, exporting can take a huge amount of time, which makes endpoints not a really suited solution
     # at least as long as the cache is not really an option for diffusion
-    return pipeline_class(kwargs)
+    return pipeline_class.from_pretrained(model_dir, **kwargs)
 
 
 def get_diffusers_pipeline(task=None, model_dir=None, **kwargs):
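Outside the toolkit, the export path that from_pretrained now follows can be exercised directly against optimum-neuron. A hedged usage sketch (the model id and shapes are examples, and compilation is slow):

import os
from optimum.neuron import NeuronStableDiffusionPipeline

compiler_args = {"auto_cast": "matmul", "auto_cast_type": "bf16"}
input_shapes = {"batch_size": 1,
                "height": int(os.environ.get("IMAGE_HEIGHT", 512)),
                "width": int(os.environ.get("IMAGE_WIDTH", 512))}

# export=True compiles the model for Neuron with the shapes frozen above.
pipe = NeuronStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", export=True, **compiler_args, **input_shapes
)
image = pipe("a photo of an astronaut riding a horse").images[0]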
From 957b1833e515ab2fb28bd8b67efdd4e9ed21c4a1 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 10:20:38 +0200
Subject: [PATCH 14/23] wip

Signed-off-by: Raphael Glon
---
 .../diffusers_utils.py | 38 +++++++++++++++++--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 50e43cae..9e9935a8 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -1,4 +1,5 @@
 import importlib.util
+import json
 import logging
 import os
 
@@ -57,6 +58,23 @@
 # }
 
 
+def _is_neuron_model(model_dir):
+    for root, _, files in os.walk(model_dir):
+        for f in files:
+            if f == "config.json":
+                filename = os.path.join(root, f)
+                with open(filename, 'r') as fh:
+                    try:
+                        config = json.load(fh)
+                    except Exception as e:
+                        logger.warning("Unable to load config %s properly, skipping", filename)
+                        logger.exception(e)
+                        continue
+                    if 'neuron' in config.keys():
+                        return True
+    return False
+
+
 def load_optimum_diffusion_pipeline(task, model_dir):
@@ -104,10 +122,25 @@ def load_optimum_diffusion_pipeline(task, model_dir):
         "data_parallel_mode": os.environ.get("DATA_PARALLEL_MODE", "unet")
     }
     input_shapes = {"batch_size": 1,
                     "height": int(os.environ.get("IMAGE_HEIGHT", 512)),
                     "width": int(os.environ.get("IMAGE_WIDTH", 512))}
-    kwargs = {**compiler_args, **input_shapes, "export": True}
+    export_kwargs = {**compiler_args, **input_shapes, "export": True}
+
+    # if is neuron model, no need for additional kwargs, any info lies within the repo
+    is_neuron_m = _is_neuron_model(model_dir)
+    if is_neuron_m:
+        kwargs = {}
+        fallback_kwargs = export_kwargs
+    else:
+        kwargs = export_kwargs
+        fallback_kwargs = {}
 
     # In the second case, exporting can take a huge amount of time, which makes endpoints not a really suited solution
     # at least as long as the cache is not really an option for diffusion
-    return pipeline_class.from_pretrained(model_dir, **kwargs)
+    try:
+        logger.info("Loading model %s with kwargs %s", model_dir, kwargs)
+        return pipeline_class.from_pretrained(model_dir, **kwargs)
+    except Exception as e:
+        logger.error("Unable to load model %s properly falling back to kwargs %s", model_dir, fallback_kwargs)
+        logger.exception(e)
+        return pipeline_class.from_pretrained(model_dir, **fallback_kwargs)
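For reference, the marker that _is_neuron_model scans for looks roughly like this in a config.json produced by an optimum-neuron export. The keys inside the block are illustrative; only the presence of the top-level "neuron" entry is actually tested:

neuron_config_example = {
    "_class_name": "StableDiffusionPipeline",
    "neuron": {
        "auto_cast": "matmul",
        "auto_cast_type": "bf16",
        "static_batch_size": 1,
        "static_height": 512,
        "static_width": 512,
    },
}
assert "neuron" in neuron_config_example  # the whole detection test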
From 247d67be9e613a9646cbd1ffdf8601da4e0f8e6d Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 15:42:46 +0200
Subject: [PATCH 15/23] wip

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/diffusers_utils.py      | 3 ++-
 src/huggingface_inference_toolkit/webservice_starlette.py | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 9e9935a8..5496792e 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -5,6 +5,7 @@
 
 from transformers.utils.import_utils import is_torch_bf16_gpu_available
 from optimum import neuron
+from optimum.neuron.modeling_base import OptimizedModel
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
@@ -75,7 +76,7 @@ def _is_neuron_model(model_dir):
     return False
 
 
-def load_optimum_diffusion_pipeline(task, model_dir):
+def load_optimum_diffusion_pipeline(task: str, model_dir: str) -> OptimizedModel:
 
     # Step 1: load config and look for _class_name
     try:
diff --git a/src/huggingface_inference_toolkit/webservice_starlette.py b/src/huggingface_inference_toolkit/webservice_starlette.py
index 00b6967d..e06ab447 100644
--- a/src/huggingface_inference_toolkit/webservice_starlette.py
+++ b/src/huggingface_inference_toolkit/webservice_starlette.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from pathlib import Path
 from time import perf_counter
 
@@ -23,7 +24,7 @@
 
 
 def config_logging(level=logging.INFO):
-    logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", datefmt="", level=level)
+    logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", datefmt="", level=level, force=True)
     # disable uvicorn access logs to hide /health
     uvicorn_access = logging.getLogger("uvicorn.access")
     uvicorn_access.disabled = True
@@ -31,7 +32,7 @@ def config_logging(level=logging.INFO):
     logging.getLogger("uvicorn").removeHandler(logging.getLogger("uvicorn").handlers[0])
 
 
-config_logging()
+config_logging(os.environ.get("LOG_LEVEL", logging.getLevelName(logging.INFO)))
 
 logger = logging.getLogger(__name__)

From e6c4e1233b9cb2bc7a1d5d5a52a4dbd8c915a551 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 15:52:00 +0200
Subject: [PATCH 16/23] version

Signed-off-by: Raphael Glon
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 3b96fdcd..98543268 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
     "wheel==0.42.0",
     "setuptools==69.1.0",
     "cmake==3.28.3",
-    "transformers[sklearn,sentencepiece, audio, vision]>=4.36.0",
+    "transformers[sklearn,sentencepiece, audio, vision]>=4.38.2",
     "huggingface_hub==0.23.0",
     "orjson",
     # vision

From f77cde30998551864eaeb1f0c15bef347cd5905a Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 16:06:41 +0200
Subject: [PATCH 17/23] cleanup

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/utils.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/src/huggingface_inference_toolkit/utils.py b/src/huggingface_inference_toolkit/utils.py
index 8309cdea..baf393b6 100644
--- a/src/huggingface_inference_toolkit/utils.py
+++ b/src/huggingface_inference_toolkit/utils.py
@@ -307,21 +307,3 @@ def convert_params_to_int_or_bool(params):
         if v == "true":
             params[k] = True
     return params
-
-
-# def local_model_card(model_dir: str) -> Optional[ModelCard]:
-#
-#     logger.debug("Rebuilding offline model info for repo %s", model_dir)
-#
-#     # Let's rebuild some partial model info from what we see in cache, info extracted should be enough
-#     # for most use cases
-#
-#     card_path = Path(model_dir) / "README.md"
-#     if not card_path.exists():
-#         logger.debug("Unable to build model info for directory %s", model_dir)
-#         return None
-#
-#     logger.debug("Loading model card from model readme %s", card_path)
-#     model_card = ModelCard.load(card_path)
-#     logger.info("Local repo %s, model card data %s", model_dir, model_card.data.to_dict())
-#     return model_card
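The LOG_LEVEL passthrough from patch 15 works because logging.basicConfig accepts a level name string as well as a numeric level. A self-contained illustration:

import logging
import os

# force=True mirrors the patched config_logging: it replaces handlers that an
# earlier basicConfig call (e.g. at import time) already installed.
logging.basicConfig(level=os.environ.get("LOG_LEVEL", "INFO"), force=True)
logging.getLogger(__name__).debug("only visible when LOG_LEVEL=DEBUG")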
From 73eaba7c1e8a91916840a77ade2374eeaee1b520 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 16:43:52 +0200
Subject: [PATCH 18/23] cleanup

Signed-off-by: Raphael Glon
---
 .../diffusers_utils.py                     | 24 +++++++++++++------
 src/huggingface_inference_toolkit/utils.py | 22 +++++++++++++++--
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 5496792e..65a7e8ec 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -7,6 +7,7 @@
 from optimum import neuron
 from optimum.neuron.modeling_base import OptimizedModel
 
+
 logger = logging.getLogger(__name__)
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
 
@@ -53,10 +54,10 @@ def __call__(
         out = self.pipeline(prompt, num_images_per_prompt=1, **kwargs)
         return out.images[0]
 
-#
-# DIFFUSERS_TASKS = {
-#     "text-to-image": [NeuronStableDiffusionXLPipeline],
-# }
+
+DIFFUSERS_TASKS = {
+    "text-to-image": IEAutoPipelineForText2Image,
+}
 
 
 def _is_neuron_model(model_dir):
@@ -76,7 +77,7 @@ def _is_neuron_model(model_dir):
     return False
 
 
-def load_optimum_diffusion_pipeline(task: str, model_dir: str) -> OptimizedModel:
+def neuron_diffusion_pipeline(task: str, model_dir: str) -> OptimizedModel:
 
     # Step 1: load config and look for _class_name
     try:
@@ -134,7 +135,16 @@ def load_optimum_diffusion_pipeline(task: str, model_dir: str) -> OptimizedModel:
         return pipeline_class.from_pretrained(model_dir, **fallback_kwargs)
 
 
-def get_diffusers_pipeline(task=None, model_dir=None, **kwargs):
+def get_diffusers_pipeline(task=None, model_dir=None, device=-1, **_kwargs):
     """Get a pipeline for Diffusers models."""
-    pipeline = load_optimum_diffusion_pipeline(task=task, model_dir=model_dir)
+    if device == 0:
+        device = "cuda"
+    elif device is not None:
+        device = "cpu"
+    # None case: neuronx, no need to specify device
+
+    if device is not None:
+        pipeline = DIFFUSERS_TASKS[task](model_dir=model_dir, device=device)
+    else:
+        pipeline = neuron_diffusion_pipeline(task=task, model_dir=model_dir)
     return pipeline
diff --git a/src/huggingface_inference_toolkit/utils.py b/src/huggingface_inference_toolkit/utils.py
index baf393b6..afa154e7 100644
--- a/src/huggingface_inference_toolkit/utils.py
+++ b/src/huggingface_inference_toolkit/utils.py
@@ -30,6 +30,14 @@
 import torch
 
 _optimum_available = importlib.util.find_spec("optimum") is not None
+if _optimum_available:
+    _optimum_neuron = importlib.util.find_spec("optimum.neuron") is not None
+    from optimum.neuron.modeling_decoder import get_available_cores as get_neuron_cores
+else:
+    _optimum_neuron = False
+
+    def get_neuron_cores():
+        return 0
 
 
 def is_optimum_available():
@@ -38,6 +46,10 @@ def is_optimum_available():
     # return _optimum_available
 
 
+def is_optimum_neuron_available():
+    return _optimum_neuron
+
+
 framework2weight = {
     "pytorch": "pytorch*",
     "tensorflow": "tf*",
@@ -215,6 +227,8 @@ def get_device():
 
     if gpu:
         return 0
+    elif get_neuron_cores() > 0:
+        return None
     else:
         return -1
 
@@ -229,7 +243,10 @@ def get_pipeline(
     create pipeline class for a specific task based on local saved model
     """
     device = get_device()
-    logger.info(f"Using device { 'GPU' if device == 0 else 'CPU/TPU/Neuron...)'}")
+    logger.info(f"Using device { 'GPU' if device == 0 else 'Neuron' if device is None else 'CPU'}")
+
+    if device is None and task != "text-to-image":
+        raise Exception("This container only supports text-to-image task with neurons")
 
     if task is None:
         raise EnvironmentError(
@@ -265,10 +282,11 @@ def get_pipeline(
             device=device,
             **kwargs
         )
-    elif is_diffusers_available() and task in ["text-to-image", "image-to-image"]:
+    elif is_diffusers_available() and task == "text-to-image":
         hf_pipeline = get_diffusers_pipeline(
             task=task,
             model_dir=model_dir,
+            device=device
             **kwargs
         )
     else:
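The device convention after this patch is a tri-state: 0 for GPU, None for Neuron (no explicit device placement), -1 for CPU. A tiny sketch of how downstream code can branch on it:

def describe_device(device):
    # mirrors the logger.info ternary introduced above
    return "GPU" if device == 0 else "Neuron" if device is None else "CPU"

assert describe_device(0) == "GPU"
assert describe_device(None) == "Neuron"
assert describe_device(-1) == "CPU"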
From 429d25f3b375e6e1f611b69369bbbc5cb65d60b3 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 16:51:25 +0200
Subject: [PATCH 19/23] quality check

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/diffusers_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 65a7e8ec..1f255bf9 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -3,10 +3,9 @@
 import logging
 import os
 
-from transformers.utils.import_utils import is_torch_bf16_gpu_available
 from optimum import neuron
 from optimum.neuron.modeling_base import OptimizedModel
-
+from transformers.utils.import_utils import is_torch_bf16_gpu_available
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)

From 179dfc20c7db6e1e600db1cf6fbccaf5f9d0f263 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 17:02:39 +0200
Subject: [PATCH 20/23] fix tests

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/diffusers_utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 1f255bf9..cfbbec43 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -3,14 +3,13 @@
 import logging
 import os
 
-from optimum import neuron
-from optimum.neuron.modeling_base import OptimizedModel
 from transformers.utils.import_utils import is_torch_bf16_gpu_available
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
 
 _diffusers = importlib.util.find_spec("diffusers") is not None
+_optimum_neuron = importlib.util.find_spec("optimum.neuron") is not None
 
 
 def is_diffusers_available():
@@ -22,6 +21,11 @@ def is_diffusers_available():
 from diffusers import AutoPipelineForText2Image, DPMSolverMultistepScheduler, StableDiffusionPipeline
 
 
+if _optimum_neuron:
+    from optimum import neuron
+    from optimum.neuron.modeling_base import OptimizedModel
+
+
 class IEAutoPipelineForText2Image:
     def __init__(self, model_dir: str, device: str = None):  # needs "cuda" for GPU
         dtype = torch.float32
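One subtlety with the module probe above: importlib.util.find_spec imports the parent package of a dotted name, so probing "optimum.neuron" on a machine without optimum raises ModuleNotFoundError instead of returning None. A standalone illustration of the safe pattern:

import importlib.util

def safe_find(name: str) -> bool:
    try:
        return importlib.util.find_spec(name) is not None
    except ModuleNotFoundError:
        return False

print(safe_find("optimum.neuron"))  # False rather than a crash when optimum is absent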
From 2e21889f73ce631b3ac0e1eb1b20484155ac988b Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 17:17:30 +0200
Subject: [PATCH 21/23] fix

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/diffusers_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index cfbbec43..8b4c1491 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -9,7 +9,11 @@
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
 
 _diffusers = importlib.util.find_spec("diffusers") is not None
-_optimum_neuron = importlib.util.find_spec("optimum.neuron") is not None
+_optimum = importlib.util.find_spec("optimum") is not None
+if _optimum:
+    _optimum_neuron = importlib.util.find_spec("optimum.neuron") is not None
+else:
+    _optimum_neuron = False
 
 
 def is_diffusers_available():

From 41563f4d323d06b3442ababba49a1ec363c66756 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 17:26:19 +0200
Subject: [PATCH 22/23] fix

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/diffusers_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 8b4c1491..21795e7a 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -27,7 +27,6 @@ def is_diffusers_available():
 
 if _optimum_neuron:
     from optimum import neuron
-    from optimum.neuron.modeling_base import OptimizedModel
 
 
 class IEAutoPipelineForText2Image:
@@ -84,7 +83,7 @@ def _is_neuron_model(model_dir):
     return False
 
 
-def neuron_diffusion_pipeline(task: str, model_dir: str) -> OptimizedModel:
+def neuron_diffusion_pipeline(task: str, model_dir: str):
 
     # Step 1: load config and look for _class_name
     try:

From 64530263879465ccb2de18bf8d55313d7045d886 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 19:01:15 +0200
Subject: [PATCH 23/23] fix

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/huggingface_inference_toolkit/utils.py b/src/huggingface_inference_toolkit/utils.py
index afa154e7..65e3a6b4 100644
--- a/src/huggingface_inference_toolkit/utils.py
+++ b/src/huggingface_inference_toolkit/utils.py
@@ -286,7 +286,7 @@ def get_pipeline(
         hf_pipeline = get_diffusers_pipeline(
             task=task,
             model_dir=model_dir,
-            device=device
+            device=device,
             **kwargs
         )
     else:
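With the series applied end to end, a Neuron host takes the device=None branch and serves text-to-image through the optimum-neuron pipeline. A hedged usage sketch (the local model directory is an example; get_pipeline resolves the device itself):

from huggingface_inference_toolkit.utils import get_pipeline

pipe = get_pipeline(task="text-to-image", model_dir="/repository")
result = pipe("a watercolor lighthouse at dawn")  # a PIL image or pipeline output, depending on the branch taken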