From 17524538897e994a8ded41d5d0efc8c7b9ac47a0 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 7 Mar 2024 10:50:54 +0100
Subject: [PATCH 01/23] Build neuron image

Signed-off-by: Raphael Glon
---
 dockerfiles/pytorch/Dockerfile | 13 +++++++++----
 dockerfiles/pytorch/neuronx.sh | 30 ++++++++++++++++++++++++++++++
 makefile                       |  5 ++++-
 setup.py                       |  2 ++
 4 files changed, 45 insertions(+), 5 deletions(-)
 create mode 100755 dockerfiles/pytorch/neuronx.sh

diff --git a/dockerfiles/pytorch/Dockerfile b/dockerfiles/pytorch/Dockerfile
index 8e4c4d35..d2f53ae2 100644
--- a/dockerfiles/pytorch/Dockerfile
+++ b/dockerfiles/pytorch/Dockerfile
@@ -1,6 +1,9 @@
 ARG BASE_IMAGE=nvidia/cuda:12.1.0-devel-ubuntu22.04
 
 FROM $BASE_IMAGE
+
+ARG NEURONX=0
+
 SHELL ["/bin/bash", "-c"]
 
 LABEL maintainer="Hugging Face"
@@ -31,12 +34,12 @@ RUN apt-get update && \
     libsndfile1-dev \
     ffmpeg \
     && apt-get clean autoremove --yes \
-    && rm -rf /var/lib/{apt,dpkg,cache,log}
+    && rm -rf /var/lib/{apt,cache,log}
+
 # Copying only necessary files as filtered by .dockerignore
 COPY . .
 
-# install wheel and setuptools
-RUN pip install --no-cache-dir -U pip ".[torch, st, diffusers]"
+RUN if [[ "$NEURONX" == "1" ]];then /bin/bash -c "./dockerfiles/pytorch/neuronx.sh";else pip install --no-cache-dir -U pip ".[torch, st, diffusers]";fi
 
 # copy application
 COPY src/huggingface_inference_toolkit huggingface_inference_toolkit
@@ -45,4 +48,6 @@ COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starle
 # copy entrypoint and change permissions
 COPY --chmod=0755 scripts/entrypoint.sh entrypoint.sh
 
-ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
\ No newline at end of file
+ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
+
+RUN apt-get update && apt-get install -y vim
diff --git a/dockerfiles/pytorch/neuronx.sh b/dockerfiles/pytorch/neuronx.sh
new file mode 100755
index 00000000..dd954804
--- /dev/null
+++ b/dockerfiles/pytorch/neuronx.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+set -e
+
+# Install system prerequisites
+apt-get update -y \
+  && apt-get install -y --no-install-recommends \
+    gnupg2 \
+    wget
+
+echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
+wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
+
+apt-get update -y \
+  && apt-get install -y --no-install-recommends \
+    aws-neuronx-dkms=2.* \
+    aws-neuronx-collectives=2.* \
+    aws-neuronx-runtime-lib=2.* \
+    aws-neuronx-tools=2.*
+
+pip install -U pip
+
+pip3 install neuronx-cc==2.12.68.0 \
+    torch-neuronx==1.13.1.1.13.1 \
+    transformers-neuronx==0.9.474 \
+    --extra-index-url=https://pip.repos.neuron.amazonaws.com
+
+pip3 install optimum[neuronx,diffusers]
+
+pip install ".[st,torch1]"
diff --git a/makefile b/makefile
index a9490428..d6c8a53e 100644
--- a/makefile
+++ b/makefile
@@ -26,5 +26,8 @@ inference-pytorch-gpu:
 inference-pytorch-cpu:
 	docker build --build-arg="BASE_IMAGE=ubuntu:22.04" -f dockerfiles/pytorch/Dockerfile -t integration-test-pytorch:cpu .
 
+inference-pytorch-neuron:
+	docker build --build-arg=BASE_IMAGE=ubuntu:22.04 --build-arg=NEURONX=1 -f dockerfiles/pytorch/Dockerfile -t integration-test-pytorch:neuron .
+
 stop-all:
-	docker stop $$(docker ps -a -q) && docker container prune --force
\ No newline at end of file
+	docker stop $$(docker ps -a -q) && docker container prune --force
diff --git a/setup.py b/setup.py
index 5e99df02..f8d669c0 100644
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,8 @@
 extras["st"] = ["sentence_transformers==2.4.0"]
 extras["diffusers"] = ["diffusers==0.26.3", "accelerate==0.27.2"]
 extras["torch"] = ["torch==2.2.0", "torchvision", "torchaudio"]
+# For neuronx
+extras["torch1"] = ["torch==1.13.1", "torchvision", "torchaudio"]
 extras["tensorflow"] = ["tensorflow"]
 extras["test"] = [
     "pytest==7.2.1",
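A quick way to sanity-check the image this first patch produces is to probe for the Neuron stack that neuronx.sh installs. The check below is illustrative only and not part of the series; the module names correspond to the pip packages installed above:

import importlib.util

# Probe the packages pulled in by dockerfiles/pytorch/neuronx.sh; for a
# top-level name, find_spec returns None instead of raising when it is absent.
for mod in ("torch_neuronx", "transformers_neuronx", "optimum"):
    spec = importlib.util.find_spec(mod)
    print(f"{mod}: {'found' if spec else 'MISSING'}")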
From ce71d1ec3474f709b1ac1425c797b24048e6699a Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 7 Mar 2024 11:44:00 +0100
Subject: [PATCH 02/23] minor: typo

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/webservice_starlette.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/huggingface_inference_toolkit/webservice_starlette.py b/src/huggingface_inference_toolkit/webservice_starlette.py
index 8bc68b2e..00b6967d 100644
--- a/src/huggingface_inference_toolkit/webservice_starlette.py
+++ b/src/huggingface_inference_toolkit/webservice_starlette.py
@@ -50,7 +50,7 @@ async def some_startup_task():
     else:
         raise ValueError(
             f"""Can't initialize model.
-            Please set env HF_MODEL_DIR or provider a HF_MODEL_ID.
+            Please set env HF_MODEL_DIR or provide a HF_MODEL_ID.
             Provided values are:
             HF_MODEL_DIR: {HF_MODEL_DIR} and HF_MODEL_ID:{HF_MODEL_ID}"""
         )

From a21ed5bb4c2721618ca5de8d313ddff4158b2010 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 7 Mar 2024 11:44:21 +0100
Subject: [PATCH 03/23] Build neuron image

Signed-off-by: Raphael Glon
---
 .github/workflows/build-container.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/build-container.yaml b/.github/workflows/build-container.yaml
index fe12fbf6..3ad8ad5f 100644
--- a/.github/workflows/build-container.yaml
+++ b/.github/workflows/build-container.yaml
@@ -34,6 +34,16 @@ jobs:
       TAILSCALE_AUTHKEY: ${{ secrets.TAILSCALE_AUTHKEY }}
       REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
       REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
+  starlette-pytorch-neuron:
+    uses: ./.github/workflows/docker-build-action.yaml
+    with:
+      image: inference-pytorch-neuron
+      dockerfile: dockerfiles/pytorch/Dockerfile
+      build_args: "BASE_IMAGE=ubuntu:22.04,NEURONX=1"
+    secrets:
+      TAILSCALE_AUTHKEY: ${{ secrets.TAILSCALE_AUTHKEY }}
+      REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
+      REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
   starlette-tensorflow-cpu:
     uses: ./.github/workflows/docker-build-action.yaml
     with:

From cfe316e67cba17b6733bd7a0985d776c18899331 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 7 Mar 2024 11:48:13 +0100
Subject: [PATCH 04/23] ci: tmp change build tag for tests

Signed-off-by: Raphael Glon
---
 .github/workflows/docker-build-action.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker-build-action.yaml b/.github/workflows/docker-build-action.yaml
index 62cba961..37889934 100644
--- a/.github/workflows/docker-build-action.yaml
+++ b/.github/workflows/docker-build-action.yaml
@@ -64,8 +64,8 @@ jobs:
           context: ${{ inputs.context }}
           build-args: ${{ inputs.build_args }}
           file: ${{ inputs.context }}/${{ inputs.dockerfile }}
-          tags: ${{ inputs.repository }}/${{ inputs.image }}:sha-${{ env.GITHUB_SHA_SHORT }},${{ inputs.repository }}/${{ inputs.image }}:latest
-
+          # tags: ${{ inputs.repository }}/${{ inputs.image }}:sha-${{ env.GITHUB_SHA_SHORT }},${{ inputs.repository }}/${{ inputs.image }}:latest
+          tags: ${{ inputs.repository }}/${{ inputs.image }}:testraph
       - name: Tailscale Wait
         if: ${{ failure() || runner.debug == '1' }}
        uses: huggingface/tailscale-action@v1
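For context, the build_args input used by the new starlette-pytorch-neuron job is a comma-separated list of KEY=VALUE pairs that the reusable workflow hands to the docker build action. A hypothetical helper showing the mapping (the real splitting is presumably done inside the build action, not by code like this):

def to_build_flags(build_args: str) -> list:
    # Expand "K1=V1,K2=V2" into docker CLI flags (illustrative only).
    flags = []
    for pair in build_args.split(","):
        flags += ["--build-arg", pair.strip()]
    return flags

print(to_build_flags("BASE_IMAGE=ubuntu:22.04,NEURONX=1"))
# ['--build-arg', 'BASE_IMAGE=ubuntu:22.04', '--build-arg', 'NEURONX=1']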
From 1756a6df7a24008fc8ba168795e868af3986cef9 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 11:04:17 +0200
Subject: [PATCH 05/23] wip stable diffusion for neuron

Signed-off-by: Raphael Glon
---
 .../diffusers_utils.py                     | 70 ++++++++++++++++---
 src/huggingface_inference_toolkit/utils.py | 25 +++++--
 2 files changed, 82 insertions(+), 13 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 521a85df..75227cf1 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -1,7 +1,9 @@
 import importlib.util
 import logging
+import os
 
 from transformers.utils.import_utils import is_torch_bf16_gpu_available
+from optimum import neuron
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
@@ -49,14 +51,64 @@ def __call__(
         out = self.pipeline(prompt, num_images_per_prompt=1, **kwargs)
         return out.images[0]
 
-
-DIFFUSERS_TASKS = {
-    "text-to-image": IEAutoPipelineForText2Image,
-}
-
-
-def get_diffusers_pipeline(task=None, model_dir=None, device=-1, **kwargs):
+#
+# DIFFUSERS_TASKS = {
+#     "text-to-image": [NeuronStableDiffusionXLPipeline],
+# }
+
+
+def load_optimum_diffusion_pipeline(task, model_dir):
+
+    # Step 1: load config and look for _class_name
+    try:
+        config = StableDiffusionPipeline.load_config(pretrained_model_name_or_path=model_dir)
+    except OSError as e:
+        logger.error("Unable to load config file for repository %s", model_dir)
+        logger.exception(e)
+        raise
+
+    pipeline_class_name = config['_class_name']
+
+    logger.debug("Repository pipeline class name %s", pipeline_class_name)
+    if pipeline_class_name.contains("Diffusion") and pipeline_class_name.contains("XL"):
+        if task == "image-to-image":
+            pipeline_class = neuron.NeuronStableDiffusionXLImg2ImgPipeline
+        else:
+            pipeline_class = neuron.NeuronStableDiffusionXLPipeline
+    else:
+        if task == "image-to-image":
+            pipeline_class = neuron.NeuronStableDiffusionImg2ImgPipeline
+        else:
+            pipeline_class = neuron.NeuronStableDiffusionPipeline
+
+    logger.debug("Pipeline class %s", pipeline_class.__class__)
+
+    # if is neuron model, no need for additional kwargs
+    if pipeline_class_name.contains("Neuron"):
+        kwargs = {}
+    else:
+        # Model will be compiled and exported on the flight as the cached models cause a performance drop
+        # for diffusion models, unless otherwise specified through an explicit env variable
+
+        # Image shapes need to be frozen at loading/compilation time
+        compiler_args = {
+            "auto_cast": "matmul",
+            "auto_cast_type": "bf16",
+            "inline_weights_to_neff": os.environ.get("INLINE_WEIGHTS_TO_NEFF",
+                                                     "false").lower() in ["false", "no", "0"],
+            "data_parallel_mode": os.environ.get("DATA_PARALLEL_MODE", "unet")
+        }
+        input_shapes = {"batch_size": 1,
+                        "height": int(os.environ("IMAGE_HEIGHT", 512)),
+                        "width": int(os.environ("IMAGE_WIDTH", 512))}
+        kwargs = {**compiler_args, **input_shapes, "export": True}
+
+    # In the second case, exporting can take a huge amount of time, which makes endpoints not a really suited solution
+    # at least as long as the cache is not really an option for diffusion
+    return pipeline_class(kwargs)
+
+
+def get_diffusers_pipeline(task=None, model_dir=None, **kwargs):
     """Get a pipeline for Diffusers models."""
-    device = "cuda" if device == 0 else "cpu"
-    pipeline = DIFFUSERS_TASKS[task](model_dir=model_dir, device=device)
+    pipeline = load_optimum_diffusion_pipeline(task=task, model_dir=model_dir)
     return pipeline
diff --git a/src/huggingface_inference_toolkit/utils.py b/src/huggingface_inference_toolkit/utils.py
index 1570317b..4cab2a39 100644
--- a/src/huggingface_inference_toolkit/utils.py
+++ b/src/huggingface_inference_toolkit/utils.py
@@ -33,7 +33,7 @@
 
 
 def is_optimum_available():
-    return False
+    return True
     # TODO: change when supported
     # return _optimum_available
 
@@ -229,7 +229,7 @@ def get_pipeline(
     create pipeline class for a specific task based on local saved model
     """
     device = get_device()
-    logger.info(f"Using device { 'GPU' if device == 0 else 'CPU'}")
+    logger.info(f"Using device { 'GPU' if device == 0 else 'CPU/TPU/Neuron...)'}")
 
     if task is None:
         raise EnvironmentError(
@@ -265,11 +265,10 @@ def get_pipeline(
             device=device,
             **kwargs
         )
-    elif is_diffusers_available() and task == "text-to-image":
+    elif is_diffusers_available() and task in ["text-to-image", "image-to-image"]:
         hf_pipeline = get_diffusers_pipeline(
             task=task,
             model_dir=model_dir,
-            device=device,
             **kwargs
         )
     else:
@@ -308,3 +307,21 @@ def convert_params_to_int_or_bool(params):
         if v == "true":
             params[k] = True
     return params
+
+
+# def local_model_card(model_dir: str) -> Optional[ModelCard]:
+#
+#     logger.debug("Rebuilding offline model info for repo %s", model_dir)
+#
+#     # Let's rebuild some partial model info from what we see in cache, info extracted should be enough
+#     # for most use cases
+#
+#     card_path = Path(model_dir) / "README.md"
+#     if not card_path.exists():
+#         logger.debug("Unable to build model info for directory %s", model_dir)
+#         return None
+#
+#     logger.debug("Loading model card from model readme %s", card_path)
+#     model_card = ModelCard.load(card_path)
+#     logger.info("Local repo %s, model card data %s", model_dir, model_card.data.to_dict())
+#     return model_card
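The routing in load_optimum_diffusion_pipeline hinges on the _class_name field that diffusers serializes into a repository's model_index.json. A minimal sketch of that probe (the repo id is only an example):

from diffusers import StableDiffusionPipeline

# load_config works for any diffusers pipeline repo, local path or Hub id,
# and returns the dict stored in model_index.json.
config = StableDiffusionPipeline.load_config("stabilityai/stable-diffusion-xl-base-1.0")
print(config["_class_name"])  # e.g. "StableDiffusionXLPipeline" -> routed to the XL Neuron classes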
From ca5f79c9d2609b43ff8688b03e272a3d0523b574 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 15:43:55 +0200
Subject: [PATCH 06/23] wip

Signed-off-by: Raphael Glon
---
 dockerfiles/pytorch/neuronx.sh | 15 +++++++++------
 setup.py                       |  6 +++---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/dockerfiles/pytorch/neuronx.sh b/dockerfiles/pytorch/neuronx.sh
index dd954804..38951756 100755
--- a/dockerfiles/pytorch/neuronx.sh
+++ b/dockerfiles/pytorch/neuronx.sh
@@ -8,7 +8,10 @@ apt-get update -y \
     gnupg2 \
     wget
 
-echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
+. /etc/os-release
+tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
+deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
+EOF
 wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -17,4 +17,5 @@
     "transformers[sklearn,sentencepiece, audio, vision]>=4.36.0",
+    "huggingface_hub==0.23.0",
     "orjson",
     # vision
     "Pillow",
@@ -40,7 +40,7 @@
 extras["diffusers"] = ["diffusers==0.26.3", "accelerate==0.27.2"]
 extras["torch"] = ["torch==2.2.0", "torchvision", "torchaudio"]
 # For neuronx
-extras["torch1"] = ["torch==1.13.1", "torchvision", "torchaudio"]
+extras["torch-neuronx"] = ["torch-neuronx", "torchvision", "torchaudio"]
 extras["tensorflow"] = ["tensorflow"]
 extras["test"] = [
     "pytest==7.2.1",
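The os-release sourcing added above keeps the apt entry in sync with the base image's Ubuntu codename instead of hard-coding jammy. A rough Python equivalent of that shell snippet, for illustration only:

def neuron_apt_line(path="/etc/os-release"):
    # Rebuild the sources.list line the script writes (illustrative sketch).
    fields = {}
    with open(path) as fh:
        for line in fh:
            if "=" in line:
                key, value = line.rstrip("\n").split("=", 1)
                fields[key] = value.strip('"')
    return f"deb https://apt.repos.neuron.amazonaws.com {fields['VERSION_CODENAME']} main"

print(neuron_apt_line())  # e.g. 'deb https://apt.repos.neuron.amazonaws.com jammy main'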
From 8b6364000d678a0d94b21c029a07f9f1e1586848 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 16:07:36 +0200
Subject: [PATCH 07/23] wip

Signed-off-by: Raphael Glon
---
 dockerfiles/pytorch/neuronx.sh | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/dockerfiles/pytorch/neuronx.sh b/dockerfiles/pytorch/neuronx.sh
index 38951756..fdefa001 100755
--- a/dockerfiles/pytorch/neuronx.sh
+++ b/dockerfiles/pytorch/neuronx.sh
@@ -23,11 +23,17 @@
 
 pip install -U pip
 
-pip3 install neuronx-cc==2.* \
-    torch-neuronx==2.* \
-    transformers-neuronx\
+# Taken from optimum neuron, tgi dockerfile
+pip3 install \
+    neuronx-cc==2.13.66.0 \
+    torch-neuronx==2.1.2.2.1.0 \
+    transformers-neuronx==0.10.0.21 \
     --extra-index-url=https://pip.repos.neuron.amazonaws.com
 
 pip3 install --extra-index-url=https://pip.repos.neuron.amazonaws.com optimum[neuronx,diffusers]
 
 pip install ".[st,torch-neuronx]"
+
+apt-get clean autoremove --yes
+
+rm -rf /var/lib/{apt,cache,log}
 fi

From 1c76d87af186cb8d6000e4e13951532b8db84f88 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 17:01:43 +0200
Subject: [PATCH 08/23] rm vim

Signed-off-by: Raphael Glon
---
 dockerfiles/pytorch/Dockerfile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dockerfiles/pytorch/Dockerfile b/dockerfiles/pytorch/Dockerfile
index d2f53ae2..d01f0df2 100644
--- a/dockerfiles/pytorch/Dockerfile
+++ b/dockerfiles/pytorch/Dockerfile
@@ -49,5 +49,3 @@ COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starle
 # copy entrypoint and change permissions
 COPY --chmod=0755 scripts/entrypoint.sh entrypoint.sh
 
 ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
-
-RUN apt-get update && apt-get install -y vim

From 0f0d1802ba6a97481b6011d5a653cd87420da579 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 17:13:08 +0200
Subject: [PATCH 09/23] wip update path

Signed-off-by: Raphael Glon
---
 dockerfiles/pytorch/neuronx.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dockerfiles/pytorch/neuronx.sh b/dockerfiles/pytorch/neuronx.sh
index fdefa001..bc3c6e84 100755
--- a/dockerfiles/pytorch/neuronx.sh
+++ b/dockerfiles/pytorch/neuronx.sh
@@ -37,3 +37,5 @@ apt-get clean autoremove --yes
 
 rm -rf /var/lib/{apt,cache,log}
 fi
+
+echo "PATH=\"$PATH:/opt/aws/neuron/bin\"" > /etc/environment

From 4749c57db4122c52fda3dec2bef137382d72a696 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 17:39:06 +0200
Subject: [PATCH 10/23] Fix

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/huggingface_inference_toolkit/utils.py b/src/huggingface_inference_toolkit/utils.py
index 4cab2a39..8309cdea 100644
--- a/src/huggingface_inference_toolkit/utils.py
+++ b/src/huggingface_inference_toolkit/utils.py
@@ -33,7 +33,7 @@
 
 
 def is_optimum_available():
-    return True
+    return False
     # TODO: change when supported
     # return _optimum_available
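The PATH line added by patch 9 matters because the aws-neuronx-tools package installs its CLI utilities under /opt/aws/neuron/bin. A standalone check (hypothetical, not part of the series) that the running process actually sees them:

import shutil

# neuron-ls and neuron-top ship with aws-neuronx-tools; which() returns None
# when /opt/aws/neuron/bin is missing from the process PATH.
for tool in ("neuron-ls", "neuron-top"):
    print(tool, "->", shutil.which(tool) or "not on PATH")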
From 72787a5763e64bf3bc47909a473b5fd086958e8c Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 17:39:34 +0200
Subject: [PATCH 11/23] PATH

Signed-off-by: Raphael Glon
---
 dockerfiles/pytorch/neuronx.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockerfiles/pytorch/neuronx.sh b/dockerfiles/pytorch/neuronx.sh
index bc3c6e84..9b8f35f4 100755
--- a/dockerfiles/pytorch/neuronx.sh
+++ b/dockerfiles/pytorch/neuronx.sh
@@ -38,4 +38,4 @@ apt-get clean autoremove --yes
 rm -rf /var/lib/{apt,cache,log}
 fi
 
-echo "PATH=\"$PATH:/opt/aws/neuron/bin\"" > /etc/environment
+echo "export PATH=\"$PATH:/opt/aws/neuron/bin\"" >> /root/.bashrc

From 6ba2b76c0a5c171c1225b1d196351c263facdabb Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 17:47:22 +0200
Subject: [PATCH 12/23] fix

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/diffusers_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 75227cf1..748ac65d 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -70,7 +70,7 @@ def load_optimum_diffusion_pipeline(task, model_dir):
     pipeline_class_name = config['_class_name']
 
     logger.debug("Repository pipeline class name %s", pipeline_class_name)
-    if pipeline_class_name.contains("Diffusion") and pipeline_class_name.contains("XL"):
+    if "Diffusion" in pipeline_class_name and "XL" in pipeline_class_name:
         if task == "image-to-image":
             pipeline_class = neuron.NeuronStableDiffusionXLImg2ImgPipeline
         else:
@@ -84,7 +84,7 @@ def load_optimum_diffusion_pipeline(task, model_dir):
     logger.debug("Pipeline class %s", pipeline_class.__class__)
 
     # if is neuron model, no need for additional kwargs
-    if pipeline_class_name.contains("Neuron"):
+    if "Neuron" in pipeline_class_name:
         kwargs = {}
     else:
         # Model will be compiled and exported on the flight as the cached models cause a performance drop
@@ -99,8 +99,8 @@ def load_optimum_diffusion_pipeline(task, model_dir):
             "data_parallel_mode": os.environ.get("DATA_PARALLEL_MODE", "unet")
         }
         input_shapes = {"batch_size": 1,
-                        "height": int(os.environ("IMAGE_HEIGHT", 512)),
-                        "width": int(os.environ("IMAGE_WIDTH", 512))}
+                        "height": int(os.environ.get("IMAGE_HEIGHT", 512)),
+                        "width": int(os.environ.get("IMAGE_WIDTH", 512))}
         kwargs = {**compiler_args, **input_shapes, "export": True}

From d0c37d6e91afff33e2466ed88ba99b63de7a8be7 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Tue, 14 May 2024 17:58:41 +0200
Subject: [PATCH 13/23] fix

Signed-off-by: Raphael Glon
---
 .../diffusers_utils.py | 31 +++++++------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 748ac65d..50e43cae 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -84,28 +84,21 @@ def load_optimum_diffusion_pipeline(task, model_dir):
     logger.debug("Pipeline class %s", pipeline_class.__class__)
 
     # if is neuron model, no need for additional kwargs
-    if "Neuron" in pipeline_class_name:
-        kwargs = {}
-    else:
-        # Model will be compiled and exported on the flight as the cached models cause a performance drop
-        # for diffusion models, unless otherwise specified through an explicit env variable
-
-        # Image shapes need to be frozen at loading/compilation time
-        compiler_args = {
-            "auto_cast": "matmul",
-            "auto_cast_type": "bf16",
-            "inline_weights_to_neff": os.environ.get("INLINE_WEIGHTS_TO_NEFF",
-                                                     "false").lower() in ["false", "no", "0"],
-            "data_parallel_mode": os.environ.get("DATA_PARALLEL_MODE", "unet")
-        }
-        input_shapes = {"batch_size": 1,
-                        "height": int(os.environ.get("IMAGE_HEIGHT", 512)),
-                        "width": int(os.environ.get("IMAGE_WIDTH", 512))}
-        kwargs = {**compiler_args, **input_shapes, "export": True}
+    compiler_args = {
+        "auto_cast": "matmul",
+        "auto_cast_type": "bf16",
+        "inline_weights_to_neff": os.environ.get("INLINE_WEIGHTS_TO_NEFF",
+                                                 "false").lower() in ["false", "no", "0"],
+        "data_parallel_mode": os.environ.get("DATA_PARALLEL_MODE", "unet")
+    }
+    input_shapes = {"batch_size": 1,
+                    "height": int(os.environ.get("IMAGE_HEIGHT", 512)),
+                    "width": int(os.environ.get("IMAGE_WIDTH", 512))}
+    kwargs = {**compiler_args, **input_shapes, "export": True}
 
     # In the second case, exporting can take a huge amount of time, which makes endpoints not a really suited solution
     # at least as long as the cache is not really an option for diffusion
-    return pipeline_class(kwargs)
+    return pipeline_class.from_pretrained(model_dir, **kwargs)
 
 
 def get_diffusers_pipeline(task=None, model_dir=None, **kwargs):
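Outside the toolkit, the export path that from_pretrained now follows can be exercised directly against optimum-neuron. A hedged usage sketch (the model id and shapes are examples, and compilation is slow):

import os
from optimum.neuron import NeuronStableDiffusionPipeline

compiler_args = {"auto_cast": "matmul", "auto_cast_type": "bf16"}
input_shapes = {"batch_size": 1,
                "height": int(os.environ.get("IMAGE_HEIGHT", 512)),
                "width": int(os.environ.get("IMAGE_WIDTH", 512))}

# export=True compiles the model for Neuron with the shapes frozen above.
pipe = NeuronStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", export=True, **compiler_args, **input_shapes
)
image = pipe("a photo of an astronaut riding a horse").images[0]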
From 957b1833e515ab2fb28bd8b67efdd4e9ed21c4a1 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 10:20:38 +0200
Subject: [PATCH 14/23] wip

Signed-off-by: Raphael Glon
---
 .../diffusers_utils.py | 38 +++++++++++++++++--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 50e43cae..9e9935a8 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -1,4 +1,5 @@
 import importlib.util
+import json
 import logging
 import os
 
@@ -57,6 +58,23 @@
 # }
 
 
+def _is_neuron_model(model_dir):
+    for root, _, files in os.walk(model_dir):
+        for f in files:
+            if f == "config.json":
+                filename = os.path.join(root, f)
+                with open(filename, 'r') as fh:
+                    try:
+                        config = json.load(fh)
+                    except Exception as e:
+                        logger.warning("Unable to load config %s properly, skipping", filename)
+                        logger.exception(e)
+                        continue
+                    if 'neuron' in config.keys():
+                        return True
+    return False
+
+
 def load_optimum_diffusion_pipeline(task, model_dir):
@@ -104,10 +122,25 @@ def load_optimum_diffusion_pipeline(task, model_dir):
         "data_parallel_mode": os.environ.get("DATA_PARALLEL_MODE", "unet")
     }
     input_shapes = {"batch_size": 1,
                     "height": int(os.environ.get("IMAGE_HEIGHT", 512)),
                     "width": int(os.environ.get("IMAGE_WIDTH", 512))}
-    kwargs = {**compiler_args, **input_shapes, "export": True}
+    export_kwargs = {**compiler_args, **input_shapes, "export": True}
+
+    # if is neuron model, no need for additional kwargs, any info lies within the repo
+    is_neuron_m = _is_neuron_model(model_dir)
+    if is_neuron_m:
+        kwargs = {}
+        fallback_kwargs = export_kwargs
+    else:
+        kwargs = export_kwargs
+        fallback_kwargs = {}
 
     # In the second case, exporting can take a huge amount of time, which makes endpoints not a really suited solution
     # at least as long as the cache is not really an option for diffusion
-    return pipeline_class.from_pretrained(model_dir, **kwargs)
+    try:
+        logger.info("Loading model %s with kwargs %s", model_dir, kwargs)
+        return pipeline_class.from_pretrained(model_dir, **kwargs)
+    except Exception as e:
+        logger.error("Unable to load model %s properly falling back to kwargs %s", model_dir, fallback_kwargs)
+        logger.exception(e)
+        return pipeline_class.from_pretrained(model_dir, **fallback_kwargs)
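For reference, the marker that _is_neuron_model scans for looks roughly like this in a config.json produced by an optimum-neuron export. The keys inside the block are illustrative; only the presence of the top-level "neuron" entry is actually tested:

neuron_config_example = {
    "_class_name": "StableDiffusionPipeline",
    "neuron": {
        "auto_cast": "matmul",
        "auto_cast_type": "bf16",
        "static_batch_size": 1,
        "static_height": 512,
        "static_width": 512,
    },
}
assert "neuron" in neuron_config_example  # the whole detection test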
From 247d67be9e613a9646cbd1ffdf8601da4e0f8e6d Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 15:42:46 +0200
Subject: [PATCH 15/23] wip

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/diffusers_utils.py      | 3 ++-
 src/huggingface_inference_toolkit/webservice_starlette.py | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 9e9935a8..5496792e 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -5,6 +5,7 @@
 
 from transformers.utils.import_utils import is_torch_bf16_gpu_available
 from optimum import neuron
+from optimum.neuron.modeling_base import OptimizedModel
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
@@ -75,7 +76,7 @@ def _is_neuron_model(model_dir):
     return False
 
 
-def load_optimum_diffusion_pipeline(task, model_dir):
+def load_optimum_diffusion_pipeline(task: str, model_dir: str) -> OptimizedModel:
 
     # Step 1: load config and look for _class_name
     try:
diff --git a/src/huggingface_inference_toolkit/webservice_starlette.py b/src/huggingface_inference_toolkit/webservice_starlette.py
index 00b6967d..e06ab447 100644
--- a/src/huggingface_inference_toolkit/webservice_starlette.py
+++ b/src/huggingface_inference_toolkit/webservice_starlette.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from pathlib import Path
 from time import perf_counter
 
@@ -23,7 +24,7 @@
 
 
 def config_logging(level=logging.INFO):
-    logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", datefmt="", level=level)
+    logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", datefmt="", level=level, force=True)
     # disable uvicorn access logs to hide /health
     uvicorn_access = logging.getLogger("uvicorn.access")
     uvicorn_access.disabled = True
@@ -31,7 +32,7 @@ def config_logging(level=logging.INFO):
     logging.getLogger("uvicorn").removeHandler(logging.getLogger("uvicorn").handlers[0])
 
 
-config_logging()
+config_logging(os.environ.get("LOG_LEVEL", logging.getLevelName(logging.INFO)))
 
 logger = logging.getLogger(__name__)

From e6c4e1233b9cb2bc7a1d5d5a52a4dbd8c915a551 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 15:52:00 +0200
Subject: [PATCH 16/23] version

Signed-off-by: Raphael Glon
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 3b96fdcd..98543268 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
     "wheel==0.42.0",
     "setuptools==69.1.0",
     "cmake==3.28.3",
-    "transformers[sklearn,sentencepiece, audio, vision]>=4.36.0",
+    "transformers[sklearn,sentencepiece, audio, vision]>=4.38.2",
     "huggingface_hub==0.23.0",
     "orjson",
     # vision

From f77cde30998551864eaeb1f0c15bef347cd5905a Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 16:06:41 +0200
Subject: [PATCH 17/23] cleanup

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/utils.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/src/huggingface_inference_toolkit/utils.py b/src/huggingface_inference_toolkit/utils.py
index 8309cdea..baf393b6 100644
--- a/src/huggingface_inference_toolkit/utils.py
+++ b/src/huggingface_inference_toolkit/utils.py
@@ -307,21 +307,3 @@ def convert_params_to_int_or_bool(params):
         if v == "true":
             params[k] = True
     return params
-
-
-# def local_model_card(model_dir: str) -> Optional[ModelCard]:
-#
-#     logger.debug("Rebuilding offline model info for repo %s", model_dir)
-#
-#     # Let's rebuild some partial model info from what we see in cache, info extracted should be enough
-#     # for most use cases
-#
-#     card_path = Path(model_dir) / "README.md"
-#     if not card_path.exists():
-#         logger.debug("Unable to build model info for directory %s", model_dir)
-#         return None
-#
-#     logger.debug("Loading model card from model readme %s", card_path)
-#     model_card = ModelCard.load(card_path)
-#     logger.info("Local repo %s, model card data %s", model_dir, model_card.data.to_dict())
-#     return model_card
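The LOG_LEVEL passthrough from patch 15 works because logging.basicConfig accepts a level name string as well as a numeric level. A self-contained illustration:

import logging
import os

# force=True mirrors the patched config_logging: it replaces handlers that an
# earlier basicConfig call (e.g. at import time) already installed.
logging.basicConfig(level=os.environ.get("LOG_LEVEL", "INFO"), force=True)
logging.getLogger(__name__).debug("only visible when LOG_LEVEL=DEBUG")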
From 73eaba7c1e8a91916840a77ade2374eeaee1b520 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 16:43:52 +0200
Subject: [PATCH 18/23] cleanup

Signed-off-by: Raphael Glon
---
 .../diffusers_utils.py                     | 24 +++++++++++++------
 src/huggingface_inference_toolkit/utils.py | 22 +++++++++++++++--
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 5496792e..65a7e8ec 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -7,6 +7,7 @@
 from optimum import neuron
 from optimum.neuron.modeling_base import OptimizedModel
 
+
 logger = logging.getLogger(__name__)
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
 
@@ -53,10 +54,10 @@ def __call__(
         out = self.pipeline(prompt, num_images_per_prompt=1, **kwargs)
         return out.images[0]
 
-#
-# DIFFUSERS_TASKS = {
-#     "text-to-image": [NeuronStableDiffusionXLPipeline],
-# }
+
+DIFFUSERS_TASKS = {
+    "text-to-image": IEAutoPipelineForText2Image,
+}
 
 
 def _is_neuron_model(model_dir):
@@ -76,7 +77,7 @@ def _is_neuron_model(model_dir):
     return False
 
 
-def load_optimum_diffusion_pipeline(task: str, model_dir: str) -> OptimizedModel:
+def neuron_diffusion_pipeline(task: str, model_dir: str) -> OptimizedModel:
 
     # Step 1: load config and look for _class_name
     try:
@@ -134,7 +135,16 @@ def load_optimum_diffusion_pipeline(task: str, model_dir: str) -> OptimizedModel:
         return pipeline_class.from_pretrained(model_dir, **fallback_kwargs)
 
 
-def get_diffusers_pipeline(task=None, model_dir=None, **kwargs):
+def get_diffusers_pipeline(task=None, model_dir=None, device=-1, **_kwargs):
     """Get a pipeline for Diffusers models."""
-    pipeline = load_optimum_diffusion_pipeline(task=task, model_dir=model_dir)
+    if device == 0:
+        device = "cuda"
+    elif device is not None:
+        device = "cpu"
+    # None case: neuronx, no need to specify device
+
+    if device is not None:
+        pipeline = DIFFUSERS_TASKS[task](model_dir=model_dir, device=device)
+    else:
+        pipeline = neuron_diffusion_pipeline(task=task, model_dir=model_dir)
     return pipeline
diff --git a/src/huggingface_inference_toolkit/utils.py b/src/huggingface_inference_toolkit/utils.py
index baf393b6..afa154e7 100644
--- a/src/huggingface_inference_toolkit/utils.py
+++ b/src/huggingface_inference_toolkit/utils.py
@@ -30,6 +30,14 @@
 import torch
 
 _optimum_available = importlib.util.find_spec("optimum") is not None
+if _optimum_available:
+    _optimum_neuron = importlib.util.find_spec("optimum.neuron") is not None
+    from optimum.neuron.modeling_decoder import get_available_cores as get_neuron_cores
+else:
+    _optimum_neuron = False
+
+    def get_neuron_cores():
+        return 0
 
 
 def is_optimum_available():
@@ -38,6 +46,10 @@ def is_optimum_available():
     # return _optimum_available
 
 
+def is_optimum_neuron_available():
+    return _optimum_neuron
+
+
 framework2weight = {
     "pytorch": "pytorch*",
     "tensorflow": "tf*",
@@ -215,6 +227,8 @@ def get_device():
 
     if gpu:
         return 0
+    elif get_neuron_cores() > 0:
+        return None
     else:
         return -1
 
@@ -229,7 +243,10 @@ def get_pipeline(
     create pipeline class for a specific task based on local saved model
     """
     device = get_device()
-    logger.info(f"Using device { 'GPU' if device == 0 else 'CPU/TPU/Neuron...)'}")
+    logger.info(f"Using device { 'GPU' if device == 0 else 'Neuron' if device is None else 'CPU'}")
+
+    if device is None and task != "text-to-image":
+        raise Exception("This container only supports text-to-image task with neurons")
 
     if task is None:
         raise EnvironmentError(
@@ -265,10 +282,11 @@ def get_pipeline(
             device=device,
             **kwargs
         )
-    elif is_diffusers_available() and task in ["text-to-image", "image-to-image"]:
+    elif is_diffusers_available() and task == "text-to-image":
         hf_pipeline = get_diffusers_pipeline(
             task=task,
             model_dir=model_dir,
+            device=device
             **kwargs
         )
     else:
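The device convention after this patch is a tri-state: 0 for GPU, None for Neuron (no explicit device placement), -1 for CPU. A tiny sketch of how downstream code can branch on it:

def describe_device(device):
    # mirrors the logger.info ternary introduced above
    return "GPU" if device == 0 else "Neuron" if device is None else "CPU"

assert describe_device(0) == "GPU"
assert describe_device(None) == "Neuron"
assert describe_device(-1) == "CPU"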
From 429d25f3b375e6e1f611b69369bbbc5cb65d60b3 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 16:51:25 +0200
Subject: [PATCH 19/23] quality check

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/diffusers_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 65a7e8ec..1f255bf9 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -3,10 +3,9 @@
 import logging
 import os
 
-from transformers.utils.import_utils import is_torch_bf16_gpu_available
 from optimum import neuron
 from optimum.neuron.modeling_base import OptimizedModel
-
+from transformers.utils.import_utils import is_torch_bf16_gpu_available
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)

From 179dfc20c7db6e1e600db1cf6fbccaf5f9d0f263 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 17:02:39 +0200
Subject: [PATCH 20/23] fix tests

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/diffusers_utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 1f255bf9..cfbbec43 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -3,14 +3,13 @@
 import logging
 import os
 
-from optimum import neuron
-from optimum.neuron.modeling_base import OptimizedModel
 from transformers.utils.import_utils import is_torch_bf16_gpu_available
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
 
 _diffusers = importlib.util.find_spec("diffusers") is not None
+_optimum_neuron = importlib.util.find_spec("optimum.neuron") is not None
 
 
 def is_diffusers_available():
@@ -22,6 +21,11 @@ def is_diffusers_available():
 from diffusers import AutoPipelineForText2Image, DPMSolverMultistepScheduler, StableDiffusionPipeline
 
 
+if _optimum_neuron:
+    from optimum import neuron
+    from optimum.neuron.modeling_base import OptimizedModel
+
+
 class IEAutoPipelineForText2Image:
     def __init__(self, model_dir: str, device: str = None):  # needs "cuda" for GPU
         dtype = torch.float32
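One subtlety with the module probe above: importlib.util.find_spec imports the parent package of a dotted name, so probing "optimum.neuron" on a machine without optimum raises ModuleNotFoundError instead of returning None. A standalone illustration of the safe pattern:

import importlib.util

def safe_find(name: str) -> bool:
    try:
        return importlib.util.find_spec(name) is not None
    except ModuleNotFoundError:
        return False

print(safe_find("optimum.neuron"))  # False rather than a crash when optimum is absent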
From 2e21889f73ce631b3ac0e1eb1b20484155ac988b Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 17:17:30 +0200
Subject: [PATCH 21/23] fix

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/diffusers_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index cfbbec43..8b4c1491 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -9,7 +9,11 @@
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
 
 _diffusers = importlib.util.find_spec("diffusers") is not None
-_optimum_neuron = importlib.util.find_spec("optimum.neuron") is not None
+_optimum = importlib.util.find_spec("optimum") is not None
+if _optimum:
+    _optimum_neuron = importlib.util.find_spec("optimum.neuron") is not None
+else:
+    _optimum_neuron = False
 
 
 def is_diffusers_available():

From 41563f4d323d06b3442ababba49a1ec363c66756 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 17:26:19 +0200
Subject: [PATCH 22/23] fix

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/diffusers_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 8b4c1491..21795e7a 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -27,7 +27,6 @@ def is_diffusers_available():
 
 if _optimum_neuron:
     from optimum import neuron
-    from optimum.neuron.modeling_base import OptimizedModel
 
 
 class IEAutoPipelineForText2Image:
@@ -84,7 +83,7 @@ def _is_neuron_model(model_dir):
     return False
 
 
-def neuron_diffusion_pipeline(task: str, model_dir: str) -> OptimizedModel:
+def neuron_diffusion_pipeline(task: str, model_dir: str):
 
     # Step 1: load config and look for _class_name
     try:

From 64530263879465ccb2de18bf8d55313d7045d886 Mon Sep 17 00:00:00 2001
From: Raphael Glon
Date: Thu, 16 May 2024 19:01:15 +0200
Subject: [PATCH 23/23] fix

Signed-off-by: Raphael Glon
---
 src/huggingface_inference_toolkit/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/huggingface_inference_toolkit/utils.py b/src/huggingface_inference_toolkit/utils.py
index afa154e7..65e3a6b4 100644
--- a/src/huggingface_inference_toolkit/utils.py
+++ b/src/huggingface_inference_toolkit/utils.py
@@ -286,7 +286,7 @@ def get_pipeline(
         hf_pipeline = get_diffusers_pipeline(
             task=task,
             model_dir=model_dir,
-            device=device
+            device=device,
             **kwargs
         )
     else:
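With the series applied end to end, a Neuron host takes the device=None branch and serves text-to-image through the optimum-neuron pipeline. A hedged usage sketch (the local model directory is an example; get_pipeline resolves the device itself):

from huggingface_inference_toolkit.utils import get_pipeline

pipe = get_pipeline(task="text-to-image", model_dir="/repository")
result = pipe("a watercolor lighthouse at dawn")  # a PIL image or pipeline output, depending on the branch taken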