Stability-AI · akx · Jul 19, 2023 · Jul 18, 2023 · Jul 19, 2023 · Jul 19, 2023
diff --git a/README.md b/README.md
@@ -41,6 +41,8 @@ For training, we use [PyTorch Lightning](https://lightning.ai/docs/pytorch/stabl
 ## Installation:
 <a name="installation"></a>
 
+**NOTE:** This is tested under `python3.8` and `python3.10`. For other Python versions, you might encounter version conflicts.
+
 #### 1. Clone the repo
 
 ```shell
@@ -52,29 +54,18 @@ cd generative-models
 
 This is assuming you have navigated to the `generative-models` root after cloning it. 
 
-**NOTE:** This is tested under `python3.8` and `python3.10`. For other python versions, you might encounter version conflicts.
-
-
-**PyTorch 1.13** 
-
 ```shell
-# install required packages from pypi
-python3 -m venv .pt1
-source .pt1/bin/activate
-pip3 install wheel
-pip3 install -r requirements_pt13.txt
+python3 -m venv venv
+source venv/bin/activate
+pip install -U setuptools wheel
 ```
 
-**PyTorch 2.0** 
+Then, depending on your use case, choose a set of requirements to install.
 
-
-```shell
-# install required packages from pypi
-python3 -m venv .pt2
-source .pt2/bin/activate
-pip3 install wheel
-pip3 install -r requirements_pt2.txt
-```
+* `pip install -r requirements-demo-streamlit.txt`: Demo inference dependencies, enough to run the Streamlit demo
+* `pip install -r requirements-demo-minimal.txt`: Demo inference dependencies, enough to run the minimal txt2img script
+* `pip install -r requirements_pt2.txt`: PyTorch 2, including training dependencies
+* `pip install -r requirements_pt13.txt`: PyTorch 1.13, including training dependencies
 
 ## Packaging
 
@@ -93,7 +84,17 @@ You will find the built package in `dist/`. You can install the wheel with `pip
 Note that the package does **not** currently specify dependencies; you will need to install the required packages,
 depending on your use case and PyTorch version, manually.
 
-## Inference:
+## Inference
+
+### Minimal txt2img demo
+
+There is a minimal SDXL 0.9 text-to-image demo available as `txt2img.py`:
+
+```
+python txt2img.py --prompt "Big fluffy cat in a cereal bowl" --steps 25 --seed 1050
+```
+
+### Streamlit demo
 
 We provide a [streamlit](https://streamlit.io/) demo for text-to-image and image-to-image sampling in `scripts/demo/sampling.py`. The following models are currently supported:
 - [SD-XL 0.9-base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9)

diff --git a/requirements-demo-minimal.txt b/requirements-demo-minimal.txt
@@ -0,0 +1,8 @@
+einops
+kornia~=0.6.12
+omegaconf
+open-clip-torch
+pytorch-lightning~=2.0.5
+safetensors~=0.3.1
+torchvision~=0.15.2
+transformers~=4.31.0
diff --git a/requirements-demo-streamlit.txt b/requirements-demo-streamlit.txt
@@ -0,0 +1,4 @@
+-r requirements-demo-minimal.txt
+-e git+https://github.com/openai/CLIP.git@main#egg=clip
+invisible-watermark
+streamlit
diff --git a/sgm/models/diffusion.py b/sgm/models/diffusion.py
@@ -1,5 +1,5 @@
 import logging
-from contextlib import contextmanager
+from contextlib import contextmanager, nullcontext
 from typing import Any, Dict, List, Tuple, Union
 
 import pytorch_lightning as pl
@@ -14,6 +14,7 @@
 from ..util import (
     default,
     disabled_train,
+    get_default_device_name,
     get_obj_from_str,
     instantiate_from_config,
     log_txt_as_img,
@@ -117,16 +118,22 @@ def get_input(self, batch):
         # image tensors should be scaled to -1 ... 1 and in bchw format
         return batch[self.input_key]
 
+    def _first_stage_autocast_context(self):
+        device = get_default_device_name()
+        if device not in ("cpu", "cuda"):
+            return nullcontext()
+        return torch.autocast(device, enabled=not self.disable_first_stage_autocast)
+
     @torch.no_grad()
     def decode_first_stage(self, z):
         z = 1.0 / self.scale_factor * z
-        with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
+        with self._first_stage_autocast_context():
             out = self.first_stage_model.decode(z)
         return out
 
     @torch.no_grad()
     def encode_first_stage(self, x):
-        with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
+        with self._first_stage_autocast_context():
             z = self.first_stage_model.encode(x)
         z = self.scale_factor * z
         return z

diff --git a/sgm/modules/autoencoding/losses/__init__.py b/sgm/modules/autoencoding/losses/__init__.py
@@ -4,9 +4,6 @@
 import torch
 import torch.nn as nn
 from einops import rearrange
-from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
-from taming.modules.losses.lpips import LPIPS
-from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss
 
 from ....util import default, instantiate_from_config
 
@@ -30,6 +27,7 @@ def __init__(
         scale_tgt_to_input_size=False,
         perceptual_weight_on_inputs=0.0,
     ):
+        from taming.modules.losses.lpips import LPIPS  # late import to avoid extra dependency
         super().__init__()
         self.scale_input_to_tgt_size = scale_input_to_tgt_size
         self.scale_tgt_to_input_size = scale_tgt_to_input_size
@@ -105,6 +103,9 @@ def __init__(
         learn_logvar: bool = False,
         regularization_weights: Union[None, dict] = None,
     ):
+        from taming.modules.losses.lpips import LPIPS  # late import to avoid extra dependency
+        from taming.modules.discriminator.model import NLayerDiscriminator, weights_init  # late import to avoid extra dependency
+        from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss  # late import to avoid extra dependency
         super().__init__()
         self.dims = dims
         if self.dims > 2:

diff --git a/sgm/modules/diffusionmodules/loss.py b/sgm/modules/diffusionmodules/loss.py
@@ -3,7 +3,6 @@
 import torch
 import torch.nn as nn
 from omegaconf import ListConfig
-from taming.modules.losses.lpips import LPIPS
 
 from ...util import append_dims, instantiate_from_config
 
@@ -26,6 +25,8 @@ def __init__(
         self.offset_noise_level = offset_noise_level
 
         if type == "lpips":
+            from taming.modules.losses.lpips import LPIPS  # late import to avoid extra dependency
+
             self.lpips = LPIPS().eval()
 
         if not batch2model_keys:

diff --git a/sgm/modules/diffusionmodules/openaimodel.py b/sgm/modules/diffusionmodules/openaimodel.py
@@ -20,7 +20,7 @@
     timestep_embedding,
     zero_module,
 )
-from ...util import default, exists
+from ...util import default, exists, get_default_device_name
 
 logger = logging.getLogger(__name__)
 
@@ -1244,6 +1244,7 @@ def __init__(self, in_channels=3, model_channels=64):
                 ]
             )
 
+    device = get_default_device_name()
     model = UNetModel(
         use_checkpoint=True,
         image_size=64,
@@ -1258,8 +1259,8 @@ def __init__(self, in_channels=3, model_channels=64):
         use_linear_in_transformer=True,
         transformer_depth=1,
         legacy=False,
-    ).cuda()
-    x = th.randn(11, 4, 64, 64).cuda()
-    t = th.randint(low=0, high=10, size=(11,), device="cuda")
+    ).to(device)
+    x = th.randn(11, 4, 64, 64).to(device)
+    t = th.randint(low=0, high=10, size=(11,), device=device)
     o = model(x, t)
     print("done.")
diff --git a/sgm/modules/diffusionmodules/sampling.py b/sgm/modules/diffusionmodules/sampling.py
@@ -16,7 +16,7 @@
     to_neg_log_sigma,
     to_sigma,
 )
-from ...util import append_dims, default, instantiate_from_config
+from ...util import append_dims, default, instantiate_from_config, get_default_device_name
 
 DEFAULT_GUIDER = {"target": "sgm.modules.diffusionmodules.guiders.IdentityGuider"}
 
@@ -28,8 +28,10 @@ def __init__(
         num_steps: Union[int, None] = None,
         guider_config: Union[Dict, ListConfig, OmegaConf, None] = None,
         verbose: bool = False,
-        device: str = "cuda",
+        device: Union[str, None] = None,
     ):
+        if device is None:
+            device = get_default_device_name()
         self.num_steps = num_steps
         self.discretization = instantiate_from_config(discretization_config)
         self.guider = instantiate_from_config(

diff --git a/sgm/modules/diffusionmodules/sampling_utils.py b/sgm/modules/diffusionmodules/sampling_utils.py
@@ -1,5 +1,4 @@
 import torch
-from scipy import integrate
 
 from ...util import append_dims
 
@@ -10,6 +9,7 @@ def __call__(self, uncond, cond, scale):
 
 
 def linear_multistep_coeff(order, t, i, j, epsrel=1e-4):
+    from scipy import integrate  # late import to avoid extra dependency
     if order - 1 > i:
         raise ValueError(f"Order {order} too high for step {i}")
 

diff --git a/sgm/modules/encoders/modules.py b/sgm/modules/encoders/modules.py
@@ -30,6 +30,7 @@
     default,
     disabled_train,
     expand_dims_like,
+    get_default_device_name,
     instantiate_from_config,
 )
 
@@ -239,7 +240,9 @@ def forward(self, c):
             c = c[:, None, :]
         return c
 
-    def get_unconditional_conditioning(self, bs, device="cuda"):
+    def get_unconditional_conditioning(self, bs, device=None):
+        if device is None:
+            device = get_default_device_name()
         uc_class = (
             self.n_classes - 1
         )  # 1000 classes --> 0 ... 999, one extra class for ucg (class 1000)
@@ -264,9 +267,10 @@ class FrozenT5Embedder(AbstractEmbModel):
     """Uses the T5 transformer encoder for text"""
 
     def __init__(
-        self, version="google/t5-v1_1-xxl", device="cuda", max_length=77, freeze=True
+        self, version="google/t5-v1_1-xxl", device=None, max_length=77, freeze=True
     ):  # others are google/t5-v1_1-xl and google/t5-v1_1-xxl
         super().__init__()
+        device = device or get_default_device_name()
         self.tokenizer = T5Tokenizer.from_pretrained(version)
         self.transformer = T5EncoderModel.from_pretrained(version)
         self.device = device
@@ -307,9 +311,10 @@ class FrozenByT5Embedder(AbstractEmbModel):
     """
 
     def __init__(
-        self, version="google/byt5-base", device="cuda", max_length=77, freeze=True
+        self, version="google/byt5-base", device=None, max_length=77, freeze=True
     ):  # others are google/t5-v1_1-xl and google/t5-v1_1-xxl
         super().__init__()
+        device = device or get_default_device_name()
         self.tokenizer = ByT5Tokenizer.from_pretrained(version)
         self.transformer = T5EncoderModel.from_pretrained(version)
         self.device = device
@@ -351,14 +356,15 @@ class FrozenCLIPEmbedder(AbstractEmbModel):
     def __init__(
         self,
         version="openai/clip-vit-large-patch14",
-        device="cuda",
+        device=None,
         max_length=77,
         freeze=True,
         layer="last",
         layer_idx=None,
         always_return_pooled=False,
     ):  # clip-vit-base-patch32
         super().__init__()
+        device = device or get_default_device_name()
         assert layer in self.LAYERS
         self.tokenizer = CLIPTokenizer.from_pretrained(version)
         self.transformer = CLIPTextModel.from_pretrained(version)
@@ -419,14 +425,15 @@ def __init__(
         self,
         arch="ViT-H-14",
         version="laion2b_s32b_b79k",
-        device="cuda",
+        device=None,
         max_length=77,
         freeze=True,
         layer="last",
         always_return_pooled=False,
         legacy=True,
     ):
         super().__init__()
+        device = device or get_default_device_name()
         assert layer in self.LAYERS
         model, _, _ = open_clip.create_model_and_transforms(
             arch,
@@ -521,12 +528,13 @@ def __init__(
         self,
         arch="ViT-H-14",
         version="laion2b_s32b_b79k",
-        device="cuda",
+        device=None,
         max_length=77,
         freeze=True,
         layer="last",
     ):
         super().__init__()
+        device = device or get_default_device_name()
         assert layer in self.LAYERS
         model, _, _ = open_clip.create_model_and_transforms(
             arch, device=torch.device("cpu"), pretrained=version
@@ -591,7 +599,7 @@ def __init__(
         self,
         arch="ViT-H-14",
         version="laion2b_s32b_b79k",
-        device="cuda",
+        device=None,
         max_length=77,
         freeze=True,
         antialias=True,
@@ -602,6 +610,7 @@ def __init__(
         output_tokens=False,
     ):
         super().__init__()
+        device = device or get_default_device_name()
         model, _, _ = open_clip.create_model_and_transforms(
             arch,
             device=torch.device("cpu"),
@@ -747,11 +756,12 @@ def __init__(
         self,
         clip_version="openai/clip-vit-large-patch14",
         t5_version="google/t5-v1_1-xl",
-        device="cuda",
+        device=None,
         clip_max_length=77,
         t5_max_length=77,
     ):
         super().__init__()
+        device = device or get_default_device_name()
         self.clip_encoder = FrozenCLIPEmbedder(
             clip_version, device, max_length=clip_max_length
         )