18 changes: 17 additions & 1 deletion common.hpp
@@ -277,6 +277,7 @@ class CrossAttention : public GGMLBlock {
int64_t context_dim;
int64_t n_head;
int64_t d_head;
bool xtra_dim = false;

public:
CrossAttention(int64_t query_dim,
@@ -289,6 +290,11 @@ class CrossAttention : public GGMLBlock {
context_dim(context_dim) {
int64_t inner_dim = d_head * n_head;

if (context_dim == 320 && d_head == 320) {
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
xtra_dim = true;
context_dim = 1024;
}
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
@@ -313,10 +319,20 @@ class CrossAttention : public GGMLBlock {
int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head;

auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]

if (xtra_dim) {
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
context->ne[0] = 1024; // patch dim
}

auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]

if (xtra_dim) {
context->ne[0] = 320; // reset dim to orig
}

x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]

x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
23 changes: 20 additions & 3 deletions docs/distilled_sd.md
@@ -109,9 +109,11 @@ torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```


### SDXS-512
### SDXS-512-DreamShaper

Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part.
Another very tiny and **incredibly fast** model is SDXS-512-DreamShaper by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of the U-Net, and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE part.
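
To get a feel for what "one-step" means before converting anything for stable-diffusion.cpp, here is a minimal sketch on the diffusers side. It assumes the `diffusers` package and that the Hugging Face repo id is `IDKiro/sdxs-512-dreamshaper`, loadable as a plain `StableDiffusionPipeline` (neither detail is stated in this document); the single step with guidance disabled mirrors the mandatory ``` --cfg-scale 1 --steps 1 ``` flags used below.

```python
# Sketch only: one-step generation with SDXS-512-DreamShaper via diffusers.
# Assumptions: `diffusers` and `torch` are installed, the repo id is
# IDKiro/sdxs-512-dreamshaper, and it loads as a standard StableDiffusionPipeline.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "IDKiro/sdxs-512-dreamshaper", torch_dtype=torch.float16
).to("cuda")

image = pipe(
    "portrait of a lovely cat",
    num_inference_steps=1,  # a single denoising step
    guidance_scale=1.0,     # disables classifier-free guidance, like --cfg-scale 1
).images[0]
image.save("cat.png")
```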

#### Create your own safetensors file:

##### 1. Download the diffusers model from Hugging Face using Python:

@@ -127,11 +129,26 @@ python convert_diffusers_to_original_stable_diffusion.py \
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
```

##### Alternatively, you can download the model here:

* https://huggingface.co/akleine/sdxs-512/resolve/main/sdxs.safetensors
* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF/resolve/main/sdxs-512-tinySDdistilled_Q8_0.gguf
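
If you prefer to script the download rather than fetch the files by hand, a minimal sketch using the `huggingface_hub` package (an assumption; this document does not otherwise use it) could look like this, with the repo ids and filenames taken from the links above:

```python
# Sketch only: fetch the ready-made files from Hugging Face.
# Assumes `huggingface_hub` is installed (pip install huggingface_hub).
from huggingface_hub import hf_hub_download

safetensors_path = hf_hub_download(repo_id="akleine/sdxs-512",
                                   filename="sdxs.safetensors")
gguf_path = hf_hub_download(repo_id="concedo/sdxs-512-tinySDdistilled-GGUF",
                            filename="sdxs-512-tinySDdistilled_Q8_0.gguf")
print(safetensors_path, gguf_path)  # pass either path to sd-cli via -m
```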


##### 3. Run the model as follows:

```bash
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
--cfg-scale 1 --steps 1
```
Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.


### SDXS-512-0.9

Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is a *completely different* model. It is sometimes preferred, so try it for yourself. You can create a safetensors file as described in the previous section, using ``` ... from_pretrained("IDKiro/sdxs-512-0.9")```, or simply download a ready-to-run file from here:

* https://huggingface.co/akleine/sdxs-09/resolve/main/sdxs09.safetensors
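
If you would rather build the safetensors file yourself, a minimal sketch of that route (assuming the `diffusers` package and that the repo loads as a standard `StableDiffusionPipeline`; the exact loader used in step 1 above may differ) is:

```python
# Sketch only: download SDXS-512-0.9 in diffusers format and save it locally,
# then convert it with the same script as in the previous section.
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-0.9")
pipe.save_pretrained("sdxs09")

# Afterwards, on the command line (hypothetical output names):
#   python convert_diffusers_to_original_stable_diffusion.py \
#       --model_path sdxs09 --checkpoint_path sdxs09.safetensors --half --use_safetensors
```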

Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.
For this model, too, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory.
7 changes: 6 additions & 1 deletion model.cpp
@@ -1046,6 +1046,7 @@ SDVersion ModelLoader::get_sd_version() {
bool has_middle_block_1 = false;
bool has_output_block_311 = false;
bool has_output_block_71 = false;
bool has_attn_1024 = false;

for (auto& [name, tensor_storage] : tensor_storage_map) {
if (!(is_xl)) {
@@ -1111,6 +1112,10 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos) {
has_output_block_71 = true;
}
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) {
if (tensor_storage.ne[0] == 1024)
has_attn_1024 = true;
}
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
@@ -1193,7 +1198,7 @@ SDVersion ModelLoader::get_sd_version() {
return VERSION_SD2_INPAINT;
}
if (!has_middle_block_1) {
return VERSION_SD2_TINY_UNET;
return has_attn_1024 ? VERSION_SDXS_09 : VERSION_SD2_TINY_UNET;
}
return VERSION_SD2;
}
3 changes: 2 additions & 1 deletion model.h
@@ -29,6 +29,7 @@ enum SDVersion {
VERSION_SD2_INPAINT,
VERSION_SD2_TINY_UNET,
VERSION_SDXS,
VERSION_SDXS_09,
VERSION_SDXL,
VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX,
@@ -60,7 +61,7 @@ static inline bool sd_version_is_sd1(SDVersion version) {
}

static inline bool sd_version_is_sd2(SDVersion version) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) {
return true;
}
return false;
15 changes: 8 additions & 7 deletions stable-diffusion.cpp
@@ -31,7 +31,8 @@ const char* model_version_to_str[] = {
"SD 2.x",
"SD 2.x Inpaint",
"SD 2.x Tiny UNet",
"SDXS",
"SDXS (DS)",
"SDXS (0.9)",
"SDXL",
"SDXL Inpaint",
"SDXL Instruct-Pix2Pix",
@@ -413,7 +414,7 @@ class StableDiffusionGGML {
}

bool tae_preview_only = sd_ctx_params->tae_preview_only;
if (version == VERSION_SDXS) {
if (version == VERSION_SDXS || version == VERSION_SDXS_09) {
tae_preview_only = false;
}

@@ -593,7 +594,7 @@ class StableDiffusionGGML {
vae_backend = backend;
}

if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
if (!(use_tiny_autoencoder || version == VERSION_SDXS || version == VERSION_SDXS_09) || tae_preview_only) {
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
offload_params_to_cpu,
@@ -631,7 +632,7 @@ class StableDiffusionGGML {
first_stage_model->get_param_tensors(tensors, "first_stage_model");
}
}
if (use_tiny_autoencoder || version == VERSION_SDXS) {
if (use_tiny_autoencoder || version == VERSION_SDXS || version == VERSION_SDXS_09) {
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
tae_first_stage = std::make_shared<TinyVideoAutoEncoder>(vae_backend,
offload_params_to_cpu,
@@ -646,7 +647,7 @@ class StableDiffusionGGML {
"decoder.layers",
vae_decode_only,
version);
if (version == VERSION_SDXS) {
if (version == VERSION_SDXS || version == VERSION_SDXS_09) {
tae_first_stage->alloc_params_buffer();
tae_first_stage->get_param_tensors(tensors, "first_stage_model");
}
@@ -809,10 +810,10 @@ class StableDiffusionGGML {
unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
}
size_t vae_params_mem_size = 0;
if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
if (!(use_tiny_autoencoder || version == VERSION_SDXS || version == VERSION_SDXS_09) || tae_preview_only) {
vae_params_mem_size = first_stage_model->get_params_buffer_size();
}
if (use_tiny_autoencoder || version == VERSION_SDXS) {
if (use_tiny_autoencoder || version == VERSION_SDXS || version == VERSION_SDXS_09) {
if (use_tiny_autoencoder && !tae_first_stage->load_from_file(taesd_path, n_threads)) {
return false;
}
6 changes: 5 additions & 1 deletion unet.hpp
@@ -218,7 +218,7 @@ class UnetModelBlock : public GGMLBlock {
} else if (sd_version_is_unet_edit(version)) {
in_channels = 8;
}
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS || version == VERSION_SDXS_09) {
num_res_blocks = 1;
channel_mult = {1, 2, 4};
tiny_unet = true;
@@ -265,6 +265,10 @@ class UnetModelBlock : public GGMLBlock {
if (version == VERSION_SVD) {
return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
} else {
if (version == VERSION_SDXS_09 && n_head == 5) {
n_head = 1; // carry the special case of sdxs_09 into CrossAttention;
d_head = 320; // works as long as the product stays the same (5*64 == 1*320)
}
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
}
};