From e7a8871ae8c5e9b15b45071b86cbdf61b37b349f Mon Sep 17 00:00:00 2001
From: akleine
Date: Thu, 5 Feb 2026 20:11:59 +0100
Subject: [PATCH 1/3] feat: add support for SDXS-09

Even though the name "SDXS-09" is similar to "SDXS", the two models are
completely different. For this reason, the existing SDXS was renamed to
"SDXS (DS)", where DS stands for the DreamShaper edition by IDKiro.
---
 common.hpp           | 16 ++++++++++++++++
 model.cpp            |  7 ++++++-
 model.h              |  3 ++-
 stable-diffusion.cpp | 15 ++++++++-------
 unet.hpp             |  6 +++++-
 5 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/common.hpp b/common.hpp
index d9c823df0..0f0f1bef0 100644
--- a/common.hpp
+++ b/common.hpp
@@ -277,6 +277,7 @@ class CrossAttention : public GGMLBlock {
     int64_t context_dim;
     int64_t n_head;
     int64_t d_head;
+    bool xtra_dim = false;
 
 public:
     CrossAttention(int64_t query_dim,
@@ -289,6 +290,11 @@ class CrossAttention : public GGMLBlock {
           context_dim(context_dim) {
         int64_t inner_dim = d_head * n_head;
 
+        if (context_dim == 320 && d_head == 320) {
+            // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
+            xtra_dim = true;
+            context_dim = 1024;
+        }
         blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
         blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
         blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
@@ -314,9 +320,19 @@ class CrossAttention : public GGMLBlock {
         int64_t inner_dim = d_head * n_head;
 
         auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
+
+        if (xtra_dim) {
+            // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
+            context->ne[0] = 1024; // patch dim
+        }
+
         auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
         auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
 
+        if (xtra_dim) {
+            context->ne[0] = 320; // reset dim to orig
+        }
+
         x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
 
         x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
diff --git a/model.cpp b/model.cpp
index 253dd25cd..8c5f1580f 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1046,6 +1046,7 @@ SDVersion ModelLoader::get_sd_version() {
     bool has_middle_block_1   = false;
     bool has_output_block_311 = false;
     bool has_output_block_71  = false;
+    bool has_attn_1024        = false;
 
     for (auto& [name, tensor_storage] : tensor_storage_map) {
         if (!(is_xl)) {
@@ -1111,6 +1112,10 @@ SDVersion ModelLoader::get_sd_version() {
         if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos) {
             has_output_block_71 = true;
         }
+        if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) {
+        if (tensor_storage.ne[0] == 1024)
+            has_attn_1024 = true;
+        }
         if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
             tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
             tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
@@ -1193,7 +1198,7 @@ SDVersion ModelLoader::get_sd_version() {
             return VERSION_SD2_INPAINT;
         }
         if (!has_middle_block_1) {
-            return VERSION_SD2_TINY_UNET;
+            return has_attn_1024 ? VERSION_SDXS_09 : VERSION_SD2_TINY_UNET;
         }
         return VERSION_SD2;
     }
diff --git a/model.h b/model.h
index e16ac3a07..5a85c5706 100644
--- a/model.h
+++ b/model.h
@@ -29,6 +29,7 @@ enum SDVersion {
     VERSION_SD2_INPAINT,
     VERSION_SD2_TINY_UNET,
     VERSION_SDXS,
+    VERSION_SDXS_09,
     VERSION_SDXL,
     VERSION_SDXL_INPAINT,
     VERSION_SDXL_PIX2PIX,
@@ -60,7 +61,7 @@ static inline bool sd_version_is_sd1(SDVersion version) {
 }
 
 static inline bool sd_version_is_sd2(SDVersion version) {
-    if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
+    if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) {
         return true;
     }
     return false;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index f5c82b215..00e533569 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -31,7 +31,8 @@ const char* model_version_to_str[] = {
     "SD 2.x",
     "SD 2.x Inpaint",
     "SD 2.x Tiny UNet",
-    "SDXS",
+    "SDXS (DS)",
+    "SDXS (0.9)",
     "SDXL",
     "SDXL Inpaint",
     "SDXL Instruct-Pix2Pix",
@@ -413,7 +414,7 @@ class StableDiffusionGGML {
         }
 
         bool tae_preview_only = sd_ctx_params->tae_preview_only;
-        if (version == VERSION_SDXS) {
+        if (version == VERSION_SDXS || version == VERSION_SDXS_09) {
             tae_preview_only = false;
         }
@@ -593,7 +594,7 @@ class StableDiffusionGGML {
             vae_backend = backend;
         }
 
-        if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
+        if (!(use_tiny_autoencoder || version == VERSION_SDXS || version == VERSION_SDXS_09) || tae_preview_only) {
             if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
                 first_stage_model = std::make_shared(vae_backend,
                                                      offload_params_to_cpu,
@@ -631,7 +632,7 @@ class StableDiffusionGGML {
                 first_stage_model->get_param_tensors(tensors, "first_stage_model");
             }
         }
-        if (use_tiny_autoencoder || version == VERSION_SDXS) {
+        if (use_tiny_autoencoder || version == VERSION_SDXS || version == VERSION_SDXS_09) {
             if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
                 tae_first_stage = std::make_shared(vae_backend,
                                                    offload_params_to_cpu,
@@ -646,7 +647,7 @@
                                                    "decoder.layers",
                                                    vae_decode_only,
                                                    version);
-            if (version == VERSION_SDXS) {
+            if (version == VERSION_SDXS || version == VERSION_SDXS_09) {
                 tae_first_stage->alloc_params_buffer();
                 tae_first_stage->get_param_tensors(tensors, "first_stage_model");
             }
@@ -809,10 +810,10 @@ class StableDiffusionGGML {
             unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
         }
         size_t vae_params_mem_size = 0;
-        if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
+        if (!(use_tiny_autoencoder || version == VERSION_SDXS || version == VERSION_SDXS_09) || tae_preview_only) {
             vae_params_mem_size = first_stage_model->get_params_buffer_size();
         }
-        if (use_tiny_autoencoder || version == VERSION_SDXS) {
+        if (use_tiny_autoencoder || version == VERSION_SDXS || version == VERSION_SDXS_09) {
             if (use_tiny_autoencoder && !tae_first_stage->load_from_file(taesd_path, n_threads)) {
                 return false;
             }
diff --git a/unet.hpp b/unet.hpp
index 2dd79e0e1..6f361b077 100644
--- a/unet.hpp
+++ b/unet.hpp
@@ -218,7 +218,7 @@ class UnetModelBlock : public GGMLBlock {
         } else if (sd_version_is_unet_edit(version)) {
             in_channels = 8;
         }
-        if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
+        if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS || version == VERSION_SDXS_09) {
             num_res_blocks = 1;
             channel_mult   = {1, 2, 4};
             tiny_unet      = true;
@@ -265,6 +265,10 @@ class UnetModelBlock : public GGMLBlock {
         if (version == VERSION_SVD) {
             return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
         } else {
+            if (version == VERSION_SDXS_09 && n_head == 5) {
+                n_head = 1; // to carry a special case of sdxs_09 into CrossAttention,
+                d_head = 320; // works as long as the product remains equal (5*64 == 1*320)
+            }
             return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
         }
     };

From 22c511fbc792fcbe79ade3cc6a9b5d11efcb430b Mon Sep 17 00:00:00 2001
From: akleine
Date: Fri, 6 Feb 2026 10:00:40 +0100
Subject: [PATCH 2/3] docs: update for using sdxs-09

---
 docs/distilled_sd.md | 34 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/docs/distilled_sd.md b/docs/distilled_sd.md
index 3174b18f8..7a9112c34 100644
--- a/docs/distilled_sd.md
+++ b/docs/distilled_sd.md
@@ -109,9 +109,11 @@
 torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
 ```
 
-### SDXS-512
+### SDXS-512-DreamShaper
 
-Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part.
+Another very tiny and **incredibly fast** model is SDXS-512-DreamShaper by IDKiro et al. The authors describe it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of the U-Net part, and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE part.
+
+#### Create your own safetensors file:
 
 ##### 1. Download the diffusers model from Hugging Face using Python:
 
@@ ... @@
 python convert_diffusers_to_original_stable_diffusion.py \
 --model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
 ```
 
+##### Alternatively, you can download the model here:
+
+ * https://huggingface.co/akleine/sdxs-512/resolve/main/sdxs.safetensors
+ * https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF/resolve/main/sdxs-512-tinySDdistilled_Q8_0.gguf
+
+
 ##### 3. Run the model as follows:
 ```bash
 ~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
 --cfg-scale 1 --steps 1
 ```
+Both options, ``` --cfg-scale 1 ``` and ``` --steps 1 ```, are mandatory here.
+
+
+### SDXS-512-0.9
+
+Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", the two models are *completely different*. Sometimes its output is even preferred, so try it yourself. You can create a safetensors file as described
+in the previous section, using ``` ... from_pretrained("IDKiro/sdxs-512-0.9")``` (see the sketch below), or simply download a ready-to-run file from here:
+
+ * https://huggingface.co/akleine/sdxs-09/resolve/main/sdxs09.safetensors
-Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.
+For this model, too, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory.
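+
+If you prefer to create the file yourself, here is a minimal sketch of the download step, assuming the same `diffusers` workflow as in step 1 above (only the repo id changes; the output directory name `sdxs09` is arbitrary):
+
+```python
+from diffusers import StableDiffusionPipeline
+
+# Fetch the diffusers-format model and store it locally; afterwards run the
+# same conversion script as above, this time with --model_path sdxs09.
+pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-0.9")
+pipe.save_pretrained("sdxs09")
+```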

From b3c84a907c59998978a3d7a424e646ae36bf982f Mon Sep 17 00:00:00 2001
From: akleine
Date: Fri, 6 Feb 2026 11:12:38 +0100
Subject: [PATCH 3/3] chore: format code

---
 common.hpp | 10 +++++-----
 model.cpp  |  4 ++--
 unet.hpp   |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/common.hpp b/common.hpp
index 0f0f1bef0..64514fece 100644
--- a/common.hpp
+++ b/common.hpp
@@ -277,7 +277,7 @@ class CrossAttention : public GGMLBlock {
     int64_t context_dim;
     int64_t n_head;
     int64_t d_head;
-    bool xtra_dim = false;
+    bool xtra_dim    = false;
 
 public:
     CrossAttention(int64_t query_dim,
@@ -292,7 +292,7 @@ context_dim(context_dim) {
         int64_t inner_dim = d_head * n_head;
 
         if (context_dim == 320 && d_head == 320) {
             // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
-            xtra_dim = true;
+            xtra_dim    = true;
             context_dim = 1024;
         }
@@ -319,18 +319,18 @@
         int64_t n_context = context->ne[1];
         int64_t inner_dim = d_head * n_head;
 
-        auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
+        auto q = to_q->forward(ctx, x);       // [N, n_token, inner_dim]
 
         if (xtra_dim) {
             // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
-            context->ne[0] = 1024; // patch dim
+            context->ne[0] = 1024;  // patch dim
         }
 
         auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
         auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
 
         if (xtra_dim) {
-            context->ne[0] = 320; // reset dim to orig
+            context->ne[0] = 320;  // reset dim to orig
         }
 
         x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
diff --git a/model.cpp b/model.cpp
index 8c5f1580f..36365afcf 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1113,8 +1113,8 @@ SDVersion ModelLoader::get_sd_version() {
             has_output_block_71 = true;
         }
         if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) {
-        if (tensor_storage.ne[0] == 1024)
-            has_attn_1024 = true;
+            if (tensor_storage.ne[0] == 1024)
+                has_attn_1024 = true;
         }
         if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
             tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
diff --git a/unet.hpp b/unet.hpp
index 6f361b077..7f553828c 100644
--- a/unet.hpp
+++ b/unet.hpp
@@ -266,8 +266,8 @@ class UnetModelBlock : public GGMLBlock {
             return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
         } else {
             if (version == VERSION_SDXS_09 && n_head == 5) {
-                n_head = 1; // to carry a special case of sdxs_09 into CrossAttention,
-                d_head = 320; // works as long as the product remains equal (5*64 == 1*320)
+                n_head = 1;   // to carry a special case of sdxs_09 into CrossAttention,
+                d_head = 320; // works as long as the product remains equal (5*64 == 1*320)
             }
             return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
         }
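
A side note on the detection logic above: outside of sd.cpp, you can check which SDXS variant a checkpoint file is with a few lines of Python. This is only an illustrative sketch, not part of the patches; the helper name and the example file name are made up. It mirrors the `has_attn_1024` shape check from model.cpp (ggml's `ne[0]` is the innermost dimension, i.e. the last dimension of the PyTorch shape):

```python
from safetensors import safe_open

KEY = "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight"

def is_sdxs_09(path: str) -> bool:
    # Mirror the has_attn_1024 check: SDXS-512-0.9 has a to_k weight whose
    # innermost dimension (ggml ne[0], last dim of the PyTorch shape) is 1024.
    with safe_open(path, framework="pt") as f:
        if KEY not in f.keys():
            return False
        return f.get_slice(KEY).get_shape()[-1] == 1024

print(is_sdxs_09("sdxs09.safetensors"))
```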