18 changes: 17 additions & 1 deletion common.hpp
@@ -277,6 +277,7 @@ class CrossAttention : public GGMLBlock {
int64_t context_dim;
int64_t n_head;
int64_t d_head;
bool xtra_dim = false;

public:
CrossAttention(int64_t query_dim,
@@ -289,6 +290,11 @@ class CrossAttention : public GGMLBlock {
context_dim(context_dim) {
int64_t inner_dim = d_head * n_head;

if (context_dim == 320 && d_head == 320) {
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
xtra_dim = true;
context_dim = 1024;
}
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
@@ -313,10 +319,20 @@ class CrossAttention : public GGMLBlock {
int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head;

auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]

if (xtra_dim) {
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
context->ne[0] = 1024; // patch dim
}

auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]

if (xtra_dim) {
context->ne[0] = 320; // reset dim to orig
}

x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]

x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
23 changes: 20 additions & 3 deletions docs/distilled_sd.md
@@ -109,9 +109,11 @@ torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```


### SDXS-512
### SDXS-512-DreamShaper

Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part.
Another very tiny and **incredibly fast** model is SDXS-512-DreamShaper by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of the U-Net, and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE part.
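
To get a feel for what "one-step" means before converting anything for stable-diffusion.cpp, here is a minimal sketch on the diffusers side. It assumes the `diffusers` package and that the Hugging Face repo id is `IDKiro/sdxs-512-dreamshaper`, loadable as a plain `StableDiffusionPipeline` (neither detail is stated in this document); the single step with guidance disabled mirrors the mandatory ``` --cfg-scale 1 --steps 1 ``` flags used below.

```python
# Sketch only: one-step generation with SDXS-512-DreamShaper via diffusers.
# Assumptions: `diffusers` and `torch` are installed, the repo id is
# IDKiro/sdxs-512-dreamshaper, and it loads as a standard StableDiffusionPipeline.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "IDKiro/sdxs-512-dreamshaper", torch_dtype=torch.float16
).to("cuda")

image = pipe(
    "portrait of a lovely cat",
    num_inference_steps=1,  # a single denoising step
    guidance_scale=1.0,     # disables classifier-free guidance, like --cfg-scale 1
).images[0]
image.save("cat.png")
```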

#### Create your own safetensors file:

##### 1. Download the diffusers model from Hugging Face using Python:

@@ -127,11 +129,26 @@ python convert_diffusers_to_original_stable_diffusion.py \
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
```

##### Alternatively, you can download the model here:

* https://huggingface.co/akleine/sdxs-512/resolve/main/sdxs.safetensors
* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF/resolve/main/sdxs-512-tinySDdistilled_Q8_0.gguf
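
If you prefer to script the download rather than fetch the files by hand, a minimal sketch using the `huggingface_hub` package (an assumption; this document does not otherwise use it) could look like this, with the repo ids and filenames taken from the links above:

```python
# Sketch only: fetch the ready-made files from Hugging Face.
# Assumes `huggingface_hub` is installed (pip install huggingface_hub).
from huggingface_hub import hf_hub_download

safetensors_path = hf_hub_download(repo_id="akleine/sdxs-512",
                                   filename="sdxs.safetensors")
gguf_path = hf_hub_download(repo_id="concedo/sdxs-512-tinySDdistilled-GGUF",
                            filename="sdxs-512-tinySDdistilled_Q8_0.gguf")
print(safetensors_path, gguf_path)  # pass either path to sd-cli via -m
```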


##### 3. Run the model as follows:

```bash
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
--cfg-scale 1 --steps 1
```
Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.


### SDXS-512-0.9

Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is a *completely different* model. It is sometimes preferred, so try it for yourself. You can create a safetensors file as described in the previous section, using ``` ... from_pretrained("IDKiro/sdxs-512-0.9")```, or simply download a ready-to-run file from here:

* https://huggingface.co/akleine/sdxs-09/resolve/main/sdxs09.safetensors
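
If you would rather build the safetensors file yourself, a minimal sketch of that route (assuming the `diffusers` package and that the repo loads as a standard `StableDiffusionPipeline`; the exact loader used in step 1 above may differ) is:

```python
# Sketch only: download SDXS-512-0.9 in diffusers format and save it locally,
# then convert it with the same script as in the previous section.
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-0.9")
pipe.save_pretrained("sdxs09")

# Afterwards, on the command line (hypothetical output names):
#   python convert_diffusers_to_original_stable_diffusion.py \
#       --model_path sdxs09 --checkpoint_path sdxs09.safetensors --half --use_safetensors
```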

Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.
For this model, too, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory.
7 changes: 6 additions & 1 deletion model.cpp
@@ -1046,6 +1046,7 @@ SDVersion ModelLoader::get_sd_version() {
bool has_middle_block_1 = false;
bool has_output_block_311 = false;
bool has_output_block_71 = false;
bool has_attn_1024 = false;

for (auto& [name, tensor_storage] : tensor_storage_map) {
if (!(is_xl)) {
@@ -1111,6 +1112,10 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos) {
has_output_block_71 = true;
}
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) {
if (tensor_storage.ne[0] == 1024)
has_attn_1024 = true;
}
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
@@ -1193,7 +1198,7 @@ SDVersion ModelLoader::get_sd_version() {
return VERSION_SD2_INPAINT;
}
if (!has_middle_block_1) {
return VERSION_SD2_TINY_UNET;
return has_attn_1024 ? VERSION_SDXS_09 : VERSION_SD2_TINY_UNET;
}
return VERSION_SD2;
}
3 changes: 2 additions & 1 deletion model.h
@@ -29,6 +29,7 @@ enum SDVersion {
VERSION_SD2_INPAINT,
VERSION_SD2_TINY_UNET,
VERSION_SDXS,
VERSION_SDXS_09,
VERSION_SDXL,
VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX,
@@ -60,7 +61,7 @@ static inline bool sd_version_is_sd1(SDVersion version) {
}

static inline bool sd_version_is_sd2(SDVersion version) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) {
return true;
}
return false;
15 changes: 8 additions & 7 deletions stable-diffusion.cpp
@@ -31,7 +31,8 @@ const char* model_version_to_str[] = {
"SD 2.x",
"SD 2.x Inpaint",
"SD 2.x Tiny UNet",
"SDXS",
"SDXS (DS)",
"SDXS (0.9)",
"SDXL",
"SDXL Inpaint",
"SDXL Instruct-Pix2Pix",
@@ -413,7 +414,7 @@ class StableDiffusionGGML {
}

bool tae_preview_only = sd_ctx_params->tae_preview_only;
if (version == VERSION_SDXS) {
if (version == VERSION_SDXS || version == VERSION_SDXS_09) {
tae_preview_only = false;
}

@@ -593,7 +594,7 @@ class StableDiffusionGGML {
vae_backend = backend;
}

if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
if (!(use_tiny_autoencoder || version == VERSION_SDXS || version == VERSION_SDXS_09) || tae_preview_only) {
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
offload_params_to_cpu,
@@ -631,7 +632,7 @@ class StableDiffusionGGML {
first_stage_model->get_param_tensors(tensors, "first_stage_model");
}
}
if (use_tiny_autoencoder || version == VERSION_SDXS) {
if (use_tiny_autoencoder || version == VERSION_SDXS || version == VERSION_SDXS_09) {
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
tae_first_stage = std::make_shared<TinyVideoAutoEncoder>(vae_backend,
offload_params_to_cpu,
@@ -646,7 +647,7 @@ class StableDiffusionGGML {
"decoder.layers",
vae_decode_only,
version);
if (version == VERSION_SDXS) {
if (version == VERSION_SDXS || version == VERSION_SDXS_09) {
tae_first_stage->alloc_params_buffer();
tae_first_stage->get_param_tensors(tensors, "first_stage_model");
}
@@ -809,10 +810,10 @@ class StableDiffusionGGML {
unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
}
size_t vae_params_mem_size = 0;
if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
if (!(use_tiny_autoencoder || version == VERSION_SDXS || version == VERSION_SDXS_09) || tae_preview_only) {
vae_params_mem_size = first_stage_model->get_params_buffer_size();
}
if (use_tiny_autoencoder || version == VERSION_SDXS) {
if (use_tiny_autoencoder || version == VERSION_SDXS || version == VERSION_SDXS_09) {
if (use_tiny_autoencoder && !tae_first_stage->load_from_file(taesd_path, n_threads)) {
return false;
}
6 changes: 5 additions & 1 deletion unet.hpp
@@ -218,7 +218,7 @@ class UnetModelBlock : public GGMLBlock {
} else if (sd_version_is_unet_edit(version)) {
in_channels = 8;
}
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS || version == VERSION_SDXS_09) {
num_res_blocks = 1;
channel_mult = {1, 2, 4};
tiny_unet = true;
@@ -265,6 +265,10 @@ class UnetModelBlock : public GGMLBlock {
if (version == VERSION_SVD) {
return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
} else {
if (version == VERSION_SDXS_09 && n_head == 5) {
n_head = 1; // carry the special case of sdxs_09 into CrossAttention;
d_head = 320; // works as long as the product stays the same (5*64 == 1*320)
}
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
}
};