From b4db4be5f1a19f0691bb0a1160fa841adb61f771 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 29 Jan 2026 16:34:29 +0100 Subject: [PATCH 01/13] LoRA: Optimise LoKr at runtime --- ggml_extend.hpp | 83 +++++++++++++++++++++++++++++++++- lora.hpp | 116 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 195 insertions(+), 4 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 193a2c392..dd1309955 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1577,7 +1577,7 @@ struct WeightAdapter { bool force_prec_f32 = false; float scale = 1.f; } linear; - struct { + struct conv2d_params_t{ int s0 = 1; int s1 = 1; int p0 = 0; @@ -2630,4 +2630,85 @@ class MultiheadAttention : public GGMLBlock { } }; +__STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( + struct ggml_context* ctx, + struct ggml_tensor* h, // Input: [q, batch] or [W, H, q, batch] + struct ggml_tensor* w1, // Outer C (Full rank) + struct ggml_tensor* w1a, // Outer A (Low rank part 1) + struct ggml_tensor* w1b, // Outer B (Low rank part 2) + struct ggml_tensor* w2, // Inner BA (Full rank) + struct ggml_tensor* w2a, // Inner A (Low rank part 1) + struct ggml_tensor* w2b, // Inner B (Low rank part 2) + bool is_conv, + WeightAdapter::ForwardParams::conv2d_params_t conv_params, + float scale) { + + GGML_ASSERT((w1 != NULL || (w1a != NULL && w1b != NULL))); + GGML_ASSERT((w2 != NULL || (w2a != NULL && w2b != NULL))); + + int vq = (w2 != NULL) ? w2->ne[0] : w2a->ne[0]; + int vp = (w2 != NULL) ? w2->ne[1] : (is_conv ? w2b->ne[3] : w2b->ne[1]); + + int uq = (w1 != NULL) ? w1->ne[0] : w1a->ne[0]; + int up = (w1 != NULL) ? w1->ne[1] : w1b->ne[1]; + + int q_expected = uq * vq; + int q_actual = is_conv ? h->ne[2] : h->ne[0]; + GGML_ASSERT(q_actual == q_expected && "Input dimension mismatch for LoKR split"); + + struct ggml_tensor* hb; + + if (!is_conv) { + // Treat input as a grid: [vq, uq * batch] + struct ggml_tensor* h_mat = ggml_reshape_2d(ctx, h, vq, uq * h->ne[1]); + + if (w2 != NULL) { + hb = ggml_mul_mat(ctx, w2, h_mat); + } else { + hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_mat)); + } + } else { + // Reshape so uq is in the batch dimension: [W, H, vq, uq * batch] + struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * h->ne[3]); + + if (w2 != NULL) { + hb = ggml_ext_conv_2d(ctx, w2, h_grouped,nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); + } else { + // w2a is [1, 1, vq, rank], w2b is [kw, kh, rank, vp] + struct ggml_tensor* tmp = ggml_conv_2d(ctx, w2a, h_grouped, 1, 1, 0, 0, 1, 1); + hb = ggml_ext_conv_2d(ctx, w2b, tmp, nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); + } + } + + // At this point hb is [W_out, H_out, vp, uq * batch] + // We reshape to isolate uq for matrix multiplication + int w_out = is_conv ? hb->ne[0] : 1; + int h_out = is_conv ? hb->ne[1] : 1; + int batch = is_conv ? 
h->ne[3] : h->ne[1]; + + // Rearrange to [vp, uq, spatial*batch] + struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, w_out * h_out * batch); + + // Transpose so uq is ne[0] for ggml_mul_mat + struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_unbundled); + + struct ggml_tensor* hc; + if (w1 != NULL) { + hc = ggml_mul_mat(ctx, w1, hb_t); + } else { + hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); + } + + struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); + struct ggml_tensor* out; + if (is_conv) { + out = ggml_reshape_4d(ctx, hc_t, w_out, h_out, up * vp, batch); + } else { + + out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc_t), up * vp, batch); + } + + return ggml_scale(ctx, out, scale); +} + #endif // __GGML_EXTEND__HPP__ diff --git a/lora.hpp b/lora.hpp index e5d9906ff..fd461086a 100644 --- a/lora.hpp +++ b/lora.hpp @@ -483,7 +483,7 @@ struct LoraModel : public GGMLRunner { diff = get_loha_weight_diff(model_tensor_name, ctx); } // lokr - if (diff == nullptr) { + if (diff == nullptr && with_lora) { diff = get_lokr_weight_diff(model_tensor_name, ctx); } if (diff != nullptr) { @@ -501,6 +501,8 @@ struct LoraModel : public GGMLRunner { return diff; } + + ggml_tensor* get_out_diff(ggml_context* ctx, ggml_tensor* x, WeightAdapter::ForwardParams forward_params, @@ -514,6 +516,115 @@ struct LoraModel : public GGMLRunner { } else { key = model_tensor_name + "." + std::to_string(index); } + bool is_conv2d = forward_params.op_type == WeightAdapter::ForwardParams::op_type_t::OP_CONV2D; + + + std::string lokr_w1_name = "lora." + key + ".lokr_w1"; + std::string lokr_w1_a_name = "lora." + key + ".lokr_w1_a"; + // if either of these is found, then we have a lokr lora + auto iter = lora_tensors.find(lokr_w1_name); + auto iter_a = lora_tensors.find(lokr_w1_a_name); + if (iter != lora_tensors.end() || iter_a != lora_tensors.end()) { + std::string lokr_w1_b_name = "lora." + key + ".lokr_w1_b"; + std::string lokr_w2_name = "lora." + key + ".lokr_w2"; + std::string lokr_w2_a_name = "lora." + key + ".lokr_w2_a"; + std::string lokr_w2_b_name = "lora." + key + ".lokr_w2_b"; + std::string alpha_name = "lora." 
+ key + ".alpha"; + + ggml_tensor* lokr_w1 = nullptr; + ggml_tensor* lokr_w1_a = nullptr; + ggml_tensor* lokr_w1_b = nullptr; + ggml_tensor* lokr_w2 = nullptr; + ggml_tensor* lokr_w2_a = nullptr; + ggml_tensor* lokr_w2_b = nullptr; + + if (iter != lora_tensors.end()) { + lokr_w1 = iter->second; + if (is_conv2d && lokr_w1->type != GGML_TYPE_F16) { + lokr_w1 = ggml_cast(ctx, lokr_w1, GGML_TYPE_F16); + } + } + iter = iter_a; + if (iter != lora_tensors.end()) { + lokr_w1_a = iter->second; + if (is_conv2d && lokr_w1_a->type != GGML_TYPE_F16) { + lokr_w1_a = ggml_cast(ctx, lokr_w1_a, GGML_TYPE_F16); + } + } + iter = lora_tensors.find(lokr_w1_b_name); + if (iter != lora_tensors.end()) { + lokr_w1_b = iter->second; + if (is_conv2d && lokr_w1_b->type != GGML_TYPE_F16) { + lokr_w1_b = ggml_cast(ctx, lokr_w1_b, GGML_TYPE_F16); + } + } + + iter = lora_tensors.find(lokr_w2_name); + if (iter != lora_tensors.end()) { + lokr_w2 = iter->second; + if (is_conv2d && lokr_w2->type != GGML_TYPE_F16) { + lokr_w2 = ggml_cast(ctx, lokr_w2, GGML_TYPE_F16); + } + } + iter = lora_tensors.find(lokr_w2_a_name); + if (iter != lora_tensors.end()) { + lokr_w2_a = iter->second; + if (is_conv2d && lokr_w2_a->type != GGML_TYPE_F16) { + lokr_w2_a = ggml_cast(ctx, lokr_w2_a, GGML_TYPE_F16); + } + } + iter = lora_tensors.find(lokr_w2_b_name); + if (iter != lora_tensors.end()) { + lokr_w2_b = iter->second; + if (is_conv2d && lokr_w2_b->type != GGML_TYPE_F16) { + lokr_w2_b = ggml_cast(ctx, lokr_w2_b, GGML_TYPE_F16); + } + } + + int rank = 1; + if (lokr_w1_b) { + rank = lokr_w1_b->ne[ggml_n_dims(lokr_w1_b) - 1]; + } + if (lokr_w2_b) { + rank = lokr_w2_b->ne[ggml_n_dims(lokr_w2_b) - 1]; + } + + float scale_value = 1.0f; + iter = lora_tensors.find(alpha_name); + if (iter != lora_tensors.end()) { + float alpha = ggml_ext_backend_tensor_get_f32(iter->second); + scale_value = alpha / rank; + applied_lora_tensors.insert(alpha_name); + } + + if (rank == 1) { + scale_value = 1.0f; + } + scale_value *= multiplier; + + auto curr_out_diff = ggml_ext_lokr_forward(ctx, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value); + if (out_diff == nullptr) { + out_diff = curr_out_diff; + } else { + out_diff = ggml_concat(ctx, out_diff, curr_out_diff, 0); + } + + if(lokr_w1) applied_lora_tensors.insert(lokr_w1_name); + if(lokr_w1_a) applied_lora_tensors.insert(lokr_w1_a_name); + if(lokr_w1_b) applied_lora_tensors.insert(lokr_w1_b_name); + if(lokr_w2) applied_lora_tensors.insert(lokr_w2_name); + if(lokr_w2_a) applied_lora_tensors.insert(lokr_w2_name); + if(lokr_w2_b) applied_lora_tensors.insert(lokr_w2_b_name); + applied_lora_tensors.insert(alpha_name); + + + index++; + continue; + } + + // not a lork, normal lora path + + std::string lora_down_name = "lora." + key + ".lora_down"; std::string lora_up_name = "lora." 
+ key + ".lora_up"; @@ -525,9 +636,8 @@ struct LoraModel : public GGMLRunner { ggml_tensor* lora_mid = nullptr; ggml_tensor* lora_down = nullptr; - bool is_conv2d = forward_params.op_type == WeightAdapter::ForwardParams::op_type_t::OP_CONV2D; - auto iter = lora_tensors.find(lora_up_name); + iter = lora_tensors.find(lora_up_name); if (iter != lora_tensors.end()) { lora_up = iter->second; if (is_conv2d && lora_up->type != GGML_TYPE_F16) { From d608b37ad50a0aa32676adc150c506c7136c022e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 30 Jan 2026 21:06:12 +0100 Subject: [PATCH 02/13] lokr: fix convs --- ggml_extend.hpp | 96 +++++++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index dd1309955..f75b8464b 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2646,69 +2646,87 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( GGML_ASSERT((w1 != NULL || (w1a != NULL && w1b != NULL))); GGML_ASSERT((w2 != NULL || (w2a != NULL && w2b != NULL))); - int vq = (w2 != NULL) ? w2->ne[0] : w2a->ne[0]; - int vp = (w2 != NULL) ? w2->ne[1] : (is_conv ? w2b->ne[3] : w2b->ne[1]); + int uq = (w1 != NULL) ? (int)w1->ne[0] : (int)w1a->ne[0]; + int up = (w1 != NULL) ? (int)w1->ne[1] : (int)w1b->ne[1]; - int uq = (w1 != NULL) ? w1->ne[0] : w1a->ne[0]; - int up = (w1 != NULL) ? w1->ne[1] : w1b->ne[1]; + int q_actual = is_conv ? (int)h->ne[2] : (int)h->ne[0]; + int vq = q_actual / uq; - int q_expected = uq * vq; - int q_actual = is_conv ? h->ne[2] : h->ne[0]; - GGML_ASSERT(q_actual == q_expected && "Input dimension mismatch for LoKR split"); + int vp = (w2 != NULL) ? (is_conv ? (int)w2->ne[3] : (int)w2->ne[1]) + : (int)w2a->ne[1]; + GGML_ASSERT(q_actual == (uq * vq) && "Input dimension mismatch for LoKR split"); struct ggml_tensor* hb; if (!is_conv) { - // Treat input as a grid: [vq, uq * batch] - struct ggml_tensor* h_mat = ggml_reshape_2d(ctx, h, vq, uq * h->ne[1]); + int batch = (int)h->ne[1]; + struct ggml_tensor* h_mat = ggml_reshape_2d(ctx, h, vq, uq * batch); if (w2 != NULL) { hb = ggml_mul_mat(ctx, w2, h_mat); } else { hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_mat)); } + + struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, batch); + struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_unbundled); + + struct ggml_tensor* hc; + if (w1 != NULL) { + hc = ggml_mul_mat(ctx, w1, hb_t); + } else { + hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); + } + + struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); + struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc_t), up * vp, batch); + return ggml_scale(ctx, out, scale); + } else { - // Reshape so uq is in the batch dimension: [W, H, vq, uq * batch] - struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * h->ne[3]); + int batch = (int)h->ne[3]; + + // Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] + struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); if (w2 != NULL) { - hb = ggml_ext_conv_2d(ctx, w2, h_grouped,nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); + hb = ggml_conv_2d(ctx, w2, h_grouped, conv_params.s0, conv_params.s1, + conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); } else { - // w2a is [1, 1, vq, rank], w2b is [kw, kh, rank, vp] - 
struct ggml_tensor* tmp = ggml_conv_2d(ctx, w2a, h_grouped, 1, 1, 0, 0, 1, 1); - hb = ggml_ext_conv_2d(ctx, w2b, tmp, nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); + // Low-rank decomposition: w2b is the spatial kernel, w2a is the 1x1 projection + // Inner LoRA: w2b is the spatial/down-project, w2a is the 1x1 up-project + int rank = (int)w2b->ne[1]; + int k = (int)sqrt(w2b->ne[0] / vq); + + struct ggml_tensor* w2b_4d = (ggml_n_dims(w2b) < 3) ? ggml_reshape_4d(ctx, w2b, k, k, vq, rank) : w2b; + struct ggml_tensor* w2a_4d = (ggml_n_dims(w2a) < 3) ? ggml_reshape_4d(ctx, w2a, 1, 1, rank, vp) : w2a; + + struct ggml_tensor* ha = ggml_conv_2d(ctx, w2b_4d, h_grouped, conv_params.s0, conv_params.s1, + conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); + hb = ggml_conv_2d(ctx, w2a_4d, ha, 1, 1, 0, 0, 1, 1); } - } - // At this point hb is [W_out, H_out, vp, uq * batch] - // We reshape to isolate uq for matrix multiplication - int w_out = is_conv ? hb->ne[0] : 1; - int h_out = is_conv ? hb->ne[1] : 1; - int batch = is_conv ? h->ne[3] : h->ne[1]; + int w_out = (int)hb->ne[0]; + int h_out = (int)hb->ne[1]; - // Rearrange to [vp, uq, spatial*batch] - struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, w_out * h_out * batch); + struct ggml_tensor* hb_flat = ggml_reshape_3d(ctx, hb, w_out * h_out * vp, uq, batch); + struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_flat); - // Transpose so uq is ne[0] for ggml_mul_mat - struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_unbundled); + struct ggml_tensor* hc; + struct ggml_tensor* w1_mat = (w1 != NULL) ? ggml_reshape_2d(ctx, w1, uq, up) : NULL; - struct ggml_tensor* hc; - if (w1 != NULL) { - hc = ggml_mul_mat(ctx, w1, hb_t); - } else { - hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); - } + if (w1_mat != NULL) { + hc = ggml_mul_mat(ctx, w1_mat, hb_t); + } else { + hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); + } - struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); - struct ggml_tensor* out; - if (is_conv) { - out = ggml_reshape_4d(ctx, hc_t, w_out, h_out, up * vp, batch); - } else { + struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); + struct ggml_tensor* hc_res = ggml_reshape_4d(ctx, ggml_cont(ctx, hc_t), vp, w_out * h_out, up, batch); + struct ggml_tensor* hc_perm = ggml_permute(ctx, hc_res, 1, 2, 0, 3); + struct ggml_tensor* out = ggml_reshape_4d(ctx, ggml_cont(ctx, hc_perm), w_out, h_out, up * vp, batch); - out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc_t), up * vp, batch); + return ggml_scale(ctx, out, scale); } - - return ggml_scale(ctx, out, scale); } #endif // __GGML_EXTEND__HPP__ From b48609762791aab61cf48bde182a700bf45d8fe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 1 Feb 2026 16:25:34 +0100 Subject: [PATCH 03/13] lokr: fix lienar forward for CUDA/HIP and CPU backends --- ggml_extend.hpp | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index f75b8464b..8e1a5c42b 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2669,7 +2669,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, batch); - struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_unbundled); + struct ggml_tensor* hb_t = ggml_cont(ctx,ggml_transpose(ctx, hb_unbundled)); 
struct ggml_tensor* hc; if (w1 != NULL) { @@ -2683,47 +2683,66 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( return ggml_scale(ctx, out, scale); } else { + // very slow implementation for now (can this be optimized?) int batch = (int)h->ne[3]; - // Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] + // 1. Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] + // This is free (metadata only) struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); + struct ggml_tensor* hb; if (w2 != NULL) { hb = ggml_conv_2d(ctx, w2, h_grouped, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); } else { - // Low-rank decomposition: w2b is the spatial kernel, w2a is the 1x1 projection - // Inner LoRA: w2b is the spatial/down-project, w2a is the 1x1 up-project int rank = (int)w2b->ne[1]; int k = (int)sqrt(w2b->ne[0] / vq); - - struct ggml_tensor* w2b_4d = (ggml_n_dims(w2b) < 3) ? ggml_reshape_4d(ctx, w2b, k, k, vq, rank) : w2b; - struct ggml_tensor* w2a_4d = (ggml_n_dims(w2a) < 3) ? ggml_reshape_4d(ctx, w2a, 1, 1, rank, vp) : w2a; + struct ggml_tensor* w2b_4d = (ggml_n_dims(w2b) < 4) ? ggml_reshape_4d(ctx, w2b, k, k, vq, rank) : w2b; + struct ggml_tensor* w2a_4d = (ggml_n_dims(w2a) < 4) ? ggml_reshape_4d(ctx, w2a, 1, 1, rank, vp) : w2a; struct ggml_tensor* ha = ggml_conv_2d(ctx, w2b_4d, h_grouped, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); - hb = ggml_conv_2d(ctx, w2a_4d, ha, 1, 1, 0, 0, 1, 1); + hb = ggml_conv_2d(ctx, w2a_4d, ha, 1, 1, 0, 0, 1, 1); } + // Current hb shape: [W_out, H_out, vp, uq * batch] int w_out = (int)hb->ne[0]; int h_out = (int)hb->ne[1]; + // 2. Prepare for Matrix Multiplication + // Collapse spatial and 'vp' into one dimension to treat as 'M' in MatMul + // Shape: [W*H*vp, uq, batch] struct ggml_tensor* hb_flat = ggml_reshape_3d(ctx, hb, w_out * h_out * vp, uq, batch); + // Transpose to [uq, W*H*vp, batch] so that uq is ne[0] (the shared K dimension) struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_flat); struct ggml_tensor* hc; - struct ggml_tensor* w1_mat = (w1 != NULL) ? ggml_reshape_2d(ctx, w1, uq, up) : NULL; - - if (w1_mat != NULL) { + if (w1 != NULL) { + struct ggml_tensor* w1_mat = ggml_reshape_2d(ctx, w1, uq, up); hc = ggml_mul_mat(ctx, w1_mat, hb_t); } else { + // Low-rank: (up x rank) * (rank x uq) * (uq x Spatial) hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); } - struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); - struct ggml_tensor* hc_res = ggml_reshape_4d(ctx, ggml_cont(ctx, hc_t), vp, w_out * h_out, up, batch); - struct ggml_tensor* hc_perm = ggml_permute(ctx, hc_res, 1, 2, 0, 3); - struct ggml_tensor* out = ggml_reshape_4d(ctx, ggml_cont(ctx, hc_perm), w_out, h_out, up * vp, batch); + // 3. 
Final Layout Transformation + // Current hc shape: [up, W*H*vp, batch] + // Logical dims in ne[1]: [W*H, vp] + // We want final shape: [W, H, up*vp, batch] + + // Split ne[1] back into spatial and vp + struct ggml_tensor* hc_split = ggml_reshape_4d(ctx, hc, up, w_out * h_out, vp, batch); + + // Permute to bring up and vp together: [spatial, up, vp, batch] + // This moves spatial to ne[0], which is necessary for the final W,H,C layout + struct ggml_tensor* hc_perm = ggml_permute(ctx, hc_split, 1, 0, 2, 3); + + // Resolve layout and scale in one go (if possible) or just cont + // This is the only mandatory copy + struct ggml_tensor* out_cont = ggml_cont(ctx, hc_perm); + + // Final reshape to merge up and vp into the channel dimension + struct ggml_tensor* out = ggml_reshape_4d(ctx, out_cont, w_out, h_out, up * vp, batch); return ggml_scale(ctx, out, scale); } From 85538628ea12909f707d52fb808e546d11d8a3cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 1 Feb 2026 17:52:59 +0100 Subject: [PATCH 04/13] lokr: disable "optimization" for convolutions --- ggml_extend.hpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 8e1a5c42b..90e250e08 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2683,6 +2683,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( return ggml_scale(ctx, out, scale); } else { +#if 0 // very slow implementation for now (can this be optimized?) int batch = (int)h->ne[3]; @@ -2745,6 +2746,32 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( struct ggml_tensor* out = ggml_reshape_4d(ctx, out_cont, w_out, h_out, up * vp, batch); return ggml_scale(ctx, out, scale); +#else + // compute the weight diff and do a single conv + if (w1 == NULL) { + w1 = ggml_ext_merge_lora(ctx, w1b, w1a); + } + if(ggml_n_dims(w1) < 4){ + w1 = ggml_reshape_4d(ctx, w1, 1, 1, w1->ne[0], w1->ne[1]); + } + if (w2 == NULL) { + w2 = ggml_ext_merge_lora(ctx, w2b, w2a); + } + if(ggml_n_dims(w2) < 4){ + w2 = ggml_reshape_4d(ctx, w2, 1, 1, w2->ne[0], w2->ne[1]); + } + if(w2->ne[2] * w1->ne[2] != h->ne[2]){ + int k = sqrt(w2->ne[2] * w1->ne[2]/h->ne[2]); + GGML_ASSERT(k*k * h->ne[2] == w2->ne[2] * w1->ne[2]); + w2 = ggml_reshape_4d(ctx, w2, w2->ne[0]*k, w2->ne[1]*k, w2->ne[2]/(k*k), w2->ne[3]); + } + w1 = ggml_ext_cast_f32(ctx, w1); + w2 = ggml_ext_cast_f32(ctx, w2); + struct ggml_tensor* w = ggml_ext_kronecker(ctx, w1, w2); + struct ggml_tensor* out = ggml_conv_2d(ctx, w, h, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); + + return ggml_scale(ctx, out, scale); +#endif } } From 2430989cd791dc009fd5ab7dbdbf45b686fea969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 1 Feb 2026 19:54:40 +0100 Subject: [PATCH 05/13] LoKR: re-implement conv --- ggml_extend.hpp | 152 +++++++++++++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 59 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 90e250e08..0a57ff519 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1577,7 +1577,7 @@ struct WeightAdapter { bool force_prec_f32 = false; float scale = 1.f; } linear; - struct conv2d_params_t{ + struct conv2d_params_t { int s0 = 1; int s1 = 1; int p0 = 0; @@ -2642,7 +2642,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( bool is_conv, WeightAdapter::ForwardParams::conv2d_params_t conv_params, float scale) { - GGML_ASSERT((w1 != NULL || (w1a != NULL && w1b != NULL))); 
GGML_ASSERT((w2 != NULL || (w2a != NULL && w2b != NULL))); @@ -2660,16 +2659,16 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( if (!is_conv) { int batch = (int)h->ne[1]; - struct ggml_tensor* h_mat = ggml_reshape_2d(ctx, h, vq, uq * batch); + struct ggml_tensor* h_split = ggml_reshape_2d(ctx, h, vq, uq * batch); if (w2 != NULL) { - hb = ggml_mul_mat(ctx, w2, h_mat); + hb = ggml_mul_mat(ctx, w2, h_split); } else { - hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_mat)); + hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_split)); } - struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, batch); - struct ggml_tensor* hb_t = ggml_cont(ctx,ggml_transpose(ctx, hb_unbundled)); + struct ggml_tensor* hb_cat = ggml_reshape_3d(ctx, hb, vp, uq, batch); + struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb_cat)); struct ggml_tensor* hc; if (w1 != NULL) { @@ -2683,92 +2682,127 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( return ggml_scale(ctx, out, scale); } else { -#if 0 +#if 1 // very slow implementation for now (can this be optimized?) int batch = (int)h->ne[3]; // 1. Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] // This is free (metadata only) - struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); + // print_ggml_tensor(h, true, "\nh"); + struct ggml_tensor* h_split = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); + // print_ggml_tensor(h_split, true, "h_split"); struct ggml_tensor* hb; if (w2 != NULL) { - hb = ggml_conv_2d(ctx, w2, h_grouped, conv_params.s0, conv_params.s1, - conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); + // no LoRA + // print_ggml_tensor(w2, true, "w2"); + hb = ggml_ext_conv_2d(ctx, h_split, w2, nullptr, + conv_params.s0, + conv_params.s1, + conv_params.p0, + conv_params.p1, + conv_params.d0, + conv_params.d1, + conv_params.direct, + conv_params.circular_x, + conv_params.circular_y, + conv_params.scale); + } else { - int rank = (int)w2b->ne[1]; - int k = (int)sqrt(w2b->ne[0] / vq); - struct ggml_tensor* w2b_4d = (ggml_n_dims(w2b) < 4) ? ggml_reshape_4d(ctx, w2b, k, k, vq, rank) : w2b; - struct ggml_tensor* w2a_4d = (ggml_n_dims(w2a) < 4) ? 
ggml_reshape_4d(ctx, w2a, 1, 1, rank, vp) : w2a; + // TODO: do not merge (loCon forward) + // w2a could be 2d + w2 = ggml_ext_merge_lora(ctx, w2b, w2a); + if (ggml_n_dims(w2) < 4) { + w2 = ggml_reshape_4d(ctx, w2, 1, 1, w2->ne[0], w2->ne[1]); + } + if (w2->ne[2] != h_split->ne[2]) { + int k = sqrt(w2->ne[2] / h_split->ne[2]); + GGML_ASSERT(k * k * h_split->ne[2] == w2->ne[2]); + w2 = ggml_reshape_4d(ctx, w2, w2->ne[0] * k, w2->ne[1] * k, w2->ne[2] / (k * k), w2->ne[3]); + } + hb = ggml_ext_conv_2d(ctx, h_split, w2, nullptr, + conv_params.s0, + conv_params.s1, + conv_params.p0, + conv_params.p1, + conv_params.d0, + conv_params.d1, + conv_params.direct, + conv_params.circular_x, + conv_params.circular_y, + conv_params.scale); - struct ggml_tensor* ha = ggml_conv_2d(ctx, w2b_4d, h_grouped, conv_params.s0, conv_params.s1, - conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); - hb = ggml_conv_2d(ctx, w2a_4d, ha, 1, 1, 0, 0, 1, 1); + + // TODO: figure out why this is not working: + // struct ggml_tensor* ha = ggml_ext_conv_2d(ctx, h_split, w2a, nullptr, + // conv_params.s0, + // conv_params.s1, + // conv_params.p0, + // conv_params.p1, + // conv_params.d0, + // conv_params.d1); + // // not supporting lora_mid here + // hb = ggml_ext_conv_2d(ctx, + // ha, + // w2b, + // nullptr, + // 1, + // 1, + // 0, + // 0, + // 1, + // 1, + // conv_params.direct, + // conv_params.circular_x, + // conv_params.circular_y, + // conv_params.scale); } // Current hb shape: [W_out, H_out, vp, uq * batch] int w_out = (int)hb->ne[0]; int h_out = (int)hb->ne[1]; - // 2. Prepare for Matrix Multiplication - // Collapse spatial and 'vp' into one dimension to treat as 'M' in MatMul - // Shape: [W*H*vp, uq, batch] - struct ggml_tensor* hb_flat = ggml_reshape_3d(ctx, hb, w_out * h_out * vp, uq, batch); - // Transpose to [uq, W*H*vp, batch] so that uq is ne[0] (the shared K dimension) - struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_flat); + // struct ggml_tensor* hb_cat = ggml_reshape_4d(ctx, hb, w_out , h_out , vp * uq, batch); + // [W_out, H_out, vp * uq, batch] + // Now left to compute (W1 kr Id) * hb_cat == (W1 kr W2) * h - struct ggml_tensor* hc; + // merge the uq groups of size vp*w_out*h_out + struct ggml_tensor* hb_merged = ggml_reshape_2d(ctx, hb, w_out * h_out * vp, uq * batch); + struct ggml_tensor* hc_t; + struct ggml_tensor* hb_merged_t = ggml_cont(ctx, ggml_transpose(ctx, hb_merged)); if (w1 != NULL) { - struct ggml_tensor* w1_mat = ggml_reshape_2d(ctx, w1, uq, up); - hc = ggml_mul_mat(ctx, w1_mat, hb_t); + // Would be great to be able to transpose w1 instead to avoid transposing both hb and hc + hc_t = ggml_mul_mat(ctx, w1, hb_merged_t); } else { - // Low-rank: (up x rank) * (rank x uq) * (uq x Spatial) - hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); + hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_merged_t)); } - - // 3. 
Final Layout Transformation - // Current hc shape: [up, W*H*vp, batch] - // Logical dims in ne[1]: [W*H, vp] - // We want final shape: [W, H, up*vp, batch] - - // Split ne[1] back into spatial and vp - struct ggml_tensor* hc_split = ggml_reshape_4d(ctx, hc, up, w_out * h_out, vp, batch); - - // Permute to bring up and vp together: [spatial, up, vp, batch] - // This moves spatial to ne[0], which is necessary for the final W,H,C layout - struct ggml_tensor* hc_perm = ggml_permute(ctx, hc_split, 1, 0, 2, 3); - - // Resolve layout and scale in one go (if possible) or just cont - // This is the only mandatory copy - struct ggml_tensor* out_cont = ggml_cont(ctx, hc_perm); - - // Final reshape to merge up and vp into the channel dimension - struct ggml_tensor* out = ggml_reshape_4d(ctx, out_cont, w_out, h_out, up * vp, batch); - + struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); + hc = ggml_cont(ctx, hc); + struct ggml_tensor* out = ggml_reshape_4d(ctx, hc, w_out, h_out, up * vp, batch); return ggml_scale(ctx, out, scale); #else // compute the weight diff and do a single conv if (w1 == NULL) { w1 = ggml_ext_merge_lora(ctx, w1b, w1a); } - if(ggml_n_dims(w1) < 4){ + if (ggml_n_dims(w1) < 4) { w1 = ggml_reshape_4d(ctx, w1, 1, 1, w1->ne[0], w1->ne[1]); } if (w2 == NULL) { w2 = ggml_ext_merge_lora(ctx, w2b, w2a); } - if(ggml_n_dims(w2) < 4){ + if (ggml_n_dims(w2) < 4) { w2 = ggml_reshape_4d(ctx, w2, 1, 1, w2->ne[0], w2->ne[1]); } - if(w2->ne[2] * w1->ne[2] != h->ne[2]){ - int k = sqrt(w2->ne[2] * w1->ne[2]/h->ne[2]); - GGML_ASSERT(k*k * h->ne[2] == w2->ne[2] * w1->ne[2]); - w2 = ggml_reshape_4d(ctx, w2, w2->ne[0]*k, w2->ne[1]*k, w2->ne[2]/(k*k), w2->ne[3]); + if (w2->ne[2] * w1->ne[2] != h->ne[2]) { + int k = sqrt(w2->ne[2] * w1->ne[2] / h->ne[2]); + GGML_ASSERT(k * k * h->ne[2] == w2->ne[2] * w1->ne[2]); + w2 = ggml_reshape_4d(ctx, w2, w2->ne[0] * k, w2->ne[1] * k, w2->ne[2] / (k * k), w2->ne[3]); } - w1 = ggml_ext_cast_f32(ctx, w1); - w2 = ggml_ext_cast_f32(ctx, w2); - struct ggml_tensor* w = ggml_ext_kronecker(ctx, w1, w2); - struct ggml_tensor* out = ggml_conv_2d(ctx, w, h, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); + w1 = ggml_ext_cast_f32(ctx, w1); + w2 = ggml_ext_cast_f32(ctx, w2); + struct ggml_tensor* w = ggml_ext_kronecker(ctx, w1, w2); + struct ggml_tensor* out = ggml_ext_conv_2d(ctx, h, w, nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); return ggml_scale(ctx, out, scale); #endif From fbf401bcccadd4b614674e7550e100645bbb1a32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 00:21:50 +0100 Subject: [PATCH 06/13] lokr: fix conv bypass implementation --- ggml_extend.hpp | 126 +++++++++++++++++------------------------------- 1 file changed, 43 insertions(+), 83 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 0a57ff519..63abe8c00 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2682,20 +2682,12 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( return ggml_scale(ctx, out, scale); } else { -#if 1 - // very slow implementation for now (can this be optimized?) int batch = (int)h->ne[3]; - // 1. 
Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] - // This is free (metadata only) - // print_ggml_tensor(h, true, "\nh"); struct ggml_tensor* h_split = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); - // print_ggml_tensor(h_split, true, "h_split"); struct ggml_tensor* hb; if (w2 != NULL) { - // no LoRA - // print_ggml_tensor(w2, true, "w2"); hb = ggml_ext_conv_2d(ctx, h_split, w2, nullptr, conv_params.s0, conv_params.s1, @@ -2709,53 +2701,47 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( conv_params.scale); } else { - // TODO: do not merge (loCon forward) - // w2a could be 2d - w2 = ggml_ext_merge_lora(ctx, w2b, w2a); - if (ggml_n_dims(w2) < 4) { - w2 = ggml_reshape_4d(ctx, w2, 1, 1, w2->ne[0], w2->ne[1]); - } - if (w2->ne[2] != h_split->ne[2]) { - int k = sqrt(w2->ne[2] / h_split->ne[2]); - GGML_ASSERT(k * k * h_split->ne[2] == w2->ne[2]); - w2 = ggml_reshape_4d(ctx, w2, w2->ne[0] * k, w2->ne[1] * k, w2->ne[2] / (k * k), w2->ne[3]); + // swap a and b order for conv lora + struct ggml_tensor* a = w2b; + struct ggml_tensor* b = w2a; + + // unpack conv2d weights if needed + if (ggml_n_dims(a) < 4) { + int k = sqrt(a->ne[0] / h_split->ne[2]); + GGML_ASSERT(k * k * h_split->ne[2] == a->ne[0]); + a = ggml_reshape_4d(ctx, a, k, k, a->ne[0] / (k * k), a->ne[1]); + } else if (a->ne[2] != h_split->ne[2]) { + int k = sqrt(a->ne[2] / h_split->ne[2]); + GGML_ASSERT(k * k * h_split->ne[2] == a->ne[2]); + a = ggml_reshape_4d(ctx, a, a->ne[0] * k, a->ne[1] * k, a->ne[2] / (k * k), a->ne[3]); } - hb = ggml_ext_conv_2d(ctx, h_split, w2, nullptr, - conv_params.s0, - conv_params.s1, - conv_params.p0, - conv_params.p1, - conv_params.d0, - conv_params.d1, - conv_params.direct, - conv_params.circular_x, - conv_params.circular_y, - conv_params.scale); - - - // TODO: figure out why this is not working: - // struct ggml_tensor* ha = ggml_ext_conv_2d(ctx, h_split, w2a, nullptr, - // conv_params.s0, - // conv_params.s1, - // conv_params.p0, - // conv_params.p1, - // conv_params.d0, - // conv_params.d1); - // // not supporting lora_mid here - // hb = ggml_ext_conv_2d(ctx, - // ha, - // w2b, - // nullptr, - // 1, - // 1, - // 0, - // 0, - // 1, - // 1, - // conv_params.direct, - // conv_params.circular_x, - // conv_params.circular_y, - // conv_params.scale); + struct ggml_tensor* ha = ggml_ext_conv_2d(ctx, h_split, a, nullptr, + conv_params.s0, + conv_params.s1, + conv_params.p0, + conv_params.p1, + conv_params.d0, + conv_params.d1, + conv_params.direct, + conv_params.circular_x, + conv_params.circular_y, + conv_params.scale); + + // not supporting lora_mid here + hb = ggml_ext_conv_2d(ctx, + ha, + b, + nullptr, + 1, + 1, + 0, + 0, + 1, + 1, + conv_params.direct, + conv_params.circular_x, + conv_params.circular_y, + conv_params.scale); } // Current hb shape: [W_out, H_out, vp, uq * batch] @@ -2764,7 +2750,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( // struct ggml_tensor* hb_cat = ggml_reshape_4d(ctx, hb, w_out , h_out , vp * uq, batch); // [W_out, H_out, vp * uq, batch] - // Now left to compute (W1 kr Id) * hb_cat == (W1 kr W2) * h + // Now left to compute (W1 kr Id) * hb_cat == (W1 kr W2) cv h // merge the uq groups of size vp*w_out*h_out struct ggml_tensor* hb_merged = ggml_reshape_2d(ctx, hb, w_out * h_out * vp, uq * batch); @@ -2777,35 +2763,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_merged_t)); } struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); - hc = ggml_cont(ctx, 
hc); - struct ggml_tensor* out = ggml_reshape_4d(ctx, hc, w_out, h_out, up * vp, batch); - return ggml_scale(ctx, out, scale); -#else - // compute the weight diff and do a single conv - if (w1 == NULL) { - w1 = ggml_ext_merge_lora(ctx, w1b, w1a); - } - if (ggml_n_dims(w1) < 4) { - w1 = ggml_reshape_4d(ctx, w1, 1, 1, w1->ne[0], w1->ne[1]); - } - if (w2 == NULL) { - w2 = ggml_ext_merge_lora(ctx, w2b, w2a); - } - if (ggml_n_dims(w2) < 4) { - w2 = ggml_reshape_4d(ctx, w2, 1, 1, w2->ne[0], w2->ne[1]); - } - if (w2->ne[2] * w1->ne[2] != h->ne[2]) { - int k = sqrt(w2->ne[2] * w1->ne[2] / h->ne[2]); - GGML_ASSERT(k * k * h->ne[2] == w2->ne[2] * w1->ne[2]); - w2 = ggml_reshape_4d(ctx, w2, w2->ne[0] * k, w2->ne[1] * k, w2->ne[2] / (k * k), w2->ne[3]); - } - w1 = ggml_ext_cast_f32(ctx, w1); - w2 = ggml_ext_cast_f32(ctx, w2); - struct ggml_tensor* w = ggml_ext_kronecker(ctx, w1, w2); - struct ggml_tensor* out = ggml_ext_conv_2d(ctx, h, w, nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); - + // ungroup + struct ggml_tensor* out = ggml_reshape_4d(ctx, ggml_cont(ctx, hc), w_out, h_out, up * vp, batch); return ggml_scale(ctx, out, scale); -#endif } } From 04f9b1f5f0b1f56358ed397d1674f932b57539c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 00:22:10 +0100 Subject: [PATCH 07/13] lokr: cleanup linear path code --- ggml_extend.hpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 63abe8c00..6424217fa 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2658,27 +2658,25 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( struct ggml_tensor* hb; if (!is_conv) { - int batch = (int)h->ne[1]; - struct ggml_tensor* h_split = ggml_reshape_2d(ctx, h, vq, uq * batch); + int batch = (int)h->ne[1]; + struct ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq, batch); if (w2 != NULL) { hb = ggml_mul_mat(ctx, w2, h_split); } else { hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_split)); } + struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb)); - struct ggml_tensor* hb_cat = ggml_reshape_3d(ctx, hb, vp, uq, batch); - struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb_cat)); - - struct ggml_tensor* hc; + struct ggml_tensor* hc_t; if (w1 != NULL) { - hc = ggml_mul_mat(ctx, w1, hb_t); + hc_t = ggml_mul_mat(ctx, w1, hb_t); } else { - hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); + hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); } - struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); - struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc_t), up * vp, batch); + struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); + struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); return ggml_scale(ctx, out, scale); } else { From 5b67c4b6d2de012285b591f2d98c04782ce9b633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 01:21:53 +0100 Subject: [PATCH 08/13] reshape to 2d before mat_mul --- ggml_extend.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 6424217fa..effb578d3 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2659,14 +2659,17 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( if (!is_conv) { int batch = (int)h->ne[1]; - struct 
ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq, batch); - + struct ggml_tensor* h_split = ggml_reshape_2d(ctx, h, vq, uq * batch); if (w2 != NULL) { hb = ggml_mul_mat(ctx, w2, h_split); } else { hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_split)); } + hb = ggml_reshape_3d(ctx, hb, vp, uq, batch); + struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb)); + + hb_t = ggml_reshape_2d(ctx, hb_t, uq, vp * batch); struct ggml_tensor* hc_t; if (w1 != NULL) { @@ -2674,6 +2677,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } else { hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); } + hc_t = ggml_reshape_3d(ctx, hc_t, up, vp, batch); struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); From f7d53b6551ea9b29c5f86ba0eacbee0d55feca9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 01:38:25 +0100 Subject: [PATCH 09/13] maxComputeWorkGroupCount workaround for vulkan --- ggml_extend.hpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index effb578d3..b55da2e15 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2665,11 +2665,16 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } else { hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_split)); } - hb = ggml_reshape_3d(ctx, hb, vp, uq, batch); + + if(batch > 1){ + hb = ggml_reshape_3d(ctx, hb, vp, uq, batch); + } struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb)); - hb_t = ggml_reshape_2d(ctx, hb_t, uq, vp * batch); + if(batch > 1){ + hb_t = ggml_reshape_2d(ctx, hb_t, uq, vp * batch); + } struct ggml_tensor* hc_t; if (w1 != NULL) { @@ -2677,7 +2682,10 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } else { hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); } - hc_t = ggml_reshape_3d(ctx, hc_t, up, vp, batch); + + if(batch > 1){ + hc_t = ggml_reshape_3d(ctx, hc_t, up, vp, batch); + } struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); From 244480e50f12379c8caa1361f0d7964b62f6e275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 17:13:29 +0100 Subject: [PATCH 10/13] Avoid too large tensors dims in matmul for smaller vk workgroups --- ggml_extend.hpp | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index b55da2e15..6c65f7a0d 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2658,23 +2658,38 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( struct ggml_tensor* hb; if (!is_conv) { - int batch = (int)h->ne[1]; - struct ggml_tensor* h_split = ggml_reshape_2d(ctx, h, vq, uq * batch); + int max_batch = 65535; + int batch = (int)h->ne[1]; + int max_batch_uq = max_batch / uq; + int merge_batch_uq = 1; + for (int i = max_batch_uq; i > 0; i--) { + if (batch % i == 0) { + merge_batch_uq = i; + break; + } + } + + int max_batch_vp = max_batch / vp; + int merge_batch_vp = 1; + for (int i = max_batch_vp; i > 0; i--) { + if (batch % i == 0) { + merge_batch_vp = i; + break; + } + } + + struct ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq * merge_batch_uq, batch / merge_batch_uq); if (w2 != NULL) { hb = ggml_mul_mat(ctx, w2, h_split); } else { hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_split)); } - - if(batch > 
1){ + + if (batch > 1) { hb = ggml_reshape_3d(ctx, hb, vp, uq, batch); } - struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb)); - - if(batch > 1){ - hb_t = ggml_reshape_2d(ctx, hb_t, uq, vp * batch); - } + hb_t = ggml_reshape_3d(ctx, hb_t, uq, vp * merge_batch_vp, batch / merge_batch_vp); struct ggml_tensor* hc_t; if (w1 != NULL) { @@ -2682,13 +2697,13 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } else { hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); } - - if(batch > 1){ + + if (batch > 1) { hc_t = ggml_reshape_3d(ctx, hc_t, up, vp, batch); } - struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); - struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); + struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); + struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); return ggml_scale(ctx, out, scale); } else { @@ -2696,7 +2711,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( // 1. Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] struct ggml_tensor* h_split = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); - struct ggml_tensor* hb; if (w2 != NULL) { hb = ggml_ext_conv_2d(ctx, h_split, w2, nullptr, conv_params.s0, From 1ab9ed28ea3f1b2518f5758f01f72a60ba8d6681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 17:13:37 +0100 Subject: [PATCH 11/13] make it vk only --- ggml_extend.hpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 6c65f7a0d..8adacdf46 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2658,10 +2658,15 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( struct ggml_tensor* hb; if (!is_conv) { - int max_batch = 65535; int batch = (int)h->ne[1]; - int max_batch_uq = max_batch / uq; - int merge_batch_uq = 1; + int merge_batch_uq = batch; + int merge_batch_vp = batch; + +#if SD_VULKAN + // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend + int max_batch = 65535; + int max_batch_uq = max_batch / uq; + merge_batch_uq = 1; for (int i = max_batch_uq; i > 0; i--) { if (batch % i == 0) { merge_batch_uq = i; @@ -2669,14 +2674,15 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } } - int max_batch_vp = max_batch / vp; - int merge_batch_vp = 1; + int max_batch_vp = max_batch / vp; + merge_batch_vp = 1; for (int i = max_batch_vp; i > 0; i--) { if (batch % i == 0) { merge_batch_vp = i; break; } } +#endif struct ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq * merge_batch_uq, batch / merge_batch_uq); if (w2 != NULL) { @@ -2705,7 +2711,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); return ggml_scale(ctx, out, scale); - } else { int batch = (int)h->ne[3]; // 1. 
Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] @@ -2723,7 +2728,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( conv_params.circular_x, conv_params.circular_y, conv_params.scale); - } else { // swap a and b order for conv lora struct ggml_tensor* a = w2b; From c7629d9b23c6509edf2f18086e37102421546426 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 17:14:01 +0100 Subject: [PATCH 12/13] remove unncesary casts for non-conv weights --- lora.hpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lora.hpp b/lora.hpp index fd461086a..485963fd3 100644 --- a/lora.hpp +++ b/lora.hpp @@ -540,23 +540,14 @@ struct LoraModel : public GGMLRunner { if (iter != lora_tensors.end()) { lokr_w1 = iter->second; - if (is_conv2d && lokr_w1->type != GGML_TYPE_F16) { - lokr_w1 = ggml_cast(ctx, lokr_w1, GGML_TYPE_F16); - } } iter = iter_a; if (iter != lora_tensors.end()) { lokr_w1_a = iter->second; - if (is_conv2d && lokr_w1_a->type != GGML_TYPE_F16) { - lokr_w1_a = ggml_cast(ctx, lokr_w1_a, GGML_TYPE_F16); - } } iter = lora_tensors.find(lokr_w1_b_name); if (iter != lora_tensors.end()) { lokr_w1_b = iter->second; - if (is_conv2d && lokr_w1_b->type != GGML_TYPE_F16) { - lokr_w1_b = ggml_cast(ctx, lokr_w1_b, GGML_TYPE_F16); - } } iter = lora_tensors.find(lokr_w2_name); From 30051a2d9aaa5b563b1c412ff3481456443bc1b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 17:50:20 +0100 Subject: [PATCH 13/13] fix wrong flag (oops) --- ggml_extend.hpp | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 8adacdf46..a9fe46fec 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2662,24 +2662,26 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( int merge_batch_uq = batch; int merge_batch_vp = batch; -#if SD_VULKAN - // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend - int max_batch = 65535; - int max_batch_uq = max_batch / uq; - merge_batch_uq = 1; - for (int i = max_batch_uq; i > 0; i--) { - if (batch % i == 0) { - merge_batch_uq = i; - break; +#if SD_USE_VULKAN + if (batch > 1) { + // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend + int max_batch = 65535; + int max_batch_uq = max_batch / uq; + merge_batch_uq = 1; + for (int i = max_batch_uq; i > 0; i--) { + if (batch % i == 0) { + merge_batch_uq = i; + break; + } } - } - int max_batch_vp = max_batch / vp; - merge_batch_vp = 1; - for (int i = max_batch_vp; i > 0; i--) { - if (batch % i == 0) { - merge_batch_vp = i; - break; + int max_batch_vp = max_batch / vp; + merge_batch_vp = 1; + for (int i = max_batch_vp; i > 0; i--) { + if (batch % i == 0) { + merge_batch_vp = i; + break; + } } } #endif
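
A note on the transform these patches exploit: LoKr stores the weight delta as a Kronecker product, delta_W = W1 ⊗ W2, with W1 the small outer factor (up x uq) and W2 the inner factor (vp x vq). ggml_ext_lokr_forward never materialises the (up*vp) x (uq*vq) matrix; for the linear path it relies on the standard vec identity (W1 ⊗ W2) * x = vec(W2 * X * W1^T), where X is the input reshaped to vq x uq, which is what the reshape / mul_mat(W2) / transpose / mul_mat(W1) sequence implements (up to ggml's ne dimension ordering). The sketch below is a minimal standalone check of that identity in plain C++ with no ggml dependency; the up/uq/vp/vq names mirror ggml_ext_lokr_forward, everything else (file name, helpers, dimensions) is illustrative only.

// lokr_identity_check.cpp
// Minimal standalone check (plain C++, no ggml) of the Kronecker identity that
// the linear LoKr path exploits:
//     (W1 kron W2) * x == vec( W2 * X * W1^T ),   X = x reshaped to vq x uq
// Shapes (math row x col): W1 is up x uq (outer factor), W2 is vp x vq (inner
// factor), so the full (up*vp) x (uq*vq) delta-weight never has to be built.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// naive row-major matmul: A is ar x ac, B is br x bc, returns ar x bc
static std::vector<float> matmul(const std::vector<float>& A, int ar, int ac,
                                 const std::vector<float>& B, int br, int bc) {
    assert(ac == br);
    std::vector<float> C(ar * bc, 0.f);
    for (int i = 0; i < ar; i++)
        for (int k = 0; k < ac; k++)
            for (int j = 0; j < bc; j++)
                C[i * bc + j] += A[i * ac + k] * B[k * bc + j];
    return C;
}

int main() {
    const int up = 3, uq = 4, vp = 5, vq = 2;
    std::mt19937 rng(0);
    std::uniform_real_distribution<float> dist(-1.f, 1.f);
    auto rnd = [&](int n) { std::vector<float> v(n); for (float& e : v) e = dist(rng); return v; };

    std::vector<float> W1 = rnd(up * uq);  // up x uq
    std::vector<float> W2 = rnd(vp * vq);  // vp x vq
    std::vector<float> x  = rnd(uq * vq);  // input of length uq*vq

    // 1) slow path: materialise K = W1 kron W2 and multiply
    std::vector<float> K(up * vp * uq * vq);
    for (int i = 0; i < up; i++)
        for (int j = 0; j < uq; j++)
            for (int k = 0; k < vp; k++)
                for (int l = 0; l < vq; l++)
                    K[(i * vp + k) * (uq * vq) + (j * vq + l)] = W1[i * uq + j] * W2[k * vq + l];
    std::vector<float> y_full = matmul(K, up * vp, uq * vq, x, uq * vq, 1);

    // 2) factored path: X = reshape(x, vq x uq), column j = j-th length-vq block of x,
    //    then Y = W2 * X * W1^T and the result is vec(Y)
    std::vector<float> X(vq * uq);
    for (int j = 0; j < uq; j++)
        for (int l = 0; l < vq; l++)
            X[l * uq + j] = x[j * vq + l];
    std::vector<float> W1t(uq * up);  // W1^T, uq x up
    for (int i = 0; i < up; i++)
        for (int j = 0; j < uq; j++)
            W1t[j * up + i] = W1[i * uq + j];
    std::vector<float> Y = matmul(matmul(W2, vp, vq, X, vq, uq), vp, uq, W1t, uq, up);  // vp x up

    // vec(Y) taken column-wise matches the Kronecker ordering: y[i*vp + k] == Y[k][i]
    float max_err = 0.f;
    for (int i = 0; i < up; i++)
        for (int k = 0; k < vp; k++)
            max_err = std::max(max_err, std::fabs(y_full[i * vp + k] - Y[k * up + i]));
    std::printf("max |full - factored| = %g\n", max_err);
    return max_err < 1e-4f ? 0 : 1;
}

The conv path in the later patches follows the same pattern: the uq factor is folded into the batch dimension so that W2 (or the w2a/w2b pair) can be applied as an ordinary conv2d over vq input channels, and the remaining mixing by W1 (or w1a/w1b) is a single matmul over the grouped channels before ungrouping back to up*vp output channels.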