From b4db4be5f1a19f0691bb0a1160fa841adb61f771 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 29 Jan 2026 16:34:29 +0100 Subject: [PATCH 01/13] LoRA: Optimise LoKr at runtime --- ggml_extend.hpp | 83 +++++++++++++++++++++++++++++++++- lora.hpp | 116 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 195 insertions(+), 4 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 193a2c392..dd1309955 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1577,7 +1577,7 @@ struct WeightAdapter { bool force_prec_f32 = false; float scale = 1.f; } linear; - struct { + struct conv2d_params_t{ int s0 = 1; int s1 = 1; int p0 = 0; @@ -2630,4 +2630,85 @@ class MultiheadAttention : public GGMLBlock { } }; +__STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( + struct ggml_context* ctx, + struct ggml_tensor* h, // Input: [q, batch] or [W, H, q, batch] + struct ggml_tensor* w1, // Outer C (Full rank) + struct ggml_tensor* w1a, // Outer A (Low rank part 1) + struct ggml_tensor* w1b, // Outer B (Low rank part 2) + struct ggml_tensor* w2, // Inner BA (Full rank) + struct ggml_tensor* w2a, // Inner A (Low rank part 1) + struct ggml_tensor* w2b, // Inner B (Low rank part 2) + bool is_conv, + WeightAdapter::ForwardParams::conv2d_params_t conv_params, + float scale) { + + GGML_ASSERT((w1 != NULL || (w1a != NULL && w1b != NULL))); + GGML_ASSERT((w2 != NULL || (w2a != NULL && w2b != NULL))); + + int vq = (w2 != NULL) ? w2->ne[0] : w2a->ne[0]; + int vp = (w2 != NULL) ? w2->ne[1] : (is_conv ? w2b->ne[3] : w2b->ne[1]); + + int uq = (w1 != NULL) ? w1->ne[0] : w1a->ne[0]; + int up = (w1 != NULL) ? w1->ne[1] : w1b->ne[1]; + + int q_expected = uq * vq; + int q_actual = is_conv ? h->ne[2] : h->ne[0]; + GGML_ASSERT(q_actual == q_expected && "Input dimension mismatch for LoKR split"); + + struct ggml_tensor* hb; + + if (!is_conv) { + // Treat input as a grid: [vq, uq * batch] + struct ggml_tensor* h_mat = ggml_reshape_2d(ctx, h, vq, uq * h->ne[1]); + + if (w2 != NULL) { + hb = ggml_mul_mat(ctx, w2, h_mat); + } else { + hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_mat)); + } + } else { + // Reshape so uq is in the batch dimension: [W, H, vq, uq * batch] + struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * h->ne[3]); + + if (w2 != NULL) { + hb = ggml_ext_conv_2d(ctx, w2, h_grouped,nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); + } else { + // w2a is [1, 1, vq, rank], w2b is [kw, kh, rank, vp] + struct ggml_tensor* tmp = ggml_conv_2d(ctx, w2a, h_grouped, 1, 1, 0, 0, 1, 1); + hb = ggml_ext_conv_2d(ctx, w2b, tmp, nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); + } + } + + // At this point hb is [W_out, H_out, vp, uq * batch] + // We reshape to isolate uq for matrix multiplication + int w_out = is_conv ? hb->ne[0] : 1; + int h_out = is_conv ? hb->ne[1] : 1; + int batch = is_conv ? 
h->ne[3] : h->ne[1]; + + // Rearrange to [vp, uq, spatial*batch] + struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, w_out * h_out * batch); + + // Transpose so uq is ne[0] for ggml_mul_mat + struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_unbundled); + + struct ggml_tensor* hc; + if (w1 != NULL) { + hc = ggml_mul_mat(ctx, w1, hb_t); + } else { + hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); + } + + struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); + struct ggml_tensor* out; + if (is_conv) { + out = ggml_reshape_4d(ctx, hc_t, w_out, h_out, up * vp, batch); + } else { + + out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc_t), up * vp, batch); + } + + return ggml_scale(ctx, out, scale); +} + #endif // __GGML_EXTEND__HPP__ diff --git a/lora.hpp b/lora.hpp index e5d9906ff..fd461086a 100644 --- a/lora.hpp +++ b/lora.hpp @@ -483,7 +483,7 @@ struct LoraModel : public GGMLRunner { diff = get_loha_weight_diff(model_tensor_name, ctx); } // lokr - if (diff == nullptr) { + if (diff == nullptr && with_lora) { diff = get_lokr_weight_diff(model_tensor_name, ctx); } if (diff != nullptr) { @@ -501,6 +501,8 @@ struct LoraModel : public GGMLRunner { return diff; } + + ggml_tensor* get_out_diff(ggml_context* ctx, ggml_tensor* x, WeightAdapter::ForwardParams forward_params, @@ -514,6 +516,115 @@ struct LoraModel : public GGMLRunner { } else { key = model_tensor_name + "." + std::to_string(index); } + bool is_conv2d = forward_params.op_type == WeightAdapter::ForwardParams::op_type_t::OP_CONV2D; + + + std::string lokr_w1_name = "lora." + key + ".lokr_w1"; + std::string lokr_w1_a_name = "lora." + key + ".lokr_w1_a"; + // if either of these is found, then we have a lokr lora + auto iter = lora_tensors.find(lokr_w1_name); + auto iter_a = lora_tensors.find(lokr_w1_a_name); + if (iter != lora_tensors.end() || iter_a != lora_tensors.end()) { + std::string lokr_w1_b_name = "lora." + key + ".lokr_w1_b"; + std::string lokr_w2_name = "lora." + key + ".lokr_w2"; + std::string lokr_w2_a_name = "lora." + key + ".lokr_w2_a"; + std::string lokr_w2_b_name = "lora." + key + ".lokr_w2_b"; + std::string alpha_name = "lora." 
+ key + ".alpha"; + + ggml_tensor* lokr_w1 = nullptr; + ggml_tensor* lokr_w1_a = nullptr; + ggml_tensor* lokr_w1_b = nullptr; + ggml_tensor* lokr_w2 = nullptr; + ggml_tensor* lokr_w2_a = nullptr; + ggml_tensor* lokr_w2_b = nullptr; + + if (iter != lora_tensors.end()) { + lokr_w1 = iter->second; + if (is_conv2d && lokr_w1->type != GGML_TYPE_F16) { + lokr_w1 = ggml_cast(ctx, lokr_w1, GGML_TYPE_F16); + } + } + iter = iter_a; + if (iter != lora_tensors.end()) { + lokr_w1_a = iter->second; + if (is_conv2d && lokr_w1_a->type != GGML_TYPE_F16) { + lokr_w1_a = ggml_cast(ctx, lokr_w1_a, GGML_TYPE_F16); + } + } + iter = lora_tensors.find(lokr_w1_b_name); + if (iter != lora_tensors.end()) { + lokr_w1_b = iter->second; + if (is_conv2d && lokr_w1_b->type != GGML_TYPE_F16) { + lokr_w1_b = ggml_cast(ctx, lokr_w1_b, GGML_TYPE_F16); + } + } + + iter = lora_tensors.find(lokr_w2_name); + if (iter != lora_tensors.end()) { + lokr_w2 = iter->second; + if (is_conv2d && lokr_w2->type != GGML_TYPE_F16) { + lokr_w2 = ggml_cast(ctx, lokr_w2, GGML_TYPE_F16); + } + } + iter = lora_tensors.find(lokr_w2_a_name); + if (iter != lora_tensors.end()) { + lokr_w2_a = iter->second; + if (is_conv2d && lokr_w2_a->type != GGML_TYPE_F16) { + lokr_w2_a = ggml_cast(ctx, lokr_w2_a, GGML_TYPE_F16); + } + } + iter = lora_tensors.find(lokr_w2_b_name); + if (iter != lora_tensors.end()) { + lokr_w2_b = iter->second; + if (is_conv2d && lokr_w2_b->type != GGML_TYPE_F16) { + lokr_w2_b = ggml_cast(ctx, lokr_w2_b, GGML_TYPE_F16); + } + } + + int rank = 1; + if (lokr_w1_b) { + rank = lokr_w1_b->ne[ggml_n_dims(lokr_w1_b) - 1]; + } + if (lokr_w2_b) { + rank = lokr_w2_b->ne[ggml_n_dims(lokr_w2_b) - 1]; + } + + float scale_value = 1.0f; + iter = lora_tensors.find(alpha_name); + if (iter != lora_tensors.end()) { + float alpha = ggml_ext_backend_tensor_get_f32(iter->second); + scale_value = alpha / rank; + applied_lora_tensors.insert(alpha_name); + } + + if (rank == 1) { + scale_value = 1.0f; + } + scale_value *= multiplier; + + auto curr_out_diff = ggml_ext_lokr_forward(ctx, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value); + if (out_diff == nullptr) { + out_diff = curr_out_diff; + } else { + out_diff = ggml_concat(ctx, out_diff, curr_out_diff, 0); + } + + if(lokr_w1) applied_lora_tensors.insert(lokr_w1_name); + if(lokr_w1_a) applied_lora_tensors.insert(lokr_w1_a_name); + if(lokr_w1_b) applied_lora_tensors.insert(lokr_w1_b_name); + if(lokr_w2) applied_lora_tensors.insert(lokr_w2_name); + if(lokr_w2_a) applied_lora_tensors.insert(lokr_w2_name); + if(lokr_w2_b) applied_lora_tensors.insert(lokr_w2_b_name); + applied_lora_tensors.insert(alpha_name); + + + index++; + continue; + } + + // not a lork, normal lora path + + std::string lora_down_name = "lora." + key + ".lora_down"; std::string lora_up_name = "lora." 
+ key + ".lora_up"; @@ -525,9 +636,8 @@ struct LoraModel : public GGMLRunner { ggml_tensor* lora_mid = nullptr; ggml_tensor* lora_down = nullptr; - bool is_conv2d = forward_params.op_type == WeightAdapter::ForwardParams::op_type_t::OP_CONV2D; - auto iter = lora_tensors.find(lora_up_name); + iter = lora_tensors.find(lora_up_name); if (iter != lora_tensors.end()) { lora_up = iter->second; if (is_conv2d && lora_up->type != GGML_TYPE_F16) { From d608b37ad50a0aa32676adc150c506c7136c022e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 30 Jan 2026 21:06:12 +0100 Subject: [PATCH 02/13] lokr: fix convs --- ggml_extend.hpp | 96 +++++++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index dd1309955..f75b8464b 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2646,69 +2646,87 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( GGML_ASSERT((w1 != NULL || (w1a != NULL && w1b != NULL))); GGML_ASSERT((w2 != NULL || (w2a != NULL && w2b != NULL))); - int vq = (w2 != NULL) ? w2->ne[0] : w2a->ne[0]; - int vp = (w2 != NULL) ? w2->ne[1] : (is_conv ? w2b->ne[3] : w2b->ne[1]); + int uq = (w1 != NULL) ? (int)w1->ne[0] : (int)w1a->ne[0]; + int up = (w1 != NULL) ? (int)w1->ne[1] : (int)w1b->ne[1]; - int uq = (w1 != NULL) ? w1->ne[0] : w1a->ne[0]; - int up = (w1 != NULL) ? w1->ne[1] : w1b->ne[1]; + int q_actual = is_conv ? (int)h->ne[2] : (int)h->ne[0]; + int vq = q_actual / uq; - int q_expected = uq * vq; - int q_actual = is_conv ? h->ne[2] : h->ne[0]; - GGML_ASSERT(q_actual == q_expected && "Input dimension mismatch for LoKR split"); + int vp = (w2 != NULL) ? (is_conv ? (int)w2->ne[3] : (int)w2->ne[1]) + : (int)w2a->ne[1]; + GGML_ASSERT(q_actual == (uq * vq) && "Input dimension mismatch for LoKR split"); struct ggml_tensor* hb; if (!is_conv) { - // Treat input as a grid: [vq, uq * batch] - struct ggml_tensor* h_mat = ggml_reshape_2d(ctx, h, vq, uq * h->ne[1]); + int batch = (int)h->ne[1]; + struct ggml_tensor* h_mat = ggml_reshape_2d(ctx, h, vq, uq * batch); if (w2 != NULL) { hb = ggml_mul_mat(ctx, w2, h_mat); } else { hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_mat)); } + + struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, batch); + struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_unbundled); + + struct ggml_tensor* hc; + if (w1 != NULL) { + hc = ggml_mul_mat(ctx, w1, hb_t); + } else { + hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); + } + + struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); + struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc_t), up * vp, batch); + return ggml_scale(ctx, out, scale); + } else { - // Reshape so uq is in the batch dimension: [W, H, vq, uq * batch] - struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * h->ne[3]); + int batch = (int)h->ne[3]; + + // Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] + struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); if (w2 != NULL) { - hb = ggml_ext_conv_2d(ctx, w2, h_grouped,nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); + hb = ggml_conv_2d(ctx, w2, h_grouped, conv_params.s0, conv_params.s1, + conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); } else { - // w2a is [1, 1, vq, rank], w2b is [kw, kh, rank, vp] - 
struct ggml_tensor* tmp = ggml_conv_2d(ctx, w2a, h_grouped, 1, 1, 0, 0, 1, 1); - hb = ggml_ext_conv_2d(ctx, w2b, tmp, nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); + // Low-rank decomposition: w2b is the spatial kernel, w2a is the 1x1 projection + // Inner LoRA: w2b is the spatial/down-project, w2a is the 1x1 up-project + int rank = (int)w2b->ne[1]; + int k = (int)sqrt(w2b->ne[0] / vq); + + struct ggml_tensor* w2b_4d = (ggml_n_dims(w2b) < 3) ? ggml_reshape_4d(ctx, w2b, k, k, vq, rank) : w2b; + struct ggml_tensor* w2a_4d = (ggml_n_dims(w2a) < 3) ? ggml_reshape_4d(ctx, w2a, 1, 1, rank, vp) : w2a; + + struct ggml_tensor* ha = ggml_conv_2d(ctx, w2b_4d, h_grouped, conv_params.s0, conv_params.s1, + conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); + hb = ggml_conv_2d(ctx, w2a_4d, ha, 1, 1, 0, 0, 1, 1); } - } - // At this point hb is [W_out, H_out, vp, uq * batch] - // We reshape to isolate uq for matrix multiplication - int w_out = is_conv ? hb->ne[0] : 1; - int h_out = is_conv ? hb->ne[1] : 1; - int batch = is_conv ? h->ne[3] : h->ne[1]; + int w_out = (int)hb->ne[0]; + int h_out = (int)hb->ne[1]; - // Rearrange to [vp, uq, spatial*batch] - struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, w_out * h_out * batch); + struct ggml_tensor* hb_flat = ggml_reshape_3d(ctx, hb, w_out * h_out * vp, uq, batch); + struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_flat); - // Transpose so uq is ne[0] for ggml_mul_mat - struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_unbundled); + struct ggml_tensor* hc; + struct ggml_tensor* w1_mat = (w1 != NULL) ? ggml_reshape_2d(ctx, w1, uq, up) : NULL; - struct ggml_tensor* hc; - if (w1 != NULL) { - hc = ggml_mul_mat(ctx, w1, hb_t); - } else { - hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); - } + if (w1_mat != NULL) { + hc = ggml_mul_mat(ctx, w1_mat, hb_t); + } else { + hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); + } - struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); - struct ggml_tensor* out; - if (is_conv) { - out = ggml_reshape_4d(ctx, hc_t, w_out, h_out, up * vp, batch); - } else { + struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); + struct ggml_tensor* hc_res = ggml_reshape_4d(ctx, ggml_cont(ctx, hc_t), vp, w_out * h_out, up, batch); + struct ggml_tensor* hc_perm = ggml_permute(ctx, hc_res, 1, 2, 0, 3); + struct ggml_tensor* out = ggml_reshape_4d(ctx, ggml_cont(ctx, hc_perm), w_out, h_out, up * vp, batch); - out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc_t), up * vp, batch); + return ggml_scale(ctx, out, scale); } - - return ggml_scale(ctx, out, scale); } #endif // __GGML_EXTEND__HPP__ From b48609762791aab61cf48bde182a700bf45d8fe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 1 Feb 2026 16:25:34 +0100 Subject: [PATCH 03/13] lokr: fix lienar forward for CUDA/HIP and CPU backends --- ggml_extend.hpp | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index f75b8464b..8e1a5c42b 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2669,7 +2669,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, batch); - struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_unbundled); + struct ggml_tensor* hb_t = ggml_cont(ctx,ggml_transpose(ctx, hb_unbundled)); 
struct ggml_tensor* hc; if (w1 != NULL) { @@ -2683,47 +2683,66 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( return ggml_scale(ctx, out, scale); } else { + // very slow implementation for now (can this be optimized?) int batch = (int)h->ne[3]; - // Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] + // 1. Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] + // This is free (metadata only) struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); + struct ggml_tensor* hb; if (w2 != NULL) { hb = ggml_conv_2d(ctx, w2, h_grouped, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); } else { - // Low-rank decomposition: w2b is the spatial kernel, w2a is the 1x1 projection - // Inner LoRA: w2b is the spatial/down-project, w2a is the 1x1 up-project int rank = (int)w2b->ne[1]; int k = (int)sqrt(w2b->ne[0] / vq); - - struct ggml_tensor* w2b_4d = (ggml_n_dims(w2b) < 3) ? ggml_reshape_4d(ctx, w2b, k, k, vq, rank) : w2b; - struct ggml_tensor* w2a_4d = (ggml_n_dims(w2a) < 3) ? ggml_reshape_4d(ctx, w2a, 1, 1, rank, vp) : w2a; + struct ggml_tensor* w2b_4d = (ggml_n_dims(w2b) < 4) ? ggml_reshape_4d(ctx, w2b, k, k, vq, rank) : w2b; + struct ggml_tensor* w2a_4d = (ggml_n_dims(w2a) < 4) ? ggml_reshape_4d(ctx, w2a, 1, 1, rank, vp) : w2a; struct ggml_tensor* ha = ggml_conv_2d(ctx, w2b_4d, h_grouped, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); - hb = ggml_conv_2d(ctx, w2a_4d, ha, 1, 1, 0, 0, 1, 1); + hb = ggml_conv_2d(ctx, w2a_4d, ha, 1, 1, 0, 0, 1, 1); } + // Current hb shape: [W_out, H_out, vp, uq * batch] int w_out = (int)hb->ne[0]; int h_out = (int)hb->ne[1]; + // 2. Prepare for Matrix Multiplication + // Collapse spatial and 'vp' into one dimension to treat as 'M' in MatMul + // Shape: [W*H*vp, uq, batch] struct ggml_tensor* hb_flat = ggml_reshape_3d(ctx, hb, w_out * h_out * vp, uq, batch); + // Transpose to [uq, W*H*vp, batch] so that uq is ne[0] (the shared K dimension) struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_flat); struct ggml_tensor* hc; - struct ggml_tensor* w1_mat = (w1 != NULL) ? ggml_reshape_2d(ctx, w1, uq, up) : NULL; - - if (w1_mat != NULL) { + if (w1 != NULL) { + struct ggml_tensor* w1_mat = ggml_reshape_2d(ctx, w1, uq, up); hc = ggml_mul_mat(ctx, w1_mat, hb_t); } else { + // Low-rank: (up x rank) * (rank x uq) * (uq x Spatial) hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); } - struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); - struct ggml_tensor* hc_res = ggml_reshape_4d(ctx, ggml_cont(ctx, hc_t), vp, w_out * h_out, up, batch); - struct ggml_tensor* hc_perm = ggml_permute(ctx, hc_res, 1, 2, 0, 3); - struct ggml_tensor* out = ggml_reshape_4d(ctx, ggml_cont(ctx, hc_perm), w_out, h_out, up * vp, batch); + // 3. 
Final Layout Transformation + // Current hc shape: [up, W*H*vp, batch] + // Logical dims in ne[1]: [W*H, vp] + // We want final shape: [W, H, up*vp, batch] + + // Split ne[1] back into spatial and vp + struct ggml_tensor* hc_split = ggml_reshape_4d(ctx, hc, up, w_out * h_out, vp, batch); + + // Permute to bring up and vp together: [spatial, up, vp, batch] + // This moves spatial to ne[0], which is necessary for the final W,H,C layout + struct ggml_tensor* hc_perm = ggml_permute(ctx, hc_split, 1, 0, 2, 3); + + // Resolve layout and scale in one go (if possible) or just cont + // This is the only mandatory copy + struct ggml_tensor* out_cont = ggml_cont(ctx, hc_perm); + + // Final reshape to merge up and vp into the channel dimension + struct ggml_tensor* out = ggml_reshape_4d(ctx, out_cont, w_out, h_out, up * vp, batch); return ggml_scale(ctx, out, scale); } From 85538628ea12909f707d52fb808e546d11d8a3cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 1 Feb 2026 17:52:59 +0100 Subject: [PATCH 04/13] lokr: disable "optimization" for convolutions --- ggml_extend.hpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 8e1a5c42b..90e250e08 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2683,6 +2683,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( return ggml_scale(ctx, out, scale); } else { +#if 0 // very slow implementation for now (can this be optimized?) int batch = (int)h->ne[3]; @@ -2745,6 +2746,32 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( struct ggml_tensor* out = ggml_reshape_4d(ctx, out_cont, w_out, h_out, up * vp, batch); return ggml_scale(ctx, out, scale); +#else + // compute the weight diff and do a single conv + if (w1 == NULL) { + w1 = ggml_ext_merge_lora(ctx, w1b, w1a); + } + if(ggml_n_dims(w1) < 4){ + w1 = ggml_reshape_4d(ctx, w1, 1, 1, w1->ne[0], w1->ne[1]); + } + if (w2 == NULL) { + w2 = ggml_ext_merge_lora(ctx, w2b, w2a); + } + if(ggml_n_dims(w2) < 4){ + w2 = ggml_reshape_4d(ctx, w2, 1, 1, w2->ne[0], w2->ne[1]); + } + if(w2->ne[2] * w1->ne[2] != h->ne[2]){ + int k = sqrt(w2->ne[2] * w1->ne[2]/h->ne[2]); + GGML_ASSERT(k*k * h->ne[2] == w2->ne[2] * w1->ne[2]); + w2 = ggml_reshape_4d(ctx, w2, w2->ne[0]*k, w2->ne[1]*k, w2->ne[2]/(k*k), w2->ne[3]); + } + w1 = ggml_ext_cast_f32(ctx, w1); + w2 = ggml_ext_cast_f32(ctx, w2); + struct ggml_tensor* w = ggml_ext_kronecker(ctx, w1, w2); + struct ggml_tensor* out = ggml_conv_2d(ctx, w, h, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); + + return ggml_scale(ctx, out, scale); +#endif } } From 2430989cd791dc009fd5ab7dbdbf45b686fea969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 1 Feb 2026 19:54:40 +0100 Subject: [PATCH 05/13] LoKR: re-implement conv --- ggml_extend.hpp | 152 +++++++++++++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 59 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 90e250e08..0a57ff519 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1577,7 +1577,7 @@ struct WeightAdapter { bool force_prec_f32 = false; float scale = 1.f; } linear; - struct conv2d_params_t{ + struct conv2d_params_t { int s0 = 1; int s1 = 1; int p0 = 0; @@ -2642,7 +2642,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( bool is_conv, WeightAdapter::ForwardParams::conv2d_params_t conv_params, float scale) { - GGML_ASSERT((w1 != NULL || (w1a != NULL && w1b != NULL))); 
GGML_ASSERT((w2 != NULL || (w2a != NULL && w2b != NULL))); @@ -2660,16 +2659,16 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( if (!is_conv) { int batch = (int)h->ne[1]; - struct ggml_tensor* h_mat = ggml_reshape_2d(ctx, h, vq, uq * batch); + struct ggml_tensor* h_split = ggml_reshape_2d(ctx, h, vq, uq * batch); if (w2 != NULL) { - hb = ggml_mul_mat(ctx, w2, h_mat); + hb = ggml_mul_mat(ctx, w2, h_split); } else { - hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_mat)); + hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_split)); } - struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, batch); - struct ggml_tensor* hb_t = ggml_cont(ctx,ggml_transpose(ctx, hb_unbundled)); + struct ggml_tensor* hb_cat = ggml_reshape_3d(ctx, hb, vp, uq, batch); + struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb_cat)); struct ggml_tensor* hc; if (w1 != NULL) { @@ -2683,92 +2682,127 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( return ggml_scale(ctx, out, scale); } else { -#if 0 +#if 1 // very slow implementation for now (can this be optimized?) int batch = (int)h->ne[3]; // 1. Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] // This is free (metadata only) - struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); + // print_ggml_tensor(h, true, "\nh"); + struct ggml_tensor* h_split = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); + // print_ggml_tensor(h_split, true, "h_split"); struct ggml_tensor* hb; if (w2 != NULL) { - hb = ggml_conv_2d(ctx, w2, h_grouped, conv_params.s0, conv_params.s1, - conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); + // no LoRA + // print_ggml_tensor(w2, true, "w2"); + hb = ggml_ext_conv_2d(ctx, h_split, w2, nullptr, + conv_params.s0, + conv_params.s1, + conv_params.p0, + conv_params.p1, + conv_params.d0, + conv_params.d1, + conv_params.direct, + conv_params.circular_x, + conv_params.circular_y, + conv_params.scale); + } else { - int rank = (int)w2b->ne[1]; - int k = (int)sqrt(w2b->ne[0] / vq); - struct ggml_tensor* w2b_4d = (ggml_n_dims(w2b) < 4) ? ggml_reshape_4d(ctx, w2b, k, k, vq, rank) : w2b; - struct ggml_tensor* w2a_4d = (ggml_n_dims(w2a) < 4) ? 
ggml_reshape_4d(ctx, w2a, 1, 1, rank, vp) : w2a; + // TODO: do not merge (loCon forward) + // w2a could be 2d + w2 = ggml_ext_merge_lora(ctx, w2b, w2a); + if (ggml_n_dims(w2) < 4) { + w2 = ggml_reshape_4d(ctx, w2, 1, 1, w2->ne[0], w2->ne[1]); + } + if (w2->ne[2] != h_split->ne[2]) { + int k = sqrt(w2->ne[2] / h_split->ne[2]); + GGML_ASSERT(k * k * h_split->ne[2] == w2->ne[2]); + w2 = ggml_reshape_4d(ctx, w2, w2->ne[0] * k, w2->ne[1] * k, w2->ne[2] / (k * k), w2->ne[3]); + } + hb = ggml_ext_conv_2d(ctx, h_split, w2, nullptr, + conv_params.s0, + conv_params.s1, + conv_params.p0, + conv_params.p1, + conv_params.d0, + conv_params.d1, + conv_params.direct, + conv_params.circular_x, + conv_params.circular_y, + conv_params.scale); - struct ggml_tensor* ha = ggml_conv_2d(ctx, w2b_4d, h_grouped, conv_params.s0, conv_params.s1, - conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); - hb = ggml_conv_2d(ctx, w2a_4d, ha, 1, 1, 0, 0, 1, 1); + + // TODO: figure out why this is not working: + // struct ggml_tensor* ha = ggml_ext_conv_2d(ctx, h_split, w2a, nullptr, + // conv_params.s0, + // conv_params.s1, + // conv_params.p0, + // conv_params.p1, + // conv_params.d0, + // conv_params.d1); + // // not supporting lora_mid here + // hb = ggml_ext_conv_2d(ctx, + // ha, + // w2b, + // nullptr, + // 1, + // 1, + // 0, + // 0, + // 1, + // 1, + // conv_params.direct, + // conv_params.circular_x, + // conv_params.circular_y, + // conv_params.scale); } // Current hb shape: [W_out, H_out, vp, uq * batch] int w_out = (int)hb->ne[0]; int h_out = (int)hb->ne[1]; - // 2. Prepare for Matrix Multiplication - // Collapse spatial and 'vp' into one dimension to treat as 'M' in MatMul - // Shape: [W*H*vp, uq, batch] - struct ggml_tensor* hb_flat = ggml_reshape_3d(ctx, hb, w_out * h_out * vp, uq, batch); - // Transpose to [uq, W*H*vp, batch] so that uq is ne[0] (the shared K dimension) - struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_flat); + // struct ggml_tensor* hb_cat = ggml_reshape_4d(ctx, hb, w_out , h_out , vp * uq, batch); + // [W_out, H_out, vp * uq, batch] + // Now left to compute (W1 kr Id) * hb_cat == (W1 kr W2) * h - struct ggml_tensor* hc; + // merge the uq groups of size vp*w_out*h_out + struct ggml_tensor* hb_merged = ggml_reshape_2d(ctx, hb, w_out * h_out * vp, uq * batch); + struct ggml_tensor* hc_t; + struct ggml_tensor* hb_merged_t = ggml_cont(ctx, ggml_transpose(ctx, hb_merged)); if (w1 != NULL) { - struct ggml_tensor* w1_mat = ggml_reshape_2d(ctx, w1, uq, up); - hc = ggml_mul_mat(ctx, w1_mat, hb_t); + // Would be great to be able to transpose w1 instead to avoid transposing both hb and hc + hc_t = ggml_mul_mat(ctx, w1, hb_merged_t); } else { - // Low-rank: (up x rank) * (rank x uq) * (uq x Spatial) - hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); + hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_merged_t)); } - - // 3. 
Final Layout Transformation - // Current hc shape: [up, W*H*vp, batch] - // Logical dims in ne[1]: [W*H, vp] - // We want final shape: [W, H, up*vp, batch] - - // Split ne[1] back into spatial and vp - struct ggml_tensor* hc_split = ggml_reshape_4d(ctx, hc, up, w_out * h_out, vp, batch); - - // Permute to bring up and vp together: [spatial, up, vp, batch] - // This moves spatial to ne[0], which is necessary for the final W,H,C layout - struct ggml_tensor* hc_perm = ggml_permute(ctx, hc_split, 1, 0, 2, 3); - - // Resolve layout and scale in one go (if possible) or just cont - // This is the only mandatory copy - struct ggml_tensor* out_cont = ggml_cont(ctx, hc_perm); - - // Final reshape to merge up and vp into the channel dimension - struct ggml_tensor* out = ggml_reshape_4d(ctx, out_cont, w_out, h_out, up * vp, batch); - + struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); + hc = ggml_cont(ctx, hc); + struct ggml_tensor* out = ggml_reshape_4d(ctx, hc, w_out, h_out, up * vp, batch); return ggml_scale(ctx, out, scale); #else // compute the weight diff and do a single conv if (w1 == NULL) { w1 = ggml_ext_merge_lora(ctx, w1b, w1a); } - if(ggml_n_dims(w1) < 4){ + if (ggml_n_dims(w1) < 4) { w1 = ggml_reshape_4d(ctx, w1, 1, 1, w1->ne[0], w1->ne[1]); } if (w2 == NULL) { w2 = ggml_ext_merge_lora(ctx, w2b, w2a); } - if(ggml_n_dims(w2) < 4){ + if (ggml_n_dims(w2) < 4) { w2 = ggml_reshape_4d(ctx, w2, 1, 1, w2->ne[0], w2->ne[1]); } - if(w2->ne[2] * w1->ne[2] != h->ne[2]){ - int k = sqrt(w2->ne[2] * w1->ne[2]/h->ne[2]); - GGML_ASSERT(k*k * h->ne[2] == w2->ne[2] * w1->ne[2]); - w2 = ggml_reshape_4d(ctx, w2, w2->ne[0]*k, w2->ne[1]*k, w2->ne[2]/(k*k), w2->ne[3]); + if (w2->ne[2] * w1->ne[2] != h->ne[2]) { + int k = sqrt(w2->ne[2] * w1->ne[2] / h->ne[2]); + GGML_ASSERT(k * k * h->ne[2] == w2->ne[2] * w1->ne[2]); + w2 = ggml_reshape_4d(ctx, w2, w2->ne[0] * k, w2->ne[1] * k, w2->ne[2] / (k * k), w2->ne[3]); } - w1 = ggml_ext_cast_f32(ctx, w1); - w2 = ggml_ext_cast_f32(ctx, w2); - struct ggml_tensor* w = ggml_ext_kronecker(ctx, w1, w2); - struct ggml_tensor* out = ggml_conv_2d(ctx, w, h, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); + w1 = ggml_ext_cast_f32(ctx, w1); + w2 = ggml_ext_cast_f32(ctx, w2); + struct ggml_tensor* w = ggml_ext_kronecker(ctx, w1, w2); + struct ggml_tensor* out = ggml_ext_conv_2d(ctx, h, w, nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); return ggml_scale(ctx, out, scale); #endif From fbf401bcccadd4b614674e7550e100645bbb1a32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 00:21:50 +0100 Subject: [PATCH 06/13] lokr: fix conv bypass implementation --- ggml_extend.hpp | 126 +++++++++++++++++------------------------------- 1 file changed, 43 insertions(+), 83 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 0a57ff519..63abe8c00 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2682,20 +2682,12 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( return ggml_scale(ctx, out, scale); } else { -#if 1 - // very slow implementation for now (can this be optimized?) int batch = (int)h->ne[3]; - // 1. 
Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] - // This is free (metadata only) - // print_ggml_tensor(h, true, "\nh"); struct ggml_tensor* h_split = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); - // print_ggml_tensor(h_split, true, "h_split"); struct ggml_tensor* hb; if (w2 != NULL) { - // no LoRA - // print_ggml_tensor(w2, true, "w2"); hb = ggml_ext_conv_2d(ctx, h_split, w2, nullptr, conv_params.s0, conv_params.s1, @@ -2709,53 +2701,47 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( conv_params.scale); } else { - // TODO: do not merge (loCon forward) - // w2a could be 2d - w2 = ggml_ext_merge_lora(ctx, w2b, w2a); - if (ggml_n_dims(w2) < 4) { - w2 = ggml_reshape_4d(ctx, w2, 1, 1, w2->ne[0], w2->ne[1]); - } - if (w2->ne[2] != h_split->ne[2]) { - int k = sqrt(w2->ne[2] / h_split->ne[2]); - GGML_ASSERT(k * k * h_split->ne[2] == w2->ne[2]); - w2 = ggml_reshape_4d(ctx, w2, w2->ne[0] * k, w2->ne[1] * k, w2->ne[2] / (k * k), w2->ne[3]); + // swap a and b order for conv lora + struct ggml_tensor* a = w2b; + struct ggml_tensor* b = w2a; + + // unpack conv2d weights if needed + if (ggml_n_dims(a) < 4) { + int k = sqrt(a->ne[0] / h_split->ne[2]); + GGML_ASSERT(k * k * h_split->ne[2] == a->ne[0]); + a = ggml_reshape_4d(ctx, a, k, k, a->ne[0] / (k * k), a->ne[1]); + } else if (a->ne[2] != h_split->ne[2]) { + int k = sqrt(a->ne[2] / h_split->ne[2]); + GGML_ASSERT(k * k * h_split->ne[2] == a->ne[2]); + a = ggml_reshape_4d(ctx, a, a->ne[0] * k, a->ne[1] * k, a->ne[2] / (k * k), a->ne[3]); } - hb = ggml_ext_conv_2d(ctx, h_split, w2, nullptr, - conv_params.s0, - conv_params.s1, - conv_params.p0, - conv_params.p1, - conv_params.d0, - conv_params.d1, - conv_params.direct, - conv_params.circular_x, - conv_params.circular_y, - conv_params.scale); - - - // TODO: figure out why this is not working: - // struct ggml_tensor* ha = ggml_ext_conv_2d(ctx, h_split, w2a, nullptr, - // conv_params.s0, - // conv_params.s1, - // conv_params.p0, - // conv_params.p1, - // conv_params.d0, - // conv_params.d1); - // // not supporting lora_mid here - // hb = ggml_ext_conv_2d(ctx, - // ha, - // w2b, - // nullptr, - // 1, - // 1, - // 0, - // 0, - // 1, - // 1, - // conv_params.direct, - // conv_params.circular_x, - // conv_params.circular_y, - // conv_params.scale); + struct ggml_tensor* ha = ggml_ext_conv_2d(ctx, h_split, a, nullptr, + conv_params.s0, + conv_params.s1, + conv_params.p0, + conv_params.p1, + conv_params.d0, + conv_params.d1, + conv_params.direct, + conv_params.circular_x, + conv_params.circular_y, + conv_params.scale); + + // not supporting lora_mid here + hb = ggml_ext_conv_2d(ctx, + ha, + b, + nullptr, + 1, + 1, + 0, + 0, + 1, + 1, + conv_params.direct, + conv_params.circular_x, + conv_params.circular_y, + conv_params.scale); } // Current hb shape: [W_out, H_out, vp, uq * batch] @@ -2764,7 +2750,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( // struct ggml_tensor* hb_cat = ggml_reshape_4d(ctx, hb, w_out , h_out , vp * uq, batch); // [W_out, H_out, vp * uq, batch] - // Now left to compute (W1 kr Id) * hb_cat == (W1 kr W2) * h + // Now left to compute (W1 kr Id) * hb_cat == (W1 kr W2) cv h // merge the uq groups of size vp*w_out*h_out struct ggml_tensor* hb_merged = ggml_reshape_2d(ctx, hb, w_out * h_out * vp, uq * batch); @@ -2777,35 +2763,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_merged_t)); } struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); - hc = ggml_cont(ctx, 
hc); - struct ggml_tensor* out = ggml_reshape_4d(ctx, hc, w_out, h_out, up * vp, batch); - return ggml_scale(ctx, out, scale); -#else - // compute the weight diff and do a single conv - if (w1 == NULL) { - w1 = ggml_ext_merge_lora(ctx, w1b, w1a); - } - if (ggml_n_dims(w1) < 4) { - w1 = ggml_reshape_4d(ctx, w1, 1, 1, w1->ne[0], w1->ne[1]); - } - if (w2 == NULL) { - w2 = ggml_ext_merge_lora(ctx, w2b, w2a); - } - if (ggml_n_dims(w2) < 4) { - w2 = ggml_reshape_4d(ctx, w2, 1, 1, w2->ne[0], w2->ne[1]); - } - if (w2->ne[2] * w1->ne[2] != h->ne[2]) { - int k = sqrt(w2->ne[2] * w1->ne[2] / h->ne[2]); - GGML_ASSERT(k * k * h->ne[2] == w2->ne[2] * w1->ne[2]); - w2 = ggml_reshape_4d(ctx, w2, w2->ne[0] * k, w2->ne[1] * k, w2->ne[2] / (k * k), w2->ne[3]); - } - w1 = ggml_ext_cast_f32(ctx, w1); - w2 = ggml_ext_cast_f32(ctx, w2); - struct ggml_tensor* w = ggml_ext_kronecker(ctx, w1, w2); - struct ggml_tensor* out = ggml_ext_conv_2d(ctx, h, w, nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); - + // ungroup + struct ggml_tensor* out = ggml_reshape_4d(ctx, ggml_cont(ctx, hc), w_out, h_out, up * vp, batch); return ggml_scale(ctx, out, scale); -#endif } } From 04f9b1f5f0b1f56358ed397d1674f932b57539c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 00:22:10 +0100 Subject: [PATCH 07/13] lokr: cleanup linear path code --- ggml_extend.hpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 63abe8c00..6424217fa 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2658,27 +2658,25 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( struct ggml_tensor* hb; if (!is_conv) { - int batch = (int)h->ne[1]; - struct ggml_tensor* h_split = ggml_reshape_2d(ctx, h, vq, uq * batch); + int batch = (int)h->ne[1]; + struct ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq, batch); if (w2 != NULL) { hb = ggml_mul_mat(ctx, w2, h_split); } else { hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_split)); } + struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb)); - struct ggml_tensor* hb_cat = ggml_reshape_3d(ctx, hb, vp, uq, batch); - struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb_cat)); - - struct ggml_tensor* hc; + struct ggml_tensor* hc_t; if (w1 != NULL) { - hc = ggml_mul_mat(ctx, w1, hb_t); + hc_t = ggml_mul_mat(ctx, w1, hb_t); } else { - hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); + hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); } - struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); - struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc_t), up * vp, batch); + struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); + struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); return ggml_scale(ctx, out, scale); } else { From 5b67c4b6d2de012285b591f2d98c04782ce9b633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 01:21:53 +0100 Subject: [PATCH 08/13] reshape to 2d before mat_mul --- ggml_extend.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 6424217fa..effb578d3 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2659,14 +2659,17 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( if (!is_conv) { int batch = (int)h->ne[1]; - struct 
ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq, batch); - + struct ggml_tensor* h_split = ggml_reshape_2d(ctx, h, vq, uq * batch); if (w2 != NULL) { hb = ggml_mul_mat(ctx, w2, h_split); } else { hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_split)); } + hb = ggml_reshape_3d(ctx, hb, vp, uq, batch); + struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb)); + + hb_t = ggml_reshape_2d(ctx, hb_t, uq, vp * batch); struct ggml_tensor* hc_t; if (w1 != NULL) { @@ -2674,6 +2677,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } else { hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); } + hc_t = ggml_reshape_3d(ctx, hc_t, up, vp, batch); struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); From f7d53b6551ea9b29c5f86ba0eacbee0d55feca9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 01:38:25 +0100 Subject: [PATCH 09/13] maxComputeWorkGroupCount workaround for vulkan --- ggml_extend.hpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index effb578d3..b55da2e15 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2665,11 +2665,16 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } else { hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_split)); } - hb = ggml_reshape_3d(ctx, hb, vp, uq, batch); + + if(batch > 1){ + hb = ggml_reshape_3d(ctx, hb, vp, uq, batch); + } struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb)); - hb_t = ggml_reshape_2d(ctx, hb_t, uq, vp * batch); + if(batch > 1){ + hb_t = ggml_reshape_2d(ctx, hb_t, uq, vp * batch); + } struct ggml_tensor* hc_t; if (w1 != NULL) { @@ -2677,7 +2682,10 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } else { hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); } - hc_t = ggml_reshape_3d(ctx, hc_t, up, vp, batch); + + if(batch > 1){ + hc_t = ggml_reshape_3d(ctx, hc_t, up, vp, batch); + } struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); From 244480e50f12379c8caa1361f0d7964b62f6e275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 17:13:29 +0100 Subject: [PATCH 10/13] Avoid too large tensors dims in matmul for smaller vk workgroups --- ggml_extend.hpp | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index b55da2e15..6c65f7a0d 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2658,23 +2658,38 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( struct ggml_tensor* hb; if (!is_conv) { - int batch = (int)h->ne[1]; - struct ggml_tensor* h_split = ggml_reshape_2d(ctx, h, vq, uq * batch); + int max_batch = 65535; + int batch = (int)h->ne[1]; + int max_batch_uq = max_batch / uq; + int merge_batch_uq = 1; + for (int i = max_batch_uq; i > 0; i--) { + if (batch % i == 0) { + merge_batch_uq = i; + break; + } + } + + int max_batch_vp = max_batch / vp; + int merge_batch_vp = 1; + for (int i = max_batch_vp; i > 0; i--) { + if (batch % i == 0) { + merge_batch_vp = i; + break; + } + } + + struct ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq * merge_batch_uq, batch / merge_batch_uq); if (w2 != NULL) { hb = ggml_mul_mat(ctx, w2, h_split); } else { hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_split)); } - - if(batch > 
1){ + + if (batch > 1) { hb = ggml_reshape_3d(ctx, hb, vp, uq, batch); } - struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb)); - - if(batch > 1){ - hb_t = ggml_reshape_2d(ctx, hb_t, uq, vp * batch); - } + hb_t = ggml_reshape_3d(ctx, hb_t, uq, vp * merge_batch_vp, batch / merge_batch_vp); struct ggml_tensor* hc_t; if (w1 != NULL) { @@ -2682,13 +2697,13 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } else { hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); } - - if(batch > 1){ + + if (batch > 1) { hc_t = ggml_reshape_3d(ctx, hc_t, up, vp, batch); } - struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); - struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); + struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); + struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); return ggml_scale(ctx, out, scale); } else { @@ -2696,7 +2711,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( // 1. Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] struct ggml_tensor* h_split = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); - struct ggml_tensor* hb; if (w2 != NULL) { hb = ggml_ext_conv_2d(ctx, h_split, w2, nullptr, conv_params.s0, From 1ab9ed28ea3f1b2518f5758f01f72a60ba8d6681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 17:13:37 +0100 Subject: [PATCH 11/13] make it vk only --- ggml_extend.hpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 6c65f7a0d..8adacdf46 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2658,10 +2658,15 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( struct ggml_tensor* hb; if (!is_conv) { - int max_batch = 65535; int batch = (int)h->ne[1]; - int max_batch_uq = max_batch / uq; - int merge_batch_uq = 1; + int merge_batch_uq = batch; + int merge_batch_vp = batch; + +#if SD_VULKAN + // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend + int max_batch = 65535; + int max_batch_uq = max_batch / uq; + merge_batch_uq = 1; for (int i = max_batch_uq; i > 0; i--) { if (batch % i == 0) { merge_batch_uq = i; @@ -2669,14 +2674,15 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } } - int max_batch_vp = max_batch / vp; - int merge_batch_vp = 1; + int max_batch_vp = max_batch / vp; + merge_batch_vp = 1; for (int i = max_batch_vp; i > 0; i--) { if (batch % i == 0) { merge_batch_vp = i; break; } } +#endif struct ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq * merge_batch_uq, batch / merge_batch_uq); if (w2 != NULL) { @@ -2705,7 +2711,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); return ggml_scale(ctx, out, scale); - } else { int batch = (int)h->ne[3]; // 1. 
Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] @@ -2723,7 +2728,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( conv_params.circular_x, conv_params.circular_y, conv_params.scale); - } else { // swap a and b order for conv lora struct ggml_tensor* a = w2b; From c7629d9b23c6509edf2f18086e37102421546426 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 17:14:01 +0100 Subject: [PATCH 12/13] remove unncesary casts for non-conv weights --- lora.hpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lora.hpp b/lora.hpp index fd461086a..485963fd3 100644 --- a/lora.hpp +++ b/lora.hpp @@ -540,23 +540,14 @@ struct LoraModel : public GGMLRunner { if (iter != lora_tensors.end()) { lokr_w1 = iter->second; - if (is_conv2d && lokr_w1->type != GGML_TYPE_F16) { - lokr_w1 = ggml_cast(ctx, lokr_w1, GGML_TYPE_F16); - } } iter = iter_a; if (iter != lora_tensors.end()) { lokr_w1_a = iter->second; - if (is_conv2d && lokr_w1_a->type != GGML_TYPE_F16) { - lokr_w1_a = ggml_cast(ctx, lokr_w1_a, GGML_TYPE_F16); - } } iter = lora_tensors.find(lokr_w1_b_name); if (iter != lora_tensors.end()) { lokr_w1_b = iter->second; - if (is_conv2d && lokr_w1_b->type != GGML_TYPE_F16) { - lokr_w1_b = ggml_cast(ctx, lokr_w1_b, GGML_TYPE_F16); - } } iter = lora_tensors.find(lokr_w2_name); From 30051a2d9aaa5b563b1c412ff3481456443bc1b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 2 Feb 2026 17:50:20 +0100 Subject: [PATCH 13/13] fix wrong flag (oops) --- ggml_extend.hpp | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 8adacdf46..a9fe46fec 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2662,24 +2662,26 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( int merge_batch_uq = batch; int merge_batch_vp = batch; -#if SD_VULKAN - // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend - int max_batch = 65535; - int max_batch_uq = max_batch / uq; - merge_batch_uq = 1; - for (int i = max_batch_uq; i > 0; i--) { - if (batch % i == 0) { - merge_batch_uq = i; - break; +#if SD_USE_VULKAN + if (batch > 1) { + // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend + int max_batch = 65535; + int max_batch_uq = max_batch / uq; + merge_batch_uq = 1; + for (int i = max_batch_uq; i > 0; i--) { + if (batch % i == 0) { + merge_batch_uq = i; + break; + } } - } - int max_batch_vp = max_batch / vp; - merge_batch_vp = 1; - for (int i = max_batch_vp; i > 0; i--) { - if (batch % i == 0) { - merge_batch_vp = i; - break; + int max_batch_vp = max_batch / vp; + merge_batch_vp = 1; + for (int i = max_batch_vp; i > 0; i--) { + if (batch % i == 0) { + merge_batch_vp = i; + break; + } } } #endif
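
A note on the transform these patches exploit: LoKr stores the weight delta as a Kronecker product, delta_W = W1 ⊗ W2, with W1 the small outer factor (up x uq) and W2 the inner factor (vp x vq). ggml_ext_lokr_forward never materialises the (up*vp) x (uq*vq) matrix; for the linear path it relies on the standard vec identity (W1 ⊗ W2) * x = vec(W2 * X * W1^T), where X is the input reshaped to vq x uq, which is what the reshape / mul_mat(W2) / transpose / mul_mat(W1) sequence implements (up to ggml's ne dimension ordering). The sketch below is a minimal standalone check of that identity in plain C++ with no ggml dependency; the up/uq/vp/vq names mirror ggml_ext_lokr_forward, everything else (file name, helpers, dimensions) is illustrative only.

// lokr_identity_check.cpp
// Minimal standalone check (plain C++, no ggml) of the Kronecker identity that
// the linear LoKr path exploits:
//     (W1 kron W2) * x == vec( W2 * X * W1^T ),   X = x reshaped to vq x uq
// Shapes (math row x col): W1 is up x uq (outer factor), W2 is vp x vq (inner
// factor), so the full (up*vp) x (uq*vq) delta-weight never has to be built.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// naive row-major matmul: A is ar x ac, B is br x bc, returns ar x bc
static std::vector<float> matmul(const std::vector<float>& A, int ar, int ac,
                                 const std::vector<float>& B, int br, int bc) {
    assert(ac == br);
    std::vector<float> C(ar * bc, 0.f);
    for (int i = 0; i < ar; i++)
        for (int k = 0; k < ac; k++)
            for (int j = 0; j < bc; j++)
                C[i * bc + j] += A[i * ac + k] * B[k * bc + j];
    return C;
}

int main() {
    const int up = 3, uq = 4, vp = 5, vq = 2;
    std::mt19937 rng(0);
    std::uniform_real_distribution<float> dist(-1.f, 1.f);
    auto rnd = [&](int n) { std::vector<float> v(n); for (float& e : v) e = dist(rng); return v; };

    std::vector<float> W1 = rnd(up * uq);  // up x uq
    std::vector<float> W2 = rnd(vp * vq);  // vp x vq
    std::vector<float> x  = rnd(uq * vq);  // input of length uq*vq

    // 1) slow path: materialise K = W1 kron W2 and multiply
    std::vector<float> K(up * vp * uq * vq);
    for (int i = 0; i < up; i++)
        for (int j = 0; j < uq; j++)
            for (int k = 0; k < vp; k++)
                for (int l = 0; l < vq; l++)
                    K[(i * vp + k) * (uq * vq) + (j * vq + l)] = W1[i * uq + j] * W2[k * vq + l];
    std::vector<float> y_full = matmul(K, up * vp, uq * vq, x, uq * vq, 1);

    // 2) factored path: X = reshape(x, vq x uq), column j = j-th length-vq block of x,
    //    then Y = W2 * X * W1^T and the result is vec(Y)
    std::vector<float> X(vq * uq);
    for (int j = 0; j < uq; j++)
        for (int l = 0; l < vq; l++)
            X[l * uq + j] = x[j * vq + l];
    std::vector<float> W1t(uq * up);  // W1^T, uq x up
    for (int i = 0; i < up; i++)
        for (int j = 0; j < uq; j++)
            W1t[j * up + i] = W1[i * uq + j];
    std::vector<float> Y = matmul(matmul(W2, vp, vq, X, vq, uq), vp, uq, W1t, uq, up);  // vp x up

    // vec(Y) taken column-wise matches the Kronecker ordering: y[i*vp + k] == Y[k][i]
    float max_err = 0.f;
    for (int i = 0; i < up; i++)
        for (int k = 0; k < vp; k++)
            max_err = std::max(max_err, std::fabs(y_full[i * vp + k] - Y[k * up + i]));
    std::printf("max |full - factored| = %g\n", max_err);
    return max_err < 1e-4f ? 0 : 1;
}

The conv path in the later patches follows the same pattern: the uq factor is folded into the batch dimension so that W2 (or the w2a/w2b pair) can be applied as an ordinary conv2d over vq input channels, and the remaining mixing by W1 (or w1a/w1b) is a single matmul over the grouped channels before ungrouping back to up*vp output channels.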