Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
9772743
Add helpers for shuffle operations of scalable vector
stevesuzuki-arm Nov 24, 2025
5793e89
Move helpers for shuffle scalable vectors to CodeGen_ARM
stevesuzuki-arm Nov 28, 2025
263f6c6
Modify workaround of using FixedVector for ScalableVector
stevesuzuki-arm Nov 24, 2025
7cdfd64
Shuffle scalable vector in CodeGen_ARM
stevesuzuki-arm Oct 13, 2025
85ef5b5
Add DecomposeVectorShuffle to Makefile
stevesuzuki-arm Dec 11, 2025
84ec0ee
Improve performance of vector broadcast in SVE2
stevesuzuki-arm Dec 13, 2025
94c1684
Modify test cases of total bit width in simd_op_check_sve2
stevesuzuki-arm Nov 24, 2025
e3ae7ab
Simplify DecomposeVectorShuffle implementation; move test to tests/
alexreinking Feb 17, 2026
190e032
Remove extra header in decompose_vector_shuffle.cpp
alexreinking Feb 17, 2026
f671968
Add an assertion that highest_lane is a real lane.
alexreinking Feb 17, 2026
9d536a8
Assert that dst_slice has a value
alexreinking Feb 17, 2026
ec222f7
Don't assert on paths that don't require alignment
alexreinking Feb 17, 2026
6cfb39e
Assert arguments to shuffle_scalable_vectors_general are valid
alexreinking Feb 17, 2026
c58006e
Fix typo in comment in CodeGen_ARM::visit(Shuffle)
alexreinking Feb 17, 2026
f35a8a8
Eliminate need for DecomposeVectorShuffle wrappers
alexreinking Feb 17, 2026
562ac36
Drop old function prototypes
alexreinking Feb 17, 2026
26809ed
Remove unused `using std::optional`
alexreinking Feb 17, 2026
c2b6f7b
Fix edge case bug in DecomposeVectorShuffle
stevesuzuki-arm Feb 18, 2026
bc284e0
Merge branch 'main' into pr-shuffle_sve2
stevesuzuki-arm Mar 11, 2026
acb6f5b
Add "tbl" and "whilelt" in simd_op_check_sve2
stevesuzuki-arm Mar 12, 2026
be0b4a6
Merge branch 'main' into pr-shuffle_sve2
stevesuzuki-arm Mar 13, 2026
2d43f3d
Skip failing tests on SVE2 target with LLVM21
stevesuzuki-arm Mar 15, 2026
b4572b7
Merge branch 'main' into pr-shuffle_sve2
alexreinking Mar 15, 2026
80087d5
Skip performance_boundary_conditions on SVE2 with LLVM 21
alexreinking Mar 16, 2026
9ea7782
Merge branch 'main' into pr-shuffle_sve2
alexreinking Mar 23, 2026
b1a53b6
Use CamelCase instead of ALL_CAPS in enum value names
alexreinking Mar 23, 2026
2c47425
Skip Lesson 5 in CMake rather than the code.
alexreinking Mar 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ SOURCE_FILES = \
Debug.cpp \
DebugArguments.cpp \
DebugToFile.cpp \
DecomposeVectorShuffle.cpp \
Definition.cpp \
Deinterleave.cpp \
Derivative.cpp \
Expand Down Expand Up @@ -687,6 +688,7 @@ HEADER_FILES = \
Debug.h \
DebugArguments.h \
DebugToFile.h \
DecomposeVectorShuffle.h \
Definition.h \
Deinterleave.h \
Derivative.h \
Expand Down
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ target_sources(
Debug.h
DebugArguments.h
DebugToFile.h
DecomposeVectorShuffle.h
Definition.h
Deinterleave.h
Derivative.h
Expand Down Expand Up @@ -279,6 +280,7 @@ target_sources(
Debug.cpp
DebugArguments.cpp
DebugToFile.cpp
DecomposeVectorShuffle.cpp
Comment thread
alexreinking marked this conversation as resolved.
Definition.cpp
Deinterleave.cpp
Derivative.cpp
Expand Down
418 changes: 377 additions & 41 deletions src/CodeGen_ARM.cpp

Large diffs are not rendered by default.

26 changes: 8 additions & 18 deletions src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4155,7 +4155,9 @@ void CodeGen_LLVM::visit(const Shuffle *op) {
} else {
internal_assert(op->indices[0] == 0);
}
value = create_broadcast(value, op->indices.size());
if (op->indices.size() > 1) {
value = create_broadcast(value, op->indices.size());
}
return;
}
}
Expand Down Expand Up @@ -5445,6 +5447,10 @@ int CodeGen_LLVM::get_vector_num_elements(const llvm::Type *t) {
}
}

// Convenience overload: returns the element count of the value's LLVM type
// (1 for scalars), by delegating to the llvm::Type* overload.
int CodeGen_LLVM::get_vector_num_elements(const llvm::Value *v) {
    return get_vector_num_elements(v->getType());
}

llvm::Type *CodeGen_LLVM::llvm_type_of(LLVMContext *c, Halide::Type t,
int effective_vscale) const {
if (t.lanes() == 1) {
Expand Down Expand Up @@ -5481,23 +5487,7 @@ llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Type *t, int n,
switch (type_constraint) {
case VectorTypeConstraint::None:
if (effective_vscale > 0) {
bool wide_enough = true;
// TODO(https://github.com/halide/Halide/issues/8119): Architecture
// specific code should not go here. Ideally part of this can go
// away via LLVM fixes and modifying intrinsic selection to handle
// scalable vs. fixed vectors. Making this method virtual is
// possibly expensive.
if (target.arch == Target::ARM) {
if (!target.has_feature(Target::NoNEON)) {
// force booleans into bytes. TODO(https://github.com/halide/Halide/issues/8119): figure out a better way to do this.
int bit_size = std::max((int)t->getScalarSizeInBits(), 8);
wide_enough = (bit_size * n) > 128;
} else {
// TODO(https://github.com/halide/Halide/issues/8119): AArch64 SVE2 support is crashy with scalable vectors of min size 1.
wide_enough = (n / effective_vscale) > 1;
}
}
scalable = wide_enough && ((n % effective_vscale) == 0);
scalable = (n % effective_vscale) == 0;
if (scalable) {
n = n / effective_vscale;
}
Expand Down
3 changes: 3 additions & 0 deletions src/CodeGen_LLVM.h
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,10 @@ class CodeGen_LLVM : public IRVisitor {
const std::function<llvm::Value *(llvm::Value *)> &fn);

/** Get number of vector elements, taking into account scalable vectors. Returns 1 for scalars. */
// @{
int get_vector_num_elements(const llvm::Type *t);
int get_vector_num_elements(const llvm::Value *v);
// @}

/** Interface to abstract vector code generation as LLVM is now
* providing multiple options to express even simple vector
Expand Down
80 changes: 80 additions & 0 deletions src/DecomposeVectorShuffle.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#include "DecomposeVectorShuffle.h"
Comment thread
alexreinking marked this conversation as resolved.

#include <unordered_map>

namespace Halide::Internal {

// Plan a general shuffle as per-destination-slice sequences of native-width
// (vl-lane) shuffle steps. The two source vectors are treated as one
// concatenated index space; `indices` selects source lanes for each dst lane
// (negative index = don't-care). Returns one step sequence per dst slice;
// within a sequence, each step after the first carries the previous step's
// result as its 'a' input (see SliceIndexCarryPrevResult).
std::vector<std::vector<NativeShuffle>> decompose_to_native_shuffles(
    int src_lanes, const std::vector<int> &indices, int vl) {

    int dst_lanes = static_cast<int>(indices.size());
    int src_lanes_aligned = align_up(src_lanes, vl);

    // Adjust indices so that src vectors are aligned up to multiple of vl.
    // Indices into the second source (>= src_lanes) are shifted so that the
    // second source begins on a vl-aligned slice boundary.
    std::vector<int> aligned_indices = indices;
    for (int &idx : aligned_indices) {
        if (idx >= src_lanes) {
            idx += src_lanes_aligned - src_lanes;
        }
    }

    const int num_dst_slices = align_up(dst_lanes, vl) / vl;
    std::vector<std::vector<NativeShuffle>> all_steps(num_dst_slices);

    for (int dst_slice = 0; dst_slice < num_dst_slices; dst_slice++) {
        // Maps a source slice index to the step (within `steps`) that reads it,
        // so repeated references to the same slice reuse that step.
        std::unordered_map<int, int> slice_to_step;
        auto &steps = all_steps[dst_slice];
        const int dst_start = dst_slice * vl;

        // The final dst slice may be partial; stop at dst_lanes.
        for (int dst_index = dst_start; dst_index < dst_start + vl && dst_index < dst_lanes; ++dst_index) {
            const int src_index = aligned_indices[dst_index];
            if (src_index < 0) {
                // Don't-care lane; leave it unassigned (SliceIndexNone).
                continue;
            }

            const int src_slice = src_index / vl;
            const int lane_in_src_slice = src_index % vl;
            const int lane_in_dst_slice = dst_index - dst_start;

            if (steps.empty()) {
                // first slice in this block: start the first step with this
                // source slice as input 'a' (lanes [0, vl) of the lane map)
                slice_to_step[src_slice] = 0;
                steps.emplace_back(vl, src_slice, SliceIndexNone);
                steps.back().lane_map[lane_in_dst_slice] = lane_in_src_slice;

            } else if (auto itr = slice_to_step.find(src_slice); itr != slice_to_step.end()) {
                // slice already seen: route this lane through the step that
                // already reads src_slice, offsetting by vl if it is input 'b'
                NativeShuffle &step = steps[itr->second];
                bool is_a = (step.slice_a != SliceIndexCarryPrevResult && step.slice_a == src_slice);
                int offset = is_a ? 0 : vl;
                step.lane_map[lane_in_dst_slice] = lane_in_src_slice + offset;

            } else if (steps[0].slice_b == SliceIndexNone) {
                // add as 'b' of first step if b is unused
                slice_to_step[src_slice] = 0;
                steps[0].slice_b = src_slice;
                steps[0].lane_map[lane_in_dst_slice] = lane_in_src_slice + vl;

            } else {
                // otherwise chain a new step
                slice_to_step[src_slice] = static_cast<int>(steps.size());
                // new step uses previous result as 'a', so we use 'b' for this one
                steps.emplace_back(vl, SliceIndexCarryPrevResult, src_slice);

                // Except for the first step, we need to arrange indices
                // so that the output carried from the previous step is kept
                auto &lane_map = steps.back().lane_map;
                // initialize lane_map as identical copy (pass-through of the
                // carried previous result, which occupies lanes [0, vl))
                for (size_t lane_idx = 0; lane_idx < lane_map.size(); ++lane_idx) {
                    lane_map[lane_idx] = lane_idx;
                }
                // update for this index
                lane_map[lane_in_dst_slice] = lane_in_src_slice + vl;
            }
        }
    }

    return all_steps;
}

} // namespace Halide::Internal
163 changes: 163 additions & 0 deletions src/DecomposeVectorShuffle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
#ifndef HALIDE_DECOMPOSE_VECTOR_SHUFFLE_H
#define HALIDE_DECOMPOSE_VECTOR_SHUFFLE_H

/** \file
*
 * Perform a vector shuffle by decomposing the operation into
 * a sequence of sub-shuffle steps, where each step:
 * - Takes one or two slices as input (slice_a and slice_b)
 * - Produces one slice (the dst slice)
 * - Uses slices whose length equals the target native vector length (vl)
 *
 * The sequence of steps is structured as:
 * 1. An outer loop that iterates over the slices of the dst vector.
 * 2. An inner loop that iterates over the native shuffle steps required to
 *    complete a single dst slice. Multiple steps may be needed because a
 *    single native shuffle can take at most 2 slices (native vector length x 2)
 *    as input, while we may need to fetch from a wider region of the src vector.
*
* The following example, log of test code, illustrates how it works.
*
* src_lanes: 17, dst_lanes: 7, vl: 4
* input a: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, ]
* input b: [170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, ]
* indices: [6, 13, 24, 14, 7, 11, 5, ]
*
* slice a:[40, 50, 60, 70, ], slice b:[120, 130, 140, 150, ], indices:[2, 5, -1, 6, ]
* => slice output:[60, 130, -559038801, 140, ]
* slice a:[60, 130, -559038801, 140, ], slice b:[210, 220, 230, 240, ], indices:[0, 1, 7, 3, ]
* => slice output:[60, 130, 240, 140, ]
* slice a:[40, 50, 60, 70, ], slice b:[80, 90, 100, 110, ], indices:[3, 7, 1, -1, ]
* => slice output:[70, 110, 50, -559038801, ]
*
* output: [60, 130, 240, 140, 70, 110, 50, ]
*
*/

#include "Error.h"
#include "Util.h"

#include <optional>
#include <vector>

namespace Halide {
namespace Internal {

/** Sentinel values that may appear where a real (non-negative) slice index
 * or lane index is expected. */
enum {
    SliceIndexNone = -1,
    SliceIndexCarryPrevResult = -2,
};

/** A single native-width shuffle step. Lane-map entries in [0, vl) select
 * from slice_a; entries in [vl, 2*vl) select from slice_b; SliceIndexNone
 * marks an unassigned (don't-care) output lane. */
struct NativeShuffle {
    int slice_a;
    int slice_b;
    std::vector<int> lane_map;

    NativeShuffle(int vl, int a, int b)
        : slice_a(a), slice_b(b), lane_map(vl, SliceIndexNone) {
    }
};

/** Plan a shuffle of up to two concatenated sources (src_lanes lanes total
 * per source's index space) into native-width (vl-lane) steps, one step
 * sequence per destination slice. Negative indices are don't-care lanes. */
std::vector<std::vector<NativeShuffle>> decompose_to_native_shuffles(
    int src_lanes, const std::vector<int> &indices, int vl);

/** Algorithm logic for shuffle decomposition, parameterized on vector type
 * and a codegen-like class that provides primitive vector operations.
 */
template<typename CodeGenTy, typename VecTy>
struct DecomposeVectorShuffle {
    // TODO: when upgrading to C++20, replace with a concept.
    // get_vector_num_elements may be overloaded (e.g. on Type* and Value*), so use
    // expression SFINAE rather than a method pointer to handle overload resolution.
    static_assert(std::is_convertible_v<decltype(std::declval<CodeGenTy &>().get_vector_num_elements(std::declval<VecTy>())), int>,
                  "CodeGenTy must provide: int get_vector_num_elements(VecTy)");
    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::slice_vector), CodeGenTy &, const VecTy &, int, int>,
                  "CodeGenTy must provide: VecTy slice_vector(const VecTy &, int, int)");
    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::concat_vectors), CodeGenTy &, const std::vector<VecTy> &>,
                  "CodeGenTy must provide: VecTy concat_vectors(const std::vector<VecTy> &)");
    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::shuffle_scalable_vectors_general), CodeGenTy &,
                                        const VecTy &, const VecTy &, const std::vector<int> &>,
                  "CodeGenTy must provide: VecTy shuffle_scalable_vectors_general(const VecTy &, const VecTy &, const std::vector<int> &)");
    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::create_undef_vector_like), CodeGenTy &, const VecTy &, int>,
                  "CodeGenTy must provide: VecTy create_undef_vector_like(const VecTy &, int)");

    /** Both sources are padded (via align_up_vector) so their lengths are
     * multiples of vl before any slicing happens. `src_lanes` is the
     * unpadded lane count shared by both sources. */
    DecomposeVectorShuffle(CodeGenTy &codegen, const VecTy &src_a, const VecTy &src_b, int src_lanes, int vl)
        : codegen(codegen),
          vl(vl),
          src_a(align_up_vector(src_a, vl)),
          src_b(align_up_vector(src_b, vl)),
          src_lanes(src_lanes),
          src_lanes_aligned(align_up(src_lanes, vl)) {
    }

    /** Execute the decomposed shuffle described by `indices` (one source lane
     * per destination lane; negative = don't-care) and return the result
     * trimmed to indices.size() lanes. */
    VecTy run(const std::vector<int> &indices) {
        auto shuffle_plan = decompose_to_native_shuffles(src_lanes, indices, vl);
        int dst_lanes = static_cast<int>(indices.size());

        // process each block divided by vl
        std::vector<VecTy> shuffled_dst_slices;
        shuffled_dst_slices.reserve(shuffle_plan.size());

        for (const auto &steps_for_dst_slice : shuffle_plan) {
            // Accumulates the partial result carried between chained steps.
            std::optional<VecTy> dst_slice = std::nullopt;
            for (const auto &step : steps_for_dst_slice) {
                // Obtain 1st slice a
                VecTy a;
                if (step.slice_a == SliceIndexCarryPrevResult) {
                    internal_assert(dst_slice.has_value()) << "Tried to carry from undefined previous result";
                    a = *dst_slice;
                } else {
                    a = get_vl_slice(step.slice_a);
                }
                // Obtain 2nd slice b
                std::optional<VecTy> b;
                if (step.slice_b == SliceIndexNone) {
                    b = std::nullopt;
                } else {
                    b = std::optional<VecTy>(get_vl_slice(step.slice_b));
                }
                // Perform shuffle where vector length is aligned. If b is
                // unused, a default-constructed VecTy is passed; the lane map
                // never references its lanes in that case.
                dst_slice = codegen.shuffle_scalable_vectors_general(a, b.value_or(VecTy{}), step.lane_map);
            }
            if (!dst_slice.has_value()) {
                // No shuffle step for this slice, i.e. all the indices are -1
                dst_slice = codegen.create_undef_vector_like(src_a, vl);
            }
            shuffled_dst_slices.push_back(*dst_slice);
        }

        // Concatenate the vl-wide slices and trim the tail padding.
        return codegen.slice_vector(codegen.concat_vectors(shuffled_dst_slices), 0, dst_lanes);
    }

private:
    // Helper to extract slice with lanes=vl. Slice indices below
    // src_lanes_aligned/vl come from src_a; the rest come from src_b.
    VecTy get_vl_slice(int slice_index) {
        const int num_slices_a = src_lanes_aligned / vl;
        int start_index = slice_index * vl;
        if (slice_index < num_slices_a) {
            return codegen.slice_vector(src_a, start_index, vl);
        } else {
            start_index -= src_lanes_aligned;
            return codegen.slice_vector(src_b, start_index, vl);
        }
    }

    // Pad a vector to a multiple of `align` lanes (slice_vector extends with
    // undef lanes when asked for more lanes than the input has).
    VecTy align_up_vector(const VecTy &v, int align) {
        int len = codegen.get_vector_num_elements(v);
        return codegen.slice_vector(v, 0, align_up(len, align));
    }

    CodeGenTy &codegen;
    int vl;                  // native vector length (lanes per slice)
    VecTy src_a;             // first source, padded to a multiple of vl
    VecTy src_b;             // second source, padded to a multiple of vl
    int src_lanes;           // unpadded lane count of each source
    int src_lanes_aligned;   // src_lanes rounded up to a multiple of vl
};

} // namespace Internal
} // namespace Halide

#endif
1 change: 1 addition & 0 deletions test/correctness/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ tests(GROUPS correctness
debug_to_file.cpp
debug_to_file_multiple_outputs.cpp
debug_to_file_reorder.cpp
decompose_vector_shuffle.cpp
deferred_loop_level.cpp
deinterleave4.cpp
device_buffer_copies_with_profile.cpp
Expand Down
Loading
Loading