Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
9772743
Add helpers for shuffle operations of scalable vector
stevesuzuki-arm Nov 24, 2025
5793e89
Move helpers for shuffle scalable vectors to CodeGen_ARM
stevesuzuki-arm Nov 28, 2025
263f6c6
Modify workaround of using FixedVector for ScalableVector
stevesuzuki-arm Nov 24, 2025
7cdfd64
Shuffle scalable vector in CodeGen_ARM
stevesuzuki-arm Oct 13, 2025
85ef5b5
Add DecomposeVectorShuffle to Makefile
stevesuzuki-arm Dec 11, 2025
84ec0ee
Improve performance of vector broadcast in SVE2
stevesuzuki-arm Dec 13, 2025
94c1684
Modify test cases of total bit width in simd_op_check_sve2
stevesuzuki-arm Nov 24, 2025
e3ae7ab
Simplify DecomposeVectorShuffle implementation; move test to tests/
alexreinking Feb 17, 2026
190e032
Remove extra header in decompose_vector_shuffle.cpp
alexreinking Feb 17, 2026
f671968
Add an assertion that highest_lane is a real lane.
alexreinking Feb 17, 2026
9d536a8
Assert that dst_slice has a value
alexreinking Feb 17, 2026
ec222f7
Don't assert on paths that don't require alignment
alexreinking Feb 17, 2026
6cfb39e
Assert arguments to shuffle_scalable_vectors_general are valid
alexreinking Feb 17, 2026
c58006e
Fix typo in comment in CodeGen_ARM::visit(Shuffle)
alexreinking Feb 17, 2026
f35a8a8
Eliminate need for DecomposeVectorShuffle wrappers
alexreinking Feb 17, 2026
562ac36
Drop old function prototypes
alexreinking Feb 17, 2026
26809ed
Remove unused `using std::optional`
alexreinking Feb 17, 2026
c2b6f7b
Fix edge case bug in DecomposeVectorShuffle
stevesuzuki-arm Feb 18, 2026
bc284e0
Merge branch 'main' into pr-shuffle_sve2
stevesuzuki-arm Mar 11, 2026
acb6f5b
Add "tbl" and "whilelt" in simd_op_check_sve2
stevesuzuki-arm Mar 12, 2026
be0b4a6
Merge branch 'main' into pr-shuffle_sve2
stevesuzuki-arm Mar 13, 2026
2d43f3d
Skip failing tests on SVE2 target with LLVM21
stevesuzuki-arm Mar 15, 2026
b4572b7
Merge branch 'main' into pr-shuffle_sve2
alexreinking Mar 15, 2026
80087d5
Skip performance_boundary_conditions on SVE2 with LLVM 21
alexreinking Mar 16, 2026
9ea7782
Merge branch 'main' into pr-shuffle_sve2
alexreinking Mar 23, 2026
b1a53b6
Use CamelCase instead of ALL_CAPS in enum value names
alexreinking Mar 23, 2026
2c47425
Skip Lesson 5 in CMake rather than the code.
alexreinking Mar 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ SOURCE_FILES = \
Debug.cpp \
DebugArguments.cpp \
DebugToFile.cpp \
DecomposeVectorShuffle.cpp \
Definition.cpp \
Deinterleave.cpp \
Derivative.cpp \
Expand Down Expand Up @@ -687,6 +688,7 @@ HEADER_FILES = \
Debug.h \
DebugArguments.h \
DebugToFile.h \
DecomposeVectorShuffle.h \
Definition.h \
Deinterleave.h \
Derivative.h \
Expand Down
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ target_sources(
Debug.h
DebugArguments.h
DebugToFile.h
DecomposeVectorShuffle.h
Definition.h
Deinterleave.h
Derivative.h
Expand Down Expand Up @@ -279,6 +280,7 @@ target_sources(
Debug.cpp
DebugArguments.cpp
DebugToFile.cpp
DecomposeVectorShuffle.cpp
Comment thread
alexreinking marked this conversation as resolved.
Definition.cpp
Deinterleave.cpp
Derivative.cpp
Expand Down
418 changes: 377 additions & 41 deletions src/CodeGen_ARM.cpp

Large diffs are not rendered by default.

26 changes: 8 additions & 18 deletions src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4155,7 +4155,9 @@ void CodeGen_LLVM::visit(const Shuffle *op) {
} else {
internal_assert(op->indices[0] == 0);
}
value = create_broadcast(value, op->indices.size());
if (op->indices.size() > 1) {
value = create_broadcast(value, op->indices.size());
}
return;
}
}
Expand Down Expand Up @@ -5445,6 +5447,10 @@ int CodeGen_LLVM::get_vector_num_elements(const llvm::Type *t) {
}
}

// Convenience overload: returns the element count of the value's LLVM type
// (1 for scalars), by delegating to the llvm::Type* overload.
int CodeGen_LLVM::get_vector_num_elements(const llvm::Value *v) {
    return get_vector_num_elements(v->getType());
}

llvm::Type *CodeGen_LLVM::llvm_type_of(LLVMContext *c, Halide::Type t,
int effective_vscale) const {
if (t.lanes() == 1) {
Expand Down Expand Up @@ -5481,23 +5487,7 @@ llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Type *t, int n,
switch (type_constraint) {
case VectorTypeConstraint::None:
if (effective_vscale > 0) {
bool wide_enough = true;
// TODO(https://github.com/halide/Halide/issues/8119): Architecture
// specific code should not go here. Ideally part of this can go
// away via LLVM fixes and modifying intrinsic selection to handle
// scalable vs. fixed vectors. Making this method virtual is
// possibly expensive.
if (target.arch == Target::ARM) {
if (!target.has_feature(Target::NoNEON)) {
// force booleans into bytes. TODO(https://github.com/halide/Halide/issues/8119): figure out a better way to do this.
int bit_size = std::max((int)t->getScalarSizeInBits(), 8);
wide_enough = (bit_size * n) > 128;
} else {
// TODO(https://github.com/halide/Halide/issues/8119): AArch64 SVE2 support is crashy with scalable vectors of min size 1.
wide_enough = (n / effective_vscale) > 1;
}
}
scalable = wide_enough && ((n % effective_vscale) == 0);
scalable = (n % effective_vscale) == 0;
if (scalable) {
n = n / effective_vscale;
}
Expand Down
3 changes: 3 additions & 0 deletions src/CodeGen_LLVM.h
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,10 @@ class CodeGen_LLVM : public IRVisitor {
const std::function<llvm::Value *(llvm::Value *)> &fn);

/** Get number of vector elements, taking into account scalable vectors. Returns 1 for scalars. */
// @{
int get_vector_num_elements(const llvm::Type *t);
int get_vector_num_elements(const llvm::Value *v);
// @}

/** Interface to abstract vector code generation as LLVM is now
* providing multiple options to express even simple vector
Expand Down
80 changes: 80 additions & 0 deletions src/DecomposeVectorShuffle.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#include "DecomposeVectorShuffle.h"
Comment thread
alexreinking marked this conversation as resolved.

#include <unordered_map>

namespace Halide::Internal {

// Plan a general shuffle as per-destination-slice sequences of native-width
// (vl-lane) shuffle steps. The two source vectors are treated as one
// concatenated index space; `indices` selects source lanes for each dst lane
// (negative index = don't-care). Returns one step sequence per dst slice;
// within a sequence, each step after the first carries the previous step's
// result as its 'a' input (see SliceIndexCarryPrevResult).
std::vector<std::vector<NativeShuffle>> decompose_to_native_shuffles(
    int src_lanes, const std::vector<int> &indices, int vl) {

    int dst_lanes = static_cast<int>(indices.size());
    int src_lanes_aligned = align_up(src_lanes, vl);

    // Adjust indices so that src vectors are aligned up to multiple of vl.
    // Indices into the second source (>= src_lanes) are shifted so that the
    // second source begins on a vl-aligned slice boundary.
    std::vector<int> aligned_indices = indices;
    for (int &idx : aligned_indices) {
        if (idx >= src_lanes) {
            idx += src_lanes_aligned - src_lanes;
        }
    }

    const int num_dst_slices = align_up(dst_lanes, vl) / vl;
    std::vector<std::vector<NativeShuffle>> all_steps(num_dst_slices);

    for (int dst_slice = 0; dst_slice < num_dst_slices; dst_slice++) {
        // Maps a source slice index to the step (within `steps`) that reads it,
        // so repeated references to the same slice reuse that step.
        std::unordered_map<int, int> slice_to_step;
        auto &steps = all_steps[dst_slice];
        const int dst_start = dst_slice * vl;

        // The final dst slice may be partial; stop at dst_lanes.
        for (int dst_index = dst_start; dst_index < dst_start + vl && dst_index < dst_lanes; ++dst_index) {
            const int src_index = aligned_indices[dst_index];
            if (src_index < 0) {
                // Don't-care lane; leave it unassigned (SliceIndexNone).
                continue;
            }

            const int src_slice = src_index / vl;
            const int lane_in_src_slice = src_index % vl;
            const int lane_in_dst_slice = dst_index - dst_start;

            if (steps.empty()) {
                // first slice in this block: start the first step with this
                // source slice as input 'a' (lanes [0, vl) of the lane map)
                slice_to_step[src_slice] = 0;
                steps.emplace_back(vl, src_slice, SliceIndexNone);
                steps.back().lane_map[lane_in_dst_slice] = lane_in_src_slice;

            } else if (auto itr = slice_to_step.find(src_slice); itr != slice_to_step.end()) {
                // slice already seen: route this lane through the step that
                // already reads src_slice, offsetting by vl if it is input 'b'
                NativeShuffle &step = steps[itr->second];
                bool is_a = (step.slice_a != SliceIndexCarryPrevResult && step.slice_a == src_slice);
                int offset = is_a ? 0 : vl;
                step.lane_map[lane_in_dst_slice] = lane_in_src_slice + offset;

            } else if (steps[0].slice_b == SliceIndexNone) {
                // add as 'b' of first step if b is unused
                slice_to_step[src_slice] = 0;
                steps[0].slice_b = src_slice;
                steps[0].lane_map[lane_in_dst_slice] = lane_in_src_slice + vl;

            } else {
                // otherwise chain a new step
                slice_to_step[src_slice] = static_cast<int>(steps.size());
                // new step uses previous result as 'a', so we use 'b' for this one
                steps.emplace_back(vl, SliceIndexCarryPrevResult, src_slice);

                // Except for the first step, we need to arrange indices
                // so that the output carried from the previous step is kept
                auto &lane_map = steps.back().lane_map;
                // initialize lane_map as identical copy (pass-through of the
                // carried previous result, which occupies lanes [0, vl))
                for (size_t lane_idx = 0; lane_idx < lane_map.size(); ++lane_idx) {
                    lane_map[lane_idx] = lane_idx;
                }
                // update for this index
                lane_map[lane_in_dst_slice] = lane_in_src_slice + vl;
            }
        }
    }

    return all_steps;
}

} // namespace Halide::Internal
163 changes: 163 additions & 0 deletions src/DecomposeVectorShuffle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
#ifndef HALIDE_DECOMPOSE_VECTOR_SHUFFLE_H
#define HALIDE_DECOMPOSE_VECTOR_SHUFFLE_H

/** \file
*
 * Perform a vector shuffle by decomposing the operation into
 * a sequence of sub-shuffle steps, where each step:
 * - Takes one or two slices as input (slice_a and slice_b)
 * - Produces one slice (the dst slice)
 * - Uses slices whose length equals the target native vector length (vl)
 *
 * The sequence of steps is structured as:
 * 1. An outer loop that iterates over the slices of the dst vector.
 * 2. An inner loop that iterates over the native shuffle steps required to
 *    complete a single dst slice. Multiple steps may be needed because a
 *    single native shuffle can take at most 2 slices (native vector length x 2)
 *    as input, while we may need to fetch from a wider region of the src vector.
*
* The following example, log of test code, illustrates how it works.
*
* src_lanes: 17, dst_lanes: 7, vl: 4
* input a: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, ]
* input b: [170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, ]
* indices: [6, 13, 24, 14, 7, 11, 5, ]
*
* slice a:[40, 50, 60, 70, ], slice b:[120, 130, 140, 150, ], indices:[2, 5, -1, 6, ]
* => slice output:[60, 130, -559038801, 140, ]
* slice a:[60, 130, -559038801, 140, ], slice b:[210, 220, 230, 240, ], indices:[0, 1, 7, 3, ]
* => slice output:[60, 130, 240, 140, ]
* slice a:[40, 50, 60, 70, ], slice b:[80, 90, 100, 110, ], indices:[3, 7, 1, -1, ]
* => slice output:[70, 110, 50, -559038801, ]
*
* output: [60, 130, 240, 140, 70, 110, 50, ]
*
*/

#include "Error.h"
#include "Util.h"

#include <optional>
#include <vector>

namespace Halide {
namespace Internal {

/** Sentinel values that may appear where a real (non-negative) slice index
 * or lane index is expected. */
enum {
    SliceIndexNone = -1,
    SliceIndexCarryPrevResult = -2,
};

/** A single native-width shuffle step. Lane-map entries in [0, vl) select
 * from slice_a; entries in [vl, 2*vl) select from slice_b; SliceIndexNone
 * marks an unassigned (don't-care) output lane. */
struct NativeShuffle {
    int slice_a;
    int slice_b;
    std::vector<int> lane_map;

    NativeShuffle(int vl, int a, int b)
        : slice_a(a), slice_b(b), lane_map(vl, SliceIndexNone) {
    }
};

/** Plan a shuffle of up to two concatenated sources (src_lanes lanes total
 * per source's index space) into native-width (vl-lane) steps, one step
 * sequence per destination slice. Negative indices are don't-care lanes. */
std::vector<std::vector<NativeShuffle>> decompose_to_native_shuffles(
    int src_lanes, const std::vector<int> &indices, int vl);

/** Algorithm logic for shuffle decomposition, parameterized on vector type
 * and a codegen-like class that provides primitive vector operations.
 */
template<typename CodeGenTy, typename VecTy>
struct DecomposeVectorShuffle {
    // TODO: when upgrading to C++20, replace with a concept.
    // get_vector_num_elements may be overloaded (e.g. on Type* and Value*), so use
    // expression SFINAE rather than a method pointer to handle overload resolution.
    static_assert(std::is_convertible_v<decltype(std::declval<CodeGenTy &>().get_vector_num_elements(std::declval<VecTy>())), int>,
                  "CodeGenTy must provide: int get_vector_num_elements(VecTy)");
    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::slice_vector), CodeGenTy &, const VecTy &, int, int>,
                  "CodeGenTy must provide: VecTy slice_vector(const VecTy &, int, int)");
    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::concat_vectors), CodeGenTy &, const std::vector<VecTy> &>,
                  "CodeGenTy must provide: VecTy concat_vectors(const std::vector<VecTy> &)");
    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::shuffle_scalable_vectors_general), CodeGenTy &,
                                        const VecTy &, const VecTy &, const std::vector<int> &>,
                  "CodeGenTy must provide: VecTy shuffle_scalable_vectors_general(const VecTy &, const VecTy &, const std::vector<int> &)");
    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::create_undef_vector_like), CodeGenTy &, const VecTy &, int>,
                  "CodeGenTy must provide: VecTy create_undef_vector_like(const VecTy &, int)");

    /** Both sources are padded (via align_up_vector) so their lengths are
     * multiples of vl before any slicing happens. `src_lanes` is the
     * unpadded lane count shared by both sources. */
    DecomposeVectorShuffle(CodeGenTy &codegen, const VecTy &src_a, const VecTy &src_b, int src_lanes, int vl)
        : codegen(codegen),
          vl(vl),
          src_a(align_up_vector(src_a, vl)),
          src_b(align_up_vector(src_b, vl)),
          src_lanes(src_lanes),
          src_lanes_aligned(align_up(src_lanes, vl)) {
    }

    /** Execute the decomposed shuffle described by `indices` (one source lane
     * per destination lane; negative = don't-care) and return the result
     * trimmed to indices.size() lanes. */
    VecTy run(const std::vector<int> &indices) {
        auto shuffle_plan = decompose_to_native_shuffles(src_lanes, indices, vl);
        int dst_lanes = static_cast<int>(indices.size());

        // process each block divided by vl
        std::vector<VecTy> shuffled_dst_slices;
        shuffled_dst_slices.reserve(shuffle_plan.size());

        for (const auto &steps_for_dst_slice : shuffle_plan) {
            // Accumulates the partial result carried between chained steps.
            std::optional<VecTy> dst_slice = std::nullopt;
            for (const auto &step : steps_for_dst_slice) {
                // Obtain 1st slice a
                VecTy a;
                if (step.slice_a == SliceIndexCarryPrevResult) {
                    internal_assert(dst_slice.has_value()) << "Tried to carry from undefined previous result";
                    a = *dst_slice;
                } else {
                    a = get_vl_slice(step.slice_a);
                }
                // Obtain 2nd slice b
                std::optional<VecTy> b;
                if (step.slice_b == SliceIndexNone) {
                    b = std::nullopt;
                } else {
                    b = std::optional<VecTy>(get_vl_slice(step.slice_b));
                }
                // Perform shuffle where vector length is aligned. If b is
                // unused, a default-constructed VecTy is passed; the lane map
                // never references its lanes in that case.
                dst_slice = codegen.shuffle_scalable_vectors_general(a, b.value_or(VecTy{}), step.lane_map);
            }
            if (!dst_slice.has_value()) {
                // No shuffle step for this slice, i.e. all the indices are -1
                dst_slice = codegen.create_undef_vector_like(src_a, vl);
            }
            shuffled_dst_slices.push_back(*dst_slice);
        }

        // Concatenate the vl-wide slices and trim the tail padding.
        return codegen.slice_vector(codegen.concat_vectors(shuffled_dst_slices), 0, dst_lanes);
    }

private:
    // Helper to extract slice with lanes=vl. Slice indices below
    // src_lanes_aligned/vl come from src_a; the rest come from src_b.
    VecTy get_vl_slice(int slice_index) {
        const int num_slices_a = src_lanes_aligned / vl;
        int start_index = slice_index * vl;
        if (slice_index < num_slices_a) {
            return codegen.slice_vector(src_a, start_index, vl);
        } else {
            start_index -= src_lanes_aligned;
            return codegen.slice_vector(src_b, start_index, vl);
        }
    }

    // Pad a vector to a multiple of `align` lanes (slice_vector extends with
    // undef lanes when asked for more lanes than the input has).
    VecTy align_up_vector(const VecTy &v, int align) {
        int len = codegen.get_vector_num_elements(v);
        return codegen.slice_vector(v, 0, align_up(len, align));
    }

    CodeGenTy &codegen;
    int vl;                  // native vector length (lanes per slice)
    VecTy src_a;             // first source, padded to a multiple of vl
    VecTy src_b;             // second source, padded to a multiple of vl
    int src_lanes;           // unpadded lane count of each source
    int src_lanes_aligned;   // src_lanes rounded up to a multiple of vl
};

} // namespace Internal
} // namespace Halide

#endif
1 change: 1 addition & 0 deletions test/correctness/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ tests(GROUPS correctness
debug_to_file.cpp
debug_to_file_multiple_outputs.cpp
debug_to_file_reorder.cpp
decompose_vector_shuffle.cpp
deferred_loop_level.cpp
deinterleave4.cpp
device_buffer_copies_with_profile.cpp
Expand Down
Loading
Loading