-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Shuffle scalable vector in CodeGen_ARM #8898
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
27 commits
Select commit
Hold shift + click to select a range
9772743
Add helpers for shuffle operations of scalable vector
stevesuzuki-arm 5793e89
Move helpers for shuffle scalable vectors to CodeGen_ARM
stevesuzuki-arm 263f6c6
Modify workaround of using FixedVector for ScalableVector
stevesuzuki-arm 7cdfd64
Shuffle scalable vector in CodeGen_ARM
stevesuzuki-arm 85ef5b5
Add DecomposeVectorShuffle to Makefile
stevesuzuki-arm 84ec0ee
Improve performance of vector broadcast in SVE2
stevesuzuki-arm 94c1684
Modify test cases of total bit width in simd_op_check_sve2
stevesuzuki-arm e3ae7ab
Simplify DecomposeVectorShuffle implementation; move test to tests/
alexreinking 190e032
Remove extra header in decompose_vector_shuffle.cpp
alexreinking f671968
Add an assertion that highest_lane is a real lane.
alexreinking 9d536a8
Assert that dst_slice has a value
alexreinking ec222f7
Don't assert on paths that don't require alignment
alexreinking 6cfb39e
Assert arguments to shuffle_scalable_vectors_general are valid
alexreinking c58006e
Fix typo in comment in CodeGen_ARM::visit(Shuffle)
alexreinking f35a8a8
Eliminate need for DecomposeVectorShuffle wrappers
alexreinking 562ac36
Drop old function prototypes
alexreinking 26809ed
Remove unused `using std::optional`
alexreinking c2b6f7b
Fix edge case bug in DecomposeVectorShuffle
stevesuzuki-arm bc284e0
Merge branch 'main' into pr-shuffle_sve2
stevesuzuki-arm acb6f5b
Add "tbl" and "whilelt" in simd_op_check_sve2
stevesuzuki-arm be0b4a6
Merge branch 'main' into pr-shuffle_sve2
stevesuzuki-arm 2d43f3d
Skip failing tests on SVE2 target with LLVM21
stevesuzuki-arm b4572b7
Merge branch 'main' into pr-shuffle_sve2
alexreinking 80087d5
Skip performance_boundary_conditions on SVE2 with LLVM 21
alexreinking 9ea7782
Merge branch 'main' into pr-shuffle_sve2
alexreinking b1a53b6
Use CamelCase instead of ALL_CAPS in enum value names
alexreinking 2c47425
Skip Lesson 5 in CMake rather than the code.
alexreinking File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| #include "DecomposeVectorShuffle.h" | ||
|
alexreinking marked this conversation as resolved.
|
||
|
|
||
| #include <unordered_map> | ||
|
|
||
| namespace Halide::Internal { | ||
|
|
||
| std::vector<std::vector<NativeShuffle>> decompose_to_native_shuffles( | ||
| int src_lanes, const std::vector<int> &indices, int vl) { | ||
|
|
||
| int dst_lanes = static_cast<int>(indices.size()); | ||
| int src_lanes_aligned = align_up(src_lanes, vl); | ||
|
|
||
| // Adjust indices so that src vectors are aligned up to multiple of vl | ||
| std::vector<int> aligned_indices = indices; | ||
| for (int &idx : aligned_indices) { | ||
| if (idx >= src_lanes) { | ||
| idx += src_lanes_aligned - src_lanes; | ||
| } | ||
| } | ||
|
|
||
| const int num_dst_slices = align_up(dst_lanes, vl) / vl; | ||
| std::vector<std::vector<NativeShuffle>> all_steps(num_dst_slices); | ||
|
|
||
| for (int dst_slice = 0; dst_slice < num_dst_slices; dst_slice++) { | ||
| std::unordered_map<int, int> slice_to_step; | ||
| auto &steps = all_steps[dst_slice]; | ||
| const int dst_start = dst_slice * vl; | ||
|
|
||
| for (int dst_index = dst_start; dst_index < dst_start + vl && dst_index < dst_lanes; ++dst_index) { | ||
| const int src_index = aligned_indices[dst_index]; | ||
| if (src_index < 0) { | ||
| continue; | ||
| } | ||
|
|
||
| const int src_slice = src_index / vl; | ||
| const int lane_in_src_slice = src_index % vl; | ||
| const int lane_in_dst_slice = dst_index - dst_start; | ||
|
|
||
| if (steps.empty()) { | ||
| // first slice in this block | ||
| slice_to_step[src_slice] = 0; | ||
| steps.emplace_back(vl, src_slice, SliceIndexNone); | ||
| steps.back().lane_map[lane_in_dst_slice] = lane_in_src_slice; | ||
|
|
||
| } else if (auto itr = slice_to_step.find(src_slice); itr != slice_to_step.end()) { | ||
| // slice already seen | ||
| NativeShuffle &step = steps[itr->second]; | ||
| bool is_a = (step.slice_a != SliceIndexCarryPrevResult && step.slice_a == src_slice); | ||
| int offset = is_a ? 0 : vl; | ||
| step.lane_map[lane_in_dst_slice] = lane_in_src_slice + offset; | ||
|
|
||
| } else if (steps[0].slice_b == SliceIndexNone) { | ||
| // add as 'b' of first step if b is unused | ||
| slice_to_step[src_slice] = 0; | ||
| steps[0].slice_b = src_slice; | ||
| steps[0].lane_map[lane_in_dst_slice] = lane_in_src_slice + vl; | ||
|
|
||
| } else { | ||
| // otherwise chain a new step | ||
| slice_to_step[src_slice] = static_cast<int>(steps.size()); | ||
| // new step uses previous result as 'a', so we use 'b' for this one | ||
| steps.emplace_back(vl, SliceIndexCarryPrevResult, src_slice); | ||
|
|
||
| // Except for the first step, we need to arrange indices | ||
| // so that the output carried from the previous step is kept | ||
| auto &lane_map = steps.back().lane_map; | ||
| // initialize lane_map as identical copy | ||
| for (size_t lane_idx = 0; lane_idx < lane_map.size(); ++lane_idx) { | ||
| lane_map[lane_idx] = lane_idx; | ||
| } | ||
| // update for this index | ||
| lane_map[lane_in_dst_slice] = lane_in_src_slice + vl; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return all_steps; | ||
| } | ||
|
|
||
| } // namespace Halide::Internal | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,163 @@ | ||
| #ifndef HALIDE_DECOMPOSE_VECTOR_SHUFFLE_H | ||
| #define HALIDE_DECOMPOSE_VECTOR_SHUFFLE_H | ||
|
|
||
| /** \file | ||
| * | ||
|  * Perform a vector shuffle by decomposing the operation into | ||
|  * a sequence of sub-shuffle steps, where each step: | ||
|  * - Takes one or two slices as input (slice_a and slice_b) | ||
|  * - Produces one slice (dst slice) | ||
|  * - Operates on slices whose length equals the target native vector length (vl) | ||
| * | ||
| * The structure of the sequence of steps consists of: | ||
| * 1. Outer loop to iterate the slices of dst vector. | ||
| * 2. Inner loop to iterate the native shuffle steps to complete a single dst slice. | ||
| * This can be multiple steps because a single native shuffle can take | ||
| * only 2 slices (native vector length x 2) at most, while we may need | ||
| * to fetch from wider location in the src vector. | ||
| * | ||
| * The following example, log of test code, illustrates how it works. | ||
| * | ||
| * src_lanes: 17, dst_lanes: 7, vl: 4 | ||
| * input a: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, ] | ||
| * input b: [170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, ] | ||
| * indices: [6, 13, 24, 14, 7, 11, 5, ] | ||
| * | ||
| * slice a:[40, 50, 60, 70, ], slice b:[120, 130, 140, 150, ], indices:[2, 5, -1, 6, ] | ||
| * => slice output:[60, 130, -559038801, 140, ] | ||
| * slice a:[60, 130, -559038801, 140, ], slice b:[210, 220, 230, 240, ], indices:[0, 1, 7, 3, ] | ||
| * => slice output:[60, 130, 240, 140, ] | ||
| * slice a:[40, 50, 60, 70, ], slice b:[80, 90, 100, 110, ], indices:[3, 7, 1, -1, ] | ||
| * => slice output:[70, 110, 50, -559038801, ] | ||
| * | ||
| * output: [60, 130, 240, 140, 70, 110, 50, ] | ||
| * | ||
| */ | ||
|
|
||
| #include "Error.h" | ||
| #include "Util.h" | ||
|
|
||
| #include <optional> | ||
| #include <type_traits> | ||
| #include <utility> | ||
| #include <vector> | ||
|
|
||
| namespace Halide { | ||
| namespace Internal { | ||
|
|
||
/** Sentinel values that may appear where a slice index is expected.
 * Real slice indices are non-negative, so both sentinels are negative. */
enum {
    /** No slice: used for an unused slice_b and as the initial value of
     * every lane_map entry. */
    SliceIndexNone = -1,
    /** Used as slice_a of a chained step: the 'a' input is the result of
     * the previous step rather than a slice of the source. */
    SliceIndexCarryPrevResult = -2,
};

/** One native shuffle step: selects lanes from up to two vl-wide inputs. */
struct NativeShuffle {
    /** Index of the first input slice, or SliceIndexCarryPrevResult. */
    int slice_a;
    /** Index of the second input slice, or SliceIndexNone if unused. */
    int slice_b;
    /** One entry per output lane. Entries in [0, vl) select a lane of 'a',
     * entries in [vl, 2 * vl) select a lane of 'b', and SliceIndexNone
     * leaves the lane unassigned. */
    std::vector<int> lane_map;

    /** Create a step over inputs a and b with all vl output lanes
     * initially unassigned. */
    NativeShuffle(int vl, int a, int b)
        : slice_a(a),
          slice_b(b),
          lane_map(vl, SliceIndexNone) {
    }
};
|
|
||
/** Plan a vector shuffle as sequences of native-width shuffle steps.
 *
 * \param src_lanes lane count at which indices start referring to the second
 *                  source; indices >= src_lanes are shifted so that the
 *                  first source occupies a multiple of vl lanes.
 * \param indices   one source index per dst lane; negative entries produce
 *                  no step.
 * \param vl        slice length in lanes used for every step.
 * \return one vector of steps per vl-wide dst slice, i.e.
 *         align_up(indices.size(), vl) / vl entries. An entry is empty when
 *         every index in that slice is negative.
 */
std::vector<std::vector<NativeShuffle>> decompose_to_native_shuffles(
    int src_lanes, const std::vector<int> &indices, int vl);
|
|
||
| /** Algorithm logic for shuffle decomposition, parameterized on vector type | ||
| * and a codegen-like class that provides primitive vector operations. | ||
| */ | ||
| template<typename CodeGenTy, typename VecTy> | ||
| struct DecomposeVectorShuffle { | ||
| // TODO: when upgrading to C++20, replace with a concept. | ||
| // get_vector_num_elements may be overloaded (e.g. on Type* and Value*), so use | ||
| // expression SFINAE rather than a method pointer to handle overload resolution. | ||
| static_assert(std::is_convertible_v<decltype(std::declval<CodeGenTy &>().get_vector_num_elements(std::declval<VecTy>())), int>, | ||
| "CodeGenTy must provide: int get_vector_num_elements(VecTy)"); | ||
| static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::slice_vector), CodeGenTy &, const VecTy &, int, int>, | ||
| "CodeGenTy must provide: VecTy slice_vector(const VecTy &, int, int)"); | ||
| static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::concat_vectors), CodeGenTy &, const std::vector<VecTy> &>, | ||
| "CodeGenTy must provide: VecTy concat_vectors(const std::vector<VecTy> &)"); | ||
| static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::shuffle_scalable_vectors_general), CodeGenTy &, | ||
| const VecTy &, const VecTy &, const std::vector<int> &>, | ||
| "CodeGenTy must provide: VecTy shuffle_scalable_vectors_general(const VecTy &, const VecTy &, const std::vector<int> &)"); | ||
| static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::create_undef_vector_like), CodeGenTy &, const VecTy &, int>, | ||
| "CodeGenTy must provide: VecTy create_undef_vector_like(const VecTy &, int)"); | ||
|
|
||
| DecomposeVectorShuffle(CodeGenTy &codegen, const VecTy &src_a, const VecTy &src_b, int src_lanes, int vl) | ||
| : codegen(codegen), | ||
| vl(vl), | ||
| src_a(align_up_vector(src_a, vl)), | ||
| src_b(align_up_vector(src_b, vl)), | ||
| src_lanes(src_lanes), | ||
| src_lanes_aligned(align_up(src_lanes, vl)) { | ||
| } | ||
|
|
||
| VecTy run(const std::vector<int> &indices) { | ||
| auto shuffle_plan = decompose_to_native_shuffles(src_lanes, indices, vl); | ||
| int dst_lanes = static_cast<int>(indices.size()); | ||
|
|
||
| // process each block divided by vl | ||
| std::vector<VecTy> shuffled_dst_slices; | ||
| shuffled_dst_slices.reserve(shuffle_plan.size()); | ||
|
|
||
| for (const auto &steps_for_dst_slice : shuffle_plan) { | ||
| std::optional<VecTy> dst_slice = std::nullopt; | ||
| for (const auto &step : steps_for_dst_slice) { | ||
| // Obtain 1st slice a | ||
| VecTy a; | ||
| if (step.slice_a == SliceIndexCarryPrevResult) { | ||
| internal_assert(dst_slice.has_value()) << "Tried to carry from undefined previous result"; | ||
| a = *dst_slice; | ||
| } else { | ||
| a = get_vl_slice(step.slice_a); | ||
| } | ||
| // Obtain 2nd slice b | ||
| std::optional<VecTy> b; | ||
| if (step.slice_b == SliceIndexNone) { | ||
| b = std::nullopt; | ||
| } else { | ||
| b = std::optional<VecTy>(get_vl_slice(step.slice_b)); | ||
| } | ||
| // Perform shuffle where vector length is aligned | ||
| dst_slice = codegen.shuffle_scalable_vectors_general(a, b.value_or(VecTy{}), step.lane_map); | ||
| } | ||
| if (!dst_slice.has_value()) { | ||
| // No shuffle step for this slice, i.e. all the indices are -1 | ||
| dst_slice = codegen.create_undef_vector_like(src_a, vl); | ||
| } | ||
| shuffled_dst_slices.push_back(*dst_slice); | ||
| } | ||
|
|
||
| return codegen.slice_vector(codegen.concat_vectors(shuffled_dst_slices), 0, dst_lanes); | ||
| } | ||
|
|
||
| private: | ||
| // Helper to extract slice with lanes=vl | ||
| VecTy get_vl_slice(int slice_index) { | ||
| const int num_slices_a = src_lanes_aligned / vl; | ||
| int start_index = slice_index * vl; | ||
| if (slice_index < num_slices_a) { | ||
| return codegen.slice_vector(src_a, start_index, vl); | ||
| } else { | ||
| start_index -= src_lanes_aligned; | ||
| return codegen.slice_vector(src_b, start_index, vl); | ||
| } | ||
| } | ||
|
|
||
| VecTy align_up_vector(const VecTy &v, int align) { | ||
| int len = codegen.get_vector_num_elements(v); | ||
| return codegen.slice_vector(v, 0, align_up(len, align)); | ||
| } | ||
|
|
||
| CodeGenTy &codegen; | ||
| int vl; | ||
| VecTy src_a; | ||
| VecTy src_b; | ||
| int src_lanes; | ||
| int src_lanes_aligned; | ||
| }; | ||
|
|
||
| } // namespace Internal | ||
| } // namespace Halide | ||
|
|
||
| #endif |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.