Skip to content

Commit 3645cef

Browse files
[AMDGPU] LiveRegOptimizer: consider i8/i16 binops on SDWA (#155800)
The PHI-node part was merged in PR #160909. This change extends `isOpLegal` to treat 8/16-bit vector add/sub/and/or/xor as profitable on SDWA targets (stores and intrinsics remain profitable). This repacks loop-carried values to i32 across BBs and restores SDWA lowering instead of scattered lshr/lshl/or sequences. Testing: `check-llvm-codegen-amdgpu` is green locally (4314/4320 passed, 6 XFAIL), and the change was additionally validated in AMD internal CI.
1 parent 463c9f0 commit 3645cef

File tree

2 files changed

+95
-1
lines changed

2 files changed

+95
-1
lines changed

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,38 @@ class LiveRegOptimizer {
126126
return LK.first != TargetLoweringBase::TypeLegal;
127127
}
128128

129-
bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }
129+
bool isOpLegal(const Instruction *I) {
130+
if (isa<IntrinsicInst>(I))
131+
return true;
132+
133+
// Any store is a profitable sink (prevents flip-flopping)
134+
if (isa<StoreInst>(I))
135+
return true;
136+
137+
if (auto *BO = dyn_cast<BinaryOperator>(I)) {
138+
if (auto *VT = dyn_cast<FixedVectorType>(BO->getType())) {
139+
if (const auto *IT = dyn_cast<IntegerType>(VT->getElementType())) {
140+
unsigned EB = IT->getBitWidth();
141+
unsigned EC = VT->getNumElements();
142+
// Check for SDWA-compatible operation
143+
if ((EB == 8 || EB == 16) && ST.hasSDWA() && EC * EB <= 32) {
144+
switch (BO->getOpcode()) {
145+
case Instruction::Add:
146+
case Instruction::Sub:
147+
case Instruction::And:
148+
case Instruction::Or:
149+
case Instruction::Xor:
150+
return true;
151+
default:
152+
break;
153+
}
154+
}
155+
}
156+
}
157+
}
158+
159+
return false;
160+
}
130161

131162
bool isCoercionProfitable(Instruction *II) {
132163
SmallPtrSet<Instruction *, 4> CVisited;
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
; REQUIRES: amdgpu-registered-target
2+
; RUN: opt -S -passes=amdgpu-late-codegenprepare \
3+
; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
4+
5+
; Purpose:
6+
; - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
7+
; loop header (same basic block as the PHI).
8+
; - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
9+
; the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
10+
; placed in the header (enabling SDWA-friendly lowering later).
11+
;
12+
; What we check:
13+
; - PHI is i32 (no loop-carried <4 x i8> PHI remains).
14+
; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
15+
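; - The loop block (%loop, which is also the backedge source) is expected to
;   produce a bitcast <4 x i8> -> i32 for the backedge; note that the CHECK
;   lines below do not match this bitcast explicitly.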
16+
17+
; Test kernel with a single-block loop: %loop branches back to itself, so the
; PHIs, the vector add, and the backedge all live in the same basic block.
; %p is not referenced in the body; %n is the loop bound.
define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) {
18+
entry:
19+
br label %loop
20+
21+
loop:
22+
; Loop index: starts at 0 and advances by 4 per iteration (see %i.next).
23+
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
24+
25+
; Loop-carried accumulator in vector-of-bytes form (problematic on input);
; this is the PHI the pass is expected to coerce to i32.
26+
%acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
27+
28+
; Derive four i8 values from %i (truncations of %i, %i+1, %i+2, %i+3) so the
; test needs no loads and avoids memory noise.
29+
%i0 = trunc i32 %i to i8
30+
%i1i = add i32 %i, 1
31+
%i1 = trunc i32 %i1i to i8
32+
%i2i = add i32 %i, 2
33+
%i2 = trunc i32 %i2i to i8
34+
%i3i = add i32 %i, 3
35+
%i3 = trunc i32 %i3i to i8
36+
37+
; Pack them into <4 x i8>, one lane per insertelement.
38+
%v01 = insertelement <4 x i8> zeroinitializer, i8 %i0, i32 0
39+
%v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
40+
%v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
41+
%v = insertelement <4 x i8> %v03, i8 %i3, i32 3
42+
43+
; Byte-wise add in the same block as the PHI (this must make coercion profitable).
; Its result feeds the %acc PHI on the backedge.
44+
%acc.next = add <4 x i8> %acc, %v
45+
46+
; Loop control: keep looping while %i.next < %n (signed compare).
47+
%i.next = add i32 %i, 4
48+
%cond = icmp slt i32 %i.next, %n
49+
br i1 %cond, label %loop, label %exit
50+
51+
exit:
52+
ret void
53+
}
54+
55+
; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
56+
; CHECK: loop:
57+
; CHECK: %i = phi i32
58+
; CHECK-NOT: phi <4 x i8>
59+
; CHECK: %[[ACCI32:[^ ]+]] = phi i32
60+
; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
61+
; CHECK: add <4 x i8> %[[HDRCAST]],
62+
; CHECK: br i1
63+

0 commit comments

Comments
 (0)