[AMDGPU] LiveRegOptimizer: consider i8/i16 binops on SDWA (#155800)

michaelselehov · web-flow · commit 3645cef1ef50 · 2025-12-15T12:04:33.000-05:00
The PHI-node part was merged in PR #160909.

Extend `isOpLegal` to treat 8/16-bit vector add/sub/and/or/xor (on vectors of
at most 32 bits in total) as profitable on SDWA targets; stores and intrinsics
remain profitable. This repacks loop-carried values to i32 across basic blocks
and restores SDWA lowering instead of scattered lshr/lshl/or sequences.

Testing:
- Local: `check-llvm-codegen-amdgpu` is green (4314/4320 passed, 6
XFAIL).
- Additional: validated in AMD internal CI
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,7 +126,38 @@ class LiveRegOptimizer {
     return LK.first != TargetLoweringBase::TypeLegal;
   }
 
-  bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }
+  bool isOpLegal(const Instruction *I) {
+    if (isa<IntrinsicInst>(I))
+      return true;
+
+    // Any store is a profitable sink (prevents flip-flopping)
+    if (isa<StoreInst>(I))
+      return true;
+
+    if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+      if (auto *VT = dyn_cast<FixedVectorType>(BO->getType())) {
+        if (const auto *IT = dyn_cast<IntegerType>(VT->getElementType())) {
+          unsigned EB = IT->getBitWidth();
+          unsigned EC = VT->getNumElements();
+          // Needs SDWA, i8/i16 elements, and a vector of at most 32 bits.
+          if ((EB == 8 || EB == 16) && ST.hasSDWA() && EC * EB <= 32) {
+            switch (BO->getOpcode()) {
+            case Instruction::Add:
+            case Instruction::Sub:
+            case Instruction::And:
+            case Instruction::Or:
+            case Instruction::Xor:
+              return true;
+            default:
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    return false;
+  }
 
   bool isCoercionProfitable(Instruction *II) {
     SmallPtrSet<Instruction *, 4> CVisited;
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -0,0 +1,63 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: opt -S -passes=amdgpu-late-codegenprepare \
+; RUN:   -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
+
+; Purpose:
+;  - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
+;    loop header (same basic block as the PHI).
+;  - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
+;    the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
+;    placed in the header (enabling SDWA-friendly lowering later).
+;
+; What we check:
+;  - PHI is i32 (no loop-carried <4 x i8> PHI remains).
+;  - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
+;  - NOTE: the <4 x i8> -> i32 bitcast feeding the backedge is not checked below.
+
+define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) {
+entry:
+  br label %loop
+
+loop:
+  ; Loop index
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+
+  ; Loop-carried accumulator in vector-of-bytes form (problematic on input).
+  %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
+
+  ; Make up four i8 values derived from %i to avoid memory noise.
+  %i0 = trunc i32 %i to i8
+  %i1i = add i32 %i, 1
+  %i1 = trunc i32 %i1i to i8
+  %i2i = add i32 %i, 2
+  %i2 = trunc i32 %i2i to i8
+  %i3i = add i32 %i, 3
+  %i3 = trunc i32 %i3i to i8
+
+  ; Pack them into <4 x i8>.
+  %v01 = insertelement <4 x i8> zeroinitializer, i8 %i0, i32 0
+  %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
+  %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
+  %v   = insertelement <4 x i8> %v03, i8 %i3, i32 3
+
+  ; Byte-wise add in the same block as the PHI (this must make coercion profitable).
+  %acc.next = add <4 x i8> %acc, %v
+
+  ; Loop control.
+  %i.next = add i32 %i, 4
+  %cond = icmp slt i32 %i.next, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
+; CHECK: loop:
+; CHECK: %i = phi i32
+; CHECK-NOT: phi <4 x i8>
+; CHECK: %[[ACCI32:[^ ]+]] = phi i32
+; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
+; CHECK: add <4 x i8> %[[HDRCAST]],
+; CHECK: br i1
+