Skip to content

Commit 6f3d70f

Browse files
[AArch64][GlobalISel] Fix incorrect codegen for FPR16/FPR8 to GPR copies
Previously, copyPhysReg() was missing handlers for copies between the FPR16/FPR8 and GPR32/GPR64 register classes. These cases fell through to the NZCV handler, which incorrectly generated 'mrs Rd, NZCV' instead of the proper FMOV instruction.

This caused incorrect code generation for patterns like:

    %ival = bitcast half %val to i16
    store atomic i16 %ival, ptr %addr release, align 2

which generated 'mrs w8, NZCV' instead of 'fmov w8, h0'.

The fix adds proper copy handlers:
- FPR16 <-> GPR32: Use FMOVHWr/FMOVWHr with FullFP16; otherwise promote to the FPR32 super-register and use FMOVSWr/FMOVWSr
- FPR16 <-> GPR64: Use FMOVHXr/FMOVXHr with FullFP16; otherwise promote to the FPR64 super-register and use FMOVDXr/FMOVXDr
- FPR8 <-> GPR32: Promote to the FPR32 super-register and use FMOVSWr/FMOVWSr
- FPR8 <-> GPR64: Promote to the FPR64 super-register and use FMOVDXr/FMOVXDr

Fixes #171494

Co-authored-by: Claude <noreply@anthropic.com>
1 parent 0eb00ef commit 6f3d70f

File tree

2 files changed

+205
-0
lines changed

2 files changed

+205
-0
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5851,6 +5851,102 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
58515851
return;
58525852
}
58535853

5854+
// Copies between GPR32 and FPR16.
5855+
if (AArch64::FPR16RegClass.contains(DestReg) &&
5856+
AArch64::GPR32RegClass.contains(SrcReg)) {
5857+
if (Subtarget.hasFullFP16()) {
5858+
BuildMI(MBB, I, DL, get(AArch64::FMOVWHr), DestReg)
5859+
.addReg(SrcReg, getKillRegState(KillSrc));
5860+
} else {
5861+
MCRegister DestRegS =
5862+
RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
5863+
BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestRegS)
5864+
.addReg(SrcReg, getKillRegState(KillSrc));
5865+
}
5866+
return;
5867+
}
5868+
if (AArch64::GPR32RegClass.contains(DestReg) &&
5869+
AArch64::FPR16RegClass.contains(SrcReg)) {
5870+
if (Subtarget.hasFullFP16()) {
5871+
BuildMI(MBB, I, DL, get(AArch64::FMOVHWr), DestReg)
5872+
.addReg(SrcReg, getKillRegState(KillSrc));
5873+
} else {
5874+
MCRegister SrcRegS =
5875+
RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
5876+
BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
5877+
.addReg(SrcRegS, RegState::Undef)
5878+
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5879+
}
5880+
return;
5881+
}
5882+
5883+
// Copies between GPR64 and FPR16.
5884+
if (AArch64::FPR16RegClass.contains(DestReg) &&
5885+
AArch64::GPR64RegClass.contains(SrcReg)) {
5886+
if (Subtarget.hasFullFP16()) {
5887+
BuildMI(MBB, I, DL, get(AArch64::FMOVXHr), DestReg)
5888+
.addReg(SrcReg, getKillRegState(KillSrc));
5889+
} else {
5890+
MCRegister DestRegD =
5891+
RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR64RegClass);
5892+
BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestRegD)
5893+
.addReg(SrcReg, getKillRegState(KillSrc));
5894+
}
5895+
return;
5896+
}
5897+
if (AArch64::GPR64RegClass.contains(DestReg) &&
5898+
AArch64::FPR16RegClass.contains(SrcReg)) {
5899+
if (Subtarget.hasFullFP16()) {
5900+
BuildMI(MBB, I, DL, get(AArch64::FMOVHXr), DestReg)
5901+
.addReg(SrcReg, getKillRegState(KillSrc));
5902+
} else {
5903+
MCRegister SrcRegD =
5904+
RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR64RegClass);
5905+
BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
5906+
.addReg(SrcRegD, RegState::Undef)
5907+
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5908+
}
5909+
return;
5910+
}
5911+
5912+
// Copies between GPR32 and FPR8.
5913+
if (AArch64::FPR8RegClass.contains(DestReg) &&
5914+
AArch64::GPR32RegClass.contains(SrcReg)) {
5915+
MCRegister DestRegS =
5916+
RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
5917+
BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestRegS)
5918+
.addReg(SrcReg, getKillRegState(KillSrc));
5919+
return;
5920+
}
5921+
if (AArch64::GPR32RegClass.contains(DestReg) &&
5922+
AArch64::FPR8RegClass.contains(SrcReg)) {
5923+
MCRegister SrcRegS =
5924+
RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
5925+
BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
5926+
.addReg(SrcRegS, RegState::Undef)
5927+
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5928+
return;
5929+
}
5930+
5931+
// Copies between GPR64 and FPR8.
5932+
if (AArch64::FPR8RegClass.contains(DestReg) &&
5933+
AArch64::GPR64RegClass.contains(SrcReg)) {
5934+
MCRegister DestRegD =
5935+
RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR64RegClass);
5936+
BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestRegD)
5937+
.addReg(SrcReg, getKillRegState(KillSrc));
5938+
return;
5939+
}
5940+
if (AArch64::GPR64RegClass.contains(DestReg) &&
5941+
AArch64::FPR8RegClass.contains(SrcReg)) {
5942+
MCRegister SrcRegD =
5943+
RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR64RegClass);
5944+
BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
5945+
.addReg(SrcRegD, RegState::Undef)
5946+
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5947+
return;
5948+
}
5949+
58545950
if (DestReg == AArch64::NZCV) {
58555951
assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
58565952
BuildMI(MBB, I, DL, get(AArch64::MSR))
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-NOFP16
3+
; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+fullfp16 -global-isel -global-isel-abort=1 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-FP16
4+
5+
; Test for https://github.com/llvm/llvm-project/issues/171494
6+
; Atomic store of bitcast half to i16 was generating incorrect code (mrs instead of fmov).
7+
8+
define void @atomic_store_half(ptr %addr, half %val) {
9+
; CHECK-NOFP16-LABEL: atomic_store_half:
10+
; CHECK-NOFP16: ; %bb.0:
11+
; CHECK-NOFP16-NEXT: fmov w8, s0
12+
; CHECK-NOFP16-NEXT: stlrh w8, [x0]
13+
; CHECK-NOFP16-NEXT: ret
14+
;
15+
; CHECK-FP16-LABEL: atomic_store_half:
16+
; CHECK-FP16: ; %bb.0:
17+
; CHECK-FP16-NEXT: fmov w8, h0
18+
; CHECK-FP16-NEXT: stlrh w8, [x0]
19+
; CHECK-FP16-NEXT: ret
20+
%ival = bitcast half %val to i16
21+
store atomic i16 %ival, ptr %addr release, align 2
22+
ret void
23+
}
24+
25+
define half @atomic_load_half(ptr %addr) {
26+
; CHECK-NOFP16-LABEL: atomic_load_half:
27+
; CHECK-NOFP16: ; %bb.0:
28+
; CHECK-NOFP16-NEXT: ldarh w8, [x0]
29+
; CHECK-NOFP16-NEXT: fmov s0, w8
30+
; CHECK-NOFP16-NEXT: ret
31+
;
32+
; CHECK-FP16-LABEL: atomic_load_half:
33+
; CHECK-FP16: ; %bb.0:
34+
; CHECK-FP16-NEXT: ldarh w8, [x0]
35+
; CHECK-FP16-NEXT: fmov h0, w8
36+
; CHECK-FP16-NEXT: ret
37+
%ival = load atomic i16, ptr %addr acquire, align 2
38+
%val = bitcast i16 %ival to half
39+
ret half %val
40+
}
41+
42+
define void @atomic_store_bfloat(ptr %addr, bfloat %val) {
43+
; CHECK-NOFP16-LABEL: atomic_store_bfloat:
44+
; CHECK-NOFP16: ; %bb.0:
45+
; CHECK-NOFP16-NEXT: fmov w8, s0
46+
; CHECK-NOFP16-NEXT: stlrh w8, [x0]
47+
; CHECK-NOFP16-NEXT: ret
48+
;
49+
; CHECK-FP16-LABEL: atomic_store_bfloat:
50+
; CHECK-FP16: ; %bb.0:
51+
; CHECK-FP16-NEXT: fmov w8, h0
52+
; CHECK-FP16-NEXT: stlrh w8, [x0]
53+
; CHECK-FP16-NEXT: ret
54+
%ival = bitcast bfloat %val to i16
55+
store atomic i16 %ival, ptr %addr release, align 2
56+
ret void
57+
}
58+
59+
define bfloat @atomic_load_bfloat(ptr %addr) {
60+
; CHECK-NOFP16-LABEL: atomic_load_bfloat:
61+
; CHECK-NOFP16: ; %bb.0:
62+
; CHECK-NOFP16-NEXT: ldarh w8, [x0]
63+
; CHECK-NOFP16-NEXT: fmov s0, w8
64+
; CHECK-NOFP16-NEXT: ret
65+
;
66+
; CHECK-FP16-LABEL: atomic_load_bfloat:
67+
; CHECK-FP16: ; %bb.0:
68+
; CHECK-FP16-NEXT: ldarh w8, [x0]
69+
; CHECK-FP16-NEXT: fmov h0, w8
70+
; CHECK-FP16-NEXT: ret
71+
%ival = load atomic i16, ptr %addr acquire, align 2
72+
%val = bitcast i16 %ival to bfloat
73+
ret bfloat %val
74+
}
75+
76+
; Test FPR8 to GPR32 copies (bitcast <1 x i8> to i8 for atomic store)
77+
define void @atomic_store_v1i8(ptr %addr, <1 x i8> %val) {
78+
; CHECK-NOFP16-LABEL: atomic_store_v1i8:
79+
; CHECK-NOFP16: ; %bb.0:
80+
; CHECK-NOFP16-NEXT: fmov w8, s0
81+
; CHECK-NOFP16-NEXT: stlrb w8, [x0]
82+
; CHECK-NOFP16-NEXT: ret
83+
;
84+
; CHECK-FP16-LABEL: atomic_store_v1i8:
85+
; CHECK-FP16: ; %bb.0:
86+
; CHECK-FP16-NEXT: fmov w8, s0
87+
; CHECK-FP16-NEXT: stlrb w8, [x0]
88+
; CHECK-FP16-NEXT: ret
89+
%ival = bitcast <1 x i8> %val to i8
90+
store atomic i8 %ival, ptr %addr release, align 1
91+
ret void
92+
}
93+
94+
define <1 x i8> @atomic_load_v1i8(ptr %addr) {
95+
; CHECK-NOFP16-LABEL: atomic_load_v1i8:
96+
; CHECK-NOFP16: ; %bb.0:
97+
; CHECK-NOFP16-NEXT: ldarb w8, [x0]
98+
; CHECK-NOFP16-NEXT: fmov s0, w8
99+
; CHECK-NOFP16-NEXT: ret
100+
;
101+
; CHECK-FP16-LABEL: atomic_load_v1i8:
102+
; CHECK-FP16: ; %bb.0:
103+
; CHECK-FP16-NEXT: ldarb w8, [x0]
104+
; CHECK-FP16-NEXT: fmov s0, w8
105+
; CHECK-FP16-NEXT: ret
106+
%ival = load atomic i8, ptr %addr acquire, align 1
107+
%val = bitcast i8 %ival to <1 x i8>
108+
ret <1 x i8> %val
109+
}

0 commit comments

Comments
 (0)