Skip to content

Commit 928a7e6

Browse files
authored
[AArch64] Use 0-cycle reg2reg MOVs for FPR32, FPR16, FPR8 (#144152)
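This change emits optimized copy instructions for the FPR32, FPR16, and FPR8 register classes on targets that support zero-cycle FPR64 register moves: when the subtarget has the `zcm-fpr64` feature but not `zcm-fpr32`, the copy is emitted as an `FMOVDr` on the enclosing D registers instead of an `FMOVSr`. The implementation is similar to what has been done for GPR32. It adds two regression tests, for FPR32 and FPR16. Depends on #143680 to resolve the test structure.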
1 parent f0c1a9a commit 928a7e6

File tree

4 files changed

+181
-14
lines changed

4 files changed

+181
-14
lines changed

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -615,6 +615,12 @@ def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r",
615615
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
616616
"Has zero-cycle register moves">;
617617

618+
def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFPR64", "true",
619+
"Has zero-cycle register moves for FPR64 registers">;
620+
621+
def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
622+
"Has zero-cycle register moves for FPR32 registers">;
623+
618624
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
619625
"Has zero-cycle zeroing instructions for generic registers">;
620626

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 62 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5302,30 +5302,78 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
53025302

53035303
if (AArch64::FPR32RegClass.contains(DestReg) &&
53045304
AArch64::FPR32RegClass.contains(SrcReg)) {
5305-
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5306-
.addReg(SrcReg, getKillRegState(KillSrc));
5305+
if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5306+
!Subtarget.hasZeroCycleRegMoveFPR32()) {
5307+
const TargetRegisterInfo *TRI = &getRegisterInfo();
5308+
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
5309+
&AArch64::FPR64RegClass);
5310+
MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
5311+
&AArch64::FPR64RegClass);
5312+
// This instruction is reading and writing D registers. This may upset
5313+
// the register scavenger and machine verifier, so we need to indicate
5314+
// that we are reading an undefined value from SrcRegD, but a proper
5315+
// value from SrcReg.
5316+
BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5317+
.addReg(SrcRegD, RegState::Undef)
5318+
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5319+
} else {
5320+
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5321+
.addReg(SrcReg, getKillRegState(KillSrc));
5322+
}
53075323
return;
53085324
}
53095325

53105326
if (AArch64::FPR16RegClass.contains(DestReg) &&
53115327
AArch64::FPR16RegClass.contains(SrcReg)) {
5312-
DestReg =
5313-
RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
5314-
SrcReg =
5315-
RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
5316-
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5317-
.addReg(SrcReg, getKillRegState(KillSrc));
5328+
if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5329+
!Subtarget.hasZeroCycleRegMoveFPR32()) {
5330+
const TargetRegisterInfo *TRI = &getRegisterInfo();
5331+
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
5332+
&AArch64::FPR64RegClass);
5333+
MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
5334+
&AArch64::FPR64RegClass);
5335+
// This instruction is reading and writing D registers. This may upset
5336+
// the register scavenger and machine verifier, so we need to indicate
5337+
// that we are reading an undefined value from SrcRegD, but a proper
5338+
// value from SrcReg.
5339+
BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5340+
.addReg(SrcRegD, RegState::Undef)
5341+
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5342+
} else {
5343+
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5344+
&AArch64::FPR32RegClass);
5345+
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5346+
&AArch64::FPR32RegClass);
5347+
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5348+
.addReg(SrcReg, getKillRegState(KillSrc));
5349+
}
53185350
return;
53195351
}
53205352

53215353
if (AArch64::FPR8RegClass.contains(DestReg) &&
53225354
AArch64::FPR8RegClass.contains(SrcReg)) {
5323-
DestReg =
5324-
RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
5325-
SrcReg =
5326-
RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
5327-
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5328-
.addReg(SrcReg, getKillRegState(KillSrc));
5355+
if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5356+
!Subtarget.hasZeroCycleRegMoveFPR32()) {
5357+
const TargetRegisterInfo *TRI = &getRegisterInfo();
5358+
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
5359+
&AArch64::FPR64RegClass);
5360+
MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
5361+
&AArch64::FPR64RegClass);
5362+
// This instruction is reading and writing D registers. This may upset
5363+
// the register scavenger and machine verifier, so we need to indicate
5364+
// that we are reading an undefined value from SrcRegD, but a proper
5365+
// value from SrcReg.
5366+
BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5367+
.addReg(SrcRegD, RegState::Undef)
5368+
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5369+
} else {
5370+
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5371+
&AArch64::FPR32RegClass);
5372+
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5373+
&AArch64::FPR32RegClass);
5374+
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5375+
.addReg(SrcReg, getKillRegState(KillSrc));
5376+
}
53295377
return;
53305378
}
53315379

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,7 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
312312
FeatureFuseAES, FeatureFuseCryptoEOR,
313313
FeatureStorePairSuppress,
314314
FeatureZCRegMove,
315+
FeatureZCRegMoveFPR64,
315316
FeatureZCZeroing,
316317
FeatureZCZeroingFPWorkaround]>;
317318

@@ -325,6 +326,7 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
325326
FeatureFuseCryptoEOR,
326327
FeatureStorePairSuppress,
327328
FeatureZCRegMove,
329+
FeatureZCRegMoveFPR64,
328330
FeatureZCZeroing]>;
329331

330332
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
@@ -337,6 +339,7 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
337339
FeatureFuseCryptoEOR,
338340
FeatureStorePairSuppress,
339341
FeatureZCRegMove,
342+
FeatureZCRegMoveFPR64,
340343
FeatureZCZeroing]>;
341344

342345
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
@@ -349,6 +352,7 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
349352
FeatureFuseCryptoEOR,
350353
FeatureStorePairSuppress,
351354
FeatureZCRegMove,
355+
FeatureZCRegMoveFPR64,
352356
FeatureZCZeroing]>;
353357

354358
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
@@ -361,6 +365,7 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
361365
FeatureFuseCryptoEOR,
362366
FeatureStorePairSuppress,
363367
FeatureZCRegMove,
368+
FeatureZCRegMoveFPR64,
364369
FeatureZCZeroing]>;
365370

366371
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
@@ -378,6 +383,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
378383
FeatureFuseLiterals,
379384
FeatureStorePairSuppress,
380385
FeatureZCRegMove,
386+
FeatureZCRegMoveFPR64,
381387
FeatureZCZeroing]>;
382388

383389
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
@@ -395,6 +401,7 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
395401
FeatureFuseLiterals,
396402
FeatureStorePairSuppress,
397403
FeatureZCRegMove,
404+
FeatureZCRegMoveFPR64,
398405
FeatureZCZeroing]>;
399406

400407
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
@@ -412,6 +419,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
412419
FeatureFuseLiterals,
413420
FeatureStorePairSuppress,
414421
FeatureZCRegMove,
422+
FeatureZCRegMoveFPR64,
415423
FeatureZCZeroing]>;
416424

417425
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
@@ -429,6 +437,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
429437
FeatureFuseLiterals,
430438
FeatureStorePairSuppress,
431439
FeatureZCRegMove,
440+
FeatureZCRegMoveFPR64,
432441
FeatureZCZeroing]>;
433442

434443
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
@@ -445,6 +454,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
445454
FeatureFuseCryptoEOR,
446455
FeatureFuseLiterals,
447456
FeatureZCRegMove,
457+
FeatureZCRegMoveFPR64,
448458
FeatureZCZeroing
449459
]>;
450460

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s -check-prefixes=NOTCPU-LINUX --match-full-lines
2+
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=NOTCPU-APPLE --match-full-lines
3+
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU --match-full-lines
4+
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr64 | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
5+
; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr64 | FileCheck %s -check-prefixes=ATTR --match-full-lines
6+
7+
define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) {
8+
entry:
9+
; CHECK-LABEL: t:
10+
; NOTCPU-LINUX: fmov s0, s2
11+
; NOTCPU-LINUX: fmov s1, s3
12+
; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
13+
; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
14+
; NOTCPU-LINUX-NEXT: bl {{_?foo_float}}
15+
; NOTCPU-LINUX: fmov s0, [[REG1]]
16+
; NOTCPU-LINUX: fmov s1, [[REG2]]
17+
18+
; NOTCPU-APPLE: fmov s0, s2
19+
; NOTCPU-APPLE: fmov s1, s3
20+
; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
21+
; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
22+
; NOTCPU-APPLE-NEXT: bl {{_?foo_float}}
23+
; NOTCPU-APPLE: fmov s0, [[REG1]]
24+
; NOTCPU-APPLE: fmov s1, [[REG2]]
25+
26+
; CPU: fmov [[REG2:d[0-9]+]], d3
27+
; CPU: fmov [[REG1:d[0-9]+]], d2
28+
; CPU: fmov d0, d2
29+
; CPU: fmov d1, d3
30+
; CPU-NEXT: bl {{_?foo_float}}
31+
; CPU: fmov d0, [[REG1]]
32+
; CPU: fmov d1, [[REG2]]
33+
34+
; NOTATTR: fmov [[REG2:s[0-9]+]], s3
35+
; NOTATTR: fmov [[REG1:s[0-9]+]], s2
36+
; NOTATTR: fmov s0, s2
37+
; NOTATTR: fmov s1, s3
38+
; NOTATTR-NEXT: bl {{_?foo_float}}
39+
; NOTATTR: fmov s0, [[REG1]]
40+
; NOTATTR: fmov s1, [[REG2]]
41+
42+
; ATTR: fmov d0, d2
43+
; ATTR: fmov d1, d3
44+
; ATTR: fmov [[REG2:d[0-9]+]], d3
45+
; ATTR: fmov [[REG1:d[0-9]+]], d2
46+
; ATTR-NEXT: bl {{_?foo_float}}
47+
; ATTR: fmov d0, [[REG1]]
48+
; ATTR: fmov d1, [[REG2]]
49+
%call = call float @foo_float(float %c, float %d)
50+
%call1 = call float @foo_float(float %c, float %d)
51+
unreachable
52+
}
53+
54+
declare float @foo_float(float, float)
55+
56+
define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) {
57+
entry:
58+
; CHECK-LABEL: t:
59+
; NOTCPU-LINUX: fmov s0, s2
60+
; NOTCPU-LINUX: fmov s1, s3
61+
; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
62+
; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
63+
; NOTCPU-LINUX-NEXT: bl {{_?foo_half}}
64+
; NOTCPU-LINUX: fmov s0, [[REG1]]
65+
; NOTCPU-LINUX: fmov s1, [[REG2]]
66+
67+
; NOTCPU-APPLE: fmov s0, s2
68+
; NOTCPU-APPLE: fmov s1, s3
69+
; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
70+
; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
71+
; NOTCPU-APPLE-NEXT: bl {{_?foo_half}}
72+
; NOTCPU-APPLE: fmov s0, [[REG1]]
73+
; NOTCPU-APPLE: fmov s1, [[REG2]]
74+
75+
; CPU: fmov [[REG2:d[0-9]+]], d3
76+
; CPU: fmov [[REG1:d[0-9]+]], d2
77+
; CPU: fmov d0, d2
78+
; CPU: fmov d1, d3
79+
; CPU-NEXT: bl {{_?foo_half}}
80+
; CPU: fmov d0, [[REG1]]
81+
; CPU: fmov d1, [[REG2]]
82+
83+
; NOTATTR: fmov [[REG2:s[0-9]+]], s3
84+
; NOTATTR: fmov [[REG1:s[0-9]+]], s2
85+
; NOTATTR: fmov s0, s2
86+
; NOTATTR: fmov s1, s3
87+
; NOTATTR-NEXT: bl {{_?foo_half}}
88+
; NOTATTR: fmov s0, [[REG1]]
89+
; NOTATTR: fmov s1, [[REG2]]
90+
91+
; ATTR: fmov d0, d2
92+
; ATTR: fmov d1, d3
93+
; ATTR: fmov [[REG2:d[0-9]+]], d3
94+
; ATTR: fmov [[REG1:d[0-9]+]], d2
95+
; ATTR-NEXT: bl {{_?foo_half}}
96+
; ATTR: fmov d0, [[REG1]]
97+
; ATTR: fmov d1, [[REG2]]
98+
%call = call half @foo_half(half %c, half %d)
99+
%call1 = call half @foo_half(half %c, half %d)
100+
unreachable
101+
}
102+
103+
declare half @foo_half(half, half)

0 commit comments

Comments
 (0)