Skip to content

Commit 4169270

Browse files
committed
AMDGPU: Handle vectors in copysign magnitude sign case
1 parent a53ec5b commit 4169270

File tree

4 files changed

+117
-107
lines changed

4 files changed

+117
-107
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11721,29 +11721,63 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
1172111721
DAGCombinerInfo &DCI) const {
1172211722
SDValue MagnitudeOp = N->getOperand(0);
1172311723
SDValue SignOp = N->getOperand(1);
11724+
11725+
// The generic combine for fcopysign + fp cast is too conservative with
11726+
// vectors, and also gets confused by the splitting we will perform here, so
11727+
// peek through FP casts.
11728+
if (SignOp.getOpcode() == ISD::FP_EXTEND ||
11729+
SignOp.getOpcode() == ISD::FP_ROUND)
11730+
SignOp = SignOp.getOperand(0);
11731+
1172411732
SelectionDAG &DAG = DCI.DAG;
1172511733
SDLoc DL(N);
11734+
EVT SignVT = SignOp.getValueType();
1172611735

1172711736
// f64 fcopysign is really an f32 copysign on the high bits, so replace the
1172811737
// lower half with a copy.
1172911738
// fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11730-
if (MagnitudeOp.getValueType() == MVT::f64) {
11731-
SDValue MagAsVector =
11732-
DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11733-
SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11734-
MagAsVector, DAG.getConstant(0, DL, MVT::i32));
11735-
SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11736-
MagAsVector, DAG.getConstant(1, DL, MVT::i32));
11739+
EVT MagVT = MagnitudeOp.getValueType();
11740+
if (MagVT.getScalarType() == MVT::f64) {
11741+
unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
11742+
11743+
EVT F32VT = MagVT.isVector()
11744+
? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
11745+
: MVT::v2f32;
11746+
11747+
SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
11748+
11749+
SmallVector<SDValue, 8> NewElts;
11750+
for (unsigned I = 0; I != NumElts; ++I) {
11751+
SDValue MagLo =
11752+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11753+
DAG.getConstant(2 * I, DL, MVT::i32));
11754+
SDValue MagHi =
11755+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11756+
DAG.getConstant(2 * I + 1, DL, MVT::i32));
1173711757

11738-
SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11758+
SDValue SignOpElt =
11759+
MagVT.isVector()
11760+
? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
11761+
SignOp, DAG.getConstant(I, DL, MVT::i32))
11762+
: SignOp;
11763+
11764+
SDValue HiOp =
11765+
DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
11766+
11767+
SDValue Vector =
11768+
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11769+
11770+
SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11771+
NewElts.push_back(NewElt);
11772+
}
1173911773

11740-
SDValue Vector =
11741-
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11774+
if (NewElts.size() == 1)
11775+
return NewElts[0];
1174211776

11743-
return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11777+
return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
1174411778
}
1174511779

11746-
if (SignOp.getValueType() != MVT::f64)
11780+
if (SignVT != MVT::f64)
1174711781
return SDValue();
1174811782

1174911783
// Reduce the width of the sign operand; we only need the highest bit.

llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
131131
; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd:
132132
; GFX9: ; %bb.0:
133133
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134-
; GFX9-NEXT: v_or_b32_e32 v6, 1, v5
135134
; GFX9-NEXT: v_or_b32_e32 v4, 1, v4
135+
; GFX9-NEXT: v_or_b32_e32 v6, 1, v5
136136
; GFX9-NEXT: v_cvt_f64_i32_e32 v[4:5], v4
137137
; GFX9-NEXT: v_cvt_f64_i32_e32 v[6:7], v6
138138
; GFX9-NEXT: s_brev_b32 s4, -2

llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll

Lines changed: 40 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -4055,50 +4055,38 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> %ma
40554055
; GCN-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
40564056
; GCN: ; %bb.0:
40574057
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4058-
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
4059-
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
40604058
; GCN-NEXT: s_brev_b32 s4, -2
4061-
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
4062-
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
4063-
; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
4064-
; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
4065-
; GCN-NEXT: v_bfi_b32 v1, s4, v1, v7
4059+
; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4
40664060
; GCN-NEXT: v_bfi_b32 v3, s4, v3, v5
40674061
; GCN-NEXT: s_setpc_b64 s[30:31]
40684062
;
40694063
; GFX7-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
40704064
; GFX7: ; %bb.0:
40714065
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4072-
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
4073-
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
4074-
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
4075-
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
4076-
; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
4077-
; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
40784066
; GFX7-NEXT: s_brev_b32 s4, -2
4079-
; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v5
4080-
; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v7
4067+
; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v4
4068+
; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v5
40814069
; GFX7-NEXT: s_setpc_b64 s[30:31]
40824070
;
40834071
; GFX8-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
40844072
; GFX8: ; %bb.0:
40854073
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4086-
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
4087-
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
4074+
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
4075+
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
40884076
; GFX8-NEXT: s_brev_b32 s4, -2
4089-
; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v4
4090-
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
4077+
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
4078+
; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5
40914079
; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v4
40924080
; GFX8-NEXT: s_setpc_b64 s[30:31]
40934081
;
40944082
; GFX9-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
40954083
; GFX9: ; %bb.0:
40964084
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4097-
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
4098-
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
4085+
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
4086+
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
40994087
; GFX9-NEXT: s_brev_b32 s4, -2
4100-
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4
4101-
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
4088+
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
4089+
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
41024090
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4
41034091
; GFX9-NEXT: s_setpc_b64 s[30:31]
41044092
;
@@ -4969,71 +4957,63 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x floa
49694957
define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> inreg %mag, <2 x bfloat> inreg %sign) {
49704958
; GCN-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
49714959
; GCN: ; %bb.0:
4972-
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s5
4973-
; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4
4974-
; GCN-NEXT: s_brev_b32 s4, -2
4975-
; GCN-NEXT: v_mov_b32_e32 v4, s3
4976-
; GCN-NEXT: v_mov_b32_e32 v5, s1
4977-
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
4978-
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
4979-
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
4980-
; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
4981-
; GCN-NEXT: v_bfi_b32 v0, s4, v4, v3
4982-
; GCN-NEXT: v_bfi_b32 v1, s4, v5, v1
4960+
; GCN-NEXT: s_brev_b32 s6, -2
4961+
; GCN-NEXT: v_mov_b32_e32 v0, s3
4962+
; GCN-NEXT: v_mov_b32_e32 v1, s5
4963+
; GCN-NEXT: v_mov_b32_e32 v2, s1
4964+
; GCN-NEXT: v_mov_b32_e32 v3, s4
4965+
; GCN-NEXT: v_bfi_b32 v0, s6, v0, v1
4966+
; GCN-NEXT: v_bfi_b32 v1, s6, v2, v3
49834967
; GCN-NEXT: v_readfirstlane_b32 s1, v1
49844968
; GCN-NEXT: v_readfirstlane_b32 s3, v0
49854969
; GCN-NEXT: ; return to shader part epilog
49864970
;
49874971
; GFX7-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
49884972
; GFX7: ; %bb.0:
4989-
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s5
4990-
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4
4991-
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
4992-
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
4993-
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
4994-
; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
4995-
; GFX7-NEXT: s_brev_b32 s4, -2
4973+
; GFX7-NEXT: s_brev_b32 s6, -2
49964974
; GFX7-NEXT: v_mov_b32_e32 v0, s3
4997-
; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
4975+
; GFX7-NEXT: v_mov_b32_e32 v1, s5
4976+
; GFX7-NEXT: v_bfi_b32 v0, s6, v0, v1
49984977
; GFX7-NEXT: v_mov_b32_e32 v1, s1
4999-
; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v3
4978+
; GFX7-NEXT: v_mov_b32_e32 v2, s4
4979+
; GFX7-NEXT: v_bfi_b32 v1, s6, v1, v2
50004980
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
50014981
; GFX7-NEXT: v_readfirstlane_b32 s3, v0
50024982
; GFX7-NEXT: ; return to shader part epilog
50034983
;
50044984
; GFX8-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
50054985
; GFX8: ; %bb.0:
5006-
; GFX8-NEXT: s_lshr_b32 s5, s4, 16
50074986
; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s4
5008-
; GFX8-NEXT: s_brev_b32 s4, -2
4987+
; GFX8-NEXT: s_brev_b32 s5, -2
50094988
; GFX8-NEXT: v_mov_b32_e32 v1, s1
5010-
; GFX8-NEXT: v_bfi_b32 v0, s4, v1, v0
5011-
; GFX8-NEXT: v_lshlrev_b32_e64 v1, 16, s5
4989+
; GFX8-NEXT: s_lshr_b32 s1, s4, 16
4990+
; GFX8-NEXT: v_bfi_b32 v0, s5, v1, v0
4991+
; GFX8-NEXT: v_lshlrev_b32_e64 v1, 16, s1
50124992
; GFX8-NEXT: v_mov_b32_e32 v2, s3
5013-
; GFX8-NEXT: v_bfi_b32 v1, s4, v2, v1
4993+
; GFX8-NEXT: v_bfi_b32 v1, s5, v2, v1
50144994
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
50154995
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
50164996
; GFX8-NEXT: ; return to shader part epilog
50174997
;
50184998
; GFX9-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
50194999
; GFX9: ; %bb.0:
5020-
; GFX9-NEXT: s_lshr_b32 s5, s4, 16
50215000
; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s4
5022-
; GFX9-NEXT: s_brev_b32 s4, -2
5001+
; GFX9-NEXT: s_brev_b32 s5, -2
50235002
; GFX9-NEXT: v_mov_b32_e32 v1, s1
5024-
; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
5025-
; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s5
5003+
; GFX9-NEXT: s_lshr_b32 s1, s4, 16
5004+
; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0
5005+
; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s1
50265006
; GFX9-NEXT: v_mov_b32_e32 v2, s3
5027-
; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v1
5007+
; GFX9-NEXT: v_bfi_b32 v1, s5, v2, v1
50285008
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
50295009
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
50305010
; GFX9-NEXT: ; return to shader part epilog
50315011
;
50325012
; GFX10-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
50335013
; GFX10: ; %bb.0:
5034-
; GFX10-NEXT: s_lshr_b32 s5, s4, 16
50355014
; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s4
5036-
; GFX10-NEXT: v_lshlrev_b32_e64 v1, 16, s5
5015+
; GFX10-NEXT: s_lshr_b32 s4, s4, 16
5016+
; GFX10-NEXT: v_lshlrev_b32_e64 v1, 16, s4
50375017
; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
50385018
; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
50395019
; GFX10-NEXT: v_readfirstlane_b32 s1, v0
@@ -5042,14 +5022,15 @@ define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x doub
50425022
;
50435023
; GFX11-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
50445024
; GFX11: ; %bb.0:
5045-
; GFX11-NEXT: s_lshr_b32 s5, s4, 16
50465025
; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4
5047-
; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s5
5048-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
5026+
; GFX11-NEXT: s_lshr_b32 s4, s4, 16
5027+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
5028+
; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s4
50495029
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
5050-
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
50515030
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
5031+
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
50525032
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
5033+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
50535034
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
50545035
; GFX11-NEXT: ; return to shader part epilog
50555036
%sign.ext = fpext <2 x bfloat> %sign to <2 x double>

llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll

Lines changed: 30 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -3365,34 +3365,30 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> %mag
33653365
; SI-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
33663366
; SI: ; %bb.0:
33673367
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3368-
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
3369-
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
33703368
; SI-NEXT: s_brev_b32 s4, -2
3371-
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
3372-
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
33733369
; SI-NEXT: v_bfi_b32 v1, s4, v1, v4
33743370
; SI-NEXT: v_bfi_b32 v3, s4, v3, v5
33753371
; SI-NEXT: s_setpc_b64 s[30:31]
33763372
;
33773373
; VI-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
33783374
; VI: ; %bb.0:
33793375
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3380-
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
3381-
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3376+
; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4
3377+
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
33823378
; VI-NEXT: s_brev_b32 s4, -2
3383-
; VI-NEXT: v_bfi_b32 v1, s4, v1, v4
3384-
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
3379+
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3380+
; VI-NEXT: v_bfi_b32 v1, s4, v1, v5
33853381
; VI-NEXT: v_bfi_b32 v3, s4, v3, v4
33863382
; VI-NEXT: s_setpc_b64 s[30:31]
33873383
;
33883384
; GFX9-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
33893385
; GFX9: ; %bb.0:
33903386
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3391-
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
3392-
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3387+
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
3388+
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
33933389
; GFX9-NEXT: s_brev_b32 s4, -2
3394-
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4
3395-
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
3390+
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3391+
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
33963392
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4
33973393
; GFX9-NEXT: s_setpc_b64 s[30:31]
33983394
;
@@ -4294,57 +4290,56 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2f16(<2 x float
42944290
define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> inreg %mag, <2 x half> inreg %sign) {
42954291
; SI-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
42964292
; SI: ; %bb.0:
4297-
; SI-NEXT: v_cvt_f16_f32_e32 v0, s5
4298-
; SI-NEXT: v_cvt_f16_f32_e32 v1, s4
4299-
; SI-NEXT: s_brev_b32 s4, -2
4300-
; SI-NEXT: v_mov_b32_e32 v2, s3
4301-
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4302-
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4303-
; SI-NEXT: v_bfi_b32 v0, s4, v2, v0
4304-
; SI-NEXT: v_mov_b32_e32 v2, s1
4305-
; SI-NEXT: v_bfi_b32 v1, s4, v2, v1
4293+
; SI-NEXT: s_brev_b32 s6, -2
4294+
; SI-NEXT: v_mov_b32_e32 v0, s3
4295+
; SI-NEXT: v_mov_b32_e32 v1, s5
4296+
; SI-NEXT: v_bfi_b32 v0, s6, v0, v1
4297+
; SI-NEXT: v_mov_b32_e32 v1, s1
4298+
; SI-NEXT: v_mov_b32_e32 v2, s4
4299+
; SI-NEXT: v_bfi_b32 v1, s6, v1, v2
43064300
; SI-NEXT: v_readfirstlane_b32 s1, v1
43074301
; SI-NEXT: v_readfirstlane_b32 s3, v0
43084302
; SI-NEXT: ; return to shader part epilog
43094303
;
43104304
; VI-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
43114305
; VI: ; %bb.0:
4312-
; VI-NEXT: s_lshr_b32 s5, s4, 16
43134306
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s4
4314-
; VI-NEXT: s_brev_b32 s4, -2
4307+
; VI-NEXT: s_brev_b32 s5, -2
43154308
; VI-NEXT: v_mov_b32_e32 v1, s1
4316-
; VI-NEXT: v_bfi_b32 v0, s4, v1, v0
4317-
; VI-NEXT: v_lshlrev_b32_e64 v1, 16, s5
4309+
; VI-NEXT: s_lshr_b32 s1, s4, 16
4310+
; VI-NEXT: v_bfi_b32 v0, s5, v1, v0
4311+
; VI-NEXT: v_lshlrev_b32_e64 v1, 16, s1
43184312
; VI-NEXT: v_mov_b32_e32 v2, s3
4319-
; VI-NEXT: v_bfi_b32 v1, s4, v2, v1
4313+
; VI-NEXT: v_bfi_b32 v1, s5, v2, v1
43204314
; VI-NEXT: v_readfirstlane_b32 s1, v0
43214315
; VI-NEXT: v_readfirstlane_b32 s3, v1
43224316
; VI-NEXT: ; return to shader part epilog
43234317
;
43244318
; GFX9-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
43254319
; GFX9: ; %bb.0:
4326-
; GFX9-NEXT: s_lshr_b32 s5, s4, 16
43274320
; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s4
4328-
; GFX9-NEXT: s_brev_b32 s4, -2
4321+
; GFX9-NEXT: s_brev_b32 s5, -2
43294322
; GFX9-NEXT: v_mov_b32_e32 v1, s1
4330-
; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
4331-
; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s5
4323+
; GFX9-NEXT: s_lshr_b32 s1, s4, 16
4324+
; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0
4325+
; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s1
43324326
; GFX9-NEXT: v_mov_b32_e32 v2, s3
4333-
; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v1
4327+
; GFX9-NEXT: v_bfi_b32 v1, s5, v2, v1
43344328
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
43354329
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
43364330
; GFX9-NEXT: ; return to shader part epilog
43374331
;
43384332
; GFX11-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
43394333
; GFX11: ; %bb.0:
4340-
; GFX11-NEXT: s_lshr_b32 s5, s4, 16
43414334
; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4
4342-
; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s5
4343-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
4335+
; GFX11-NEXT: s_lshr_b32 s4, s4, 16
4336+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
4337+
; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s4
43444338
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
4345-
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
43464339
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
4340+
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
43474341
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
4342+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
43484343
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
43494344
; GFX11-NEXT: ; return to shader part epilog
43504345
%sign.ext = fpext <2 x half> %sign to <2 x double>

0 commit comments

Comments
 (0)