Skip to content

Commit a920da7

Browse files
committed
AMDGPU: Handle vectors in copysign sign type combine
This avoids some ugly codegen on targets that predate 16-bit instructions, which currently results from f16 legalization effects. It also avoids regressions on newer targets in a future patch.
1 parent e39e990 commit a920da7

File tree

3 files changed

+129
-581
lines changed

3 files changed

+129
-581
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11737,9 +11737,10 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
1173711737
// lower half with a copy.
1173811738
// fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
1173911739
EVT MagVT = MagnitudeOp.getValueType();
11740-
if (MagVT.getScalarType() == MVT::f64) {
11741-
unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
1174211740

11741+
unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
11742+
11743+
if (MagVT.getScalarType() == MVT::f64) {
1174311744
EVT F32VT = MagVT.isVector()
1174411745
? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
1174511746
: MVT::v2f32;
@@ -11777,21 +11778,39 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
1177711778
return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
1177811779
}
1177911780

11780-
if (SignVT != MVT::f64)
11781+
if (SignVT.getScalarType() != MVT::f64)
1178111782
return SDValue();
1178211783

1178311784
// Reduce width of sign operand, we only need the highest bit.
1178411785
//
1178511786
// fcopysign f64:x, f64:y ->
1178611787
// fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
1178711788
// TODO: In some cases it might make sense to go all the way to f16.
11788-
SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11789-
SDValue SignAsF32 =
11790-
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11791-
DAG.getConstant(1, DL, MVT::i32));
11789+
11790+
EVT F32VT = MagVT.isVector()
11791+
? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
11792+
: MVT::v2f32;
11793+
11794+
SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
11795+
11796+
SmallVector<SDValue, 8> F32Signs;
11797+
for (unsigned I = 0; I != NumElts; ++I) {
11798+
// Take sign from odd elements of cast vector
11799+
SDValue SignAsF32 =
11800+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11801+
DAG.getConstant(2 * I + 1, DL, MVT::i32));
11802+
F32Signs.push_back(SignAsF32);
11803+
}
11804+
11805+
SDValue NewSign =
11806+
NumElts == 1
11807+
? F32Signs.back()
11808+
: DAG.getNode(ISD::BUILD_VECTOR, DL,
11809+
EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
11810+
F32Signs);
1179211811

1179311812
return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11794-
SignAsF32);
11813+
NewSign);
1179511814
}
1179611815

1179711816
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)

llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll

Lines changed: 76 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -4677,37 +4677,33 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m
46774677
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46784678
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
46794679
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
4680-
; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
4681-
; GCN-NEXT: v_cvt_f32_f64_e32 v3, v[4:5]
4680+
; GCN-NEXT: v_and_b32_e32 v2, 0x80000000, v5
4681+
; GCN-NEXT: v_and_b32_e32 v3, 0x80000000, v3
46824682
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4683-
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
46844683
; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15
4684+
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
46854685
; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
4686-
; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3
4687-
; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2
4688-
; GCN-NEXT: v_or_b32_e32 v1, v1, v3
4689-
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
4690-
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
4686+
; GCN-NEXT: v_or_b32_e32 v1, v1, v2
4687+
; GCN-NEXT: v_or_b32_e32 v0, v0, v3
46914688
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
4689+
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
46924690
; GCN-NEXT: s_setpc_b64 s[30:31]
46934691
;
46944692
; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64:
46954693
; GFX7: ; %bb.0:
46964694
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4697-
; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
4698-
; GFX7-NEXT: v_cvt_f32_f64_e32 v3, v[4:5]
4699-
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
47004695
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
4696+
; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v5
47014697
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4702-
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4703-
; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3
47044698
; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
4705-
; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2
4699+
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
4700+
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
4701+
; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v3
4702+
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
47064703
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
4707-
; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
47084704
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
4709-
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
47104705
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
4706+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
47114707
; GFX7-NEXT: s_setpc_b64 s[30:31]
47124708
;
47134709
; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64:
@@ -5585,35 +5581,31 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> i
55855581
; GCN: ; %bb.0:
55865582
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1
55875583
; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0
5588-
; GCN-NEXT: v_cvt_f32_f64_e32 v2, s[4:5]
5589-
; GCN-NEXT: v_cvt_f32_f64_e32 v3, s[2:3]
5590-
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
5591-
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
5584+
; GCN-NEXT: s_and_b32 s0, s3, 0x80000000
5585+
; GCN-NEXT: s_and_b32 s1, s5, 0x80000000
5586+
; GCN-NEXT: s_lshr_b32 s0, s0, 16
55925587
; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15
5588+
; GCN-NEXT: s_lshr_b32 s1, s1, 16
55935589
; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
5594-
; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3
5595-
; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2
5596-
; GCN-NEXT: v_or_b32_e32 v1, v1, v3
5597-
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
5590+
; GCN-NEXT: v_or_b32_e32 v1, s0, v1
5591+
; GCN-NEXT: v_or_b32_e32 v0, s1, v0
55985592
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
55995593
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
56005594
; GCN-NEXT: v_readfirstlane_b32 s0, v0
56015595
; GCN-NEXT: ; return to shader part epilog
56025596
;
56035597
; GFX7-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64:
56045598
; GFX7: ; %bb.0:
5605-
; GFX7-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
5606-
; GFX7-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
5607-
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1
5608-
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0
5609-
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
5610-
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
5611-
; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
5612-
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15
5613-
; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1
5614-
; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15
5615-
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
5616-
; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
5599+
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0
5600+
; GFX7-NEXT: s_and_b32 s0, s3, 0x80000000
5601+
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
5602+
; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
5603+
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1
5604+
; GFX7-NEXT: v_or_b32_e32 v1, s0, v1
5605+
; GFX7-NEXT: s_and_b32 s0, s5, 0x80000000
5606+
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
5607+
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
5608+
; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
56175609
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
56185610
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
56195611
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
@@ -6682,51 +6674,45 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64(<3 x bfloat> %m
66826674
; GCN: ; %bb.0:
66836675
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66846676
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
6685-
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
66866677
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
6687-
; GCN-NEXT: v_cvt_f32_f64_e32 v3, v[3:4]
6688-
; GCN-NEXT: v_cvt_f32_f64_e32 v4, v[5:6]
6689-
; GCN-NEXT: v_cvt_f32_f64_e32 v5, v[7:8]
6678+
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
6679+
; GCN-NEXT: v_and_b32_e32 v3, 0x80000000, v6
6680+
; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v8
6681+
; GCN-NEXT: v_and_b32_e32 v4, 0x80000000, v4
66906682
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
6691-
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
6683+
; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15
66926684
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
66936685
; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15
6694-
; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15
6686+
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
66956687
; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
6696-
; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5
6697-
; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4
6698-
; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3
6688+
; GCN-NEXT: v_or_b32_e32 v1, v1, v3
66996689
; GCN-NEXT: v_or_b32_e32 v2, v2, v5
6700-
; GCN-NEXT: v_or_b32_e32 v1, v1, v4
6701-
; GCN-NEXT: v_or_b32_e32 v0, v0, v3
6702-
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
6690+
; GCN-NEXT: v_or_b32_e32 v0, v0, v4
67036691
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
6692+
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
67046693
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
67056694
; GCN-NEXT: s_setpc_b64 s[30:31]
67066695
;
67076696
; GFX7-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64:
67086697
; GFX7: ; %bb.0:
67096698
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6710-
; GFX7-NEXT: v_cvt_f32_f64_e32 v3, v[3:4]
6711-
; GFX7-NEXT: v_cvt_f32_f64_e32 v4, v[5:6]
6712-
; GFX7-NEXT: v_cvt_f32_f64_e32 v5, v[7:8]
6713-
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
67146699
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
6700+
; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v6
6701+
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
6702+
; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
67156703
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
6704+
; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
6705+
; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v8
67166706
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
6717-
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
6718-
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
6719-
; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5
67206707
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15
6721-
; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4
6722-
; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
6723-
; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3
6708+
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
6709+
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
6710+
; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v4
6711+
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
67246712
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
6725-
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
6726-
; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
67276713
; GFX7-NEXT: v_or_b32_e32 v0, v0, v3
6728-
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
67296714
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
6715+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
67306716
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
67316717
; GFX7-NEXT: s_setpc_b64 s[30:31]
67326718
;
@@ -8082,66 +8068,58 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64(<4 x bfloat> %m
80828068
; GCN: ; %bb.0:
80838069
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80848070
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
8085-
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
80868071
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
80878072
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
8088-
; GCN-NEXT: v_cvt_f32_f64_e32 v4, v[4:5]
8089-
; GCN-NEXT: v_cvt_f32_f64_e32 v5, v[6:7]
8090-
; GCN-NEXT: v_cvt_f32_f64_e32 v6, v[8:9]
8091-
; GCN-NEXT: v_cvt_f32_f64_e32 v7, v[10:11]
8073+
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
8074+
; GCN-NEXT: v_and_b32_e32 v4, 0x80000000, v7
8075+
; GCN-NEXT: v_and_b32_e32 v6, 0x80000000, v11
8076+
; GCN-NEXT: v_and_b32_e32 v7, 0x80000000, v9
8077+
; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v5
80928078
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
8093-
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
8079+
; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15
80948080
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
8095-
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
80968081
; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15
8082+
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
80978083
; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15
8098-
; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15
8084+
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
80998085
; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
8100-
; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7
8101-
; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6
8102-
; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5
8103-
; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4
8104-
; GCN-NEXT: v_or_b32_e32 v3, v3, v7
8105-
; GCN-NEXT: v_or_b32_e32 v2, v2, v6
8106-
; GCN-NEXT: v_or_b32_e32 v1, v1, v5
8107-
; GCN-NEXT: v_or_b32_e32 v0, v0, v4
8108-
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
8086+
; GCN-NEXT: v_or_b32_e32 v1, v1, v4
8087+
; GCN-NEXT: v_or_b32_e32 v3, v3, v6
8088+
; GCN-NEXT: v_or_b32_e32 v2, v2, v7
8089+
; GCN-NEXT: v_or_b32_e32 v0, v0, v5
81098090
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
8110-
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
81118091
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
8092+
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
8093+
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
81128094
; GCN-NEXT: s_setpc_b64 s[30:31]
81138095
;
81148096
; GFX7-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64:
81158097
; GFX7: ; %bb.0:
81168098
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8117-
; GFX7-NEXT: v_cvt_f32_f64_e32 v4, v[4:5]
8118-
; GFX7-NEXT: v_cvt_f32_f64_e32 v5, v[6:7]
8119-
; GFX7-NEXT: v_cvt_f32_f64_e32 v6, v[8:9]
8120-
; GFX7-NEXT: v_cvt_f32_f64_e32 v7, v[10:11]
8121-
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
81228099
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
8123-
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
8100+
; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v7
8101+
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
8102+
; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
81248103
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
8104+
; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
8105+
; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v11
81258106
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
8126-
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
8127-
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
8128-
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
8129-
; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7
81308107
; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15
8131-
; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6
8108+
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
8109+
; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
8110+
; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v9
8111+
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
81328112
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15
8133-
; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5
8134-
; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
8135-
; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4
8113+
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
8114+
; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
8115+
; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v5
8116+
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
81368117
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
8137-
; GFX7-NEXT: v_or_b32_e32 v3, v3, v7
8138-
; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
8139-
; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
81408118
; GFX7-NEXT: v_or_b32_e32 v0, v0, v4
8141-
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
81428119
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
8143-
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
81448120
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
8121+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
8122+
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
81458123
; GFX7-NEXT: s_setpc_b64 s[30:31]
81468124
;
81478125
; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64:

0 commit comments

Comments
 (0)