Skip to content

Commit 9427bd0

Browse files
committed
AMDGPU: Handle vectors in copysign magnitude sign case
1 parent 5d89896 commit 9427bd0

File tree

4 files changed

+242
-291
lines changed

4 files changed

+242
-291
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11721,29 +11721,63 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
1172111721
DAGCombinerInfo &DCI) const {
1172211722
SDValue MagnitudeOp = N->getOperand(0);
1172311723
SDValue SignOp = N->getOperand(1);
11724+
11725+
// The generic combine for fcopysign + fp cast is too conservative with
11726+
// vectors, and also gets confused by the splitting we will perform here, so
11727+
// peek through FP casts.
11728+
if (SignOp.getOpcode() == ISD::FP_EXTEND ||
11729+
SignOp.getOpcode() == ISD::FP_ROUND)
11730+
SignOp = SignOp.getOperand(0);
11731+
1172411732
SelectionDAG &DAG = DCI.DAG;
1172511733
SDLoc DL(N);
11734+
EVT SignVT = SignOp.getValueType();
1172611735

1172711736
// f64 fcopysign is really an f32 copysign on the high bits, so replace the
1172811737
// lower half with a copy.
1172911738
// fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11730-
if (MagnitudeOp.getValueType() == MVT::f64) {
11731-
SDValue MagAsVector =
11732-
DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11733-
SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11734-
MagAsVector, DAG.getConstant(0, DL, MVT::i32));
11735-
SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11736-
MagAsVector, DAG.getConstant(1, DL, MVT::i32));
11739+
EVT MagVT = MagnitudeOp.getValueType();
11740+
if (MagVT.getScalarType() == MVT::f64) {
11741+
unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
11742+
11743+
EVT F32VT = MagVT.isVector()
11744+
? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
11745+
: MVT::v2f32;
11746+
11747+
SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
11748+
11749+
SmallVector<SDValue, 8> NewElts;
11750+
for (unsigned I = 0; I != NumElts; ++I) {
11751+
SDValue MagLo =
11752+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11753+
DAG.getConstant(2 * I, DL, MVT::i32));
11754+
SDValue MagHi =
11755+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11756+
DAG.getConstant(2 * I + 1, DL, MVT::i32));
1173711757

11738-
SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11758+
SDValue SignOpElt =
11759+
MagVT.isVector()
11760+
? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
11761+
SignOp, DAG.getConstant(I, DL, MVT::i32))
11762+
: SignOp;
11763+
11764+
SDValue HiOp =
11765+
DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
11766+
11767+
SDValue Vector =
11768+
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11769+
11770+
SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11771+
NewElts.push_back(NewElt);
11772+
}
1173911773

11740-
SDValue Vector =
11741-
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11774+
if (NewElts.size() == 1)
11775+
return NewElts[0];
1174211776

11743-
return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11777+
return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
1174411778
}
1174511779

11746-
if (SignOp.getValueType() != MVT::f64)
11780+
if (SignVT != MVT::f64)
1174711781
return SDValue();
1174811782

1174911783
// Reduce the width of the sign operand; we only need the highest bit.

llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
131131
; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd:
132132
; GFX9: ; %bb.0:
133133
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134-
; GFX9-NEXT: v_or_b32_e32 v6, 1, v5
135134
; GFX9-NEXT: v_or_b32_e32 v4, 1, v4
135+
; GFX9-NEXT: v_or_b32_e32 v6, 1, v5
136136
; GFX9-NEXT: v_cvt_f64_i32_e32 v[4:5], v4
137137
; GFX9-NEXT: v_cvt_f64_i32_e32 v[6:7], v6
138138
; GFX9-NEXT: s_brev_b32 s4, -2

0 commit comments

Comments
 (0)