Skip to content

Commit c480874

Browse files
authored
[AArch64][CostModel] Alter sdiv/srem cost where the divisor is constant (#123552)
This patch revises the cost model for sdiv/srem and draws its inspiration from the udiv/urem patch #122236. The typical codegen for the different scenarios is documented as notes/comments in the code itself (there are too many scenarios to list them all here in the patch description).
1 parent 58fc4b1 commit c480874

File tree

8 files changed

+608
-568
lines changed

8 files changed

+608
-568
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 105 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "llvm/CodeGen/BasicTTIImpl.h"
1919
#include "llvm/CodeGen/CostTable.h"
2020
#include "llvm/CodeGen/TargetLowering.h"
21+
#include "llvm/IR/DerivedTypes.h"
2122
#include "llvm/IR/IntrinsicInst.h"
2223
#include "llvm/IR/Intrinsics.h"
2324
#include "llvm/IR/IntrinsicsAArch64.h"
@@ -3531,23 +3532,111 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
35313532
default:
35323533
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
35333534
Op2Info);
3535+
case ISD::SREM:
35343536
case ISD::SDIV:
3535-
if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3536-
// On AArch64, scalar signed division by constants power-of-two are
3537-
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
3538-
// The OperandValue properties many not be same as that of previous
3539-
// operation; conservatively assume OP_None.
3540-
InstructionCost Cost = getArithmeticInstrCost(
3541-
Instruction::Add, Ty, CostKind,
3542-
Op1Info.getNoProps(), Op2Info.getNoProps());
3543-
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
3544-
Op1Info.getNoProps(), Op2Info.getNoProps());
3545-
Cost += getArithmeticInstrCost(
3546-
Instruction::Select, Ty, CostKind,
3547-
Op1Info.getNoProps(), Op2Info.getNoProps());
3548-
Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3549-
Op1Info.getNoProps(), Op2Info.getNoProps());
3550-
return Cost;
3537+
/*
3538+
Notes for sdiv/srem specific costs:
3539+
1. This only considers the cases where the divisor is constant, uniform and
3540+
(pow-of-2/non-pow-of-2). Other cases are not important since they either
3541+
result in some form of (ldr + adrp), corresponding to constant vectors, or
3542+
scalarization of the division operation.
3543+
2. Constant divisors, either negative in whole or partially, don't result in
3544+
significantly different codegen as compared to positive constant divisors.
3545+
So, we don't consider negative divisors separately.
3546+
3. If the codegen is significantly different with SVE, it has been indicated
3547+
using comments at appropriate places.
3548+
3549+
sdiv specific cases:
3550+
-----------------------------------------------------------------------
3551+
codegen | pow-of-2 | Type
3552+
-----------------------------------------------------------------------
3553+
add + cmp + csel + asr | Y | i64
3554+
add + cmp + csel + asr | Y | i32
3555+
-----------------------------------------------------------------------
3556+
3557+
srem specific cases:
3558+
-----------------------------------------------------------------------
3559+
codegen | pow-of-2 | Type
3560+
-----------------------------------------------------------------------
3561+
negs + and + and + csneg | Y | i64
3562+
negs + and + and + csneg | Y | i32
3563+
-----------------------------------------------------------------------
3564+
3565+
other sdiv/srem cases:
3566+
-------------------------------------------------------------------------
3567+
common codegen | + srem | + sdiv | pow-of-2 | Type
3568+
-------------------------------------------------------------------------
3569+
smulh + asr + add + add | - | - | N | i64
3570+
smull + lsr + add + add | - | - | N | i32
3571+
usra | and + sub | sshr | Y | <2 x i64>
3572+
2 * (scalar code) | - | - | N | <2 x i64>
3573+
usra | bic + sub | sshr + neg | Y | <4 x i32>
3574+
smull2 + smull + uzp2 | mls | - | N | <4 x i32>
3575+
+ sshr + usra | | | |
3576+
-------------------------------------------------------------------------
3577+
*/
3578+
if (Op2Info.isConstant() && Op2Info.isUniform()) {
3579+
InstructionCost AddCost =
3580+
getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
3581+
Op1Info.getNoProps(), Op2Info.getNoProps());
3582+
InstructionCost AsrCost =
3583+
getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3584+
Op1Info.getNoProps(), Op2Info.getNoProps());
3585+
InstructionCost MulCost =
3586+
getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
3587+
Op1Info.getNoProps(), Op2Info.getNoProps());
3588+
// add/cmp/csel/csneg should have similar cost while asr/negs/and should
3589+
// have similar cost.
3590+
auto VT = TLI->getValueType(DL, Ty);
3591+
if (LT.second.isScalarInteger() && VT.getSizeInBits() <= 64) {
3592+
if (Op2Info.isPowerOf2()) {
3593+
return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
3594+
: (3 * AsrCost + AddCost);
3595+
} else {
3596+
return MulCost + AsrCost + 2 * AddCost;
3597+
}
3598+
} else if (VT.isVector()) {
3599+
InstructionCost UsraCost = 2 * AsrCost;
3600+
if (Op2Info.isPowerOf2()) {
3601+
// Division with scalable types corresponds to native 'asrd'
3602+
// instruction when SVE is available.
3603+
// e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
3604+
if (Ty->isScalableTy() && ST->hasSVE())
3605+
return 2 * AsrCost;
3606+
return UsraCost +
3607+
(ISD == ISD::SDIV
3608+
? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) *
3609+
AsrCost
3610+
: 2 * AddCost);
3611+
} else if (LT.second == MVT::v2i64) {
3612+
return VT.getVectorNumElements() *
3613+
getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
3614+
Op1Info.getNoProps(),
3615+
Op2Info.getNoProps());
3616+
} else {
3617+
// When SVE is available, we get:
3618+
// smulh + lsr + add/sub + asr + add/sub.
3619+
if (Ty->isScalableTy() && ST->hasSVE())
3620+
return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
3621+
return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
3622+
}
3623+
}
3624+
}
3625+
if (Op2Info.isConstant() && !Op2Info.isUniform() &&
3626+
LT.second.isFixedLengthVector()) {
3627+
// FIXME: When the constant vector is non-uniform, this may result in
3628+
// loading the vector from constant pool or in some cases, may also result
3629+
// in scalarization. For now, we are approximating this with the
3630+
// scalarization cost.
3631+
auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
3632+
CostKind, -1, nullptr, nullptr);
3633+
auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
3634+
CostKind, -1, nullptr, nullptr);
3635+
unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
3636+
return ExtractCost + InsertCost +
3637+
NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
3638+
CostKind, Op1Info.getNoProps(),
3639+
Op2Info.getNoProps());
35513640
}
35523641
[[fallthrough]];
35533642
case ISD::UDIV:
@@ -3587,23 +3676,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
35873676
AddCost * 2 + ShrCost;
35883677
return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
35893678
}
3590-
3591-
// TODO: Fix SDIV and SREM costs, similar to the above.
3592-
if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT) &&
3593-
Op2Info.isUniform() && !VT.isScalableVector()) {
3594-
// Vector signed division by constant are expanded to the
3595-
// sequence MULHS + ADD/SUB + SRA + SRL + ADD.
3596-
InstructionCost MulCost =
3597-
getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
3598-
Op1Info.getNoProps(), Op2Info.getNoProps());
3599-
InstructionCost AddCost =
3600-
getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
3601-
Op1Info.getNoProps(), Op2Info.getNoProps());
3602-
InstructionCost ShrCost =
3603-
getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3604-
Op1Info.getNoProps(), Op2Info.getNoProps());
3605-
return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
3606-
}
36073679
}
36083680

36093681
// div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are

0 commit comments

Comments
 (0)