|
18 | 18 | #include "llvm/CodeGen/BasicTTIImpl.h"
|
19 | 19 | #include "llvm/CodeGen/CostTable.h"
|
20 | 20 | #include "llvm/CodeGen/TargetLowering.h"
|
| 21 | +#include "llvm/IR/DerivedTypes.h" |
21 | 22 | #include "llvm/IR/IntrinsicInst.h"
|
22 | 23 | #include "llvm/IR/Intrinsics.h"
|
23 | 24 | #include "llvm/IR/IntrinsicsAArch64.h"
|
@@ -3531,23 +3532,111 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
|
3531 | 3532 | default:
|
3532 | 3533 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
|
3533 | 3534 | Op2Info);
|
| 3535 | + case ISD::SREM: |
3534 | 3536 | case ISD::SDIV:
|
3535 |
| - if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) { |
3536 |
| - // On AArch64, scalar signed division by constants power-of-two are |
3537 |
| - // normally expanded to the sequence ADD + CMP + SELECT + SRA. |
3538 |
| - // The OperandValue properties many not be same as that of previous |
3539 |
| - // operation; conservatively assume OP_None. |
3540 |
| - InstructionCost Cost = getArithmeticInstrCost( |
3541 |
| - Instruction::Add, Ty, CostKind, |
3542 |
| - Op1Info.getNoProps(), Op2Info.getNoProps()); |
3543 |
| - Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, |
3544 |
| - Op1Info.getNoProps(), Op2Info.getNoProps()); |
3545 |
| - Cost += getArithmeticInstrCost( |
3546 |
| - Instruction::Select, Ty, CostKind, |
3547 |
| - Op1Info.getNoProps(), Op2Info.getNoProps()); |
3548 |
| - Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, |
3549 |
| - Op1Info.getNoProps(), Op2Info.getNoProps()); |
3550 |
| - return Cost; |
| 3537 | + /* |
| 3538 | + Notes for sdiv/srem specific costs: |
| 3539 | + 1. This only considers the cases where the divisor is constant, uniform and |
| 3540 | + (pow-of-2/non-pow-of-2). Other cases are not important since they either |
| 3541 | + result in some form of (ldr + adrp), corresponding to constant vectors, or |
| 3542 | + scalarization of the division operation. |
| 3543 | + 2. Constant divisors, either negative in whole or partially, don't result in |
| 3544 | + significantly different codegen as compared to positive constant divisors. |
| 3545 | + So, we don't consider negative divisors separately. |
| 3546 | + 3. If the codegen is significantly different with SVE, it has been indicated |
| 3547 | + using comments at appropriate places. |
| 3548 | +
|
| 3549 | + sdiv specific cases: |
| 3550 | + ----------------------------------------------------------------------- |
| 3551 | + codegen | pow-of-2 | Type |
| 3552 | + ----------------------------------------------------------------------- |
| 3553 | + add + cmp + csel + asr | Y | i64 |
| 3554 | + add + cmp + csel + asr | Y | i32 |
| 3555 | + ----------------------------------------------------------------------- |
| 3556 | +
|
| 3557 | + srem specific cases: |
| 3558 | + ----------------------------------------------------------------------- |
| 3559 | + codegen | pow-of-2 | Type |
| 3560 | + ----------------------------------------------------------------------- |
| 3561 | + negs + and + and + csneg | Y | i64 |
| 3562 | + negs + and + and + csneg | Y | i32 |
| 3563 | + ----------------------------------------------------------------------- |
| 3564 | +
|
| 3565 | + other sdiv/srem cases: |
| 3566 | + ------------------------------------------------------------------------- |
| 3567 | + common codegen | + srem | + sdiv | pow-of-2 | Type |
| 3568 | + ------------------------------------------------------------------------- |
| 3569 | + smulh + asr + add + add | - | - | N | i64 |
| 3570 | + smull + lsr + add + add | - | - | N | i32 |
| 3571 | + usra | and + sub | sshr | Y | <2 x i64> |
| 3572 | + 2 * (scalar code) | - | - | N | <2 x i64> |
| 3573 | + usra | bic + sub | sshr + neg | Y | <4 x i32> |
| 3574 | + smull2 + smull + uzp2 | mls | - | N | <4 x i32> |
| 3575 | + + sshr + usra | | | | |
| 3576 | + ------------------------------------------------------------------------- |
| 3577 | + */ |
| 3578 | + if (Op2Info.isConstant() && Op2Info.isUniform()) { |
| 3579 | + InstructionCost AddCost = |
| 3580 | + getArithmeticInstrCost(Instruction::Add, Ty, CostKind, |
| 3581 | + Op1Info.getNoProps(), Op2Info.getNoProps()); |
| 3582 | + InstructionCost AsrCost = |
| 3583 | + getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, |
| 3584 | + Op1Info.getNoProps(), Op2Info.getNoProps()); |
| 3585 | + InstructionCost MulCost = |
| 3586 | + getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, |
| 3587 | + Op1Info.getNoProps(), Op2Info.getNoProps()); |
| 3588 | + // add/cmp/csel/csneg should have similar cost while asr/negs/and should |
| 3589 | + // have similar cost. |
| 3590 | + auto VT = TLI->getValueType(DL, Ty); |
| 3591 | + if (LT.second.isScalarInteger() && VT.getSizeInBits() <= 64) { |
| 3592 | + if (Op2Info.isPowerOf2()) { |
| 3593 | + return ISD == ISD::SDIV ? (3 * AddCost + AsrCost) |
| 3594 | + : (3 * AsrCost + AddCost); |
| 3595 | + } else { |
| 3596 | + return MulCost + AsrCost + 2 * AddCost; |
| 3597 | + } |
| 3598 | + } else if (VT.isVector()) { |
| 3599 | + InstructionCost UsraCost = 2 * AsrCost; |
| 3600 | + if (Op2Info.isPowerOf2()) { |
| 3601 | + // Division with scalable types corresponds to native 'asrd' |
| 3602 | + // instruction when SVE is available. |
| 3603 | + // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8) |
| 3604 | + if (Ty->isScalableTy() && ST->hasSVE()) |
| 3605 | + return 2 * AsrCost; |
| 3606 | + return UsraCost + |
| 3607 | + (ISD == ISD::SDIV |
| 3608 | + ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * |
| 3609 | + AsrCost |
| 3610 | + : 2 * AddCost); |
| 3611 | + } else if (LT.second == MVT::v2i64) { |
| 3612 | + return VT.getVectorNumElements() * |
| 3613 | + getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind, |
| 3614 | + Op1Info.getNoProps(), |
| 3615 | + Op2Info.getNoProps()); |
| 3616 | + } else { |
| 3617 | + // When SVE is available, we get: |
| 3618 | + // smulh + lsr + add/sub + asr + add/sub. |
| 3619 | + if (Ty->isScalableTy() && ST->hasSVE()) |
| 3620 | + return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost; |
| 3621 | + return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost; |
| 3622 | + } |
| 3623 | + } |
| 3624 | + } |
| 3625 | + if (Op2Info.isConstant() && !Op2Info.isUniform() && |
| 3626 | + LT.second.isFixedLengthVector()) { |
| 3627 | + // FIXME: When the constant vector is non-uniform, this may result in |
| 3628 | + // loading the vector from constant pool or in some cases, may also result |
| 3629 | + // in scalarization. For now, we are approximating this with the |
| 3630 | + // scalarization cost. |
| 3631 | + auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty, |
| 3632 | + CostKind, -1, nullptr, nullptr); |
| 3633 | + auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty, |
| 3634 | + CostKind, -1, nullptr, nullptr); |
| 3635 | + unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements(); |
| 3636 | + return ExtractCost + InsertCost + |
| 3637 | + NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(), |
| 3638 | + CostKind, Op1Info.getNoProps(), |
| 3639 | + Op2Info.getNoProps()); |
3551 | 3640 | }
|
3552 | 3641 | [[fallthrough]];
|
3553 | 3642 | case ISD::UDIV:
|
@@ -3587,23 +3676,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
|
3587 | 3676 | AddCost * 2 + ShrCost;
|
3588 | 3677 | return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
|
3589 | 3678 | }
|
3590 |
| - |
3591 |
| - // TODO: Fix SDIV and SREM costs, similar to the above. |
3592 |
| - if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT) && |
3593 |
| - Op2Info.isUniform() && !VT.isScalableVector()) { |
3594 |
| - // Vector signed division by constant are expanded to the |
3595 |
| - // sequence MULHS + ADD/SUB + SRA + SRL + ADD. |
3596 |
| - InstructionCost MulCost = |
3597 |
| - getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, |
3598 |
| - Op1Info.getNoProps(), Op2Info.getNoProps()); |
3599 |
| - InstructionCost AddCost = |
3600 |
| - getArithmeticInstrCost(Instruction::Add, Ty, CostKind, |
3601 |
| - Op1Info.getNoProps(), Op2Info.getNoProps()); |
3602 |
| - InstructionCost ShrCost = |
3603 |
| - getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, |
3604 |
| - Op1Info.getNoProps(), Op2Info.getNoProps()); |
3605 |
| - return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; |
3606 |
| - } |
3607 | 3679 | }
|
3608 | 3680 |
|
3609 | 3681 | // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
|
|
0 commit comments