Skip to content

Commit 6534370

Browse files
author
Leon Clark
committed
Add cost analysis for shufflevector ops and update tests.
1 parent 1861c7d commit 6534370

File tree

7 files changed

+111
-89
lines changed

7 files changed

+111
-89
lines changed

clang/test/CodeGenOpenCL/preserve_vec3.cl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
1111
// CHECK-LABEL: define dso_local spir_kernel void @foo(
1212
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
1313
// CHECK-NEXT: [[ENTRY:.*:]]
14-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
15-
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
14+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
15+
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
1616
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
1717
// CHECK-NEXT: ret void
1818
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
2323
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
2424
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
2525
// CHECK-NEXT: [[ENTRY:.*:]]
26-
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
27-
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
26+
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
27+
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
2828
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
2929
// CHECK-NEXT: ret void
3030
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
3535
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
3636
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
3737
// CHECK-NEXT: [[ENTRY:.*:]]
38-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
39-
// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
38+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
39+
// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
4040
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
4141
// CHECK-NEXT: ret void
4242
//
@@ -47,8 +47,8 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
4747
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
4848
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
4949
// CHECK-NEXT: [[ENTRY:.*:]]
50-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
51-
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
50+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
51+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
5252
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
5353
// CHECK-NEXT: ret void
5454
//
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
5959
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
6060
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
6161
// CHECK-NEXT: [[ENTRY:.*:]]
62-
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
63-
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
62+
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
63+
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
6464
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
6565
// CHECK-NEXT: ret void
6666
//

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 34 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "llvm/ADT/DenseMap.h"
1717
#include "llvm/ADT/STLExtras.h"
1818
#include "llvm/ADT/ScopeExit.h"
19+
#include "llvm/ADT/SmallVector.h"
1920
#include "llvm/ADT/Statistic.h"
2021
#include "llvm/Analysis/AssumptionCache.h"
2122
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -34,6 +35,7 @@
3435
#include <numeric>
3536
#include <queue>
3637
#include <set>
38+
#include <tuple>
3739

3840
#define DEBUG_TYPE "vector-combine"
3941
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -3484,10 +3486,13 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
34843486
return true;
34853487
}
34863488

3487-
// If `I` is a load instruction, used only by shufflevector instructions with
3488-
// poison values, attempt to shrink the load to only the lanes being used.
3489+
// Attempt to shrink loads that are only used by shufflevector instructions.
34893490
bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
3490-
auto *OldLoad = dyn_cast<LoadInst>(&I);
3491+
auto *InputShuffle = dyn_cast<ShuffleVectorInst>(&I);
3492+
if (!InputShuffle)
3493+
return {};
3494+
3495+
auto *OldLoad = dyn_cast<LoadInst>(InputShuffle->getOperand(0u));
34913496
if (!OldLoad || !OldLoad->isSimple())
34923497
return false;
34933498

@@ -3523,7 +3528,8 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
35233528
auto NumElems = int(Op0Ty->getNumElements());
35243529

35253530
for (auto Index : Mask) {
3526-
if (Index >= 0 && Index < NumElems) {
3531+
if (Index >= 0) {
3532+
Index %= NumElems;
35273533
OutputRange.first = std::min(Index, OutputRange.first);
35283534
OutputRange.second = std::max(Index, OutputRange.second);
35293535
}
@@ -3552,37 +3558,49 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
35523558
Builder.CreateAlignedLoad(NewVecTy, PtrOp, OldLoad->getAlign()));
35533559
NewLoad->copyMetadata(I);
35543560

3555-
// Compare cost of old and new ops.
3561+
// Calculate costs of old and new ops.
35563562
auto OldCost = TTI.getMemoryOpCost(
35573563
Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
35583564
OldLoad->getPointerAddressSpace(), CostKind);
35593565
auto NewCost = TTI.getMemoryOpCost(
35603566
Instruction::Load, NewLoad->getType(), NewLoad->getAlign(),
35613567
NewLoad->getPointerAddressSpace(), CostKind);
35623568

3569+
using UseEntry = std::pair<ShuffleVectorInst*, std::vector<int>>;
3570+
auto NewUses = SmallVector<UseEntry, 4u>();
3571+
auto SizeDiff = OldSize - NewSize;
3572+
35633573
for (auto &Use : I.uses()) {
35643574
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
3565-
auto Mask = Shuffle->getShuffleMask();
3566-
3567-
OldCost +=
3568-
TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind);
3569-
NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy, Mask,
3570-
CostKind);
3575+
auto OldMask = Shuffle->getShuffleMask();
3576+
3577+
// Create entry for new use.
3578+
NewUses.push_back({Shuffle, {}});
3579+
auto &NewMask = NewUses.back().second;
3580+
for (auto Index : OldMask)
3581+
NewMask.push_back(Index >= int(OldSize) ? Index - SizeDiff : Index);
3582+
3583+
// Update costs.
3584+
OldCost += TTI.getShuffleCost(
3585+
TTI::SK_PermuteSingleSrc, VecTy, OldMask, CostKind);
3586+
NewCost += TTI.getShuffleCost(
3587+
TTI::SK_PermuteSingleSrc, NewVecTy, NewMask, CostKind);
35713588
}
35723589

35733590
if (OldCost < NewCost || !NewCost.isValid()) {
35743591
NewLoad->eraseFromParent();
35753592
return false;
35763593
}
35773594

3578-
// Replace all users.
3579-
for (auto &Use : I.uses()) {
3580-
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
3595+
// Replace all uses.
3596+
for (auto &Use : NewUses) {
3597+
auto *Shuffle = Use.first;
3598+
auto &NewMask = Use.second;
35813599

35823600
Builder.SetInsertPoint(Shuffle);
35833601
Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
35843602
auto *NewShuffle = Builder.CreateShuffleVector(
3585-
NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask());
3603+
NewLoad, PoisonValue::get(NewVecTy), NewMask);
35863604

35873605
replaceValue(*Shuffle, *NewShuffle);
35883606
}
@@ -3667,13 +3685,11 @@ bool VectorCombine::run() {
36673685
MadeChange |= foldShuffleOfIntrinsics(I);
36683686
MadeChange |= foldSelectShuffle(I);
36693687
MadeChange |= foldShuffleToIdentity(I);
3688+
MadeChange |= shrinkLoadForShuffles(I);
36703689
break;
36713690
case Instruction::BitCast:
36723691
MadeChange |= foldBitcastShuffle(I);
36733692
break;
3674-
case Instruction::Load:
3675-
MadeChange |= shrinkLoadForShuffles(I);
3676-
break;
36773693
default:
36783694
MadeChange |= shrinkType(I);
36793695
break;

llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@ $getAt = comdat any
1111

1212
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
1313
; SSE-LABEL: @ConvertVectors_ByRef(
14-
; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
15-
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
14+
; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
15+
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1616
; SSE-NEXT: ret <4 x float> [[TMP3]]
1717
;
1818
; AVX-LABEL: @ConvertVectors_ByRef(
19-
; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
20-
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
19+
; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
20+
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
2121
; AVX-NEXT: ret <4 x float> [[TMP3]]
2222
;
2323
%2 = alloca ptr, align 8

llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -577,8 +577,8 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 derefer
577577
; CHECK-LABEL: @load_v1i32_extract_insert_v8i32_extra_use(
578578
; CHECK-NEXT: [[L:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
579579
; CHECK-NEXT: store <1 x i32> [[L]], ptr [[STORE_PTR:%.*]], align 4
580-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <1 x i32> [[L]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
581-
; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
580+
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P]], align 4
581+
; CHECK-NEXT: [[R:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
582582
; CHECK-NEXT: ret <8 x i32> [[R]]
583583
;
584584
%l = load <1 x i32>, ptr %p, align 4

llvm/test/Transforms/VectorCombine/X86/load-widening.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
443443

444444
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
445445
; CHECK-LABEL: @load_v2i32_v4i32_asan(
446-
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
447-
; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
446+
; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
447+
; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
448448
; CHECK-NEXT: ret <4 x i32> [[S]]
449449
;
450450
%l = load <2 x i32>, ptr %p, align 1

llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,21 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
4747
; broadcast loads are free on AVX (and blends are much cheaper than general 2-operand shuffles)
4848

4949
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
50-
; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
51-
; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
52-
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
53-
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
54-
; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
55-
; CHECK-NEXT: ret <4 x double> [[BLEND]]
50+
; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
51+
; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
52+
; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
53+
; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
54+
; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
55+
; SSE-NEXT: ret <4 x double> [[BLEND]]
56+
;
57+
; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
58+
; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
59+
; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
60+
; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
61+
; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
62+
; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
63+
; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
64+
; AVX-NEXT: ret <4 x double> [[BLEND]]
5665
;
5766
%ld0 = load <4 x double>, ptr %p0, align 32
5867
%ld1 = load <4 x double>, ptr %p1, align 32
@@ -72,6 +81,3 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
7281
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
7382
ret <2 x float> %s2
7483
}
75-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
76-
; AVX: {{.*}}
77-
; SSE: {{.*}}

0 commit comments

Comments
 (0)