diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd1bbb8fbb7b7..f04603867a587 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21,6 +21,7 @@
 #include "X86TargetMachine.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -37,6 +38,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SDPatternMatch.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/WinEHFuncInfo.h"
 #include "llvm/IR/CallingConv.h"
@@ -8783,6 +8785,52 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
   return LowerShift(Res, Subtarget, DAG);
 }
 
+static bool isShuffleFoldableLoad(SDValue);
+
+/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
+/// representing a blend.
+static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
+                                       X86Subtarget const &Subtarget,
+                                       SelectionDAG &DAG) {
+  MVT VT = BVOp->getSimpleValueType(0u);
+
+  if (VT != MVT::v4f64)
+    return SDValue();
+
+  // Collect unique operands.
+  auto UniqueOps = SmallSet<SDValue, 16u>();
+  for (SDValue Op : BVOp->ops()) {
+    if (isIntOrFPConstant(Op) || Op.isUndef())
+      return SDValue();
+    UniqueOps.insert(Op);
+  }
+
+  // Candidate BUILD_VECTOR must have 2 unique operands.
+  if (UniqueOps.size() != 2u)
+    return SDValue();
+
+  SDValue Op0 = BVOp->getOperand(0u);
+  UniqueOps.erase(Op0);
+  SDValue Op1 = *UniqueOps.begin();
+
+  if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
+      isShuffleFoldableLoad(Op1)) {
+    // Create shuffle mask.
+    auto const NumElems = VT.getVectorNumElements();
+    SmallVector<int> Mask(NumElems);
+    for (auto I = 0u; I < NumElems; ++I) {
+      SDValue Op = BVOp->getOperand(I);
+      Mask[I] = Op == Op0 ? I : I + NumElems;
+    }
+    // Create shuffle of splats.
+    SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
+    SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
+    return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
+  }
+
+  return SDValue();
+}
+
 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
 /// functionality to do this, so it's all zeros, all ones, or some derivation
 /// that is cheap to calculate.
@@ -9245,6 +9293,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return Broadcast; if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG)) return BitOp; + if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG)) + return Blend; unsigned NumZero = ZeroMask.popcount(); unsigned NumNonZero = NonZeroMask.popcount(); diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll index 6c1cbfb4014b6..3edb712e53c8d 100644 --- a/llvm/test/CodeGen/X86/build-vector-256.ll +++ b/llvm/test/CodeGen/X86/build-vector-256.ll @@ -415,20 +415,34 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; build vectors of repeated elements define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) { -; AVX-32-LABEL: test_buildvector_4f64_2_var: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: test_buildvector_4f64_2_var: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-32-NEXT: retl ; -; AVX-64-LABEL: test_buildvector_4f64_2_var: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: retq +; AVX1-64-LABEL: test_buildvector_4f64_2_var: +; AVX1-64: # %bb.0: +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_4f64_2_var: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 +; AVX2-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm1 +; AVX2-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_buildvector_4f64_2_var: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-64-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-64-NEXT: retq %v0 = insertelement <4 x double> poison, double %a0, i32 0 %v1 = insertelement <4 x double> %v0, double %a1, i32 1 %v2 = insertelement <4 x double> %v1, double %a1, i32 2 @@ -437,25 +451,41 @@ define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) { } define <4 x double> @test_buildvector_4f64_2_load(ptr %p0, ptr %p1) { -; AVX-32-LABEL: test_buildvector_4f64_2_load: -; AVX-32: # %bb.0: -; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: test_buildvector_4f64_2_load: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX1-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-32-NEXT: vmovlhps {{.*#+}} 
xmm2 = xmm1[0],xmm0[0] +; AVX1-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-32-NEXT: retl ; -; AVX-64-LABEL: test_buildvector_4f64_2_load: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: retq +; AVX1-64-LABEL: test_buildvector_4f64_2_load: +; AVX1-64: # %bb.0: +; AVX1-64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_4f64_2_load: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX2-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX2-32-NEXT: vbroadcastsd (%ecx), %ymm0 +; AVX2-32-NEXT: vbroadcastsd (%eax), %ymm1 +; AVX2-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_buildvector_4f64_2_load: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vbroadcastsd (%rsi), %ymm0 +; AVX2-64-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX2-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-64-NEXT: retq %a0 = load double, ptr %p0 %a1 = load double, ptr %p1 %v0 = insertelement <4 x double> poison, double %a0, i32 0 diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll index 5d38f087aa1b3..789196c5e4848 100644 --- a/llvm/test/CodeGen/X86/build-vector-512.ll +++ b/llvm/test/CodeGen/X86/build-vector-512.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX-32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX-64 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX-32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX-32,AVX512F-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX-64,AVX512F-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX-32,AVX512BW-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX-64,AVX512BW-64 define <8 x double> @test_buildvector_v8f64(double %a0, double %a1, double %a2, double %a3, double %a4, double %a5, double %a6, double %a7) { ; AVX-32-LABEL: test_buildvector_v8f64: @@ -480,23 +480,37 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; build vectors of repeated elements define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) { -; AVX-32-LABEL: test_buildvector_8f64_2_var: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm1 -; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; 
AVX-32-NEXT: retl +; AVX512F-32-LABEL: test_buildvector_8f64_2_var: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 +; AVX512F-32-NEXT: movb $-126, %al +; AVX512F-32-NEXT: kmovw %eax, %k1 +; AVX512F-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 {%k1} +; AVX512F-32-NEXT: retl ; -; AVX-64-LABEL: test_buildvector_8f64_2_var: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX-64-NEXT: retq +; AVX512F-64-LABEL: test_buildvector_8f64_2_var: +; AVX512F-64: # %bb.0: +; AVX512F-64-NEXT: vbroadcastsd %xmm0, %zmm0 +; AVX512F-64-NEXT: movb $-126, %al +; AVX512F-64-NEXT: kmovw %eax, %k1 +; AVX512F-64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1} +; AVX512F-64-NEXT: retq +; +; AVX512BW-32-LABEL: test_buildvector_8f64_2_var: +; AVX512BW-32: # %bb.0: +; AVX512BW-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 +; AVX512BW-32-NEXT: movb $-126, %al +; AVX512BW-32-NEXT: kmovd %eax, %k1 +; AVX512BW-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 {%k1} +; AVX512BW-32-NEXT: retl +; +; AVX512BW-64-LABEL: test_buildvector_8f64_2_var: +; AVX512BW-64: # %bb.0: +; AVX512BW-64-NEXT: vbroadcastsd %xmm0, %zmm0 +; AVX512BW-64-NEXT: movb $-126, %al +; AVX512BW-64-NEXT: kmovd %eax, %k1 +; AVX512BW-64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1} +; AVX512BW-64-NEXT: retq %v0 = insertelement <8 x double> poison, double %a0, i32 0 %v1 = insertelement <8 x double> %v0, double %a1, i32 1 %v2 = insertelement <8 x double> %v1, double %a0, i32 2 @@ -509,25 +523,41 @@ define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) { } define <8 x double> @test_buildvector_8f64_2_load(ptr %p0, ptr %p1) { -; AVX-32-LABEL: test_buildvector_8f64_2_load: -; AVX-32: # %bb.0: -; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1] -; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX-32-NEXT: retl +; AVX512F-32-LABEL: test_buildvector_8f64_2_load: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: vbroadcastsd (%ecx), %zmm0 +; AVX512F-32-NEXT: movb $-126, %cl +; AVX512F-32-NEXT: kmovw %ecx, %k1 +; AVX512F-32-NEXT: vbroadcastsd (%eax), %zmm0 {%k1} +; AVX512F-32-NEXT: retl ; -; AVX-64-LABEL: test_buildvector_8f64_2_load: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-64-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1] -; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX-64-NEXT: retq +; AVX512F-64-LABEL: test_buildvector_8f64_2_load: +; AVX512F-64: # %bb.0: +; AVX512F-64-NEXT: vbroadcastsd (%rdi), %zmm0 +; AVX512F-64-NEXT: movb $-126, %al +; AVX512F-64-NEXT: kmovw %eax, %k1 +; AVX512F-64-NEXT: vbroadcastsd (%rsi), %zmm0 {%k1} +; AVX512F-64-NEXT: retq +; +; AVX512BW-32-LABEL: test_buildvector_8f64_2_load: +; AVX512BW-32: # %bb.0: +; AVX512BW-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512BW-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512BW-32-NEXT: vbroadcastsd (%ecx), %zmm0 +; AVX512BW-32-NEXT: movb $-126, %cl 
+; AVX512BW-32-NEXT:    kmovd %ecx, %k1
+; AVX512BW-32-NEXT:    vbroadcastsd (%eax), %zmm0 {%k1}
+; AVX512BW-32-NEXT:    retl
+;
+; AVX512BW-64-LABEL: test_buildvector_8f64_2_load:
+; AVX512BW-64:       # %bb.0:
+; AVX512BW-64-NEXT:    vbroadcastsd (%rdi), %zmm0
+; AVX512BW-64-NEXT:    movb $-126, %al
+; AVX512BW-64-NEXT:    kmovd %eax, %k1
+; AVX512BW-64-NEXT:    vbroadcastsd (%rsi), %zmm0 {%k1}
+; AVX512BW-64-NEXT:    retq
   %a0 = load double, ptr %p0
   %a1 = load double, ptr %p1
   %v0 = insertelement <8 x double> poison, double %a0, i32 0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index fb8618be17f06..4cdc65e5c1b97 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2367,6 +2367,97 @@ define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
   ret <4 x double> %unpckh
 }
 
+define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) {
+; AVX1-LABEL: blend_broadcasts_v1f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: blend_broadcasts_v1f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastsd (%rsi), %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: blend_broadcasts_v1f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vbroadcastsd (%rsi), %ymm0
+; AVX512VL-NEXT:    vbroadcastsd (%rdi), %ymm1
+; AVX512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-NEXT:    retq
+  %ld0 = load <1 x double>, ptr %p0, align 32
+  %ld1 = load <1 x double>, ptr %p1, align 32
+  %blend = shufflevector <1 x double> %ld0, <1 x double> %ld1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+  ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) {
+; AVX1-LABEL: blend_broadcasts_v1f64_4x:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: blend_broadcasts_v1f64_4x:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastsd (%rsi), %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: blend_broadcasts_v1f64_4x:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vbroadcastsd (%rsi), %ymm0
+; AVX512VL-NEXT:    vbroadcastsd (%rdi), %ymm1
+; AVX512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-NEXT:    retq
+  %ld0 = load <1 x double>, ptr %p0, align 32
+  %ld1 = load <1 x double>, ptr %p1, align 32
+  %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <4 x i32> zeroinitializer
+  %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <4 x i32> zeroinitializer
+  %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 4, i32 4, i32 0>
+  ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) {
+; AVX1-LABEL: blend_broadcasts_v1f64_2x:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: blend_broadcasts_v1f64_2x:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastsd (%rsi), %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: blend_broadcasts_v1f64_2x:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vbroadcastsd (%rsi), %ymm0
+; AVX512VL-NEXT:    vbroadcastsd (%rdi), %ymm1
+; AVX512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-NEXT:    retq
+  %ld0 = load <1 x double>, ptr %p0, align 32
+  %ld1 = load <1 x double>, ptr %p1, align 32
+  %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <2 x i32> zeroinitializer
+  %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <2 x i32> zeroinitializer
+  %blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> <i32 0, i32 2, i32 2, i32 0>
+  ret <4 x double> %blend
+}
+
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"ProfileSummary", !1}
 !1 = !{!2, !3, !4, !5, !6, !7, !8, !9}