author     Simon Pilgrim <llvm-dev@redking.me.uk>  2018-11-12 21:12:38 +0000
committer  Simon Pilgrim <llvm-dev@redking.me.uk>  2018-11-12 21:12:38 +0000
commit     225b2c8a3294901d3633edea5461c395a5869c53 (patch)
tree       a08b8f6abc03ea810f1e89c41ae021983d04316e
parent     ac159b573df821778869996491b46c93ae88014e (diff)
[X86][SSE] Add lowerVectorShuffleAsByteRotateAndPermute (PR39387)
This patch adds the ability to use a PALIGNR to rotate a pair of inputs to select a range containing all the referenced elements, followed by a single input permute to put them in the right location.

Differential Revision: https://reviews.llvm.org/D54267

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346706 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp          | 123
-rw-r--r--  test/CodeGen/X86/insertelement-ones.ll      |  35
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v16.ll  |  20
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v32.ll  |  44
-rw-r--r--  test/CodeGen/X86/x86-interleaved-access.ll  | 248
5 files changed, 267 insertions, 203 deletions
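
To illustrate the approach, here is a minimal standalone sketch of the mask analysis behind the new lowerVectorShuffleAsByteRotateAndPermute helper, restricted to a single 16 x i8 lane with byte-sized elements. The sample mask, the helper names and the scalar PALIGNR model below are illustrative assumptions rather than code taken from the patch or its tests.

// Illustrative sketch only (hypothetical sketch_rotate_permute.cpp);
// the sample mask, names and scalar PALIGNR model are assumptions,
// not LLVM code.
#include <algorithm>
#include <array>
#include <climits>
#include <cstdio>

constexpr int NumElts = 16;

// Scalar model of PALIGNR(Hi, Lo, RotAmt): take bytes RotAmt..RotAmt+15 of
// the 32-byte concatenation Hi:Lo, where Lo supplies the low 16 bytes.
static std::array<int, NumElts> palignr(const std::array<int, NumElts> &Hi,
                                        const std::array<int, NumElts> &Lo,
                                        int RotAmt) {
  std::array<int, NumElts> R{};
  for (int i = 0; i != NumElts; ++i)
    R[i] = (i + RotAmt < NumElts) ? Lo[i + RotAmt] : Hi[i + RotAmt - NumElts];
  return R;
}

int main() {
  // A sample 2-input v16i8 shuffle mask: 0..15 select from V1, 16..31 from V2.
  const std::array<int, NumElts> Mask = {5,  6,  7,  8, 9, 10, 27, 28,
                                         29, 30, 31, 0, 1, 2,  3,  4};

  // Determine the range of referenced elements in each input.
  int Lo1 = INT_MAX, Hi1 = INT_MIN, Lo2 = INT_MAX, Hi2 = INT_MIN;
  for (int M : Mask) {
    if (M < 0)
      continue;
    if (M < NumElts) {
      Lo1 = std::min(Lo1, M);
      Hi1 = std::max(Hi1, M);
    } else {
      Lo2 = std::min(Lo2, M - NumElts);
      Hi2 = std::max(Hi2, M - NumElts);
    }
  }

  // The transform applies when both inputs are referenced and one range sits
  // entirely below the other, so a single byte rotation can gather every
  // referenced byte into one register.
  if (Lo1 == INT_MAX || Lo2 == INT_MAX || (Hi2 >= Lo1 && Hi1 >= Lo2)) {
    std::puts("rotate+permute does not apply to this mask");
    return 0;
  }

  // Mirror RotateAndPermute(Lo, Hi, RotAmt, Ofs): the input whose referenced
  // range sits higher supplies the low bytes of the concatenation, and the
  // rotation drops everything below that range.
  const bool LoIsV1 = Hi2 < Lo1;        // V2's range sits below V1's range
  const int RotAmt = LoIsV1 ? Lo1 : Lo2;
  const int Ofs = LoIsV1 ? 0 : NumElts;

  std::array<int, NumElts> PermMask;
  PermMask.fill(-1);
  for (int i = 0; i != NumElts; ++i) {
    const int M = Mask[i];
    if (M < 0)
      continue;
    PermMask[i] =
        ((M < NumElts) ? (M + Ofs - RotAmt) : (M - Ofs - RotAmt)) % NumElts;
  }

  std::printf("PALIGNR byte rotation: %d\nsingle-input permute mask:", RotAmt);
  for (int P : PermMask)
    std::printf(" %d", P);
  std::printf("\n");

  // Sanity check: the rotate+permute pair reproduces a direct 2-input shuffle
  // of two symbolically numbered vectors.
  std::array<int, NumElts> V1{}, V2{};
  for (int i = 0; i != NumElts; ++i) {
    V1[i] = i;
    V2[i] = NumElts + i;
  }
  const auto Rot = palignr(LoIsV1 ? V2 : V1, LoIsV1 ? V1 : V2, RotAmt);
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    const int Want = Mask[i] < NumElts ? V1[Mask[i]] : V2[Mask[i] - NumElts];
    if (Rot[PermMask[i]] != Want) {
      std::printf("mismatch at element %d\n", i);
      return 1;
    }
  }
  std::puts("rotate+permute reproduces the original shuffle");
  return 0;
}

With this sample mask the sketch reports a byte rotation of 11 and the permute mask <10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9>, and the sanity check confirms that the rotate+permute pair reproduces the original 2-input shuffle.
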
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 16b3a6c26ac..8359e37b023 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -10143,7 +10143,8 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG,
+ bool ImmBlends = false) {
// We build up the blend mask while checking whether a blend is a viable way
// to reduce the shuffle.
SmallVector<int, 32> BlendMask(Mask.size(), -1);
@@ -10163,6 +10164,12 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
PermuteMask[i] = Mask[i] % Size;
}
+ // If only immediate blends, then bail if the blend mask can't be widened to
+ // i16.
+ unsigned EltSize = VT.getScalarSizeInBits();
+ if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
+ return SDValue();
+
SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
@@ -10233,6 +10240,92 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}
+/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
+/// permuting the elements of the result in place.
+static SDValue lowerVectorShuffleAsByteRotateAndPermute(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
+ (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
+ (VT.is512BitVector() && !Subtarget.hasBWI()))
+ return SDValue();
+
+ // We don't currently support lane crossing permutes.
+ if (is128BitLaneCrossingShuffleMask(VT, Mask))
+ return SDValue();
+
+ int Scale = VT.getScalarSizeInBits() / 8;
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumElts = VT.getVectorNumElements();
+ int NumEltsPerLane = NumElts / NumLanes;
+
+ // Determine range of mask elts.
+ bool Blend1 = true;
+ bool Blend2 = true;
+ std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
+ std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
+ for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
+ for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
+ int M = Mask[Lane + Elt];
+ if (M < 0)
+ continue;
+ if (M < NumElts) {
+ Blend1 &= (M == (Lane + Elt));
+ assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
+ M = M % NumEltsPerLane;
+ Range1.first = std::min(Range1.first, M);
+ Range1.second = std::max(Range1.second, M);
+ } else {
+ M -= NumElts;
+ Blend2 &= (M == (Lane + Elt));
+ assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
+ M = M % NumEltsPerLane;
+ Range2.first = std::min(Range2.first, M);
+ Range2.second = std::max(Range2.second, M);
+ }
+ }
+ }
+
+ // Bail if we don't need both elements.
+ // TODO - it might be worth doing this for unary shuffles if the permute
+ // can be widened.
+ if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
+ !(0 <= Range2.first && Range2.second < NumEltsPerLane))
+ return SDValue();
+
+ if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
+ return SDValue();
+
+ // Rotate the 2 ops so we can access both ranges, then permute the result.
+ auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+ SDValue Rotate = DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
+ DAG.getBitcast(ByteVT, Lo),
+ DAG.getConstant(Scale * RotAmt, DL, MVT::i8)));
+ SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
+ for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
+ for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
+ int M = Mask[Lane + Elt];
+ if (M < 0)
+ continue;
+ if (M < NumElts)
+ PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
+ else
+ PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
+ }
+ }
+ return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
+ };
+
+ // Check if the ranges are small enough to rotate from either direction.
+ if (Range2.second < Range1.first)
+ return RotateAndPermute(V1, V2, Range1.first, 0);
+ if (Range1.second < Range2.first)
+ return RotateAndPermute(V2, V1, Range2.first, NumElts);
+ return SDValue();
+}
+
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
@@ -10257,18 +10350,26 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
BlendMask[i] = i + Size;
}
- // Try to lower with the simpler initial blend/unpack strategies unless one of
- // the input shuffles would be a no-op. We prefer to shuffle inputs as the
- // shuffle may be able to fold with a load or other benefit. However, when
- // we'll have to do 2x as many shuffles in order to achieve this,
- // blending/unpacking first is a better strategy.
+ // Try to lower with the simpler initial blend/unpack/rotate strategies unless
+ // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
+ // the shuffle may be able to fold with a load or other benefit. However, when
+ // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
+ // pre-shuffle first is a better strategy.
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
- if (SDValue BlendPerm =
- lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+ // Only prefer immediate blends to unpack/rotate.
+ if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+ DL, VT, V1, V2, Mask, DAG, true))
return BlendPerm;
if (SDValue UnpackPerm =
lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
return UnpackPerm;
+ if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute(
+ DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return RotatePerm;
+ // Unpack/rotate failed - try again with variable blends.
+ if (SDValue BlendPerm =
+ lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+ return BlendPerm;
}
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
@@ -13104,6 +13205,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
+
+ // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
+ // PALIGNR will be cheaper than the second PSHUFB+OR.
+ if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute(
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ return V;
}
return PSHUFB;
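
To make the PermMask arithmetic in the new helper concrete, a worked example on the same illustrative v16i8 mask used in the sketch above, <5,6,7,8,9,10,27,28,29,30,31,0,1,2,3,4> (chosen for illustration, not lifted from the tests): the referenced ranges are [0,10] in V1 and [11,15] in V2, so Range1.second < Range2.first and the helper calls RotateAndPermute(V2, V1, RotAmt = 11, Ofs = 16). The PALIGNR produces <V2[11..15], V1[0..10]>, and each permute entry becomes (M + 16 - 11) % 16 for V1 elements and (M - 16 - 11) % 16 for V2 elements, yielding <10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9>: a single PSHUFB after the PALIGNR, where the old lowering needed two PSHUFBs and a POR.
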
diff --git a/test/CodeGen/X86/insertelement-ones.ll b/test/CodeGen/X86/insertelement-ones.ll
index 61f3c9673f6..1d64053e9f6 100644
--- a/test/CodeGen/X86/insertelement-ones.ll
+++ b/test/CodeGen/X86/insertelement-ones.ll
@@ -344,15 +344,14 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
;
; SSSE3-LABEL: insert_v16i8_x123456789ABCDEx:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; SSSE3-NEXT: movl $255, %eax
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
+; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
; SSSE3-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v16i8_x123456789ABCDEx:
@@ -420,22 +419,20 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
;
; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; SSSE3-NEXT: movl $255, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
-; SSSE3-NEXT: pshufb %xmm3, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSSE3-NEXT: por %xmm0, %xmm2
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[15]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm3, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0],zero
+; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
+; SSSE3-NEXT: por %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index e9ccd7177cf..d9790aa9a75 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -610,30 +610,26 @@ define <16 x i8> @shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4(<16 x i8>
;
; SSSE3-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[11,12,13,14,14],zero,zero,zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm0[1,1,2,3,4]
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[11,12,13,14,14],zero,zero,zero,zero,zero
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm0[1,1,2,3,4]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[11,12,13,14,14],zero,zero,zero,zero,zero
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm0[1,1,2,3,4]
-; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
; AVX1OR2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[11,12,13,14,14],zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm0[1,1,2,3,4]
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index c4759ab54f5..0ae34e03f30 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -913,9 +913,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,xmm2[8],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,8,8,8,8,8,8,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -953,9 +952,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[9],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,0,7,7,7,7,7,7,7,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -993,9 +991,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,0,6,6,6,6,6,6,6,6,6,6]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1033,9 +1030,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,0,5,5,5,5,5,5,5,5,5,5,5]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1073,9 +1069,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,0,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1113,9 +1108,8 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,3,0,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1153,9 +1147,8 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[14],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1192,12 +1185,9 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: movl $128, %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 28f5e8a040a..8bdb4530ea7 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -950,27 +950,23 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2
-; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX1-NEXT: # ymm5 = mem[0,1,0,1]
; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm5
; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128]
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm6
-; AVX1-NEXT: vpor %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
-; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpaddb %xmm9, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512-LABEL: interleaved_load_vf32_i8_stride3:
@@ -992,8 +988,8 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
; AVX2OR512-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
-; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2OR512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2OR512-NEXT: retq
%wide.vec = load <96 x i8>, <96 x i8>* %ptr
@@ -1022,9 +1018,8 @@ define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
-; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%wide.vec = load <48 x i8>, <48 x i8>* %ptr
@@ -1090,23 +1085,19 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
; AVX1-LABEL: interleaved_store_vf16_i8_stride3:
; AVX1: # %bb.0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm6
-; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
-; AVX1-NEXT: vmovdqu %xmm1, 32(%rdi)
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi)
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1114,23 +1105,19 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
; AVX2-LABEL: interleaved_store_vf16_i8_stride3:
; AVX2: # %bb.0:
; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm4
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX2-NEXT: vpor %xmm4, %xmm6, %xmm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm6
-; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX2-NEXT: vmovdqu %xmm1, 32(%rdi)
+; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovdqu %xmm2, 32(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1138,23 +1125,19 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
; AVX512: # %bb.0:
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm4
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
-; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX512-NEXT: vpor %xmm4, %xmm6, %xmm4
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm6
-; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1
; AVX512-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rdi)
; AVX512-NEXT: vzeroupper
@@ -1426,71 +1409,64 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm11
; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm11
+; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm10
; AVX1-NEXT: vpshufb %xmm4, %xmm12, %xmm12
; AVX1-NEXT: vpshufb %xmm4, %xmm14, %xmm14
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm0
-; AVX1-NEXT: vpshufb %xmm4, %xmm15, %xmm7
-; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm13
+; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm15, %xmm0
+; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm7
; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm4
-; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm4[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm13[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm0[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm4[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm7[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm0[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm1
; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm14
+; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm14
; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm12[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm12
-; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX1-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX1-NEXT: vandnps %ymm12, %ymm13, %ymm12
-; AVX1-NEXT: vandps %ymm13, %ymm14, %ymm14
-; AVX1-NEXT: vorps %ymm12, %ymm14, %ymm12
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14
-; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm15[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vandnps %ymm14, %ymm13, %ymm14
-; AVX1-NEXT: vandps %ymm13, %ymm7, %ymm7
-; AVX1-NEXT: vorps %ymm14, %ymm7, %ymm13
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128]
-; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT: vpshufb %xmm7, %xmm15, %xmm4
-; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm10[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm7, %xmm10, %xmm4
-; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm4
-; AVX1-NEXT: vpshufb %xmm7, %xmm9, %xmm5
-; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpshufb %xmm14, %xmm6, %xmm5
-; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm8[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm0
-; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm0
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0
-; AVX1-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm10
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX1-NEXT: vandnps %ymm10, %ymm12, %ymm10
+; AVX1-NEXT: vandps %ymm12, %ymm14, %ymm14
+; AVX1-NEXT: vorps %ymm10, %ymm14, %ymm10
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm14
+; AVX1-NEXT: vandnps %ymm14, %ymm12, %ymm14
+; AVX1-NEXT: vandps %ymm12, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm14, %ymm1, %ymm1
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm13[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm15[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm11[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm8[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0
+; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpaddb %xmm11, %xmm12, %xmm3
-; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vpaddb %xmm6, %xmm13, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vpaddb %xmm12, %xmm10, %xmm3
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vpaddb %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
@@ -1528,11 +1504,11 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
; AVX2-NEXT: vpaddb %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm3, %ymm1
-; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
@@ -1561,18 +1537,16 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
; AVX512-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
; AVX512-NEXT: kmovq %rax, %k1
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm5
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm6
+; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm5
; AVX512-NEXT: vpalignr {{.*#+}} zmm0 {%k1} = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm2
-; AVX512-NEXT: vpblendvb %ymm4, %ymm2, %ymm6, %ymm2
-; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq