From 225b2c8a3294901d3633edea5461c395a5869c53 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 12 Nov 2018 21:12:38 +0000 Subject: [X86][SSE] Add lowerVectorShuffleAsByteRotateAndPermute (PR39387) This patch adds the ability to use a PALIGNR to rotate a pair of inputs to select a range containing all the referenced elements, followed by a single input permute to put them in the right location. Differential Revision: https://reviews.llvm.org/D54267 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346706 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/insertelement-ones.ll | 35 ++-- test/CodeGen/X86/vector-shuffle-128-v16.ll | 20 +-- test/CodeGen/X86/vector-shuffle-256-v32.ll | 44 ++--- test/CodeGen/X86/x86-interleaved-access.ll | 248 +++++++++++++---------------- 4 files changed, 152 insertions(+), 195 deletions(-) (limited to 'test') diff --git a/test/CodeGen/X86/insertelement-ones.ll b/test/CodeGen/X86/insertelement-ones.ll index 61f3c9673f6..1d64053e9f6 100644 --- a/test/CodeGen/X86/insertelement-ones.ll +++ b/test/CodeGen/X86/insertelement-ones.ll @@ -344,15 +344,14 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) { ; ; SSSE3-LABEL: insert_v16i8_x123456789ABCDEx: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; SSSE3-NEXT: movl $255, %eax ; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero +; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero ; SSSE3-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v16i8_x123456789ABCDEx: @@ -420,22 +419,20 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) { ; ; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; SSSE3-NEXT: movl $255, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128] -; SSSE3-NEXT: pshufb %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] -; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSSE3-NEXT: por %xmm0, %xmm2 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[15] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm3, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0],zero +; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index e9ccd7177cf..d9790aa9a75 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -610,30 +610,26 @@ define <16 x i8> @shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4(<16 x i8> ; ; SSSE3-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[11,12,13,14,14],zero,zero,zero,zero,zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm0[1,1,2,3,4] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[11,12,13,14,14],zero,zero,zero,zero,zero -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm0[1,1,2,3,4] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9] ; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[11,12,13,14,14],zero,zero,zero,zero,zero -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm0[1,1,2,3,4] -; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9] ; AVX1OR2-NEXT: retq ; ; AVX512VLBW-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[11,12,13,14,14],zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm0[1,1,2,3,4] -; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9] ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index c4759ab54f5..0ae34e03f30 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -913,9 +913,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,xmm2[8],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,8,8,8,8,8,8,0,8,8,8,8,8,8,8,8] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -953,9 +952,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[9],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,0,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -993,9 +991,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,0,6,6,6,6,6,6,6,6,6,6] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1033,9 +1030,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,0,5,5,5,5,5,5,5,5,5,5,5] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1073,9 +1069,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,0,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1113,9 +1108,8 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,3,0,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1153,9 +1147,8 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[14],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1192,12 +1185,9 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: movl $128, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll index 28f5e8a040a..8bdb4530ea7 100644 --- a/test/CodeGen/X86/x86-interleaved-access.ll +++ b/test/CodeGen/X86/x86-interleaved-access.ll @@ -950,27 +950,23 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){ ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2 -; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX1-NEXT: # ymm5 = mem[0,1,0,1] ; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2 ; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm5 ; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] -; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddb %xmm9, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2OR512-LABEL: interleaved_load_vf32_i8_stride3: @@ -992,8 +988,8 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){ ; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 ; AVX2OR512-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] +; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] +; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX2OR512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2OR512-NEXT: retq %wide.vec = load <96 x i8>, <96 x i8>* %ptr @@ -1022,9 +1018,8 @@ define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){ ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %wide.vec = load <48 x i8>, <48 x i8>* %ptr @@ -1090,23 +1085,19 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x ; AVX1-LABEL: interleaved_store_vf16_i8_stride3: ; AVX1: # %bb.0: ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm6 -; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-NEXT: vmovdqu %xmm1, 32(%rdi) +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi) ; AVX1-NEXT: vmovups %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1114,23 +1105,19 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x ; AVX2-LABEL: interleaved_store_vf16_i8_stride3: ; AVX2: # %bb.0: ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-NEXT: vmovdqu %xmm1, 32(%rdi) +; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqu %xmm2, 32(%rdi) ; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1138,23 +1125,19 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x ; AVX512-LABEL: interleaved_store_vf16_i8_stride3: ; AVX512: # %bb.0: ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128] -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] -; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX512-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm6 -; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rdi) ; AVX512-NEXT: vzeroupper @@ -1426,71 +1409,64 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm11 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm11 +; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm10 ; AVX1-NEXT: vpshufb %xmm4, %xmm12, %xmm12 ; AVX1-NEXT: vpshufb %xmm4, %xmm14, %xmm14 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm0 -; AVX1-NEXT: vpshufb %xmm4, %xmm15, %xmm7 -; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm13 +; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm15, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm7 ; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm4[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm13[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm0[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm4[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm7[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm0[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm1 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm14 +; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm14 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm12[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm12 -; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX1-NEXT: # ymm13 = mem[0,1,0,1] -; AVX1-NEXT: vandnps %ymm12, %ymm13, %ymm12 -; AVX1-NEXT: vandps %ymm13, %ymm14, %ymm14 -; AVX1-NEXT: vorps %ymm12, %ymm14, %ymm12 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm15[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vandnps %ymm14, %ymm13, %ymm14 -; AVX1-NEXT: vandps %ymm13, %ymm7, %ymm7 -; AVX1-NEXT: vorps %ymm14, %ymm7, %ymm13 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] -; AVX1-NEXT: vpshufb %xmm7, %xmm15, %xmm4 -; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm10[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm7, %xmm10, %xmm4 -; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm4 -; AVX1-NEXT: vpshufb %xmm7, %xmm9, %xmm5 -; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpshufb %xmm14, %xmm6, %xmm5 -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm8[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm0 -; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm0 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0 -; AVX1-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm10 +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX1-NEXT: # ymm12 = mem[0,1,0,1] +; AVX1-NEXT: vandnps %ymm10, %ymm12, %ymm10 +; AVX1-NEXT: vandps %ymm12, %ymm14, %ymm14 +; AVX1-NEXT: vorps %ymm10, %ymm14, %ymm10 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm14 +; AVX1-NEXT: vandnps %ymm14, %ymm12, %ymm14 +; AVX1-NEXT: vandps %ymm12, %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm14, %ymm1, %ymm1 +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm13[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm15[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm11[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm8[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] ; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm11, %xmm12, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpaddb %xmm6, %xmm13, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vpaddb %xmm12, %xmm10, %xmm3 +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vpaddb %xmm9, %xmm1, %xmm1 +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: interleaved_load_vf64_i8_stride3: @@ -1528,11 +1504,11 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){ ; AVX2-NEXT: vpaddb %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddb %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm3, %ymm1 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq ; @@ -1561,18 +1537,16 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){ ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] ; AVX512-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800 ; AVX512-NEXT: kmovq %rax, %k1 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm5 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm6 +; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm5 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 {%k1} = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58] ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58] ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] +; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm2 -; AVX512-NEXT: vpblendvb %ymm4, %ymm2, %ymm6, %ymm2 -; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] +; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] +; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq -- cgit v1.2.3