; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512NOTDQ

; Every function below loads a vector of i1 through %a0 and uses a
; shufflevector with a constant splat mask to replicate one source element
; into a narrower i1 vector. The function name encodes the shape:
; load_v<N>i1_broadcast_<K>_v<M>i1 replicates element K of an <N x i1> load
; into an <M x i1> result.
;
; The two RUN lines compile the same IR with and without +avx512dq in -mattr.
; Each RUN line has its own check prefix, so every function carries two groups
; of assertions. The assertion comments are generated (see the NOTE line);
; regenerate them with that script instead of editing them by hand.

; Select tests: the splat i1 vector is the condition of a select between %a1
; and %a2, and the selected vector is stored through %a3.

define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 4(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32> <i32 4, i32 4>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}

define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 6(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 6(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32> <i32 7, i32 7>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}

define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 8(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 8(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32> <i32 8, i32 8>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}

define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 8(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 8(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32> <i32 8, i32 8, i32 8, i32 8>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}

define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 14(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 14(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32> <i32 15, i32 15>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}

define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 12(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 12(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32> <i32 15, i32 15, i32 15, i32 15>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}

define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 16(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 16(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32> <i32 16, i32 16>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}

define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 16(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 16(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32> <i32 16, i32 16, i32 16, i32 16>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}

define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 16(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %ymm2
; AVX512-NEXT:    vpmovd2m %ymm2, %k1
; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 16(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %ymm2
; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}

define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 30(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 30(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32> <i32 31, i32 31>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}

define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 28(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 28(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32> <i32 31, i32 31, i32 31, i32 31>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}

define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 24(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm2
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT:    vpermd %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpmovd2m %ymm2, %k1
; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 24(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT:    vpermd %ymm2, %ymm3, %ymm2
; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}

define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 32(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 32(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32> <i32 32, i32 32>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}

define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 32(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 32(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32> <i32 32, i32 32, i32 32, i32 32>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}

define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 32(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %ymm2
; AVX512-NEXT:    vpmovd2m %ymm2, %k1
; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 32(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %ymm2
; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}

define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw 32(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %zmm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %zmm2
; AVX512-NEXT:    vpmovd2m %zmm2, %k1
; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 32(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %zmm2
; AVX512NOTDQ-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
    store <16 x float> %d2, <16 x float>* %a3
    ret void
}

define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 62(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 62(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32> <i32 63, i32 63>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}

define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 60(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 60(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32> <i32 63, i32 63, i32 63, i32 63>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}

define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 56(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm2
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT:    vpermd %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpmovd2m %ymm2, %k1
; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 56(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT:    vpermd %ymm2, %ymm3, %ymm2
; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}

define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw 48(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %zmm2
; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpermd %zmm2, %zmm3, %zmm2
; AVX512-NEXT:    vpmovd2m %zmm2, %k1
; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 48(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-NEXT:    vpermd %zmm2, %zmm3, %zmm2
; AVX512NOTDQ-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
    store <16 x float> %d2, <16 x float>* %a3
    ret void
}

; Store tests (the _store suffix): the splat i1 vector is stored directly
; through %a1 instead of feeding a select.

define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 1(%rdi), %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movb 1(%rdi), %al
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <2 x i1>, <2 x i1>* %a0
    %d1 = shufflevector <2 x i1> %d0,<2 x i1> undef,<1 x i32> <i32 1>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}

define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 1(%rdi), %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movb 1(%rdi), %al
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <3 x i1>, <3 x i1>* %a0
    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32> <i32 1>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}

define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 2(%rdi), %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movb 2(%rdi), %al
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <3 x i1>, <3 x i1>* %a0
    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32> <i32 2>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}

define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 2(%rdi), %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movb 2(%rdi), %al
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <4 x i1>, <4 x i1>* %a0
    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32> <i32 2>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}

define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 3(%rdi), %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movb 3(%rdi), %al
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <4 x i1>, <4 x i1>* %a0
    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32> <i32 3>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}

define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 4(%rdi), %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movb 4(%rdi), %al
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32> <i32 4>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}

define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 4(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32> <i32 4, i32 4>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}

define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 7(%rdi), %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movb 7(%rdi), %al
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32> <i32 7>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}

define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 6(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 6(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32> <i32 7, i32 7>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}

define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 8(%rdi), %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movb 8(%rdi), %al
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32> <i32 8>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}

define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 8(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 8(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32> <i32 8, i32 8>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}

define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 8(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 8(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32> <i32 8, i32 8, i32 8, i32 8>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}

define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 15(%rdi), %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movb 15(%rdi), %al
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32> <i32 15>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}

define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 14(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 14(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32> <i32 15, i32 15>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}

define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 12(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 12(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32> <i32 15, i32 15, i32 15, i32 15>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}

define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 16(%rdi), %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movb 16(%rdi), %al
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32> <i32 16>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}

define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 16(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 16(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32> <i32 16, i32 16>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}

define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 16(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 16(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32> <i32 16, i32 16, i32 16, i32 16>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}

define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 16(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 16(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    store <8 x i1> %d1, <8 x i1>* %a1
    ret void
}

define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 31(%rdi), %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movb 31(%rdi), %al
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32> <i32 31>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}

define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 30(%rdi), %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 30(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32> <i32 31, i32 31>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}

define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) { ; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store: ; 
AVX512: # %bb.0: ; AVX512-NEXT: kmovb 28(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vpmovd2m %xmm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store: ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw 28(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <32 x i1>, <32 x i1>* %a0 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32> store <4 x i1> %d1, <4 x i1>* %a1 ret void } define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) { ; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb 24(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm0 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpmovd2m %ymm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1_store: ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw 24(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] ; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper ; AVX512NOTDQ-NEXT: retq %d0 = load <32 x i1>, <32 x i1>* %a0 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32> store <8 x i1> %d1, <8 x i1>* %a1 ret void } define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) { ; AVX512-LABEL: 
load_v64i1_broadcast_32_v1i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb 32(%rdi), %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store: ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: movb 32(%rdi), %al ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32> store <1 x i1> %d1, <1 x i1>* %a1 ret void } define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb 32(%rdi), %k0 ; AVX512-NEXT: vpmovm2q %k0, %xmm0 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vpmovq2m %xmm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store: ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32> store <2 x i1> %d1, <2 x i1>* %a1 ret void } define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb 32(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %xmm0 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512-NEXT: vpmovd2m %xmm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store: ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0 ; 
AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32> store <4 x i1> %d1, <4 x i1>* %a1 ret void } define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb 32(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm0 ; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512-NEXT: vpmovd2m %ymm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store: ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32> store <8 x i1> %d1, <8 x i1>* %a1 ret void } define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovw 32(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %zmm0 ; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512-NEXT: vpmovd2m %zmm0, %k0 ; AVX512-NEXT: kmovw %k0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store: ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper ; 
AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32> store <16 x i1> %d1, <16 x i1>* %a1 ret void } define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb 63(%rdi), %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store: ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: movb 63(%rdi), %al ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32> store <1 x i1> %d1, <1 x i1>* %a1 ret void } define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb 62(%rdi), %k0 ; AVX512-NEXT: vpmovm2q %k0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmovq2m %xmm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store: ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw 62(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32> store <2 x i1> %d1, <2 x i1>* %a1 ret void } define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb 60(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vpmovd2m %xmm0, %k0 ; AVX512-NEXT: kmovb %k0, 
(%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store: ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw 60(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32> store <4 x i1> %d1, <4 x i1>* %a1 ret void } define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb 56(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm0 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpmovd2m %ymm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1_store: ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw 56(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] ; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32> store <8 x i1> %d1, <8 x i1>* %a1 ret void } define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_63_v16i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovw 48(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %zmm0 ; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vpmovd2m %zmm0, %k0 ; AVX512-NEXT: kmovw %k0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1_store: ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw 48(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32> store <16 x i1> %d1, <16 x i1>* %a1 ret void }