; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL32 -check-prefix=AVX512 -check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-64
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL64 -check-prefix=NO-AVX512BW-64 -check-prefix=AVX2-64
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL64 -check-prefix=NO-AVX512BW-64 -check-prefix=AVX512F-64
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL64 -check-prefix=AVX512F-64 -check-prefix=AVX512BW-64

;===-----------------------------------------------------------------------------===
; This test checks the ability to recognize a cross element pattern of
; constants and perform the load via broadcasting a smaller constant
; vector.
; For example:
; <i32 0, i32 1, i32 0, i32 1> => broadcast of the constant vector <i32 0, i32 1>
;===-----------------------------------------------------------------------------===

define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi8_i16:
; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi8_i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi8_i16:
; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
  %res1 = add <16 x i8> , %a
  %res2 = and <16 x i8> , %res1
  ret <16 x i8> %res2
}

define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i32:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi8_i32:
; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi8_i32:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi8_i32:
; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
  %res1 = add <16 x i8> , %a
  %res2 = and <16 x i8> , %res1
  ret <16 x i8> %res2
}
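
; Worked instance of the pattern, derived from the checks in @f16xi8_i16 above
; (the <16 x i8> constant operands there evidently repeat the byte pair
; <i8 0, i8 1>): each 16-bit lane of that constant is 0x0100 (256), so targets
; with AVX2 or AVX512 can materialize it with a single word broadcast,
;   vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; whereas plain AVX lacks vpbroadcastw and still loads the full 16-byte
; constant with vmovdqa.

define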
<16 x i8> @f16xi8_i64(<16 x i8> %a) { ; AVX-LABEL: f16xi8_i64: ; AVX: # %bb.0: ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [7.9499288951273625E-275,7.9499288951273625E-275] ; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f16xi8_i64: ; ALL32: # %bb.0: ; ALL32-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7.9499288951273625E-275,7.9499288951273625E-275] ; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f16xi8_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [7.9499288951273625E-275,7.9499288951273625E-275] ; AVX-64-NEXT: # xmm1 = mem[0,0] ; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f16xi8_i64: ; ALL64: # %bb.0: ; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528] ; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq %res1 = add <16 x i8> , %a %res2 = and <16 x i8> , %res1 ret <16 x i8> %res2 } define <32 x i8> @f32xi8_i16(<32 x i8> %a) { ; AVX-LABEL: f32xi8_i16: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f32xi8_i16: ; ALL32: # %bb.0: ; ALL32-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f32xi8_i16: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f32xi8_i16: ; ALL64: # %bb.0: ; ALL64-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq %res1 = add <32 x i8> , %a %res2 = and <32 x i8> , %res1 ret <32 x i8> %res2 } define <32 x i8> @f32xi8_i32(<32 x i8> %a) { ; AVX-LABEL: f32xi8_i32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f32xi8_i32: ; ALL32: # %bb.0: ; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f32xi8_i32: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] ; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; ; 
ALL64-LABEL: f32xi8_i32: ; ALL64: # %bb.0: ; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq %res1 = add <32 x i8> , %a %res2 = and <32 x i8> , %res1 ret <32 x i8> %res2 } define <32 x i8> @f32xi8_i64(<32 x i8> %a) { ; AVX-LABEL: f32xi8_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275] ; AVX-NEXT: # xmm2 = mem[0,0] ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f32xi8_i64: ; ALL32: # %bb.0: ; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f32xi8_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275] ; AVX-64-NEXT: # xmm2 = mem[0,0] ; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f32xi8_i64: ; ALL64: # %bb.0: ; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] ; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq %res1 = add <32 x i8> , %a %res2 = and <32 x i8> , %res1 ret <32 x i8> %res2 } define <32 x i8> @f32xi8_i128(<32 x i8> %a) { ; AVX-LABEL: f32xi8_i128: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f32xi8_i128: ; ALL32: # %bb.0: ; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; ALL32-NEXT: # ymm1 = mem[0,1,0,1] ; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f32xi8_i128: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f32xi8_i128: ; ALL64: # %bb.0: ; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; ALL64-NEXT: # ymm1 = mem[0,1,0,1] ; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq %res1 = add <32 x i8> , %a %res2 = and <32 x i8> , %res1 ret <32 x i8> %res2 } define <64 x i8> @f64xi8_i16(<64 x i8> %a) { ; AVX-LABEL: f64xi8_i16: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb 
%xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f64xi8_i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retl ; ; AVX-64-LABEL: f64xi8_i16: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f64xi8_i16: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f64xi8_i16: ; AVX512BW-64: # %bb.0: ; AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: retq %res1 = add <64 x i8> , %a %res2 = and <64 x i8> , %res1 ret <64 x i8> %res2 } define <64 x i8> @f64i8_i32(<64 x i8> %a) { ; AVX-LABEL: f64i8_i32: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f64i8_i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: 
vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f64i8_i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retl ; ; AVX-64-LABEL: f64i8_i32: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 ; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 ; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f64i8_i32: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f64i8_i32: ; AVX512BW-64: # %bb.0: ; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: retq %res1 = add <64 x i8> , %a %res2 = and <64 x i8> , %res1 ret <64 x i8> %res2 } define <64 x i8> @f64xi8_i64(<64 x i8> %a) { ; AVX-LABEL: f64xi8_i64: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f64xi8_i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retl ; ; AVX-64-LABEL: f64xi8_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 ; 
AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 ; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 ; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f64xi8_i64: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] ; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f64xi8_i64: ; AVX512BW-64: # %bb.0: ; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528] ; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: retq %res1 = add <64 x i8> , %a %res2 = and <64 x i8> , %res1 ret <64 x i8> %res2 } define <64 x i8> @f64xi8_i128(<64 x i8> %a) { ; AVX-LABEL: f64xi8_i128: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f64xi8_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i128: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retl ; ; AVX-64-LABEL: f64xi8_i128: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f64xi8_i128: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] ; 
AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f64xi8_i128: ; AVX512BW-64: # %bb.0: ; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: retq %res1 = add <64 x i8> , %a %res2 = and <64 x i8> , %res1 ret <64 x i8> %res2 } define <64 x i8> @f64xi8_i256(<64 x i8> %a) { ; AVX-LABEL: f64xi8_i256: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vpaddb %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f64xi8_i256: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i256: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retl ; ; AVX-64-LABEL: f64xi8_i256: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-64-NEXT: vpaddb %xmm4, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f64xi8_i256: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f64xi8_i256: ; AVX512BW-64: # %bb.0: ; 
AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: retq %res1 = add <64 x i8> , %a %res2 = and <64 x i8> , %res1 ret <64 x i8> %res2 } define <8 x i16> @f8xi16_i32(<8 x i16> %a) { ; AVX-LABEL: f8xi16_i32: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f8xi16_i32: ; ALL32: # %bb.0: ; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536] ; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f8xi16_i32: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] ; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f8xi16_i32: ; ALL64: # %bb.0: ; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536] ; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq %res1 = add <8 x i16> , %a %res2 = and <8 x i16> , %res1 ret <8 x i16> %res2 } define <8 x i16> @f8xi16_i64(<8 x i16> %a) { ; AVX-LABEL: f8xi16_i64: ; AVX: # %bb.0: ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.1720559249406128E-309,4.1720559249406128E-309] ; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f8xi16_i64: ; ALL32: # %bb.0: ; ALL32-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.1720559249406128E-309,4.1720559249406128E-309] ; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f8xi16_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [4.1720559249406128E-309,4.1720559249406128E-309] ; AVX-64-NEXT: # xmm1 = mem[0,0] ; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f8xi16_i64: ; ALL64: # %bb.0: ; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096] ; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq %res1 = add <8 x i16> , %a %res2 = and <8 x i16> , %res1 ret <8 x i16> %res2 } define <16 x i16> @f16xi16_i32(<16 x i16> %a) { ; AVX-LABEL: f16xi16_i32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f16xi16_i32: ; ALL32: # %bb.0: ; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536] ; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f16xi16_i32: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] ; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; 
AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f16xi16_i32: ; ALL64: # %bb.0: ; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536] ; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq %res1 = add <16 x i16> , %a %res2 = and <16 x i16> , %res1 ret <16 x i16> %res2 } define <16 x i16> @f16xi16_i64(<16 x i16> %a) { ; AVX-LABEL: f16xi16_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [4.1720559249406128E-309,4.1720559249406128E-309] ; AVX-NEXT: # xmm2 = mem[0,0] ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f16xi16_i64: ; ALL32: # %bb.0: ; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309] ; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f16xi16_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [4.1720559249406128E-309,4.1720559249406128E-309] ; AVX-64-NEXT: # xmm2 = mem[0,0] ; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f16xi16_i64: ; ALL64: # %bb.0: ; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096] ; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq %res1 = add <16 x i16> , %a %res2 = and <16 x i16> , %res1 ret <16 x i16> %res2 } define <16 x i16> @f16xi16_i128(<16 x i16> %a) { ; AVX-LABEL: f16xi16_i128: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f16xi16_i128: ; ALL32: # %bb.0: ; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; ALL32-NEXT: # ymm1 = mem[0,1,0,1] ; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f16xi16_i128: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f16xi16_i128: ; ALL64: # %bb.0: ; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; ALL64-NEXT: # ymm1 = mem[0,1,0,1] ; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq %res1 = add <16 x i16> , %a %res2 = and <16 x i16> , %res1 ret <16 x i16> %res2 } define <32 x i16> @f32xi16_i32(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i32: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = 
[9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f32xi16_i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f32xi16_i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536] ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retl ; ; AVX-64-LABEL: f32xi16_i32: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 ; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 ; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f32xi16_i32: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] ; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f32xi16_i32: ; AVX512BW-64: # %bb.0: ; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536] ; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: retq %res1 = add <32 x i16> , %a %res2 = and <32 x i16> , %res1 ret <32 x i16> %res2 } define <32 x i16> @f32xi16_i64(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i64: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f32xi16_i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309] ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, 
%ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f32xi16_i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309] ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retl ; ; AVX-64-LABEL: f32xi16_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309] ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 ; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 ; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f32xi16_i64: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096] ; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f32xi16_i64: ; AVX512BW-64: # %bb.0: ; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096] ; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: retq %res1 = add <32 x i16> , %a %res2 = and <32 x i16> , %res1 ret <32 x i16> %res2 } define <32 x i16> @f32xi16_i128(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i128: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f32xi16_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f32xi16_i128: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retl ; ; AVX-64-LABEL: f32xi16_i128: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, 
%xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f32xi16_i128: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f32xi16_i128: ; AVX512BW-64: # %bb.0: ; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: retq %res1 = add <32 x i16> , %a %res2 = and <32 x i16> , %res1 ret <32 x i16> %res2 } define <32 x i16> @f32xi16_i256(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i256: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7] ; AVX-NEXT: vpaddw %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f32xi16_i256: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f32xi16_i256: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retl ; ; AVX-64-LABEL: f32xi16_i256: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7] ; AVX-64-NEXT: vpaddw %xmm4, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm4, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f32xi16_i256: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f32xi16_i256: ; AVX512BW-64: # 
%bb.0: ; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-64-NEXT: retq %res1 = add <32 x i16> , %a %res2 = and <32 x i16> , %res1 ret <32 x i16> %res2 } define <4 x i32> @f4xi32_i64(<4 x i32> %a) { ; AVX-LABEL: f4xi32_i64: ; AVX: # %bb.0: ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314] ; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f4xi32_i64: ; ALL32: # %bb.0: ; ALL32-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314] ; ALL32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f4xi32_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314] ; AVX-64-NEXT: # xmm1 = mem[0,0] ; AVX-64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f4xi32_i64: ; ALL64: # %bb.0: ; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296] ; ALL64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq %res1 = add <4 x i32> , %a %res2 = and <4 x i32> , %res1 ret <4 x i32> %res2 } define <8 x i32> @f8xi32_i64(<8 x i32> %a) { ; AVX-LABEL: f8xi32_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [2.1219957909652723E-314,2.1219957909652723E-314] ; AVX-NEXT: # xmm2 = mem[0,0] ; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f8xi32_i64: ; ALL32: # %bb.0: ; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314] ; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f8xi32_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [2.1219957909652723E-314,2.1219957909652723E-314] ; AVX-64-NEXT: # xmm2 = mem[0,0] ; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f8xi32_i64: ; ALL64: # %bb.0: ; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296] ; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq %res1 = add <8 x i32> , %a %res2 = and <8 x i32> , %res1 ret <8 x i32> %res2 } define <8 x i32> @f8xi32_i128(<8 x i32> %a) { ; AVX-LABEL: f8xi32_i128: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3] ; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f8xi32_i128: ; ALL32: # %bb.0: ; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] ; ALL32-NEXT: # ymm1 = mem[0,1,0,1] ; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; 
ALL32-NEXT: retl ; ; AVX-64-LABEL: f8xi32_i128: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3] ; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f8xi32_i128: ; ALL64: # %bb.0: ; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] ; ALL64-NEXT: # ymm1 = mem[0,1,0,1] ; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq %res1 = add <8 x i32> , %a %res2 = and <8 x i32> , %res1 ret <8 x i32> %res2 } define <16 x i32> @f16xi32_i64(<16 x i32> %a) { ; AVX-LABEL: f16xi32_i64: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-NEXT: vpaddd %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-NEXT: vpaddd %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f16xi32_i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314] ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512-LABEL: f16xi32_i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314] ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retl ; ; AVX-64-LABEL: f16xi32_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314] ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3 ; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3 ; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f16xi32_i64: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296] ; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512F-64-LABEL: f16xi32_i64: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296] ; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: retq %res1 = add <16 x i32> , %a %res2 = and <16 x i32> , %res1 ret <16 x i32> %res2 } define <16 x i32> @f16xi32_i128(<16 x i32> %a) { ; AVX-LABEL: 
f16xi32_i128: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f16xi32_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] ; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512-LABEL: f16xi32_i128: ; AVX512: # %bb.0: ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retl ; ; AVX-64-LABEL: f16xi32_i128: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f16xi32_i128: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] ; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512F-64-LABEL: f16xi32_i128: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: retq %res1 = add <16 x i32> , %a %res2 = and <16 x i32> , %res1 ret <16 x i32> %res2 } define <4 x i64> @f4xi64_i128(<4 x i64> %a) { ; AVX-LABEL: f4xi64_i128: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0] ; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f4xi64_i128: ; ALL32: # %bb.0: ; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] ; ALL32-NEXT: # ymm1 = mem[0,1,0,1] ; ALL32-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f4xi64_i128: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] ; AVX-64-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f4xi64_i128: 
; ALL64: # %bb.0: ; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,0,1] ; ALL64-NEXT: # ymm1 = mem[0,1,0,1] ; ALL64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq %res1 = add <4 x i64> , %a %res2 = and <4 x i64> , %res1 ret <4 x i64> %res2 } define <8 x i64> @f8xi64_i128(<8 x i64> %a) { ; AVX-LABEL: f8xi64_i128: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,0] ; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddq %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f8xi64_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] ; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512-LABEL: f8xi64_i128: ; AVX512: # %bb.0: ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0] ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retl ; ; AVX-64-LABEL: f8xi64_i128: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] ; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddq %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] ; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f8xi64_i128: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,0,1] ; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512F-64-LABEL: f8xi64_i128: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1] ; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: retq %res1 = add <8 x i64> , %a %res2 = and <8 x i64> , %res1 ret <8 x i64> %res2 } define <8 x i64> @f8xi64_i256(<8 x i64> %a) { ; AVX-LABEL: f8xi64_i256: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,0,3,0] ; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,0,1,0] ; AVX-NEXT: vpaddq %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddq %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0] ; 
AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f8xi64_i256: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0] ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retl ; ; AVX512-LABEL: f8xi64_i256: ; AVX512: # %bb.0: ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0] ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retl ; ; AVX-64-LABEL: f8xi64_i256: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3] ; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] ; AVX-64-NEXT: vpaddq %xmm4, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddq %xmm4, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f8xi64_i256: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3] ; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; ; AVX512F-64-LABEL: f8xi64_i256: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3] ; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: retq %res1 = add <8 x i64> , %a %res2 = and <8 x i64> , %res1 ret <8 x i64> %res2 } define <4 x float> @f4xf32_f64(<4 x float> %a) { ; AVX-LABEL: f4xf32_f64: ; AVX: # %bb.0: ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3] ; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f4xf32_f64: ; ALL32: # %bb.0: ; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3] ; ALL32-NEXT: # xmm1 = mem[0,0] ; ALL32-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f4xf32_f64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3] ; AVX-64-NEXT: # xmm1 = mem[0,0] ; AVX-64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; AVX-64-NEXT: retq ; ; ALL64-LABEL: f4xf32_f64: ; ALL64: # %bb.0: ; ALL64-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760] ; ALL64-NEXT: # xmm1 = mem[0,0] ; ALL64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; ALL64-NEXT: retq %res1 = fadd <4 x float> , %a %res2 = fdiv <4 x float> , %res1 ret <4 x float> %res2 } define <8 x float> @f8xf32_f64(<8 x float> %a) { ; AVX-LABEL: f8xf32_f64: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f8xf32_f64: ; ALL32: # %bb.0: ; ALL32-NEXT: 
; ALL32-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
; ALL32-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f8xf32_f64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
; AVX-64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f8xf32_f64:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; ALL64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; ALL64-NEXT:    retq
  %res1 = fadd <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
  %res2 = fdiv <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
  ret <8 x float> %res2
}

define <8 x float> @f8xf32_f128(<8 x float> %a) {
; AVX-LABEL: f8xf32_f128:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f8xf32_f128:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL32-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f8xf32_f128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX-64-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX-64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f8xf32_f128:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; ALL64-NEXT:    retq
  %res1 = fadd <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
  %res2 = fdiv <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
  ret <8 x float> %res2
}

define <16 x float> @f16xf32_f64(<16 x float> %a) {
; AVX-LABEL: f16xf32_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f16xf32_f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f16xf32_f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f16xf32_f64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f16xf32_f64:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX2-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f16xf32_f64:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = fadd <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
  %res2 = fdiv <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
  ret <16 x float> %res2
}

define <16 x float> @f16xf32_f128(<16 x float> %a) {
; AVX-LABEL: f16xf32_f128:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f16xf32_f128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f16xf32_f128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f16xf32_f128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f16xf32_f128:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX2-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f16xf32_f128:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = fadd <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
  %res2 = fdiv <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
  ret <16 x float> %res2
}

define <16 x float> @f16xf32_f256(<16 x float> %a) {
; AVX-LABEL: f16xf32_f256:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f16xf32_f256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f16xf32_f256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f16xf32_f256:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f16xf32_f256:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX2-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f16xf32_f256:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = fadd <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %a
  %res2 = fdiv <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %res1
  ret <16 x float> %res2
}

define <4 x double> @f4xf64_f128(<4 x double> %a) {
; AVX-LABEL: f4xf64_f128:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f4xf64_f128:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f4xf64_f128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-64-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX-64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-64-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f4xf64_f128:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
; ALL64-NEXT:    retq
  %res1 = fadd <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %a
  %res2 = fdiv <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %res1
  ret <4 x double> %res2
}

define <8 x double> @f8xf64_f128(<8 x double> %a) {
; AVX-LABEL: f8xf64_f128:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f8xf64_f128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f8xf64_f128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f8xf64_f128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX-64-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f8xf64_f128:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX2-64-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f8xf64_f128:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = fadd <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %a
  %res2 = fdiv <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %res1
  ret <8 x double> %res2
}

; Verify that the constant pool entry below holds only the four doubles of the
; repeating 256-bit pattern (4, 1, 2, 3) and that no further .quad entries
; (i.e. no full 512-bit constant) follow it.
; AVX512:       .LCPI37
; AVX512-NEXT:  .quad 4616189618054758400 # double 4
; AVX512-NEXT:  .quad 4607182418800017408 # double 1
; AVX512-NEXT:  .quad 4611686018427387904 # double 2
; AVX512-NEXT:  .quad 4613937818241073152 # double 3
; AVX512-NOT:   .quad

define <8 x double> @f8xf64_f256(<8 x double> %a) {
; AVX-LABEL: f8xf64_f256:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f8xf64_f256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f8xf64_f256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f8xf64_f256:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX-64-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f8xf64_f256:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX2-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX2-64-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f8xf64_f256:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = fadd <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %a
  %res2 = fdiv <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %res1
  ret <8 x double> %res2
}

define <8 x i16> @f8xi16_i32_NaN(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i32_NaN:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f8xi16_i32_NaN:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f8xi16_i32_NaN:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; AVX-64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f8xi16_i32_NaN:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
; ALL64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    retq
  %res1 = add <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %a
  %res2 = and <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %res1
  ret <8 x i16> %res2
}
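
; A minimal hand-written illustration of the pattern the functions above
; exercise. The function name @f8xf32_f64_example and the absence of
; autogenerated CHECK lines are additions for illustration only; it is not part
; of the checked test matrix. The <8 x float> constant repeats every 64 bits,
; so the expected lowering is a single 64-bit broadcast of that pattern
; followed by the vector op, as checked above for @f8xf32_f64.
define <8 x float> @f8xf32_f64_example(<8 x float> %a) {
  ; <2.0, 1.0> repeats four times, so the whole constant can be materialized
  ; with one 64-bit broadcast instead of a full 256-bit constant-pool load.
  %r = fadd <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
  ret <8 x float> %r
}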