Diffstat (limited to 'SingleSource/UnitTests/Vector/AVX512BWVL')
12 files changed, 2622 insertions, 0 deletions
diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/CMakeLists.txt b/SingleSource/UnitTests/Vector/AVX512BWVL/CMakeLists.txt new file mode 100644 index 00000000..d06bdafc --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512BWVL/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND CPPFLAGS -I ${CMAKE_SOURCE_DIR}/${VECTOR_MAIN_DIR}) +list(APPEND LDFLAGS -lm) +list(APPEND CFLAGS "-march=${X86CPU_ARCH}") +list(APPEND CFLAGS -fms-extensions) +llvm_singlesource(PREFIX "Vector-AVX512BWVL-") diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/Makefile b/SingleSource/UnitTests/Vector/AVX512BWVL/Makefile new file mode 100644 index 00000000..1825531f --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512BWVL/Makefile @@ -0,0 +1,11 @@ +# SingleSource/UnitTests/Vector/AVX512BWVL/Makefile + +DIRS = +LEVEL = ../../../.. +CFLAGS += -fms-extensions -march=native -mavx512bw -mavx512vl -I${SourceDir}/.. +LDFLAGS += -lm + +include $(LEVEL)/SingleSource/Makefile.singlesrc + +TARGET_FLAGS += -march=native -mavx512bw -mavx512vl +LCCFLAGS += -march=native -mavx512bw -mavx512vl diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/permutes.c b/SingleSource/UnitTests/Vector/AVX512BWVL/permutes.c new file mode 100644 index 00000000..5850ef6f --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512BWVL/permutes.c @@ -0,0 +1,1014 @@ +/* + * Test permute and shuffle intrinsics. + * This test was created to check the correctness + * of support for the following intrinsics: + * _mm*_permute*_pd() + * _mm*_shuffle_epi8() + */ + +#include "m512_test_util.h" + +volatile int vol0 = 0; + +V512 i8; +V512 i8_mix; +V512 i8_big; +V512 i16; +V512 i16_mix; +V512 i16_big; +V512 i32; +V512 i32_mix; +V512 i32_big; +V512 i64; +V512 i64_mix; +V512 i64_big; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 64; i++) { + i8.s8[i] = i; + i8_mix.s8[i] = (i & 1) ? i : -i; + i8_big.s8[i] = 1000 * (i + 1); + } + + for (i = 0; i < 32; i++) { + i16.s16[i] = i; + i16_mix.s16[i] = (i & 1) ? i : -i; + i16_big.s16[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i16_big.s16[i] = -i16_big.s16[i]; + } + } + + for (i = 0; i < 16; i++) { + i32.s32[i] = i; + i32_mix.s32[i] = (i & 1) ? i : -i; + i32_big.s32[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i32_big.s32[i] = -i32_big.s32[i]; + } + } + + for (i = 0; i < 8; i++) { + i64.s64[i] = i; + i64_mix.s64[i] = (i & 1) ? i : -i; + i64_big.s64[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i64_big.s64[i] = -i64_big.s64[i]; + } + } +} + +#define CHECK_PSHUFB(n_elems, dest, mask, zeroing, name) \ { \ int i, lane; \ for (i = 0; i < n_elems; i++) { \ expected.s8[i] = 0; \ if (i8_mix.s8[i] >= 0) { \ lane = i / 16; \ expected.s8[i] = i8.s8[16 * lane + (i8_mix.s8[i] & 0xf)]; \ } \ if ((mask & (1LL << i)) == 0) { \ if (zeroing) { \ expected.s8[i] = 0; \ } else { \ expected.s8[i] = dest.s8[i]; \ } \ } \ } \ check_equal_nd(&res, &expected, n_elems / 4, name, __LINE__); \ i8_mix.xmmi[vol0] = i8_mix.xmmi[vol0]; \ } + +void NOINLINE do_pshufb() { + V512 res; + V512 expected; + __mmask64 k64 = 0xFFFFFFFFFFFFFFFFLL; + + /* Non-masked versions. */ + res.xmmi[vol0] = _mm_shuffle_epi8(i8.xmmi[vol0], i8_mix.xmmi[vol0]); + CHECK_PSHUFB(16, i8_big, k64, 0, "_mm_shuffle_epi8"); + + res.ymmi[vol0] = _mm256_shuffle_epi8(i8.ymmi[vol0], i8_mix.ymmi[vol0]); + CHECK_PSHUFB(32, i8_big, k64, 0, "_mm256_shuffle_epi8"); + + res.zmmi = _mm512_shuffle_epi8(i8.zmmi, i8_mix.zmmi); + CHECK_PSHUFB(64, i8_big, k64, 0, "_mm512_shuffle_epi8"); + + /* Masked versions. 
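Merge masking: result bytes whose mask bit is 0 keep the value of the pass-through (destination) operand, here i8_big; zero masking would clear them instead. With k64 = 0xA4A4...A4A4 (10100100b per byte group), only byte positions 2, 5 and 7 of each group of eight receive shuffled values. 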
*/ + k64 = 0xA4A4A4A4A4A4A4A4LL; + res.xmmi[vol0] = _mm_mask_shuffle_epi8(i8_big.xmmi[vol0], k64, i8.xmmi[vol0], + i8_mix.xmmi[vol0]); + CHECK_PSHUFB(16, i8_big, k64, 0, "_mm_mask_shuffle_epi8"); + + res.ymmi[vol0] = _mm256_mask_shuffle_epi8(i8_big.ymmi[vol0], k64, + i8.ymmi[vol0], i8_mix.ymmi[vol0]); + CHECK_PSHUFB(32, i8_big, k64, 0, "_mm256_mask_shuffle_epi8"); + + res.zmmi = _mm512_mask_shuffle_epi8(i8_big.zmmi, k64, i8.zmmi, i8_mix.zmmi); + CHECK_PSHUFB(64, i8_big, k64, 0, "_mm512_mask_shuffle_epi8"); + + /* Zero-masked versions. */ + k64 = 0x4A4A4A4A4A4A4A4ALL; + res.xmmi[vol0] = + _mm_maskz_shuffle_epi8(k64, i8.xmmi[vol0], i8_mix.xmmi[vol0]); + CHECK_PSHUFB(16, i8_big, k64, 1, "_mm_maskz_shuffle_epi8"); + + res.ymmi[vol0] = + _mm256_maskz_shuffle_epi8(k64, i8.ymmi[vol0], i8_mix.ymmi[vol0]); + CHECK_PSHUFB(32, i8_big, k64, 1, "_mm256_maskz_shuffle_epi8"); + + res.zmmi = _mm512_maskz_shuffle_epi8(k64, i8.zmmi, i8_mix.zmmi); + CHECK_PSHUFB(64, i8_big, k64, 1, "_mm512_maskz_shuffle_epi8"); +} + +void NOINLINE do_perm_epi32() { + __mmask16 k; + + volatile __m512i v1 = + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + volatile __m512i v2 = + _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + __m512i z1 = v1; + __m512i z2 = v2; + __m512i z3; + __m512i e1; + volatile __m256i y1 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + volatile __m256i y2 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256i y3; + __m256i e2; + + z3 = _mm512_permutexvar_epi32(z2, z1); + check_equal_nd(&z3, &z2, 16, "_mm512_permutexvar_epi32", __LINE__); + + k = 0xa97e; + + y3 = y1; + y3 = _mm256_mask_permutexvar_epi32(y3, k, y2, y1); + e2 = _mm256_set_epi32(7, 1, 2, 3, 4, 5, 6, 0); + check_equal_nd(&y3, &e2, 8, "_mm256_mask_permutexvar_epi32", __LINE__); + + z3 = v1; + z3 = _mm512_mask_permutexvar_epi32(z3, k, z2, z1); + e1 = _mm512_set_epi32(0, 14, 2, 12, 4, 10, 9, 7, 7, 9, 10, 11, 12, 13, 14, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_mask_permutexvar_epi32", __LINE__); + + y3 = _mm256_maskz_permutexvar_epi32(k, y2, y1); + e2 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 0); + check_equal_nd(&y3, &e2, 8, "_mm256_maskz_permutexvar_epi32", __LINE__); + + z3 = _mm512_maskz_permutexvar_epi32(k, z2, z1); + e1 = _mm512_set_epi32(0, 0, 2, 0, 4, 0, 0, 7, 0, 9, 10, 11, 12, 13, 14, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_maskz_permutexvar_epi32", __LINE__); +} + +void NOINLINE do_perm_ps() { + __mmask16 k; + + volatile __m512i v1 = + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + volatile __m512i v2 = + _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + __m512 z1 = _mm512_castsi512_ps(v1); + __m512i z2 = v2; + __m512 z3; + __m512i e1; + volatile __m256 y1 = + _mm256_castsi256_ps(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + volatile __m256i y2 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256 y3, e2; + + y3 = _mm256_permutevar8x32_ps(y1, y2); + e2 = _mm256_castsi256_ps(y2); + check_equal_nd(&y3, &e2, 8, "_mm256_permutevar8x32_ps", __LINE__); + + y3 = _mm256_permutexvar_ps(y2, y1); + e2 = _mm256_castsi256_ps(y2); + check_equal_nd(&y3, &e2, 8, "_mm256_permutexvar_ps", __LINE__); + + z3 = _mm512_permutexvar_ps(z2, z1); + check_equal_nd(&z3, &z2, 16, "_mm512_permutexvar_ps", __LINE__); + + k = 0xa97e; + y3 = y1; + y3 = _mm256_mask_permutexvar_ps(y3, k, y2, y1); + e2 = _mm256_castsi256_ps(_mm256_set_epi32(7, 1, 2, 3, 4, 5, 6, 0)); + check_equal_nd(&y3, &e2, 8, "_mm256_mask_permutexvar_ps", __LINE__); + + k = 0xa97e; + z3 = 
_mm512_castsi512_ps(v1); + z3 = _mm512_mask_permutexvar_ps(z3, k, z2, z1); + e1 = _mm512_set_epi32(0, 14, 2, 12, 4, 10, 9, 7, 7, 9, 10, 11, 12, 13, 14, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_mask_permutexvar_ps", __LINE__); + + k = 0xa97e; + y3 = y1; + y3 = _mm256_maskz_permutexvar_ps(k, y2, y1); + e2 = _mm256_castsi256_ps(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 0)); + check_equal_nd(&y3, &e2, 8, "_mm256_maskz_permutexvar_ps", __LINE__); + + k = 0xa97e; + z3 = _mm512_castsi512_ps(v1); + z3 = _mm512_maskz_permutexvar_ps(k, z2, z1); + e1 = _mm512_set_epi32(0, 0, 2, 0, 4, 0, 0, 7, 0, 9, 10, 11, 12, 13, 14, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_maskz_permutexvar_ps", __LINE__); +} + +#define CHECK_PERMI_PS(n_elems, dest, mask, zeroing, name) \ { \ volatile int i; \ for (i = 0; i < n_elems; ++i) { \ expected.f32[i] = i32.f32[4 * (i / 4) + (i32_mix.s32[i] & 0x3)]; \ if ((mask & (1LL << i)) == 0) { \ if (zeroing) { \ expected.f32[i] = 0; \ } else { \ expected.f32[i] = dest.f32[i]; \ } \ } \ } \ check_equal_nd(&res, &expected, n_elems, name, __LINE__); \ i32_mix.ymmi[vol0] = i32_mix.ymmi[vol0]; \ } + +#define CHECK_PERMI_PS_IMM(n_elems, dest, mask, zeroing, name) \ { \ volatile int i; \ for (i = 0; i < n_elems; ++i) { \ expected.f32[i] = i32.f32[4 * (i / 4) + ((imm >> ((i % 4) * 2)) & 0x3)]; \ if ((mask & (1LL << i)) == 0) { \ if (zeroing) { \ expected.f32[i] = 0; \ } else { \ expected.f32[i] = dest.f32[i]; \ } \ } \ } \ check_equal_nd(&res, &expected, n_elems, name, __LINE__); \ i32.ymmi[vol0] = i32.ymmi[vol0]; \ } + +void NOINLINE do_permi_ps() { + V512 res; + V512 expected; + __mmask16 k = 0xFFFF; + char imm; + + res.xmm[vol0] = _mm_permutevar_ps(i32.xmm[vol0], i32_mix.xmmi[vol0]); + CHECK_PERMI_PS(2, i32_big, k, 0, "_mm_permutevar_ps"); + res.ymm[vol0] = _mm256_permutevar_ps(i32.ymm[vol0], i32_mix.ymmi[vol0]); + CHECK_PERMI_PS(4, i32_big, k, 0, "_mm256_permutevar_ps"); + res.zmm = _mm512_permutevar_ps(i32.zmm, i32_mix.zmmi); + CHECK_PERMI_PS(8, i32_big, k, 0, "_mm512_permutevar_ps"); + + k = 0xA4; + res.xmm[vol0] = _mm_mask_permutevar_ps(i32_big.xmm[vol0], k, i32.xmm[vol0], + i32_mix.xmmi[vol0]); + CHECK_PERMI_PS(2, i32_big, k, 0, "_mm_mask_permutevar_ps"); + res.ymm[vol0] = _mm256_mask_permutevar_ps(i32_big.ymm[vol0], k, i32.ymm[vol0], + i32_mix.ymmi[vol0]); + CHECK_PERMI_PS(4, i32_big, k, 0, "_mm256_mask_permutevar_ps"); + res.zmm = _mm512_mask_permutevar_ps(i32_big.zmm, k, i32.zmm, i32_mix.zmmi); + CHECK_PERMI_PS(8, i32_big, k, 0, "_mm512_mask_permutevar_ps"); + + k = 0xA4; + res.xmm[vol0] = _mm_maskz_permutevar_ps(k, i32.xmm[vol0], i32_mix.xmmi[vol0]); + CHECK_PERMI_PS(2, i32_big, k, 1, "_mm_maskz_permutevar_ps"); + res.ymm[vol0] = + _mm256_maskz_permutevar_ps(k, i32.ymm[vol0], i32_mix.ymmi[vol0]); + CHECK_PERMI_PS(4, i32_big, k, 1, "_mm256_maskz_permutevar_ps"); + res.zmm = _mm512_maskz_permutevar_ps(k, i32.zmm, i32_mix.zmmi); + CHECK_PERMI_PS(8, i32_big, k, 1, "_mm512_maskz_permutevar_ps"); + + imm = 0xA4; + k = 0xFF; + res.xmm[vol0] = _mm_permute_ps(i32.xmm[vol0], 0xA4); + CHECK_PERMI_PS_IMM(2, i32_big, k, 0, "_mm_permute_ps"); + res.ymm[vol0] = _mm256_permute_ps(i32.ymm[vol0], 0xA4); + CHECK_PERMI_PS_IMM(4, i32_big, k, 0, "_mm256_permute_ps"); + res.zmm = _mm512_permute_ps(i32.zmm, 0xA4); + CHECK_PERMI_PS_IMM(8, i32_big, k, 0, "_mm512_permute_ps"); + + k = 0xA4; + res.xmm[vol0] = + _mm_mask_permute_ps(i32_big.xmm[vol0], k, i32.xmm[vol0], 0xA4); + CHECK_PERMI_PS_IMM(2, i32_big, k, 0, "_mm_mask_permute_ps"); + 
res.ymm[vol0] = + _mm256_mask_permute_ps(i32_big.ymm[vol0], k, i32.ymm[vol0], 0xA4); + CHECK_PERMI_PS_IMM(4, i32_big, k, 0, "_mm256_mask_permute_ps"); + res.zmm = _mm512_mask_permute_ps(i32_big.zmm, k, i32.zmm, 0xA4); + CHECK_PERMI_PS_IMM(8, i32_big, k, 0, "_mm512_mask_permute_ps"); + + k = 0xA4; + res.xmm[vol0] = _mm_maskz_permute_ps(k, i32.xmm[vol0], 0xA4); + CHECK_PERMI_PS_IMM(2, i32_big, k, 1, "_mm_maskz_permute_ps"); + res.ymm[vol0] = _mm256_maskz_permute_ps(k, i32.ymm[vol0], 0xA4); + CHECK_PERMI_PS_IMM(4, i32_big, k, 1, "_mm256_maskz_permute_ps"); + res.zmm = _mm512_maskz_permute_ps(k, i32.zmm, 0xA4); + CHECK_PERMI_PS_IMM(8, i32_big, k, 1, "_mm512_maskz_permute_ps"); +} + +void NOINLINE do_perm_epi64() { + __mmask8 k; + + volatile __m512i v1 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0); + volatile __m512i v2 = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + __m512i z1 = v1; + __m512i z2 = v2; + __m512i z3; + __m512i e1; + volatile __m256i y1 = _mm256_set_epi64x(3, 2, 1, 0); + volatile __m256i y2 = _mm256_set_epi64x(0, 1, 2, 3); + __m256i y3, e2; + + y3 = _mm256_permutexvar_epi64(y2, y1); + e2 = y2; + check_equal_nd(&y3, &e2, 8, "_mm256_permutexvar_epi64", __LINE__); + + z3 = _mm512_permutexvar_epi64(z2, z1); + check_equal_nd(&z3, &z2, 16, "_mm512_permutexvar_epi64", __LINE__); + + k = 0x7e; + y3 = y1; + y3 = _mm256_mask_permutexvar_epi64(y3, k, y2, y1); + e2 = _mm256_set_epi64x(0, 1, 2, 0); + check_equal_nd(&y3, &e2, 8, "_mm256_mask_permutexvar_epi64", __LINE__); + + k = 0x7e; + z3 = v1; + z3 = _mm512_mask_permutexvar_epi64(z3, k, z2, z1); + e1 = _mm512_set_epi64(7, 1, 2, 3, 4, 5, 6, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_mask_permutexvar_epi64", __LINE__); + + k = 0x7c; + y3 = y1; + y3 = _mm256_maskz_permutexvar_epi64(k, y2, y1); + e2 = _mm256_set_epi64x(0, 1, 0, 0); + check_equal_nd(&y3, &e2, 8, "_mm256_maskz_permutexvar_epi64", __LINE__); + + k = 0x7e; + z3 = v1; + z3 = _mm512_maskz_permutexvar_epi64(k, z2, z1); + e1 = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_maskz_permutexvar_epi64", __LINE__); +} + +void NOINLINE do_perm_pd() { + __mmask8 k; + + volatile __m512i v1 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0); + volatile __m512i v2 = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + __m512d z1 = _mm512_castsi512_pd(v1); + __m512i z2 = v2; + __m512d z3; + __m512i e1; + volatile __m256i yv1; + volatile __m256i yv2; + __m256d y1; + __m256i y2; + __m256d y3; + __m256i ye1; + + z3 = _mm512_permutexvar_pd(z2, z1); + check_equal_nd(&z3, &z2, 16, "_mm512_permutexvar_pd", __LINE__); + + k = 0x7e; + z3 = _mm512_castsi512_pd(v1); + z3 = _mm512_mask_permutexvar_pd(z3, k, z2, z1); + e1 = _mm512_set_epi64(7, 1, 2, 3, 4, 5, 6, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_mask_permutexvar_pd", __LINE__); + + z3 = _mm512_maskz_permutexvar_pd(k, z2, z1); + e1 = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_maskz_permutexvar_pd", __LINE__); + + /* 256 */ + yv1 = _mm256_set_epi64x(7, 6, 5, 4); + yv2 = _mm256_set_epi64x(4, 5, 6, 7); + y1 = _mm256_castsi256_pd(yv1); + y2 = yv2; + + y3 = _mm256_permutexvar_pd(y2, y1); + check_equal_nd(&y3, &y2, 8, "_mm256_permutexvar_pd", __LINE__); + + k = 0x6; + y3 = _mm256_castsi256_pd(yv1); + y3 = _mm256_mask_permutexvar_pd(y3, k, y2, y1); + ye1 = _mm256_set_epi64x(7, 5, 6, 4); + check_equal_nd(&y3, &ye1, 8, "_mm256_mask_permutexvar_pd", __LINE__); + + y3 = _mm256_maskz_permutexvar_pd(k, y2, y1); + ye1 = _mm256_set_epi64x(0, 5, 6, 0); + check_equal_nd(&y3, &ye1, 8, "_mm256_maskz_permutexvar_pd", 
__LINE__); +} + +#define CHECK_PERMI_PD(n_elems, dest, mask, zeroing, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; ++i) { \ + if ((i64_mix.s64[i] & 0x2) == 0) { \ + expected.f64[i] = i64.f64[2 * (i / 2)]; \ + } else { \ + expected.f64[i] = i64.f64[2 * (i / 2) + 1]; \ + } \ + if ((mask & (1LL << i)) == 0) { \ + if (zeroing) { \ + expected.f64[i] = 0; \ + } else { \ + expected.f64[i] = dest.f64[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems * 2, name, __LINE__); \ + i64_mix.ymmi[vol0] = i64_mix.ymmi[vol0]; \ + } + +#define CHECK_PERMI_PD_IMM(n_elems, dest, mask, zeroing, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; ++i) { \ + if (((imm >> i) & 0x1) == 0) { \ + expected.f64[i] = i64.f64[2 * (i / 2)]; \ + } else { \ + expected.f64[i] = i64.f64[2 * (i / 2) + 1]; \ + } \ + if ((mask & (1LL << i)) == 0) { \ + if (zeroing) { \ + expected.f64[i] = 0; \ + } else { \ + expected.f64[i] = dest.f64[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems * 2, name, __LINE__); \ + i64.ymmi[vol0] = i64.ymmi[vol0]; \ + } + +void NOINLINE do_permi_pd() { + V512 res; + V512 expected; + __mmask8 k = 0xFF; + char imm; + + res.xmmd[vol0] = _mm_permutevar_pd(i64.xmmd[vol0], i64_mix.xmmi[vol0]); + CHECK_PERMI_PD(2, i64_big, k, 0, "_mm_permutevar_pd"); + res.ymmd[vol0] = _mm256_permutevar_pd(i64.ymmd[vol0], i64_mix.ymmi[vol0]); + CHECK_PERMI_PD(4, i64_big, k, 0, "_mm256_permutevar_pd"); + res.zmmd = _mm512_permutevar_pd(i64.zmmd, i64_mix.zmmi); + CHECK_PERMI_PD(8, i64_big, k, 0, "_mm512_permutevar_pd"); + + k = 0xA4; + res.xmmd[vol0] = _mm_mask_permutevar_pd(i64_big.xmmd[vol0], k, i64.xmmd[vol0], + i64_mix.xmmi[vol0]); + CHECK_PERMI_PD(2, i64_big, k, 0, "_mm_mask_permutevar_pd"); + res.ymmd[vol0] = _mm256_mask_permutevar_pd( + i64_big.ymmd[vol0], k, i64.ymmd[vol0], i64_mix.ymmi[vol0]); + CHECK_PERMI_PD(4, i64_big, k, 0, "_mm256_mask_permutevar_pd"); + res.zmmd = _mm512_mask_permutevar_pd(i64_big.zmmd, k, i64.zmmd, i64_mix.zmmi); + CHECK_PERMI_PD(8, i64_big, k, 0, "_mm512_mask_permutevar_pd"); + + k = 0xA4; + res.xmmd[vol0] = + _mm_maskz_permutevar_pd(k, i64.xmmd[vol0], i64_mix.xmmi[vol0]); + CHECK_PERMI_PD(2, i64_big, k, 1, "_mm_maskz_permutevar_pd"); + res.ymmd[vol0] = + _mm256_maskz_permutevar_pd(k, i64.ymmd[vol0], i64_mix.ymmi[vol0]); + CHECK_PERMI_PD(4, i64_big, k, 1, "_mm256_maskz_permutevar_pd"); + res.zmmd = _mm512_maskz_permutevar_pd(k, i64.zmmd, i64_mix.zmmi); + CHECK_PERMI_PD(8, i64_big, k, 1, "_mm512_maskz_permutevar_pd"); + + imm = 0xA4; + k = 0xFF; + res.xmmd[vol0] = _mm_permute_pd(i64.xmmd[vol0], 0xA4 & 0x3); + CHECK_PERMI_PD_IMM(2, i64_big, k, 0, "_mm_permute_pd"); + res.ymmd[vol0] = _mm256_permute_pd(i64.ymmd[vol0], 0xA4 & 0xf); + CHECK_PERMI_PD_IMM(4, i64_big, k, 0, "_mm256_permute_pd"); + res.zmmd = _mm512_permute_pd(i64.zmmd, 0xA4); + CHECK_PERMI_PD_IMM(8, i64_big, k, 0, "_mm512_permute_pd"); + + k = 0xA4; + res.xmmd[vol0] = + _mm_mask_permute_pd(i64_big.xmmd[vol0], k, i64.xmmd[vol0], 0xA4 & 0x3); + CHECK_PERMI_PD_IMM(2, i64_big, k, 0, "_mm_mask_permute_pd"); + res.ymmd[vol0] = + _mm256_mask_permute_pd(i64_big.ymmd[vol0], k, i64.ymmd[vol0], 0xA4 & 0xf); + CHECK_PERMI_PD_IMM(4, i64_big, k, 0, "_mm256_mask_permute_pd"); + res.zmmd = _mm512_mask_permute_pd(i64_big.zmmd, k, i64.zmmd, 0xA4); + CHECK_PERMI_PD_IMM(8, i64_big, k, 0, "_mm512_mask_permute_pd"); + + k = 0xA4; + res.xmmd[vol0] = _mm_maskz_permute_pd(k, i64.xmmd[vol0], 0xA4 & 0x3); + CHECK_PERMI_PD_IMM(2, i64_big, k, 1, "_mm_maskz_permute_pd"); + res.ymmd[vol0] = 
_mm256_maskz_permute_pd(k, i64.ymmd[vol0], 0xA4 & 0xf); + CHECK_PERMI_PD_IMM(4, i64_big, k, 1, "_mm256_maskz_permute_pd"); + res.zmmd = _mm512_maskz_permute_pd(k, i64.zmmd, 0xA4); + CHECK_PERMI_PD_IMM(8, i64_big, k, 1, "_mm512_maskz_permute_pd"); +} + +void NOINLINE do_perm_epi64_imm() { + __mmask8 k; + + volatile __m512i v1 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0); + __m512i z1 = v1; + __m512i z2; + __m512i e1; + volatile __m256i y1 = _mm256_set_epi64x(3, 2, 1, 0); + __m256i y2, e2; + + y2 = y1; + y2 = _mm256_permutex_epi64(y2, 0x7a); + e2 = _mm256_set_epi64x(1, 3, 2, 2); + check_equal_nd(&y2, &e2, 8, "_mm256_permutex_epi64", __LINE__); + + z2 = _mm512_permutex_epi64(z1, 0x7a); + e1 = _mm512_set_epi64(5, 7, 6, 6, 1, 3, 2, 2); + check_equal_nd(&z2, &e1, 16, "_mm512_permutex_epi64", __LINE__); + + k = 0x7e; + y2 = y1; + y2 = _mm256_mask_permutex_epi64(y2, k, y2, 0x7a); + e2 = _mm256_set_epi64x(1, 3, 2, 0); + check_equal_nd(&y2, &e2, 8, "_mm256_mask_permutex_epi64", __LINE__); + + k = 0x7e; + z2 = v1; + z2 = _mm512_mask_permutex_epi64(z2, k, z2, 0x7a); + e1 = _mm512_set_epi64(7, 7, 6, 6, 1, 3, 2, 0); + check_equal_nd(&z2, &e1, 16, "_mm512_mask_permutex_epi64", __LINE__); + + k = 0x76; + y2 = y1; + y2 = _mm256_maskz_permutex_epi64(k, y2, 0x7a); + e2 = _mm256_set_epi64x(0, 3, 2, 0); + check_equal_nd(&y2, &e2, 8, "_mm256_maskz_permutex_epi64", __LINE__); + + k = 0x7e; + z2 = v1; + z2 = _mm512_maskz_permutex_epi64(k, z2, 0x7a); + e1 = _mm512_set_epi64(0, 7, 6, 6, 1, 3, 2, 0); + check_equal_nd(&z2, &e1, 16, "_mm512_maskz_permutex_epi64", __LINE__); +} + +void NOINLINE do_perm_pd_imm() { + __mmask8 k; + + volatile __m512i v1 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0); + __m512d z1 = _mm512_castsi512_pd(v1); + __m512d z2; + __m512i e1; + volatile __m256i yv1; + __m256d y1; + __m256d y2; + __m256i ye1; + + z2 = _mm512_permutex_pd(z1, 0x7a); + e1 = _mm512_set_epi64(5, 7, 6, 6, 1, 3, 2, 2); + check_equal_nd(&z2, &e1, 16, "_mm512_permutex_pd", __LINE__); + + k = 0x7e; + z2 = _mm512_castsi512_pd(v1); + z2 = _mm512_mask_permutex_pd(z2, k, z2, 0x7a); + e1 = _mm512_set_epi64(7, 7, 6, 6, 1, 3, 2, 0); + check_equal_nd(&z2, &e1, 16, "_mm512_mask_permutex_pd", __LINE__); + + z2 = _mm512_castsi512_pd(v1); + z2 = _mm512_maskz_permutex_pd(k, z2, 0x7a); + e1 = _mm512_set_epi64(0, 7, 6, 6, 1, 3, 2, 0); + check_equal_nd(&z2, &e1, 16, "_mm512_maskz_permutex_pd", __LINE__); + + /* 256 */ + yv1 = _mm256_set_epi64x(7, 6, 5, 4); + y1 = _mm256_castsi256_pd(yv1); + + y2 = _mm256_permutex_pd(y1, 0xa); + ye1 = _mm256_set_epi64x(4, 4, 6, 6); + check_equal_nd(&y2, &ye1, 8, "_mm256_permutex_pd", __LINE__); + + k = 0x7e; + y2 = _mm256_castsi256_pd(yv1); + y2 = _mm256_mask_permutex_pd(y2, k, y2, 0xa); + ye1 = _mm256_set_epi64x(4, 4, 6, 4); + check_equal_nd(&y2, &ye1, 8, "_mm256_mask_permutex_pd", __LINE__); + + y2 = _mm256_castsi256_pd(yv1); + y2 = _mm256_maskz_permutex_pd(k, y2, 0xa); + ye1 = _mm256_set_epi64x(4, 4, 6, 0); + check_equal_nd(&y2, &ye1, 8, "_mm256_maskz_permutex_pd", __LINE__); +} + +void NOINLINE do_perm_ti_2w() { + V512 res; + V512 expected; + volatile int i; + __mmask32 k; + + res.zmmi = _mm512_permutex2var_epi16(i16.zmmi, i16_mix.zmmi, i16_big.zmmi); + for (i = 0; i < 32; i++) { + int index = i16_mix.s16[i] & 0x1f; + expected.s16[i] = + (i16_mix.s16[i] & 0x20) ? 
i16_big.s16[index] : i16.s16[index]; + } + check_equal_nd(&res, &expected, 16, "_mm512_permutex2var_epi16", __LINE__); + + i16_big.xmmi[vol0] = i16_big.xmmi[vol0]; + + k = 0xabcdffef; + res.zmmi = + _mm512_mask_permutex2var_epi16(i16.zmmi, k, i16_mix.zmmi, i16_big.zmmi); + for (i = 0; i < 32; i++) { + int index = i16_mix.s16[i] & 0x1f; + expected.s16[i] = + (i16_mix.s16[i] & 0x20) ? i16_big.s16[index] : i16.s16[index]; + if ((k & (1 << i)) == 0) { + expected.s16[i] = i16.s16[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_permutex2var_epi16", + __LINE__); + + i16_big.xmmi[vol0] = i16_big.xmmi[vol0]; + + k = 0xabcdffef; + res.zmmi = + _mm512_mask2_permutex2var_epi16(i16.zmmi, i16_mix.zmmi, k, i16_big.zmmi); + for (i = 0; i < 32; i++) { + int index = i16_mix.s16[i] & 0x1f; + expected.s16[i] = + (i16_mix.s16[i] & 0x20) ? i16_big.s16[index] : i16.s16[index]; + if ((k & (1 << i)) == 0) { + expected.s16[i] = i16_mix.s16[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask2_permutex2var_epi16", + __LINE__); + + i16_big.xmmi[vol0] = i16_big.xmmi[vol0]; + + k = 0xabcdffef; + res.zmmi = + _mm512_maskz_permutex2var_epi16(k, i16.zmmi, i16_mix.zmmi, i16_big.zmmi); + for (i = 0; i < 32; i++) { + int index = i16_mix.s16[i] & 0x1f; + expected.s16[i] = + (i16_mix.s16[i] & 0x20) ? i16_big.s16[index] : i16.s16[index]; + if ((k & (1 << i)) == 0) { + expected.s16[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_permutex2var_epi16", + __LINE__); +} + +void NOINLINE do_perm_ti_2d() { + V512 res; + V512 expected; + volatile int i; + __mmask16 k; + + res.zmmi = _mm512_permutex2var_epi32(i32.zmmi, i32_mix.zmmi, i32_big.zmmi); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + } + check_equal_nd(&res, &expected, 16, "_mm512_permutex2var_epi32", __LINE__); + + i32_big.xmmi[vol0] = i32_big.xmmi[vol0]; + + k = 0xabcd; + res.zmmi = + _mm512_mask_permutex2var_epi32(i32.zmmi, k, i32_mix.zmmi, i32_big.zmmi); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + if ((k & (1 << i)) == 0) { + expected.s32[i] = i32.s32[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_permutex2var_epi32", + __LINE__); + + i32_big.xmmi[vol0] = i32_big.xmmi[vol0]; + + k = 0xdcba; + res.zmmi = + _mm512_mask2_permutex2var_epi32(i32.zmmi, i32_mix.zmmi, k, i32_big.zmmi); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + if ((k & (1 << i)) == 0) { + expected.s32[i] = i32_mix.s32[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask2_permutex2var_epi32", + __LINE__); + + i32_big.xmmi[vol0] = i32_big.xmmi[vol0]; + + k = 0xabcd; + res.zmmi = + _mm512_maskz_permutex2var_epi32(k, i32.zmmi, i32_mix.zmmi, i32_big.zmmi); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + if ((k & (1 << i)) == 0) { + expected.s32[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_permutex2var_epi32", + __LINE__); +} + +void NOINLINE do_perm_ti_2q() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k; + + res.zmmi = _mm512_permutex2var_epi64(i64.zmmi, i64_mix.zmmi, i64_big.zmmi); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? 
i64_big.s64[index] : i64.s64[index]; + } + check_equal_nd(&res, &expected, 16, "_mm512_permutex2var_epi64", __LINE__); + + i64_big.xmmi[vol0] = i64_big.xmmi[vol0]; + + k = 0xf9; + res.zmmi = + _mm512_mask_permutex2var_epi64(i64.zmmi, k, i64_mix.zmmi, i64_big.zmmi); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? i64_big.s64[index] : i64.s64[index]; + if ((k & (1 << i)) == 0) { + expected.s64[i] = i64.s64[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_permutex2var_epi64", + __LINE__); + + i64_big.xmmi[vol0] = i64_big.xmmi[vol0]; + + k = 0xf9; + res.zmmi = + _mm512_mask2_permutex2var_epi64(i64.zmmi, i64_mix.zmmi, k, i64_big.zmmi); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? i64_big.s64[index] : i64.s64[index]; + if ((k & (1 << i)) == 0) { + expected.s64[i] = i64_mix.s64[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask2_permutex2var_epi64", + __LINE__); + + i64_big.xmmi[vol0] = i64_big.xmmi[vol0]; + + k = 0xe7; + res.zmmi = + _mm512_maskz_permutex2var_epi64(k, i64.zmmi, i64_mix.zmmi, i64_big.zmmi); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? i64_big.s64[index] : i64.s64[index]; + if ((k & (1 << i)) == 0) { + expected.s64[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_permutex2var_epi64", + __LINE__); +} + +void NOINLINE do_perm_ti_2ps() { + V512 res; + V512 expected; + volatile int i; + __mmask16 k; + + res.zmm = _mm512_permutex2var_ps(i32.zmm, i32_mix.zmmi, i32_big.zmm); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + } + check_equal_nd(&res, &expected, 16, "_mm512_permutex2var_ps", __LINE__); + + i32_big.xmmi[vol0] = i32_big.xmmi[vol0]; + + k = 0xabcd; + res.zmm = _mm512_mask_permutex2var_ps(i32.zmm, k, i32_mix.zmmi, i32_big.zmm); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + if ((k & (1 << i)) == 0) { + expected.s32[i] = i32.s32[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_permutex2var_ps", __LINE__); + + i32_big.xmmi[vol0] = i32_big.xmmi[vol0]; + + k = 0xabcd; + res.zmm = _mm512_mask2_permutex2var_ps(i32.zmm, i32_mix.zmmi, k, i32_big.zmm); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + if ((k & (1 << i)) == 0) { + expected.s32[i] = i32_mix.s32[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask2_permutex2var_ps", __LINE__); + + i32_big.xmmi[vol0] = i32_big.xmmi[vol0]; + + k = 0xabcd; + res.zmm = _mm512_maskz_permutex2var_ps(k, i32.zmm, i32_mix.zmmi, i32_big.zmm); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + if ((k & (1 << i)) == 0) { + expected.s32[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_permutex2var_ps", __LINE__); +} + +void NOINLINE do_perm_ti_2pd() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k; + + res.zmmd = _mm512_permutex2var_pd(i64.zmmd, i64_mix.zmmi, i64_big.zmmd); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? 
i64_big.s64[index] : i64.s64[index]; + } + check_equal_nd(&res, &expected, 16, "_mm512_permutex2var_pd", __LINE__); + + i64_big.xmmi[vol0] = i64_big.xmmi[vol0]; + + k = 0xf9; + res.zmmd = + _mm512_mask_permutex2var_pd(i64.zmmd, k, i64_mix.zmmi, i64_big.zmmd); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? i64_big.s64[index] : i64.s64[index]; + if ((k & (1 << i)) == 0) { + expected.s64[i] = i64.s64[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_permutex2var_pd", __LINE__); + + i64_big.xmmi[vol0] = i64_big.xmmi[vol0]; + + k = 0xf9; + res.zmmd = + _mm512_mask2_permutex2var_pd(i64.zmmd, i64_mix.zmmi, k, i64_big.zmmd); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? i64_big.s64[index] : i64.s64[index]; + if ((k & (1 << i)) == 0) { + expected.s64[i] = i64_mix.s64[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask2_permutex2var_pd", __LINE__); + + i64_big.xmmi[vol0] = i64_big.xmmi[vol0]; + + k = 0xf9; + res.zmmd = + _mm512_maskz_permutex2var_pd(k, i64.zmmd, i64_mix.zmmi, i64_big.zmmd); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? i64_big.s64[index] : i64.s64[index]; + if ((k & (1 << i)) == 0) { + expected.s64[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_permutex2var_pd", __LINE__); +} + +#define CHECK_PERMW(n_elems, dest, mask, zeroing, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; i++) { \ + expected.s16[i] = i16_mix.s16[i16.s16[i] & 0x1f]; \ + if ((mask & (1 << i)) == 0) { \ + if (zeroing) { \ + expected.s16[i] = 0; \ + } else { \ + expected.s16[i] = dest.s16[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems / 2, name, __LINE__); \ + i16.xmmi[vol0] = i16.xmmi[vol0]; \ + } + +void NOINLINE do_permw() { + V512 res; + V512 expected; + __mmask32 k32 = 0xFFFFFFFF; + + res.xmmi[vol0] = _mm_permutexvar_epi16(i16.xmmi[vol0], i16_mix.xmmi[vol0]); + CHECK_PERMW(8, i16_big, k32, 0, "_mm_permutexvar_epi16"); + + res.ymmi[vol0] = _mm256_permutexvar_epi16(i16.ymmi[vol0], i16_mix.ymmi[vol0]); + CHECK_PERMW(16, i16_big, k32, 0, "_mm256_permutexvar_epi16"); + + res.zmmi = _mm512_permutexvar_epi16(i16.zmmi, i16_mix.zmmi); + CHECK_PERMW(32, i16_big, k32, 0, "_mm512_permutexvar_epi16"); + + k32 = 0xA4A4A4A4; + res.xmmi[vol0] = _mm_mask_permutexvar_epi16( + i16_big.xmmi[vol0], k32, i16.xmmi[vol0], i16_mix.xmmi[vol0]); + CHECK_PERMW(8, i16_big, k32, 0, "_mm_mask_permutexvar_epi16"); + + res.ymmi[vol0] = _mm256_mask_permutexvar_epi16( + i16_big.ymmi[vol0], k32, i16.ymmi[vol0], i16_mix.ymmi[vol0]); + CHECK_PERMW(16, i16_big, k32, 0, "_mm256_mask_permutexvar_epi16"); + + res.zmmi = + _mm512_mask_permutexvar_epi16(i16_big.zmmi, k32, i16.zmmi, i16_mix.zmmi); + CHECK_PERMW(32, i16_big, k32, 0, "_mm512_mask_permutexvar_epi16"); + + k32 = 0x4A4A4A4A; + res.xmmi[vol0] = + _mm_maskz_permutexvar_epi16(k32, i16.xmmi[vol0], i16_mix.xmmi[vol0]); + CHECK_PERMW(8, i16_big, k32, 1, "_mm_maskz_permutexvar_epi16"); + + res.ymmi[vol0] = + _mm256_maskz_permutexvar_epi16(k32, i16.ymmi[vol0], i16_mix.ymmi[vol0]); + CHECK_PERMW(16, i16_big, k32, 1, "_mm256_maskz_permutexvar_epi16"); + + res.zmmi = _mm512_maskz_permutexvar_epi16(k32, i16.zmmi, i16_mix.zmmi); + CHECK_PERMW(32, i16_big, k32, 1, "_mm512_maskz_permutexvar_epi16"); +} + +void NOINLINE do_blendmps() { + V512 res; + V512 expected; + volatile int i; + __mmask16 k = 0x3456; + + res.zmm = _mm512_mask_blend_ps(k, 
i32.zmm, i32_mix.zmm); + for (i = 0; i < 16; i++) { + expected.s32[i] = i32.s32[i]; + if ((k & (1 << i)) != 0) { + expected.s32[i] = i32_mix.s32[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_blend_ps", __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_pshufb(); + + do_perm_epi32(); + do_perm_ps(); + do_permi_ps(); + + do_perm_epi64(); + do_perm_pd(); + do_permi_pd(); + + do_perm_epi64_imm(); + do_perm_pd_imm(); + + do_perm_ti_2w(); + + do_perm_ti_2d(); + do_perm_ti_2q(); + + do_perm_ti_2ps(); + do_perm_ti_2pd(); + + do_permw(); + + do_blendmps(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/permutes.reference_output b/SingleSource/UnitTests/Vector/AVX512BWVL/permutes.reference_output new file mode 100644 index 00000000..bfae62d0 --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512BWVL/permutes.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/psadbw.c b/SingleSource/UnitTests/Vector/AVX512BWVL/psadbw.c new file mode 100644 index 00000000..2e303069 --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512BWVL/psadbw.c @@ -0,0 +1,272 @@ +// +// Testcases for +// 128/256/512 bit vdbpsadbw +// intrinsics with no mask/blend mask/zero mask forms. +// +// Here we check for _mm*_dbsad_epu8 intrinsics. +// + +#include "m512_test_util.h" +#include <stdio.h> + +#if DEBUG +#define dprintf(...) printf(__VA_ARGS__) +#define ddisplay_w(...) display_pw(__VA_ARGS__) +#define ddisplay_b(...) display_pb(__VA_ARGS__) +#else +#define dprintf(...) +#define ddisplay_w(...) +#define ddisplay_b(...) +#endif // DEBUG + +typedef int bool; +#define true 1 +#define false 0 + +#define CHECK_DBPSADBW(opcode, res_bit_size, is_masked, mask, is_zero_mask, \ + imm) \ + { \ + int fail = 0; \ + /* Compute the expected result. */ \ + expVal.zmmi = \ + compute_vdbpsadbw(&expVal, is_masked, mask, is_zero_mask, imm, \ + &bop1.zmmi, &bop2.zmmi, res_bit_size); \ + \ + /* Compare the obtained and expected results. */ \ + fail = check_equal_nw(&res, &expVal, (res_bit_size / 16), \ + is_masked ? (is_zero_mask ? 
opcode " zero mask" \ : opcode " blend mask") \ : opcode " no mask", \ __LINE__); \ if (fail) { \ dprintf("\n"); \ ddisplay_w(&wres_orig, "old:", res_bit_size / 16); \ dprintf("\n"); \ ddisplay_b(&bop2, "bop2:", res_bit_size / 8); \ dprintf("\n"); \ ddisplay_b(&bop1, "bop1:", res_bit_size / 8); \ dprintf("\n===========================================\n"); \ } \ } + +#define XYDBPSADBW(opcode, res_bit_size, mmsuffix, is_masked, mask, \ is_zero_mask, imm, xy) \ { \ if (is_masked) { \ if (is_zero_mask) { \ /* Zero masking */ \ memset(&res, 0xFF, sizeof(res)); \ res.xy##mmi[0] = mmsuffix##maskz_##dbsad_epu8(mask, bop1.xy##mmi[0], \ bop2.xy##mmi[0], imm); \ } else { \ /* Blend masking */ \ memcpy(&res, &wres_orig, sizeof(res)); \ res.xy##mmi[0] = mmsuffix##mask_##dbsad_epu8( \ res.xy##mmi[0], mask, bop1.xy##mmi[0], bop2.xy##mmi[0], imm); \ } \ } else { \ /* No masking */ \ memset(&res, 0x0, sizeof(res)); \ res.xy##mmi[0] = \ mmsuffix##dbsad_epu8(bop1.xy##mmi[0], bop2.xy##mmi[0], imm); \ } \ CHECK_DBPSADBW(opcode, res_bit_size, is_masked, mask, is_zero_mask, imm) \ } + +#define ZDBPSADBW(opcode, is_masked, mask, is_zero_mask, imm) \ { \ if (is_masked) { \ if (is_zero_mask) { /* Zero masking */ \ memset(&res, 0xFF, sizeof(res)); \ res.zmmi = _mm512_maskz_##dbsad_epu8(mask, bop1.zmmi, bop2.zmmi, imm); \ } else { /* Blend masking */ \ memcpy(&res, &wres_orig, sizeof(res)); \ res.zmmi = _mm512_mask_##dbsad_epu8(res.zmmi, mask, bop1.zmmi, \ bop2.zmmi, imm); \ } \ } else { /* No masking */ \ memset(&res, 0x0, sizeof(res)); \ res.zmmi = _mm512_##dbsad_epu8(bop1.zmmi, bop2.zmmi, imm); \ } \ CHECK_DBPSADBW(opcode, 512, is_masked, mask, is_zero_mask, imm) \ } + +// +// Data +// + +volatile unsigned short u16_orig_arr[32] = { + 0x1000, 0x1100, 0x2200, 0x3300, 0x4400, 0x5500, 0x6600, 0x7700, + 0x8800, 0x9900, 0xaa00, 0xbb00, 0xcc00, 0xdd00, 0xee00, 0xff00, + 0x1234, 0x1111, 0x2222, 0x3333, 0x4444, 0x5555, 0x6666, 0x7777, + 0x8888, 0x9999, 0xaaaa, 0xbbbb, 0xcccc, 0xdddd, 0xeeee, 0xffff}; + +V512 bop1, bop2; +V512 res, expVal; +V512 wres_orig; + +static void NOINLINE init() { + int i; + + // i8 operand vectors + // + for (i = 0; i < 64; i++) { + bop1.s8[i] = i; + } + for (i = 63; i >= 0; i--) { + bop2.s8[63 - i] = i; + } + + // Saved original destination values, used to check blend masking. + memcpy((void *)&wres_orig, (void *)u16_orig_arr, 64); +} + +// +// Emulate the vdbpsadbw operation. +// + +__m512i NOINLINE compute_vdbpsadbw(void *res, bool is_masked, unsigned int mask, + bool zero_mask, int imm, const void *op1, + const void *op2, int res_bit_size) { + V512 *vres = (V512 *)res; + V512 *vop1 = (V512 *)op1; + V512 *vop2 = (V512 *)op2; + V512 vtmp; + + int lanes = res_bit_size / 128; + int lane, res_i; + int elems, elem; + + dprintf("\n\n"); + + // Do unmasked vdbpsadbw to get temp result. 
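+ // (The dword-select loop below mirrors the imm8 shuffle step of + // vdbpsadbw; the SAD loop that follows builds four 16-bit sums per + // qword from overlapping 4-byte windows of op1 and the temp.)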
+ // + for (lane = 0; lane < lanes; lane++) { + + dprintf("\n"); + for (elem = 0; elem < 4; elem++) { + int op_i; + + res_i = lane * 4 + elem; + op_i = lane * 4 + ((imm >> (2 * elem)) & 0x3); + vtmp.u32[res_i] = vop2->u32[op_i]; + + dprintf("l,e %d:%d, tmp[%d] = op2[%d]\n", lane, elem, res_i, op_i); + } + } + + elems = res_bit_size / 64; + + for (elem = 0; elem < elems; elem++) { + unsigned short *res_wp = (unsigned short *)&vres->u64[elem]; + unsigned char *op1_bp = (unsigned char *)&vop1->u64[elem]; + unsigned char *tmp_bp = (unsigned char *)&vtmp.u64[elem]; + + res_wp[0] = abs(op1_bp[0] - tmp_bp[0]) + abs(op1_bp[1] - tmp_bp[1]) + + abs(op1_bp[2] - tmp_bp[2]) + abs(op1_bp[3] - tmp_bp[3]); + + res_wp[1] = abs(op1_bp[0] - tmp_bp[1]) + abs(op1_bp[1] - tmp_bp[2]) + + abs(op1_bp[2] - tmp_bp[3]) + abs(op1_bp[3] - tmp_bp[4]); + + res_wp[2] = abs(op1_bp[4] - tmp_bp[2]) + abs(op1_bp[5] - tmp_bp[3]) + + abs(op1_bp[6] - tmp_bp[4]) + abs(op1_bp[7] - tmp_bp[5]); + + res_wp[3] = abs(op1_bp[4] - tmp_bp[3]) + abs(op1_bp[5] - tmp_bp[4]) + + abs(op1_bp[6] - tmp_bp[5]) + abs(op1_bp[7] - tmp_bp[6]); + } + + // Apply masking to get final result. + // + elems = res_bit_size / 16; + + for (res_i = 0; res_i < elems; res_i++) { + int elem_mask; + + elem_mask = mask & (1 << res_i); + + // The unmasked computation above has taken care of + // the elem_mask == 1 case. + if (elem_mask == 0) { + if (zero_mask) { + // Zeroing behavior. + vres->u16[res_i] = 0; + } else { + // Blending behavior + vres->u16[res_i] = wres_orig.u16[res_i]; + } + } + } + + return vres->zmmi; +} + +// +// Mask values. +// + +#define KMASK8_NONE 0xff +#define KMASK16_NONE 0xffff +#define KMASK32_NONE 0xffffffff + +#define KMASK8_ONES 0xff +#define KMASK16_ONES 0xffff +#define KMASK32_ONES 0xffffffff + +#define KMASK8_ALT 0xaa +#define KMASK16_ALT 0xaaaa +#define KMASK32_ALT 0xaaaaaaaa + +// +// Immediate value. 
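+// 0xe4 == 11 10 01 00b: two-bit field i selects dword i, so this is the +// identity selection (fields read high to low give 3, 2, 1, 0, hence "3210").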
+// +#define IMM_3210 0xe4 + +// +// Tests for vdbpsadbw +// +void do_xdbpsadbw() { + XYDBPSADBW("EDBPSADBW", 128, _mm_, false, KMASK8_NONE, false, IMM_3210, x); + + XYDBPSADBW("EDBPSADBW", 128, _mm_, true, KMASK8_ONES, false, IMM_3210, x); + + XYDBPSADBW("EDBPSADBW", 128, _mm_, true, KMASK8_ALT, false, IMM_3210, x); + + XYDBPSADBW("EDBPSADBW", 128, _mm_, true, KMASK8_ALT, true, IMM_3210, x); +} + +void do_ydbpsadbw() { + XYDBPSADBW("YDBPSADBW", 256, _mm256_, false, KMASK16_NONE, false, IMM_3210, + y); + + XYDBPSADBW("YDBPSADBW", 256, _mm256_, true, KMASK16_ONES, false, IMM_3210, y); + + XYDBPSADBW("YDBPSADBW", 256, _mm256_, true, KMASK16_ALT, false, IMM_3210, y); + + XYDBPSADBW("YDBPSADBW", 256, _mm256_, true, KMASK16_ALT, true, IMM_3210, y); +} + +void do_zdbpsadbw() { + ZDBPSADBW("ZDBPSADBW", false, KMASK32_NONE, false, IMM_3210); + + ZDBPSADBW("ZDBPSADBW", true, KMASK32_ONES, false, IMM_3210); + + ZDBPSADBW("ZDBPSADBW", true, KMASK32_ALT, false, IMM_3210); + + ZDBPSADBW("ZDBPSADBW", true, KMASK32_ALT, true, IMM_3210); +} + +int main() { + init(); + + do_xdbpsadbw(); + do_ydbpsadbw(); + do_zdbpsadbw(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/psadbw.reference_output b/SingleSource/UnitTests/Vector/AVX512BWVL/psadbw.reference_output new file mode 100644 index 00000000..bfae62d0 --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512BWVL/psadbw.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/ptestm.c b/SingleSource/UnitTests/Vector/AVX512BWVL/ptestm.c new file mode 100644 index 00000000..4639ae17 --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512BWVL/ptestm.c @@ -0,0 +1,214 @@ +/* + * Test intrinsics for vptestm[bdqw] and vptestnm[bdqw]. + * Here we check for _mm512_test_[epi32|epi64]_mask intrinsics. + */ + +#include "m512_test_util.h" +#include <stdio.h> + +volatile int ten = 10; + +#define TEST(base_intrin, mask_intrin, r1, r2, correct_res) \ + { \ + __int64 _i, _mask; \ + __int64 _cres = (correct_res); \ + __int64 _res = base_intrin((r1), (r2)); \ + if (_res != _cres) { \ + printf(#base_intrin "(" #r1 ", " #r2 ") failed\n"); \ + printf("Expected 0x%08x%08x; got 0x%08x%08x\n", (int)(_cres >> 32), \ + (int)_cres, (int)(_res >> 32), (int)_res); \ + n_errs++; \ + } \ + for (_i = 0; _i < ten; _i++) { \ + _mask = ((__int64)rand() << 32) | rand(); \ + _res = mask_intrin(_mask, (r1), (r2)); \ + _cres = (correct_res)&_mask; \ + if (_res != _cres) { \ + printf(#mask_intrin "(0x%08x%08x, " #r1 ", " #r2 ") " \ + "failed\n", \ + (int)(_mask >> 32), (int)_mask); \ + printf("Expected 0x%08x%08x; got 0x%08x%08x\n", (int)(_cres >> 32), \ + (int)_cres, (int)(_res >> 32), (int)_res); \ + n_errs++; \ + } \ + } \ + } + +V512 i8; +V512 mix8; +V512 i16; +V512 mix16; + +V512 i32; +V512 mix32; + +V512 i64; +V512 mix64; + +volatile int vol0 = 0; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 64; i++) { + i8.s8[i] = -1; + mix8.s8[i] = (i & 1) ? 0 : -1; + } + + for (i = 0; i < 32; i++) { + i16.s16[i] = -1; + mix16.s16[i] = (i & 1) ? 0 : -1; + } + + for (i = 0; i < 16; i++) { + i32.s32[i] = -1; + mix32.s32[i] = (i & 1) ? 0 : -1; + } + + for (i = 0; i < 8; i++) { + i64.s64[i] = -1; + mix64.s64[i] = (i & 1) ? 0 : -1; + } +} + +void NOINLINE do_ptestmb() { + TEST(_mm_test_epi8_mask, _mm_mask_test_epi8_mask, mix8.xmmi[0], i8.xmmi[0], + 0x5555); + + i8.xmmi[vol0] = i8.xmmi[vol0]; /* No-op. 
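The volatile subscript forces i8 to be reloaded, keeping the compiler from caching it across the intrinsic calls under test. 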
*/ + + TEST(_mm256_test_epi8_mask, _mm256_mask_test_epi8_mask, mix8.ymmi[0], + i8.ymmi[0], 0x55555555); + + i8.xmmi[vol0] = i8.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_test_epi8_mask, _mm512_mask_test_epi8_mask, mix8.zmmi, i8.zmmi, + 0x5555555555555555); +} + +void NOINLINE do_ptestmw() { + TEST(_mm_test_epi16_mask, _mm_mask_test_epi16_mask, mix16.xmmi[0], + i16.xmmi[0], 0x55); + + i16.xmmi[vol0] = i16.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_test_epi16_mask, _mm256_mask_test_epi16_mask, mix16.ymmi[0], + i16.ymmi[0], 0x5555); + + i16.xmmi[vol0] = i16.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_test_epi16_mask, _mm512_mask_test_epi16_mask, mix16.zmmi, + i16.zmmi, 0x55555555); +} + +void NOINLINE do_ptestmd() { + TEST(_mm_test_epi32_mask, _mm_mask_test_epi32_mask, mix32.xmmi[0], + i32.xmmi[0], 0x5); + + i32.xmmi[vol0] = i32.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_test_epi32_mask, _mm256_mask_test_epi32_mask, mix32.ymmi[0], + i32.ymmi[0], 0x55); + + i32.xmmi[vol0] = i32.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_test_epi32_mask, _mm512_mask_test_epi32_mask, mix32.zmmi, + i32.zmmi, 0x5555); +} + +void NOINLINE do_ptestmq() { + TEST(_mm_test_epi64_mask, _mm_mask_test_epi64_mask, mix64.xmmi[0], + i64.xmmi[0], 0x1); + + i64.xmmi[vol0] = i64.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_test_epi64_mask, _mm256_mask_test_epi64_mask, mix64.ymmi[0], + i64.ymmi[0], 0x5); + + i64.xmmi[vol0] = i64.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_test_epi64_mask, _mm512_mask_test_epi64_mask, mix64.zmmi, + i64.zmmi, 0x55); +} + +void NOINLINE do_ptestnmb() { + TEST(_mm_testn_epi8_mask, _mm_mask_testn_epi8_mask, mix8.xmmi[0], i8.xmmi[0], + 0xaaaa); + + i8.xmmi[vol0] = i8.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_testn_epi8_mask, _mm256_mask_testn_epi8_mask, mix8.ymmi[0], + i8.ymmi[0], 0xaaaaaaaa); + + i8.xmmi[vol0] = i8.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_testn_epi8_mask, _mm512_mask_testn_epi8_mask, mix8.zmmi, i8.zmmi, + 0xaaaaaaaaaaaaaaaa); +} + +void NOINLINE do_ptestnmw() { + TEST(_mm_testn_epi16_mask, _mm_mask_testn_epi16_mask, mix16.xmmi[0], + i16.xmmi[0], 0xaa); + + i16.xmmi[vol0] = i16.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_testn_epi16_mask, _mm256_mask_testn_epi16_mask, mix16.ymmi[0], + i16.ymmi[0], 0xaaaa); + + i16.xmmi[vol0] = i16.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_testn_epi16_mask, _mm512_mask_testn_epi16_mask, mix16.zmmi, + i16.zmmi, 0xaaaaaaaa); +} + +void NOINLINE do_ptestnmd() { + TEST(_mm_testn_epi32_mask, _mm_mask_testn_epi32_mask, mix32.xmmi[0], + i32.xmmi[0], 0xa); + + i32.xmmi[vol0] = i32.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_testn_epi32_mask, _mm256_mask_testn_epi32_mask, mix32.ymmi[0], + i32.ymmi[0], 0xaa); + + i32.xmmi[vol0] = i32.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_testn_epi32_mask, _mm512_mask_testn_epi32_mask, mix32.zmmi, + i32.zmmi, 0xaaaa); +} + +void NOINLINE do_ptestnmq() { + TEST(_mm_testn_epi64_mask, _mm_mask_testn_epi64_mask, mix64.xmmi[0], + i64.xmmi[0], 0x2); + + i64.xmmi[vol0] = i64.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_testn_epi64_mask, _mm256_mask_testn_epi64_mask, mix64.ymmi[0], + i64.ymmi[0], 0xa); + + i64.xmmi[vol0] = i64.xmmi[vol0]; /* No-op. 
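Volatile reload barrier again, as in do_ptestmb. 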
*/ + + TEST(_mm512_testn_epi64_mask, _mm512_mask_testn_epi64_mask, mix64.zmmi, + i64.zmmi, 0xaa); +} + +int main(int argc, char *argv[]) { + init(); + + do_ptestmb(); + do_ptestmw(); + do_ptestmd(); + do_ptestmq(); + + do_ptestnmb(); + do_ptestnmw(); + do_ptestnmd(); + do_ptestnmq(); + + if (n_errs) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/ptestm.reference_output b/SingleSource/UnitTests/Vector/AVX512BWVL/ptestm.reference_output new file mode 100644 index 00000000..bfae62d0 --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512BWVL/ptestm.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/shuffles.c b/SingleSource/UnitTests/Vector/AVX512BWVL/shuffles.c new file mode 100644 index 00000000..2d62f9e9 --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512BWVL/shuffles.c @@ -0,0 +1,294 @@ +/* + * Test shuffles. + * + * This test was created to check the correctness + * of support for the following intrinsics: + * _mm_shuffle_epi32() + * _mm_shufflehi_epi16() + * _mm_shufflelo_epi16() + * _mm256_shuffle_epi32() + * _mm256_shufflehi_epi16() + * _mm256_shufflelo_epi16() + * _mm512_shuffle_epi32() + * _mm512_shufflehi_epi16() + * _mm512_shufflelo_epi16() + */ + +#include "m512_test_util.h" +#include <stdio.h> +#include <string.h> + +V512 counts16, counts32, counts64, src, passthru, zeros; +__mmask8 k8 = 0xf9; +__mmask16 k16 = 0x9ffe; + +volatile int vol0; + +/* + * Use this between tests to make compiler think src was updated. + * Prevents PRE'ing of a load of src. + */ +#define soft_src_update() src.xmmi[vol0] = src.xmmi[vol0] +#define soft_counts16_update() counts16.xmmi[vol0] = counts16.xmmi[vol0] +#define soft_counts32_update() counts32.xmmi[vol0] = counts32.xmmi[vol0] +#define soft_counts64_update() counts64.xmmi[vol0] = counts64.xmmi[vol0] + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + counts32.s32[i] = 3; + zeros.u32[i] = 0; + src.s32[i] = -27 * i * i + 300 * i - 82; + if (i & 0x1) { + src.s32[i] *= -1; + } + passthru.s32[i] = 48 * i * i + 100 * i - 9; + } + + for (i = 0; i < 8; i++) { + counts64.s64[i] = 9; + } + + for (i = 0; i < 32; i++) { + counts16.s16[i] = 4; + } +} + + +void NOINLINE emulate_shuffle(void *presult, const void *p, +const void *mask_src, int size, int control, int mask) { + int i; + V512 *result = (V512 *)presult; + V512 *v = (V512 *)p; + V512 *src = (V512 *)mask_src; + for (i = 0; i < size; i++) { + if (((1 << i) & mask) == 0) + result->u32[i] = src->u32[i]; + else + result->u32[i] = v->u32[4 * (i / 4) + ((control >> (2 * (i % 4))) & 3)]; + } +} + + +void NOINLINE emulate_shuffle16(void *presult, const void *p, +const void *mask_src, int size, int control, int mask, int order) { + int i; + V512 *result = (V512 *)presult; + V512 *v = (V512 *)p; + V512 *src = (V512 *)mask_src; + for (i = 0; i < size; i++) { + if (((1 << i) & mask) == 0) { + result->u16[i] = src->u16[i]; + } else { + if ((i / 4) % 2 == order) { + result->u16[i] = v->u16[i]; + } else { + result->u16[i] = v->u16[4 * (i / 4) + ((control >> (2 * (i % 4))) & 3)]; + } + } + } +} + + +void NOINLINE do_shuffle_epi32() { + volatile V512 res; + V512 expected; + + // checking mm512 shuffle + soft_counts32_update(); + res.zmmi = _mm512_shuffle_epi32(src.zmmi, 3); + emulate_shuffle(&expected, &src, &zeros, 16, 3, 0xffff); + check_equal_nd(&res, &expected, 16, "_mm512_shuffle_epi32", __LINE__); + soft_counts32_update(); + + 
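/* With control = 3 (00000011b), element 0 of each 128-bit lane takes lane + element 3 and elements 1..3 take lane element 0, i.e. the per-lane + pattern {3, 0, 0, 0}, per (control >> (2 * (i % 4))) & 3. */ + 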
soft_counts32_update(); + res.zmmi = _mm512_mask_shuffle_epi32(passthru.zmmi, k16, src.zmmi, 3); + emulate_shuffle(&expected, &src, &passthru, 16, 3, k16); + check_equal_nd(&res, &expected, 16, "_mm512_mask_shuffle_epi32", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_shuffle_epi32(k16, src.zmmi, 3); + emulate_shuffle(&expected, &src, &zeros, 16, 3, k16); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_shuffle_epi32", __LINE__); + soft_counts32_update(); + + // checking mm256 shuffle + soft_counts32_update(); + res.ymmi[0] = _mm256_shuffle_epi32(src.ymmi[0], 3); + emulate_shuffle(&expected, &src, &zeros, 8, 3, 0xff); + check_equal_nd(&res, &expected, 8, "_mm256_shuffle_epi32", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.ymmi[0] = _mm256_mask_shuffle_epi32(passthru.ymmi[0], k8, src.ymmi[0], 3); + emulate_shuffle(&expected, &src, &passthru, 8, 3, k8); + check_equal_nd(&res, &expected, 8, "_mm256_mask_shuffle_epi32", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.ymmi[0] = _mm256_maskz_shuffle_epi32(k8, src.ymmi[0], 3); + emulate_shuffle(&expected, &src, &zeros, 8, 3, k8); + check_equal_nd(&res, &expected, 8, "_mm256_maskz_shuffle_epi32", __LINE__); + soft_counts32_update(); + + // checking mm shuffle + soft_counts32_update(); + res.xmmi[0] = _mm_shuffle_epi32(src.xmmi[0], 3); + emulate_shuffle(&expected, &src, &zeros, 4, 3, 0xf); + check_equal_nd(&res, &expected, 4, "_mm_shuffle_epi32", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.xmmi[0] = _mm_mask_shuffle_epi32(passthru.xmmi[0], k8, src.xmmi[0], 3); + emulate_shuffle(&expected, &src, &passthru, 4, 3, k8); + check_equal_nd(&res, &expected, 4, "_mm_mask_shuffle_epi32", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.xmmi[0] = _mm_maskz_shuffle_epi32(k8, src.xmmi[0], 3); + emulate_shuffle(&expected, &src, &zeros, 4, 3, k8); + check_equal_nd(&res, &expected, 4, "_mm_maskz_shuffle_epi32", __LINE__); + soft_counts32_update(); +} + +void NOINLINE do_shufflehi_epi16() { + volatile V512 res; + V512 expected; + + // checking mm512 shufflehi + soft_counts32_update(); + res.zmmi = _mm512_shufflehi_epi16(src.zmmi, 3); + emulate_shuffle16(&expected, &src, &src, 32, 3, 0xffffffff, 0); + check_equal_nd(&res, &expected, 16, "_mm512_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.zmmi = _mm512_mask_shufflehi_epi16(passthru.zmmi, k16, src.zmmi, 3); + emulate_shuffle16(&expected, &src, &passthru, 32, 3, k16, 0); + check_equal_nd(&res, &expected, 16, "_mm512_mask_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_shufflehi_epi16(k16, src.zmmi, 3); + emulate_shuffle16(&expected, &src, &zeros, 32, 3, k16, 0); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + // checking mm256 shufflehi + soft_counts32_update(); + res.ymmi[0] = _mm256_shufflehi_epi16(src.ymmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 16, 3, 0xffff, 0); + check_equal_nd(&res, &expected, 8, "_mm256_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.ymmi[0] = _mm256_mask_shufflehi_epi16(passthru.ymmi[0], k16, src.ymmi[0], 3); + emulate_shuffle16(&expected, &src, &passthru, 16, 3, k16, 0); + check_equal_nd(&res, &expected, 8, "_mm256_mask_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.ymmi[0] = 
_mm256_maskz_shufflehi_epi16(k16, src.ymmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 16, 3, k16, 0); + check_equal_nd(&res, &expected, 8, "_mm256_maskz_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + // checking mm shufflehi + soft_counts32_update(); + res.xmmi[0] = _mm_shufflehi_epi16(src.xmmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 8, 3, 0xff, 0); + check_equal_nd(&res, &expected, 4, "_mm_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.xmmi[0] = _mm_mask_shufflehi_epi16(passthru.xmmi[0], k8, src.xmmi[0], 3); + emulate_shuffle16(&expected, &src, &passthru, 8, 3, k8, 0); + check_equal_nd(&res, &expected, 4, "_mm_mask_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.xmmi[0] = _mm_maskz_shufflehi_epi16(k8, src.xmmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 8, 3, k8, 0); + check_equal_nd(&res, &expected, 4, "_mm_maskz_shufflehi_epi16", __LINE__); + soft_counts32_update(); +} + +void NOINLINE do_shufflelo_epi16() { + volatile V512 res; + V512 expected; + + // checking mm512 shufflelo + soft_counts32_update(); + res.zmmi = _mm512_shufflelo_epi16(src.zmmi, 3); + emulate_shuffle16(&expected, &src, &src, 32, 3, 0xffffffff, 1); + check_equal_nd(&res, &expected, 16, "_mm512_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.zmmi = _mm512_mask_shufflelo_epi16(passthru.zmmi, k16, src.zmmi, 3); + emulate_shuffle16(&expected, &src, &passthru, 32, 3, k16, 1); + check_equal_nd(&res, &expected, 16, "_mm512_mask_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_shufflelo_epi16(k16, src.zmmi, 3); + emulate_shuffle16(&expected, &src, &zeros, 32, 3, k16, 1); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + // checking mm256 shufflelo + soft_counts32_update(); + res.ymmi[0] = _mm256_shufflelo_epi16(src.ymmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 16, 3, 0xffff, 1); + check_equal_nd(&res, &expected, 8, "_mm256_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.ymmi[0] = _mm256_mask_shufflelo_epi16(passthru.ymmi[0], k16, src.ymmi[0], 3); + emulate_shuffle16(&expected, &src, &passthru, 16, 3, k16, 1); + check_equal_nd(&res, &expected, 8, "_mm256_mask_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.ymmi[0] = _mm256_maskz_shufflelo_epi16(k16, src.ymmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 16, 3, k16, 1); + check_equal_nd(&res, &expected, 8, "_mm256_maskz_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + // checking mm shufflelo + soft_counts32_update(); + res.xmmi[0] = _mm_shufflelo_epi16(src.xmmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 8, 3, 0xff, 1); + check_equal_nd(&res, &expected, 4, "_mm_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.xmmi[0] = _mm_mask_shufflelo_epi16(passthru.xmmi[0], k8, src.xmmi[0], 3); + emulate_shuffle16(&expected, &src, &passthru, 8, 3, k8, 1); + check_equal_nd(&res, &expected, 4, "_mm_mask_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.xmmi[0] = _mm_maskz_shufflelo_epi16(k8, src.xmmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 8, 3, k8, 1); + check_equal_nd(&res, &expected, 4, "_mm_maskz_shufflelo_epi16", __LINE__); + soft_counts32_update(); +} + +int main(int argc, char *argv[]) { + 
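/* init() populates src, passthru and zeros; every do_* routine below depends on them. */ + 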
init(); + + do_shuffle_epi32(); + do_shufflelo_epi16(); + do_shufflehi_epi16(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/shuffles.reference_output b/SingleSource/UnitTests/Vector/AVX512BWVL/shuffles.reference_output new file mode 100644 index 00000000..bfae62d0 --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512BWVL/shuffles.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/unpack_msasm.c b/SingleSource/UnitTests/Vector/AVX512BWVL/unpack_msasm.c new file mode 100644 index 00000000..b0bf02aa --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512BWVL/unpack_msasm.c @@ -0,0 +1,802 @@ +/* + * Here we check for punpck* and unpck* intrinsics using masm. + */ + +#include "m512_test_util.h" +#include <stdio.h> +#include <string.h> + +int verbose = 0; + +__m512i i1; +__m512i i2; +__m512i i3; +__m512i i4; +__m512i i5; + +volatile int vol = 0; /* To prevent optimizations */ + +void NOINLINE display_xmm_pi(const void *p, const char *banner) { + int i; + V512 *v = (V512 *)p; + + if (banner) { + printf("%s", banner); + } + + for (i = 0; i < 4; i++) { + printf(" 0x%0.8x", v->s32[3 - i]); + } + printf("\n"); +} + +void NOINLINE init() { + int i; + V512 *pi1 = (V512 *)&i1; + V512 *pi2 = (V512 *)&i2; + V512 *pi3 = (V512 *)&i3; + + for (i = 0; i < 64; i++) { + pi1->u8[i] = 17 + ((i & 1) ? 1 : -1) * i + vol; + + pi2->u8[i] = 100 + ((i & 3) == 3 ? 1 : -1) * i + vol; + + pi3->u8[i] = 400 + ((i & 1) ? -1 : 1) * i + vol; + } +} + +#define check_equal_xmm(vgot, vexpected, banner) \ + check_equal_nd(vgot, vexpected, 4, banner, __LINE__) +#define check_equal_ymm(vgot, vexpected, banner) \ + check_equal_nd(vgot, vexpected, 8, banner, __LINE__) +#define check_equal_zmm(vgot, vexpected, banner) \ + check_equal_nd(vgot, vexpected, 16, banner, __LINE__) + +void NOINLINE emulate_palignr(void *presult, const void *p1, const void *p2, + int shift, int num_lanes) { + int i, lane; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + + if (shift < 0 || shift > 31) { + /* Result is zero. */ + for (lane = 0; lane < num_lanes; lane++) { + for (i = 0; i < 4; i++) { + result->u32[4 * lane + i] = 0; + } + } + + return; + } + + for (lane = 0; lane < num_lanes; lane++) { + for (i = 0; i < (16 - shift); i++) { + result->u8[16 * lane + i] = v2->u8[16 * lane + i + shift]; + } + for (; i < 16 && (i + shift < 32); i++) { + result->u8[16 * lane + i] = v1->u8[16 * lane + i - (16 - shift)]; + } + for (; i < 16; i++) { + result->u8[16 * lane + i] = 0; + } + } +} + +void NOINLINE emulate_punpck_bw(void *presult, const void *p1, const void *p2, + int num_lanes, int high) { + int i, lane; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + int offset = high ? 
+
+  for (lane = 0; lane < num_lanes; lane++) {
+    /* Each 128-bit lane interleaves 8 bytes from each source. */
+    for (i = 0; i < 8; i++) {
+      result->u8[16 * lane + 2 * i] = v1->u8[16 * lane + i + offset];
+      result->u8[16 * lane + 2 * i + 1] = v2->u8[16 * lane + i + offset];
+    }
+  }
+}
+
+#define emulate_punpckhbw(presult, p1, p2, num_lanes)                         \
+  emulate_punpck_bw(presult, p1, p2, num_lanes, 1)
+#define emulate_punpcklbw(presult, p1, p2, num_lanes)                         \
+  emulate_punpck_bw(presult, p1, p2, num_lanes, 0)
+
+void NOINLINE emulate_punpck_wd(void *presult, const void *p1, const void *p2,
+                                int num_lanes, int high) {
+  int i, lane;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  int offset = high ? 4 : 0;
+
+  for (lane = 0; lane < num_lanes; lane++) {
+    /* Each 128-bit lane interleaves 4 words from each source. */
+    for (i = 0; i < 4; i++) {
+      result->u16[8 * lane + 2 * i] = v1->u16[8 * lane + i + offset];
+      result->u16[8 * lane + 2 * i + 1] = v2->u16[8 * lane + i + offset];
+    }
+  }
+}
+
+#define emulate_punpckhwd(presult, p1, p2, num_lanes)                         \
+  emulate_punpck_wd(presult, p1, p2, num_lanes, 1)
+#define emulate_punpcklwd(presult, p1, p2, num_lanes)                         \
+  emulate_punpck_wd(presult, p1, p2, num_lanes, 0)
+
+void NOINLINE emulate_punpck_dq(void *presult, const void *p1, const void *p2,
+                                int num_lanes, int high) {
+  int i, lane;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  int offset = high ? 2 : 0;
+
+  for (lane = 0; lane < num_lanes; lane++) {
+    /* Each 128-bit lane interleaves 2 dwords from each source. */
+    for (i = 0; i < 2; i++) {
+      result->u32[4 * lane + 2 * i] = v1->u32[4 * lane + i + offset];
+      result->u32[4 * lane + 2 * i + 1] = v2->u32[4 * lane + i + offset];
+    }
+  }
+}
+
+#define emulate_punpckhdq(presult, p1, p2, num_lanes)                         \
+  emulate_punpck_dq(presult, p1, p2, num_lanes, 1)
+#define emulate_punpckldq(presult, p1, p2, num_lanes)                         \
+  emulate_punpck_dq(presult, p1, p2, num_lanes, 0)
+
+void NOINLINE emulate_punpck_qdq(void *presult, const void *p1, const void *p2,
+                                 int num_lanes, int high) {
+  int i, lane;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  int offset = high ? 1 : 0;
+
+  for (lane = 0; lane < num_lanes; lane++) {
+    /* Each 128-bit lane interleaves 1 qword from each source. */
+    for (i = 0; i < 1; i++) {
+      result->u64[2 * lane + 2 * i] = v1->u64[2 * lane + i + offset];
+      result->u64[2 * lane + 2 * i + 1] = v2->u64[2 * lane + i + offset];
+    }
+  }
+}
+
+#define emulate_punpckhqdq(presult, p1, p2, num_lanes)                        \
+  emulate_punpck_qdq(presult, p1, p2, num_lanes, 1)
+#define emulate_punpcklqdq(presult, p1, p2, num_lanes)                        \
+  emulate_punpck_qdq(presult, p1, p2, num_lanes, 0)
+
+void NOINLINE do_punpck_bw() {
+  void *p1 = &i1;
+  void *p2 = &i2;
+  void *p3 = &i3;
+
+#define DO_XMM_REG_REG(opcode, reg1, reg2)                                    \
+  __asm {                                                                     \
+    __asm mov FULL_IREG(ax), [p1]                                             \
+    __asm movaps reg2, [FULL_IREG(ax)]                                        \
+    __asm mov FULL_IREG(ax), [p2]                                             \
+    __asm movaps reg1, [FULL_IREG(ax)]                                        \
+    __asm opcode reg1, reg2                                                   \
+    __asm mov FULL_IREG(ax), [p3]                                             \
+    __asm movaps [FULL_IREG(ax)], reg1                                        \
+  }
+
+#define DO_V_REG_REG_REG(opcode, reg1, reg2, reg3)                            \
+  __asm {                                                                     \
+    __asm mov FULL_IREG(ax), [p1]                                             \
+    __asm vmovaps reg3, [FULL_IREG(ax)]                                       \
+    __asm mov FULL_IREG(ax), [p2]                                             \
+    __asm vmovaps reg2, [FULL_IREG(ax)]                                       \
+    __asm opcode reg1, reg2, reg3                                             \
+    __asm mov FULL_IREG(ax), [p3]                                             \
+    __asm vmovaps [FULL_IREG(ax)], reg1                                       \
+  }
+
+  DO_XMM_REG_REG(punpckhbw, xmm6, xmm3)
+  emulate_punpckhbw(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "punpckhbw xmm6, xmm3");
+  if (verbose) {
+    printf("punpckhbw(i2, i1)\n");
+    display_xmm_pi(&i1, "i1: ");
+    display_xmm_pi(&i2, "i2: ");
+    display_xmm_pi(&i3, "got: ");
+    display_xmm_pi(&i4, "exp: ");
+  }
+
+  DO_XMM_REG_REG(punpcklbw, xmm2, xmm7)
+  emulate_punpcklbw(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "punpcklbw xmm2, xmm7");
+
+  DO_V_REG_REG_REG(vpunpckhbw, xmm1, xmm6, xmm5)
+  emulate_punpckhbw(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "vpunpckhbw xmm1, xmm6, xmm5");
+
+  DO_V_REG_REG_REG(vpunpckhbw, ymm4, ymm7, ymm5)
+  emulate_punpckhbw(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vpunpckhbw ymm4, ymm7, ymm5");
+
+  DO_V_REG_REG_REG(vpunpcklbw, ymm7, ymm3, ymm1)
+  emulate_punpcklbw(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vpunpcklbw ymm7, ymm3, ymm1");
+
+  DO_V_REG_REG_REG(vpunpckhbw, zmm4, zmm7, zmm5)
+  emulate_punpckhbw(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhbw zmm4, zmm7, zmm5");
+
+  DO_V_REG_REG_REG(vpunpcklbw, zmm7, zmm3, zmm1)
+  emulate_punpcklbw(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpcklbw zmm7, zmm3, zmm1");
+
+#if defined(__x86_64) || defined(_M_X64)
+
+  DO_V_REG_REG_REG(vpunpckhbw, xmm19, xmm6, xmm5)
+  emulate_punpckhbw(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "vpunpckhbw xmm19, xmm6, xmm5");
+
+  DO_V_REG_REG_REG(vpunpckhbw, xmm6, xmm19, xmm5)
+  emulate_punpckhbw(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "vpunpckhbw xmm6, xmm19, xmm5");
+
+  DO_V_REG_REG_REG(vpunpckhbw, xmm6, xmm5, xmm19)
+  emulate_punpckhbw(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "vpunpckhbw xmm6, xmm5, xmm19");
+
+  DO_V_REG_REG_REG(vpunpckhbw, zmm19, zmm6, zmm5)
+  emulate_punpckhbw(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhbw zmm19, zmm6, zmm5");
+
+  DO_V_REG_REG_REG(vpunpckhbw, zmm6, zmm19, zmm5)
+  emulate_punpckhbw(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhbw zmm6, zmm19, zmm5");
+
+  DO_V_REG_REG_REG(vpunpckhbw, zmm6, zmm5, zmm19)
+  emulate_punpckhbw(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhbw zmm6, zmm5, zmm19");
+
+#endif /* defined(__x86_64) || defined(_M_X64) */
+}
+
+void NOINLINE do_punpck_wd() {
+  void *p1 = &i1;
+  void *p2 = &i2;
+  void *p3 = &i3;
+
+  DO_XMM_REG_REG(punpckhwd, xmm6, xmm3)
+  emulate_punpckhwd(&i4, &i2, &i1, 1);
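+  /* The DO_* macro stores the hardware result in i3; the emulation writes */
+  /* the expected elements to i4, so the check compares the two buffers. */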
+  check_equal_xmm(&i4, &i3, "punpckhwd xmm6, xmm3");
+  if (verbose) {
+    printf("punpckhwd(i2, i1)\n");
+    display_xmm_pi(&i1, "i1: ");
+    display_xmm_pi(&i2, "i2: ");
+    display_xmm_pi(&i3, "got: ");
+    display_xmm_pi(&i4, "exp: ");
+  }
+
+  DO_XMM_REG_REG(punpcklwd, xmm2, xmm7)
+  emulate_punpcklwd(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "punpcklwd xmm2, xmm7");
+
+  DO_V_REG_REG_REG(vpunpckhwd, xmm1, xmm6, xmm5)
+  emulate_punpckhwd(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "vpunpckhwd xmm1, xmm6, xmm5");
+
+  DO_V_REG_REG_REG(vpunpckhwd, ymm4, ymm7, ymm5)
+  emulate_punpckhwd(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vpunpckhwd ymm4, ymm7, ymm5");
+
+  DO_V_REG_REG_REG(vpunpcklwd, ymm7, ymm3, ymm1)
+  emulate_punpcklwd(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vpunpcklwd ymm7, ymm3, ymm1");
+
+  DO_V_REG_REG_REG(vpunpckhwd, zmm4, zmm7, zmm5)
+  emulate_punpckhwd(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhwd zmm4, zmm7, zmm5");
+
+  DO_V_REG_REG_REG(vpunpcklwd, zmm7, zmm3, zmm1)
+  emulate_punpcklwd(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpcklwd zmm7, zmm3, zmm1");
+
+#if defined(__x86_64) || defined(_M_X64)
+
+  DO_V_REG_REG_REG(vpunpckhwd, ymm19, ymm6, ymm5)
+  emulate_punpckhwd(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vpunpckhwd ymm19, ymm6, ymm5");
+
+  DO_V_REG_REG_REG(vpunpckhwd, ymm6, ymm19, ymm5)
+  emulate_punpckhwd(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vpunpckhwd ymm6, ymm19, ymm5");
+
+  DO_V_REG_REG_REG(vpunpckhwd, ymm6, ymm5, ymm19)
+  emulate_punpckhwd(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vpunpckhwd ymm6, ymm5, ymm19");
+
+  DO_V_REG_REG_REG(vpunpckhwd, zmm19, zmm6, zmm5)
+  emulate_punpckhwd(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhwd zmm19, zmm6, zmm5");
+
+  DO_V_REG_REG_REG(vpunpckhwd, zmm6, zmm19, zmm5)
+  emulate_punpckhwd(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhwd zmm6, zmm19, zmm5");
+
+  DO_V_REG_REG_REG(vpunpckhwd, zmm6, zmm5, zmm19)
+  emulate_punpckhwd(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhwd zmm6, zmm5, zmm19");
+
+  DO_V_REG_REG_REG(vpunpckhwd, zmm26, zmm6, zmm5)
+  emulate_punpckhwd(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhwd zmm26, zmm6, zmm5");
+
+  DO_V_REG_REG_REG(vpunpckhwd, zmm6, zmm26, zmm5)
+  emulate_punpckhwd(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhwd zmm6, zmm26, zmm5");
+
+  DO_V_REG_REG_REG(vpunpckhwd, zmm6, zmm5, zmm26)
+  emulate_punpckhwd(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhwd zmm6, zmm5, zmm26");
+
+#endif /* defined(__x86_64) || defined(_M_X64) */
+}
+
+/* Like DO_V_REG_REG_REG, but applies write mask kreg, which kxnorw sets */
+/* to all ones. */
+#define DO_Z_REG_MASK_REG_REG(opcode, reg1, kreg, reg2, reg3)                 \
+  __asm {                                                                     \
+    __asm mov FULL_IREG(ax), [p1]                                             \
+    __asm vmovaps reg3, [FULL_IREG(ax)]                                       \
+    __asm mov FULL_IREG(ax), [p2]                                             \
+    __asm vmovaps reg2, [FULL_IREG(ax)]                                       \
+    __asm kxnorw kreg, kreg, kreg                                             \
+    __asm opcode reg1{kreg}, reg2, reg3                                       \
+    __asm mov FULL_IREG(ax), [p3]                                             \
+    __asm vmovaps [FULL_IREG(ax)], reg1                                       \
+  }
+
+void NOINLINE do_punpck_dq() {
+  void *p1 = &i1;
+  void *p2 = &i2;
+  void *p3 = &i3;
+
+  DO_XMM_REG_REG(punpckhdq, xmm6, xmm3)
+  emulate_punpckhdq(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "punpckhdq xmm6, xmm3");
+  if (verbose) {
+    printf("punpckhdq(i2, i1)\n");
+    display_xmm_pi(&i1, "i1: ");
+    display_xmm_pi(&i2, "i2: ");
+    display_xmm_pi(&i3, "got: ");
+    display_xmm_pi(&i4, "exp: ");
+  }
+
+  DO_XMM_REG_REG(punpckldq, xmm2, xmm7)
+  emulate_punpckldq(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "punpckldq xmm2, xmm7");
+
+  DO_V_REG_REG_REG(vpunpckhdq, xmm1, xmm6, xmm5)
+  emulate_punpckhdq(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "vpunpckhdq xmm1, xmm6, xmm5");
+
+  DO_V_REG_REG_REG(vpunpckhdq, ymm4, ymm7, ymm5)
+  emulate_punpckhdq(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vpunpckhdq ymm4, ymm7, ymm5");
+
+  DO_V_REG_REG_REG(vpunpckldq, ymm7, ymm3, ymm1)
+  emulate_punpckldq(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vpunpckldq ymm7, ymm3, ymm1");
+
+  DO_V_REG_REG_REG(vpunpckhdq, zmm4, zmm7, zmm5)
+  emulate_punpckhdq(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhdq zmm4, zmm7, zmm5");
+
+  DO_V_REG_REG_REG(vpunpckldq, zmm7, zmm3, zmm1)
+  emulate_punpckldq(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckldq zmm7, zmm3, zmm1");
+
+#if defined(__x86_64) || defined(_M_X64)
+
+  DO_V_REG_REG_REG(vpunpckhdq, xmm23, xmm7, xmm5)
+  emulate_punpckhdq(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "vpunpckhdq xmm23, xmm7, xmm5");
+
+  DO_V_REG_REG_REG(vpunpckhdq, xmm7, xmm23, xmm5)
+  emulate_punpckhdq(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "vpunpckhdq xmm7, xmm23, xmm5");
+
+  DO_V_REG_REG_REG(vpunpckhdq, xmm7, xmm5, xmm23)
+  emulate_punpckhdq(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "vpunpckhdq xmm7, xmm5, xmm23");
+
+  DO_V_REG_REG_REG(vpunpckhdq, ymm23, ymm16, ymm5)
+  emulate_punpckhdq(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vpunpckhdq ymm23, ymm16, ymm5");
+
+  DO_V_REG_REG_REG(vpunpckhdq, zmm23, zmm7, zmm5)
+  emulate_punpckhdq(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhdq zmm23, zmm7, zmm5");
+
+  DO_V_REG_REG_REG(vpunpckhdq, zmm7, zmm23, zmm5)
+  emulate_punpckhdq(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhdq zmm7, zmm23, zmm5");
+
+  DO_V_REG_REG_REG(vpunpckhdq, zmm7, zmm5, zmm23)
+  emulate_punpckhdq(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhdq zmm7, zmm5, zmm23");
+
+  DO_Z_REG_MASK_REG_REG(vpunpckhdq, zmm23, k4, zmm7, zmm5)
+  emulate_punpckhdq(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhdq zmm23{k4}, zmm7, zmm5");
+
+#endif /* defined(__x86_64) || defined(_M_X64) */
+}
+
+void NOINLINE do_punpck_qdq() {
+  void *p1 = &i1;
+  void *p2 = &i2;
+  void *p3 = &i3;
+
+  DO_XMM_REG_REG(punpckhqdq, xmm6, xmm3)
+  emulate_punpckhqdq(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "punpckhqdq xmm6, xmm3");
+  if (verbose) {
+    printf("punpckhqdq(i2, i1)\n");
+    display_xmm_pi(&i1, "i1: ");
+    display_xmm_pi(&i2, "i2: ");
+    display_xmm_pi(&i3, "got: ");
+    display_xmm_pi(&i4, "exp: ");
+  }
+
+  DO_XMM_REG_REG(punpcklqdq, xmm2, xmm7)
+  emulate_punpcklqdq(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "punpcklqdq xmm2, xmm7");
+
+  DO_V_REG_REG_REG(vpunpckhqdq, xmm1, xmm6, xmm5)
+  emulate_punpckhqdq(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "vpunpckhqdq xmm1, xmm6, xmm5");
+
+  DO_V_REG_REG_REG(vpunpckhqdq, ymm4, ymm7, ymm5)
+  emulate_punpckhqdq(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vpunpckhqdq ymm4, ymm7, ymm5");
+
+  DO_V_REG_REG_REG(vpunpcklqdq, ymm7, ymm3, ymm1)
+  emulate_punpcklqdq(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vpunpcklqdq ymm7, ymm3, ymm1");
+
+  DO_V_REG_REG_REG(vpunpckhqdq, zmm4, zmm7, zmm5)
+  emulate_punpckhqdq(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpckhqdq zmm4, zmm7, zmm5");
+
+  DO_V_REG_REG_REG(vpunpcklqdq, zmm7, zmm3, zmm1)
+  emulate_punpcklqdq(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpcklqdq zmm7, zmm3, zmm1");
+
+#if defined(__x86_64) || defined(_M_X64)
+
+  DO_Z_REG_MASK_REG_REG(vpunpcklqdq, zmm31, k6, zmm29, zmm27)
+  emulate_punpcklqdq(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vpunpcklqdq zmm31{k6}, zmm29, zmm27");
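+
+  /* k6 was set to all ones by the macro's kxnorw, so the masked result */
+  /* should be identical to the unmasked emulation. */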
+
+#endif /* defined(__x86_64) || defined(_M_X64) */
+}
+
+void NOINLINE do_punpck_ps() {
+  void *p1 = &i1;
+  void *p2 = &i2;
+  void *p3 = &i3;
+
+  DO_XMM_REG_REG(unpckhps, xmm6, xmm3)
+  emulate_punpckhdq(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "unpckhps xmm6, xmm3");
+  if (verbose) {
+    printf("unpckhps(i2, i1)\n");
+    display_xmm_pi(&i1, "i1: ");
+    display_xmm_pi(&i2, "i2: ");
+    display_xmm_pi(&i3, "got: ");
+    display_xmm_pi(&i4, "exp: ");
+  }
+
+  DO_XMM_REG_REG(unpcklps, xmm2, xmm7)
+  emulate_punpckldq(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "unpcklps xmm2, xmm7");
+
+  DO_V_REG_REG_REG(vunpckhps, xmm1, xmm6, xmm5)
+  emulate_punpckhdq(&i4, &i2, &i1, 1);
+  check_equal_xmm(&i4, &i3, "vunpckhps xmm1, xmm6, xmm5");
+
+  DO_V_REG_REG_REG(vunpckhps, ymm4, ymm7, ymm5)
+  emulate_punpckhdq(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vunpckhps ymm4, ymm7, ymm5");
+
+  DO_V_REG_REG_REG(vunpcklps, ymm7, ymm3, ymm1)
+  emulate_punpckldq(&i4, &i2, &i1, 2);
+  check_equal_ymm(&i4, &i3, "vunpcklps ymm7, ymm3, ymm1");
+
+  DO_V_REG_REG_REG(vunpckhps, zmm4, zmm7, zmm5)
+  emulate_punpckhdq(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vunpckhps zmm4, zmm7, zmm5");
+
+  DO_V_REG_REG_REG(vunpcklps, zmm7, zmm3, zmm1)
+  emulate_punpckldq(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vunpcklps zmm7, zmm3, zmm1");
+
+#if defined(__x86_64) || defined(_M_X64)
+
+  DO_Z_REG_MASK_REG_REG(vunpcklps, zmm30, k5, zmm28, zmm26)
+  emulate_punpckldq(&i4, &i2, &i1, 4);
+  check_equal_zmm(&i4, &i3, "vunpcklps zmm30{k5}, zmm28, zmm26");
+
+#endif /* defined(__x86_64) || defined(_M_X64) */
+}
+
+void NOINLINE do_palignr() {
+  void *p1 = &i1;
+  void *p2 = &i2;
+  void *p3 = &i3;
+
+#define DO_XMM_REG_REG_IMM(opcode, reg1, reg2, imm)                           \
+  __asm {                                                                     \
+    __asm mov FULL_IREG(ax), [p1]                                             \
+    __asm movaps reg2, [FULL_IREG(ax)]                                        \
+    __asm mov FULL_IREG(ax), [p2]                                             \
+    __asm movaps reg1, [FULL_IREG(ax)]                                        \
+    __asm opcode reg1, reg2, imm                                              \
+    __asm mov FULL_IREG(ax), [p3]                                             \
+    __asm movaps [FULL_IREG(ax)], reg1                                        \
+  }
+
+#define DO_V_REG_REG_REG_IMM(opcode, reg1, reg2, reg3, imm)                   \
+  __asm {                                                                     \
+    __asm mov FULL_IREG(ax), [p1]                                             \
+    __asm vmovaps reg3, [FULL_IREG(ax)]                                       \
+    __asm mov FULL_IREG(ax), [p2]                                             \
+    __asm vmovaps reg2, [FULL_IREG(ax)]                                       \
+    __asm opcode reg1, reg2, reg3, imm                                        \
+    __asm mov FULL_IREG(ax), [p3]                                             \
+    __asm vmovaps [FULL_IREG(ax)], reg1                                       \
+  }
+
+  DO_XMM_REG_REG_IMM(palignr, xmm6, xmm3, 19)
+  emulate_palignr(&i4, &i2, &i1, 19, 1);
+  check_equal_xmm(&i4, &i3, "palignr xmm6, xmm3, 19");
+  if (verbose) {
+    printf("palignr(i2, i1)\n");
+    display_xmm_pi(&i1, "i1: ");
+    display_xmm_pi(&i2, "i2: ");
+    display_xmm_pi(&i3, "got: ");
+    display_xmm_pi(&i4, "exp: ");
+  }
+
+  DO_V_REG_REG_REG_IMM(vpalignr, xmm6, xmm7, xmm3, 19)
+  emulate_palignr(&i4, &i2, &i1, 19, 1);
+  check_equal_xmm(&i4, &i3, "vpalignr xmm6, xmm7, xmm3, 19");
+
+  DO_V_REG_REG_REG_IMM(vpalignr, ymm6, ymm7, ymm3, 19)
+  emulate_palignr(&i4, &i2, &i1, 19, 2);
+  check_equal_ymm(&i4, &i3, "vpalignr ymm6, ymm7, ymm3, 19");
+
+  DO_V_REG_REG_REG_IMM(vpalignr, zmm4, zmm7, zmm5, 12)
+  emulate_palignr(&i4, &i2, &i1, 12, 4);
+  check_equal_zmm(&i4, &i3, "vpalignr zmm4, zmm7, zmm5, 12");
+
+#if defined(__x86_64) || defined(_M_X64)
+
+  DO_V_REG_REG_REG_IMM(vpalignr, ymm27, ymm5, ymm3, 18)
+  emulate_palignr(&i4, &i2, &i1, 18, 2);
+  check_equal_ymm(&i4, &i3, "vpalignr ymm27, ymm5, ymm3, 18");
+
+  DO_V_REG_REG_REG_IMM(vpalignr, zmm3, zmm5, zmm27, 9)
+  emulate_palignr(&i4, &i2, &i1, 9, 4);
+  check_equal_zmm(&i4, &i3, "vpalignr zmm3, zmm5, zmm27, 9");
+
+  DO_V_REG_REG_REG_IMM(vpalignr, zmm27, zmm5, zmm3, 22)
+  emulate_palignr(&i4, &i2, &i1, 22, 4);
+  check_equal_zmm(&i4, &i3, "vpalignr zmm27, zmm5, zmm3, 22");
+
+  DO_V_REG_REG_REG_IMM(vpalignr, zmm5, zmm27, zmm3, 13)
+  emulate_palignr(&i4, &i2, &i1, 13, 4);
+  check_equal_zmm(&i4, &i3, "vpalignr zmm5, zmm27, zmm3, 13");
+
+#endif /* defined(__x86_64) || defined(_M_X64) */
+
+  i3 = _mm512_alignr_epi8(i2, i1, 6);
+  emulate_palignr(&i4, &i2, &i1, 6, 4);
+  check_equal_zmm(&i4, &i3, "_mm512_alignr_epi8");
+}
+
+void NOINLINE compare_reg_reg_vs_reg_mem() {
+  /* Check that zmm-memory operand forms are parsed and encoded properly. */
+
+  void *p1 = &i1;
+  void *p2 = &i2;
+  void *p3 = &i3;
+  void *p4 = &i4;
+  void *p5 = &i5;
+
+  __asm {
+    __asm mov FULL_IREG(ax), [p1]
+    __asm vmovaps zmm1, [FULL_IREG(ax)]
+    __asm mov FULL_IREG(ax), [p2]
+    __asm vmovaps zmm2, [FULL_IREG(ax)]
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vmovaps zmm3, [FULL_IREG(ax)]
+
+    __asm vpxord zmm6, zmm6, zmm6
+    __asm vpxord zmm7, zmm7, zmm7
+
+    /* vpunpckhbw */
+
+    __asm vmovaps zmm4, zmm1
+    __asm vmovaps zmm5, zmm1
+    __asm vpunpckhbw zmm4, zmm2, zmm3
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vpunpckhbw zmm5, zmm2, [FULL_IREG(ax)]
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* vpunpcklbw */
+
+    __asm vmovaps zmm4, zmm1
+    __asm vmovaps zmm5, zmm1
+    __asm vpunpcklbw zmm4, zmm2, zmm3
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vpunpcklbw zmm5, zmm2, [FULL_IREG(ax)]
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* vpunpckhwd */
+
+    __asm vmovaps zmm4, zmm1
+    __asm vmovaps zmm5, zmm1
+    __asm vpunpckhwd zmm4, zmm2, zmm3
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vpunpckhwd zmm5, zmm2, [FULL_IREG(ax)]
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* vpunpcklwd */
+
+    __asm vmovaps zmm4, zmm1
+    __asm vmovaps zmm5, zmm1
+    __asm vpunpcklwd zmm4, zmm2, zmm3
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vpunpcklwd zmm5, zmm2, [FULL_IREG(ax)]
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* vpunpckhdq */
+
+    __asm vmovaps zmm4, zmm1
+    __asm vmovaps zmm5, zmm1
+    __asm vpunpckhdq zmm4, zmm2, zmm3
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vpunpckhdq zmm5, zmm2, [FULL_IREG(ax)]
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* Broadcast memory operand form; encoding check only. */
+
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vpunpcklqdq zmm5, zmm2, [FULL_IREG(ax)]{1to8}
+
+    /* vpunpckldq */
+
+    __asm vmovaps zmm4, zmm1
+    __asm vmovaps zmm5, zmm1
+    __asm vpunpckldq zmm4, zmm2, zmm3
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vpunpckldq zmm5, zmm2, [FULL_IREG(ax)]
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* vunpckhps */
+
+    __asm vmovaps zmm4, zmm1
+    __asm vmovaps zmm5, zmm1
+    __asm vunpckhps zmm4, zmm2, zmm3
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vunpckhps zmm5, zmm2, [FULL_IREG(ax)]
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* vunpcklps */
+
+    __asm vmovaps zmm4, zmm1
+    __asm vmovaps zmm5, zmm1
+    __asm vunpcklps zmm4, zmm2, zmm3
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vunpcklps zmm5, zmm2, [FULL_IREG(ax)]
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* vunpckhpd */
+
+    __asm vmovaps zmm4, zmm1
+    __asm vmovaps zmm5, zmm1
+    __asm vunpckhpd zmm4, zmm2, zmm3
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vunpckhpd zmm5, zmm2, [FULL_IREG(ax)]
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* vunpcklpd */
+
+    __asm vmovaps zmm4, zmm1
+    __asm vmovaps zmm5, zmm1
+    __asm vunpcklpd zmm4, zmm2, zmm3
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vunpcklpd zmm5, zmm2, [FULL_IREG(ax)]
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* vpermilps reg */
+
+    __asm vmovaps zmm4, zmm1
+    __asm vmovaps zmm5, zmm1
+    __asm vpermilps zmm4, zmm2, zmm3
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vpermilps zmm5, zmm2, [FULL_IREG(ax)]
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* vpermilps imm */
+
+    __asm vmovaps zmm4, zmm1
+    __asm vmovaps zmm5, zmm1
+    __asm vpermilps zmm4, zmm2, 0x35
+    __asm mov FULL_IREG(ax), [p2]
+    __asm vpermilps zmm5, [FULL_IREG(ax)], 0x35
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* vshufps */
+
+    /* __asm mov FULL_IREG(ax), [p3] */
+    /* __asm vbroadcastf32x4 zmm5, [FULL_IREG(ax)] */
+    /* __asm vshufps zmm4, zmm2, zmm5, 0x65 */
+    __asm vshufps zmm4, zmm2, zmm3, 0x65
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vshufps zmm5, zmm2, [FULL_IREG(ax)], 0x65
+    /* __asm vshufps zmm5, zmm2, [FULL_IREG(ax)]{4to16}, 0x65 */
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* vpalignr */
+
+    __asm vpalignr zmm4, zmm2, zmm3, 0xd
+    __asm mov FULL_IREG(ax), [p3]
+    __asm vpalignr zmm5, zmm2, [FULL_IREG(ax)], 0xd
+    __asm vpsubd zmm6, zmm4, zmm5
+    __asm vpord zmm7, zmm7, zmm6
+
+    /* Cumulative difference from zero is in zmm7, save this in i5. */
+
+    __asm mov FULL_IREG(ax), [p5]
+    __asm vmovaps [FULL_IREG(ax)], zmm7
+
+    /* Expected difference is zero, put zero in i4. */
+
+    __asm vpxord zmm7, zmm7, zmm7
+    __asm mov FULL_IREG(ax), [p4]
+    __asm vmovaps [FULL_IREG(ax)], zmm7
+  }
+
+  check_equal_zmm(&i5, &i4, "various 512-bit reg-reg vs reg-mem");
+}
+
+int main(int argc, char *argv[]) {
+  if (argc > 1 && argv[1][0] == '-' && argv[1][1] == 'v' &&
+      argv[1][2] == '\0') {
+    verbose = 1;
+  }
+
+  init();
+  do_punpck_bw();
+  do_punpck_wd();
+  do_punpck_dq();
+  do_punpck_qdq();
+  do_punpck_ps();
+  do_palignr();
+  compare_reg_reg_vs_reg_mem();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/unpack_msasm.reference_output b/SingleSource/UnitTests/Vector/AVX512BWVL/unpack_msasm.reference_output new file mode 100644 index 00000000..bfae62d0 --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512BWVL/unpack_msasm.reference_output @@ -0,0 +1,2 @@
+PASSED
+exit 0