Diffstat (limited to 'SingleSource/UnitTests/Vector/AVX512F/expand_compress.c')
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/expand_compress.c  515
1 file changed, 515 insertions(+), 0 deletions(-)
diff --git a/SingleSource/UnitTests/Vector/AVX512F/expand_compress.c b/SingleSource/UnitTests/Vector/AVX512F/expand_compress.c
new file mode 100644
index 00000000..27f6a128
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/expand_compress.c
@@ -0,0 +1,515 @@
+/*
+ * Tests for the expand and compress intrinsic families.
+ * This test checks the correctness of the following
+ * intrinsics:
+ * _mm512_mask_compress*()
+ * _mm512_mask_compressstoreu*()
+ * _mm512_mask_expand*()
+ * _mm512_mask_expandloadu*()
+ * _mm512_maskz_compress*()
+ * _mm512_maskz_expand*()
+ * _mm512_maskz_expandloadu*()
+ */
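+
+/*
+ * Each intrinsic result is checked against a portable scalar model of the
+ * operation.  V512, NOINLINE, the check_equal_* helpers and the n_errs
+ * counter are provided by m512_test_util.h.
+ */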
+
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+
+volatile __int64 vol0;
+
+V512 isrc1;
+V512 isrc2;
+
+V512 fsrc1;
+V512 fsrc2;
+
+V512 dsrc1;
+V512 dsrc2;
+
+V512 res;
+V512 mres;
+
+__mmask8 k8;
+__mmask16 k16;
+
+void NOINLINE init() {
+ volatile int i;
+
+ for (i = 0; i < 16; i++) {
+ isrc1.s32[i] = i;
+ isrc2.s32[i] = i + 1;
+
+ fsrc1.f32[i] = i * 1.0f;
+ fsrc2.f32[i] = i * 2.0f;
+ }
+
+ for (i = 0; i < 8; i++) {
+ dsrc1.f64[i] = i * 4.0;
+ dsrc2.f64[i] = i * 5.0;
+ }
+
+ k8 = 0x5a;
+ k16 = 0x25d6;
+}
+
+/*
+ * Use these between tests to make the compiler think src was updated.
+ * This prevents PRE'ing of a load of src.
+ */
+#define soft_isrc1_update() isrc1.xmmi[vol0] = isrc1.xmmi[vol0]
+#define soft_fsrc1_update() fsrc1.xmmi[vol0] = fsrc1.xmmi[vol0]
+#define soft_dsrc1_update() dsrc1.xmmi[vol0] = dsrc1.xmmi[vol0]
+
+/*
+ * Model expand intrinsic behavior.
+ */
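+
+/*
+ * Illustrative example (values not used by the test): with an element mask
+ * of 0b0110 and n_elems = 4, the mask form produces
+ *   output[0] = input1[0]   (bit 0 clear: keep the passthrough element)
+ *   output[1] = input2[0]   (bit 1 set: next sequential source element)
+ *   output[2] = input2[1]
+ *   output[3] = input1[3]
+ * while the maskz form writes 0 to the masked-off lanes instead.
+ */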
+void NOINLINE model_mask_expand_i32(void *input1, __int64 mask, void *input2,
+ void *output, int n_elems) {
+ int i, j = 0;
+ int *v1i = (int *)input1;
+ int *v2i = (int *)input2;
+ int *v3o = (int *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[i] = v2i[j];
+ j++;
+ } else {
+ v3o[i] = v1i[i];
+ }
+ }
+}
+
+void NOINLINE model_maskz_expand_i32(__int64 mask, void *input2, void *output,
+ int n_elems) {
+ int i, j = 0;
+ int *v2i = (int *)input2;
+ int *v3o = (int *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[i] = v2i[j];
+ j++;
+ } else {
+ v3o[i] = 0;
+ }
+ }
+}
+
+void NOINLINE model_mask_expand_i64(void *input1, __int64 mask, void *input2,
+ void *output, int n_elems) {
+ int i, j = 0;
+ __int64 *v1i = (__int64 *)input1;
+ __int64 *v2i = (__int64 *)input2;
+ __int64 *v3o = (__int64 *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[i] = v2i[j];
+ j++;
+ } else {
+ v3o[i] = v1i[i];
+ }
+ }
+}
+
+void NOINLINE model_maskz_expand_i64(__int64 mask, void *input2, void *output,
+ int n_elems) {
+ int i, j = 0;
+ __int64 *v2i = (__int64 *)input2;
+ __int64 *v3o = (__int64 *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[i] = v2i[j];
+ j++;
+ } else {
+ v3o[i] = 0;
+ }
+ }
+}
+
+void NOINLINE model_mask_expand_f32(void *input1, __int64 mask, void *input2,
+ void *output, int n_elems) {
+ int i, j = 0;
+ float *v1i = (float *)input1;
+ float *v2i = (float *)input2;
+ float *v3o = (float *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[i] = v2i[j];
+ j++;
+ } else {
+ v3o[i] = v1i[i];
+ }
+ }
+}
+
+void NOINLINE model_maskz_expand_f32(__int64 mask, void *input2, void *output,
+ int n_elems) {
+ int i, j = 0;
+ float *v2i = (float *)input2;
+ float *v3o = (float *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[i] = v2i[j];
+ j++;
+ } else {
+ v3o[i] = 0.f;
+ }
+ }
+}
+
+void NOINLINE model_mask_expand_f64(void *input1, __int64 mask, void *input2,
+ void *output, int n_elems) {
+ int i, j = 0;
+ double *v1i = (double *)input1;
+ double *v2i = (double *)input2;
+ double *v3o = (double *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[i] = v2i[j];
+ j++;
+ } else {
+ v3o[i] = v1i[i];
+ }
+ }
+}
+
+void NOINLINE model_maskz_expand_f64(__int64 mask, void *input2, void *output,
+ int n_elems) {
+ int i, j = 0;
+ double *v2i = (double *)input2;
+ double *v3o = (double *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[i] = v2i[j];
+ j++;
+ } else {
+ v3o[i] = 0.;
+ }
+ }
+}
+
+#define GEN_MASK_CHECK_CASE(intrin, prefix, suffix, mask, n_elem, modeller, \
+ checker) \
+ res.suffix = intrin(prefix##src2.suffix, mask, prefix##src1.suffix); \
+ modeller((void *)&prefix##src2.suffix, mask, (void *)&prefix##src1.suffix, \
+ (void *)&mres.suffix, n_elem); \
+ checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__)
+
+#define GEN_MASK_LOAD_CHECK_CASE(intrin, prefix, suffix, mask, n_elem, \
+ modeller, checker) \
+ res.suffix = intrin(prefix##src2.suffix, mask, &prefix##src1.suffix); \
+ modeller((void *)&prefix##src2.suffix, mask, (void *)&prefix##src1.suffix, \
+ (void *)&mres.suffix, n_elem); \
+ checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__)
+
+#define GEN_MASKZ_CHECK_CASE(intrin, prefix, suffix, mask, n_elem, modeller, \
+ checker) \
+ res.suffix = intrin(mask, prefix##src1.suffix); \
+ modeller(mask, (void *)&prefix##src1.suffix, (void *)&mres.suffix, n_elem); \
+ checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__)
+
+#define GEN_MASKZ_LOAD_CHECK_CASE(intrin, prefix, suffix, mask, n_elem, \
+ modeller, checker) \
+ res.suffix = intrin(mask, &prefix##src1.suffix); \
+ modeller(mask, (void *)&prefix##src1.suffix, (void *)&mres.suffix, n_elem); \
+ checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__)
+
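+/*
+ * For illustration, the first use below,
+ *   GEN_MASK_CHECK_CASE(_mm512_mask_expand_epi32, i, zmmi, k16, 16,
+ *                       model_mask_expand_i32, check_equal_nd);
+ * expands to (casts omitted):
+ *   res.zmmi = _mm512_mask_expand_epi32(isrc2.zmmi, k16, isrc1.zmmi);
+ *   model_mask_expand_i32(&isrc2.zmmi, k16, &isrc1.zmmi, &mres.zmmi, 16);
+ *   check_equal_nd(&res.zmmi, &mres.zmmi, 16, "_mm512_mask_expand_epi32",
+ *                  __LINE__);
+ */
+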
+void NOINLINE do_m512_expand() {
+ volatile V512 res;
+
+ soft_isrc1_update();
+ GEN_MASK_CHECK_CASE(_mm512_mask_expand_epi32, i, zmmi, k16, 16,
+ model_mask_expand_i32, check_equal_nd);
+
+ soft_isrc1_update();
+ GEN_MASK_LOAD_CHECK_CASE(_mm512_mask_expandloadu_epi32, i, zmmi, k16, 16,
+ model_mask_expand_i32, check_equal_nd);
+
+ soft_isrc1_update();
+ GEN_MASKZ_CHECK_CASE(_mm512_maskz_expand_epi32, i, zmmi, k16, 16,
+ model_maskz_expand_i32, check_equal_nd);
+
+ soft_isrc1_update();
+ GEN_MASKZ_LOAD_CHECK_CASE(_mm512_maskz_expandloadu_epi32, i, zmmi, k16, 16,
+ model_maskz_expand_i32, check_equal_nd);
+
+ soft_isrc1_update();
+ GEN_MASK_CHECK_CASE(_mm512_mask_expand_epi64, i, zmmi, k8, 8,
+ model_mask_expand_i64, check_equal_nq);
+
+ soft_isrc1_update();
+ GEN_MASK_LOAD_CHECK_CASE(_mm512_mask_expandloadu_epi64, i, zmmi, k8, 8,
+ model_mask_expand_i64, check_equal_nq);
+
+ soft_isrc1_update();
+ GEN_MASKZ_CHECK_CASE(_mm512_maskz_expand_epi64, i, zmmi, k8, 8,
+ model_maskz_expand_i64, check_equal_nq);
+
+ soft_isrc1_update();
+ GEN_MASKZ_LOAD_CHECK_CASE(_mm512_maskz_expandloadu_epi64, i, zmmi, k8, 8,
+ model_maskz_expand_i64, check_equal_nq);
+
+ soft_fsrc1_update();
+ GEN_MASK_CHECK_CASE(_mm512_mask_expand_ps, f, zmm, k16, 16,
+ model_mask_expand_f32, check_equal_nsf);
+
+ soft_fsrc1_update();
+ GEN_MASK_LOAD_CHECK_CASE(_mm512_mask_expandloadu_ps, f, zmm, k16, 16,
+ model_mask_expand_f32, check_equal_nsf);
+
+ soft_fsrc1_update();
+ GEN_MASKZ_CHECK_CASE(_mm512_maskz_expand_ps, f, zmm, k16, 16,
+ model_maskz_expand_f32, check_equal_nsf);
+
+ soft_fsrc1_update();
+ GEN_MASKZ_LOAD_CHECK_CASE(_mm512_maskz_expandloadu_ps, f, zmm, k16, 16,
+ model_maskz_expand_f32, check_equal_nsf);
+
+ soft_dsrc1_update();
+ GEN_MASK_CHECK_CASE(_mm512_mask_expand_pd, d, zmmd, k8, 8,
+ model_mask_expand_f64, check_equal_ndf);
+
+ soft_dsrc1_update();
+ GEN_MASK_LOAD_CHECK_CASE(_mm512_mask_expandloadu_pd, d, zmmd, k8, 8,
+ model_mask_expand_f64, check_equal_ndf);
+
+ soft_dsrc1_update();
+ GEN_MASKZ_CHECK_CASE(_mm512_maskz_expand_pd, d, zmmd, k8, 8,
+ model_maskz_expand_f64, check_equal_ndf);
+
+ soft_dsrc1_update();
+ GEN_MASKZ_LOAD_CHECK_CASE(_mm512_maskz_expandloadu_pd, d, zmmd, k8, 8,
+ model_maskz_expand_f64, check_equal_ndf);
+}
+
+/*
+ * Model compress intrinsic behavior.
+ */
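+
+/*
+ * Illustrative example (values not used by the test): with an element mask
+ * of 0b0110 and n_elems = 4, the mask form produces
+ *   output[0] = input2[1]   (selected elements packed to the front)
+ *   output[1] = input2[2]
+ *   output[2] = input1[2]   (tail filled from input1 at the same positions)
+ *   output[3] = input1[3]
+ * while the maskz form zeroes the tail instead.
+ */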
+void NOINLINE model_mask_compress_i32(void *input1, __int64 mask, void *input2,
+ void *output, int n_elems) {
+ int i, j = 0;
+ int *v1i = (int *)input1;
+ int *v2i = (int *)input2;
+ int *v3o = (int *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[j] = v2i[i];
+ j++;
+ }
+ }
+
+ for (i = j; i < n_elems; i++) {
+ v3o[i] = v1i[i];
+ }
+}
+
+void NOINLINE model_maskz_compress_i32(__int64 mask, void *input2, void *output,
+ int n_elems) {
+ int i, j = 0;
+ int *v2i = (int *)input2;
+ int *v3o = (int *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[j] = v2i[i];
+ j++;
+ }
+ }
+
+ for (i = j; i < n_elems; i++) {
+ v3o[i] = 0;
+ }
+}
+
+void NOINLINE model_mask_compress_i64(void *input1, __int64 mask, void *input2,
+ void *output, int n_elems) {
+ int i, j = 0;
+ __int64 *v1i = (__int64 *)input1;
+ __int64 *v2i = (__int64 *)input2;
+ __int64 *v3o = (__int64 *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[j] = v2i[i];
+ j++;
+ }
+ }
+
+ for (i = j; i < n_elems; i++) {
+ v3o[i] = v1i[i];
+ }
+}
+
+void NOINLINE model_maskz_compress_i64(__int64 mask, void *input2, void *output,
+ int n_elems) {
+ int i, j = 0;
+ __int64 *v2i = (__int64 *)input2;
+ __int64 *v3o = (__int64 *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[j] = v2i[i];
+ j++;
+ }
+ }
+
+ for (i = j; i < n_elems; i++) {
+ v3o[i] = 0;
+ }
+}
+
+void NOINLINE model_mask_compress_f32(void *input1, __int64 mask, void *input2,
+ void *output, int n_elems) {
+ int i, j = 0;
+ float *v1i = (float *)input1;
+ float *v2i = (float *)input2;
+ float *v3o = (float *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[j] = v2i[i];
+ j++;
+ }
+ }
+
+ for (i = j; i < n_elems; i++) {
+ v3o[i] = v1i[i];
+ }
+}
+
+void NOINLINE model_maskz_compress_f32(__int64 mask, void *input2, void *output,
+ int n_elems) {
+ int i, j = 0;
+ float *v2i = (float *)input2;
+ float *v3o = (float *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[j] = v2i[i];
+ j++;
+ }
+ }
+
+ for (i = j; i < n_elems; i++) {
+ v3o[i] = 0;
+ }
+}
+
+void NOINLINE model_mask_compress_f64(void *input1, __int64 mask, void *input2,
+ void *output, int n_elems) {
+ int i, j = 0;
+ double *v1i = (double *)input1;
+ double *v2i = (double *)input2;
+ double *v3o = (double *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[j] = v2i[i];
+ j++;
+ }
+ }
+
+ for (i = j; i < n_elems; i++) {
+ v3o[i] = v1i[i];
+ }
+}
+
+void NOINLINE model_maskz_compress_f64(__int64 mask, void *input2, void *output,
+ int n_elems) {
+ int i, j = 0;
+ double *v2i = (double *)input2;
+ double *v3o = (double *)output;
+
+ for (i = 0; i < n_elems; i++) {
+ if ((mask & (1LL << i)) != 0) {
+ v3o[j] = v2i[i];
+ j++;
+ }
+ }
+
+ for (i = j; i < n_elems; i++) {
+ v3o[i] = 0;
+ }
+}
+
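+/*
+ * Note: the compressstoreu intrinsics write only the popcount(mask) selected
+ * elements to memory.  Each store-form case below runs right after the
+ * corresponding register-form compress case, so the tail of res still holds
+ * the passthrough (src2) values, which is also what the model produces for
+ * those positions.
+ */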
+#define GEN_MASK_STORE_CHECK_CASE(intrin, prefix, suffix, mask, n_elem, \
+ modeller, checker) \
+ intrin((void *)&res.suffix, mask, prefix##src1.suffix); \
+ modeller((void *)&prefix##src2.suffix, mask, (void *)&prefix##src1.suffix, \
+ (void *)&mres.suffix, n_elem); \
+ checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__)
+
+void NOINLINE do_m512_compress() {
+ volatile V512 res;
+
+ soft_isrc1_update();
+ GEN_MASK_CHECK_CASE(_mm512_mask_compress_epi32, i, zmmi, k16, 16,
+ model_mask_compress_i32, check_equal_nd);
+
+ soft_isrc1_update();
+ GEN_MASK_STORE_CHECK_CASE(_mm512_mask_compressstoreu_epi32, i, zmmi, k16, 16,
+ model_mask_compress_i32, check_equal_nd);
+
+ soft_isrc1_update();
+ GEN_MASKZ_CHECK_CASE(_mm512_maskz_compress_epi32, i, zmmi, k16, 16,
+ model_maskz_compress_i32, check_equal_nd);
+
+ soft_isrc1_update();
+ GEN_MASK_CHECK_CASE(_mm512_mask_compress_epi64, i, zmmi, k8, 8,
+ model_mask_compress_i64, check_equal_nq);
+
+ soft_isrc1_update();
+ GEN_MASK_STORE_CHECK_CASE(_mm512_mask_compressstoreu_epi64, i, zmmi, k8, 8,
+ model_mask_compress_i64, check_equal_nq);
+
+ soft_isrc1_update();
+ GEN_MASKZ_CHECK_CASE(_mm512_maskz_compress_epi64, i, zmmi, k8, 8,
+ model_maskz_compress_i64, check_equal_nq);
+
+ soft_fsrc1_update();
+ GEN_MASK_CHECK_CASE(_mm512_mask_compress_ps, f, zmm, k16, 16,
+ model_mask_compress_f32, check_equal_nsf);
+
+ soft_fsrc1_update();
+ GEN_MASK_STORE_CHECK_CASE(_mm512_mask_compressstoreu_ps, f, zmm, k16, 16,
+ model_mask_compress_f32, check_equal_nsf);
+
+ soft_fsrc1_update();
+ GEN_MASKZ_CHECK_CASE(_mm512_maskz_compress_ps, f, zmm, k16, 16,
+ model_maskz_compress_f32, check_equal_nsf);
+
+ soft_dsrc1_update();
+ GEN_MASK_CHECK_CASE(_mm512_mask_compress_pd, d, zmmd, k8, 8,
+ model_mask_compress_f64, check_equal_ndf);
+
+ soft_dsrc1_update();
+ GEN_MASK_STORE_CHECK_CASE(_mm512_mask_compressstoreu_pd, d, zmmd, k8, 8,
+ model_mask_compress_f64, check_equal_ndf);
+
+ soft_dsrc1_update();
+ GEN_MASKZ_CHECK_CASE(_mm512_maskz_compress_pd, d, zmmd, k8, 8,
+ model_maskz_compress_f64, check_equal_ndf);
+}
+
+int main() {
+ init();
+
+ do_m512_expand();
+ do_m512_compress();
+
+ if (n_errs != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}