diff options
author | Christophe Lyon <christophe.lyon@linaro.org> | 2016-06-06 23:44:46 +0200 |
---|---|---|
committer | Yvan Roux <yvan.roux@linaro.org> | 2016-06-14 14:47:54 +0200 |
commit | 060934a01ffa66b5d87ab7098b3dc3d8d4e27e10 (patch) | |
tree | aa124935b851b6c430e3a8f7875c87ddde0b5653 | |
parent | 309490ea34b4e40522a6c1645ee191c1ad5f68c9 (diff) |
gcc/
Backport from trunk r236331.
2016-05-17 Jiong Wang <jiong.wang@arm.com>
* config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename
to *aarch64_fma4_elt_from_dup<mode>.
(*aarch64_fnma4_elt_to_128df): Rename to
*aarch64_fnma4_elt_from_dup<mode>.
* config/aarch64/arm_neon.h (vfma_n_f64): New.
(vfms_n_f32): Likewise.
(vfms_n_f64): Likewise.
(vfmsq_n_f32): Likewise.
(vfmsq_n_f64): Likewise.
gcc/testsuite/
Backport from trunk r236331.
2016-05-17 Jiong Wang <jiong.wang@arm.com>
* gcc.target/aarch64/fmla_intrinsic_1.c: Allow ".d[index]" besides
".2d[index]" when scanning the assembly.
* gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.
* gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry for
float64x1.
* gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New.
gcc/
Backport from trunk r236332.
2016-05-17 Jiong Wang <jiong.wang@arm.com>
* config/aarch64/aarch64-simd.md (*aarch64_mul3_elt_to_128df): Extend to
all supported modes. Rename to "*aarch64_mul3_elt_from_dup".
gcc/testsuite/
Backport from trunk r236332.
2016-05-17 Jiong Wang <jiong.wang@arm.com>
* gcc.target/aarch64/simd/vmul_elem_1.c: New.
gcc/
Backport from trunk r236333.
2016-05-17 Jiong Wang <jiong.wang@arm.com>
* config/aarch64/arm_neon.h (vmul_n_f32): Remove inline assembly.
Use builtin.
(vmul_n_s16): Likewise.
(vmul_n_s32): Likewise.
(vmul_n_u16): Likewise.
(vmul_n_u32): Likewise.
(vmulq_n_f32): Likewise.
(vmulq_n_f64): Likewise.
(vmulq_n_s16): Likewise.
(vmulq_n_s32): Likewise.
(vmulq_n_u16): Likewise.
(vmulq_n_u32): Likewise.
gcc/testsuite/
Backport from trunk r236333.
2016-05-17 Jiong Wang <jiong.wang@arm.com>
* gcc.target/aarch64/simd/vmul_elem_1.c: Use intrinsics.
gcc/
Backport from trunk r236334.
2016-05-17 Jiong Wang <jiong.wang@arm.com>
* config/aarch64/arm_neon.h (vmvn_s8): Reimplement using C operator.
Remove inline assembly.
(vmvn_s16): Likewise.
(vmvn_s32): Likewise.
(vmvn_u8): Likewise.
(vmvn_u16): Likewise.
(vmvn_u32): Likewise.
(vmvnq_s8): Likewise.
(vmvnq_s16): Likewise.
(vmvnq_s32): Likewise.
(vmvnq_u8): Likewise.
(vmvnq_u16): Likewise.
(vmvnq_u32): Likewise.
(vmvn_p8): Likewise.
(vmvnq_p16): Likewise.
gcc/testsuite/
Backport from trunk r236370.
2016-05-18 Jiong Wang <jiong.wang@arm.com>
* gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: Guard float64_t
with __aarch64__.
* gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: Guard variable
declarations under __aarch64__ and __ARM_FEATURE_FMA.
gcc/testsuite/
Backport from trunk r236762.
2016-05-26 Jiong Wang <jiong.wang@arm.com>
* gcc.target/aarch64/simd/vmul_elem_1.c: Force result variables to be
kept in memory.
Change-Id: Ie51f0f3de00727d8a6e0ba3a129df0968853449f
-rw-r--r-- | gcc/config/aarch64/aarch64-simd.md | 56 | ||||
-rw-r--r-- | gcc/config/aarch64/arm_neon.h | 459 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h | 4 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c | 490 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c | 541 |
7 files changed, 1248 insertions, 306 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index bd73bce6441..0dd4bf81078 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -371,15 +371,15 @@ [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_mul3_elt_to_128df" - [(set (match_operand:V2DF 0 "register_operand" "=w") - (mult:V2DF - (vec_duplicate:V2DF - (match_operand:DF 2 "register_operand" "w")) - (match_operand:V2DF 1 "register_operand" "w")))] +(define_insn "*aarch64_mul3_elt_from_dup<mode>" + [(set (match_operand:VMUL 0 "register_operand" "=w") + (mult:VMUL + (vec_duplicate:VMUL + (match_operand:<VEL> 1 "register_operand" "<h_con>")) + (match_operand:VMUL 2 "register_operand" "w")))] "TARGET_SIMD" - "fmul\\t%0.2d, %1.2d, %2.d[0]" - [(set_attr "type" "neon_fp_mul_d_scalar_q")] + "<f>mul\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"; + [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")] ) (define_insn "aarch64_rsqrte_<mode>2" @@ -1579,16 +1579,16 @@ [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_fma4_elt_to_128df" - [(set (match_operand:V2DF 0 "register_operand" "=w") - (fma:V2DF - (vec_duplicate:V2DF - (match_operand:DF 1 "register_operand" "w")) - (match_operand:V2DF 2 "register_operand" "w") - (match_operand:V2DF 3 "register_operand" "0")))] +(define_insn "*aarch64_fma4_elt_from_dup<mode>" + [(set (match_operand:VMUL 0 "register_operand" "=w") + (fma:VMUL + (vec_duplicate:VMUL + (match_operand:<VEL> 1 "register_operand" "w")) + (match_operand:VMUL 2 "register_operand" "w") + (match_operand:VMUL 3 "register_operand" "0")))] "TARGET_SIMD" - "fmla\\t%0.2d, %2.2d, %1.2d[0]" - [(set_attr "type" "neon_fp_mla_d_scalar_q")] + "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] ) (define_insn "*aarch64_fma4_elt_to_64v2df" @@ -1656,17 +1656,17 @@ [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] ) -(define_insn 
"*aarch64_fnma4_elt_to_128df" - [(set (match_operand:V2DF 0 "register_operand" "=w") - (fma:V2DF - (neg:V2DF - (match_operand:V2DF 2 "register_operand" "w")) - (vec_duplicate:V2DF - (match_operand:DF 1 "register_operand" "w")) - (match_operand:V2DF 3 "register_operand" "0")))] - "TARGET_SIMD" - "fmls\\t%0.2d, %2.2d, %1.2d[0]" - [(set_attr "type" "neon_fp_mla_d_scalar_q")] +(define_insn "*aarch64_fnma4_elt_from_dup<mode>" + [(set (match_operand:VMUL 0 "register_operand" "=w") + (fma:VMUL + (neg:VMUL + (match_operand:VMUL 2 "register_operand" "w")) + (vec_duplicate:VMUL + (match_operand:<VEL> 1 "register_operand" "w")) + (match_operand:VMUL 3 "register_operand" "0")))] + "TARGET_SIMD" + "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] ) (define_insn "*aarch64_fnma4_elt_to_64v2df" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 2612a325718..e563e3d2f77 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -7938,61 +7938,6 @@ vmovn_u64 (uint64x2_t a) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmul_n_f32 (float32x2_t a, float32_t b) -{ - float32x2_t result; - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmul_n_s16 (int16x4_t a, int16_t b) -{ - int16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmul_n_s32 (int32x2_t a, int32_t b) -{ - int32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmul_n_u16 (uint16x4_t a, uint16_t b) -{ - 
uint16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmul_n_u32 (uint32x2_t a, uint32_t b) -{ - uint32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - #define vmull_high_lane_s16(a, b, c) \ __extension__ \ ({ \ @@ -8443,227 +8388,6 @@ vmull_u32 (uint32x2_t a, uint32x2_t b) return result; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmulq_n_f32 (float32x4_t a, float32_t b) -{ - float32x4_t result; - __asm__ ("fmul %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmulq_n_f64 (float64x2_t a, float64_t b) -{ - float64x2_t result; - __asm__ ("fmul %0.2d,%1.2d,%2.d[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmulq_n_s16 (int16x8_t a, int16_t b) -{ - int16x8_t result; - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmulq_n_s32 (int32x4_t a, int32_t b) -{ - int32x4_t result; - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmulq_n_u16 (uint16x8_t a, uint16_t b) -{ - uint16x8_t result; - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmulq_n_u32 (uint32x4_t a, uint32_t b) -{ - uint32x4_t result; - __asm__ 
("mul %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vmvn_p8 (poly8x8_t a) -{ - poly8x8_t result; - __asm__ ("mvn %0.8b,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vmvn_s8 (int8x8_t a) -{ - int8x8_t result; - __asm__ ("mvn %0.8b,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmvn_s16 (int16x4_t a) -{ - int16x4_t result; - __asm__ ("mvn %0.8b,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmvn_s32 (int32x2_t a) -{ - int32x2_t result; - __asm__ ("mvn %0.8b,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vmvn_u8 (uint8x8_t a) -{ - uint8x8_t result; - __asm__ ("mvn %0.8b,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmvn_u16 (uint16x4_t a) -{ - uint16x4_t result; - __asm__ ("mvn %0.8b,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmvn_u32 (uint32x2_t a) -{ - uint32x2_t result; - __asm__ ("mvn %0.8b,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vmvnq_p8 (poly8x16_t a) -{ - poly8x16_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ 
((__always_inline__)) -vmvnq_s8 (int8x16_t a) -{ - int8x16_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmvnq_s16 (int16x8_t a) -{ - int16x8_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmvnq_s32 (int32x4_t a) -{ - int32x4_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vmvnq_u8 (uint8x16_t a) -{ - uint8x16_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmvnq_u16 (uint16x8_t a) -{ - uint16x8_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmvnq_u32 (uint32x4_t a) -{ - uint32x4_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - - __extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) vpadal_s8 (int16x4_t a, int8x8_t b) { @@ -14456,6 +14180,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a); } +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) +{ + return (float64x1_t) {__b[0] * __c + __a[0]}; +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) { @@ -14597,6 +14327,29 @@ vfmsq_f64 (float64x2_t __a, 
float64x2_t __b, float64x2_t __c) return __builtin_aarch64_fmav2df (-__b, __c, __a); } +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) +{ + return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a); +} + +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) +{ + return (float64x1_t) {-__b[0] * __c + __a[0]}; +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) +{ + return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c) +{ + return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a); +} /* vfms_lane */ @@ -18895,6 +18648,160 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane) return __a * __aarch64_vget_lane_any (__b, __lane); } +/* vmul_n. 
*/ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmul_n_f32 (float32x2_t __a, float32_t __b) +{ + return __a * __b; +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmulq_n_f32 (float32x4_t __a, float32_t __b) +{ + return __a * __b; +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmulq_n_f64 (float64x2_t __a, float64_t __b) +{ + return __a * __b; +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmul_n_s16 (int16x4_t __a, int16_t __b) +{ + return __a * __b; +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmulq_n_s16 (int16x8_t __a, int16_t __b) +{ + return __a * __b; +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmul_n_s32 (int32x2_t __a, int32_t __b) +{ + return __a * __b; +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmulq_n_s32 (int32x4_t __a, int32_t __b) +{ + return __a * __b; +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmul_n_u16 (uint16x4_t __a, uint16_t __b) +{ + return __a * __b; +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmulq_n_u16 (uint16x8_t __a, uint16_t __b) +{ + return __a * __b; +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmul_n_u32 (uint32x2_t __a, uint32_t __b) +{ + return __a * __b; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmulq_n_u32 (uint32x4_t __a, uint32_t __b) +{ + return __a * __b; +} + +/* vmvn */ + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vmvn_p8 (poly8x8_t __a) +{ + return (poly8x8_t) ~((int8x8_t) __a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vmvn_s8 (int8x8_t __a) +{ + return ~__a; +} + +__extension__ static __inline int16x4_t __attribute__ 
((__always_inline__)) +vmvn_s16 (int16x4_t __a) +{ + return ~__a; +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmvn_s32 (int32x2_t __a) +{ + return ~__a; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vmvn_u8 (uint8x8_t __a) +{ + return ~__a; +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmvn_u16 (uint16x4_t __a) +{ + return ~__a; +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmvn_u32 (uint32x2_t __a) +{ + return ~__a; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vmvnq_p8 (poly8x16_t __a) +{ + return (poly8x16_t) ~((int8x16_t) __a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vmvnq_s8 (int8x16_t __a) +{ + return ~__a; +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmvnq_s16 (int16x8_t __a) +{ + return ~__a; +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmvnq_s32 (int32x4_t __a) +{ + return ~__a; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vmvnq_u8 (uint8x16_t __a) +{ + return ~__a; +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmvnq_u16 (uint16x8_t __a) +{ + return ~__a; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmvnq_u32 (uint32x4_t __a) +{ + return ~__a; +} + /* vneg */ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h index 49fbd843e50..dde0e45a2b8 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h @@ -137,6 +137,9 @@ static ARRAY(result, poly, 16, 4); static ARRAY(result, 
float, 16, 4); #endif static ARRAY(result, float, 32, 2); +#ifdef __aarch64__ +static ARRAY(result, float, 64, 1); +#endif static ARRAY(result, int, 8, 16); static ARRAY(result, int, 16, 8); static ARRAY(result, int, 32, 4); @@ -169,6 +172,7 @@ extern ARRAY(expected, poly, 8, 8); extern ARRAY(expected, poly, 16, 4); extern ARRAY(expected, hfloat, 16, 4); extern ARRAY(expected, hfloat, 32, 2); +extern ARRAY(expected, hfloat, 64, 1); extern ARRAY(expected, int, 8, 16); extern ARRAY(expected, int, 16, 8); extern ARRAY(expected, int, 32, 4); diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c new file mode 100644 index 00000000000..efa9b5f2ece --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c @@ -0,0 +1,490 @@ +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) + +#define A0 123.4f +#define A1 -3.8f +#define A2 -29.4f +#define A3 (__builtin_inff ()) +#define A4 0.0f +#define A5 24.0f +#define A6 124.0f +#define A7 1024.0f + +#define B0 -5.8f +#define B1 -0.0f +#define B2 -10.8f +#define B3 10.0f +#define B4 23.4f +#define B5 -1234.8f +#define B6 8.9f +#define B7 4.0f + +#define E0 9.8f +#define E1 -1024.0f +#define E2 (-__builtin_inff ()) +#define E3 479.0f +float32_t elem0 = E0; +float32_t elem1 = E1; +float32_t elem2 = E2; +float32_t elem3 = E3; + +#define DA0 1231234.4 +#define DA1 -3.8 +#define DA2 -2980.4 +#define DA3 -5.8 +#define DA4 0.01123 +#define DA5 24.0 +#define DA6 124.12345 +#define DA7 1024.0 + +#define DB0 -5.8 +#define DB1 (__builtin_inf ()) +#define DB2 -105.8 +#define DB3 10.0 +#define DB4 (-__builtin_inf ()) +#define DB5 -1234.8 +#define DB6 848.9 +#define DB7 44444.0 + +#define DE0 9.8 +#define DE1 -1024.0 +#define DE2 105.8 +#define DE3 479.0 +float64_t delem0 = DE0; +float64_t delem1 = DE1; +float64_t delem2 = DE2; +float64_t 
delem3 = DE3; + +/* Expected results for vfms_n. */ + +VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * E0}; +VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 * E1}; +VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 * E2}; +VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 * E3}; +VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * E0}; +VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * E1}; +VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * E2}; +VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * E3}; + +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2); + + +VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 * E0, + A2 + -B2 * E0, A3 + -B3 * E0}; +VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 * E1, + A6 + -B6 * E1, A7 + -B7 * E1}; +VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 * E2, + A4 + -B4 * E2, A6 + -B6 * E2}; 
+VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 * E3, + A5 + -B5 * E3, A7 + -B7 * E3}; +VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * E0, + A2 + B2 * E0, A3 + B3 * E0}; +VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * E1, + A6 + B6 * E1, A7 + B7 * E1}; +VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * E2, + A4 + B4 * E2, A6 + B6 * E2}; +VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * E3, + A5 + B5 * E3, A7 + B7 * E3}; + +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4); + +VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0, + DA1 + -DB1 * DE0}; +VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1, + DA3 + -DB3 * DE1}; +VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2, + DA5 + -DB5 * DE2}; +VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3, + DA7 + -DB7 * DE3}; +VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0, + DA1 + DB1 * DE0}; +VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1, + 
DA3 + DB3 * DE1}; +VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2, + DA5 + DB5 * DE2}; +VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3, + DA7 + DB7 * DE3}; +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2); + +VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0}; +VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1}; +VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2}; +VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3}; +VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0}; +VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1}; +VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2}; +VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3}; + +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) = + (hfloat64_t *) 
VECT_VAR (expectedfms2, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1); + +void exec_vfma_vfms_n (void) +{ +#undef TEST_MSG +#define TEST_MSG "VFMS_VFMA_N (FP32)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 32, 2); + DECL_VARIABLE(vsrc_2, float, 32, 2); + VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1}; + VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1}; + VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2); + VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2); + DECL_VARIABLE (vector_res, float, 32, 2) = + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem0); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms0_static, ""); + VECT_VAR (vector_res, float, 32, 2) = + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem0); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma0_static, ""); + + VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3}; + VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3}; + VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2); + VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2); + VECT_VAR (vector_res, float, 32, 2) = + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem1); + vst1_f32 (VECT_VAR (result, float, 32, 2), + 
VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms1_static, ""); + VECT_VAR (vector_res, float, 32, 2) = + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem1); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma1_static, ""); + + VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5}; + VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5}; + VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2); + VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2); + VECT_VAR (vector_res, float, 32, 2) = + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem2); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms2_static, ""); + VECT_VAR (vector_res, float, 32, 2) = + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem2); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma2_static, ""); + + VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7}; + VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7}; + VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2); + VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2); + VECT_VAR (vector_res, float, 32, 2) = + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem3); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms3_static, ""); + VECT_VAR (vector_res, float, 32, 2) = + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem3); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma3_static, ""); + +#undef TEST_MSG +#define TEST_MSG 
"VFMSQ_VFMAQ_N (FP32)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 32, 4); + DECL_VARIABLE(vsrc_2, float, 32, 4); + VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3}; + VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4); + VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4); + DECL_VARIABLE (vector_res, float, 32, 4) = + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem0); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms0_static, ""); + VECT_VAR (vector_res, float, 32, 4) = + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem0); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma0_static, ""); + + VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7}; + VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7}; + VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4); + VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4); + VECT_VAR (vector_res, float, 32, 4) = + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem1); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms1_static, ""); + VECT_VAR (vector_res, float, 32, 4) = + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem1); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma1_static, ""); + + VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6}; + VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6}; + VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4); + VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4); + VECT_VAR (vector_res, float, 32, 
4) = + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem2); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms2_static, ""); + VECT_VAR (vector_res, float, 32, 4) = + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem2); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma2_static, ""); + + VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7}; + VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7}; + VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4); + VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4); + VECT_VAR (vector_res, float, 32, 4) = + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem3); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms3_static, ""); + VECT_VAR (vector_res, float, 32, 4) = + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem3); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 64, 2); + DECL_VARIABLE(vsrc_2, float, 64, 2); + VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1}; + VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2); + VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2); + DECL_VARIABLE (vector_res, float, 64, 2) = + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem0); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, 
expectedfms0_static, ""); + VECT_VAR (vector_res, float, 64, 2) = + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem0); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, ""); + + VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3}; + VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3}; + VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2); + VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2); + VECT_VAR (vector_res, float, 64, 2) = + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem1); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, ""); + VECT_VAR (vector_res, float, 64, 2) = + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem1); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, ""); + + VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5}; + VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5}; + VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2); + VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2); + VECT_VAR (vector_res, float, 64, 2) = + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem2); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, ""); + VECT_VAR (vector_res, float, 64, 2) = + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem2); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, ""); + + VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7}; + VECT_VAR_DECL (buf_src_8, float, 
64, 2) [] = {DB6, DB7}; + VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2); + VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2); + VECT_VAR (vector_res, float, 64, 2) = + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem3); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, ""); + VECT_VAR (vector_res, float, 64, 2) = + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem3); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMS_VFMA_N (FP64)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 64, 1); + DECL_VARIABLE(vsrc_2, float, 64, 1); + VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0}; + VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0}; + VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1); + VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1); + DECL_VARIABLE (vector_res, float, 64, 1) = + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem0); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, ""); + VECT_VAR (vector_res, float, 64, 1) = + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem0); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, ""); + + VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2}; + VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2}; + VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1); + VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1); + VECT_VAR (vector_res, float, 64, 1) = + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem1); + vst1_f64 (VECT_VAR 
(result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, ""); + VECT_VAR (vector_res, float, 64, 1) = + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem1); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, ""); + + VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4}; + VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4}; + VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1); + VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1); + VECT_VAR (vector_res, float, 64, 1) = + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem2); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, ""); + VECT_VAR (vector_res, float, 64, 1) = + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem2); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, ""); + + VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6}; + VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6}; + VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1); + VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1); + VECT_VAR (vector_res, float, 64, 1) = + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem3); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, ""); + VECT_VAR (vector_res, float, 64, 1) = + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem3); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, ""); +} +#endif + +int +main 
(void) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) + exec_vfma_vfms_n (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c index 1ba1fed98a0..5b348827002 100644 --- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c +++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c @@ -110,6 +110,6 @@ main (int argc, char **argv) /* vfmaq_lane_f64. vfma_laneq_f64. vfmaq_laneq_f64. */ -/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c index 887ebae10da..6c194a023d3 100644 --- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c +++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c @@ -111,6 +111,6 @@ main (int argc, char **argv) /* vfmsq_lane_f64. vfms_laneq_f64. vfmsq_laneq_f64. */ -/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c new file mode 100644 index 00000000000..a1faefd88ba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c @@ -0,0 +1,541 @@ +/* Test the vmul_n_f64 AArch64 SIMD intrinsic. 
*/ + +/* { dg-do run } */ +/* { dg-options "-O2 --save-temps" } */ + +#include "arm_neon.h" + +extern void abort (void); + +#define A (132.4f) +#define B (-0.0f) +#define C (-34.8f) +#define D (289.34f) +float32_t expected2_1[2] = {A * A, B * A}; +float32_t expected2_2[2] = {A * B, B * B}; +float32_t expected4_1[4] = {A * A, B * A, C * A, D * A}; +float32_t expected4_2[4] = {A * B, B * B, C * B, D * B}; +float32_t expected4_3[4] = {A * C, B * C, C * C, D * C}; +float32_t expected4_4[4] = {A * D, B * D, C * D, D * D}; +float32_t _elemA = A; +float32_t _elemB = B; +float32_t _elemC = C; +float32_t _elemD = D; + +#define AD (1234.5) +#define BD (-0.0) +#define CD (71.3) +#define DD (-1024.4) +float64_t expectedd2_1[2] = {AD * CD, BD * CD}; +float64_t expectedd2_2[2] = {AD * DD, BD * DD}; +float64_t _elemdC = CD; +float64_t _elemdD = DD; + + +#define AS (1024) +#define BS (-31) +#define CS (0) +#define DS (655) +int32_t expecteds2_1[2] = {AS * AS, BS * AS}; +int32_t expecteds2_2[2] = {AS * BS, BS * BS}; +int32_t expecteds4_1[4] = {AS * AS, BS * AS, CS * AS, DS * AS}; +int32_t expecteds4_2[4] = {AS * BS, BS * BS, CS * BS, DS * BS}; +int32_t expecteds4_3[4] = {AS * CS, BS * CS, CS * CS, DS * CS}; +int32_t expecteds4_4[4] = {AS * DS, BS * DS, CS * DS, DS * DS}; +int32_t _elemsA = AS; +int32_t _elemsB = BS; +int32_t _elemsC = CS; +int32_t _elemsD = DS; + +#define AH ((int16_t) 0) +#define BH ((int16_t) -32) +#define CH ((int16_t) 102) +#define DH ((int16_t) -51) +#define EH ((int16_t) 71) +#define FH ((int16_t) -91) +#define GH ((int16_t) 48) +#define HH ((int16_t) 255) +int16_t expectedh4_1[4] = {AH * AH, BH * AH, CH * AH, DH * AH}; +int16_t expectedh4_2[4] = {AH * BH, BH * BH, CH * BH, DH * BH}; +int16_t expectedh4_3[4] = {AH * CH, BH * CH, CH * CH, DH * CH}; +int16_t expectedh4_4[4] = {AH * DH, BH * DH, CH * DH, DH * DH}; +int16_t expectedh8_1[8] = {AH * AH, BH * AH, CH * AH, DH * AH, + EH * AH, FH * AH, GH * AH, HH * AH}; +int16_t expectedh8_2[8] = {AH * BH, BH * BH, 
CH * BH, DH * BH, + EH * BH, FH * BH, GH * BH, HH * BH}; +int16_t expectedh8_3[8] = {AH * CH, BH * CH, CH * CH, DH * CH, + EH * CH, FH * CH, GH * CH, HH * CH}; +int16_t expectedh8_4[8] = {AH * DH, BH * DH, CH * DH, DH * DH, + EH * DH, FH * DH, GH * DH, HH * DH}; +int16_t expectedh8_5[8] = {AH * EH, BH * EH, CH * EH, DH * EH, + EH * EH, FH * EH, GH * EH, HH * EH}; +int16_t expectedh8_6[8] = {AH * FH, BH * FH, CH * FH, DH * FH, + EH * FH, FH * FH, GH * FH, HH * FH}; +int16_t expectedh8_7[8] = {AH * GH, BH * GH, CH * GH, DH * GH, + EH * GH, FH * GH, GH * GH, HH * GH}; +int16_t expectedh8_8[8] = {AH * HH, BH * HH, CH * HH, DH * HH, + EH * HH, FH * HH, GH * HH, HH * HH}; +int16_t _elemhA = AH; +int16_t _elemhB = BH; +int16_t _elemhC = CH; +int16_t _elemhD = DH; +int16_t _elemhE = EH; +int16_t _elemhF = FH; +int16_t _elemhG = GH; +int16_t _elemhH = HH; + +#define AUS (1024) +#define BUS (31) +#define CUS (0) +#define DUS (655) +uint32_t expectedus2_1[2] = {AUS * AUS, BUS * AUS}; +uint32_t expectedus2_2[2] = {AUS * BUS, BUS * BUS}; +uint32_t expectedus4_1[4] = {AUS * AUS, BUS * AUS, CUS * AUS, DUS * AUS}; +uint32_t expectedus4_2[4] = {AUS * BUS, BUS * BUS, CUS * BUS, DUS * BUS}; +uint32_t expectedus4_3[4] = {AUS * CUS, BUS * CUS, CUS * CUS, DUS * CUS}; +uint32_t expectedus4_4[4] = {AUS * DUS, BUS * DUS, CUS * DUS, DUS * DUS}; +uint32_t _elemusA = AUS; +uint32_t _elemusB = BUS; +uint32_t _elemusC = CUS; +uint32_t _elemusD = DUS; + +#define AUH ((uint16_t) 0) +#define BUH ((uint16_t) 32) +#define CUH ((uint16_t) 102) +#define DUH ((uint16_t) 51) +#define EUH ((uint16_t) 71) +#define FUH ((uint16_t) 91) +#define GUH ((uint16_t) 48) +#define HUH ((uint16_t) 255) +uint16_t expecteduh4_1[4] = {AUH * AUH, BUH * AUH, CUH * AUH, DUH * AUH}; +uint16_t expecteduh4_2[4] = {AUH * BUH, BUH * BUH, CUH * BUH, DUH * BUH}; +uint16_t expecteduh4_3[4] = {AUH * CUH, BUH * CUH, CUH * CUH, DUH * CUH}; +uint16_t expecteduh4_4[4] = {AUH * DUH, BUH * DUH, CUH * DUH, DUH * DUH}; +uint16_t 
expecteduh8_1[8] = {AUH * AUH, BUH * AUH, CUH * AUH, DUH * AUH, + EUH * AUH, FUH * AUH, GUH * AUH, HUH * AUH}; +uint16_t expecteduh8_2[8] = {AUH * BUH, BUH * BUH, CUH * BUH, DUH * BUH, + EUH * BUH, FUH * BUH, GUH * BUH, HUH * BUH}; +uint16_t expecteduh8_3[8] = {AUH * CUH, BUH * CUH, CUH * CUH, DUH * CUH, + EUH * CUH, FUH * CUH, GUH * CUH, HUH * CUH}; +uint16_t expecteduh8_4[8] = {AUH * DUH, BUH * DUH, CUH * DUH, DUH * DUH, + EUH * DUH, FUH * DUH, GUH * DUH, HUH * DUH}; +uint16_t expecteduh8_5[8] = {AUH * EUH, BUH * EUH, CUH * EUH, DUH * EUH, + EUH * EUH, FUH * EUH, GUH * EUH, HUH * EUH}; +uint16_t expecteduh8_6[8] = {AUH * FUH, BUH * FUH, CUH * FUH, DUH * FUH, + EUH * FUH, FUH * FUH, GUH * FUH, HUH * FUH}; +uint16_t expecteduh8_7[8] = {AUH * GUH, BUH * GUH, CUH * GUH, DUH * GUH, + EUH * GUH, FUH * GUH, GUH * GUH, HUH * GUH}; +uint16_t expecteduh8_8[8] = {AUH * HUH, BUH * HUH, CUH * HUH, DUH * HUH, + EUH * HUH, FUH * HUH, GUH * HUH, HUH * HUH}; +uint16_t _elemuhA = AUH; +uint16_t _elemuhB = BUH; +uint16_t _elemuhC = CUH; +uint16_t _elemuhD = DUH; +uint16_t _elemuhE = EUH; +uint16_t _elemuhF = FUH; +uint16_t _elemuhG = GUH; +uint16_t _elemuhH = HUH; + +void +check_v2sf (float32_t elemA, float32_t elemB) +{ + int32_t indx; + const float32_t vec32x2_buf[2] = {A, B}; + float32x2_t vec32x2_src = vld1_f32 (vec32x2_buf); + float32_t vec32x2_res[2]; + + vst1_f32 (vec32x2_res, vmul_n_f32 (vec32x2_src, elemA)); + + for (indx = 0; indx < 2; indx++) + if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_1[indx]) + abort (); + + vst1_f32 (vec32x2_res, vmul_n_f32 (vec32x2_src, elemB)); + + for (indx = 0; indx < 2; indx++) + if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_2[indx]) + abort (); + +/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[0\\\]" 2 } } */ +} + +void +check_v4sf (float32_t elemA, float32_t elemB, float32_t elemC, float32_t elemD) +{ + int32_t indx; + const float32_t vec32x4_buf[4] = {A, 
B, C, D}; + float32x4_t vec32x4_src = vld1q_f32 (vec32x4_buf); + float32_t vec32x4_res[4]; + + vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemA)); + + for (indx = 0; indx < 4; indx++) + if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_1[indx]) + abort (); + + vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemB)); + + for (indx = 0; indx < 4; indx++) + if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_2[indx]) + abort (); + + vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemC)); + + for (indx = 0; indx < 4; indx++) + if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_3[indx]) + abort (); + + vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemD)); + + for (indx = 0; indx < 4; indx++) + if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_4[indx]) + abort (); + +/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[0\\\]" 4 } } */ +} + +void +check_v2df (float64_t elemdC, float64_t elemdD) +{ + int32_t indx; + const float64_t vec64x2_buf[2] = {AD, BD}; + float64x2_t vec64x2_src = vld1q_f64 (vec64x2_buf); + float64_t vec64x2_res[2]; + + vst1q_f64 (vec64x2_res, vmulq_n_f64 (vec64x2_src, elemdC)); + + for (indx = 0; indx < 2; indx++) + if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_1[indx]) + abort (); + + vst1q_f64 (vec64x2_res, vmulq_n_f64 (vec64x2_src, elemdD)); + + for (indx = 0; indx < 2; indx++) + if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_2[indx]) + abort (); + +/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[0\\\]" 2 } } */ +} + +void +check_v2si (int32_t elemsA, int32_t elemsB) +{ + int32_t indx; + const int32_t vecs32x2_buf[2] = {AS, BS}; + int32x2_t vecs32x2_src = vld1_s32 (vecs32x2_buf); + int32_t vecs32x2_res[2]; + + vst1_s32 (vecs32x2_res, vmul_n_s32 (vecs32x2_src, elemsA)); + + for (indx = 0; indx < 2; indx++) + if (vecs32x2_res[indx] != 
expecteds2_1[indx]) + abort (); + + vst1_s32 (vecs32x2_res, vmul_n_s32 (vecs32x2_src, elemsB)); + + for (indx = 0; indx < 2; indx++) + if (vecs32x2_res[indx] != expecteds2_2[indx]) + abort (); +} + +void +check_v2si_unsigned (uint32_t elemusA, uint32_t elemusB) +{ + int indx; + const uint32_t vecus32x2_buf[2] = {AUS, BUS}; + uint32x2_t vecus32x2_src = vld1_u32 (vecus32x2_buf); + uint32_t vecus32x2_res[2]; + + vst1_u32 (vecus32x2_res, vmul_n_u32 (vecus32x2_src, elemusA)); + + for (indx = 0; indx < 2; indx++) + if (vecus32x2_res[indx] != expectedus2_1[indx]) + abort (); + + vst1_u32 (vecus32x2_res, vmul_n_u32 (vecus32x2_src, elemusB)); + + for (indx = 0; indx < 2; indx++) + if (vecus32x2_res[indx] != expectedus2_2[indx]) + abort (); + +/* { dg-final { scan-assembler-times "\tmul\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[0\\\]" 4 } } */ +} + +void +check_v4si (int32_t elemsA, int32_t elemsB, int32_t elemsC, int32_t elemsD) +{ + int32_t indx; + const int32_t vecs32x4_buf[4] = {AS, BS, CS, DS}; + int32x4_t vecs32x4_src = vld1q_s32 (vecs32x4_buf); + int32_t vecs32x4_res[4]; + + vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsA)); + + for (indx = 0; indx < 4; indx++) + if (vecs32x4_res[indx] != expecteds4_1[indx]) + abort (); + + vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsB)); + + for (indx = 0; indx < 4; indx++) + if (vecs32x4_res[indx] != expecteds4_2[indx]) + abort (); + + vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsC)); + + for (indx = 0; indx < 4; indx++) + if (vecs32x4_res[indx] != expecteds4_3[indx]) + abort (); + + vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsD)); + + for (indx = 0; indx < 4; indx++) + if (vecs32x4_res[indx] != expecteds4_4[indx]) + abort (); +} + +void +check_v4si_unsigned (uint32_t elemusA, uint32_t elemusB, uint32_t elemusC, + uint32_t elemusD) +{ + int indx; + const uint32_t vecus32x4_buf[4] = {AUS, BUS, CUS, DUS}; + uint32x4_t vecus32x4_src = vld1q_u32 (vecus32x4_buf); + uint32_t 
vecus32x4_res[4]; + + vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusA)); + + for (indx = 0; indx < 4; indx++) + if (vecus32x4_res[indx] != expectedus4_1[indx]) + abort (); + + vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusB)); + + for (indx = 0; indx < 4; indx++) + if (vecus32x4_res[indx] != expectedus4_2[indx]) + abort (); + + vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusC)); + + for (indx = 0; indx < 4; indx++) + if (vecus32x4_res[indx] != expectedus4_3[indx]) + abort (); + + vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusD)); + + for (indx = 0; indx < 4; indx++) + if (vecus32x4_res[indx] != expectedus4_4[indx]) + abort (); + +/* { dg-final { scan-assembler-times "\tmul\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[0\\\]" 8 } } */ +} + + +void +check_v4hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD) +{ + int32_t indx; + const int16_t vech16x4_buf[4] = {AH, BH, CH, DH}; + int16x4_t vech16x4_src = vld1_s16 (vech16x4_buf); + int16_t vech16x4_res[4]; + + vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhA)); + + for (indx = 0; indx < 4; indx++) + if (vech16x4_res[indx] != expectedh4_1[indx]) + abort (); + + vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhB)); + + for (indx = 0; indx < 4; indx++) + if (vech16x4_res[indx] != expectedh4_2[indx]) + abort (); + + vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhC)); + + for (indx = 0; indx < 4; indx++) + if (vech16x4_res[indx] != expectedh4_3[indx]) + abort (); + + vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhD)); + + for (indx = 0; indx < 4; indx++) + if (vech16x4_res[indx] != expectedh4_4[indx]) + abort (); +} + +void +check_v4hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC, + uint16_t elemuhD) +{ + int indx; + const uint16_t vecuh16x4_buf[4] = {AUH, BUH, CUH, DUH}; + uint16x4_t vecuh16x4_src = vld1_u16 (vecuh16x4_buf); + uint16_t vecuh16x4_res[4]; + + vst1_u16 (vecuh16x4_res, vmul_n_u16 
(vecuh16x4_src, elemuhA)); + + for (indx = 0; indx < 4; indx++) + if (vecuh16x4_res[indx] != expecteduh4_1[indx]) + abort (); + + vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhB)); + + for (indx = 0; indx < 4; indx++) + if (vecuh16x4_res[indx] != expecteduh4_2[indx]) + abort (); + + vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhC)); + + for (indx = 0; indx < 4; indx++) + if (vecuh16x4_res[indx] != expecteduh4_3[indx]) + abort (); + + vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhD)); + + for (indx = 0; indx < 4; indx++) + if (vecuh16x4_res[indx] != expecteduh4_4[indx]) + abort (); + +/* { dg-final { scan-assembler-times "mul\tv\[0-9\]+\.4h, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[0\\\]" 8 } } */ +} + +void +check_v8hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD, + int16_t elemhE, int16_t elemhF, int16_t elemhG, int16_t elemhH) +{ + int32_t indx; + const int16_t vech16x8_buf[8] = {AH, BH, CH, DH, EH, FH, GH, HH}; + int16x8_t vech16x8_src = vld1q_s16 (vech16x8_buf); + int16_t vech16x8_res[8]; + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhA)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_1[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhB)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_2[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhC)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_3[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhD)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_4[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhE)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_5[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhF)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != 
expectedh8_6[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhG)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_7[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhH)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_8[indx]) + abort (); +} + +void +check_v8hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC, + uint16_t elemuhD, uint16_t elemuhE, uint16_t elemuhF, + uint16_t elemuhG, uint16_t elemuhH) +{ + int indx; + const uint16_t vecuh16x8_buf[8] = {AUH, BUH, CUH, DUH, EUH, FUH, GUH, HUH}; + uint16x8_t vecuh16x8_src = vld1q_u16 (vecuh16x8_buf); + uint16_t vecuh16x8_res[8]; + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhA)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_1[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhB)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_2[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhC)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_3[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhD)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_4[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhE)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_5[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhF)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_6[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhG)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_7[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhH)); + + for (indx = 0; indx < 8; indx++) 
+ if (vecuh16x8_res[indx] != expecteduh8_8[indx]) + abort (); + +/* { dg-final { scan-assembler-times "mul\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.h\\\[0\\\]" 16 } } */ +} + +int +main (void) +{ + check_v2sf (_elemA, _elemB); + check_v4sf (_elemA, _elemB, _elemC, _elemD); + check_v2df (_elemdC, _elemdD); + check_v2si (_elemsA, _elemsB); + check_v4si (_elemsA, _elemsB, _elemsC, _elemsD); + check_v4hi (_elemhA, _elemhB, _elemhC, _elemhD); + check_v8hi (_elemhA, _elemhB, _elemhC, _elemhD, + _elemhE, _elemhF, _elemhG, _elemhH); + check_v2si_unsigned (_elemusA, _elemusB); + check_v4si_unsigned (_elemusA, _elemusB, _elemusC, _elemusD); + check_v4hi_unsigned (_elemuhA, _elemuhB, _elemuhC, _elemuhD); + check_v8hi_unsigned (_elemuhA, _elemuhB, _elemuhC, _elemuhD, + _elemuhE, _elemuhF, _elemuhG, _elemuhH); + + return 0; +} + |