gcc/

Backport from trunk r236331. 2016-05-17 Jiong Wang <jiong.wang@arm.com> * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename to *aarch64_fma4_elt_from_dup<mode>. (*aarch64_fnma4_elt_to_128df): Rename to *aarch64_fnma4_elt_from_dup<mode>. * config/aarch64/arm_neon.h (vfma_n_f64): New. (vfms_n_f32): Likewise. (vfms_n_f64): Likewise. (vfmsq_n_f32): Likewise. (vfmsq_n_f64): Likewise. gcc/testsuite/ Backport from trunk r236331. 2016-05-17 Jiong Wang <jiong.wang@arm.com> * gcc.target/aarch64/fmla_intrinsic_1.c: Allow ".d[index]" besides ".2d[index]" when scan the assembly. * gcc.target/aarch64/fmls_intrinsic_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry for float64x1. * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New. gcc/ Backport from trunk r236332. 2016-05-17 Jiong Wang <jiong.wang@arm.com> * config/aarch64/aarch64-simd.md (*aarch64_mul3_elt_to_128df): Extend to all supported modes. Rename to "*aarch64_mul3_elt_from_dup". gcc/testsuite/ Backport from trunk r236332. 2016-05-17 Jiong Wang <jiong.wang@arm.com> * gcc.target/aarch64/simd/vmul_elem_1.c: New. gcc/ Backport from trunk r236333. 2016-05-17 Jiong Wang <jiong.wang@arm.com> * config/aarch64/aarch64-simd.md (vmul_n_f32): Remove inline assembly. Use builtin. (vmul_n_s16): Likewise. (vmul_n_s32): Likewise. (vmul_n_u16): Likewise. (vmul_n_u32): Likewise. (vmulq_n_f32): Likewise. (vmulq_n_f64): Likewise. (vmulq_n_s16): Likewise. (vmulq_n_s32): Likewise. (vmulq_n_u16): Likewise. (vmulq_n_u32): Likewise. gcc/testsuite/ Backport from trunk r236333. 2016-05-17 Jiong Wang <jiong.wang@arm.com> * gcc.target/aarch64/simd/vmul_elem_1.c: Use intrinsics. gcc/ Backport from trunk r236334. 2016-05-17 Jiong Wang <jiong.wang@arm.com> * config/aarch64/arm_neon.h (vmvn_s8): Reimplement using C operator. Remove inline assembly. (vmvn_s16): Likewise. (vmvn_s32): Likewise. (vmvn_u8): Likewise. (vmvn_u16): Likewise. (vmvn_u32): Likewise. (vmvnq_s8): Likewise. (vmvnq_s16): Likewise. (vmvnq_s32): Likewise. (vmvnq_u8): Likewise. (vmvnq_u16): Likewise. (vmvnq_u32): Likewise. (vmvn_p8): Likewise. (vmvnq_p16): Likewise. gcc/testsuite/ Backport from trunk r236370. 2016-05-18 Jiong Wang <jiong.wang@arm.com> * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: Guard float64_t with __aarch64__. * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: Guard variable declarations under __aarch64__ and __ARM_FEATURE_FMA. gcc/testsuite/ Backport from trunk r236762. 2016-05-26 Jiong Wang <jiong.wang@arm.com> * gcc.target/aarch64/simd/vmul_elem_1.c: Force result variables to be kept in memory. Change-Id: Ie51f0f3de00727d8a6e0ba3a129df0968853449f
author: Christophe Lyon <christophe.lyon@linaro.org> 2016-06-06 23:44:46 +0200
committer: Yvan Roux <yvan.roux@linaro.org> 2016-06-14 14:47:54 +0200
commit: 060934a01ffa66b5d87ab7098b3dc3d8d4e27e10 (patch)
tree: aa124935b851b6c430e3a8f7875c87ddde0b5653
parent: 309490ea34b4e40522a6c1645ee191c1ad5f68c9 (diff)
7 files changed, 1248 insertions, 306 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce6441..0dd4bf81078 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -371,15 +371,15 @@
   [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
 )
 
-(define_insn "*aarch64_mul3_elt_to_128df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-     (mult:V2DF
-       (vec_duplicate:V2DF
-	 (match_operand:DF 2 "register_operand" "w"))
-      (match_operand:V2DF 1 "register_operand" "w")))]
+(define_insn "*aarch64_mul3_elt_from_dup<mode>"
+ [(set (match_operand:VMUL 0 "register_operand" "=w")
+    (mult:VMUL
+      (vec_duplicate:VMUL
+	    (match_operand:<VEL> 1 "register_operand" "<h_con>"))
+      (match_operand:VMUL 2 "register_operand" "w")))]
   "TARGET_SIMD"
-  "fmul\\t%0.2d, %1.2d, %2.d[0]"
-  [(set_attr "type" "neon_fp_mul_d_scalar_q")]
+  "<f>mul\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]";
+  [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
 )
 
 (define_insn "aarch64_rsqrte_<mode>2"
@@ -1579,16 +1579,16 @@
   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
 )
 
-(define_insn "*aarch64_fma4_elt_to_128df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-    (fma:V2DF
-      (vec_duplicate:V2DF
-	  (match_operand:DF 1 "register_operand" "w"))
-      (match_operand:V2DF 2 "register_operand" "w")
-      (match_operand:V2DF 3 "register_operand" "0")))]
+(define_insn "*aarch64_fma4_elt_from_dup<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+    (fma:VMUL
+      (vec_duplicate:VMUL
+	  (match_operand:<VEL> 1 "register_operand" "w"))
+      (match_operand:VMUL 2 "register_operand" "w")
+      (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
-  "fmla\\t%0.2d, %2.2d, %1.2d[0]"
-  [(set_attr "type" "neon_fp_mla_d_scalar_q")]
+  "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
+  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
 )
 
 (define_insn "*aarch64_fma4_elt_to_64v2df"
@@ -1656,17 +1656,17 @@
   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
 )
 
-(define_insn "*aarch64_fnma4_elt_to_128df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-    (fma:V2DF
-      (neg:V2DF
-        (match_operand:V2DF 2 "register_operand" "w"))
-      (vec_duplicate:V2DF
-	(match_operand:DF 1 "register_operand" "w"))
-      (match_operand:V2DF 3 "register_operand" "0")))]
-  "TARGET_SIMD"
-  "fmls\\t%0.2d, %2.2d, %1.2d[0]"
-  [(set_attr "type" "neon_fp_mla_d_scalar_q")]
+(define_insn "*aarch64_fnma4_elt_from_dup<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+    (fma:VMUL
+      (neg:VMUL
+        (match_operand:VMUL 2 "register_operand" "w"))
+      (vec_duplicate:VMUL
+	(match_operand:<VEL> 1 "register_operand" "w"))
+      (match_operand:VMUL 3 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
+  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
 )
 
 (define_insn "*aarch64_fnma4_elt_to_64v2df"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 2612a325718..e563e3d2f77 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7938,61 +7938,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 #define vmull_high_lane_s16(a, b, c)                                    \
   __extension__                                                         \
     ({                                                                  \
@@ -8443,227 +8388,6 @@ vmull_u32 (uint32x2_t a, uint32x2_t b)
   return result;
 }
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_n_f32 (float32x4_t a, float32_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulq_n_f64 (float64x2_t a, float64_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_n_s16 (int16x8_t a, int16_t b)
-{
-  int16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_n_s32 (int32x4_t a, int32_t b)
-{
-  int32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vmvn_p8 (poly8x8_t a)
-{
-  poly8x8_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmvn_s8 (int8x8_t a)
-{
-  int8x8_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmvn_s16 (int16x4_t a)
-{
-  int16x4_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmvn_s32 (int32x2_t a)
-{
-  int32x2_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmvn_u8 (uint8x8_t a)
-{
-  uint8x8_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmvn_u16 (uint16x4_t a)
-{
-  uint16x4_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmvn_u32 (uint32x2_t a)
-{
-  uint32x2_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vmvnq_p8 (poly8x16_t a)
-{
-  poly8x16_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vmvnq_s8 (int8x16_t a)
-{
-  int8x16_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmvnq_s16 (int16x8_t a)
-{
-  int16x8_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmvnq_s32 (int32x4_t a)
-{
-  int32x4_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vmvnq_u8 (uint8x16_t a)
-{
-  uint8x16_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmvnq_u16 (uint16x8_t a)
-{
-  uint16x8_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmvnq_u32 (uint32x4_t a)
-{
-  uint32x4_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vpadal_s8 (int16x4_t a, int8x8_t b)
 {
@@ -14456,6 +14180,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
   return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
 }
 
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
+{
+  return (float64x1_t) {__b[0] * __c + __a[0]};
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
 {
@@ -14597,6 +14327,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
   return __builtin_aarch64_fmav2df (-__b, __c, __a);
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+{
+  return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
+{
+  return (float64x1_t) {-__b[0] * __c + __a[0]};
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+{
+  return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
+{
+  return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a);
+}
 
 /* vfms_lane  */
 
@@ -18895,6 +18648,160 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
   return __a * __aarch64_vget_lane_any (__b, __lane);
 }
 
+/* vmul_n.  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_n_f32 (float32x4_t __a, float32_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulq_n_f64 (float64x2_t __a, float64_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return __a * __b;
+}
+
+/* vmvn  */
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vmvn_p8 (poly8x8_t __a)
+{
+  return (poly8x8_t) ~((int8x8_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmvn_s8 (int8x8_t __a)
+{
+  return ~__a;
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmvn_s16 (int16x4_t __a)
+{
+  return ~__a;
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmvn_s32 (int32x2_t __a)
+{
+  return ~__a;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmvn_u8 (uint8x8_t __a)
+{
+  return ~__a;
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmvn_u16 (uint16x4_t __a)
+{
+  return ~__a;
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmvn_u32 (uint32x2_t __a)
+{
+  return ~__a;
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vmvnq_p8 (poly8x16_t __a)
+{
+  return (poly8x16_t) ~((int8x16_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmvnq_s8 (int8x16_t __a)
+{
+  return ~__a;
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmvnq_s16 (int16x8_t __a)
+{
+  return ~__a;
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmvnq_s32 (int32x4_t __a)
+{
+  return ~__a;
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmvnq_u8 (uint8x16_t __a)
+{
+  return ~__a;
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmvnq_u16 (uint16x8_t __a)
+{
+  return ~__a;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmvnq_u32 (uint32x4_t __a)
+{
+  return ~__a;
+}
+
 /* vneg  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
index 49fbd843e50..dde0e45a2b8 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
@@ -137,6 +137,9 @@ static ARRAY(result, poly, 16, 4);
 static ARRAY(result, float, 16, 4);
 #endif
 static ARRAY(result, float, 32, 2);
+#ifdef __aarch64__
+static ARRAY(result, float, 64, 1);
+#endif
 static ARRAY(result, int, 8, 16);
 static ARRAY(result, int, 16, 8);
 static ARRAY(result, int, 32, 4);
@@ -169,6 +172,7 @@ extern ARRAY(expected, poly, 8, 8);
 extern ARRAY(expected, poly, 16, 4);
 extern ARRAY(expected, hfloat, 16, 4);
 extern ARRAY(expected, hfloat, 32, 2);
+extern ARRAY(expected, hfloat, 64, 1);
 extern ARRAY(expected, int, 8, 16);
 extern ARRAY(expected, int, 16, 8);
 extern ARRAY(expected, int, 32, 4);
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
new file mode 100644
index 00000000000..efa9b5f2ece
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
@@ -0,0 +1,490 @@
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
+
+#define A0 123.4f
+#define A1 -3.8f
+#define A2 -29.4f
+#define A3 (__builtin_inff ())
+#define A4 0.0f
+#define A5 24.0f
+#define A6 124.0f
+#define A7 1024.0f
+
+#define B0 -5.8f
+#define B1 -0.0f
+#define B2 -10.8f
+#define B3 10.0f
+#define B4 23.4f
+#define B5 -1234.8f
+#define B6 8.9f
+#define B7 4.0f
+
+#define E0 9.8f
+#define E1 -1024.0f
+#define E2 (-__builtin_inff ())
+#define E3 479.0f
+float32_t elem0 = E0;
+float32_t elem1 = E1;
+float32_t elem2 = E2;
+float32_t elem3 = E3;
+
+#define DA0 1231234.4
+#define DA1 -3.8
+#define DA2 -2980.4
+#define DA3 -5.8
+#define DA4 0.01123
+#define DA5 24.0
+#define DA6 124.12345
+#define DA7 1024.0
+
+#define DB0 -5.8
+#define DB1 (__builtin_inf ())
+#define DB2 -105.8
+#define DB3 10.0
+#define DB4 (-__builtin_inf ())
+#define DB5 -1234.8
+#define DB6 848.9
+#define DB7 44444.0
+
+#define DE0 9.8
+#define DE1 -1024.0
+#define DE2 105.8
+#define DE3 479.0
+float64_t delem0 = DE0;
+float64_t delem1 = DE1;
+float64_t delem2 = DE2;
+float64_t delem3 = DE3;
+
+/* Expected results for vfms_n.  */
+
+VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * E0};
+VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 * E1};
+VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 * E2};
+VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 * E3};
+VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * E0};
+VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * E1};
+VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * E2};
+VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * E3};
+
+hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2);
+
+
+VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 * E0,
+						A2 + -B2 * E0, A3 + -B3 * E0};
+VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 * E1,
+						A6 + -B6 * E1, A7 + -B7 * E1};
+VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 * E2,
+						A4 + -B4 * E2, A6 + -B6 * E2};
+VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 * E3,
+						A5 + -B5 * E3, A7 + -B7 * E3};
+VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * E0,
+						A2 + B2 * E0, A3 + B3 * E0};
+VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * E1,
+						A6 + B6 * E1, A7 + B7 * E1};
+VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * E2,
+						A4 + B4 * E2, A6 + B6 * E2};
+VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * E3,
+						A5 + B5 * E3, A7 + B7 * E3};
+
+hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4);
+
+VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0,
+						DA1 + -DB1 * DE0};
+VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1,
+						DA3 + -DB3 * DE1};
+VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2,
+						DA5 + -DB5 * DE2};
+VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3,
+						DA7 + -DB7 * DE3};
+VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0,
+						DA1 + DB1 * DE0};
+VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1,
+						DA3 + DB3 * DE1};
+VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2,
+						DA5 + DB5 * DE2};
+VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3,
+						DA7 + DB7 * DE3};
+hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2);
+
+VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0};
+VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1};
+VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2};
+VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3};
+VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0};
+VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1};
+VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2};
+VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3};
+
+hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1);
+
+void exec_vfma_vfms_n (void)
+{
+#undef TEST_MSG
+#define TEST_MSG "VFMS_VFMA_N (FP32)"
+  clean_results ();
+
+  DECL_VARIABLE(vsrc_1, float, 32, 2);
+  DECL_VARIABLE(vsrc_2, float, 32, 2);
+  VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1};
+  VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1};
+  VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2);
+  VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2);
+  DECL_VARIABLE (vector_res, float, 32, 2) =
+    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem0);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms0_static, "");
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem0);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma0_static, "");
+
+  VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3};
+  VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3};
+  VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2);
+  VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2);
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem1);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms1_static, "");
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem1);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma1_static, "");
+
+  VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5};
+  VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5};
+  VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2);
+  VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2);
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem2);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms2_static, "");
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem2);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma2_static, "");
+
+  VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7};
+  VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7};
+  VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2);
+  VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2);
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem3);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms3_static, "");
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem3);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma3_static, "");
+
+#undef TEST_MSG
+#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)"
+  clean_results ();
+
+  DECL_VARIABLE(vsrc_1, float, 32, 4);
+  DECL_VARIABLE(vsrc_2, float, 32, 4);
+  VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3};
+  VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3};
+  VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4);
+  VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4);
+  DECL_VARIABLE (vector_res, float, 32, 4) =
+    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem0);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms0_static, "");
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem0);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma0_static, "");
+
+  VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7};
+  VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7};
+  VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4);
+  VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4);
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem1);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms1_static, "");
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem1);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma1_static, "");
+
+  VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6};
+  VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6};
+  VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4);
+  VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4);
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem2);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms2_static, "");
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem2);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma2_static, "");
+
+  VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7};
+  VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7};
+  VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4);
+  VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4);
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem3);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms3_static, "");
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem3);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma3_static, "");
+
+#undef TEST_MSG
+#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)"
+  clean_results ();
+
+  DECL_VARIABLE(vsrc_1, float, 64, 2);
+  DECL_VARIABLE(vsrc_2, float, 64, 2);
+  VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1};
+  VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1};
+  VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2);
+  VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2);
+  DECL_VARIABLE (vector_res, float, 64, 2) =
+    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem0);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms0_static, "");
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem0);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, "");
+
+  VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3};
+  VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3};
+  VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2);
+  VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2);
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem1);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, "");
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem1);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, "");
+
+  VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5};
+  VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5};
+  VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2);
+  VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2);
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem2);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, "");
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem2);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, "");
+
+  VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7};
+  VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7};
+  VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2);
+  VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2);
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem3);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, "");
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem3);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, "");
+
+#undef TEST_MSG
+#define TEST_MSG "VFMS_VFMA_N (FP64)"
+  clean_results ();
+
+  DECL_VARIABLE(vsrc_1, float, 64, 1);
+  DECL_VARIABLE(vsrc_2, float, 64, 1);
+  VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0};
+  VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0};
+  VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1);
+  VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1);
+  DECL_VARIABLE (vector_res, float, 64, 1) =
+    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem0);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, "");
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem0);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, "");
+
+  VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2};
+  VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2};
+  VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1);
+  VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1);
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem1);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, "");
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem1);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, "");
+
+  VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4};
+  VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4};
+  VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1);
+  VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1);
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem2);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, "");
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem2);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, "");
+
+  VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6};
+  VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6};
+  VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1);
+  VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1);
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem3);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, "");
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem3);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, "");
+}
+#endif
+
+int
+main (void)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
+  exec_vfma_vfms_n ();
+#endif
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
index 1ba1fed98a0..5b348827002 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
@@ -110,6 +110,6 @@ main (int argc, char **argv)
 /* vfmaq_lane_f64.
    vfma_laneq_f64.
    vfmaq_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
 
 
diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
index 887ebae10da..6c194a023d3 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
@@ -111,6 +111,6 @@ main (int argc, char **argv)
 /* vfmsq_lane_f64.
    vfms_laneq_f64.
    vfmsq_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
+/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
 
 
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
new file mode 100644
index 00000000000..a1faefd88ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
@@ -0,0 +1,541 @@
+/* Test the vmul_n_f64 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-O2 --save-temps" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+#define A (132.4f)
+#define B (-0.0f)
+#define C (-34.8f)
+#define D (289.34f)
+float32_t expected2_1[2] = {A * A, B * A};
+float32_t expected2_2[2] = {A * B, B * B};
+float32_t expected4_1[4] = {A * A, B * A, C * A, D * A};
+float32_t expected4_2[4] = {A * B, B * B, C * B, D * B};
+float32_t expected4_3[4] = {A * C, B * C, C * C, D * C};
+float32_t expected4_4[4] = {A * D, B * D, C * D, D * D};
+float32_t _elemA = A;
+float32_t _elemB = B;
+float32_t _elemC = C;
+float32_t _elemD = D;
+
+#define AD (1234.5)
+#define BD (-0.0)
+#define CD (71.3)
+#define DD (-1024.4)
+float64_t expectedd2_1[2] = {AD * CD, BD * CD};
+float64_t expectedd2_2[2] = {AD * DD, BD * DD};
+float64_t _elemdC = CD;
+float64_t _elemdD = DD;
+
+
+#define AS (1024)
+#define BS (-31)
+#define CS (0)
+#define DS (655)
+int32_t expecteds2_1[2] = {AS * AS, BS * AS};
+int32_t expecteds2_2[2] = {AS * BS, BS * BS};
+int32_t expecteds4_1[4] = {AS * AS, BS * AS, CS * AS, DS * AS};
+int32_t expecteds4_2[4] = {AS * BS, BS * BS, CS * BS, DS * BS};
+int32_t expecteds4_3[4] = {AS * CS, BS * CS, CS * CS, DS * CS};
+int32_t expecteds4_4[4] = {AS * DS, BS * DS, CS * DS, DS * DS};
+int32_t _elemsA = AS;
+int32_t _elemsB = BS;
+int32_t _elemsC = CS;
+int32_t _elemsD = DS;
+
+#define AH ((int16_t) 0)
+#define BH ((int16_t) -32)
+#define CH ((int16_t) 102)
+#define DH ((int16_t) -51)
+#define EH ((int16_t) 71)
+#define FH ((int16_t) -91)
+#define GH ((int16_t) 48)
+#define HH ((int16_t) 255)
+int16_t expectedh4_1[4] = {AH * AH, BH * AH, CH * AH, DH * AH};
+int16_t expectedh4_2[4] = {AH * BH, BH * BH, CH * BH, DH * BH};
+int16_t expectedh4_3[4] = {AH * CH, BH * CH, CH * CH, DH * CH};
+int16_t expectedh4_4[4] = {AH * DH, BH * DH, CH * DH, DH * DH};
+int16_t expectedh8_1[8] = {AH * AH, BH * AH, CH * AH, DH * AH,
+			   EH * AH, FH * AH, GH * AH, HH * AH};
+int16_t expectedh8_2[8] = {AH * BH, BH * BH, CH * BH, DH * BH,
+			   EH * BH, FH * BH, GH * BH, HH * BH};
+int16_t expectedh8_3[8] = {AH * CH, BH * CH, CH * CH, DH * CH,
+			   EH * CH, FH * CH, GH * CH, HH * CH};
+int16_t expectedh8_4[8] = {AH * DH, BH * DH, CH * DH, DH * DH,
+			   EH * DH, FH * DH, GH * DH, HH * DH};
+int16_t expectedh8_5[8] = {AH * EH, BH * EH, CH * EH, DH * EH,
+			   EH * EH, FH * EH, GH * EH, HH * EH};
+int16_t expectedh8_6[8] = {AH * FH, BH * FH, CH * FH, DH * FH,
+			   EH * FH, FH * FH, GH * FH, HH * FH};
+int16_t expectedh8_7[8] = {AH * GH, BH * GH, CH * GH, DH * GH,
+			   EH * GH, FH * GH, GH * GH, HH * GH};
+int16_t expectedh8_8[8] = {AH * HH, BH * HH, CH * HH, DH * HH,
+			   EH * HH, FH * HH, GH * HH, HH * HH};
+int16_t _elemhA = AH;
+int16_t _elemhB = BH;
+int16_t _elemhC = CH;
+int16_t _elemhD = DH;
+int16_t _elemhE = EH;
+int16_t _elemhF = FH;
+int16_t _elemhG = GH;
+int16_t _elemhH = HH;
+
+#define AUS (1024)
+#define BUS (31)
+#define CUS (0)
+#define DUS (655)
+uint32_t expectedus2_1[2] = {AUS * AUS, BUS * AUS};
+uint32_t expectedus2_2[2] = {AUS * BUS, BUS * BUS};
+uint32_t expectedus4_1[4] = {AUS * AUS, BUS * AUS, CUS * AUS, DUS * AUS};
+uint32_t expectedus4_2[4] = {AUS * BUS, BUS * BUS, CUS * BUS, DUS * BUS};
+uint32_t expectedus4_3[4] = {AUS * CUS, BUS * CUS, CUS * CUS, DUS * CUS};
+uint32_t expectedus4_4[4] = {AUS * DUS, BUS * DUS, CUS * DUS, DUS * DUS};
+uint32_t _elemusA = AUS;
+uint32_t _elemusB = BUS;
+uint32_t _elemusC = CUS;
+uint32_t _elemusD = DUS;
+
+#define AUH ((uint16_t) 0)
+#define BUH ((uint16_t) 32)
+#define CUH ((uint16_t) 102)
+#define DUH ((uint16_t) 51)
+#define EUH ((uint16_t) 71)
+#define FUH ((uint16_t) 91)
+#define GUH ((uint16_t) 48)
+#define HUH ((uint16_t) 255)
+uint16_t expecteduh4_1[4] = {AUH * AUH, BUH * AUH, CUH * AUH, DUH * AUH};
+uint16_t expecteduh4_2[4] = {AUH * BUH, BUH * BUH, CUH * BUH, DUH * BUH};
+uint16_t expecteduh4_3[4] = {AUH * CUH, BUH * CUH, CUH * CUH, DUH * CUH};
+uint16_t expecteduh4_4[4] = {AUH * DUH, BUH * DUH, CUH * DUH, DUH * DUH};
+uint16_t expecteduh8_1[8] = {AUH * AUH, BUH * AUH, CUH * AUH, DUH * AUH,
+			     EUH * AUH, FUH * AUH, GUH * AUH, HUH * AUH};
+uint16_t expecteduh8_2[8] = {AUH * BUH, BUH * BUH, CUH * BUH, DUH * BUH,
+			     EUH * BUH, FUH * BUH, GUH * BUH, HUH * BUH};
+uint16_t expecteduh8_3[8] = {AUH * CUH, BUH * CUH, CUH * CUH, DUH * CUH,
+			     EUH * CUH, FUH * CUH, GUH * CUH, HUH * CUH};
+uint16_t expecteduh8_4[8] = {AUH * DUH, BUH * DUH, CUH * DUH, DUH * DUH,
+			     EUH * DUH, FUH * DUH, GUH * DUH, HUH * DUH};
+uint16_t expecteduh8_5[8] = {AUH * EUH, BUH * EUH, CUH * EUH, DUH * EUH,
+			     EUH * EUH, FUH * EUH, GUH * EUH, HUH * EUH};
+uint16_t expecteduh8_6[8] = {AUH * FUH, BUH * FUH, CUH * FUH, DUH * FUH,
+			     EUH * FUH, FUH * FUH, GUH * FUH, HUH * FUH};
+uint16_t expecteduh8_7[8] = {AUH * GUH, BUH * GUH, CUH * GUH, DUH * GUH,
+			     EUH * GUH, FUH * GUH, GUH * GUH, HUH * GUH};
+uint16_t expecteduh8_8[8] = {AUH * HUH, BUH * HUH, CUH * HUH, DUH * HUH,
+			     EUH * HUH, FUH * HUH, GUH * HUH, HUH * HUH};
+uint16_t _elemuhA = AUH;
+uint16_t _elemuhB = BUH;
+uint16_t _elemuhC = CUH;
+uint16_t _elemuhD = DUH;
+uint16_t _elemuhE = EUH;
+uint16_t _elemuhF = FUH;
+uint16_t _elemuhG = GUH;
+uint16_t _elemuhH = HUH;
+
+void
+check_v2sf (float32_t elemA, float32_t elemB)
+{
+  int32_t indx;
+  const float32_t vec32x2_buf[2] = {A, B};
+  float32x2_t vec32x2_src = vld1_f32 (vec32x2_buf);
+  float32_t vec32x2_res[2];
+
+  vst1_f32 (vec32x2_res, vmul_n_f32 (vec32x2_src, elemA));
+
+  for (indx = 0; indx < 2; indx++)
+    if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_1[indx])
+      abort ();
+
+  vst1_f32 (vec32x2_res, vmul_n_f32 (vec32x2_src, elemB));
+
+  for (indx = 0; indx < 2; indx++)
+    if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_2[indx])
+      abort ();
+
+/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[0\\\]" 2 } } */
+}
+
+void
+check_v4sf (float32_t elemA, float32_t elemB, float32_t elemC, float32_t elemD)
+{
+  int32_t indx;
+  const float32_t vec32x4_buf[4] = {A, B, C, D};
+  float32x4_t vec32x4_src = vld1q_f32 (vec32x4_buf);
+  float32_t vec32x4_res[4];
+
+  vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemA));
+
+  for (indx = 0; indx < 4; indx++)
+    if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_1[indx])
+      abort ();
+
+  vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemB));
+
+  for (indx = 0; indx < 4; indx++)
+    if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_2[indx])
+      abort ();
+
+  vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemC));
+
+  for (indx = 0; indx < 4; indx++)
+    if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_3[indx])
+      abort ();
+
+  vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemD));
+
+  for (indx = 0; indx < 4; indx++)
+    if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_4[indx])
+      abort ();
+
+/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[0\\\]" 4 } } */
+}
+
+void
+check_v2df (float64_t elemdC, float64_t elemdD)
+{
+  int32_t indx;
+  const float64_t vec64x2_buf[2] = {AD, BD};
+  float64x2_t vec64x2_src = vld1q_f64 (vec64x2_buf);
+  float64_t vec64x2_res[2];
+
+  vst1q_f64 (vec64x2_res, vmulq_n_f64 (vec64x2_src, elemdC));
+
+  for (indx = 0; indx < 2; indx++)
+    if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_1[indx])
+      abort ();
+
+  vst1q_f64 (vec64x2_res, vmulq_n_f64 (vec64x2_src, elemdD));
+
+  for (indx = 0; indx < 2; indx++)
+    if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_2[indx])
+      abort ();
+
+/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[0\\\]" 2 } } */
+}
+
+void
+check_v2si (int32_t elemsA, int32_t elemsB)
+{
+  int32_t indx;
+  const int32_t vecs32x2_buf[2] = {AS, BS};
+  int32x2_t vecs32x2_src = vld1_s32 (vecs32x2_buf);
+  int32_t vecs32x2_res[2];
+
+  vst1_s32 (vecs32x2_res, vmul_n_s32 (vecs32x2_src, elemsA));
+
+  for (indx = 0; indx < 2; indx++)
+    if (vecs32x2_res[indx] != expecteds2_1[indx])
+      abort ();
+
+  vst1_s32 (vecs32x2_res, vmul_n_s32 (vecs32x2_src, elemsB));
+
+  for (indx = 0; indx < 2; indx++)
+    if (vecs32x2_res[indx] != expecteds2_2[indx])
+      abort ();
+}
+
+void
+check_v2si_unsigned (uint32_t elemusA, uint32_t elemusB)
+{
+  int indx;
+  const uint32_t vecus32x2_buf[2] = {AUS, BUS};
+  uint32x2_t vecus32x2_src = vld1_u32 (vecus32x2_buf);
+  uint32_t vecus32x2_res[2];
+
+  vst1_u32 (vecus32x2_res, vmul_n_u32 (vecus32x2_src, elemusA));
+
+  for (indx = 0; indx < 2; indx++)
+    if (vecus32x2_res[indx] != expectedus2_1[indx])
+      abort ();
+
+  vst1_u32 (vecus32x2_res, vmul_n_u32 (vecus32x2_src, elemusB));
+
+  for (indx = 0; indx < 2; indx++)
+    if (vecus32x2_res[indx] != expectedus2_2[indx])
+      abort ();
+
+/* { dg-final { scan-assembler-times "\tmul\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[0\\\]" 4 } } */
+}
+
+void
+check_v4si (int32_t elemsA, int32_t elemsB, int32_t elemsC, int32_t elemsD)
+{
+  int32_t indx;
+  const int32_t vecs32x4_buf[4] = {AS, BS, CS, DS};
+  int32x4_t vecs32x4_src = vld1q_s32 (vecs32x4_buf);
+  int32_t vecs32x4_res[4];
+
+  vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsA));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vecs32x4_res[indx] != expecteds4_1[indx])
+      abort ();
+
+  vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsB));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vecs32x4_res[indx] != expecteds4_2[indx])
+      abort ();
+
+  vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsC));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vecs32x4_res[indx] != expecteds4_3[indx])
+      abort ();
+
+  vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsD));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vecs32x4_res[indx] != expecteds4_4[indx])
+      abort ();
+}
+
+void
+check_v4si_unsigned (uint32_t elemusA, uint32_t elemusB, uint32_t elemusC,
+		     uint32_t elemusD)
+{
+  int indx;
+  const uint32_t vecus32x4_buf[4] = {AUS, BUS, CUS, DUS};
+  uint32x4_t vecus32x4_src = vld1q_u32 (vecus32x4_buf);
+  uint32_t vecus32x4_res[4];
+
+  vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusA));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vecus32x4_res[indx] != expectedus4_1[indx])
+      abort ();
+
+  vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusB));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vecus32x4_res[indx] != expectedus4_2[indx])
+      abort ();
+
+  vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusC));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vecus32x4_res[indx] != expectedus4_3[indx])
+      abort ();
+
+  vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusD));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vecus32x4_res[indx] != expectedus4_4[indx])
+      abort ();
+
+/* { dg-final { scan-assembler-times "\tmul\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[0\\\]" 8 } } */
+}
+
+
+void
+check_v4hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD)
+{
+  int32_t indx;
+  const int16_t vech16x4_buf[4] = {AH, BH, CH, DH};
+  int16x4_t vech16x4_src = vld1_s16 (vech16x4_buf);
+  int16_t vech16x4_res[4];
+
+  vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhA));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vech16x4_res[indx] != expectedh4_1[indx])
+      abort ();
+
+  vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhB));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vech16x4_res[indx] != expectedh4_2[indx])
+      abort ();
+
+  vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhC));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vech16x4_res[indx] != expectedh4_3[indx])
+      abort ();
+
+  vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhD));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vech16x4_res[indx] != expectedh4_4[indx])
+      abort ();
+}
+
+void
+check_v4hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC,
+		     uint16_t elemuhD)
+{
+  int indx;
+  const uint16_t vecuh16x4_buf[4] = {AUH, BUH, CUH, DUH};
+  uint16x4_t vecuh16x4_src = vld1_u16 (vecuh16x4_buf);
+  uint16_t vecuh16x4_res[4];
+
+  vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhA));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vecuh16x4_res[indx] != expecteduh4_1[indx])
+      abort ();
+
+  vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhB));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vecuh16x4_res[indx] != expecteduh4_2[indx])
+      abort ();
+
+  vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhC));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vecuh16x4_res[indx] != expecteduh4_3[indx])
+      abort ();
+
+  vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhD));
+
+  for (indx = 0; indx < 4; indx++)
+    if (vecuh16x4_res[indx] != expecteduh4_4[indx])
+      abort ();
+
+/* { dg-final { scan-assembler-times "mul\tv\[0-9\]+\.4h, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[0\\\]" 8 } } */
+}
+
+void
+check_v8hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD,
+	    int16_t elemhE, int16_t elemhF, int16_t elemhG, int16_t elemhH)
+{
+  int32_t indx;
+  const int16_t vech16x8_buf[8] = {AH, BH, CH, DH, EH, FH, GH, HH};
+  int16x8_t vech16x8_src = vld1q_s16 (vech16x8_buf);
+  int16_t vech16x8_res[8];
+
+  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhA));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vech16x8_res[indx] != expectedh8_1[indx])
+      abort ();
+
+  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhB));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vech16x8_res[indx] != expectedh8_2[indx])
+      abort ();
+
+  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhC));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vech16x8_res[indx] != expectedh8_3[indx])
+      abort ();
+
+  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhD));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vech16x8_res[indx] != expectedh8_4[indx])
+      abort ();
+
+  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhE));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vech16x8_res[indx] != expectedh8_5[indx])
+      abort ();
+
+  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhF));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vech16x8_res[indx] != expectedh8_6[indx])
+      abort ();
+
+  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhG));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vech16x8_res[indx] != expectedh8_7[indx])
+      abort ();
+
+  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhH));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vech16x8_res[indx] != expectedh8_8[indx])
+      abort ();
+}
+
+void
+check_v8hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC,
+		     uint16_t elemuhD, uint16_t elemuhE, uint16_t elemuhF,
+		     uint16_t elemuhG, uint16_t elemuhH)
+{
+  int indx;
+  const uint16_t vecuh16x8_buf[8] = {AUH, BUH, CUH, DUH, EUH, FUH, GUH, HUH};
+  uint16x8_t vecuh16x8_src = vld1q_u16 (vecuh16x8_buf);
+  uint16_t vecuh16x8_res[8];
+
+  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhA));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vecuh16x8_res[indx] != expecteduh8_1[indx])
+      abort ();
+
+  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhB));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vecuh16x8_res[indx] != expecteduh8_2[indx])
+      abort ();
+
+  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhC));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vecuh16x8_res[indx] != expecteduh8_3[indx])
+      abort ();
+
+  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhD));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vecuh16x8_res[indx] != expecteduh8_4[indx])
+      abort ();
+
+  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhE));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vecuh16x8_res[indx] != expecteduh8_5[indx])
+      abort ();
+
+  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhF));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vecuh16x8_res[indx] != expecteduh8_6[indx])
+      abort ();
+
+  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhG));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vecuh16x8_res[indx] != expecteduh8_7[indx])
+      abort ();
+
+  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhH));
+
+  for (indx = 0; indx < 8; indx++)
+    if (vecuh16x8_res[indx] != expecteduh8_8[indx])
+      abort ();
+
+/* { dg-final { scan-assembler-times "mul\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.h\\\[0\\\]" 16 } } */
+}
+
+int
+main (void)
+{
+  check_v2sf (_elemA, _elemB);
+  check_v4sf (_elemA, _elemB, _elemC, _elemD);
+  check_v2df (_elemdC, _elemdD);
+  check_v2si (_elemsA, _elemsB);
+  check_v4si (_elemsA, _elemsB, _elemsC, _elemsD);
+  check_v4hi (_elemhA, _elemhB, _elemhC, _elemhD);
+  check_v8hi (_elemhA, _elemhB, _elemhC, _elemhD,
+	      _elemhE, _elemhF, _elemhG, _elemhH);
+  check_v2si_unsigned (_elemusA, _elemusB);
+  check_v4si_unsigned (_elemusA, _elemusB, _elemusC, _elemusD);
+  check_v4hi_unsigned (_elemuhA, _elemuhB, _elemuhC, _elemuhD);
+  check_v8hi_unsigned (_elemuhA, _elemuhB, _elemuhC, _elemuhD,
+		       _elemuhE, _elemuhF, _elemuhG, _elemuhH);
+
+  return 0;
+}
+
author	Christophe Lyon <christophe.lyon@linaro.org>	2016-06-06 23:44:46 +0200
committer	Yvan Roux <yvan.roux@linaro.org>	2016-06-14 14:47:54 +0200
commit	060934a01ffa66b5d87ab7098b3dc3d8d4e27e10 (patch)
tree	aa124935b851b6c430e3a8f7875c87ddde0b5653
parent	309490ea34b4e40522a6c1645ee191c1ad5f68c9 (diff)