Added unaligned load and dot product to the ARM NEON backend (branch: st/cli-fe-vect)

git-svn-id: https://gcc.gnu.org/svn/gcc/branches/st/cli-fe-vect@164473 138bc75d-0d04-0410-961f-82ee72b054a4
author: David Yuste <dyuste@gcc.gnu.org> 2010-09-21 09:13:34 +0000
committer: David Yuste <dyuste@gcc.gnu.org> 2010-09-21 09:13:34 +0000
commit: cc69bc56ec545d44bc5d1577ac92bf10a82306e5 (patch)
tree: 7d0466eb01245a79cacd05fd3622e799e69580de
parent: f749dfefe80a96b8b76a14c2dc26052476748782 (diff)
15 files changed, 223 insertions, 11 deletions
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index ed569e2df0c7..d3402407ba85 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -224,6 +224,11 @@ static bool arm_can_eliminate (const int, const int);
 static void arm_asm_trampoline_template (FILE *);
 static void arm_trampoline_init (rtx, tree, rtx);
 static rtx arm_trampoline_adjust_address (rtx);
+static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
+static bool arm_builtin_support_vector_misalignment (enum machine_mode mode,
+                                                    const_tree type,
+                                                    int misalignment,
+                                                    bool is_packed);
 
 
 /* Table of machine attributes.  */
@@ -507,6 +512,14 @@ static const struct attribute_spec arm_attribute_table[] =
 #undef TARGET_CAN_ELIMINATE
 #define TARGET_CAN_ELIMINATE arm_can_eliminate
 
+#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
+#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
+  arm_vector_alignment_reachable
+
+#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
+#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
+  arm_builtin_support_vector_misalignment
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 /* Obstack for minipool constant handling.  */
@@ -8436,7 +8449,8 @@ neon_vector_mem_operand (rtx op, int type)
     return arm_address_register_rtx_p (ind, 0);
 
   /* Allow post-increment with Neon registers.  */
-  if (type != 1 && (GET_CODE (ind) == POST_INC || GET_CODE (ind) == PRE_DEC))
+  if ((type != 1 && GET_CODE (ind) == POST_INC)
+      || (type == 0 && GET_CODE (ind) == PRE_DEC))
     return arm_address_register_rtx_p (XEXP (ind, 0), 0);
 
   /* FIXME: vld1 allows register post-modify.  */
@@ -15404,6 +15418,8 @@ arm_print_operand (FILE *stream, rtx x, int code)
       {
 	rtx addr;
 	bool postinc = FALSE;
+	unsigned align, modesize, align_bits;
+
 	gcc_assert (GET_CODE (x) == MEM);
 	addr = XEXP (x, 0);
 	if (GET_CODE (addr) == POST_INC)
@@ -15411,7 +15427,29 @@ arm_print_operand (FILE *stream, rtx x, int code)
 	    postinc = 1;
 	    addr = XEXP (addr, 0);
 	  }
-	asm_fprintf (stream, "[%r]", REGNO (addr));
+	asm_fprintf (stream, "[%r", REGNO (addr));
+
+	/* We know the alignment of this access, so we can emit a hint in the
+	   instruction (for some alignments) as an aid to the memory subsystem
+	   of the target.  */
+	align = MEM_ALIGN (x) >> 3;
+	modesize = GET_MODE_SIZE (GET_MODE (x));
+	
+	/* Only certain alignment specifiers are supported by the hardware.  */
+	if (modesize == 16 && (align % 32) == 0)
+	  align_bits = 256;
+	else if ((modesize == 8 || modesize == 16) && (align % 16) == 0)
+	  align_bits = 128;
+	else if ((align % 8) == 0)
+	  align_bits = 64;
+	else
+	  align_bits = 0;
+	
+	if (align_bits != 0)
+	  asm_fprintf (stream, ", :%d", align_bits);
+
+	asm_fprintf (stream, "]");
+
 	if (postinc)
 	  fputs("!", stream);
       }
@@ -21457,4 +21495,43 @@ arm_have_conditional_execution (void)
   return !TARGET_THUMB1;
 }
 
+/* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE (this function is
+   installed under that hook name earlier in this file).  TYPE and IS_PACKED
+   are passed through unchanged to the generic default when the NEON
+   little-endian case does not apply.  Returns true when alignment is
+   considered reachable.  */
+static bool
+arm_vector_alignment_reachable (const_tree type, bool is_packed)
+{
+  /* Vectors which aren't in packed structures will not be less aligned than
+     the natural alignment of their element type, so this is safe.  */
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+    return !is_packed;
+
+  return default_builtin_vector_alignment_reachable (type, is_packed);
+}
+
+/* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT (this function is
+   installed under that hook name earlier in this file).  Return true if a
+   vector access of TYPE with the given MISALIGNMENT (-1 meaning unknown)
+   can be handled.  The NEON little-endian case is decided here from the
+   alignment of TYPE; MODE is only consulted by the generic fallback.  */
+static bool
+arm_builtin_support_vector_misalignment (enum machine_mode mode,
+					 const_tree type, int misalignment,
+					 bool is_packed)
+{
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+    {
+      HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type);
+
+      /* A packed access is accepted only when TYPE's alignment is a single
+         unit, i.e. no alignment is assumed at all.  */
+      if (is_packed)
+        return align == 1;
+
+      /* If the misalignment is unknown, we should be able to handle the access
+	 so long as it is not to a member of a packed data structure.  */
+      if (misalignment == -1)
+        return true;
+
+      /* Return true if the misalignment is a multiple of the natural alignment
+         of the vector's element type.  This is probably always going to be
+	 true in practice, since we've already established that this isn't a
+	 packed access.  */
+      return ((misalignment % align) == 0);
+    }
+  
+  /* Not NEON, or big-endian: defer to the target-independent default.  */
+  return default_builtin_support_vector_misalignment (mode, type, misalignment,
+						      is_packed);
+}
+
 #include "gt-arm.h"
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 43b3805c7bad..c22bf56f19e3 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -159,7 +159,8 @@
    (UNSPEC_VUZP1		201)
    (UNSPEC_VUZP2		202)
    (UNSPEC_VZIP1		203)
-   (UNSPEC_VZIP2		204)])
+   (UNSPEC_VZIP2		204)
+   (UNSPEC_MISALIGNED_ACCESS	205)])
 
 ;; Double-width vector modes.
 (define_mode_iterator VD [V8QI V4HI V2SI V2SF])
@@ -674,6 +675,52 @@
   neon_disambiguate_copy (operands, dest, src, 4);
 })
 
+;; Expander for the movmisalign<mode> standard pattern on little-endian NEON.
+;; The move is wrapped in UNSPEC_MISALIGNED_ACCESS so that it is matched by
+;; the dedicated *movmisalign<mode>_neon_{load,store} insns rather than by
+;; the ordinary move patterns.
+(define_expand "movmisalign<mode>"
+  [(set (match_operand:VDQX 0 "nonimmediate_operand"	      "")
+	(unspec:VDQX [(match_operand:VDQX 1 "general_operand" "")]
+		     UNSPEC_MISALIGNED_ACCESS))]
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
+{
+  /* This pattern is not permitted to fail during expansion: if both arguments
+     are non-registers (e.g. memory := constant, which can be created by the
+     auto-vectorizer), force operand 1 into a register.  */
+  if (!s_register_operand (operands[0], <MODE>mode)
+      && !s_register_operand (operands[1], <MODE>mode))
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+;; Misaligned store for the VDX modes: register operand 1 is written to
+;; memory operand 0 with a single vst1.  (VDX is defined outside this
+;; excerpt; presumably the 64-bit D-register modes -- confirm.)
+(define_insn "*movmisalign<mode>_neon_store"
+  [(set (match_operand:VDX 0 "memory_operand"		       "=Um")
+	(unspec:VDX [(match_operand:VDX 1 "s_register_operand" " w")]
+		    UNSPEC_MISALIGNED_ACCESS))]
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
+  "vst1.<V_sz_elem>\t{%P1}, %A0"
+  [(set_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")])
+
+;; Misaligned load for the VDX modes: memory operand 1 is read into
+;; register operand 0 with a single vld1.
+(define_insn "*movmisalign<mode>_neon_load"
+  [(set (match_operand:VDX 0 "s_register_operand"	   "=w")
+	(unspec:VDX [(match_operand:VDX 1 "memory_operand" " Um")]
+		    UNSPEC_MISALIGNED_ACCESS))]
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
+  "vld1.<V_sz_elem>\t{%P0}, %A1"
+  [(set_attr "neon_type" "neon_vld1_1_2_regs")])
+
+;; Misaligned store for the VQX modes: same shape as the VDX store, but the
+;; register operand is printed with the %q modifier.  (VQX is defined outside
+;; this excerpt; presumably the 128-bit Q-register modes -- confirm.)
+(define_insn "*movmisalign<mode>_neon_store"
+  [(set (match_operand:VQX 0 "memory_operand"		       "=Um")
+	(unspec:VQX [(match_operand:VQX 1 "s_register_operand" " w")]
+		    UNSPEC_MISALIGNED_ACCESS))]
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
+  "vst1.<V_sz_elem>\t{%q1}, %A0"
+  [(set_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")])
+
+;; Misaligned load for the VQX modes: same shape as the VDX load, but the
+;; register operand is printed with the %q modifier.
+(define_insn "*movmisalign<mode>_neon_load"
+  [(set (match_operand:VQX 0 "s_register_operand"	   "=w")
+	(unspec:VQX [(match_operand:VQX 1 "memory_operand" " Um")]
+		    UNSPEC_MISALIGNED_ACCESS))]
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
+  "vld1.<V_sz_elem>\t{%q0}, %A1"
+  [(set_attr "neon_type" "neon_vld1_1_2_regs")])
+
 (define_insn "vec_set<mode>_internal"
   [(set (match_operand:VD 0 "s_register_operand" "=w")
         (vec_merge:VD
@@ -5051,3 +5098,38 @@
   emit_insn (gen_orn<mode>3_neon (operands[0], operands[1], operands[2]));
   DONE;
 })
+
+
+;; Dot product: operand 0 = operand 3 + pairwise-summed (operand 1 * operand 2).
+;; NOTE(review): vmul yields V4HI, so each product is kept to 16 bits --
+;; confirm that a widening multiply is not required here.
+(define_expand "udot_prodv4hi"
+  [(match_operand:V2SI 0 "register_operand" "=v")
+   (match_operand:V4HI 1 "register_operand" "v")
+   (match_operand:V4HI 2 "register_operand" "v")
+   (match_operand:V2SI 3 "register_operand" "v")]  
+ "TARGET_NEON"
+{
+ /* Multiply into a fresh pseudo so the input operands are not clobbered.  */
+ rtx tmp = gen_reg_rtx (V4HImode);
+ emit_insn (gen_neon_vmulv4hi (tmp, operands[1], operands[2], const0_rtx));
+ emit_insn (gen_neon_vpadalv4hi (operands[0], operands[3], tmp, const0_rtx));
+ DONE;
+})
+
+;; Signed dot product: operand 0 = operand 3 + pairwise sums of op 1 * op 2.
+(define_expand "sdot_prodv4hi"
+  [(match_operand:V2SI 0 "register_operand" "=v")
+   (match_operand:V4HI 1 "register_operand" "v")
+   (match_operand:V4HI 2 "register_operand" "v")
+   (match_operand:V2SI 3 "register_operand" "v")]  
+ "TARGET_NEON"
+{
+ /* Fresh pseudo so no input is clobbered; const1_rtx selects signed ops.  */
+ rtx tmp = gen_reg_rtx (V4HImode);
+ emit_insn (gen_neon_vmulv4hi (tmp, operands[1], operands[2], const1_rtx));
+ emit_insn (gen_neon_vpadalv4hi (operands[0], operands[3], tmp, const1_rtx));
+ DONE;
+})
+
+
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 6fbb7cdcdac6..9971cedbfb52 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -2750,3 +2750,4 @@
   emit_insn (gen_altivec_vcfux (operands[0], tmp, const0_rtx));
   DONE;
 }")
+
diff --git a/gcc/expand-vect-cli.c b/gcc/expand-vect-cli.c
index aacdbfb749b3..81f8219916a4 100644
--- a/gcc/expand-vect-cli.c
+++ b/gcc/expand-vect-cli.c
@@ -219,12 +219,12 @@ get_vectype (tree scalar_type, bool *ignore_stmt)
       return NULL_TREE;
     }
 
-  if (nbytes == 0 || nbytes >= UNITS_PER_SIMD_WORD (inner_mode))
+  if (nbytes == 0 || nbytes >= simd_width)
     return NULL_TREE;
 
   /* FORNOW: Only a single vector size per mode (UNITS_PER_SIMD_WORD)
      is expected.  */
-  nunits = UNITS_PER_SIMD_WORD (inner_mode) / nbytes;
+  nunits = simd_width / nbytes;
 
   vectype = build_vector_type (scalar_type, nunits);
 
diff --git a/gcc/expr.c b/gcc/expr.c
index ad66d934d2f3..667dd76e76d7 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -4351,7 +4351,10 @@ expand_assignment (tree to, tree from, bool nontemporal)
            && op_mode1 != VOIDmode)
          reg = copy_to_mode_reg (op_mode1, reg);
 
-      insn = GEN_FCN (icode) (mem, reg);
+       insn = GEN_FCN (icode) (mem, reg);
+       /* The movmisalign<mode> pattern cannot fail, else the assignment would
+          silently be omitted.  */
+       gcc_assert (insn != NULL_RTX);
        emit_insn (insn);
        return;
      }
@@ -8718,6 +8721,7 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode,
 
 	    /* Nor can the insn generator.  */
 	    insn = GEN_FCN (icode) (reg, temp);
+	    gcc_assert (insn != NULL_RTX);
 	    emit_insn (insn);
 
 	    return reg;
diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-8.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-8.c
index ea67946e5053..afa5b3d241e9 100644
--- a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-8.c
+++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-8.c
@@ -46,5 +46,5 @@ int main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" { xfail { ! { vect_hw_misalign } } } } } */
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" { xfail { ! { vect_element_align } } } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/no-vfa-pr29145.c b/gcc/testsuite/gcc.dg/vect/no-vfa-pr29145.c
index 954fe25df041..954474eb9256 100644
--- a/gcc/testsuite/gcc.dg/vect/no-vfa-pr29145.c
+++ b/gcc/testsuite/gcc.dg/vect/no-vfa-pr29145.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdarg.h>
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/slp-25.c b/gcc/testsuite/gcc.dg/vect/slp-25.c
index b660508a79c0..ebeebd8c10b1 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-25.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-25.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdarg.h>
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/slp-3.c b/gcc/testsuite/gcc.dg/vect/slp-3.c
index 070715371bb7..18614cdfb2ee 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-3.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-3.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdarg.h>
 #include <stdio.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-109.c b/gcc/testsuite/gcc.dg/vect/vect-109.c
index dd9f8ea77a31..420d04215841 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-109.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-109.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdarg.h>
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-95.c b/gcc/testsuite/gcc.dg/vect/vect-95.c
index c1d5926e67de..c03d1965df12 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-95.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-95.c
@@ -56,14 +56,14 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" { xfail {vect_hw_misalign} } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" { xfail {vect_element_align} } } } */
 
 /* For targets that support unaligned loads we version for the two unaligned 
    stores and generate misaligned accesses for the loads. For targets that 
    don't support unaligned loads we version for all four accesses.  */
 
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_hw_misalign} } } }  */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_element_align} } } }  */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 2 "vect" { xfail { vect_no_align || vect_element_align } } } } */
 /*  { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 0 "vect" { target vect_no_align } } } */
 /*  { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 4 "vect" { target vect_no_align } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c
index e8fe027f5f30..cf3dcca359f9 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdarg.h>
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c
index 274fb0253196..eb16e8354aed 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdarg.h>
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-5.c b/gcc/testsuite/gcc.dg/vect/vect-outer-5.c
index 2fc421e96804..01094d343e8c 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-outer-5.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-outer-5.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_float } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdio.h>
 #include <stdarg.h>
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 126ae380fe79..33c747141f6a 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1575,6 +1575,18 @@ proc check_effective_target_arm32 { } {
     }]
 }
 
+# Return 1 if this is an ARM target that only supports aligned vector accesses
+proc check_effective_target_arm_vect_no_misalign { } {
+    # The probe hits its error directive when __arm__ is not defined, or when
+    # the target is little-endian (__ARMEL__) and is either not Thumb or is
+    # Thumb-2.  It therefore compiles without messages only for ARM targets
+    # that are big-endian or Thumb-1.
+    return [check_no_compiler_messages arm_vect_no_misalign assembly {
+	#if !defined(__arm__) \
+	    || (defined(__ARMEL__) \
+	        && (!defined(__thumb__) || defined(__thumb2__)))
+	#error FOO
+	#endif
+    }]
+}
+
+
 # Return 1 if this is an ARM target supporting -mfpu=vfp
 # -mfloat-abi=softfp.  Some multilibs may be incompatible with these
 # options.
@@ -2392,7 +2404,7 @@ proc check_effective_target_vect_no_align { } {
 	if { [istarget mipsisa64*-*-*]
 	     || [istarget sparc*-*-*]
 	     || [istarget ia64-*-*]
-	     || [check_effective_target_arm32] } { 
+	     || [check_effective_target_arm_vect_no_misalign] } { 
 	    set et_vect_no_align_saved 1
 	}
     }
@@ -2527,6 +2539,25 @@ proc check_effective_target_vector_alignment_reachable_for_64bit { } {
     return $et_vector_alignment_reachable_for_64bit_saved
 }
 
+# Return 1 if the target only requires element alignment for vector accesses
+
+proc check_effective_target_vect_element_align { } {
+    # The answer is computed once and cached in this global.
+    global et_vect_element_align
+
+    if [info exists et_vect_element_align] {
+	verbose "check_effective_target_vect_element_align: using cached result" 2
+    } else {
+	set et_vect_element_align 0
+	# Every arm*-*-* triplet qualifies, as does any target that already
+	# passes the vect_hw_misalign check.
+	if { [istarget arm*-*-*]
+	     || [check_effective_target_vect_hw_misalign] } {
+	   set et_vect_element_align 1
+	}
+    }
+
+    verbose "check_effective_target_vect_element_align: returning $et_vect_element_align" 2
+    return $et_vect_element_align
+}
+
 # Return 1 if the target supports vector conditional operations, 0 otherwise.
 
 proc check_effective_target_vect_condition { } {
@@ -3084,6 +3115,16 @@ proc add_options_for_bind_pic_locally { flags } {
     return $flags
 }
 
+# Add to FLAGS the flags needed to enable 128-bit vectors.
+
+proc add_options_for_quad_vectors { flags } {
+    # Only targets passing the arm_neon_ok check get the extra option; for
+    # every other target FLAGS is returned unchanged.
+    if [is-effective-target arm_neon_ok] {
+	return "$flags -mvectorize-with-neon-quad"
+    }
+
+    return $flags
+}
+
 # Return 1 if the target provides a full C99 runtime.
 
 proc check_effective_target_c99_runtime { } {
author	David Yuste <dyuste@gcc.gnu.org>	2010-09-21 09:13:34 +0000
committer	David Yuste <dyuste@gcc.gnu.org>	2010-09-21 09:13:34 +0000
commit	cc69bc56ec545d44bc5d1577ac92bf10a82306e5 (patch)
tree	7d0466eb01245a79cacd05fd3622e799e69580de
parent	f749dfefe80a96b8b76a14c2dc26052476748782 (diff)