From 995281d383768adf62bde791d30eabd4bcd4601c Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 3 Jun 2015 20:32:00 +0200 Subject: 2015-06-03 Christophe Lyon gcc/ Backport from trunk r222229. 2015-04-20 Alan Lawrence PR target/64134 * config/aarch64/aarch64.c (aarch64_expand_vector_init): Load constant and overwrite variable parts if <= 1/2 the elements are variable. gcc/testsuite/ Backport from trunk r222229. 2015-04-20 Alan Lawrence PR target/64134 * gcc.target/aarch64/vec_init_1.c: New test. Change-Id: Ic66a493ec04e7eb97964ccdbb2938c98ebbdb5f1 --- gcc/config/aarch64/aarch64.c | 73 +++++++++++++++++---------- gcc/testsuite/gcc.target/aarch64/vec_init_1.c | 34 +++++++++++++ 2 files changed, 81 insertions(+), 26 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/vec_init_1.c diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 0c63af04049..77a641e34ec 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -8751,22 +8751,19 @@ aarch64_expand_vector_init (rtx target, rtx vals) machine_mode mode = GET_MODE (target); machine_mode inner_mode = GET_MODE_INNER (mode); int n_elts = GET_MODE_NUNITS (mode); - int n_var = 0, one_var = -1; + int n_var = 0; + rtx any_const = NULL_RTX; bool all_same = true; - rtx x, mem; - int i; - x = XVECEXP (vals, 0, 0); - if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x)) - n_var = 1, one_var = 0; - - for (i = 1; i < n_elts; ++i) + for (int i = 0; i < n_elts; ++i) { - x = XVECEXP (vals, 0, i); + rtx x = XVECEXP (vals, 0, i); if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x)) - ++n_var, one_var = i; + ++n_var; + else + any_const = x; - if (!rtx_equal_p (x, XVECEXP (vals, 0, 0))) + if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) all_same = false; } @@ -8783,36 +8780,60 @@ aarch64_expand_vector_init (rtx target, rtx vals) /* Splat a single non-constant element if we can. */ if (all_same) { - x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0)); + rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0)); aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x)); return; } - /* One field is non-constant. Load constant then overwrite varying - field. This is more efficient than using the stack. */ - if (n_var == 1) + /* Half the fields (or less) are non-constant. Load constant then overwrite + varying fields. Hope that this is more efficient than using the stack. */ + if (n_var <= n_elts/2) { rtx copy = copy_rtx (vals); - rtx index = GEN_INT (one_var); - enum insn_code icode; - /* Load constant part of vector, substitute neighboring value for - varying element. */ - XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1); + /* Load constant part of vector. We really don't care what goes into the + parts we will overwrite, but we're more likely to be able to load the + constant efficiently if it has fewer, larger, repeating parts + (see aarch64_simd_valid_immediate). */ + for (int i = 0; i < n_elts; i++) + { + rtx x = XVECEXP (vals, 0, i); + if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) + continue; + rtx subst = any_const; + for (int bit = n_elts / 2; bit > 0; bit /= 2) + { + /* Look in the copied vector, as more elements are const. */ + rtx test = XVECEXP (copy, 0, i ^ bit); + if (CONST_INT_P (test) || CONST_DOUBLE_P (test)) + { + subst = test; + break; + } + } + XVECEXP (copy, 0, i) = subst; + } aarch64_expand_vector_init (target, copy); - /* Insert variable. */ - x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var)); - icode = optab_handler (vec_set_optab, mode); + /* Insert variables. */ + enum insn_code icode = optab_handler (vec_set_optab, mode); gcc_assert (icode != CODE_FOR_nothing); - emit_insn (GEN_FCN (icode) (target, x, index)); + + for (int i = 0; i < n_elts; i++) + { + rtx x = XVECEXP (vals, 0, i); + if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) + continue; + x = copy_to_mode_reg (inner_mode, x); + emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i))); + } return; } /* Construct the vector in memory one field at a time and load the whole vector. */ - mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); - for (i = 0; i < n_elts; i++) + rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); + for (int i = 0; i < n_elts; i++) emit_move_insn (adjust_address_nv (mem, inner_mode, i * GET_MODE_SIZE (inner_mode)), XVECEXP (vals, 0, i)); diff --git a/gcc/testsuite/gcc.target/aarch64/vec_init_1.c b/gcc/testsuite/gcc.target/aarch64/vec_init_1.c new file mode 100644 index 00000000000..64eaff2dada --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vec_init_1.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fomit-frame-pointer --save-temps -fno-inline" } */ + +extern void abort (void); + +typedef float float16x4_t __attribute__ ((vector_size ((16)))); + +float a; +float b; + +float16x4_t +make_vector () +{ + return (float16x4_t) { 0, 0, a, b }; +} + +int +main (int argc, char **argv) +{ + a = 4.0; + b = 3.0; + float16x4_t vec = make_vector (); + if (vec[0] != 0 || vec[1] != 0 || vec[2] != a || vec[3] != b) + abort (); + return 0; +} + +/* { dg-final { scan-assembler-times "ins\\t" 2 } } */ +/* What we want to check, is that make_vector does not stp the whole vector + to the stack. Unfortunately here we scan the body of main() too, which may + be a bit fragile - the test is currently passing only because of the option + -fomit-frame-pointer which avoids use of stp in the prologue to main(). */ +/* { dg-final { scan-assembler-not "stp\\t" } } */ +/* { dg-final { cleanup-saved-temps } } */ -- cgit v1.2.3