From c197f39ed0e00482c5631c74d2bf8f675de05b0d Mon Sep 17 00:00:00 2001 From: Matthew Gretton-Dann Date: Fri, 5 Oct 2012 10:36:11 +0100 Subject: Merge from fsf gcc arm/aarch64-4.7-branch (svn branches/arm/aarch64-4.7-branch 192093). --- ChangeLog.linaro | 9 +- gcc/ChangeLog.aarch64 | 26 ++++ gcc/config/aarch64/arm_neon.h | 51 ++++++-- gcc/reload.c | 286 ++++++++++++++++++++---------------------- 4 files changed, 208 insertions(+), 164 deletions(-) diff --git a/ChangeLog.linaro b/ChangeLog.linaro index 11ee2416ea4..47a8ab14923 100644 --- a/ChangeLog.linaro +++ b/ChangeLog.linaro @@ -1,7 +1,12 @@ +2012-10-05 Matthew Gretton-Dann + + Merge from fsf gcc arm/aarch64-4.7-branch + (svn branches/arm/aarch64-4.7-branch 192093). + 2012-10-03 Matthew Gretton-Dann - Merge from fsf gcc arm/aarch64-4_7-branch - (svn branches/arm/aarch64-4_7-branch 191926). + Merge from fsf gcc arm/aarch64-4.7-branch + (svn branches/arm/aarch64-4.7-branch 191926). 2012-10-02 Matthew Gretton-Dann diff --git a/gcc/ChangeLog.aarch64 b/gcc/ChangeLog.aarch64 index 00b93a66489..36b9d0bfcc0 100644 --- a/gcc/ChangeLog.aarch64 +++ b/gcc/ChangeLog.aarch64 @@ -1,3 +1,29 @@ +2012-10-04 Tejas Belagod + + * config/aarch64/arm_neon.h: Rename vqmll_* to + vqdmll_*. + +2012-10-04 Tejas Belagod + + * config/aarch64/arm_neon.h (vfma_n_f32, vfmaq_n_f32, vfmaq_n_f64): New. + +2012-10-04 Tejas Belagod + + * config/aarch64/arm_neon.h (vbslq_f64): Fix parameter type. + +2012-10-02 Tejas Belagod + Ulrich Weigand + + * reload.c (find_reloads_subreg_address): Remove FORCE_REPLACE + parameter. Always replace normal subreg with memory reference + whenever possible. Return NULL otherwise. + (find_reloads_toplev): Always call find_reloads_subreg_address + for subregs of registers equivalent to a memory location. + Only recurse further if find_reloads_subreg_address fails. + (find_reloads_address_1): Only call find_reloads_subreg_address + for subregs of registers equivalent to a memory location. + Properly handle failure of find_reloads_subreg_address. + 2012-10-01 Ian Bolton Richard Henderson diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index a092dfff94e..6f4480e2288 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -4975,7 +4975,7 @@ vbslq_f32 (uint32x4_t a, float32x4_t b, float32x4_t c) } __extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vbslq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) +vbslq_f64 (uint64x2_t a, float64x2_t b, float64x2_t c) { float64x2_t result; __asm__ ("bsl %0.16b, %2.16b, %3.16b" @@ -7886,6 +7886,39 @@ vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) result; \ }) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfma_n_f32 (float32x2_t a, float32x2_t b, float32_t c) +{ + float32x2_t result; + __asm__ ("fmla %0.2s, %2.2s, %3.s[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) +{ + float32x4_t result; + __asm__ ("fmla %0.4s, %2.4s, %3.s[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmaq_n_f64 (float64x2_t a, float64x2_t b, float64_t c) +{ + float64x2_t result; + __asm__ ("fmla %0.2d, %2.2d, %3.d[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vfms_f32 (float32x2_t a, float32x2_t b, float32x2_t c) { @@ -14084,7 +14117,7 @@ vqdmulhq_n_s32 (int32x4_t a, int32_t b) return result; } -#define vqmlalh_lane_s16(a, b, c) \ +#define vqdmlalh_lane_s16(a, b, c) \ __extension__ \ ({ \ int16x8_t b_ = (b); \ @@ -14098,7 +14131,7 @@ vqdmulhq_n_s32 (int32x4_t a, int32_t b) }) __extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqmlalh_s16 (int16_t a, int16_t b) +vqdmlalh_s16 (int16_t a, int16_t b) { int32_t result; __asm__ ("sqdmlal %s0,%h1,%h2" @@ -14108,7 +14141,7 @@ vqmlalh_s16 (int16_t a, int16_t b) return result; } -#define vqmlals_lane_s32(a, b, c) \ +#define vqdmlals_lane_s32(a, b, c) \ __extension__ \ ({ \ int32x4_t b_ = (b); \ @@ -14122,7 +14155,7 @@ vqmlalh_s16 (int16_t a, int16_t b) }) __extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqmlals_s32 (int32_t a, int32_t b) +vqdmlals_s32 (int32_t a, int32_t b) { int64_t result; __asm__ ("sqdmlal %d0,%s1,%s2" @@ -14132,7 +14165,7 @@ vqmlals_s32 (int32_t a, int32_t b) return result; } -#define vqmlslh_lane_s16(a, b, c) \ +#define vqdmlslh_lane_s16(a, b, c) \ __extension__ \ ({ \ int16x8_t b_ = (b); \ @@ -14146,7 +14179,7 @@ vqmlals_s32 (int32_t a, int32_t b) }) __extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqmlslh_s16 (int16_t a, int16_t b) +vqdmlslh_s16 (int16_t a, int16_t b) { int32_t result; __asm__ ("sqdmlsl %s0,%h1,%h2" @@ -14156,7 +14189,7 @@ vqmlslh_s16 (int16_t a, int16_t b) return result; } -#define vqmlsls_lane_s32(a, b, c) \ +#define vqdmlsls_lane_s32(a, b, c) \ __extension__ \ ({ \ int32x4_t b_ = (b); \ @@ -14170,7 +14203,7 @@ vqmlslh_s16 (int16_t a, int16_t b) }) __extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqmlsls_s32 (int32_t a, int32_t b) +vqdmlsls_s32 (int32_t a, int32_t b) { int64_t result; __asm__ ("sqdmlsl %d0,%s1,%s2" diff --git a/gcc/reload.c b/gcc/reload.c index 8420c808073..a46241980bc 100644 --- a/gcc/reload.c +++ b/gcc/reload.c @@ -283,7 +283,7 @@ static int find_reloads_address_1 (enum machine_mode, addr_space_t, rtx, int, static void find_reloads_address_part (rtx, rtx *, enum reg_class, enum machine_mode, int, enum reload_type, int); -static rtx find_reloads_subreg_address (rtx, int, int, enum reload_type, +static rtx find_reloads_subreg_address (rtx, int, enum reload_type, int, rtx, int *); static void copy_replacements_1 (rtx *, rtx *, int); static int find_inc_amount (rtx, rtx); @@ -4745,31 +4745,19 @@ find_reloads_toplev (rtx x, int opnum, enum reload_type type, } /* If the subreg contains a reg that will be converted to a mem, - convert the subreg to a narrower memref now. - Otherwise, we would get (subreg (mem ...) ...), - which would force reload of the mem. - - We also need to do this if there is an equivalent MEM that is - not offsettable. In that case, alter_subreg would produce an - invalid address on big-endian machines. - - For machines that extend byte loads, we must not reload using - a wider mode if we have a paradoxical SUBREG. find_reloads will - force a reload in that case. So we should not do anything here. */ + attempt to convert the whole subreg to a (narrower or wider) + memory reference instead. If this succeeds, we're done -- + otherwise fall through to check whether the inner reg still + needs address reloads anyway. */ if (regno >= FIRST_PSEUDO_REGISTER -#ifdef LOAD_EXTEND_OP - && !paradoxical_subreg_p (x) -#endif - && (reg_equiv_address (regno) != 0 - || (reg_equiv_mem (regno) != 0 - && (! strict_memory_address_addr_space_p - (GET_MODE (x), XEXP (reg_equiv_mem (regno), 0), - MEM_ADDR_SPACE (reg_equiv_mem (regno))) - || ! offsettable_memref_p (reg_equiv_mem (regno)) - || num_not_at_initial_offset)))) - x = find_reloads_subreg_address (x, 1, opnum, type, ind_levels, - insn, address_reloaded); + && reg_equiv_memory_loc (regno) != 0) + { + tem = find_reloads_subreg_address (x, opnum, type, ind_levels, + insn, address_reloaded); + if (tem) + return tem; + } } for (copied = 0, i = GET_RTX_LENGTH (code) - 1; i >= 0; i--) @@ -6007,12 +5995,31 @@ find_reloads_address_1 (enum machine_mode mode, addr_space_t as, if (ira_reg_class_max_nregs [rclass][GET_MODE (SUBREG_REG (x))] > reg_class_size[(int) rclass]) { - x = find_reloads_subreg_address (x, 0, opnum, - ADDR_TYPE (type), - ind_levels, insn, NULL); - push_reload (x, NULL_RTX, loc, (rtx*) 0, rclass, - GET_MODE (x), VOIDmode, 0, 0, opnum, type); - return 1; + /* If the inner register will be replaced by a memory + reference, we can do this only if we can replace the + whole subreg by a (narrower) memory reference. If + this is not possible, fall through and reload just + the inner register (including address reloads). */ + if (reg_equiv_memory_loc (REGNO (SUBREG_REG (x))) != 0) + { + rtx tem = find_reloads_subreg_address (x, opnum, + ADDR_TYPE (type), + ind_levels, insn, + NULL); + if (tem) + { + push_reload (tem, NULL_RTX, loc, (rtx*) 0, rclass, + GET_MODE (tem), VOIDmode, 0, 0, + opnum, type); + return 1; + } + } + else + { + push_reload (x, NULL_RTX, loc, (rtx*) 0, rclass, + GET_MODE (x), VOIDmode, 0, 0, opnum, type); + return 1; + } } } } @@ -6089,17 +6096,12 @@ find_reloads_address_part (rtx x, rtx *loc, enum reg_class rclass, } /* X, a subreg of a pseudo, is a part of an address that needs to be - reloaded. - - If the pseudo is equivalent to a memory location that cannot be directly - addressed, make the necessary address reloads. + reloaded, and the pseusdo is equivalent to a memory location. - If address reloads have been necessary, or if the address is changed - by register elimination, return the rtx of the memory location; - otherwise, return X. - - If FORCE_REPLACE is nonzero, unconditionally replace the subreg with the - memory location. + Attempt to replace the whole subreg by a (possibly narrower or wider) + memory reference. If this is possible, return this new memory + reference, and push all required address reloads. Otherwise, + return NULL. OPNUM and TYPE identify the purpose of the reload. @@ -6111,130 +6113,108 @@ find_reloads_address_part (rtx x, rtx *loc, enum reg_class rclass, stack slots. */ static rtx -find_reloads_subreg_address (rtx x, int force_replace, int opnum, - enum reload_type type, int ind_levels, rtx insn, - int *address_reloaded) +find_reloads_subreg_address (rtx x, int opnum, enum reload_type type, + int ind_levels, rtx insn, int *address_reloaded) { + enum machine_mode outer_mode = GET_MODE (x); + enum machine_mode inner_mode = GET_MODE (SUBREG_REG (x)); + unsigned outer_size = GET_MODE_SIZE (outer_mode); + unsigned inner_size = GET_MODE_SIZE (inner_mode); int regno = REGNO (SUBREG_REG (x)); int reloaded = 0; + rtx tem, orig; + int offset; - if (reg_equiv_memory_loc (regno)) - { - /* If the address is not directly addressable, or if the address is not - offsettable, then it must be replaced. */ - if (! force_replace - && (reg_equiv_address (regno) - || ! offsettable_memref_p (reg_equiv_mem (regno)))) - force_replace = 1; - - if (force_replace || num_not_at_initial_offset) - { - rtx tem = make_memloc (SUBREG_REG (x), regno); + gcc_assert (reg_equiv_memory_loc (regno) != 0); - /* If the address changes because of register elimination, then - it must be replaced. */ - if (force_replace - || ! rtx_equal_p (tem, reg_equiv_mem (regno))) - { - unsigned outer_size = GET_MODE_SIZE (GET_MODE (x)); - unsigned inner_size = GET_MODE_SIZE (GET_MODE (SUBREG_REG (x))); - int offset; - rtx orig = tem; - - /* For big-endian paradoxical subregs, SUBREG_BYTE does not - hold the correct (negative) byte offset. */ - if (BYTES_BIG_ENDIAN && outer_size > inner_size) - offset = inner_size - outer_size; - else - offset = SUBREG_BYTE (x); - - XEXP (tem, 0) = plus_constant (XEXP (tem, 0), offset); - PUT_MODE (tem, GET_MODE (x)); - if (MEM_OFFSET_KNOWN_P (tem)) - set_mem_offset (tem, MEM_OFFSET (tem) + offset); - if (MEM_SIZE_KNOWN_P (tem) - && MEM_SIZE (tem) != (HOST_WIDE_INT) outer_size) - set_mem_size (tem, outer_size); - - /* If this was a paradoxical subreg that we replaced, the - resulting memory must be sufficiently aligned to allow - us to widen the mode of the memory. */ - if (outer_size > inner_size) - { - rtx base; + /* We cannot replace the subreg with a modified memory reference if: - base = XEXP (tem, 0); - if (GET_CODE (base) == PLUS) - { - if (CONST_INT_P (XEXP (base, 1)) - && INTVAL (XEXP (base, 1)) % outer_size != 0) - return x; - base = XEXP (base, 0); - } - if (!REG_P (base) - || (REGNO_POINTER_ALIGN (REGNO (base)) - < outer_size * BITS_PER_UNIT)) - return x; - } + - we have a paradoxical subreg that implicitly acts as a zero or + sign extension operation due to LOAD_EXTEND_OP; - reloaded = find_reloads_address (GET_MODE (tem), &tem, - XEXP (tem, 0), &XEXP (tem, 0), - opnum, type, ind_levels, insn); - /* ??? Do we need to handle nonzero offsets somehow? */ - if (!offset && !rtx_equal_p (tem, orig)) - push_reg_equiv_alt_mem (regno, tem); - - /* For some processors an address may be valid in the - original mode but not in a smaller mode. For - example, ARM accepts a scaled index register in - SImode but not in HImode. Note that this is only - a problem if the address in reg_equiv_mem is already - invalid in the new mode; other cases would be fixed - by find_reloads_address as usual. - - ??? We attempt to handle such cases here by doing an - additional reload of the full address after the - usual processing by find_reloads_address. Note that - this may not work in the general case, but it seems - to cover the cases where this situation currently - occurs. A more general fix might be to reload the - *value* instead of the address, but this would not - be expected by the callers of this routine as-is. - - If find_reloads_address already completed replaced - the address, there is nothing further to do. */ - if (reloaded == 0 - && reg_equiv_mem (regno) != 0 - && !strict_memory_address_addr_space_p - (GET_MODE (x), XEXP (reg_equiv_mem (regno), 0), - MEM_ADDR_SPACE (reg_equiv_mem (regno)))) - { - push_reload (XEXP (tem, 0), NULL_RTX, &XEXP (tem, 0), (rtx*) 0, - base_reg_class (GET_MODE (tem), - MEM_ADDR_SPACE (tem), - MEM, SCRATCH), - GET_MODE (XEXP (tem, 0)), VOIDmode, 0, 0, - opnum, type); - reloaded = 1; - } - /* If this is not a toplevel operand, find_reloads doesn't see - this substitution. We have to emit a USE of the pseudo so - that delete_output_reload can see it. */ - if (replace_reloads && recog_data.operand[opnum] != x) - /* We mark the USE with QImode so that we recognize it - as one that can be safely deleted at the end of - reload. */ - PUT_MODE (emit_insn_before (gen_rtx_USE (VOIDmode, - SUBREG_REG (x)), - insn), QImode); - x = tem; - } - } + - we have a subreg that is implicitly supposed to act on the full + register due to WORD_REGISTER_OPERATIONS (see also eliminate_regs); + + - the address of the equivalent memory location is mode-dependent; or + + - we have a paradoxical subreg and the resulting memory is not + sufficiently aligned to allow access in the wider mode. + + In addition, we choose not to perform the replacement for *any* + paradoxical subreg, even if it were possible in principle. This + is to avoid generating wider memory references than necessary. + + This corresponds to how previous versions of reload used to handle + paradoxical subregs where no address reload was required. */ + + if (paradoxical_subreg_p (x)) + return NULL; + +#ifdef WORD_REGISTER_OPERATIONS + if (outer_size < inner_size + && ((outer_size - 1) / UNITS_PER_WORD + == (inner_size - 1) / UNITS_PER_WORD)) + return NULL; +#endif + + /* Since we don't attempt to handle paradoxical subregs, we can just + call into simplify_subreg, which will handle all remaining checks + for us. */ + orig = make_memloc (SUBREG_REG (x), regno); + offset = SUBREG_BYTE (x); + tem = simplify_subreg (outer_mode, orig, inner_mode, offset); + if (!tem || !MEM_P (tem)) + return NULL; + + /* Now push all required address reloads, if any. */ + reloaded = find_reloads_address (GET_MODE (tem), &tem, + XEXP (tem, 0), &XEXP (tem, 0), + opnum, type, ind_levels, insn); + /* ??? Do we need to handle nonzero offsets somehow? */ + if (!offset && !rtx_equal_p (tem, orig)) + push_reg_equiv_alt_mem (regno, tem); + + /* For some processors an address may be valid in the original mode but + not in a smaller mode. For example, ARM accepts a scaled index register + in SImode but not in HImode. Note that this is only a problem if the + address in reg_equiv_mem is already invalid in the new mode; other + cases would be fixed by find_reloads_address as usual. + + ??? We attempt to handle such cases here by doing an additional reload + of the full address after the usual processing by find_reloads_address. + Note that this may not work in the general case, but it seems to cover + the cases where this situation currently occurs. A more general fix + might be to reload the *value* instead of the address, but this would + not be expected by the callers of this routine as-is. + + If find_reloads_address already completed replaced the address, there + is nothing further to do. */ + if (reloaded == 0 + && reg_equiv_mem (regno) != 0 + && !strict_memory_address_addr_space_p + (GET_MODE (x), XEXP (reg_equiv_mem (regno), 0), + MEM_ADDR_SPACE (reg_equiv_mem (regno)))) + { + push_reload (XEXP (tem, 0), NULL_RTX, &XEXP (tem, 0), (rtx*) 0, + base_reg_class (GET_MODE (tem), MEM_ADDR_SPACE (tem), + MEM, SCRATCH), + GET_MODE (XEXP (tem, 0)), VOIDmode, 0, 0, opnum, type); + reloaded = 1; } + + /* If this is not a toplevel operand, find_reloads doesn't see this + substitution. We have to emit a USE of the pseudo so that + delete_output_reload can see it. */ + if (replace_reloads && recog_data.operand[opnum] != x) + /* We mark the USE with QImode so that we recognize it as one that + can be safely deleted at the end of reload. */ + PUT_MODE (emit_insn_before (gen_rtx_USE (VOIDmode, SUBREG_REG (x)), insn), + QImode); + if (address_reloaded) *address_reloaded = reloaded; - return x; + return tem; } /* Substitute into the current INSN the registers into which we have reloaded -- cgit v1.2.3