From 153d4d2089ed65c826de495a45ba3e4806d4b8bf Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Wed, 4 Nov 2015 16:58:52 +0000 Subject: * config/nvptx/nvptx.c: Include gimple headers. (worker_red_size, worker_red_align, worker_red_name, worker_red_sym): New. (nvptx_option_override): Initialize worker reduction buffer. (nvptx_file_end): Write out worker reduction buffer var. (nvptx_expand_shuffle, nvptx_expand_worker_addr, nvptx_expand_cmp_swap): New builtin expanders. (enum nvptx_builtins): New. (nvptx_builtin_decls): New. (nvptx_builtin_decl, nvptx_init_builtins, nvptx_expand_builtin): New. (PTX_VECTOR_LENGTH, PTX_WORKER_LENGTH): New. (nvptx_get_worker_red_addr, nvptx_generate_vector_shuffle, nvptx_lockless_update): New helpers. (nvptx_goacc_reduction_setup, nvptx_goacc_reduction_init, nvptx_goacc_reduction_fini, nvptx_goacc_reduction_teardown): New. (nvptx_goacc_reduction): New. (TARGET_INIT_BUILTINS, TARGET_EXPAND_BUILTIN, TARGET_BUILTIN_DECL): Override. (TARGET_GOACC_REDUCTION): Override. git-svn-id: https://gcc.gnu.org/svn/gcc/trunk@229768 138bc75d-0d04-0410-961f-82ee72b054a4 --- gcc/ChangeLog | 23 ++ gcc/config/nvptx/nvptx.c | 632 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 650 insertions(+), 5 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e332df32c90..db1f93f8bfe 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,26 @@ +2015-11-04 Nathan Sidwell + Cesar Philippidis + + * config/nvptx/nvptx.c: Include gimple headers. + (worker_red_size, worker_red_align, worker_red_name, + worker_red_sym): New. + (nvptx_option_override): Initialize worker reduction buffer. + (nvptx_file_end): Write out worker reduction buffer var. + (nvptx_expand_shuffle, nvptx_expand_worker_addr, + nvptx_expand_cmp_swap): New builtin expanders. + (enum nvptx_builtins): New. + (nvptx_builtin_decls): New. + (nvptx_builtin_decl, nvptx_init_builtins, nvptx_expand_builtin): New. + (PTX_VECTOR_LENGTH, PTX_WORKER_LENGTH): New.
+ (nvptx_get_worker_red_addr, nvptx_generate_vector_shuffle, + nvptx_lockless_update): New helpers. + (nvptx_goacc_reduction_setup, nvptx_goacc_reduction_init, + nvptx_goacc_reduction_fini, nvptx_goacc_reduction_teardown): New. + (nvptx_goacc_reduction): New. + (TARGET_INIT_BUILTINS, TARGET_EXPAND_BUILTIN, + TARGET_BUILTIN_DECL): Override. + (TARGET_GOACC_REDUCTION): Override. + 2015-11-04 Nathan Sidwell Cesar Philippidis diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index b541666d147..79ef4f703fe 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -57,6 +57,15 @@ #include "omp-low.h" #include "gomp-constants.h" #include "dumpfile.h" +#include "internal-fn.h" +#include "gimple-iterator.h" +#include "stringpool.h" +#include "tree-ssa-operands.h" +#include "tree-ssanames.h" +#include "gimplify.h" +#include "tree-phinodes.h" +#include "cfgloop.h" +#include "fold-const.h" /* This file should be included last. */ #include "target-def.h" @@ -88,16 +97,23 @@ struct tree_hasher : ggc_cache_ptr_hash static GTY((cache)) hash_table *declared_fndecls_htab; static GTY((cache)) hash_table *needed_fndecls_htab; -/* Size of buffer needed to broadcast across workers. This is used - for both worker-neutering and worker broadcasting. It is shared - by all functions emitted. The buffer is placed in shared memory. - It'd be nice if PTX supported common blocks, because then this - could be shared across TUs (taking the largest size). */ +/* Buffer needed to broadcast across workers. This is used for both + worker-neutering and worker broadcasting. It is shared by all + functions emitted. The buffer is placed in shared memory. It'd be + nice if PTX supported common blocks, because then this could be + shared across TUs (taking the largest size). */ static unsigned worker_bcast_size; static unsigned worker_bcast_align; #define worker_bcast_name "__worker_bcast" static GTY(()) rtx worker_bcast_sym; +/* Buffer needed for worker reductions.
This has to be distinct from
+   the worker broadcast array, as both may be live concurrently.  */
+static unsigned worker_red_size;
+static unsigned worker_red_align;
+#define worker_red_name "__worker_red"
+static GTY(()) rtx worker_red_sym;
+
 /* Allocate a new, cleared machine_function structure.  */
 
 static struct machine_function *
@@ -128,6 +144,9 @@ nvptx_option_override (void)
 
   worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
   worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+
+  /* The reduction buffer starts out with SImode alignment;
+     nvptx_expand_worker_addr raises it when a slot needs more.  */
+  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, worker_red_name);
+  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
 }
 
 /* Return the mode to be used when declaring a ptx object for OBJ.
@@ -3246,8 +3265,203 @@ nvptx_file_end (void)
                worker_bcast_align,
                worker_bcast_name, worker_bcast_size);
     }
+
+  if (worker_red_size)
+    {
+      /* Define the reduction buffer.  Nothing is emitted when no
+         reduction slot was ever requested.  The size is rounded up to
+         a multiple of the alignment; the mask arithmetic assumes
+         worker_red_align is a power of two.  */
+
+      worker_red_size = ((worker_red_size + worker_red_align - 1)
+                         & ~(worker_red_align - 1));
+
+      /* Both quantities are unsigned, so print them with %u.  */
+      fprintf (asm_out_file, "// BEGIN VAR DEF: %s\n", worker_red_name);
+      fprintf (asm_out_file, ".shared .align %u .u8 %s[%u];\n",
+               worker_red_align,
+               worker_red_name, worker_red_size);
+    }
+}
+
+/* Expander for the shuffle builtins.  EXP is the call: argument 0 is
+   the value to shuffle (expanded in MODE and forced into a register),
+   argument 1 the lane index (an SImode register or CONST_INT) and
+   argument 2 the shuffle kind, which is read with INTVAL and so must
+   expand to a CONST_INT.  The result is written to TARGET, which is
+   also the return value.  Nothing is emitted when IGNORE is nonzero.
+   NOTE(review): TARGET is used as supplied; this assumes the caller
+   always passes a usable destination -- confirm.  */
+
+static rtx
+nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
+{
+  if (ignore)
+    return target;
+
+  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
+                         NULL_RTX, mode, EXPAND_NORMAL);
+  if (!REG_P (src))
+    src = copy_to_mode_reg (mode, src);
+
+  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
+                         NULL_RTX, SImode, EXPAND_NORMAL);
+  rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
+                        NULL_RTX, SImode, EXPAND_NORMAL);
+
+  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
+    idx = copy_to_mode_reg (SImode, idx);
+
+  rtx pat = nvptx_gen_shuffle (target, src, idx, INTVAL (op));
+  if (pat)
+    emit_insn (pat);
+
+  return target;
+}
+
+/* Worker reduction address expander.
*/
+/* EXP's arguments are, in order, the byte OFFSET of the slot within
+   the worker reduction buffer, the slot's SIZE and its ALIGNment.
+   All three are read with TREE_INT_CST_LOW and so must be integer
+   constants.  The file-wide worker_red_size and worker_red_align are
+   grown so that the buffer written out by nvptx_file_end covers the
+   slot.  The address of the slot is placed in TARGET and returned.
+   Nothing is recorded or emitted when IGNORE is nonzero.  */
+
+static rtx
+nvptx_expand_worker_addr (tree exp, rtx target,
+                          machine_mode ARG_UNUSED (mode), int ignore)
+{
+  if (ignore)
+    return target;
+
+  /* The expander may be called without a suggested destination;
+     allocate one, as nvptx_expand_cmp_swap does.  */
+  if (!target)
+    target = gen_reg_rtx (Pmode);
+
+  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
+  if (align > worker_red_align)
+    worker_red_align = align;
+
+  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
+  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
+  if (size + offset > worker_red_size)
+    worker_red_size = size + offset;
+
+  /* TARGET = &__worker_red, then + OFFSET if that is nonzero.  */
+  emit_insn (gen_rtx_SET (target, worker_red_sym));
+
+  if (offset)
+    emit_insn (gen_rtx_SET (target,
+                            gen_rtx_PLUS (Pmode, target, GEN_INT (offset))));
+
+  /* NOTE(review): presumably UNSPEC_FROM_SHARED converts the .shared
+     address into a generic one -- confirm against nvptx.md.  */
+  emit_insn (gen_rtx_SET (target,
+                          gen_rtx_UNSPEC (Pmode, gen_rtvec (1, target),
+                                          UNSPEC_FROM_SHARED)));
+
+  return target;
+}
+
+/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
+   not require taking the address of any object, other than the memory
+   cell being operated on.  EXP's arguments are the address of the
+   cell, the expected value and the replacement value.  The operating
+   mode is taken from EXP's type, not from the mode argument; SImode
+   selects the 32-bit pattern and anything else the 64-bit one.  The
+   value produced by the compare-and-swap pattern is returned in
+   TARGET, which is allocated here if the caller supplied none.  */
+
+static rtx
+nvptx_expand_cmp_swap (tree exp, rtx target,
+                       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
+{
+  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+
+  if (!target)
+    target = gen_reg_rtx (mode);
+
+  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
+                         NULL_RTX, Pmode, EXPAND_NORMAL);
+  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
+                         NULL_RTX, mode, EXPAND_NORMAL);
+  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
+                         NULL_RTX, mode, EXPAND_NORMAL);
+  rtx pat;
+
+  /* Both value operands are forced into registers before the pattern
+     is generated.  */
+  mem = gen_rtx_MEM (mode, mem);
+  if (!REG_P (cmp))
+    cmp = copy_to_mode_reg (mode, cmp);
+  if (!REG_P (src))
+    src = copy_to_mode_reg (mode, src);
+
+  if (mode == SImode)
+    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
+  else
+    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
+
+  emit_insn (pat);
+
+  return target;
+}
+
+
+/* Codes for all the NVPTX builtins.
*/
+enum nvptx_builtins
+{
+  NVPTX_BUILTIN_SHUFFLE,
+  NVPTX_BUILTIN_SHUFFLELL,
+  NVPTX_BUILTIN_WORKER_ADDR,
+  NVPTX_BUILTIN_CMP_SWAP,
+  NVPTX_BUILTIN_CMP_SWAPLL,
+  NVPTX_BUILTIN_MAX
+};
+
+/* Declarations of the builtins, indexed by enum nvptx_builtins and
+   filled in by nvptx_init_builtins.  */
+static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
+
+/* Return the declaration of the NVPTX builtin numbered CODE, or
+   error_mark_node when CODE is not a valid builtin code.  */
+
+static tree
+nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
+{
+  return (code < NVPTX_BUILTIN_MAX
+          ? nvptx_builtin_decls[code]
+          : error_mark_node);
+}
+
+/* Register every NVPTX builtin with the front end and record its
+   declaration in nvptx_builtin_decls.  */
+
+static void
+nvptx_init_builtins (void)
+{
+  /* Shorthands for the types used in the signatures below.  */
+  tree t_size = sizetype;
+  tree t_uint = unsigned_type_node;
+  tree t_ullong = long_long_unsigned_type_node;
+  tree t_ptr = ptr_type_node;
+
+  /* Register __builtin_nvptx_NAME with signature SIG (return type
+     first, NULL_TREE terminated) under code NVPTX_BUILTIN_ID.  */
+#define ADD_BUILTIN(ID, NAME, SIG)                                     \
+  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]                           \
+   = add_builtin_function ("__builtin_nvptx_" NAME,                    \
+                           build_function_type_list SIG,               \
+                           NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
+
+  ADD_BUILTIN (SHUFFLE, "shuffle",
+               (t_uint, t_uint, t_uint, t_uint, NULL_TREE));
+  ADD_BUILTIN (SHUFFLELL, "shufflell",
+               (t_ullong, t_ullong, t_uint, t_uint, NULL_TREE));
+  ADD_BUILTIN (WORKER_ADDR, "worker_addr",
+               (t_ptr, t_size, t_uint, t_uint, NULL_TREE));
+  ADD_BUILTIN (CMP_SWAP, "cmp_swap",
+               (t_uint, t_ptr, t_uint, t_uint, NULL_TREE));
+  ADD_BUILTIN (CMP_SWAPLL, "cmp_swapll",
+               (t_ullong, t_ptr, t_ullong, t_ullong, NULL_TREE));
+
+#undef ADD_BUILTIN
+}
+
+/* Expand an expression EXP that calls a built-in function,
+   with result going to TARGET if that's convenient
+   (and in mode MODE if that's convenient).
+   SUBTARGET may be used as the target for computing one of EXP's operands.
+   IGNORE is nonzero if the value is to be ignored.
*/
+
+static rtx
+nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
+                      machine_mode mode, int ignore)
+{
+  /* Dispatch on the function code of the decl being called.  */
+  tree callee = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
+  unsigned code = DECL_FUNCTION_CODE (callee);
+
+  if (code == NVPTX_BUILTIN_SHUFFLE || code == NVPTX_BUILTIN_SHUFFLELL)
+    return nvptx_expand_shuffle (exp, target, mode, ignore);
+
+  if (code == NVPTX_BUILTIN_WORKER_ADDR)
+    return nvptx_expand_worker_addr (exp, target, mode, ignore);
+
+  if (code == NVPTX_BUILTIN_CMP_SWAP || code == NVPTX_BUILTIN_CMP_SWAPLL)
+    return nvptx_expand_cmp_swap (exp, target, mode, ignore);
+
+  /* Any other code is not one of ours.  */
+  gcc_unreachable ();
 }
 
+/* Define dimension sizes for known hardware.  */
+#define PTX_VECTOR_LENGTH 32
+#define PTX_WORKER_LENGTH 32
+
 /* Validate compute dimensions of an OpenACC offload or routine, fill
    in non-unity defaults.  FN_LEVEL indicates the level at which a
    routine might spawn a loop.  It is negative for non-routines.  */
@@ -3284,6 +3498,404 @@ nvptx_goacc_fork_join (gcall *call, const int dims[],
   return true;
 }
 
+/* Build a call to the worker-address builtin for the slot at OFFSET
+   in the worker reduction buffer.  The slot's size and alignment are
+   those of TYPE's machine mode.  The call is returned converted to
+   "pointer to TYPE".  */
+
+static tree
+nvptx_get_worker_red_addr (tree type, tree offset)
+{
+  machine_mode slot_mode = TYPE_MODE (type);
+  tree slot_size = build_int_cst (unsigned_type_node,
+                                  GET_MODE_SIZE (slot_mode));
+  tree slot_align
+    = build_int_cst (unsigned_type_node,
+                     GET_MODE_ALIGNMENT (slot_mode) / BITS_PER_UNIT);
+  tree builtin = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
+  tree addr = build_call_expr (builtin, 3, offset, slot_size, slot_align);
+
+  return fold_convert (build_pointer_type (type), addr);
+}
+
+/* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR.  This function
+   will cast the variable if necessary.
*/
+/* LOC is the location given to the builtin call and SEQ the sequence
+   the statements are appended to.  The 32-bit shuffle builtin is used
+   unless VAR's mode is as wide as DImode, in which case the 64-bit
+   one is.  VAR is converted to the builtin's unsigned type, and the
+   result back to DEST_VAR's type, with NOP_EXPR for integral modes
+   and VIEW_CONVERT_EXPR otherwise.  */
+
+static void
+nvptx_generate_vector_shuffle (location_t loc,
+                               tree dest_var, tree var, unsigned shift,
+                               gimple_seq *seq)
+{
+  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
+  tree_code code = NOP_EXPR;
+  tree type = unsigned_type_node;
+  machine_mode mode = TYPE_MODE (TREE_TYPE (var));
+
+  if (!INTEGRAL_MODE_P (mode))
+    code = VIEW_CONVERT_EXPR;
+  if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode))
+    {
+      fn = NVPTX_BUILTIN_SHUFFLELL;
+      type = long_long_unsigned_type_node;
+    }
+
+  tree call = nvptx_builtin_decl (fn, true);
+  call = build_call_expr_loc
+    (loc, call, 3, fold_build1 (code, type, var),
+     build_int_cst (unsigned_type_node, shift),
+     build_int_cst (unsigned_type_node, SHUFFLE_DOWN));
+
+  call = fold_build1 (code, TREE_TYPE (dest_var), call);
+
+  gimplify_assign (dest_var, call, seq);
+}
+
+/* Insert code to locklessly update *PTR with *PTR OP VAR just before
+   GSI.  The generated code is a compare-and-swap loop:
+
+       init = <initial value of OP for VAR's type>;
+     loop:
+       expect = PHI <init, actual>;
+       write = expect OP VAR;
+       actual = cmp_swap (PTR, expect, write);
+       if (actual == expect) goto post; else goto loop;
+     post:
+
+   so the first attempt guesses that *PTR still holds OP's initial
+   value and every retry uses the value the swap actually observed.
+   The block containing GSI is split twice to form the loop, the loop
+   is registered with the loop tree, and GSI is reset so that it still
+   designates the original statement.  Returns WRITE converted back to
+   VAR's type.
+
+   NOTE(review): only the 32-bit and DImode-sized swaps are selected
+   here; it looks as though VAR's type must be exactly 4 or 8 bytes
+   wide for the swap on *PTR to be correct -- confirm with callers.  */
+
+static tree
+nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
+                       tree ptr, tree var, tree_code op)
+{
+  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
+  tree_code code = NOP_EXPR;
+  tree type = unsigned_type_node;
+
+  machine_mode mode = TYPE_MODE (TREE_TYPE (var));
+
+  if (!INTEGRAL_MODE_P (mode))
+    code = VIEW_CONVERT_EXPR;
+  if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode))
+    {
+      fn = NVPTX_BUILTIN_CMP_SWAPLL;
+      type = long_long_unsigned_type_node;
+    }
+
+  /* Compute the first guess ahead of the loop.  */
+  gimple_seq init_seq = NULL;
+  tree init_var = make_ssa_name (type);
+  tree init_expr = omp_reduction_init_op (loc, op, TREE_TYPE (var));
+  init_expr = fold_build1 (code, type, init_expr);
+  gimplify_assign (init_var, init_expr, &init_seq);
+  gimple *init_end = gimple_seq_last (init_seq);
+
+  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
+
+  /* Build the loop body: combine, swap, compare.  */
+  gimple_seq loop_seq = NULL;
+  tree expect_var = make_ssa_name (type);
+  tree actual_var = make_ssa_name (type);
+  tree write_var = make_ssa_name (type);
+
+  tree write_expr = fold_build1 (code, TREE_TYPE (var), expect_var);
+  write_expr = fold_build2 (op, TREE_TYPE (var), write_expr, var);
+  write_expr = fold_build1 (code, type, write_expr);
+  gimplify_assign (write_var, write_expr, &loop_seq);
+
+  tree swap_expr = nvptx_builtin_decl (fn, true);
+  swap_expr = build_call_expr_loc (loc, swap_expr, 3,
+                                   ptr, expect_var, write_var);
+  gimplify_assign (actual_var, swap_expr, &loop_seq);
+
+  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
+                                   NULL_TREE, NULL_TREE);
+  gimple_seq_add_stmt (&loop_seq, cond);
+
+  /* Split the block just after the init stmts.  */
+  basic_block pre_bb = gsi_bb (*gsi);
+  edge pre_edge = split_block (pre_bb, init_end);
+  basic_block loop_bb = pre_edge->dest;
+  pre_bb = pre_edge->src;
+  /* Reset the iterator.  */
+  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+  /* Insert the loop statements.  */
+  gimple *loop_end = gimple_seq_last (loop_seq);
+  gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT);
+
+  /* Split the block just after the loop stmts.  */
+  edge post_edge = split_block (loop_bb, loop_end);
+  basic_block post_bb = post_edge->dest;
+  loop_bb = post_edge->src;
+  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+  /* The fallthru edge left by the split becomes the "swap succeeded"
+     exit; a failed swap takes the new back edge.  */
+  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
+  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
+  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
+  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
+
+  gphi *phi = create_phi_node (expect_var, loop_bb);
+  add_phi_arg (phi, init_var, pre_edge, loc);
+  add_phi_arg (phi, actual_var, loop_edge, loc);
+
+  /* Register the single-block loop with the loop tree.  */
+  loop *loop = alloc_loop ();
+  loop->header = loop_bb;
+  loop->latch = loop_bb;
+  add_loop (loop, loop_bb->loop_father);
+
+  return fold_build1 (code, TREE_TYPE (var), write_var);
+}
+
+/* NVPTX implementation of GOACC_REDUCTION_SETUP.
*/
+/* CALL is the IFN_GOACC_REDUCTION call being lowered.  Its arguments
+   as read here are: 1 the receiver object reference (zero if there is
+   none), 2 the incoming value, 3 the partitioning level and 5 the
+   offset of this reduction's slot in the worker buffer.  For
+   non-gang levels with a receiver, the value is loaded from the
+   receiver.  For the worker level the value is also stored, through a
+   volatile reference, into the worker reduction buffer.  The value is
+   assigned to CALL's result if it has one, and the sequence built
+   replaces CALL.  */
+
+static void
+nvptx_goacc_reduction_setup (gcall *call)
+{
+  gimple_stmt_iterator gsi = gsi_for_stmt (call);
+  tree lhs = gimple_call_lhs (call);
+  tree var = gimple_call_arg (call, 2);
+  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+  gimple_seq seq = NULL;
+
+  push_gimplify_context (true);
+
+  if (level != GOMP_DIM_GANG)
+    {
+      /* Copy the receiver object.  */
+      tree ref_to_res = gimple_call_arg (call, 1);
+
+      if (!integer_zerop (ref_to_res))
+        var = build_simple_mem_ref (ref_to_res);
+    }
+
+  if (level == GOMP_DIM_WORKER)
+    {
+      /* Store incoming value to worker reduction buffer.  */
+      tree offset = gimple_call_arg (call, 5);
+      tree addr = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
+      tree ptr = make_ssa_name (TREE_TYPE (addr));
+
+      gimplify_assign (ptr, addr, &seq);
+      tree ref = build_simple_mem_ref (ptr);
+      TREE_THIS_VOLATILE (ref) = 1;
+      gimplify_assign (ref, var, &seq);
+    }
+
+  if (lhs)
+    gimplify_assign (lhs, var, &seq);
+
+  pop_gimplify_context (NULL);
+  gsi_replace_with_seq (&gsi, seq, true);
+}
+
+/* NVPTX implementation of GOACC_REDUCTION_INIT.  CALL's argument 4 is
+   the reduction operator, for which omp_reduction_init_op supplies
+   the initial value INIT.
+
+   At vector level CALL is replaced by a test of the vector position
+   and the block is split so that
+
+       call_bb:  tid = GOACC_DIM_POS (vector);
+                 if (tid != 0) goto init_bb; else goto dst_bb;
+       init_bb:  init_var = INIT;
+       dst_bb:   lhs = PHI <init_var (init_bb), var (call_bb)>;
+
+   i.e. position zero keeps the incoming value and all other positions
+   start from INIT.  At the other levels the result is simply INIT,
+   except at gang level without a receiver object, where the incoming
+   value is propagated.  */
+
+static void
+nvptx_goacc_reduction_init (gcall *call)
+{
+  gimple_stmt_iterator gsi = gsi_for_stmt (call);
+  tree lhs = gimple_call_lhs (call);
+  tree var = gimple_call_arg (call, 2);
+  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+  enum tree_code rcode
+    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
+  tree init = omp_reduction_init_op (gimple_location (call), rcode,
+                                     TREE_TYPE (var));
+  gimple_seq seq = NULL;
+
+  push_gimplify_context (true);
+
+  if (level == GOMP_DIM_VECTOR)
+    {
+      /* Initialize vector-non-zeroes to INIT_VAL (OP).
+         NOTE(review): this path creates a PHI for LHS and so assumes
+         the call has a result -- confirm that always holds here.  */
+      tree tid = make_ssa_name (integer_type_node);
+      tree dim_vector = gimple_call_arg (call, 3);
+      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
+                                                     dim_vector);
+      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
+                                             NULL_TREE, NULL_TREE);
+
+      gimple_call_set_lhs (tid_call, tid);
+      gimple_seq_add_stmt (&seq, tid_call);
+      gimple_seq_add_stmt (&seq, cond_stmt);
+
+      /* Split the block just after the call.  */
+      edge init_edge = split_block (gsi_bb (gsi), call);
+      basic_block init_bb = init_edge->dest;
+      basic_block call_bb = init_edge->src;
+
+      /* Fixup flags from call_bb to init_bb.  */
+      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
+
+      /* Set the initialization stmts.  */
+      gimple_seq init_seq = NULL;
+      tree init_var = make_ssa_name (TREE_TYPE (var));
+      gimplify_assign (init_var, init, &init_seq);
+      gsi = gsi_start_bb (init_bb);
+      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
+
+      /* Split block just after the init stmt.  */
+      gsi_prev (&gsi);
+      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
+      basic_block dst_bb = inited_edge->dest;
+
+      /* Create false edge from call_bb to dst_bb.  */
+      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
+
+      /* Create phi node in dst block.  */
+      gphi *phi = create_phi_node (lhs, dst_bb);
+      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
+      add_phi_arg (phi, var, nop_edge, gimple_location (call));
+
+      /* Reset dominator of dst bb.  */
+      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
+
+      /* Reset the gsi.  */
+      gsi = gsi_for_stmt (call);
+    }
+  else
+    {
+      if (level == GOMP_DIM_GANG)
+        {
+          /* If there's no receiver object, propagate the incoming VAR.  */
+          tree ref_to_res = gimple_call_arg (call, 1);
+          if (integer_zerop (ref_to_res))
+            init = var;
+        }
+
+      /* As in the other reduction expanders, there is nothing to
+         assign when the call has no result.  */
+      if (lhs)
+        gimplify_assign (lhs, init, &seq);
+    }
+
+  pop_gimplify_context (NULL);
+  gsi_replace_with_seq (&gsi, seq, true);
+}
+
+/* NVPTX implementation of GOACC_REDUCTION_FINI.
*/
+/* CALL's arguments as read here are: 1 the receiver object reference
+   (zero if there is none), 2 the value to combine, 3 the partitioning
+   level, 4 the reduction operator and 5 the offset of this
+   reduction's slot in the worker buffer.
+
+   At vector level the value is combined across the vector with
+   shuffles at distances PTX_VECTOR_LENGTH / 2, ..., 2, 1.  At worker
+   level it is accumulated into the worker reduction buffer with
+   nvptx_lockless_update.  At any other level it is accumulated into
+   the receiver object the same way or, when there is no receiver,
+   passed through unchanged.  The combined value is assigned to CALL's
+   result if it has one, and the sequence built replaces CALL.  */
+
+static void
+nvptx_goacc_reduction_fini (gcall *call)
+{
+  gimple_stmt_iterator gsi = gsi_for_stmt (call);
+  tree lhs = gimple_call_lhs (call);
+  tree ref_to_res = gimple_call_arg (call, 1);
+  tree var = gimple_call_arg (call, 2);
+  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+  enum tree_code op
+    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
+  gimple_seq seq = NULL;
+  tree r = NULL_TREE;
+
+  push_gimplify_context (true);
+
+  if (level == GOMP_DIM_VECTOR)
+    {
+      /* Emit binary shuffle tree.  TODO. Emit this as an actual loop,
+         but that requires a method of emitting a unified jump at the
+         gimple level.  */
+      for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
+        {
+          tree other_var = make_ssa_name (TREE_TYPE (var));
+          nvptx_generate_vector_shuffle (gimple_location (call),
+                                         other_var, var, shfl, &seq);
+
+          r = make_ssa_name (TREE_TYPE (var));
+          gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
+                                           var, other_var), &seq);
+          var = r;
+        }
+    }
+  else
+    {
+      tree accum = NULL_TREE;
+
+      if (level == GOMP_DIM_WORKER)
+        {
+          /* Get reduction buffer address.  */
+          tree offset = gimple_call_arg (call, 5);
+          tree addr = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
+          tree ptr = make_ssa_name (TREE_TYPE (addr));
+
+          gimplify_assign (ptr, addr, &seq);
+          accum = ptr;
+        }
+      else if (integer_zerop (ref_to_res))
+        r = var;
+      else
+        accum = ref_to_res;
+
+      if (accum)
+        {
+          /* Locklessly update the accumulator.  The statements built
+             so far must already be in the function, because the
+             update splits the block at GSI.  */
+          gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
+          seq = NULL;
+          r = nvptx_lockless_update (gimple_location (call), &gsi,
+                                     accum, var, op);
+        }
+    }
+
+  if (lhs)
+    gimplify_assign (lhs, r, &seq);
+  pop_gimplify_context (NULL);
+
+  gsi_replace_with_seq (&gsi, seq, true);
+}
+
+/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.
*/
+/* CALL's arguments as read here are: 1 the receiver object reference
+   (zero if there is none), 2 the incoming value, 3 the partitioning
+   level and 5 the offset of this reduction's slot in the worker
+   buffer.  At worker level the value is re-read, through a volatile
+   reference, from the worker reduction buffer.  At non-gang levels
+   with a receiver the value is stored to the receiver.  The value is
+   assigned to CALL's result if it has one, and the sequence built
+   replaces CALL.  */
+
+static void
+nvptx_goacc_reduction_teardown (gcall *call)
+{
+  gimple_stmt_iterator call_gsi = gsi_for_stmt (call);
+  const int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+  tree result = gimple_call_lhs (call);
+  tree receiver = gimple_call_arg (call, 1);
+  tree value = gimple_call_arg (call, 2);
+  gimple_seq stmts = NULL;
+
+  push_gimplify_context (true);
+
+  if (level == GOMP_DIM_WORKER)
+    {
+      /* Read the worker reduction buffer.  */
+      tree slot_addr
+        = nvptx_get_worker_red_addr (TREE_TYPE (value),
+                                     gimple_call_arg (call, 5));
+      tree slot = make_ssa_name (TREE_TYPE (slot_addr));
+
+      gimplify_assign (slot, slot_addr, &stmts);
+      value = build_simple_mem_ref (slot);
+      TREE_THIS_VOLATILE (value) = 1;
+    }
+
+  /* Write to the receiver object.  */
+  if (level != GOMP_DIM_GANG && !integer_zerop (receiver))
+    gimplify_assign (build_simple_mem_ref (receiver), value, &stmts);
+
+  if (result)
+    gimplify_assign (result, value, &stmts);
+
+  pop_gimplify_context (NULL);
+  gsi_replace_with_seq (&call_gsi, stmts, true);
+}
+
+/* NVPTX reduction expander.
*/
+/* CALL's argument 0 selects which phase of the reduction to expand;
+   the work is handed to the matching nvptx_goacc_reduction_* routine.
+   Any other selector is a bug.
+   NOTE(review): unlike the helpers it calls, this function is not
+   static -- confirm whether external linkage is intended.  */
+
+void
+nvptx_goacc_reduction (gcall *call)
+{
+  const unsigned phase
+    = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+
+  if (phase == IFN_GOACC_REDUCTION_SETUP)
+    nvptx_goacc_reduction_setup (call);
+  else if (phase == IFN_GOACC_REDUCTION_INIT)
+    nvptx_goacc_reduction_init (call);
+  else if (phase == IFN_GOACC_REDUCTION_FINI)
+    nvptx_goacc_reduction_fini (call);
+  else if (phase == IFN_GOACC_REDUCTION_TEARDOWN)
+    nvptx_goacc_reduction_teardown (call);
+  else
+    gcc_unreachable ();
+}
+
 #undef TARGET_OPTION_OVERRIDE
 #define TARGET_OPTION_OVERRIDE nvptx_option_override
 
@@ -3373,12 +3985,22 @@ nvptx_goacc_fork_join (gcall *call, const int dims[],
 #undef TARGET_CANNOT_COPY_INSN_P
 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
 
+#undef TARGET_INIT_BUILTINS
+#define TARGET_INIT_BUILTINS nvptx_init_builtins
+#undef TARGET_EXPAND_BUILTIN
+#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
+#undef TARGET_BUILTIN_DECL
+#define TARGET_BUILTIN_DECL nvptx_builtin_decl
+
 #undef TARGET_GOACC_VALIDATE_DIMS
 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
 
 #undef TARGET_GOACC_FORK_JOIN
 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
 
+#undef TARGET_GOACC_REDUCTION
+#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-nvptx.h"
-- 
cgit v1.2.3