diff options
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c | 22 | ||||
-rw-r--r-- | libomptarget/src/device.h | 6 | ||||
-rw-r--r-- | libomptarget/src/interface.cpp | 7 | ||||
-rw-r--r-- | libomptarget/src/omptarget.cpp | 8 | ||||
-rw-r--r-- | libomptarget/src/private.h | 1 |
5 files changed, 37 insertions, 7 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c b/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c new file mode 100644 index 0000000..b3f8768 --- /dev/null +++ b/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c @@ -0,0 +1,22 @@ +// RUN: %compile-run-and-check + +#include <omp.h> +#include <stdio.h> + +int main() { + int res = 0; + +#pragma omp parallel num_threads(2) reduction(+:res) + { + int tid = omp_get_thread_num(); +#pragma omp target teams distribute reduction(+:res) + for (int i = tid; i < 2; i++) + ++res; + } + // The first thread makes 2 iterations, the second - 1. Expected result of the + // reduction res is 3. + + // CHECK: res = 3. + printf("res = %d.\n", res); + return 0; +} diff --git a/libomptarget/src/device.h b/libomptarget/src/device.h index 2e1ad71..f4cc8ad 100644 --- a/libomptarget/src/device.h +++ b/libomptarget/src/device.h @@ -96,7 +96,9 @@ struct DeviceTy { std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx; - uint64_t loopTripCnt; + // NOTE: Once libomp gains full target-task support, this state should be + // moved into the target task in libomp. + std::map<int32_t, uint64_t> loopTripCnt; int64_t RTLRequiresFlags; @@ -104,7 +106,7 @@ struct DeviceTy { : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(), ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), - ShadowMtx(), loopTripCnt(0), RTLRequiresFlags(0) {} + ShadowMtx(), RTLRequiresFlags(0) {} // The existence of mutexes makes DeviceTy non-copyable. We need to // provide a copy constructor and an assignment operator explicitly. diff --git a/libomptarget/src/interface.cpp b/libomptarget/src/interface.cpp index d055324..2a98b5c 100644 --- a/libomptarget/src/interface.cpp +++ b/libomptarget/src/interface.cpp @@ -304,8 +304,6 @@ EXTERN int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr, arg_sizes, arg_types, team_num, thread_limit); } - -// The trip count mechanism will be revised - this scheme is not thread-safe. EXTERN void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount) { if (device_id == OFFLOAD_DEVICE_DEFAULT) { @@ -320,5 +318,8 @@ EXTERN void __kmpc_push_target_tripcount(int64_t device_id, DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id, loop_tripcount); - Devices[device_id].loopTripCnt = loop_tripcount; + TblMapMtx.lock(); + Devices[device_id].loopTripCnt.emplace(__kmpc_global_thread_num(NULL), + loop_tripcount); + TblMapMtx.unlock(); } diff --git a/libomptarget/src/omptarget.cpp b/libomptarget/src/omptarget.cpp index 8272b4e..1e8f942 100644 --- a/libomptarget/src/omptarget.cpp +++ b/libomptarget/src/omptarget.cpp @@ -729,8 +729,12 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num, "Size mismatch in arguments and offsets"); // Pop loop trip count - uint64_t ltc = Device.loopTripCnt; - Device.loopTripCnt = 0; + uint64_t ltc = 0; + TblMapMtx.lock(); + auto I = Device.loopTripCnt.find(__kmpc_global_thread_num(NULL)); + if (I != Device.loopTripCnt.end()) + std::swap(ltc, I->second); + TblMapMtx.unlock(); // Launch device execution. DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", diff --git a/libomptarget/src/private.h b/libomptarget/src/private.h index b48415c..b406909 100644 --- a/libomptarget/src/private.h +++ b/libomptarget/src/private.h @@ -65,6 +65,7 @@ extern "C" { // functions that extract info from libomp; keep in sync int omp_get_default_device(void) __attribute__((weak)); int32_t __kmpc_omp_taskwait(void *loc_ref, int32_t gtid) __attribute__((weak)); +int32_t __kmpc_global_thread_num(void *) __attribute__((weak)); int __kmpc_get_target_offload(void) __attribute__((weak)); #ifdef __cplusplus } |