aboutsummaryrefslogtreecommitdiff
path: root/libomptarget/deviceRTLs
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2019-05-02 20:05:01 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2019-05-02 20:05:01 +0000
commit2d4c5f915a723961181c1725daa66a299ab65e81 (patch)
tree5f32eaf163ca0b3e99adac63e7653142bfa1856b /libomptarget/deviceRTLs
parentdb75b85d572b5526020f54055f37fc6bba22e08b (diff)
[OPENMP][NVPTX]Improve code by using parallel level counter.
Summary: Previously for the different purposes we need to get the active/common parallel level and with full runtime we iterated over all the records to calculate this level. Instead, we can used the warp-based parallel level counters used in no-runtime mode. Reviewers: grokos, gtbercea, kkwli0 Subscribers: guansong, jfb, jdoerfert, caomhin, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D61395 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@359822 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'libomptarget/deviceRTLs')
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/libcall.cu55
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/loop.cu51
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu30
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/option.h2
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/parallel.cu39
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/reduction.cu65
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/support.h15
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/supporti.h58
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/sync.cu4
-rw-r--r--libomptarget/deviceRTLs/nvptx/test/parallel/nested.c78
10 files changed, 200 insertions, 197 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/libomptarget/deviceRTLs/nvptx/src/libcall.cu
index 26b0f4e..452463c 100644
--- a/libomptarget/deviceRTLs/nvptx/src/libcall.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -47,8 +47,7 @@ EXTERN void omp_set_num_threads(int num) {
EXTERN int omp_get_num_threads(void) {
bool isSPMDExecutionMode = isSPMDMode();
int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
- int rc =
- GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
+ int rc = GetNumberOfOmpThreads(tid, isSPMDExecutionMode);
PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
return rc;
}
@@ -83,7 +82,7 @@ EXTERN int omp_get_thread_limit(void) {
EXTERN int omp_get_thread_num() {
bool isSPMDExecutionMode = isSPMDMode();
int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
- int rc = GetOmpThreadId(tid, isSPMDExecutionMode, isRuntimeUninitialized());
+ int rc = GetOmpThreadId(tid, isSPMDExecutionMode);
PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
return rc;
}
@@ -95,18 +94,7 @@ EXTERN int omp_get_num_procs(void) {
}
EXTERN int omp_in_parallel(void) {
- int rc = 0;
- if (isRuntimeUninitialized()) {
- ASSERT0(LT_FUSSY, isSPMDMode(),
- "Expected SPMD mode only with uninitialized runtime.");
- rc = 1; // SPMD mode is always in parallel.
- } else {
- omptarget_nvptx_TaskDescr *currTaskDescr =
- getMyTopTaskDescriptor(isSPMDMode());
- if (currTaskDescr->InParallelRegion()) {
- rc = 1;
- }
- }
+ int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
return rc;
}
@@ -155,46 +143,13 @@ EXTERN int omp_get_max_active_levels(void) {
}
EXTERN int omp_get_level(void) {
- if (isRuntimeUninitialized()) {
- ASSERT0(LT_FUSSY, isSPMDMode(),
- "Expected SPMD mode only with uninitialized runtime.");
- // parallelLevel starts from 0, need to add 1 for correct level.
- return parallelLevel[GetWarpId()] + 1;
- }
- int level = 0;
- omptarget_nvptx_TaskDescr *currTaskDescr =
- getMyTopTaskDescriptor(isSPMDMode());
- ASSERT0(LT_FUSSY, currTaskDescr,
- "do not expect fct to be called in a non-active thread");
- do {
- if (currTaskDescr->IsParallelConstruct()) {
- level++;
- }
- currTaskDescr = currTaskDescr->GetPrevTaskDescr();
- } while (currTaskDescr);
+ int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
return level;
}
EXTERN int omp_get_active_level(void) {
- if (isRuntimeUninitialized()) {
- ASSERT0(LT_FUSSY, isSPMDMode(),
- "Expected SPMD mode only with uninitialized runtime.");
- return 1;
- }
- int level = 0; // no active level parallelism
- omptarget_nvptx_TaskDescr *currTaskDescr =
- getMyTopTaskDescriptor(isSPMDMode());
- ASSERT0(LT_FUSSY, currTaskDescr,
- "do not expect fct to be called in a non-active thread");
- do {
- if (currTaskDescr->ThreadsInTeam() > 1) {
- // has a parallel with more than one thread in team
- level = 1;
- break;
- }
- currTaskDescr = currTaskDescr->GetPrevTaskDescr();
- } while (currTaskDescr);
+ int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level)
return level;
}
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index aad32f0..b53dfc8 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -95,8 +95,7 @@ public:
INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
int32_t *plastiter, T *plower, T *pupper,
ST *pstride, ST chunk,
- bool IsSPMDExecutionMode,
- bool IsRuntimeUninitialized) {
+ bool IsSPMDExecutionMode) {
// When IsRuntimeUninitialized is true, we assume that the caller is
// in an L0 parallel region and that all worker threads participate.
@@ -104,8 +103,8 @@ public:
// Assume we are in teams region or that we use a single block
// per target region
- ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(
- tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+ ST numberOfActiveOMPThreads =
+ GetNumberOfOmpThreads(tid, IsSPMDExecutionMode);
// All warps that are in excess of the maximum requested, do
// not execute the loop
@@ -456,9 +455,7 @@ public:
// automatically selects thread or warp ID based on selected implementation
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
- ASSERT0(LT_FUSSY,
- gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
- checkRuntimeUninitialized(loc)),
+ ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc)),
"current thread is not needed here; error");
// retrieve schedule
kmp_sched_t schedule =
@@ -509,13 +506,12 @@ public:
*pupper = myUb;
*pstride = 1;
- PRINT(
- LD_LOOP,
- "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
- "last %d\n",
- (int)GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
- (int)GetNumberOfWorkersInTeam(), (long long)*plower, (long long)*pupper,
- (long long)*pstride, (int)*plast);
+ PRINT(LD_LOOP,
+ "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
+ "last %d\n",
+ (int)GetNumberOfOmpThreads(tid, isSPMDMode()),
+ (int)GetNumberOfWorkersInTeam(), (long long)*plower,
+ (long long)*pupper, (long long)*pstride, (int)*plast);
return DISPATCH_NOTFINISHED;
}
@@ -629,7 +625,7 @@ EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+ checkSPMDMode(loc));
}
EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
@@ -640,7 +636,7 @@ EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+ checkSPMDMode(loc));
}
EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
@@ -651,7 +647,7 @@ EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+ checkSPMDMode(loc));
}
EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
@@ -662,7 +658,7 @@ EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+ checkSPMDMode(loc));
}
EXTERN
@@ -674,7 +670,7 @@ void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
+ /*IsSPMDExecutionMode=*/true);
}
EXTERN
@@ -686,7 +682,7 @@ void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
+ /*IsSPMDExecutionMode=*/true);
}
EXTERN
@@ -698,7 +694,7 @@ void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
+ /*IsSPMDExecutionMode=*/true);
}
EXTERN
@@ -710,7 +706,7 @@ void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
+ /*IsSPMDExecutionMode=*/true);
}
EXTERN
@@ -721,7 +717,7 @@ void __kmpc_for_static_init_4_simple_generic(
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
+ /*IsSPMDExecutionMode=*/false);
}
EXTERN
@@ -732,7 +728,7 @@ void __kmpc_for_static_init_4u_simple_generic(
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
+ /*IsSPMDExecutionMode=*/false);
}
EXTERN
@@ -743,7 +739,7 @@ void __kmpc_for_static_init_8_simple_generic(
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
+ /*IsSPMDExecutionMode=*/false);
}
EXTERN
@@ -754,7 +750,7 @@ void __kmpc_for_static_init_8u_simple_generic(
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
+ /*IsSPMDExecutionMode=*/false);
}
EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
@@ -787,8 +783,7 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
- uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
- checkRuntimeUninitialized(loc));
+ uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc));
uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
for (unsigned i = 0; i < varNum; i++) {
// Reset buffer.
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
index b5cfac3..dc41a80 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -43,6 +43,8 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
ASSERT0(LT_FUSSY, RequiresOMPRuntime,
"Generic always requires initialized runtime.");
setExecutionParameters(Generic, RuntimeInitialized);
+ for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I)
+ parallelLevel[I] = 0;
int threadIdInBlock = GetThreadIdInBlock();
ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
@@ -91,32 +93,32 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
int16_t RequiresDataSharing) {
PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
+ setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized
+ : RuntimeUninitialized);
+ int threadId = GetThreadIdInBlock();
+ if (threadId == 0) {
+ usedSlotIdx = smid() % MAX_SM;
+ parallelLevel[0] =
+ 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
+ } else if (GetLaneId() == 0) {
+ parallelLevel[GetWarpId()] =
+ 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
+ }
if (!RequiresOMPRuntime) {
- // If OMP runtime is not required don't initialize OMP state.
- setExecutionParameters(Spmd, RuntimeUninitialized);
- if (GetThreadIdInBlock() == 0) {
- usedSlotIdx = smid() % MAX_SM;
- parallelLevel[0] = 0;
- } else if (GetLaneId() == 0) {
- parallelLevel[GetWarpId()] = 0;
- }
+ // Runtime is not required - exit.
__SYNCTHREADS();
return;
}
- setExecutionParameters(Spmd, RuntimeInitialized);
//
// Team Context Initialization.
//
// In SPMD mode there is no master thread so use any cuda thread for team
// context initialization.
- int threadId = GetThreadIdInBlock();
if (threadId == 0) {
// Get a state object from the queue.
- int slot = smid() % MAX_SM;
- usedSlotIdx = slot;
omptarget_nvptx_threadPrivateContext =
- omptarget_nvptx_device_State[slot].Dequeue();
+ omptarget_nvptx_device_State[usedSlotIdx].Dequeue();
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
@@ -148,7 +150,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
"%d threads\n",
(int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam());
- if (RequiresDataSharing && threadId % WARPSIZE == 0) {
+ if (RequiresDataSharing && GetLaneId() == 0) {
// Warp master innitializes data sharing environment.
unsigned WID = threadId / WARPSIZE;
__kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(
diff --git a/libomptarget/deviceRTLs/nvptx/src/option.h b/libomptarget/deviceRTLs/nvptx/src/option.h
index a6f4040..b3661d5 100644
--- a/libomptarget/deviceRTLs/nvptx/src/option.h
+++ b/libomptarget/deviceRTLs/nvptx/src/option.h
@@ -44,6 +44,8 @@
#define MAX_SM 16
#endif
+#define OMP_ACTIVE_PARALLEL_LEVEL 128
+
////////////////////////////////////////////////////////////////////////////////
// algo options
////////////////////////////////////////////////////////////////////////////////
diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index 1ad72d5..45a6758 100644
--- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -311,6 +311,7 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
(int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads());
isActive = true;
+ IncParallelLevel(workDescr.WorkTaskDescr()->ThreadsInTeam() != 1);
}
return isActive;
@@ -327,6 +328,8 @@ EXTERN void __kmpc_kernel_end_parallel() {
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTaskDescr->GetPrevTaskDescr());
+
+ DecParallelLevel(currTaskDescr->ThreadsInTeam() != 1);
}
////////////////////////////////////////////////////////////////////////////////
@@ -336,16 +339,11 @@ EXTERN void __kmpc_kernel_end_parallel() {
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
+ IncParallelLevel(/*ActiveParallel=*/false);
+
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
- unsigned tnum = __ACTIVEMASK();
- int leader = __ffs(tnum) - 1;
- __SHFL_SYNC(tnum, leader, leader);
- if (GetLaneId() == leader)
- ++parallelLevel[GetWarpId()];
- __SHFL_SYNC(tnum, leader, leader);
-
return;
}
@@ -381,15 +379,11 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
+ DecParallelLevel(/*ActiveParallel=*/false);
+
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
- unsigned tnum = __ACTIVEMASK();
- int leader = __ffs(tnum) - 1;
- __SHFL_SYNC(tnum, leader, leader);
- if (GetLaneId() == leader)
- --parallelLevel[GetWarpId()];
- __SHFL_SYNC(tnum, leader, leader);
return;
}
@@ -408,21 +402,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_parallel_level\n");
- if (checkRuntimeUninitialized(loc)) {
- ASSERT0(LT_FUSSY, checkSPMDMode(loc),
- "Expected SPMD mode with uninitialized runtime.");
- return parallelLevel[GetWarpId()] + 1;
- }
-
- int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
- omptarget_nvptx_TaskDescr *currTaskDescr =
- omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
- if (currTaskDescr->InL2OrHigherParallelRegion())
- return 2;
- else if (currTaskDescr->InParallelRegion())
- return 1;
- else
- return 0;
+ return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
}
// This kmpc call returns the thread id across all teams. It's value is
@@ -431,8 +411,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
- return GetOmpThreadId(tid, checkSPMDMode(loc),
- checkRuntimeUninitialized(loc));
+ return GetOmpThreadId(tid, checkSPMDMode(loc));
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index eca5bf3..ee91d41 100644
--- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -21,8 +21,7 @@ EXTERN
int32_t __gpu_block_reduce() {
bool isSPMDExecutionMode = isSPMDMode();
int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
- int nt =
- GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
+ int nt = GetNumberOfOmpThreads(tid, isSPMDExecutionMode);
if (nt != blockDim.x)
return 0;
unsigned tnum = __ACTIVEMASK();
@@ -40,9 +39,7 @@ int32_t __kmpc_reduce_gpu(kmp_Ident *loc, int32_t global_tid, int32_t num_vars,
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
int numthread;
if (currTaskDescr->IsParallelConstruct()) {
- numthread =
- GetNumberOfOmpThreads(threadId, checkSPMDMode(loc),
- checkRuntimeUninitialized(loc));
+ numthread = GetNumberOfOmpThreads(threadId, checkSPMDMode(loc));
} else {
numthread = GetNumberOfOmpTeams();
}
@@ -150,8 +147,8 @@ static int32_t nvptx_parallel_reduce_nowait(
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
- uint32_t NumThreads = GetNumberOfOmpThreads(
- BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
+ uint32_t NumThreads =
+ GetNumberOfOmpThreads(BlockThreadId, isSPMDExecutionMode);
if (NumThreads == 1)
return 1;
/*
@@ -236,10 +233,9 @@ static int32_t nvptx_parallel_reduce_nowait(
EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
- return nvptx_parallel_reduce_nowait(
- global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
- /*isSPMDExecutionMode=*/isSPMDMode(),
- /*isRuntimeUninitialized=*/isRuntimeUninitialized());
+ return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
+ reduce_data, shflFct, cpyFct,
+ isSPMDMode(), isRuntimeUninitialized());
}
EXTERN
@@ -256,36 +252,35 @@ EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
- return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
- reduce_data, shflFct, cpyFct,
- /*isSPMDExecutionMode=*/true,
- /*isRuntimeUninitialized=*/true);
+ return nvptx_parallel_reduce_nowait(
+ global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
+ /*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true);
}
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
- return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
- reduce_data, shflFct, cpyFct,
- /*isSPMDExecutionMode=*/false,
- /*isRuntimeUninitialized=*/true);
+ return nvptx_parallel_reduce_nowait(
+ global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
+ /*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true);
}
INLINE
-static int32_t nvptx_teams_reduce_nowait(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
- kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
- bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
+static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
+ size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct,
+ kmp_InterWarpCopyFctPtr cpyFct,
+ kmp_CopyToScratchpadFctPtr scratchFct,
+ kmp_LoadReduceFctPtr ldFct,
+ bool isSPMDExecutionMode) {
uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads =
isSPMDExecutionMode
- ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true,
- isRuntimeUninitialized)
+ ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();
@@ -406,10 +401,9 @@ int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct,
kmp_LoadReduceFctPtr ldFct) {
- return nvptx_teams_reduce_nowait(
- global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
- scratchFct, ldFct, /*isSPMDExecutionMode=*/isSPMDMode(),
- /*isRuntimeUninitialized=*/isRuntimeUninitialized());
+ return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
+ reduce_data, shflFct, cpyFct, scratchFct,
+ ldFct, isSPMDMode());
}
EXTERN
@@ -419,9 +413,7 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct, scratchFct,
- ldFct,
- /*isSPMDExecutionMode=*/true,
- /*isRuntimeUninitialized=*/true);
+ ldFct, /*isSPMDExecutionMode=*/true);
}
EXTERN
@@ -431,9 +423,7 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
reduce_data, shflFct, cpyFct, scratchFct,
- ldFct,
- /*isSPMDExecutionMode=*/false,
- /*isRuntimeUninitialized=*/true);
+ ldFct, /*isSPMDExecutionMode=*/false);
}
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
@@ -484,8 +474,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads =
checkSPMDMode(loc)
- ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true,
- checkRuntimeUninitialized(loc))
+ ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();
diff --git a/libomptarget/deviceRTLs/nvptx/src/support.h b/libomptarget/deviceRTLs/nvptx/src/support.h
index f84da6d..02db474 100644
--- a/libomptarget/deviceRTLs/nvptx/src/support.h
+++ b/libomptarget/deviceRTLs/nvptx/src/support.h
@@ -49,15 +49,14 @@ INLINE int GetMasterThreadID();
INLINE int GetNumberOfWorkersInTeam();
// get OpenMP thread and team ids
-INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
- bool isRuntimeUninitialized); // omp_thread_num
+INLINE int GetOmpThreadId(int threadId,
+ bool isSPMDExecutionMode); // omp_thread_num
INLINE int GetOmpTeamId(); // omp_team_num
// get OpenMP number of threads and team
-INLINE int
-GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
- bool isRuntimeUninitialized); // omp_num_threads
-INLINE int GetNumberOfOmpTeams(); // omp_num_teams
+INLINE int GetNumberOfOmpThreads(int threadId,
+ bool isSPMDExecutionMode); // omp_num_threads
+INLINE int GetNumberOfOmpTeams(); // omp_num_teams
// get OpenMP number of procs
INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
@@ -66,6 +65,10 @@ INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
// masters
INLINE int IsTeamMaster(int ompThreadId);
+// Parallel level
+INLINE void IncParallelLevel(bool ActiveParallel);
+INLINE void DecParallelLevel(bool ActiveParallel);
+
////////////////////////////////////////////////////////////////////////////////
// Memory
////////////////////////////////////////////////////////////////////////////////
diff --git a/libomptarget/deviceRTLs/nvptx/src/supporti.h b/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 3f313a9..c8d8511 100644
--- a/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -149,40 +149,29 @@ INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
//
////////////////////////////////////////////////////////////////////////////////
-INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
- bool isRuntimeUninitialized) {
+INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
// omp_thread_num
int rc;
-
- if (isRuntimeUninitialized) {
- ASSERT0(LT_FUSSY, isSPMDExecutionMode,
- "Uninitialized runtime with non-SPMD mode.");
- // For level 2 parallelism all parallel regions are executed sequentially.
- if (parallelLevel[GetWarpId()] > 0)
- rc = 0;
- else
- rc = GetThreadIdInBlock();
+ if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
+ rc = 0;
+ } else if (isSPMDExecutionMode) {
+ rc = GetThreadIdInBlock();
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+ ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
rc = currTaskDescr->ThreadId();
}
return rc;
}
-INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
- bool isRuntimeUninitialized) {
+INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode) {
// omp_num_threads
int rc;
-
- if (isRuntimeUninitialized) {
- ASSERT0(LT_FUSSY, isSPMDExecutionMode,
- "Uninitialized runtime with non-SPMD mode.");
- // For level 2 parallelism all parallel regions are executed sequentially.
- if (parallelLevel[GetWarpId()] > 0)
- rc = 1;
- else
- rc = GetNumberOfThreadsInBlock();
+ if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
+ rc = 1;
+ } else if (isSPMDExecutionMode) {
+ rc = GetNumberOfThreadsInBlock();
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
@@ -212,6 +201,31 @@ INLINE int GetNumberOfOmpTeams() {
INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
////////////////////////////////////////////////////////////////////////////////
+// Parallel level
+
+INLINE void IncParallelLevel(bool ActiveParallel) {
+ unsigned tnum = __ACTIVEMASK();
+ int leader = __ffs(tnum) - 1;
+ __SHFL_SYNC(tnum, leader, leader);
+ if (GetLaneId() == leader) {
+ parallelLevel[GetWarpId()] +=
+ (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+ }
+ __SHFL_SYNC(tnum, leader, leader);
+}
+
+INLINE void DecParallelLevel(bool ActiveParallel) {
+ unsigned tnum = __ACTIVEMASK();
+ int leader = __ffs(tnum) - 1;
+ __SHFL_SYNC(tnum, leader, leader);
+ if (GetLaneId() == leader) {
+ parallelLevel[GetWarpId()] -=
+ (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+ }
+ __SHFL_SYNC(tnum, leader, leader);
+}
+
+////////////////////////////////////////////////////////////////////////////////
// get OpenMP number of procs
// Get the number of processors in the device.
diff --git a/libomptarget/deviceRTLs/nvptx/src/sync.cu b/libomptarget/deviceRTLs/nvptx/src/sync.cu
index 37471c9..f40b55b 100644
--- a/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -48,8 +48,8 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref));
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
- int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
- tid, checkSPMDMode(loc_ref), /*isRuntimeUninitialized=*/false);
+ int numberOfActiveOMPThreads =
+ GetNumberOfOmpThreads(tid, checkSPMDMode(loc_ref));
if (numberOfActiveOMPThreads > 1) {
if (checkSPMDMode(loc_ref)) {
__kmpc_barrier_simple_spmd(loc_ref, tid);
diff --git a/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c b/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c
index 8fd7ada..70ebb1d 100644
--- a/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c
+++ b/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c
@@ -5,6 +5,7 @@
const int MaxThreads = 1024;
const int NumThreads = 64;
+const int NumThreads1 = 1;
int main(int argc, char *argv[]) {
int inParallel = -1, numThreads = -1, threadNum = -1;
@@ -14,20 +15,20 @@ int main(int argc, char *argv[]) {
check1[i] = check2[i] = 0;
}
- #pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
+#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
{
inParallel = omp_in_parallel();
numThreads = omp_get_num_threads();
threadNum = omp_get_thread_num();
- // Expecting active parallel region.
- #pragma omp parallel num_threads(NumThreads)
+// Expecting active parallel region.
+#pragma omp parallel num_threads(NumThreads)
{
int id = omp_get_thread_num();
check1[id] += omp_get_num_threads() + omp_in_parallel();
- // Expecting serialized parallel region.
- #pragma omp parallel
+// Expecting serialized parallel region.
+#pragma omp parallel
{
// Expected to be 1.
int nestedInParallel = omp_in_parallel();
@@ -35,7 +36,7 @@ int main(int argc, char *argv[]) {
int nestedNumThreads = omp_get_num_threads();
// Expected to be 0.
int nestedThreadNum = omp_get_thread_num();
- #pragma omp atomic
+#pragma omp atomic
check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
}
}
@@ -52,7 +53,8 @@ int main(int argc, char *argv[]) {
int Expected = NumThreads + 1;
if (i < NumThreads) {
if (check1[i] != Expected) {
- printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, check1[i]);
+ printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
+ check1[i]);
}
} else if (check1[i] != 0) {
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
@@ -68,5 +70,67 @@ int main(int argc, char *argv[]) {
}
}
+ inParallel = -1;
+ numThreads = -1;
+ threadNum = -1;
+ for (int i = 0; i < MaxThreads; i++) {
+ check1[i] = check2[i] = 0;
+ }
+
+#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
+ {
+ inParallel = omp_in_parallel();
+ numThreads = omp_get_num_threads();
+ threadNum = omp_get_thread_num();
+
+// Expecting active parallel region.
+#pragma omp parallel num_threads(NumThreads1)
+ {
+ int id = omp_get_thread_num();
+ check1[id] += omp_get_num_threads() + omp_in_parallel();
+
+// Expecting serialized parallel region.
+#pragma omp parallel
+ {
+ // Expected to be 0.
+ int nestedInParallel = omp_in_parallel();
+ // Expected to be 1.
+ int nestedNumThreads = omp_get_num_threads();
+ // Expected to be 0.
+ int nestedThreadNum = omp_get_thread_num();
+#pragma omp atomic
+ check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
+ }
+ }
+ }
+
+ // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0
+ printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n",
+ inParallel, numThreads, threadNum);
+
+ // CHECK-NOT: invalid
+ for (int i = 0; i < MaxThreads; i++) {
+ // Check that all threads reported
+ // omp_get_num_threads() = 1, omp_in_parallel() = 0.
+ int Expected = 1;
+ if (i < NumThreads1) {
+ if (check1[i] != Expected) {
+ printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
+ check1[i]);
+ }
+ } else if (check1[i] != 0) {
+ printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
+ }
+
+ // Check serialized parallel region.
+ if (i < NumThreads1) {
+ if (check2[i] != 1) {
+ printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]);
+ }
+ } else if (check2[i] != 0) {
+ printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
+ }
+ }
+
return 0;
}