summary | refs | log | tree | commit | diff
path: root/openmp
diff options
context:
space:
mode:
author: Alexey Bataev <a.bataev@hotmail.com> 2018-11-30 16:52:38 +0000
committer: Alexey Bataev <a.bataev@hotmail.com> 2018-11-30 16:52:38 +0000
commit: e307e376ac4a60c095e852ece343f7a4fa6bb226 (patch)
tree: 4a3adccf2b219cebab60bff97097edc52fb4d0a1 /openmp
parent: 11507843d2a2b488ad2a781f4efd94695bb3fac0 (diff)
[OPENMP][NVPTX]Make runtime compatible with the original runtime.
Summary: Reworked the runtime to make it compatible with the requirements of the original runtime library. Also, simplified some code to reduce the number of function calls.
Reviewers: gtbercea, kkwli0
Subscribers: guansong, jfb, caomhin, openmp-commits
Differential Revision: https://reviews.llvm.org/D55130
Diffstat (limited to 'openmp')
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu       215
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu     4
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu    3
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu        15
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/task.cu         9
5 files changed, 87 insertions, 159 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
index 3d409d82f85..dfb9c8bd70a 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -93,9 +93,10 @@ public:
////////////////////////////////////////////////////////////////////////////////
// Support for Static Init
- INLINE static void for_static_init(int32_t schedtype, int32_t *plastiter,
- T *plower, T *pupper, ST *pstride,
- ST chunk, bool IsSPMDExecutionMode,
+ INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
+ int32_t *plastiter, T *plower, T *pupper,
+ ST *pstride, ST chunk,
+ bool IsSPMDExecutionMode,
bool IsRuntimeUninitialized) {
// When IsRuntimeUninitialized is true, we assume that the caller is
// in an L0 parallel region and that all worker threads participate.
@@ -112,108 +113,72 @@ public:
PRINT(LD_LOOP,
"OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
"%d, num tids %d\n",
- GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
- schedtype, P64(chunk),
- GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
- GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
- IsRuntimeUninitialized));
- ASSERT0(
- LT_FUSSY,
- (GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized)) <
- (GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
- IsRuntimeUninitialized)),
- "current thread is not needed here; error");
+ gtid, schedtype, P64(chunk), gtid, numberOfActiveOMPThreads);
+ ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
+ "current thread is not needed here; error");
// copy
int lastiter = 0;
T lb = *plower;
T ub = *pupper;
ST stride = *pstride;
- T entityId, numberOfEntities;
// init
switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
case kmp_sched_static_chunk: {
if (chunk > 0) {
- entityId =
- GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
- numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
- IsRuntimeUninitialized);
- ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
- numberOfEntities);
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+ numberOfActiveOMPThreads);
break;
}
} // note: if chunk <=0, use nochunk
case kmp_sched_static_balanced_chunk: {
if (chunk > 0) {
- entityId =
- GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
- numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
- IsRuntimeUninitialized);
-
// round up to make sure the chunk is enough to cover all iterations
T tripCount = ub - lb + 1; // +1 because ub is inclusive
- T span = (tripCount + numberOfEntities - 1) / numberOfEntities;
+ T span = (tripCount + numberOfActiveOMPThreads - 1) /
+ numberOfActiveOMPThreads;
// perform chunk adjustment
chunk = (span + chunk - 1) & ~(chunk - 1);
ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
T oldUb = ub;
- ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
- numberOfEntities);
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+ numberOfActiveOMPThreads);
if (ub > oldUb)
ub = oldUb;
break;
}
} // note: if chunk <=0, use nochunk
case kmp_sched_static_nochunk: {
- entityId =
- GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
- numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
- IsRuntimeUninitialized);
- ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId,
- numberOfEntities);
+ ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
+ numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_static_chunk: {
if (chunk > 0) {
- entityId = GetOmpTeamId();
- numberOfEntities = GetNumberOfOmpTeams();
- ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
- numberOfEntities);
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
+ GetNumberOfOmpTeams());
break;
} // note: if chunk <=0, use nochunk
}
case kmp_sched_distr_static_nochunk: {
- entityId = GetOmpTeamId();
- numberOfEntities = GetNumberOfOmpTeams();
-
- ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId,
- numberOfEntities);
+ ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
+ GetNumberOfOmpTeams());
break;
}
case kmp_sched_distr_static_chunk_sched_static_chunkone: {
- entityId =
- GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
- IsRuntimeUninitialized) *
- GetOmpTeamId() +
- GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
- numberOfEntities = GetNumberOfOmpTeams() *
- GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
- IsRuntimeUninitialized);
- ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
- numberOfEntities);
+ ForStaticChunk(lastiter, lb, ub, stride, chunk,
+ numberOfActiveOMPThreads * GetOmpTeamId() + gtid,
+ GetNumberOfOmpTeams() * numberOfActiveOMPThreads);
break;
}
default: {
ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype);
PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
schedtype);
- entityId =
- GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
- numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
- IsRuntimeUninitialized);
- ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
- numberOfEntities);
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+ numberOfActiveOMPThreads);
+ break;
}
}
// copy back
@@ -221,13 +186,11 @@ public:
*plower = lb;
*pupper = ub;
*pstride = stride;
- PRINT(
- LD_LOOP,
- "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
- "%d\n",
- GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
- GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride),
- lastiter);
+ PRINT(LD_LOOP,
+ "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
+ "%d\n",
+ numberOfActiveOMPThreads, GetNumberOfWorkersInTeam(), P64(*plower),
+ P64(*pupper), P64(*pstride), lastiter);
}
////////////////////////////////////////////////////////////////////////////////
@@ -247,12 +210,8 @@ public:
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
T tnum = currTaskDescr->ThreadsInTeam();
T tripCount = ub - lb + 1; // +1 because ub is inclusive
- ASSERT0(
- LT_FUSSY,
- GetOmpThreadId(tid, checkSPMDMode(loc), checkRuntimeUninitialized(loc)) <
- GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
- checkRuntimeUninitialized(loc)),
- "current thread is not needed here; error");
+ ASSERT0(LT_FUSSY, threadId < tnum,
+ "current thread is not needed here; error");
/* Currently just ignore the monotonic and non-monotonic modifiers
* (the compiler isn't producing them * yet anyway).
@@ -320,10 +279,7 @@ public:
// compute static chunk
ST stride;
int lastiter = 0;
- ForStaticChunk(
- lastiter, lb, ub, stride, chunk,
- GetOmpThreadId(tid, checkSPMDMode(loc),
- checkRuntimeUninitialized(loc)), tnum);
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
// save computed params
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
@@ -331,9 +287,7 @@ public:
PRINT(LD_LOOP,
"dispatch init (static chunk) : num threads = %d, ub = %" PRId64
", next lower bound = %llu, stride = %llu\n",
- GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
- checkRuntimeUninitialized(loc)),
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+ tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->Stride(tid));
} else if (schedule == kmp_sched_static_balanced_chunk) {
@@ -351,10 +305,7 @@ public:
chunk = (span + chunk - 1) & ~(chunk - 1);
T oldUb = ub;
- ForStaticChunk(
- lastiter, lb, ub, stride, chunk,
- GetOmpThreadId(tid, checkSPMDMode(loc),
- checkRuntimeUninitialized(loc)), tnum);
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
if (ub > oldUb)
ub = oldUb;
@@ -365,9 +316,7 @@ public:
PRINT(LD_LOOP,
"dispatch init (static chunk) : num threads = %d, ub = %" PRId64
", next lower bound = %llu, stride = %llu\n",
- GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
- checkRuntimeUninitialized(loc)),
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+ tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->Stride(tid));
} else if (schedule == kmp_sched_static_nochunk) {
@@ -379,10 +328,7 @@ public:
// compute static chunk
ST stride;
int lastiter = 0;
- ForStaticNoChunk(
- lastiter, lb, ub, stride, chunk,
- GetOmpThreadId(tid, checkSPMDMode(loc),
- checkRuntimeUninitialized(loc)), tnum);
+ ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
// save computed params
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
@@ -390,9 +336,7 @@ public:
PRINT(LD_LOOP,
"dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
", next lower bound = %llu, stride = %llu\n",
- GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
- checkRuntimeUninitialized(loc)),
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+ tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->Stride(tid));
@@ -412,9 +356,7 @@ public:
PRINT(LD_LOOP,
"dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
", chunk %" PRIu64 "\n",
- GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
- checkRuntimeUninitialized(loc)),
- omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+ tnum, omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
omptarget_nvptx_threadPrivateContext->Chunk(teamId));
}
@@ -460,19 +402,18 @@ public:
// On Pascal, with inlining of the runtime into the user application,
// this code deadlocks. This is probably because different threads
// in a warp cannot make independent progress.
- NOINLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper,
- ST *pstride) {
+ NOINLINE static int dispatch_next(int32_t gtid, int32_t *plast, T *plower,
+ T *pupper, ST *pstride) {
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Expected non-SPMD mode + initialized runtime.");
// ID of a thread in its own warp
// automatically selects thread or warp ID based on selected implementation
int tid = GetLogicalThreadIdInBlock();
- ASSERT0(
- LT_FUSSY,
- GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) <
- GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
- "current thread is not needed here; error");
+ ASSERT0(LT_FUSSY,
+ gtid < GetNumberOfOmpThreads(tid, isSPMDMode(),
+ isRuntimeUninitialized()),
+ "current thread is not needed here; error");
// retrieve schedule
kmp_sched_t schedule =
omptarget_nvptx_threadPrivateContext->ScheduleType(tid);
@@ -583,7 +524,7 @@ EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
- p_last, p_lb, p_ub, p_st);
+ tid, p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
@@ -591,14 +532,14 @@ EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
uint32_t *p_ub, int32_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
- p_last, p_lb, p_ub, p_st);
+ tid, p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
- p_last, p_lb, p_ub, p_st);
+ tid, p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
@@ -606,7 +547,7 @@ EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
uint64_t *p_ub, int64_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
- p_last, p_lb, p_ub, p_st);
+ tid, p_last, p_lb, p_ub, p_st);
}
// fini
@@ -641,7 +582,7 @@ EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
- schedtype, plastiter, plower, pupper, pstride, chunk,
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}
@@ -652,7 +593,7 @@ EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
- schedtype, plastiter, plower, pupper, pstride, chunk,
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}
@@ -663,7 +604,7 @@ EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
- schedtype, plastiter, plower, pupper, pstride, chunk,
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}
@@ -674,7 +615,7 @@ EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
- schedtype, plastiter, plower, pupper, pstride, chunk,
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}
@@ -686,9 +627,8 @@ void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
- schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/true,
- /*IsRuntimeUninitialized=*/true);
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
}
EXTERN
@@ -699,9 +639,8 @@ void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t incr, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
- schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/true,
- /*IsRuntimeUninitialized=*/true);
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
}
EXTERN
@@ -712,9 +651,8 @@ void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
- schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/true,
- /*IsRuntimeUninitialized=*/true);
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
}
EXTERN
@@ -725,9 +663,8 @@ void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int64_t incr, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
- schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/true,
- /*IsRuntimeUninitialized=*/true);
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
}
EXTERN
@@ -737,9 +674,8 @@ void __kmpc_for_static_init_4_simple_generic(
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
- schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/false,
- /*IsRuntimeUninitialized=*/true);
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
}
EXTERN
@@ -749,9 +685,8 @@ void __kmpc_for_static_init_4u_simple_generic(
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
- schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/false,
- /*IsRuntimeUninitialized=*/true);
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
}
EXTERN
@@ -761,9 +696,8 @@ void __kmpc_for_static_init_8_simple_generic(
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
- schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/false,
- /*IsRuntimeUninitialized=*/true);
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
}
EXTERN
@@ -773,9 +707,8 @@ void __kmpc_for_static_init_8u_simple_generic(
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
- schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/false,
- /*IsRuntimeUninitialized=*/true);
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
}
EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
@@ -807,15 +740,13 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
"Expected non-SPMD mode + initialized runtime.");
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
- int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), checkSPMDMode(loc),
- checkRuntimeUninitialized(loc));
- uint32_t NumThreads = GetNumberOfOmpThreads(
- GetLogicalThreadIdInBlock(), checkSPMDMode(loc),
- checkRuntimeUninitialized(loc));
+ int tid = GetLogicalThreadIdInBlock();
+ uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
+ checkRuntimeUninitialized(loc));
uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
for (unsigned i = 0; i < varNum; i++) {
// Reset buffer.
- if (tid == 0)
+ if (gtid == 0)
*Buffer = 0; // Reset to minimum loop iteration value.
// Barrier.
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index 0825564ca09..82f513950b5 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -418,7 +418,9 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
- return GetLogicalThreadIdInBlock();
+ int tid = GetLogicalThreadIdInBlock();
+ return GetOmpThreadId(tid, checkSPMDMode(loc),
+ checkRuntimeUninitialized(loc));
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index aaa95c77cb5..bbe4ad16838 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -232,8 +232,7 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
// Get the OMP thread Id. This is different from BlockThreadId in the case of
// an L2 parallel region.
- return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode,
- isRuntimeUninitialized) == 0;
+ return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
index eb4ef00b4d7..082f3b302db 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -99,21 +99,14 @@ EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
// KMP MASTER
////////////////////////////////////////////////////////////////////////////////
-INLINE int32_t IsMaster() {
- // only the team master updates the state
- int tid = GetLogicalThreadIdInBlock();
- int ompThreadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
- return IsTeamMaster(ompThreadId);
-}
-
EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_master\n");
- return IsMaster();
+ return IsTeamMaster(global_tid);
}
EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_end_master\n");
- ASSERT0(LT_FUSSY, IsMaster(), "expected only master here");
+ ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
}
////////////////////////////////////////////////////////////////////////////////
@@ -123,13 +116,13 @@ EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_single\n");
// decide to implement single with master; master get the single
- return IsMaster();
+ return IsTeamMaster(global_tid);
}
EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_end_single\n");
// decide to implement single with master: master get the single
- ASSERT0(LT_FUSSY, IsMaster(), "expected only master here");
+ ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
// sync barrier is explicitely called... so that is not a problem
}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/task.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/task.cu
index 3e9b304ec40..c5006903b1a 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/task.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/task.cu
@@ -81,7 +81,8 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
void *noAliasDepList) {
PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
P64(newKmpTaskDescr));
- ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
+ ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+ "Runtime must be initialized.");
// 1. get explict task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
@@ -118,7 +119,8 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr) {
PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
P64(newKmpTaskDescr));
- ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
+ ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+ "Runtime must be initialized.");
// 1. get explict task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
@@ -143,7 +145,8 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr) {
PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
P64(newKmpTaskDescr));
- ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
+ ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+ "Runtime must be initialized.");
// 1. get explict task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(