author     Alexey Bataev <a.bataev@hotmail.com>    2018-08-29 17:35:09 +0000
committer  Alexey Bataev <a.bataev@hotmail.com>    2018-08-29 17:35:09 +0000
commit     b539009f149d227d8109aa89ea0fa8d4d5c2f79f
tree       3b3db1d17b9c299d2b5a7cbb643377550195e39d
parent     5d151cb46ba98105c193d1fa4287620d4a68910d
[OPENMP][NVPTX] Lightweight runtime support for SPMD mode.
Summary:
Implemented simple and lightweight runtime support for SPMD mode-based constructs. It adds support for L2 sequential parallelism without full runtime support. The patch also fixes some use cases for the uninitialized/lightweight runtime.

Reviewers: grokos, kkwli0, Hahnfeld, gtbercea

Subscribers: guansong, openmp-commits

Differential Revision: https://reviews.llvm.org/D51222
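
For illustration only (not part of the patch): a minimal sketch of the kind of user code this targets, with hypothetical names and bounds. The combined outer construct is typically compiled in SPMD mode; the inner parallel region is an L2 region that the lightweight runtime now executes sequentially per thread instead of requiring the full runtime.

    // Sketch: SPMD-mode target region with a nested (L2) parallel region.
    // The lightweight runtime only tracks the parallel level per thread, so the
    // inner region runs sequentially and reports a single thread.
    void scale_rows(int n, int m, float a, float *x) {
    #pragma omp target teams distribute parallel for map(tofrom: x[0:n])
      for (int i = 0; i < n; ++i) {
        float sum = 0.0f;
    #pragma omp parallel for reduction(+ : sum) // L2 region: serialized by this patch
        for (int j = 0; j < m; ++j)
          sum += a;
        x[i] += sum;
      }
    }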
Diffstat (limited to 'openmp/libomptarget')
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu    |  5
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/interface.h        |  2
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu         | 75
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu            | 81
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu        |  7
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu | 44
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h  | 35
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu        | 33
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h         | 17
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu            |  6
-rw-r--r--  openmp/libomptarget/deviceRTLs/nvptx/src/task.cu            |  3
11 files changed, 263 insertions, 45 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index 2a1709f7ecb..a1b60429935 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -79,7 +79,7 @@ __device__ static size_t AlignVal(size_t Val) {
EXTERN void
__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
size_t InitialDataSize) {
-
+ assert(isRuntimeInitialized() && "Expected initialized runtime.");
DSPRINT0(DSFLAG_INIT,
"Entering __kmpc_initialize_data_sharing_environment\n");
@@ -331,6 +331,7 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
////////////////////////////////////////////////////////////////////////////////
INLINE void data_sharing_init_stack_common() {
+ assert(isRuntimeInitialized() && "Expected initialized runtime.");
omptarget_nvptx_TeamDescr *teamDescr =
&omptarget_nvptx_threadPrivateContext->TeamContext();
@@ -346,6 +347,7 @@ INLINE void data_sharing_init_stack_common() {
// initialization). This function is called only by the MASTER thread of each
// team in non-SPMD mode.
EXTERN void __kmpc_data_sharing_init_stack() {
+ assert(isRuntimeInitialized() && "Expected initialized runtime.");
// This function initializes the stack pointer with the pointer to the
// statically allocated shared memory slots. The size of a shared memory
// slot is pre-determined to be 256 bytes.
@@ -357,6 +359,7 @@ EXTERN void __kmpc_data_sharing_init_stack() {
// once at the beginning of a data sharing context (coincides with the kernel
// initialization). This function is called in SPMD mode only.
EXTERN void __kmpc_data_sharing_init_stack_spmd() {
+ assert(isRuntimeInitialized() && "Expected initialized runtime.");
// This function initializes the stack pointer with the pointer to the
// statically allocated shared memory slots. The size of a shared memory
// slot is pre-determined to be 256 bytes.
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
index 680df48d4df..d958a074ab0 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -116,6 +116,8 @@ typedef enum kmp_sched_t {
kmp_sched_runtime = 37,
kmp_sched_auto = 38,
+ kmp_sched_static_balanced_chunk = 45,
+
kmp_sched_static_ordered = 65,
kmp_sched_static_nochunk_ordered = 66,
kmp_sched_dynamic_ordered = 67,
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
index 15040dbafec..de493a5f25a 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -31,6 +31,10 @@ EXTERN double omp_get_wtime(void) {
}
EXTERN void omp_set_num_threads(int num) {
+ // Ignore it for SPMD mode.
+ if (isSPMDMode())
+ return;
+ assert(isRuntimeInitialized() && "Expected initialized runtime.");
PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
if (num <= 0) {
WARNING0(LW_INPUT, "expected positive num; ignore\n");
@@ -48,6 +52,12 @@ EXTERN int omp_get_num_threads(void) {
}
EXTERN int omp_get_max_threads(void) {
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() &&
+ "expected SPMD mode only with uninitialized runtime.");
+ // We're already in parallel region.
+ return 1; // default is 1 thread avail
+ }
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
int rc = 1; // default is 1 thread avail
if (!currTaskDescr->InParallelRegion()) {
@@ -60,6 +70,11 @@ EXTERN int omp_get_max_threads(void) {
}
EXTERN int omp_get_thread_limit(void) {
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() &&
+ "expected SPMD mode only with uninitialized runtime.");
+ return 0; // default is 0
+ }
// per contention group.. meaning threads in current team
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
int rc = currTaskDescr->ThreadLimit();
@@ -82,9 +97,15 @@ EXTERN int omp_get_num_procs(void) {
EXTERN int omp_in_parallel(void) {
int rc = 0;
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
- if (currTaskDescr->InParallelRegion()) {
- rc = 1;
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() &&
+ "expected SPMD mode only with uninitialized runtime.");
+ rc = 1; // SPMD mode is always in parallel.
+ } else {
+ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ if (currTaskDescr->InParallelRegion()) {
+ rc = 1;
+ }
}
PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
return rc;
@@ -102,6 +123,11 @@ EXTERN int omp_in_final(void) {
EXTERN void omp_set_dynamic(int flag) {
PRINT(LD_IO, "call omp_set_dynamic(%d)\n", flag);
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() &&
+ "expected SPMD mode only with uninitialized runtime.");
+ return;
+ }
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
if (flag) {
@@ -113,6 +139,11 @@ EXTERN void omp_set_dynamic(int flag) {
EXTERN int omp_get_dynamic(void) {
int rc = 0;
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() &&
+ "expected SPMD mode only with uninitialized runtime.");
+ return rc;
+ }
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
if (currTaskDescr->IsDynamic()) {
rc = 1;
@@ -145,6 +176,11 @@ EXTERN int omp_get_max_active_levels(void) {
}
EXTERN int omp_get_level(void) {
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() &&
+ "expected SPMD mode only with uninitialized runtime.");
+ return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
+ }
int level = 0;
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
ASSERT0(LT_FUSSY, currTaskDescr,
@@ -160,6 +196,11 @@ EXTERN int omp_get_level(void) {
}
EXTERN int omp_get_active_level(void) {
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() &&
+ "expected SPMD mode only with uninitialized runtime.");
+ return 1;
+ }
int level = 0; // no active level parallelism
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
ASSERT0(LT_FUSSY, currTaskDescr,
@@ -177,6 +218,11 @@ EXTERN int omp_get_active_level(void) {
}
EXTERN int omp_get_ancestor_thread_num(int level) {
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() &&
+ "expected SPMD mode only with uninitialized runtime.");
+ return level == 1 ? GetThreadIdInBlock() : 0;
+ }
int rc = 0; // default at level 0
if (level >= 0) {
int totLevel = omp_get_level();
@@ -220,6 +266,11 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
}
EXTERN int omp_get_team_size(int level) {
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() &&
+ "expected SPMD mode only with uninitialized runtime.");
+ return level == 1 ? GetNumberOfThreadsInBlock() : 1;
+ }
int rc = 1; // default at level 0
if (level >= 0) {
int totLevel = omp_get_level();
@@ -247,9 +298,16 @@ EXTERN int omp_get_team_size(int level) {
}
EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) {
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
- *kind = currTaskDescr->GetRuntimeSched();
- *modifier = currTaskDescr->RuntimeChunkSize();
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() &&
+ "expected SPMD mode only with uninitialized runtime.");
+ *kind = omp_sched_static;
+ *modifier = 1;
+ } else {
+ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ *kind = currTaskDescr->GetRuntimeSched();
+ *modifier = currTaskDescr->RuntimeChunkSize();
+ }
PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n",
(int)*kind, *modifier);
}
@@ -257,6 +315,11 @@ EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) {
EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) {
PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind,
modifier);
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() &&
+ "expected SPMD mode only with uninitialized runtime.");
+ return;
+ }
if (kind >= omp_sched_static && kind < omp_sched_auto) {
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
currTaskDescr->SetRuntimeSched(kind);
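
Illustrative only, not part of the patch: a host-side sketch of the defaults the libcall.cu changes above report when a target region runs in SPMD mode without full runtime support (whether the compiler actually drops the full runtime depends on its heuristics; names are hypothetical).

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
      int in_par = 0, max_thr = 0, level = 0;
    #pragma omp target teams distribute parallel for map(from: in_par, max_thr, level)
      for (int i = 0; i < 1; ++i) {
        in_par  = omp_in_parallel();     // 1: SPMD mode is always inside a parallel region
        max_thr = omp_get_max_threads(); // 1: already inside the (only) parallel region
        level   = omp_get_level();       // 1 at the outer level (par_level + 1)
      }
      printf("in_parallel=%d max_threads=%d level=%d\n", in_par, max_thr, level);
      return 0;
    }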
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
index f3e475d7108..c4c8e712adc 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -131,7 +131,7 @@ public:
ST stride = *pstride;
T entityId, numberOfEntities;
// init
- switch (schedtype) {
+ switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
case kmp_sched_static_chunk: {
if (chunk > 0) {
entityId =
@@ -143,6 +143,28 @@ public:
break;
}
} // note: if chunk <=0, use nochunk
+ case kmp_sched_static_balanced_chunk: {
+ if (chunk > 0) {
+ entityId =
+ GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+ numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+ IsRuntimeUninitialized);
+
+ // round up to make sure the chunk is enough to cover all iterations
+ T tripCount = ub - lb + 1; // +1 because ub is inclusive
+ T span = (tripCount + numberOfEntities - 1) / numberOfEntities;
+ // perform chunk adjustment
+ chunk = (span + chunk - 1) & ~(chunk - 1);
+
+ assert(ub >= lb && "ub must be >= lb.");
+ T oldUb = ub;
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
+ numberOfEntities);
+ if (ub > oldUb)
+ ub = oldUb;
+ break;
+ }
+ } // note: if chunk <=0, use nochunk
case kmp_sched_static_nochunk: {
entityId =
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
@@ -199,12 +221,13 @@ public:
*plower = lb;
*pupper = ub;
*pstride = stride;
- PRINT(LD_LOOP,
- "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n",
- GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
- IsRuntimeUninitialized),
- GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper),
- P64(*pstride));
+ PRINT(
+ LD_LOOP,
+ "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
+ "%d\n",
+ GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
+ GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride),
+ lastiter);
}
////////////////////////////////////////////////////////////////////////////////
@@ -218,6 +241,8 @@ public:
INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId,
kmp_sched_t schedule, T lb, T ub, ST st,
ST chunk) {
+ assert(isRuntimeInitialized() &&
+ "Expected non-SPMD mode + initialized runtime.");
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
T tnum = currTaskDescr->ThreadsInTeam();
@@ -308,7 +333,38 @@ public:
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->Stride(tid));
+ } else if (schedule == kmp_sched_static_balanced_chunk) {
+ ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+ // save sched state
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+ // save ub
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+ // compute static chunk
+ ST stride;
+ int lastiter = 0;
+ // round up to make sure the chunk is enough to cover all iterations
+ T span = (tripCount + tnum - 1) / tnum;
+ // perform chunk adjustment
+ chunk = (span + chunk - 1) & ~(chunk - 1);
+ T oldUb = ub;
+ ForStaticChunk(
+ lastiter, lb, ub, stride, chunk,
+ GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum);
+ assert(ub >= lb && "ub must be >= lb.");
+ if (ub > oldUb)
+ ub = oldUb;
+ // save computed params
+ omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+ omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+ PRINT(LD_LOOP,
+ "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
+ ", next lower bound = %llu, stride = %llu\n",
+ GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+ omptarget_nvptx_threadPrivateContext->Stride(tid));
} else if (schedule == kmp_sched_static_nochunk) {
ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
// save sched state
@@ -398,6 +454,8 @@ public:
// in a warp cannot make independent progress.
NOINLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper,
ST *pstride) {
+ assert(isRuntimeInitialized() &&
+ "Expected non-SPMD mode + initialized runtime.");
// ID of a thread in its own warp
// automatically selects thread or warp ID based on selected implementation
@@ -458,10 +516,11 @@ public:
*pstride = 1;
PRINT(LD_LOOP,
- "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n",
+ "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
+ "last %d\n",
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
- GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper),
- P64(*pstride));
+ GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride),
+ *plast);
return DISPATCH_NOTFINISHED;
}
@@ -736,6 +795,8 @@ INLINE void syncWorkersInGenericMode(uint32_t NumThreads) {
EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, int32_t gtid,
int32_t varNum, void *array) {
PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n");
+ assert(isRuntimeInitialized() &&
+ "Expected non-SPMD mode + initialized runtime.");
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), isSPMDMode(),
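
Illustrative only: a standalone sketch of the chunk adjustment introduced for kmp_sched_static_balanced_chunk above. The span (iterations per thread, rounded up) is raised to the next multiple of the requested chunk; the bit trick assumes the chunk is a power of two. The values below are hypothetical.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      uint64_t lb = 0, ub = 99;   // inclusive bounds -> tripCount = 100
      uint64_t numThreads = 8;
      uint64_t chunk = 4;         // requested chunk; assumed to be a power of two
      uint64_t tripCount = ub - lb + 1;
      // Round up so one chunk per thread is enough to cover all iterations.
      uint64_t span = (tripCount + numThreads - 1) / numThreads; // 13
      chunk = (span + chunk - 1) & ~(chunk - 1);                 // 16: 13 rounded up to a multiple of 4
      assert(chunk * numThreads >= tripCount);
      printf("adjusted chunk = %llu\n", (unsigned long long)chunk);
      return 0;
    }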
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
index 149af8d7046..fcecaf3e914 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -27,10 +27,17 @@ __device__
omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];
+__device__ omptarget_nvptx_Queue<omptarget_nvptx_SimpleThreadPrivateContext,
+ OMP_STATE_COUNT>
+ omptarget_nvptx_device_simpleState[MAX_SM];
+
// Pointer to this team's OpenMP state object
__device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
+ *omptarget_nvptx_simpleThreadPrivateContext;
+
////////////////////////////////////////////////////////////////////////////////
// The team master sets the outlined parallel function in this variable to
// communicate with the workers. Since it is in shared memory, there is one
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
index 677654dd577..884a27109a3 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -24,6 +24,13 @@ extern __device__
extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+extern __device__ omptarget_nvptx_Queue<
+ omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
+ omptarget_nvptx_device_simpleState[MAX_SM];
+
+extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
+ *omptarget_nvptx_simpleThreadPrivateContext;
+
//
// The team master sets the outlined function and its arguments in these
// variables to communicate with the workers. Since they are in shared memory,
@@ -53,12 +60,7 @@ EXTERN void __kmpc_kernel_init_params(void *Ptr) {
EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
OMPTARGET_NVPTX_VERSION);
-
- if (!RequiresOMPRuntime) {
- // If OMP runtime is not required don't initialize OMP state.
- setExecutionParameters(Generic, RuntimeUninitialized);
- return;
- }
+ assert(RequiresOMPRuntime && "Generic always requires initialized runtime.");
setExecutionParameters(Generic, RuntimeInitialized);
int threadIdInBlock = GetThreadIdInBlock();
@@ -95,16 +97,16 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
}
EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
- if (IsOMPRuntimeInitialized) {
- // Enqueue omp state object for use by another team.
+ assert(IsOMPRuntimeInitialized &&
+ "Generic always requires initialized runtime.");
+ // Enqueue omp state object for use by another team.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue();
+ int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue();
#else
- int slot = smid() % MAX_SM;
+ int slot = smid() % MAX_SM;
#endif
- omptarget_nvptx_device_State[slot].Enqueue(
- omptarget_nvptx_threadPrivateContext);
- }
+ omptarget_nvptx_device_State[slot].Enqueue(
+ omptarget_nvptx_threadPrivateContext);
// Done with work. Kill the workers.
omptarget_nvptx_workFn = 0;
}
@@ -116,6 +118,13 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
if (!RequiresOMPRuntime) {
// If OMP runtime is not required don't initialize OMP state.
setExecutionParameters(Spmd, RuntimeUninitialized);
+ if (GetThreadIdInBlock() == 0) {
+ int slot = smid() % MAX_SM;
+ omptarget_nvptx_simpleThreadPrivateContext =
+ omptarget_nvptx_device_simpleState[slot].Dequeue();
+ }
+ __syncthreads();
+ omptarget_nvptx_simpleThreadPrivateContext->Init();
return;
}
setExecutionParameters(Spmd, RuntimeInitialized);
@@ -180,6 +189,15 @@ EXTERN void __kmpc_spmd_kernel_deinit() {
// there are no more parallel regions in SPMD mode.
__syncthreads();
int threadId = GetThreadIdInBlock();
+ if (isRuntimeUninitialized()) {
+ if (threadId == 0) {
+ // Enqueue omp state object for use by another team.
+ int slot = smid() % MAX_SM;
+ omptarget_nvptx_device_simpleState[slot].Enqueue(
+ omptarget_nvptx_simpleThreadPrivateContext);
+ return;
+ }
+ }
if (threadId == 0) {
// Enqueue omp state object for use by another team.
int slot = smid() % MAX_SM;
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 84c61f97f10..9e9f9345dd3 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -395,6 +395,38 @@ struct omptarget_device_environmentTy {
int32_t debug_level;
};
+class omptarget_nvptx_SimpleThreadPrivateContext {
+ uint16_t par_level[MAX_THREADS_PER_TEAM];
+public:
+ INLINE void Init() {
+ assert(isSPMDMode() && isRuntimeUninitialized() &&
+ "Expected SPMD + uninitialized runtime modes.");
+ par_level[GetThreadIdInBlock()] = 0;
+ }
+ INLINE void IncParLevel() {
+ assert(isSPMDMode() && isRuntimeUninitialized() &&
+ "Expected SPMD + uninitialized runtime modes.");
+ ++par_level[GetThreadIdInBlock()];
+ }
+ INLINE void DecParLevel() {
+ assert(isSPMDMode() && isRuntimeUninitialized() &&
+ "Expected SPMD + uninitialized runtime modes.");
+ assert(par_level[GetThreadIdInBlock()] > 0 &&
+ "Expected parallel level >0.");
+ --par_level[GetThreadIdInBlock()];
+ }
+ INLINE bool InL2OrHigherParallelRegion() const {
+ assert(isSPMDMode() && isRuntimeUninitialized() &&
+ "Expected SPMD + uninitialized runtime modes.");
+ return par_level[GetThreadIdInBlock()] > 0;
+ }
+ INLINE uint16_t GetParallelLevel() const {
+ assert(isSPMDMode() && isRuntimeUninitialized() &&
+ "Expected SPMD + uninitialized runtime modes.");
+ return par_level[GetThreadIdInBlock()] + 1;
+ }
+};
+
////////////////////////////////////////////////////////////////////////////////
// global device envrionment
////////////////////////////////////////////////////////////////////////////////
@@ -409,6 +441,9 @@ extern __device__ omptarget_device_environmentTy omptarget_device_environment;
extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
+ *omptarget_nvptx_simpleThreadPrivateContext;
+
extern __device__ __shared__ uint32_t execution_param;
extern __device__ __shared__ void *ReductionScratchpadPtr;
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index 33509b6bd05..81d13a4a942 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -216,10 +216,9 @@ EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
int16_t IsOMPRuntimeInitialized) {
PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
- omptarget_nvptx_workFn = WorkFn;
+ assert(IsOMPRuntimeInitialized && "expected initialized runtime.");
- if (!IsOMPRuntimeInitialized)
- return;
+ omptarget_nvptx_workFn = WorkFn;
// This routine is only called by the team master. The team master is
// the first thread of the last warp. It always has the logical thread
@@ -320,12 +319,11 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
int16_t IsOMPRuntimeInitialized) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
+ assert(IsOMPRuntimeInitialized && "expected initialized runtime.");
+
// Work function and arguments for L1 parallel region.
*WorkFn = omptarget_nvptx_workFn;
- if (!IsOMPRuntimeInitialized)
- return true;
-
// If this is the termination signal from the master, quit early.
if (!*WorkFn)
return false;
@@ -363,6 +361,8 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
EXTERN void __kmpc_kernel_end_parallel() {
// pop stack
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
+ assert(isRuntimeInitialized() && "expected initialized runtime.");
+
// Only the worker threads call this routine and the master warp
// never arrives here. Therefore, use the nvptx thread id.
int threadId = GetThreadIdInBlock();
@@ -378,6 +378,12 @@ EXTERN void __kmpc_kernel_end_parallel() {
EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() && "Expected SPMD mode with uninitialized runtime.");
+ omptarget_nvptx_simpleThreadPrivateContext->IncParLevel();
+ return;
+ }
+
// assume this is only called for nested parallel
int threadId = GetLogicalThreadIdInBlock();
@@ -392,7 +398,7 @@ EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) {
// it
omptarget_nvptx_TaskDescr *newTaskDescr =
(omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
- (char *)"new seq parallel task");
+ "new seq parallel task");
newTaskDescr->CopyParent(currTaskDescr);
// tweak values for serialized parallel case:
@@ -410,6 +416,12 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() && "Expected SPMD mode with uninitialized runtime.");
+ omptarget_nvptx_simpleThreadPrivateContext->DecParLevel();
+ return;
+ }
+
// pop stack
int threadId = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
@@ -425,6 +437,11 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_parallel_level\n");
+ if (isRuntimeUninitialized()) {
+ assert(isSPMDMode() && "Expected SPMD mode with uninitialized runtime.");
+ return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
+ }
+
int threadId = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
@@ -451,6 +468,7 @@ EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc) {
EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid,
int32_t num_threads) {
PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
+ assert(isRuntimeInitialized() && "Runtime must be initialized.");
tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
num_threads;
@@ -459,6 +477,7 @@ EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid,
EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid,
int32_t simd_limit) {
PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit);
+ assert(isRuntimeInitialized() && "Runtime must be initialized.");
tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 4de2039e42e..c58bfc60358 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -101,9 +101,13 @@ INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
int rc;
if (isRuntimeUninitialized) {
- rc = GetThreadIdInBlock();
- if (!isSPMDExecutionMode && rc >= GetMasterThreadID())
+ assert(isSPMDExecutionMode && "Uninitialized runtime with non-SPMD mode.");
+ // For level 2 parallelism all parallel regions are executed sequentially.
+ if (omptarget_nvptx_simpleThreadPrivateContext
+ ->InL2OrHigherParallelRegion())
rc = 0;
+ else
+ rc = GetThreadIdInBlock();
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
@@ -118,8 +122,13 @@ INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
int rc;
if (isRuntimeUninitialized) {
- rc = isSPMDExecutionMode ? GetNumberOfThreadsInBlock()
- : GetNumberOfThreadsInBlock() - WARPSIZE;
+ assert(isSPMDExecutionMode && "Uninitialized runtime with non-SPMD mode.");
+ // For level 2 parallelism all parallel regions are executed sequentially.
+ if (omptarget_nvptx_simpleThreadPrivateContext
+ ->InL2OrHigherParallelRegion())
+ rc = 1;
+ else
+ rc = GetNumberOfThreadsInBlock();
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
index 7e55df8ca71..ba11f6257b6 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -42,10 +42,8 @@ EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
if (isRuntimeUninitialized()) {
- if (isSPMDMode())
- __kmpc_barrier_simple_spmd(loc_ref, tid);
- else
- __kmpc_barrier_simple_generic(loc_ref, tid);
+ assert(isSPMDMode() && "Expected SPMD mode with uninitialized runtime.");
+ __kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr =
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/task.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/task.cu
index 8d479677854..a8895340c28 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/task.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/task.cu
@@ -81,6 +81,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid,
void *noAliasDepList) {
PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
P64(newKmpTaskDescr));
+ assert(isRuntimeInitialized() && "Runtime must be initialized.");
// 1. get explict task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
@@ -117,6 +118,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr) {
PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
P64(newKmpTaskDescr));
+ assert(isRuntimeInitialized() && "Runtime must be initialized.");
// 1. get explict task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
@@ -141,6 +143,7 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr) {
PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
P64(newKmpTaskDescr));
+ assert(isRuntimeInitialized() && "Runtime must be initialized.");
// 1. get explict task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(