aboutsummaryrefslogtreecommitdiff
path: root/libomptarget
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2019-05-13 14:21:46 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2019-05-13 14:21:46 +0000
commitd5bb0a8d360f1c053b1397618fbf9f8877365cec (patch)
tree11691459bc764d975dbb3645eabeccca38270253 /libomptarget
parentd140b83ae9d0526ccc8339a2b845287db728cfc7 (diff)
[OPENMP][NVPTX]Simplify handling of thread limit, NFC.
Summary: Patch improves performance of the full runtime mode by moving threads limit counter to the shared memory. It also allows to save global memory. Reviewers: grokos, kkwli0, gtbercea Subscribers: guansong, jdoerfert, openmp-commits, caomhin Tags: #openmp Differential Revision: https://reviews.llvm.org/D61801 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@360584 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'libomptarget')
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/libcall.cu16
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/omp_data.cu1
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu6
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h7
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h9
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/parallel.cu6
6 files changed, 19 insertions, 26 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/libomptarget/deviceRTLs/nvptx/src/libcall.cu
index 9c6d136..9580d75 100644
--- a/libomptarget/deviceRTLs/nvptx/src/libcall.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -37,10 +37,8 @@ EXTERN void omp_set_num_threads(int num) {
PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
if (num <= 0) {
WARNING0(LW_INPUT, "expected positive num; ignore\n");
- } else {
- omptarget_nvptx_TaskDescr *currTaskDescr =
- getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
- currTaskDescr->NThreads() = num;
+ } else if (parallelLevel[GetWarpId()] == 0) {
+ nThreads = num;
}
}
@@ -54,12 +52,10 @@ EXTERN int omp_get_max_threads(void) {
if (parallelLevel[GetWarpId()] > 0)
// We're already in parallel region.
return 1; // default is 1 thread avail
- omptarget_nvptx_TaskDescr *currTaskDescr =
- getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
- ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
- "Should no be in the parallel region");
// Not currently in a parallel region, return what was set.
- int rc = currTaskDescr->NThreads();
+ int rc = 1;
+ if (parallelLevel[GetWarpId()] == 0)
+ rc = nThreads;
ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
return rc;
@@ -175,7 +171,7 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
(int)currTaskDescr->InParallelRegion(), (int)sched,
currTaskDescr->RuntimeChunkSize(),
(int)currTaskDescr->ThreadId(), (int)threadsInTeam,
- (int)currTaskDescr->NThreads());
+ (int)nThreads);
}
if (currTaskDescr->IsParallelConstruct()) {
diff --git a/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
index f7cd334..d369da1 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -34,6 +34,7 @@ __device__ __shared__ uint32_t usedSlotIdx;
__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
__device__ __shared__ uint16_t threadLimit;
__device__ __shared__ uint16_t threadsInTeam;
+__device__ __shared__ uint16_t nThreads;
// Pointer to this team's OpenMP state object
__device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
index 5c861e3..706776a 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -63,7 +63,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
// init team context
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
- currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/false);
+ currTeamDescr.InitTeamDescr();
// this thread will start execution... has to update its task ICV
// to point to the level zero task ICV. That ICV was init in
// InitTeamDescr()
@@ -73,7 +73,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
// set number of threads and thread limit in team to started value
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
- currTaskDescr->NThreads() = GetNumberOfWorkersInTeam();
+ nThreads = GetNumberOfWorkersInTeam();
threadLimit = ThreadLimit;
}
@@ -123,7 +123,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
// init team context
- currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/true);
+ currTeamDescr.InitTeamDescr();
}
// FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 9b6dcbf..cd51538 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -164,7 +164,6 @@ public:
}
INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
// methods for other fields
- INLINE uint16_t &NThreads() { return items.nthreads; }
INLINE uint16_t &ThreadId() { return items.threadId; }
INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
@@ -172,7 +171,7 @@ public:
prev = taskDescr;
}
// init & copy
- INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode);
+ INLINE void InitLevelZeroTaskDescr();
INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
@@ -208,7 +207,6 @@ private:
struct TaskDescr_items {
uint8_t flags; // 6 bit used (see flag above)
uint8_t unused;
- uint16_t nthreads; // thread num for subsequent parallel regions
uint16_t threadId; // thread id
uint64_t runtimeChunkSize; // runtime chunk size
} items;
@@ -249,7 +247,7 @@ public:
INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
// init
- INLINE void InitTeamDescr(bool isSPMDExecutionMode);
+ INLINE void InitTeamDescr();
INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
// If this is invoked by the master thread of the master warp then intialize
@@ -404,6 +402,7 @@ extern __device__ __shared__ uint8_t
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern __device__ __shared__ uint16_t threadLimit;
extern __device__ __shared__ uint16_t threadsInTeam;
+extern __device__ __shared__ uint16_t nThreads;
extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
index c890b95..e4efa18 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
@@ -31,7 +31,7 @@ INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
}
INLINE void
-omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) {
+omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
// slow method
// flag:
// default sched is static,
@@ -39,8 +39,6 @@ omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) {
// not in parallel
items.flags = 0;
- items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode);
- ; // threads: whatever was alloc by kernel
items.threadId = 0; // is master
items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
}
@@ -57,7 +55,6 @@ INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
items.flags =
TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
- items.nthreads = 0; // # threads for subsequent parallel region
items.threadId =
GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
@@ -173,8 +170,8 @@ omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////
-INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr(bool isSPMDExecutionMode) {
- levelZeroTaskDescr.InitLevelZeroTaskDescr(isSPMDExecutionMode);
+INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
+ levelZeroTaskDescr.InitLevelZeroTaskDescr();
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index 3f2ec44..6747235 100644
--- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -249,8 +249,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
uint16_t &NumThreadsClause =
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
- uint16_t NumThreads = determineNumberOfThreads(
- NumThreadsClause, currTaskDescr->NThreads(), threadLimit);
+ uint16_t NumThreads =
+ determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);
if (NumThreadsClause != 0) {
// Reset request to avoid propagating to successive #parallel
@@ -308,7 +308,7 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
PRINT(LD_PAR,
"thread will execute parallel region with id %d in a team of "
"%d threads\n",
- (int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads());
+ (int)newTaskDescr->ThreadId(), (int)nThreads);
isActive = true;
IncParallelLevel(threadsInTeam != 1);