 libomptarget/deviceRTLs/nvptx/src/loop.cu | 107 +++++++++++++-----------
 1 file changed, 50 insertions(+), 57 deletions(-)
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index e764752..642516d 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -215,7 +215,8 @@ public:
schedule <= kmp_sched_ordered_last;
}
- INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, ST st,
+ INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId,
+ kmp_sched_t schedule, T lb, T ub, ST st,
ST chunk) {
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
@@ -282,18 +283,15 @@ public:
"unknown schedule %d & chunk %lld\n", schedule, P64(chunk));
}
- // save sched state
- omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
-
// init schedules
if (schedule == kmp_sched_static_chunk) {
ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+ // save sched state
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
// save ub
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
// compute static chunk
ST stride;
- T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
int lastiter = 0;
ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
// save computed params
@@ -301,8 +299,8 @@ public:
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
PRINT(LD_LOOP,
- "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 ","
- "next lower bound = %llu, stride = %llu\n",
+ "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
+ ", next lower bound = %llu, stride = %llu\n",
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
@@ -310,11 +308,12 @@ public:
} else if (schedule == kmp_sched_static_nochunk) {
ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
+ // save sched state
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
// save ub
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
// compute static chunk
ST stride;
- T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
int lastiter = 0;
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
// save computed params
@@ -322,45 +321,50 @@ public:
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
PRINT(LD_LOOP,
- "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 ","
- "next lower bound = %llu, stride = %llu\n",
+ "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
+ ", next lower bound = %llu, stride = %llu\n",
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->Stride(tid));
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
- if (chunk < 1)
- chunk = 1;
- Counter eventNum = ((tripCount - 1) / chunk) + 1; // number of chunks
- // but each thread (but one) must discover that it is last
- eventNum += tnum;
- omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
- omptarget_nvptx_threadPrivateContext->EventsNumber(tid) = eventNum;
+ if (isSPMDMode())
+ __syncthreads();
+ else
+ __kmpc_barrier(loc, threadId);
+ // save sched state
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+ if (GetThreadIdInBlock() == 0) {
+ if (chunk < 1)
+ chunk = 1;
+ int teamId = GetOmpTeamId();
+ omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
+ }
+ if (isSPMDMode())
+ __syncthreads();
+ else
+ __kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
- "dispatch init (dyn) : num threads = %d, ub = %" PRId64 ", chunk %" PRIu64 ", "
- "events number = %llu\n",
+ "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
+ ", chunk %" PRIu64 "\n",
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
- omptarget_nvptx_threadPrivateContext->Chunk(tid),
- omptarget_nvptx_threadPrivateContext->EventsNumber(tid));
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
+ omptarget_nvptx_threadPrivateContext->Chunk(teamId));
}
}
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch next
- INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup &cg,
- Counter priv, T &lb, T &ub,
- Counter &chunkId, Counter &currentEvent,
- T chunkSize, T loopUpperBound) {
- // get next event atomically
- Counter nextEvent = cg.Next();
- // calculate chunk Id (priv was initialized upon entering the loop to
- // 'start' == 'event')
- chunkId = nextEvent - priv;
+ INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
+ Counter &loopLowerBound,
+ T loopUpperBound) {
// calculate lower bound for all lanes in the warp
- lb = chunkId * chunkSize; // this code assume normalization of LB
+ lb = atomicAdd(&loopLowerBound, (Counter)chunkSize);
ub = lb + chunkSize - 1; // Clang uses i <= ub
// 3 result cases:
@@ -368,9 +372,8 @@ public:
// b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
// NOT_FINISHED
// c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
- currentEvent = nextEvent;
// a.
- if (ub <= loopUpperBound) {
+ if (lb <= loopUpperBound && ub < loopUpperBound) {
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb),
P64(ub), P64(loopUpperBound));
return NOT_FINISHED;
@@ -383,7 +386,8 @@ public:
return LAST_CHUNK;
}
// c. if we are here, we are in case 'c'
- lb = loopUpperBound + 1;
+ lb = loopUpperBound + 2;
+ ub = loopUpperBound + 1;
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb),
P64(ub), P64(loopUpperBound));
return FINISHED;
@@ -437,29 +441,18 @@ public:
ASSERT0(LT_FUSSY,
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
"bad sched");
- omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
T myLb, myUb;
- Counter chunkId;
- // xxx current event is now local
- omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup();
+ int teamId = GetOmpTeamId();
int finished = DynamicNextChunk(
- cg, omptarget_nvptx_threadPrivateContext->Priv(tid), myLb, myUb,
- chunkId, omptarget_nvptx_threadPrivateContext->CurrentEvent(tid),
- omptarget_nvptx_threadPrivateContext->Chunk(tid),
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
-
- if (finished == FINISHED) {
- cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(tid),
- omptarget_nvptx_threadPrivateContext->EventsNumber(tid));
- cg.Release(omptarget_nvptx_threadPrivateContext->Priv(tid),
- omptarget_nvptx_threadPrivateContext->CurrentEvent(tid));
+ myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
+ if (finished == FINISHED)
return DISPATCH_FINISHED;
- }
// not finished (either not finished or last chunk)
- *plast = (int32_t)(
- myUb == omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
+ *plast = (int32_t)(finished == LAST_CHUNK);
*plower = myLb;
*pupper = myUb;
*pstride = 1;
@@ -491,7 +484,7 @@ EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid,
int32_t st, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
- (kmp_sched_t)schedule, lb, ub, st, chunk);
+ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
@@ -499,7 +492,7 @@ EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
int32_t st, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
- (kmp_sched_t)schedule, lb, ub, st, chunk);
+ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
@@ -507,7 +500,7 @@ EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
int64_t st, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
- (kmp_sched_t)schedule, lb, ub, st, chunk);
+ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
@@ -515,7 +508,7 @@ EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
int64_t st, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
- (kmp_sched_t)schedule, lb, ub, st, chunk);
+ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
// next
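
Note: the patch above drops the counter-group/event machinery and instead keeps one team-wide lower bound (NextLowerBound indexed by team) that threads advance with atomicAdd to claim dynamic chunks. The following is a minimal standalone CUDA sketch of that claiming scheme, for illustration only; it is not the runtime code, the names (dynamicLoopSketch, teamLowerBound, processIteration) are invented for the example, and it uses an exclusive upper bound rather than the runtime's inclusive "i <= ub" convention.

// Team-wide cursor, the analogue of NextLowerBound(teamId) in the patch.
__device__ unsigned long long teamLowerBound;

// Stand-in for the loop body.
__device__ void processIteration(unsigned long long i) { /* ... */ }

__global__ void dynamicLoopSketch(unsigned long long tripCount,
                                  unsigned long long chunk) {
  if (threadIdx.x == 0)
    teamLowerBound = 0;        // one thread initializes, as in dispatch_init
  __syncthreads();             // mirrors the barrier around the init

  for (;;) {
    // Claim the next chunk by atomically bumping the shared lower bound,
    // as DynamicNextChunk now does with atomicAdd on loopLowerBound.
    unsigned long long lb = atomicAdd(&teamLowerBound, chunk);
    if (lb >= tripCount)       // empty chunk: nothing left for this thread
      break;
    unsigned long long ub = (lb + chunk < tripCount) ? lb + chunk : tripCount;
    for (unsigned long long i = lb; i < ub; ++i)
      processIteration(i);
  }
}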