diff options
Diffstat (limited to 'libomptarget/deviceRTLs/nvptx/src/parallel.cu')
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/parallel.cu | 26 |
1 files changed, 22 insertions, 4 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu index d8049d9..5db443c 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -311,7 +311,16 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn, (int)newTaskDescr->ThreadId(), (int)nThreads); isActive = true; - IncParallelLevel(threadsInTeam != 1); + // Reconverge the threads at the end of the parallel region to correctly + // handle parallel levels. + // In CUDA 9+ in non-SPMD mode we have either 1 worker thread or the whole + // warp. If only 1 thread is active, there is no need to reconverge the threads. + // If we have the whole warp, reconverge all the threads in the warp before + // actually trying to change the parallel level. Otherwise, the parallel level + // can be changed incorrectly because of thread divergence. + bool IsActiveParallelRegion = threadsInTeam != 1; + IncParallelLevel(IsActiveParallelRegion, + IsActiveParallelRegion ? 0xFFFFFFFF : 1u); } return isActive; @@ -329,7 +338,16 @@ EXTERN void __kmpc_kernel_end_parallel() { omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); - DecParallelLevel(threadsInTeam != 1); + // Reconverge the threads at the end of the parallel region to correctly + // handle parallel levels. + // In CUDA 9+ in non-SPMD mode we have either 1 worker thread or the whole + // warp. If only 1 thread is active, there is no need to reconverge the threads. + // If we have the whole warp, reconverge all the threads in the warp before + // actually trying to change the parallel level. Otherwise, the parallel level can + // be changed incorrectly because of thread divergence. + bool IsActiveParallelRegion = threadsInTeam != 1; + DecParallelLevel(IsActiveParallelRegion, + IsActiveParallelRegion ? 
0xFFFFFFFF : 1u); } //////////////////////////////////////////////////////////////////////////////// @@ -339,7 +357,7 @@ EXTERN void __kmpc_kernel_end_parallel() { EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); - IncParallelLevel(/*ActiveParallel=*/false); + IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), @@ -378,7 +396,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); - DecParallelLevel(/*ActiveParallel=*/false); + DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), |