summaryrefslogtreecommitdiff
path: root/openmp
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2019-01-04 20:16:54 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2019-01-04 20:16:54 +0000
commitb09525b0ad8282d40584d3a81ceb829473d3a1a1 (patch)
treee67c2c0ec0550a5eafdb04f498cca5e647c3bedc /openmp
parent622cdf4939d3623c6512275a8ad96de734acdc62 (diff)
[OPENMP][NVPTX]General formatting/code improvement, NFC.
Summary: Formatting. Reviewers: gtbercea, grokos, kkwli0 Subscribers: guansong, openmp-commits, caomhin Differential Revision: https://reviews.llvm.org/D56290
Diffstat (limited to 'openmp')
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu58
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/debug.h2
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu8
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu2
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h23
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h4
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu4
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu30
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h4
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/state-queuei.h3
10 files changed, 57 insertions, 81 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index 15224006434..fb4e8ea38bd 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -13,42 +13,26 @@
#include "omptarget-nvptx.h"
#include <stdio.h>
-// Number of threads in the CUDA block.
-__device__ static unsigned getNumThreads() { return blockDim.x; }
-// Thread ID in the CUDA block
-__device__ static unsigned getThreadId() { return threadIdx.x; }
// Warp ID in the CUDA block
-__device__ static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
+INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
// Lane ID in the CUDA warp.
-__device__ static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
-
-// The CUDA thread ID of the master thread.
-__device__ static unsigned getMasterThreadId() {
- unsigned Mask = WARPSIZE - 1;
- return (getNumThreads() - 1) & (~Mask);
-}
-
-// Find the active threads in the warp - return a mask whose n-th bit is set if
-// the n-th thread in the warp is active.
-__device__ static unsigned getActiveThreadsMask() {
- return __BALLOT_SYNC(0xFFFFFFFF, true);
-}
+INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
// Return true if this is the first active thread in the warp.
-__device__ static bool IsWarpMasterActiveThread() {
- unsigned long long Mask = getActiveThreadsMask();
- unsigned long long ShNum = WARPSIZE - (getThreadId() % WARPSIZE);
+INLINE static bool IsWarpMasterActiveThread() {
+ unsigned long long Mask = __ACTIVEMASK();
+ unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
unsigned long long Sh = Mask << ShNum;
// Truncate Sh to the 32 lower bits
return (unsigned)Sh == 0;
}
// Return true if this is the master thread.
-__device__ static bool IsMasterThread(bool isSPMDExecutionMode) {
- return !isSPMDExecutionMode && getMasterThreadId() == getThreadId();
+INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
+ return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
}
/// Return the provided size aligned to the size of a pointer.
-__device__ static size_t AlignVal(size_t Val) {
+INLINE static size_t AlignVal(size_t Val) {
const size_t Align = (size_t)sizeof(void *);
if (Val & (Align - 1)) {
Val += Align;
@@ -128,7 +112,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
(unsigned long long)SharingDefaultDataSize);
unsigned WID = getWarpId();
- unsigned CurActiveThreads = getActiveThreadsMask();
+ unsigned CurActiveThreads = __ACTIVEMASK();
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
@@ -268,7 +252,7 @@ EXTERN void __kmpc_data_sharing_environment_end(
return;
}
- int32_t CurActive = getActiveThreadsMask();
+ int32_t CurActive = __ACTIVEMASK();
// Only the warp master can restore the stack and frame information, and only
// if there are no other threads left behind in this environment (i.e. the
@@ -341,7 +325,7 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////
-INLINE void data_sharing_init_stack_common() {
+INLINE static void data_sharing_init_stack_common() {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
omptarget_nvptx_TeamDescr *teamDescr =
&omptarget_nvptx_threadPrivateContext->TeamContext();
@@ -380,11 +364,11 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
__threadfence_block();
}
-INLINE void* data_sharing_push_stack_common(size_t PushSize) {
+INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
// Only warp active master threads manage the stack.
- bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
+ bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;
// Add worst-case padding to DataSize so that future stack allocations are
// correctly aligned.
@@ -394,7 +378,7 @@ INLINE void* data_sharing_push_stack_common(size_t PushSize) {
// Frame pointer must be visible to all workers in the same warp.
const unsigned WID = getWarpId();
void *FrameP = 0;
- const int32_t CurActive = getActiveThreadsMask();
+ int32_t CurActive = __ACTIVEMASK();
if (IsWarpMaster) {
// SlotP will point to either the shared memory slot or an existing
@@ -454,8 +438,8 @@ INLINE void* data_sharing_push_stack_common(size_t PushSize) {
return FrameP;
}
-EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
- int16_t UseSharedMemory) {
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+ int16_t UseSharedMemory) {
return data_sharing_push_stack_common(DataSize);
}
@@ -466,8 +450,8 @@ EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
// By default the globalized variables are stored in global memory. If the
// UseSharedMemory is set to true, the runtime will attempt to use shared memory
// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
- int16_t UseSharedMemory) {
+EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
+ int16_t UseSharedMemory) {
// Compute the total memory footprint of the requested data.
// The master thread requires a stack only for itself. A worker
// thread (which at this point is a warp master) will require
@@ -495,7 +479,7 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
__threadfence_block();
- if (getThreadId() % WARPSIZE == 0) {
+ if (GetThreadIdInBlock() % WARPSIZE == 0) {
unsigned WID = getWarpId();
// Current slot
@@ -572,7 +556,7 @@ EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
__SYNCTHREADS();
return;
}
- ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
+ ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
"Must be called only in the target master thread.");
*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
__threadfence();
@@ -591,7 +575,7 @@ EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
return;
}
__threadfence();
- ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
+ ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
"Must be called only in the target master thread.");
omptarget_nvptx_simpleMemoryManager.Release();
}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
index 100713bbca0..d40cf3f60b8 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
@@ -130,7 +130,7 @@
#include "option.h"
template <typename... Arguments>
-static NOINLINE void log(const char *fmt, Arguments... parameters) {
+NOINLINE static void log(const char *fmt, Arguments... parameters) {
printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE),
(int)(threadIdx.x & 0x1F), parameters...);
}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
index fdddf3097b5..b8a61a46678 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -414,12 +414,8 @@ public:
return FINISHED;
}
- // On Pascal, with inlining of the runtime into the user application,
- // this code deadlocks. This is probably because different threads
- // in a warp cannot make independent progress.
- NOINLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid,
- int32_t *plast, T *plower, T *pupper,
- ST *pstride) {
+ INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast,
+ T *plower, T *pupper, ST *pstride) {
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
"Expected non-SPMD mode + initialized runtime.");
// ID of a thread in its own warp
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
index f8610a5c0c5..2a3d49c56db 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -29,7 +29,7 @@ extern __device__ omptarget_nvptx_Queue<
// init entry points
////////////////////////////////////////////////////////////////////////////////
-INLINE unsigned smid() {
+INLINE static unsigned smid() {
unsigned id;
asm("mov.u32 %0, %%smid;" : "=r"(id));
return id;
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 4a12ff36fc3..7a05d93bdfe 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -53,13 +53,11 @@
#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
__shfl_down_sync((mask), (var), (delta), (width))
-#define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate))
#define __ACTIVEMASK() __activemask()
#else
#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
__shfl_down((var), (delta), (width))
-#define __BALLOT_SYNC(mask, predicate) __ballot((predicate))
#define __ACTIVEMASK() __ballot(1)
#endif
@@ -93,7 +91,7 @@ public:
}
}
// Called by all threads.
- INLINE void **GetArgs() { return args; };
+ INLINE void **GetArgs() const { return args; };
private:
// buffer of pre-allocated arguments.
void *buffer[MAX_SHARED_ARGS];
@@ -104,7 +102,8 @@ private:
uint32_t nArgs;
};
-extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+extern __device__ __shared__ omptarget_nvptx_SharedArgs
+ omptarget_nvptx_globalArgs;
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
@@ -155,23 +154,23 @@ extern __device__ __shared__ DataSharingStateTy DataSharingState;
class omptarget_nvptx_TaskDescr {
public:
// methods for flags
- INLINE omp_sched_t GetRuntimeSched();
+ INLINE omp_sched_t GetRuntimeSched() const;
INLINE void SetRuntimeSched(omp_sched_t sched);
- INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; }
- INLINE int InL2OrHigherParallelRegion() {
+ INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
+ INLINE int InL2OrHigherParallelRegion() const {
return items.flags & TaskDescr_InParL2P;
}
- INLINE int IsParallelConstruct() {
+ INLINE int IsParallelConstruct() const {
return items.flags & TaskDescr_IsParConstr;
}
- INLINE int IsTaskConstruct() { return !IsParallelConstruct(); }
+ INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
// methods for other fields
INLINE uint16_t &NThreads() { return items.nthreads; }
INLINE uint16_t &ThreadLimit() { return items.threadlimit; }
INLINE uint16_t &ThreadId() { return items.threadId; }
INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
- INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; }
+ INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
prev = taskDescr;
}
@@ -326,7 +325,7 @@ public:
omptarget_nvptx_TaskDescr *taskICV) {
topTaskDescr[tid] = taskICV;
}
- INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid);
+ INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
// parallel
INLINE uint16_t &NumThreadsForNextParallel(int tid) {
return nextRegion.tnum[tid];
@@ -381,7 +380,7 @@ private:
volatile unsigned keys[OMP_STATE_COUNT];
} MemData[MAX_SM];
- INLINE uint32_t hash(unsigned key) const {
+ INLINE static uint32_t hash(unsigned key) {
return key & (OMP_STATE_COUNT - 1);
}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
index 2e834cc7c93..27cbaad036e 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
@@ -16,7 +16,7 @@
// Task Descriptor
////////////////////////////////////////////////////////////////////////////////
-INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() {
+INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
// sched starts from 1..4; encode it as 0..3; so add 1 here
uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
return (omp_sched_t)rc;
@@ -155,7 +155,7 @@ INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
////////////////////////////////////////////////////////////////////////////////
INLINE omptarget_nvptx_TaskDescr *
-omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) {
+omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
ASSERT0(
LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
"Getting top level, tid is larger than allocated data structure size");
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index aa0b9cf9b14..c5edd31cb15 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -193,7 +193,7 @@ EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////
-static INLINE uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
+INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
uint16_t NThreadsICV,
uint16_t ThreadLimit) {
uint16_t ThreadsRequested = NThreadsICV;
@@ -236,7 +236,7 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
// This routine is only called by the team master. The team master is
// the first thread of the last warp. It always has the logical thread
// id of 0 (since it is a shadow for the first worker thread).
- int threadId = 0;
+ const int threadId = 0;
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index fde1fdecf7f..dbe2d9e7865 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -86,7 +86,7 @@ EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
return val;
}
-static INLINE void gpu_regular_warp_reduce(void *reduce_data,
+INLINE static void gpu_regular_warp_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr shflFct) {
for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
shflFct(reduce_data, /*LaneId - not used= */ 0,
@@ -94,7 +94,7 @@ static INLINE void gpu_regular_warp_reduce(void *reduce_data,
}
}
-static INLINE void gpu_irregular_warp_reduce(void *reduce_data,
+INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
uint32_t size, uint32_t tid) {
uint32_t curr_size;
@@ -108,18 +108,18 @@ static INLINE void gpu_irregular_warp_reduce(void *reduce_data,
}
}
-static INLINE uint32_t
+INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
uint32_t lanemask_lt;
uint32_t lanemask_gt;
uint32_t size, remote_id, physical_lane_id;
physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
- uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+ uint32_t Liveness = __ACTIVEMASK();
uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
do {
- Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+ Liveness = __ACTIVEMASK();
remote_id = __ffs(Liveness & lanemask_gt);
size = __popc(Liveness);
logical_lane_id /= 2;
@@ -134,7 +134,7 @@ int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct) {
- uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+ uint32_t Liveness = __ACTIVEMASK();
if (Liveness == 0xffffffff) {
gpu_regular_warp_reduce(reduce_data, shflFct);
return GetThreadIdInBlock() % WARPSIZE ==
@@ -146,12 +146,10 @@ int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
}
INLINE
-int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
- size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct,
- kmp_InterWarpCopyFctPtr cpyFct,
- bool isSPMDExecutionMode,
- bool isRuntimeUninitialized) {
+static int32_t nvptx_parallel_reduce_nowait(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+ bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
uint32_t NumThreads = GetNumberOfOmpThreads(
BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
@@ -195,12 +193,10 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
BlockThreadId);
-
- return BlockThreadId == 0;
}
return BlockThreadId == 0;
#else
- uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+ uint32_t Liveness = __ACTIVEMASK();
if (Liveness == 0xffffffff) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
@@ -278,7 +274,7 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
}
INLINE
-int32_t nvptx_teams_reduce_nowait(
+static int32_t nvptx_teams_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
@@ -378,7 +374,7 @@ int32_t nvptx_teams_reduce_nowait(
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
// Reduce across warps to the warp master.
- uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+ uint32_t Liveness = __ACTIVEMASK();
if (Liveness == 0xffffffff) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
else // Partial warp but contiguous lanes
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h b/openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h
index accb1f725ff..fe28328acd3 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h
@@ -35,14 +35,14 @@ private:
static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
INLINE uint32_t ENQUEUE_TICKET();
INLINE uint32_t DEQUEUE_TICKET();
- INLINE uint32_t ID(uint32_t ticket);
+ INLINE static uint32_t ID(uint32_t ticket);
INLINE bool IsServing(uint32_t slot, uint32_t id);
INLINE void PushElement(uint32_t slot, ElementType *element);
INLINE ElementType *PopElement(uint32_t slot);
INLINE void DoneServing(uint32_t slot, uint32_t id);
public:
- INLINE omptarget_nvptx_Queue(){};
+ INLINE omptarget_nvptx_Queue() {}
INLINE void Enqueue(ElementType *element);
INLINE ElementType *Dequeue();
};
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/state-queuei.h b/openmp/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
index c9ffd54d4bc..3a1f49f8925 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
@@ -30,7 +30,8 @@ INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
}
template <typename ElementType, uint32_t SIZE>
-INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
+INLINE uint32_t
+omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
return (ticket / SIZE) * 2;
}