diff options
author | Jon Chesterfield <jonathanchesterfield@gmail.com> | 2019-08-28 01:31:04 +0000 |
---|---|---|
committer | Jon Chesterfield <jonathanchesterfield@gmail.com> | 2019-08-28 01:31:04 +0000 |
commit | bfca7f70c5f1756d7e74e92a1de5630dc41f54fc (patch) | |
tree | ee0b027c830074f8d87ce232f9778fba7c280db3 /libomptarget | |
parent | ad72f5e57bb95e2f765d693821562b4c79406fa9 (diff) |
[libomptarget] Refactor shfl_sync macro to inline function
Summary:
[libomptarget] Refactor shfl_sync macro to inline function
See also the abandoned D66846, which was split into this diff and others.
Reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers
Subscribers: openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D66852
git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@370144 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'libomptarget')
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/data_sharing.cu | 6 |
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/loop.cu | 4 |
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h | 2 |
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/parallel.cu | 6 |
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/target_impl.h | 14 |
5 files changed, 24 insertions, 8 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu index 50b8654..b7b8002 100644 --- a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" +#include "target_impl.h" #include <stdio.h> // Warp ID in the CUDA block @@ -430,9 +431,10 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) { } } // Get address from lane 0. - ((int *)&FrameP)[0] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[0], 0); + int *FP = (int *)&FrameP; + FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0); if (sizeof(FrameP) == 8) - ((int *)&FrameP)[1] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[1], 0); + FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0); return FrameP; } diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu index ff9fac3..4f24ada 100644 --- a/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -383,8 +383,8 @@ public: INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) { int lo, hi; __kmpc_impl_unpack(val, lo, hi); - hi = __SHFL_SYNC(active, hi, leader); - lo = __SHFL_SYNC(active, lo, leader); + hi = __kmpc_impl_shfl_sync(active, hi, leader); + lo = __kmpc_impl_shfl_sync(active, lo, leader); return __kmpc_impl_pack(lo, hi); } diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index a5e4a71..9fcd1a9 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -51,13 +51,11 @@ #ifndef CUDA_VERSION #error CUDA_VERSION macro is undefined, something wrong with cuda. 
#elif CUDA_VERSION >= 9000 -#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ __shfl_down_sync((mask), (var), (delta), (width)) #define __ACTIVEMASK() __activemask() #define __SYNCWARP(Mask) __syncwarp(Mask) #else -#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ __shfl_down((var), (delta), (width)) #define __ACTIVEMASK() __ballot(1) diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu index 6747235..182a4f6 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -33,6 +33,7 @@ //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" +#include "target_impl.h" typedef struct ConvergentSimdJob { omptarget_nvptx_TaskDescr taskDescr; @@ -64,7 +65,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask, omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId); job->slimForNextSimd = SimdLimit; - int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource); + int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource); // reset simdlimit to avoid propagating to successive #simd if (SimdLimitSource > 0 && threadId == sourceThreadId) omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0; @@ -138,7 +139,8 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask, omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); job->tnumForNextPar = NumThreadsClause; - int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource); + int32_t NumThreadsSource = + __kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource); // reset numthreads to avoid propagating to successive #parallel if (NumThreadsSource > 0 && threadId == sourceThreadId) 
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/libomptarget/deviceRTLs/nvptx/src/target_impl.h index a1b4c20..caa9fea 100644 --- a/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -38,6 +38,20 @@ INLINE int __kmpc_impl_ffs(uint32_t x) { return __ffs(x); } INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); } +#ifndef CUDA_VERSION +#error CUDA_VERSION macro is undefined, something wrong with cuda. +#endif + +// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. +INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, + int32_t SrcLane) { +#if CUDA_VERSION >= 9000 + return __shfl_sync(Mask, Var, SrcLane); +#else + return __shfl(Var, SrcLane); +#endif // CUDA_VERSION +} + INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); } #endif |