diff options
author | Jon Chesterfield <jonathanchesterfield@gmail.com> | 2019-08-28 01:47:41 +0000 |
---|---|---|
committer | Jon Chesterfield <jonathanchesterfield@gmail.com> | 2019-08-28 01:47:41 +0000 |
commit | 77d31c11e83957e751f9d9854e58110e6b532c0c (patch) | |
tree | 56b9cad81677305847b2c1ed1ad485aa98bf98d2 /libomptarget | |
parent | bfca7f70c5f1756d7e74e92a1de5630dc41f54fc (diff) |
[libomptarget] Refactor shfl_down_sync macro to inline function
Summary:
[libomptarget] Refactor shfl_down_sync macro to inline function
See also the abandoned D66846, which was split into this diff and others.
Reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers
Subscribers: openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D66853
git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@370146 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'libomptarget')
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h | 4 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/reduction.cu | 7 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/target_impl.h | 10 |
3 files changed, 14 insertions, 7 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 9fcd1a9..0a0c6cc 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -51,13 +51,9 @@ #ifndef CUDA_VERSION #error CUDA_VERSION macro is undefined, something wrong with cuda. #elif CUDA_VERSION >= 9000 -#define __SHFL_DOWN_SYNC(mask, var, delta, width) \ - __shfl_down_sync((mask), (var), (delta), (width)) #define __ACTIVEMASK() __activemask() #define __SYNCWARP(Mask) __syncwarp(Mask) #else -#define __SHFL_DOWN_SYNC(mask, var, delta, width) \ - __shfl_down((var), (delta), (width)) #define __ACTIVEMASK() __ballot(1) // In Cuda < 9.0 no need to sync threads in warps. #define __SYNCWARP(Mask) diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu index c925638..e5e76d5 100644 --- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -15,6 +15,7 @@ #include <stdio.h> #include "omptarget-nvptx.h" +#include "target_impl.h" EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid) {} @@ -23,14 +24,14 @@ EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {} EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) { - return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size); + return __kmpc_impl_shfl_down_sync(0xFFFFFFFF, val, delta, size); } EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { int lo, hi; asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); - hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size); - lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size); + hi = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, hi, delta, size); + lo = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, lo, delta, size); asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); return val; } diff --git 
a/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/libomptarget/deviceRTLs/nvptx/src/target_impl.h index caa9fea..0f54828 100644 --- a/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -43,6 +43,7 @@ INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); } #endif // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. + INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, int32_t SrcLane) { #if CUDA_VERSION >= 9000 @@ -50,6 +51,15 @@ INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, #else return __shfl(Var, SrcLane); #endif // CUDA_VERSION + +INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, + int32_t Var, uint32_t Delta, + int32_t Width) { +#if CUDA_VERSION >= 9000 + return __shfl_down_sync(Mask, Var, Delta, Width); +#else + return __shfl_down(Var, Delta, Width); +#endif // CUDA_VERSION } INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); } |