summary | refs | log | tree | commit | diff
path: root/openmp/libomptarget
diff options
context:
space:
mode:
author: Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com> 2018-11-01 18:08:12 +0000
committer: Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com> 2018-11-01 18:08:12 +0000
commit: 59f8d691a3f2952e4c24614810b838203987ddb8 (patch)
tree: 190ebd828c853d4d1486a0c6ca8bbbaf9af5a3fb /openmp/libomptarget
parent: d3101296109f18eb4c3ea9fb0866f8ba86f48e33 (diff)
[OpenMP][libomptarget] Add runtime function for pushing coalesced global records
Summary:
In the case of coalesced global records, we need to push the exact data size passed in. This patch fixes this by outlining the common functionality of the previous push function and by adding a separate entry point for coalesced pushes. The pop function remains unchanged.

Reviewers: ABataev, grokos, caomhin

Reviewed By: ABataev, grokos

Subscribers: jholewinski, cfe-commits, Hahnfeld, guansong, jfb, openmp-commits

Differential Revision: https://reviews.llvm.org/D53141
Diffstat (limited to 'openmp/libomptarget')
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu | 73
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/interface.h | 2
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu | 2
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h | 2
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h | 1
5 files changed, 45 insertions, 35 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index c7b9bdf9a9b..4db9f31a55d 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -129,7 +129,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
- void *&FrameP = DataSharingState.FramePtr[WID];
+ void * volatile &FrameP = DataSharingState.FramePtr[WID];
int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
@@ -283,7 +283,7 @@ EXTERN void __kmpc_data_sharing_environment_end(
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
- void *&FrameP = DataSharingState.FramePtr[WID];
+ void * volatile &FrameP = DataSharingState.FramePtr[WID];
SlotP = *SavedSharedSlot;
StackP = *SavedSharedStack;
@@ -321,7 +321,7 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID);
- void *P = DataSharingState.FramePtr[SourceWID];
+ void * volatile P = DataSharingState.FramePtr[SourceWID];
DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
return P;
}
@@ -369,47 +369,31 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
__threadfence_block();
}
-// Called at the time of the kernel initialization. This is used to initilize
-// the list of references to shared variables and to pre-allocate global storage
-// for holding the globalized variables.
-//
-// By default the globalized variables are stored in global memory. If the
-// UseSharedMemory is set to true, the runtime will attempt to use shared memory
-// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
- int16_t UseSharedMemory) {
+INLINE void* data_sharing_push_stack_common(size_t PushSize) {
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, isSPMDMode(),
"Expected SPMD mode with uninitialized runtime.");
- return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
+ return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
}
+ // Only warp active master threads manage the stack.
+ bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
+
// Add worst-case padding to DataSize so that future stack allocations are
// correctly aligned.
const size_t Alignment = 8;
- if (DataSize % Alignment != 0) {
- DataSize += (Alignment - DataSize % Alignment);
- }
+ PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
// Frame pointer must be visible to all workers in the same warp.
unsigned WID = getWarpId();
- void *&FrameP = DataSharingState.FramePtr[WID];
+ void *volatile &FrameP = DataSharingState.FramePtr[WID];
- // Only warp active master threads manage the stack.
- if (getThreadId() % WARPSIZE == 0) {
+ if (IsWarpMaster) {
// SlotP will point to either the shared memory slot or an existing
// global memory slot.
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
- // Compute the total memory footprint of the requested data.
- // The master thread requires a stack only for itself. A worker
- // thread (which at this point is a warp master) will require
- // space for the variables of each thread in the warp,
- // i.e. one DataSize chunk per warp lane.
- // TODO: change WARPSIZE to the number of active threads in the warp.
- size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize;
-
// Check if we have room for the data in the current slot.
const uintptr_t StartAddress = (uintptr_t)StackP;
const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
@@ -453,12 +437,39 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
// Reset stack pointer to the requested address.
StackP = (void *)RequestedEndAddress;
}
+ } else {
+ while (!FrameP);
}
- __threadfence_block();
+ return FrameP;
+}
+
+EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+ int16_t UseSharedMemory) {
+ return data_sharing_push_stack_common(DataSize);
+}
+
+// Called at the time of the kernel initialization. This is used to initilize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If the
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
+ int16_t UseSharedMemory) {
+ // Compute the total memory footprint of the requested data.
+ // The master thread requires a stack only for itself. A worker
+ // thread (which at this point is a warp master) will require
+ // space for the variables of each thread in the warp,
+ // i.e. one DataSize chunk per warp lane.
+ // TODO: change WARPSIZE to the number of active threads in the warp.
+ size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ?
+ DataSize : WARPSIZE * DataSize;
// Compute the start address of the frame of each thread in the warp.
- uintptr_t FrameStartAddress = (uintptr_t)FrameP;
+ uintptr_t FrameStartAddress =
+ (uintptr_t) data_sharing_push_stack_common(PushSize);
FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
return (void *)FrameStartAddress;
}
@@ -475,6 +486,8 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
}
+ __threadfence_block();
+
if (getThreadId() % WARPSIZE == 0) {
unsigned WID = getWarpId();
@@ -501,8 +514,6 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
SlotP->Next = 0;
}
}
-
- __threadfence_block();
}
// Begin a data sharing context. Maintain a list of references to shared
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
index aca8fbe7e88..bf36a5a3e6a 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -478,6 +478,8 @@ EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
EXTERN void __kmpc_data_sharing_init_stack();
EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+ int16_t UseSharedMemory);
EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
EXTERN void __kmpc_data_sharing_pop_stack(void *a);
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
index f23679ca7b0..8b70faef04b 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -40,8 +40,6 @@ INLINE unsigned nsmid() {
INLINE unsigned smid() {
unsigned id;
asm("mov.u32 %0, %%smid;" : "=r"(id));
- ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
- "Expected number of SMs is less than reported.");
return id;
}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index e0d4c1679cd..5b621ea5b79 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -123,7 +123,7 @@ enum DATA_SHARING_SIZES {
struct DataSharingStateTy {
__kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
void *StackPtr[DS_Max_Warp_Number];
- void *FramePtr[DS_Max_Warp_Number];
+ void * volatile FramePtr[DS_Max_Warp_Number];
int32_t ActiveThreads[DS_Max_Warp_Number];
};
// Additional worker slot type which is initialized with the default worker slot
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 9cdcc162dd4..c93657e45e1 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -188,7 +188,6 @@ INLINE void *SafeMalloc(size_t size, const char *msg) // check if success
{
void *ptr = malloc(size);
PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
- ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
return ptr;
}