aboutsummaryrefslogtreecommitdiff
path: root/libomptarget
diff options
context:
space:
mode:
authorGheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>2018-11-01 18:08:12 +0000
committerGheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>2018-11-01 18:08:12 +0000
commit35f8cbccc5ce718dd63be9eec8bb208a7b34d7f5 (patch)
treebb858b253ab8d4ce0a28460fc8f82465f8d1a908 /libomptarget
parent78cad4d16891b1cdbb58af78587942a0d1312c7c (diff)
[OpenMP][libomptarget] Add runtime function for pushing coalesced global records
Summary: In the case of coalesced global records, we need to push the exact data size passed in. This patch fixes this by outlining the common functionality of the previous push function and by adding a separate entry point for coalesced pushes. The pop function remains unchanged. Reviewers: ABataev, grokos, caomhin Reviewed By: ABataev, grokos Subscribers: jholewinski, cfe-commits, Hahnfeld, guansong, jfb, openmp-commits Differential Revision: https://reviews.llvm.org/D53141 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@345867 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'libomptarget')
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/data_sharing.cu73
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/interface.h2
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu2
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h2
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/supporti.h1
5 files changed, 45 insertions, 35 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index c7b9bdf..4db9f31 100644
--- a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -129,7 +129,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
- void *&FrameP = DataSharingState.FramePtr[WID];
+ void * volatile &FrameP = DataSharingState.FramePtr[WID];
int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
@@ -283,7 +283,7 @@ EXTERN void __kmpc_data_sharing_environment_end(
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
- void *&FrameP = DataSharingState.FramePtr[WID];
+ void * volatile &FrameP = DataSharingState.FramePtr[WID];
SlotP = *SavedSharedSlot;
StackP = *SavedSharedStack;
@@ -321,7 +321,7 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID);
- void *P = DataSharingState.FramePtr[SourceWID];
+ void * volatile P = DataSharingState.FramePtr[SourceWID];
DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
return P;
}
@@ -369,47 +369,31 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
__threadfence_block();
}
-// Called at the time of the kernel initialization. This is used to initilize
-// the list of references to shared variables and to pre-allocate global storage
-// for holding the globalized variables.
-//
-// By default the globalized variables are stored in global memory. If the
-// UseSharedMemory is set to true, the runtime will attempt to use shared memory
-// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
- int16_t UseSharedMemory) {
+INLINE void* data_sharing_push_stack_common(size_t PushSize) {
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, isSPMDMode(),
"Expected SPMD mode with uninitialized runtime.");
- return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
+ return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
}
+ // Only warp active master threads manage the stack.
+ bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
+
// Add worst-case padding to DataSize so that future stack allocations are
// correctly aligned.
const size_t Alignment = 8;
- if (DataSize % Alignment != 0) {
- DataSize += (Alignment - DataSize % Alignment);
- }
+ PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
// Frame pointer must be visible to all workers in the same warp.
unsigned WID = getWarpId();
- void *&FrameP = DataSharingState.FramePtr[WID];
+ void *volatile &FrameP = DataSharingState.FramePtr[WID];
- // Only warp active master threads manage the stack.
- if (getThreadId() % WARPSIZE == 0) {
+ if (IsWarpMaster) {
// SlotP will point to either the shared memory slot or an existing
// global memory slot.
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
- // Compute the total memory footprint of the requested data.
- // The master thread requires a stack only for itself. A worker
- // thread (which at this point is a warp master) will require
- // space for the variables of each thread in the warp,
- // i.e. one DataSize chunk per warp lane.
- // TODO: change WARPSIZE to the number of active threads in the warp.
- size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize;
-
// Check if we have room for the data in the current slot.
const uintptr_t StartAddress = (uintptr_t)StackP;
const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
@@ -453,12 +437,39 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
// Reset stack pointer to the requested address.
StackP = (void *)RequestedEndAddress;
}
+ } else {
+ while (!FrameP);
}
- __threadfence_block();
+ return FrameP;
+}
+
+EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+ int16_t UseSharedMemory) {
+ return data_sharing_push_stack_common(DataSize);
+}
+
+// Called at the time of the kernel initialization. This is used to initialize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If the
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
+ int16_t UseSharedMemory) {
+ // Compute the total memory footprint of the requested data.
+ // The master thread requires a stack only for itself. A worker
+ // thread (which at this point is a warp master) will require
+ // space for the variables of each thread in the warp,
+ // i.e. one DataSize chunk per warp lane.
+ // TODO: change WARPSIZE to the number of active threads in the warp.
+ size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ?
+ DataSize : WARPSIZE * DataSize;
// Compute the start address of the frame of each thread in the warp.
- uintptr_t FrameStartAddress = (uintptr_t)FrameP;
+ uintptr_t FrameStartAddress =
+ (uintptr_t) data_sharing_push_stack_common(PushSize);
FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
return (void *)FrameStartAddress;
}
@@ -475,6 +486,8 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
}
+ __threadfence_block();
+
if (getThreadId() % WARPSIZE == 0) {
unsigned WID = getWarpId();
@@ -501,8 +514,6 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
SlotP->Next = 0;
}
}
-
- __threadfence_block();
}
// Begin a data sharing context. Maintain a list of references to shared
diff --git a/libomptarget/deviceRTLs/nvptx/src/interface.h b/libomptarget/deviceRTLs/nvptx/src/interface.h
index aca8fbe..bf36a5a 100644
--- a/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -478,6 +478,8 @@ EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
EXTERN void __kmpc_data_sharing_init_stack();
EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+ int16_t UseSharedMemory);
EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
EXTERN void __kmpc_data_sharing_pop_stack(void *a);
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
index f23679c..8b70fae 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -40,8 +40,6 @@ INLINE unsigned nsmid() {
INLINE unsigned smid() {
unsigned id;
asm("mov.u32 %0, %%smid;" : "=r"(id));
- ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
- "Expected number of SMs is less than reported.");
return id;
}
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index e0d4c16..5b621ea 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -123,7 +123,7 @@ enum DATA_SHARING_SIZES {
struct DataSharingStateTy {
__kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
void *StackPtr[DS_Max_Warp_Number];
- void *FramePtr[DS_Max_Warp_Number];
+ void * volatile FramePtr[DS_Max_Warp_Number];
int32_t ActiveThreads[DS_Max_Warp_Number];
};
// Additional worker slot type which is initialized with the default worker slot
diff --git a/libomptarget/deviceRTLs/nvptx/src/supporti.h b/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 9cdcc16..c93657e 100644
--- a/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -188,7 +188,6 @@ INLINE void *SafeMalloc(size_t size, const char *msg) // check if success
{
void *ptr = malloc(size);
PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
- ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
return ptr;
}