aboutsummaryrefslogtreecommitdiff
path: root/libgomp/plugin
diff options
context:
space:
mode:
authorAndrew Stubbs <ams@codesourcery.com>2021-12-03 17:46:41 +0000
committerAndrew Stubbs <ams@codesourcery.com>2023-12-06 16:48:57 +0000
commit30486fab717a90dc7516722c24ef9c5ea246c350 (patch)
tree473b0117e6eb85aa3d358ef4bf9ccdc3cff5ec6b /libgomp/plugin
parent458e7c937924bbcef80eb006af0b61420dbfc1c1 (diff)
libgomp, nvptx: low-latency memory allocator
This patch adds support for allocating low-latency ".shared" memory on NVPTX GPU device, via the omp_low_lat_mem_space and omp_alloc. The memory can be allocated, reallocated, and freed using a basic but fast algorithm, is thread safe and the size of the low-latency heap can be configured using the GOMP_NVPTX_LOWLAT_POOL environment variable. The use of the PTX dynamic_smem_size feature means that low-latency allocator will not work with the PTX 3.1 multilib. For now, the omp_low_lat_mem_alloc allocator also works, but that will change when I implement the access traits. libgomp/ChangeLog: * allocator.c (MEMSPACE_ALLOC): New macro. (MEMSPACE_CALLOC): New macro. (MEMSPACE_REALLOC): New macro. (MEMSPACE_FREE): New macro. (predefined_alloc_mapping): New array. Add _Static_assert to match. (ARRAY_SIZE): New macro. (omp_aligned_alloc): Use MEMSPACE_ALLOC. Implement fall-backs for predefined allocators. Simplify existing fall-backs. (omp_free): Use MEMSPACE_FREE. (omp_calloc): Use MEMSPACE_CALLOC. Implement fall-backs for predefined allocators. Simplify existing fall-backs. (omp_realloc): Use MEMSPACE_REALLOC, MEMSPACE_ALLOC, and MEMSPACE_FREE. Implement fall-backs for predefined allocators. Simplify existing fall-backs. * config/nvptx/team.c (__nvptx_lowlat_pool): New asm variable. (__nvptx_lowlat_init): New prototype. (gomp_nvptx_main): Call __nvptx_lowlat_init. * libgomp.texi: Update memory space table. * plugin/plugin-nvptx.c (lowlat_pool_size): New variable. (GOMP_OFFLOAD_init_device): Read the GOMP_NVPTX_LOWLAT_POOL envvar. (GOMP_OFFLOAD_run): Apply lowlat_pool_size. * basic-allocator.c: New file. * config/nvptx/allocator.c: New file. * testsuite/libgomp.c/omp_alloc-1.c: New test. * testsuite/libgomp.c/omp_alloc-2.c: New test. * testsuite/libgomp.c/omp_alloc-3.c: New test. * testsuite/libgomp.c/omp_alloc-4.c: New test. * testsuite/libgomp.c/omp_alloc-5.c: New test. * testsuite/libgomp.c/omp_alloc-6.c: New test. Co-authored-by: Kwok Cheung Yeung <kcy@codesourcery.com> Co-Authored-By: Thomas Schwinge <thomas@codesourcery.com>
Diffstat (limited to 'libgomp/plugin')
-rw-r--r--libgomp/plugin/plugin-nvptx.c23
1 files changed, 22 insertions, 1 deletions
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 0548e7e09e5..d4a254ed4f0 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -341,6 +341,11 @@ struct ptx_device
static struct ptx_device **ptx_devices;
+/* OpenMP kernels reserve a small amount of ".shared" space for use by
+ omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
+ default is set here. */
+static unsigned lowlat_pool_size = 8 * 1024;
+
static inline struct nvptx_thread *
nvptx_thread (void)
{
@@ -1219,6 +1224,22 @@ GOMP_OFFLOAD_init_device (int n)
instantiated_devices++;
}
+ const char *var_name = "GOMP_NVPTX_LOWLAT_POOL";
+ const char *env_var = secure_getenv (var_name);
+ notify_var (var_name, env_var);
+
+ if (env_var != NULL)
+ {
+ char *endptr;
+ unsigned long val = strtoul (env_var, &endptr, 10);
+ if (endptr == NULL || *endptr != '\0'
+ || errno == ERANGE || errno == EINVAL
+ || val > UINT_MAX)
+ GOMP_PLUGIN_error ("Error parsing %s", var_name);
+ else
+ lowlat_pool_size = val;
+ }
+
pthread_mutex_unlock (&ptx_dev_lock);
return dev != NULL;
@@ -2178,7 +2199,7 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
" [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
__FUNCTION__, fn_name, teams, threads);
r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
- 32, threads, 1, 0, NULL, NULL, config);
+ 32, threads, 1, lowlat_pool_size, NULL, NULL, config);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
if (reverse_offload)