diff options
author | Thomas Schwinge <tschwinge@baylibre.com> | 2024-06-05 12:40:50 +0200 |
---|---|---|
committer | Thomas Schwinge <tschwinge@baylibre.com> | 2024-06-06 13:41:47 +0200 |
commit | 5bbe5350a0932c78d4ffce292ba4104a6fe6ef96 (patch) | |
tree | 85f53058f83c4185c39ff6a1a132545b9cb5045a /libgomp/plugin | |
parent | b4e68dd9084e48ee3e83c11d7f27548d8cca7066 (diff) |
nvptx offloading: Global constructor, destructor support, via nvptx-tools 'ld'
This extends commit d9c90c82d900fdae95df4499bf5f0a4ecb903b53
"nvptx target: Global constructor, destructor support, via nvptx-tools 'ld'"
for offloading.
libgcc/
* config/nvptx/gbl-ctors.c ["mgomp"]
(__do_global_ctors__entry__mgomp)
(__do_global_dtors__entry__mgomp): New.
[!"mgomp"] (__do_global_ctors__entry, __do_global_dtors__entry):
New.
libgomp/
* plugin/plugin-nvptx.c (nvptx_do_global_cdtors): New.
(nvptx_close_device, GOMP_OFFLOAD_load_image)
(GOMP_OFFLOAD_unload_image): Call it.
Diffstat (limited to 'libgomp/plugin')
-rw-r--r-- | libgomp/plugin/plugin-nvptx.c | 117 |
1 files changed, 116 insertions, 1 deletions
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 4cedc5390a3..0f3a3be1898 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -346,6 +346,11 @@ static struct ptx_device **ptx_devices; default is set here. */ static unsigned lowlat_pool_size = 8 * 1024; +static bool nvptx_do_global_cdtors (CUmodule, struct ptx_device *, + const char *); +static size_t nvptx_stacks_size (); +static void *nvptx_stacks_acquire (struct ptx_device *, size_t, int); + static inline struct nvptx_thread * nvptx_thread (void) { @@ -565,6 +570,18 @@ nvptx_close_device (struct ptx_device *ptx_dev) if (!ptx_dev) return true; + bool ret = true; + + for (struct ptx_image_data *image = ptx_dev->images; + image != NULL; + image = image->next) + { + if (!nvptx_do_global_cdtors (image->module, ptx_dev, + "__do_global_dtors__entry" + /* or "__do_global_dtors__entry__mgomp" */)) + ret = false; + } + for (struct ptx_free_block *b = ptx_dev->free_blocks; b;) { struct ptx_free_block *b_next = b->next; @@ -585,7 +602,8 @@ nvptx_close_device (struct ptx_device *ptx_dev) CUDA_CALL (cuCtxDestroy, ptx_dev->ctx); free (ptx_dev); - return true; + + return ret; } static int @@ -1317,6 +1335,93 @@ nvptx_set_clocktick (CUmodule module, struct ptx_device *dev) GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); } +/* Invoke MODULE's global constructors/destructors. */ + +static bool +nvptx_do_global_cdtors (CUmodule module, struct ptx_device *ptx_dev, + const char *funcname) +{ + bool ret = true; + char *funcname_mgomp = NULL; + CUresult r; + CUfunction funcptr; + r = CUDA_CALL_NOCHECK (cuModuleGetFunction, + &funcptr, module, funcname); + GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n", + funcname, cuda_error (r)); + if (r == CUDA_ERROR_NOT_FOUND) + { + /* Try '[funcname]__mgomp'. */ + + size_t funcname_len = strlen (funcname); + const char *mgomp_suffix = "__mgomp"; + size_t mgomp_suffix_len = strlen (mgomp_suffix); + funcname_mgomp + = GOMP_PLUGIN_malloc (funcname_len + mgomp_suffix_len + 1); + memcpy (funcname_mgomp, funcname, funcname_len); + memcpy (funcname_mgomp + funcname_len, + mgomp_suffix, mgomp_suffix_len + 1); + funcname = funcname_mgomp; + + r = CUDA_CALL_NOCHECK (cuModuleGetFunction, + &funcptr, module, funcname); + GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n", + funcname, cuda_error (r)); + } + if (r == CUDA_ERROR_NOT_FOUND) + ; + else if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("cuModuleGetFunction (%s) error: %s", + funcname, cuda_error (r)); + ret = false; + } + else + { + /* If necessary, set up soft stack. */ + void *nvptx_stacks_0; + void *kargs[1]; + if (funcname_mgomp) + { + size_t stack_size = nvptx_stacks_size (); + pthread_mutex_lock (&ptx_dev->omp_stacks.lock); + nvptx_stacks_0 = nvptx_stacks_acquire (ptx_dev, stack_size, 1); + nvptx_stacks_0 += stack_size; + kargs[0] = &nvptx_stacks_0; + } + r = CUDA_CALL_NOCHECK (cuLaunchKernel, + funcptr, + 1, 1, 1, 1, 1, 1, + /* sharedMemBytes */ 0, + /* hStream */ NULL, + /* kernelParams */ funcname_mgomp ? kargs : NULL, + /* extra */ NULL); + if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("cuLaunchKernel (%s) error: %s", + funcname, cuda_error (r)); + ret = false; + } + + r = CUDA_CALL_NOCHECK (cuStreamSynchronize, + NULL); + if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("cuStreamSynchronize (%s) error: %s", + funcname, cuda_error (r)); + ret = false; + } + + if (funcname_mgomp) + pthread_mutex_unlock (&ptx_dev->omp_stacks.lock); + } + + if (funcname_mgomp) + free (funcname_mgomp); + + return ret; +} + /* Load the (partial) program described by TARGET_DATA to device number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE will contain the on-device addresses of the functions for reverse offload. @@ -1546,6 +1651,11 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, nvptx_set_clocktick (module, dev); + if (!nvptx_do_global_cdtors (module, dev, + "__do_global_ctors__entry" + /* or "__do_global_ctors__entry__mgomp" */)) + return -1; + return fn_entries + var_entries + other_entries; } @@ -1571,6 +1681,11 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data) for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next) if (image->target_data == target_data) { + if (!nvptx_do_global_cdtors (image->module, dev, + "__do_global_dtors__entry" + /* or "__do_global_dtors__entry__mgomp" */)) + ret = false; + *prev_p = image->next; if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS) ret = false; |