aboutsummaryrefslogtreecommitdiff
path: root/libgomp/plugin
diff options
context:
space:
mode:
author Thomas Schwinge <tschwinge@baylibre.com> 2024-06-05 12:40:50 +0200
committer Thomas Schwinge <tschwinge@baylibre.com> 2024-06-06 13:41:47 +0200
commit 5bbe5350a0932c78d4ffce292ba4104a6fe6ef96 (patch)
tree 85f53058f83c4185c39ff6a1a132545b9cb5045a /libgomp/plugin
parent b4e68dd9084e48ee3e83c11d7f27548d8cca7066 (diff)
nvptx offloading: Global constructor, destructor support, via nvptx-tools 'ld'
This extends commit d9c90c82d900fdae95df4499bf5f0a4ecb903b53
"nvptx target: Global constructor, destructor support, via nvptx-tools 'ld'"
for offloading.

libgcc/
* config/nvptx/gbl-ctors.c ["mgomp"]
(__do_global_ctors__entry__mgomp)
(__do_global_dtors__entry__mgomp): New.
[!"mgomp"] (__do_global_ctors__entry, __do_global_dtors__entry): New.

libgomp/
* plugin/plugin-nvptx.c (nvptx_do_global_cdtors): New.
(nvptx_close_device, GOMP_OFFLOAD_load_image)
(GOMP_OFFLOAD_unload_image): Call it.
Diffstat (limited to 'libgomp/plugin')
-rw-r--r-- libgomp/plugin/plugin-nvptx.c 117
1 files changed, 116 insertions, 1 deletions
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 4cedc5390a3..0f3a3be1898 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -346,6 +346,11 @@ static struct ptx_device **ptx_devices;
default is set here. */
static unsigned lowlat_pool_size = 8 * 1024;
+static bool nvptx_do_global_cdtors (CUmodule, struct ptx_device *,
+ const char *);
+static size_t nvptx_stacks_size ();
+static void *nvptx_stacks_acquire (struct ptx_device *, size_t, int);
+
static inline struct nvptx_thread *
nvptx_thread (void)
{
@@ -565,6 +570,18 @@ nvptx_close_device (struct ptx_device *ptx_dev)
if (!ptx_dev)
return true;
+ bool ret = true;
+
+ for (struct ptx_image_data *image = ptx_dev->images;
+ image != NULL;
+ image = image->next)
+ {
+ if (!nvptx_do_global_cdtors (image->module, ptx_dev,
+ "__do_global_dtors__entry"
+ /* or "__do_global_dtors__entry__mgomp" */))
+ ret = false;
+ }
+
for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
{
struct ptx_free_block *b_next = b->next;
@@ -585,7 +602,8 @@ nvptx_close_device (struct ptx_device *ptx_dev)
CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
free (ptx_dev);
- return true;
+
+ return ret;
}
static int
@@ -1317,6 +1335,93 @@ nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}
+/* Invoke MODULE's global constructors/destructors.
+
+ FUNCNAME is the device entry point to run. It is looked up in MODULE
+ with cuModuleGetFunction; if that reports CUDA_ERROR_NOT_FOUND, the
+ lookup is retried with "__mgomp" appended to FUNCNAME. If neither name
+ is found, nothing is launched and the call succeeds.
+
+ A function that was found is launched once on the NULL stream, with all
+ six dimension arguments of cuLaunchKernel set to 1, and the NULL stream
+ is then synchronized. The "__mgomp" variant additionally receives one
+ kernel parameter, a soft-stack pointer obtained from PTX_DEV, and runs
+ with PTX_DEV's omp_stacks lock held.
+
+ Return true on success (including "no such function"). Return false,
+ after reporting via GOMP_PLUGIN_error, if the lookup fails for any other
+ reason, or if the launch or the synchronization fails. */
+
+static bool
+nvptx_do_global_cdtors (CUmodule module, struct ptx_device *ptx_dev,
+ const char *funcname)
+{
+ bool ret = true;
+ char *funcname_mgomp = NULL;
+ CUresult r;
+ CUfunction funcptr;
+ r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
+ &funcptr, module, funcname);
+ GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
+ funcname, cuda_error (r));
+ if (r == CUDA_ERROR_NOT_FOUND)
+ {
+ /* Try '[funcname]__mgomp'.
+
+ The suffixed name is built in a heap buffer; the second memcpy also
+ copies the suffix's terminating NUL. FUNCNAME is repointed at that
+ buffer so that every message below names the function that was
+ actually looked up. The buffer is released at the end of this
+ function. */
+
+ size_t funcname_len = strlen (funcname);
+ const char *mgomp_suffix = "__mgomp";
+ size_t mgomp_suffix_len = strlen (mgomp_suffix);
+ funcname_mgomp
+ = GOMP_PLUGIN_malloc (funcname_len + mgomp_suffix_len + 1);
+ memcpy (funcname_mgomp, funcname, funcname_len);
+ memcpy (funcname_mgomp + funcname_len,
+ mgomp_suffix, mgomp_suffix_len + 1);
+ funcname = funcname_mgomp;
+
+ r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
+ &funcptr, module, funcname);
+ GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
+ funcname, cuda_error (r));
+ }
+ if (r == CUDA_ERROR_NOT_FOUND)
+ ; /* Not an error: neither name was found in MODULE, so nothing to run. */
+ else if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuModuleGetFunction (%s) error: %s",
+ funcname, cuda_error (r));
+ ret = false;
+ }
+ else
+ {
+ /* If necessary, set up soft stack.
+
+ Only the '__mgomp' variant is passed a stack argument; a non-NULL
+ FUNCNAME_MGOMP is what identifies that variant here. The lock
+ taken below is released only after the stream has been
+ synchronized. */
+ void *nvptx_stacks_0;
+ void *kargs[1];
+ if (funcname_mgomp)
+ {
+ size_t stack_size = nvptx_stacks_size ();
+ pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
+ nvptx_stacks_0 = nvptx_stacks_acquire (ptx_dev, stack_size, 1); /* NOTE(review): last argument is presumably the number of stacks -- confirm. */
+ nvptx_stacks_0 += stack_size; /* 'void *' arithmetic (GNU C); presumably the stack grows downward from this address -- TODO confirm. */
+ kargs[0] = &nvptx_stacks_0;
+ }
+ r = CUDA_CALL_NOCHECK (cuLaunchKernel,
+ funcptr,
+ 1, 1, 1, 1, 1, 1,
+ /* sharedMemBytes */ 0,
+ /* hStream */ NULL,
+ /* kernelParams */ funcname_mgomp ? kargs : NULL,
+ /* extra */ NULL);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuLaunchKernel (%s) error: %s",
+ funcname, cuda_error (r));
+ ret = false;
+ }
+
+ r = CUDA_CALL_NOCHECK (cuStreamSynchronize,
+ NULL); /* Reached even if the launch above failed. */
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuStreamSynchronize (%s) error: %s",
+ funcname, cuda_error (r));
+ ret = false;
+ }
+
+ if (funcname_mgomp)
+ pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+ }
+
+ if (funcname_mgomp)
+ free (funcname_mgomp);
+
+ return ret;
+}
+
/* Load the (partial) program described by TARGET_DATA to device
number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
will contain the on-device addresses of the functions for reverse offload.
@@ -1546,6 +1651,11 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
nvptx_set_clocktick (module, dev);
+ if (!nvptx_do_global_cdtors (module, dev,
+ "__do_global_ctors__entry"
+ /* or "__do_global_ctors__entry__mgomp" */))
+ return -1;
+
return fn_entries + var_entries + other_entries;
}
@@ -1571,6 +1681,11 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
if (image->target_data == target_data)
{
+ if (!nvptx_do_global_cdtors (image->module, dev,
+ "__do_global_dtors__entry"
+ /* or "__do_global_dtors__entry__mgomp" */))
+ ret = false;
+
*prev_p = image->next;
if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
ret = false;