aboutsummaryrefslogtreecommitdiff
path: root/libgomp
diff options
context:
space:
mode:
authorjules <jules@138bc75d-0d04-0410-961f-82ee72b054a4>2015-04-08 15:58:33 +0000
committerjules <jules@138bc75d-0d04-0410-961f-82ee72b054a4>2015-04-08 15:58:33 +0000
commit0a1fe572e7c1b7e1770bd68f116528657e2ce125 (patch)
tree60d71a75181c79b311aaa199f917cb19ed8c06f7 /libgomp
parent1ddffe86a11e6a8f80ea4535523439f1af6ae3bb (diff)
gcc/
* config/nvptx/mkoffload.c (process): Support variable mapping. libgomp/ * libgomp.h (target_mem_desc): Remove mem_map field. (acc_dispatch_t): Remove open_device_func, close_device_func, get_device_num_func, set_device_num_func, target_data members. Change create_thread_data_func argument to device number instead of generic pointer. * oacc-async.c (assert.h): Include. (acc_async_test, acc_async_test_all, acc_wait, acc_wait_async) (acc_wait_all, acc_wait_all_async): Use current host thread's active device, not base_dev. * oacc-cuda.c (acc_get_current_cuda_device) (acc_get_current_cuda_context, acc_get_cuda_stream) (acc_set_cuda_stream): Likewise. * oacc-host.c (host_dispatch): Don't set open_device_func, close_device_func, get_device_num_func or set_device_num_func. * oacc-init.c (base_dev, init_key): Remove. (cached_base_dev): New. (name_of_acc_device_t): New. (acc_init_1): Initialise default-numbered device, not zeroth. (acc_shutdown_1): Close all devices of a given type. (goacc_destroy_thread): Don't use base_dev. (lazy_open, lazy_init, lazy_init_and_open): Remove. (goacc_attach_host_thread_to_device): New. (acc_init): Reimplement with goacc_attach_host_thread_to_device. (acc_get_num_devices): Don't use base_dev. (acc_set_device_type): Reimplement. (acc_get_device_type): Don't use base_dev. (acc_get_device_num): Tweak logic. (acc_set_device_num): Likewise. (acc_on_device): Use acc_get_device_type. (goacc_runtime_initialize): Initialize cached_base_dev not base_dev. (goacc_lazy_initialize): Reimplement with acc_init and goacc_attach_host_thread_to_device. * oacc-int.h (goacc_thread): Add base_dev field. (base_dev): Remove extern declaration. (goacc_attach_host_thread_to_device): Add prototype. * oacc-mem.c (acc_malloc): Use current thread's device instead of base_dev. (acc_free): Likewise. (acc_memcpy_to_device): Likewise. (acc_memcpy_from_device): Likewise. * oacc-parallel.c (select_acc_device): Remove. Replace calls with goacc_lazy_initialize (throughout).
(GOACC_parallel): Use tgt_offset to locate target functions. * target.c (gomp_map_vars): Don't set tgt->mem_map. (gomp_unmap_vars): Use devicep->mem_map pointer not tgt->mem_map. (gomp_load_plugin_for_device): Remove open_device, close_device, get_device_num, set_device_num openacc hook initialisation. Don't set openacc.target_data. * plugin/plugin-host.c (GOMP_OFFLOAD_openacc_open_device) (GOMP_OFFLOAD_openacc_close_device) (GOMP_OFFLOAD_openacc_get_device_num) (GOMP_OFFLOAD_openacc_set_device_num): Remove. (GOMP_OFFLOAD_openacc_create_thread_data): Change (unused) argument to int. * plugin/plugin-nvptx.c (ptx_inited): Remove. (instantiated_devices, ptx_dev_lock): New. (struct ptx_image_data): New. (ptx_devices, ptx_images, ptx_image_lock): New. (fini_streams_for_device): Reorder cuStreamDestroy call. (nvptx_get_num_devices): Remove forward declaration. (nvptx_init): Change return type to bool. (nvptx_fini): Remove. (nvptx_attach_host_thread_to_device): New. (nvptx_open_device): Return struct ptx_device* instead of void*. (nvptx_close_device): Change argument type to struct ptx_device*, return type to void. (nvptx_get_num_devices): Use instantiated_devices not ptx_inited. (kernel_target_data, kernel_host_table): Remove static globals. (GOMP_OFFLOAD_register_image, GOMP_OFFLOAD_get_table): Remove. (GOMP_OFFLOAD_init_device): Reimplement. (GOMP_OFFLOAD_fini_device): Likewise. (GOMP_OFFLOAD_load_image, GOMP_OFFLOAD_unload_image): New. (GOMP_OFFLOAD_alloc, GOMP_OFFLOAD_free, GOMP_OFFLOAD_dev2host) (GOMP_OFFLOAD_host2dev): Use ORD argument. (GOMP_OFFLOAD_openacc_open_device) (GOMP_OFFLOAD_openacc_close_device) (GOMP_OFFLOAD_openacc_set_device_num) (GOMP_OFFLOAD_openacc_get_device_num): Remove. (GOMP_OFFLOAD_openacc_create_thread_data): Change argument to int (device number). libgomp/testsuite/ * libgomp.oacc-c-c++-common/lib-9.c: Fix devnum check in test. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@221922 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libgomp')
-rw-r--r--libgomp/ChangeLog86
-rw-r--r--libgomp/libgomp.h17
-rw-r--r--libgomp/oacc-async.c44
-rw-r--r--libgomp/oacc-cuda.c40
-rw-r--r--libgomp/oacc-host.c7
-rw-r--r--libgomp/oacc-init.c424
-rw-r--r--libgomp/oacc-int.h8
-rw-r--r--libgomp/oacc-mem.c16
-rw-r--r--libgomp/oacc-parallel.c36
-rw-r--r--libgomp/plugin/plugin-host.c27
-rw-r--r--libgomp/plugin/plugin-nvptx.c318
-rw-r--r--libgomp/target.c8
-rw-r--r--libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c2
13 files changed, 583 insertions, 450 deletions
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index aa1468f1543..4b0a1c91be6 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,89 @@
+2015-04-08 Julian Brown <julian@codesourcery.com>
+
+ * libgomp.h (target_mem_desc): Remove mem_map field.
+ (acc_dispatch_t): Remove open_device_func, close_device_func,
+ get_device_num_func, set_device_num_func, target_data members.
+ Change create_thread_data_func argument to device number instead of
+ generic pointer.
+ * oacc-async.c (assert.h): Include.
+ (acc_async_test, acc_async_test_all, acc_wait, acc_wait_async)
+ (acc_wait_all, acc_wait_all_async): Use current host thread's
+ active device, not base_dev.
+ * oacc-cuda.c (acc_get_current_cuda_device)
+ (acc_get_current_cuda_context, acc_get_cuda_stream)
+ (acc_set_cuda_stream): Likewise.
+ * oacc-host.c (host_dispatch): Don't set open_device_func,
+ close_device_func, get_device_num_func or set_device_num_func.
+ * oacc-init.c (base_dev, init_key): Remove.
+ (cached_base_dev): New.
+ (name_of_acc_device_t): New.
+ (acc_init_1): Initialise default-numbered device, not zeroth.
+ (acc_shutdown_1): Close all devices of a given type.
+ (goacc_destroy_thread): Don't use base_dev.
+ (lazy_open, lazy_init, lazy_init_and_open): Remove.
+ (goacc_attach_host_thread_to_device): New.
+ (acc_init): Reimplement with goacc_attach_host_thread_to_device.
+ (acc_get_num_devices): Don't use base_dev.
+ (acc_set_device_type): Reimplement.
+ (acc_get_device_type): Don't use base_dev.
+ (acc_get_device_num): Tweak logic.
+ (acc_set_device_num): Likewise.
+ (acc_on_device): Use acc_get_device_type.
+ (goacc_runtime_initialize): Initialize cached_base_dev not base_dev.
+ (goacc_lazy_initialize): Reimplement with acc_init and
+ goacc_attach_host_thread_to_device.
+ * oacc-int.h (goacc_thread): Add base_dev field.
+ (base_dev): Remove extern declaration.
+ (goacc_attach_host_thread_to_device): Add prototype.
+ * oacc-mem.c (acc_malloc): Use current thread's device instead of
+ base_dev.
+ (acc_free): Likewise.
+ (acc_memcpy_to_device): Likewise.
+ (acc_memcpy_from_device): Likewise.
+ * oacc-parallel.c (select_acc_device): Remove. Replace calls with
+ goacc_lazy_initialize (throughout).
+ (GOACC_parallel): Use tgt_offset to locate target functions.
+ * target.c (gomp_map_vars): Don't set tgt->mem_map.
+ (gomp_unmap_vars): Use devicep->mem_map pointer not tgt->mem_map.
+ (gomp_load_plugin_for_device): Remove open_device, close_device,
+ get_device_num, set_device_num openacc hook initialisation. Don't set
+ openacc.target_data.
+ * plugin/plugin-host.c (GOMP_OFFLOAD_openacc_open_device)
+ (GOMP_OFFLOAD_openacc_close_device)
+ (GOMP_OFFLOAD_openacc_get_device_num)
+ (GOMP_OFFLOAD_openacc_set_device_num): Remove.
+ (GOMP_OFFLOAD_openacc_create_thread_data): Change (unused) argument
+ to int.
+ * plugin/plugin-nvptx.c (ptx_inited): Remove.
+ (instantiated_devices, ptx_dev_lock): New.
+ (struct ptx_image_data): New.
+ (ptx_devices, ptx_images, ptx_image_lock): New.
+ (fini_streams_for_device): Reorder cuStreamDestroy call.
+ (nvptx_get_num_devices): Remove forward declaration.
+ (nvptx_init): Change return type to bool.
+ (nvptx_fini): Remove.
+ (nvptx_attach_host_thread_to_device): New.
+ (nvptx_open_device): Return struct ptx_device* instead of void*.
+ (nvptx_close_device): Change argument type to struct ptx_device*,
+ return type to void.
+ (nvptx_get_num_devices): Use instantiated_devices not ptx_inited.
+ (kernel_target_data, kernel_host_table): Remove static globals.
+ (GOMP_OFFLOAD_register_image, GOMP_OFFLOAD_get_table): Remove.
+ (GOMP_OFFLOAD_init_device): Reimplement.
+ (GOMP_OFFLOAD_fini_device): Likewise.
+ (GOMP_OFFLOAD_load_image, GOMP_OFFLOAD_unload_image): New.
+ (GOMP_OFFLOAD_alloc, GOMP_OFFLOAD_free, GOMP_OFFLOAD_dev2host)
+ (GOMP_OFFLOAD_host2dev): Use ORD argument.
+ (GOMP_OFFLOAD_openacc_open_device)
+ (GOMP_OFFLOAD_openacc_close_device)
+ (GOMP_OFFLOAD_openacc_set_device_num)
+ (GOMP_OFFLOAD_openacc_get_device_num): Remove.
+ (GOMP_OFFLOAD_openacc_create_thread_data): Change argument to int
+ (device number).
+
+ testsuite/
+ * libgomp.oacc-c-c++-common/lib-9.c: Fix devnum check in test.
+
2015-04-06 Ilya Verbin <ilya.verbin@intel.com>
* libgomp-plugin.h (struct mapping_table): Replace with addr_pair.
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index a1d42c58d26..5272f0154b7 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -655,9 +655,6 @@ struct target_mem_desc {
/* Corresponding target device descriptor. */
struct gomp_device_descr *device_descr;
- /* Memory mapping info for the thread that created this descriptor. */
- struct splay_tree_s *mem_map;
-
/* List of splay keys to remove (or decrease refcount)
at the end of region. */
splay_tree_key list[];
@@ -691,18 +688,6 @@ typedef struct acc_dispatch_t
/* This is guarded by the lock in the "outer" struct gomp_device_descr. */
struct target_mem_desc *data_environ;
- /* Extra information required for a device instance by a given target. */
- /* This is guarded by the lock in the "outer" struct gomp_device_descr. */
- void *target_data;
-
- /* Open or close a device instance. */
- void *(*open_device_func) (int n);
- int (*close_device_func) (void *h);
-
- /* Set or get the device number. */
- int (*get_device_num_func) (void);
- void (*set_device_num_func) (int);
-
/* Execute. */
void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *,
unsigned short *, int, int, int, int, void *);
@@ -720,7 +705,7 @@ typedef struct acc_dispatch_t
void (*async_set_async_func) (int);
/* Create/destroy TLS data. */
- void *(*create_thread_data_func) (void *);
+ void *(*create_thread_data_func) (int);
void (*destroy_thread_data_func) (void *);
/* NVIDIA target specific routines. */
diff --git a/libgomp/oacc-async.c b/libgomp/oacc-async.c
index 08b7c5e1945..1f5827e79f6 100644
--- a/libgomp/oacc-async.c
+++ b/libgomp/oacc-async.c
@@ -26,7 +26,7 @@
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
-
+#include <assert.h>
#include "openacc.h"
#include "libgomp.h"
#include "oacc-int.h"
@@ -37,13 +37,23 @@ acc_async_test (int async)
if (async < acc_async_sync)
gomp_fatal ("invalid async argument: %d", async);
- return base_dev->openacc.async_test_func (async);
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (!thr || !thr->dev)
+ gomp_fatal ("no device active");
+
+ return thr->dev->openacc.async_test_func (async);
}
int
acc_async_test_all (void)
{
- return base_dev->openacc.async_test_all_func ();
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (!thr || !thr->dev)
+ gomp_fatal ("no device active");
+
+ return thr->dev->openacc.async_test_all_func ();
}
void
@@ -52,19 +62,34 @@ acc_wait (int async)
if (async < acc_async_sync)
gomp_fatal ("invalid async argument: %d", async);
- base_dev->openacc.async_wait_func (async);
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (!thr || !thr->dev)
+ gomp_fatal ("no device active");
+
+ thr->dev->openacc.async_wait_func (async);
}
void
acc_wait_async (int async1, int async2)
{
- base_dev->openacc.async_wait_async_func (async1, async2);
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (!thr || !thr->dev)
+ gomp_fatal ("no device active");
+
+ thr->dev->openacc.async_wait_async_func (async1, async2);
}
void
acc_wait_all (void)
{
- base_dev->openacc.async_wait_all_func ();
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (!thr || !thr->dev)
+ gomp_fatal ("no device active");
+
+ thr->dev->openacc.async_wait_all_func ();
}
void
@@ -73,5 +98,10 @@ acc_wait_all_async (int async)
if (async < acc_async_sync)
gomp_fatal ("invalid async argument: %d", async);
- base_dev->openacc.async_wait_all_async_func (async);
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (!thr || !thr->dev)
+ gomp_fatal ("no device active");
+
+ thr->dev->openacc.async_wait_all_async_func (async);
}
diff --git a/libgomp/oacc-cuda.c b/libgomp/oacc-cuda.c
index c8ef376e3a2..4aab4221a42 100644
--- a/libgomp/oacc-cuda.c
+++ b/libgomp/oacc-cuda.c
@@ -34,51 +34,53 @@
void *
acc_get_current_cuda_device (void)
{
- void *p = NULL;
+ struct goacc_thread *thr = goacc_thread ();
- if (base_dev && base_dev->openacc.cuda.get_current_device_func)
- p = base_dev->openacc.cuda.get_current_device_func ();
+ if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func)
+ return thr->dev->openacc.cuda.get_current_device_func ();
- return p;
+ return NULL;
}
void *
acc_get_current_cuda_context (void)
{
- void *p = NULL;
+ struct goacc_thread *thr = goacc_thread ();
- if (base_dev && base_dev->openacc.cuda.get_current_context_func)
- p = base_dev->openacc.cuda.get_current_context_func ();
-
- return p;
+ if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func)
+ return thr->dev->openacc.cuda.get_current_context_func ();
+
+ return NULL;
}
void *
acc_get_cuda_stream (int async)
{
- void *p = NULL;
+ struct goacc_thread *thr = goacc_thread ();
if (async < 0)
- return p;
-
- if (base_dev && base_dev->openacc.cuda.get_stream_func)
- p = base_dev->openacc.cuda.get_stream_func (async);
+ return NULL;
- return p;
+ if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
+ return thr->dev->openacc.cuda.get_stream_func (async);
+
+ return NULL;
}
int
acc_set_cuda_stream (int async, void *stream)
{
- int s = -1;
+ struct goacc_thread *thr;
if (async < 0 || stream == NULL)
return 0;
goacc_lazy_initialize ();
- if (base_dev && base_dev->openacc.cuda.set_stream_func)
- s = base_dev->openacc.cuda.set_stream_func (async, stream);
+ thr = goacc_thread ();
+
+ if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
+ return thr->dev->openacc.cuda.set_stream_func (async, stream);
- return s;
+ return -1;
}
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index e4756b67a77..6dcdbf3658e 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -53,16 +53,9 @@ static struct gomp_device_descr host_dispatch =
.host2dev_func = GOMP_OFFLOAD_host2dev,
.run_func = GOMP_OFFLOAD_run,
- .mem_map.root = NULL,
.is_initialized = false,
.openacc = {
- .open_device_func = GOMP_OFFLOAD_openacc_open_device,
- .close_device_func = GOMP_OFFLOAD_openacc_close_device,
-
- .get_device_num_func = GOMP_OFFLOAD_openacc_get_device_num,
- .set_device_num_func = GOMP_OFFLOAD_openacc_set_device_num,
-
.exec_func = GOMP_OFFLOAD_openacc_parallel,
.register_async_cleanup_func
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 1e0243ede44..dc40fb6ffe1 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -37,14 +37,13 @@
static gomp_mutex_t acc_device_lock;
-/* The dispatch table for the current accelerator device. This is global, so
- you can only have one type of device open at any given time in a program.
- This is the "base" device in that several devices that use the same
- dispatch table may be active concurrently: this one (the "zeroth") is used
- for overall initialisation/shutdown, and other instances -- not necessarily
- including this one -- may be opened and closed once the base device has
- been initialized. */
-struct gomp_device_descr *base_dev;
+/* A cached version of the dispatcher for the global "current" accelerator type,
+ e.g. used as the default when creating new host threads. This is the
+ device-type equivalent of goacc_device_num (which specifies which device to
+ use out of potentially several of the same type). If there are several
+ devices of a given type, this points at the first one. */
+
+static struct gomp_device_descr *cached_base_dev = NULL;
#if defined HAVE_TLS || defined USE_EMUTLS
__thread struct goacc_thread *goacc_tls_data;
@@ -53,9 +52,6 @@ pthread_key_t goacc_tls_key;
#endif
static pthread_key_t goacc_cleanup_key;
-/* Current dispatcher, and how it was initialized */
-static acc_device_t init_key = _ACC_device_hwm;
-
static struct goacc_thread *goacc_threads;
static gomp_mutex_t goacc_thread_lock;
@@ -94,6 +90,21 @@ get_openacc_name (const char *name)
return name;
}
+static const char *
+name_of_acc_device_t (enum acc_device_t type)
+{
+ switch (type)
+ {
+ case acc_device_none: return "none";
+ case acc_device_default: return "default";
+ case acc_device_host: return "host";
+ case acc_device_host_nonshm: return "host_nonshm";
+ case acc_device_not_host: return "not_host";
+ case acc_device_nvidia: return "nvidia";
+ default: gomp_fatal ("unknown device type %u", (unsigned) type);
+ }
+}
+
static struct gomp_device_descr *
resolve_device (acc_device_t d)
{
@@ -159,22 +170,87 @@ resolve_device (acc_device_t d)
static struct gomp_device_descr *
acc_init_1 (acc_device_t d)
{
- struct gomp_device_descr *acc_dev;
+ struct gomp_device_descr *base_dev, *acc_dev;
+ int ndevs;
- acc_dev = resolve_device (d);
+ base_dev = resolve_device (d);
+
+ ndevs = base_dev->get_num_devices_func ();
+
+ if (!base_dev || ndevs <= 0 || goacc_device_num >= ndevs)
+ gomp_fatal ("device %s not supported", name_of_acc_device_t (d));
- if (!acc_dev || acc_dev->get_num_devices_func () <= 0)
- gomp_fatal ("device %u not supported", (unsigned)d);
+ acc_dev = &base_dev[goacc_device_num];
if (acc_dev->is_initialized)
gomp_fatal ("device already active");
- /* We need to remember what we were intialized as, to check shutdown etc. */
- init_key = d;
-
gomp_init_device (acc_dev);
- return acc_dev;
+ return base_dev;
+}
+
+static void
+acc_shutdown_1 (acc_device_t d)
+{
+ struct gomp_device_descr *base_dev;
+ struct goacc_thread *walk;
+ int ndevs, i;
+ bool devices_active = false;
+
+ /* Get the base device for this device type. */
+ base_dev = resolve_device (d);
+
+ if (!base_dev)
+ gomp_fatal ("device %s not supported", name_of_acc_device_t (d));
+
+ gomp_mutex_lock (&goacc_thread_lock);
+
+ /* Free target-specific TLS data and close all devices. */
+ for (walk = goacc_threads; walk != NULL; walk = walk->next)
+ {
+ if (walk->target_tls)
+ base_dev->openacc.destroy_thread_data_func (walk->target_tls);
+
+ walk->target_tls = NULL;
+
+ /* This would mean the user is shutting down OpenACC in the middle of an
+ "acc data" pragma. Likely not intentional. */
+ if (walk->mapped_data)
+ gomp_fatal ("shutdown in 'acc data' region");
+
+ /* Similarly, if this happens then user code has done something weird. */
+ if (walk->saved_bound_dev)
+ gomp_fatal ("shutdown during host fallback");
+
+ if (walk->dev)
+ {
+ gomp_mutex_lock (&walk->dev->lock);
+ gomp_free_memmap (&walk->dev->mem_map);
+ gomp_mutex_unlock (&walk->dev->lock);
+
+ walk->dev = NULL;
+ walk->base_dev = NULL;
+ }
+ }
+
+ gomp_mutex_unlock (&goacc_thread_lock);
+
+ ndevs = base_dev->get_num_devices_func ();
+
+ /* Close all the devices of this type that have been opened. */
+ for (i = 0; i < ndevs; i++)
+ {
+ struct gomp_device_descr *acc_dev = &base_dev[i];
+ if (acc_dev->is_initialized)
+ {
+ devices_active = true;
+ gomp_fini_device (acc_dev);
+ }
+ }
+
+ if (!devices_active)
+ gomp_fatal ("no device initialized");
}
static struct goacc_thread *
@@ -207,9 +283,11 @@ goacc_destroy_thread (void *data)
if (thr)
{
- if (base_dev && thr->target_tls)
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ if (acc_dev && thr->target_tls)
{
- base_dev->openacc.destroy_thread_data_func (thr->target_tls);
+ acc_dev->openacc.destroy_thread_data_func (thr->target_tls);
thr->target_tls = NULL;
}
@@ -236,53 +314,49 @@ goacc_destroy_thread (void *data)
gomp_mutex_unlock (&goacc_thread_lock);
}
-/* Open the ORD'th device of the currently-active type (base_dev must be
- initialised before calling). If ORD is < 0, open the default-numbered
- device (set by the ACC_DEVICE_NUM environment variable or a call to
- acc_set_device_num), or leave any currently-opened device as is. "Opening"
- consists of calling the device's open_device_func hook, and setting up
- thread-local data (maybe allocating, then initializing with information
- pertaining to the newly-opened or previously-opened device). */
+/* Use the ORD'th device instance for the current host thread (or -1 for the
+ current global default). The device (and the runtime) must be initialised
+ before calling this function. */
-static void
-lazy_open (int ord)
+void
+goacc_attach_host_thread_to_device (int ord)
{
struct goacc_thread *thr = goacc_thread ();
- struct gomp_device_descr *acc_dev;
-
- if (thr && thr->dev)
- {
- assert (ord < 0 || ord == thr->dev->target_id);
- return;
- }
-
- assert (base_dev);
-
+ struct gomp_device_descr *acc_dev = NULL, *base_dev = NULL;
+ int num_devices;
+
+ if (thr && thr->dev && (thr->dev->target_id == ord || ord < 0))
+ return;
+
if (ord < 0)
ord = goacc_device_num;
-
- /* The OpenACC 2.0 spec leaves the runtime's behaviour when an out-of-range
- device is requested as implementation-defined (4.2 ACC_DEVICE_NUM).
- We choose to raise an error in such a case. */
- if (ord >= base_dev->get_num_devices_func ())
- gomp_fatal ("device %u does not exist", ord);
-
+
+ /* Decide which type of device to use. If the current thread has a device
+ type already (e.g. set by acc_set_device_type), use that, else use the
+ global default. */
+ if (thr && thr->base_dev)
+ base_dev = thr->base_dev;
+ else
+ {
+ assert (cached_base_dev);
+ base_dev = cached_base_dev;
+ }
+
+ num_devices = base_dev->get_num_devices_func ();
+ if (num_devices <= 0 || ord >= num_devices)
+ gomp_fatal ("device %u out of range", ord);
+
if (!thr)
thr = goacc_new_thread ();
-
- acc_dev = thr->dev = &base_dev[ord];
-
- assert (acc_dev->target_id == ord);
-
+
+ thr->base_dev = base_dev;
+ thr->dev = acc_dev = &base_dev[ord];
thr->saved_bound_dev = NULL;
thr->mapped_data = NULL;
-
- if (!acc_dev->openacc.target_data)
- acc_dev->openacc.target_data = acc_dev->openacc.open_device_func (ord);
-
+
thr->target_tls
- = acc_dev->openacc.create_thread_data_func (acc_dev->openacc.target_data);
-
+ = acc_dev->openacc.create_thread_data_func (ord);
+
acc_dev->openacc.async_set_async_func (acc_async_sync);
}
@@ -292,74 +366,20 @@ lazy_open (int ord)
void
acc_init (acc_device_t d)
{
- if (!base_dev)
+ if (!cached_base_dev)
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
- base_dev = acc_init_1 (d);
-
- lazy_open (-1);
+ cached_base_dev = acc_init_1 (d);
gomp_mutex_unlock (&acc_device_lock);
+
+ goacc_attach_host_thread_to_device (-1);
}
ialias (acc_init)
-static void
-acc_shutdown_1 (acc_device_t d)
-{
- struct goacc_thread *walk;
-
- /* We don't check whether d matches the actual device found, because
- OpenACC 2.0 (3.2.12) says the parameters to the init and this
- call must match (for the shutdown call anyway, it's silent on
- others). */
-
- if (!base_dev)
- gomp_fatal ("no device initialized");
- if (d != init_key)
- gomp_fatal ("device %u(%u) is initialized",
- (unsigned) init_key, (unsigned) base_dev->type);
-
- gomp_mutex_lock (&goacc_thread_lock);
-
- /* Free target-specific TLS data and close all devices. */
- for (walk = goacc_threads; walk != NULL; walk = walk->next)
- {
- if (walk->target_tls)
- base_dev->openacc.destroy_thread_data_func (walk->target_tls);
-
- walk->target_tls = NULL;
-
- /* This would mean the user is shutting down OpenACC in the middle of an
- "acc data" pragma. Likely not intentional. */
- if (walk->mapped_data)
- gomp_fatal ("shutdown in 'acc data' region");
-
- if (walk->dev)
- {
- void *target_data = walk->dev->openacc.target_data;
- if (walk->dev->openacc.close_device_func (target_data) < 0)
- gomp_fatal ("failed to close device");
-
- walk->dev->openacc.target_data = target_data = NULL;
-
- gomp_mutex_lock (&walk->dev->lock);
- gomp_free_memmap (&walk->dev->mem_map);
- gomp_mutex_unlock (&walk->dev->lock);
-
- walk->dev = NULL;
- }
- }
-
- gomp_mutex_unlock (&goacc_thread_lock);
-
- gomp_fini_device (base_dev);
-
- base_dev = NULL;
-}
-
void
acc_shutdown (acc_device_t d)
{
@@ -372,59 +392,16 @@ acc_shutdown (acc_device_t d)
ialias (acc_shutdown)
-/* This function is called after plugins have been initialized. It deals with
- the "base" device, and is used to prepare the runtime for dealing with a
- number of such devices (as implemented by some particular plugin). If the
- argument device type D matches a previous call to the function, return the
- current base device, else shut the old device down and re-initialize with
- the new device type. */
-
-static struct gomp_device_descr *
-lazy_init (acc_device_t d)
-{
- if (base_dev)
- {
- /* Re-initializing the same device, do nothing. */
- if (d == init_key)
- return base_dev;
-
- acc_shutdown_1 (init_key);
- }
-
- assert (!base_dev);
-
- return acc_init_1 (d);
-}
-
-/* Ensure that plugins are loaded, initialize and open the (default-numbered)
- device. */
-
-static void
-lazy_init_and_open (acc_device_t d)
-{
- if (!base_dev)
- gomp_init_targets_once ();
-
- gomp_mutex_lock (&acc_device_lock);
-
- base_dev = lazy_init (d);
-
- lazy_open (-1);
-
- gomp_mutex_unlock (&acc_device_lock);
-}
-
int
acc_get_num_devices (acc_device_t d)
{
int n = 0;
- const struct gomp_device_descr *acc_dev;
+ struct gomp_device_descr *acc_dev;
if (d == acc_device_none)
return 0;
- if (!base_dev)
- gomp_init_targets_once ();
+ gomp_init_targets_once ();
acc_dev = resolve_device (d);
if (!acc_dev)
@@ -439,10 +416,39 @@ acc_get_num_devices (acc_device_t d)
ialias (acc_get_num_devices)
+/* Set the device type for the current thread only (using the current global
+ default device number), initialising that device if necessary. Also set the
+ default device type for new threads to D. */
+
void
acc_set_device_type (acc_device_t d)
{
- lazy_init_and_open (d);
+ struct gomp_device_descr *base_dev, *acc_dev;
+ struct goacc_thread *thr = goacc_thread ();
+
+ gomp_mutex_lock (&acc_device_lock);
+
+ if (!cached_base_dev)
+ gomp_init_targets_once ();
+
+ cached_base_dev = base_dev = resolve_device (d);
+ acc_dev = &base_dev[goacc_device_num];
+
+ if (!acc_dev->is_initialized)
+ gomp_init_device (acc_dev);
+
+ gomp_mutex_unlock (&acc_device_lock);
+
+ /* We're changing device type: invalidate the current thread's dev and
+ base_dev pointers. */
+ if (thr && thr->base_dev != base_dev)
+ {
+ thr->base_dev = thr->dev = NULL;
+ if (thr->mapped_data)
+ gomp_fatal ("acc_set_device_type in 'acc data' region");
+ }
+
+ goacc_attach_host_thread_to_device (-1);
}
ialias (acc_set_device_type)
@@ -451,10 +457,11 @@ acc_device_t
acc_get_device_type (void)
{
acc_device_t res = acc_device_none;
- const struct gomp_device_descr *dev;
+ struct gomp_device_descr *dev;
+ struct goacc_thread *thr = goacc_thread ();
- if (base_dev)
- res = acc_device_type (base_dev->type);
+ if (thr && thr->base_dev)
+ res = acc_device_type (thr->base_dev->type);
else
{
gomp_init_targets_once ();
@@ -475,78 +482,65 @@ int
acc_get_device_num (acc_device_t d)
{
const struct gomp_device_descr *dev;
- int num;
+ struct goacc_thread *thr = goacc_thread ();
if (d >= _ACC_device_hwm)
gomp_fatal ("device %u out of range", (unsigned)d);
- if (!base_dev)
+ if (!cached_base_dev)
gomp_init_targets_once ();
dev = resolve_device (d);
if (!dev)
- gomp_fatal ("no devices of type %u", d);
+ gomp_fatal ("device %s not supported", name_of_acc_device_t (d));
- /* We might not have called lazy_open for this host thread yet, in which case
- the get_device_num_func hook will return -1. */
- num = dev->openacc.get_device_num_func ();
- if (num < 0)
- num = goacc_device_num;
+ if (thr && thr->base_dev == dev && thr->dev)
+ return thr->dev->target_id;
- return num;
+ return goacc_device_num;
}
ialias (acc_get_device_num)
void
-acc_set_device_num (int n, acc_device_t d)
+acc_set_device_num (int ord, acc_device_t d)
{
- const struct gomp_device_descr *dev;
+ struct gomp_device_descr *base_dev, *acc_dev;
int num_devices;
- if (!base_dev)
+ if (!cached_base_dev)
gomp_init_targets_once ();
- if ((int) d == 0)
- {
- int i;
-
- /* A device setting of zero sets all device types on the system to use
- the Nth instance of that device type. Only attempt it for initialized
- devices though. */
- for (i = acc_device_not_host + 1; i < _ACC_device_hwm; i++)
- {
- dev = resolve_device (d);
- if (dev && dev->is_initialized)
- dev->openacc.set_device_num_func (n);
- }
+ if (ord < 0)
+ ord = goacc_device_num;
- /* ...and for future calls to acc_init/acc_set_device_type, etc. */
- goacc_device_num = n;
- }
+ if ((int) d == 0)
+ /* Set whatever device is being used by the current host thread to use
+ device instance ORD. It's unclear if this is supposed to affect other
+ host threads too (OpenACC 2.0 (3.2.4) acc_set_device_num). */
+ goacc_attach_host_thread_to_device (ord);
else
{
- struct goacc_thread *thr = goacc_thread ();
-
gomp_mutex_lock (&acc_device_lock);
- base_dev = lazy_init (d);
+ cached_base_dev = base_dev = resolve_device (d);
num_devices = base_dev->get_num_devices_func ();
- if (n >= num_devices)
- gomp_fatal ("device %u out of range", n);
+ if (ord >= num_devices)
+ gomp_fatal ("device %u out of range", ord);
- /* If we're changing the device number, de-associate this thread with
- the device (but don't close the device, since it may be in use by
- other threads). */
- if (thr && thr->dev && n != thr->dev->target_id)
- thr->dev = NULL;
+ acc_dev = &base_dev[ord];
- lazy_open (n);
+ if (!acc_dev->is_initialized)
+ gomp_init_device (acc_dev);
gomp_mutex_unlock (&acc_device_lock);
+
+ goacc_attach_host_thread_to_device (ord);
}
+
+ goacc_device_num = ord;
}
ialias (acc_set_device_num)
@@ -554,10 +548,7 @@ ialias (acc_set_device_num)
int
acc_on_device (acc_device_t dev)
{
- struct goacc_thread *thr = goacc_thread ();
-
- if (thr && thr->dev
- && acc_device_type (thr->dev->type) == acc_device_host_nonshm)
+ if (acc_get_device_type () == acc_device_host_nonshm)
return dev == acc_device_host_nonshm || dev == acc_device_not_host;
/* Just rely on the compiler builtin. */
@@ -577,7 +568,7 @@ goacc_runtime_initialize (void)
pthread_key_create (&goacc_cleanup_key, goacc_destroy_thread);
- base_dev = NULL;
+ cached_base_dev = NULL;
goacc_threads = NULL;
gomp_mutex_init (&goacc_thread_lock);
@@ -606,9 +597,8 @@ goacc_restore_bind (void)
}
/* This is called from any OpenACC support function that may need to implicitly
- initialize the libgomp runtime. On exit all such initialization will have
- been done, and both the global ACC_dev and the per-host-thread ACC_memmap
- pointers will be valid. */
+ initialize the libgomp runtime, either globally or from a new host thread.
+ On exit "goacc_thread" will return a valid & populated thread block. */
attribute_hidden void
goacc_lazy_initialize (void)
@@ -618,12 +608,8 @@ goacc_lazy_initialize (void)
if (thr && thr->dev)
return;
- if (!base_dev)
- lazy_init_and_open (acc_device_default);
+ if (!cached_base_dev)
+ acc_init (acc_device_default);
else
- {
- gomp_mutex_lock (&acc_device_lock);
- lazy_open (-1);
- gomp_mutex_unlock (&acc_device_lock);
- }
+ goacc_attach_host_thread_to_device (-1);
}
diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h
index 85619c8d10a..0ace737884a 100644
--- a/libgomp/oacc-int.h
+++ b/libgomp/oacc-int.h
@@ -56,6 +56,9 @@ acc_device_type (enum offload_target_type type)
struct goacc_thread
{
+ /* The base device for the current thread. */
+ struct gomp_device_descr *base_dev;
+
/* The device for the current thread. */
struct gomp_device_descr *dev;
@@ -89,10 +92,7 @@ goacc_thread (void)
#endif
void goacc_register (struct gomp_device_descr *) __GOACC_NOTHROW;
-
-/* Current dispatcher. */
-extern struct gomp_device_descr *base_dev;
-
+void goacc_attach_host_thread_to_device (int);
void goacc_runtime_initialize (void);
void goacc_save_and_set_bind (acc_device_t);
void goacc_restore_bind (void);
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index fdc82e654f9..89ef5fcd887 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -107,7 +107,9 @@ acc_malloc (size_t s)
struct goacc_thread *thr = goacc_thread ();
- return base_dev->alloc_func (thr->dev->target_id, s);
+ assert (thr->dev);
+
+ return thr->dev->alloc_func (thr->dev->target_id, s);
}
/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
@@ -122,6 +124,8 @@ acc_free (void *d)
if (!d)
return;
+ assert (thr && thr->dev);
+
/* We don't have to call lazy open here, as the ptr value must have
been returned by acc_malloc. It's not permitted to pass NULL in
(unless you got that null from acc_malloc). */
@@ -134,7 +138,7 @@ acc_free (void *d)
acc_unmap_data ((void *)(k->host_start + offset));
}
- base_dev->free_func (thr->dev->target_id, d);
+ thr->dev->free_func (thr->dev->target_id, d);
}
void
@@ -144,7 +148,9 @@ acc_memcpy_to_device (void *d, void *h, size_t s)
been obtained from a routine that did that. */
struct goacc_thread *thr = goacc_thread ();
- base_dev->host2dev_func (thr->dev->target_id, d, h, s);
+ assert (thr && thr->dev);
+
+ thr->dev->host2dev_func (thr->dev->target_id, d, h, s);
}
void
@@ -154,7 +160,9 @@ acc_memcpy_from_device (void *h, void *d, size_t s)
been obtained from a routine that did that. */
struct goacc_thread *thr = goacc_thread ();
- base_dev->dev2host_func (thr->dev->target_id, h, d, s);
+ assert (thr && thr->dev);
+
+ thr->dev->dev2host_func (thr->dev->target_id, h, d, s);
}
/* Return the device pointer that corresponds to host data H. Or NULL
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 563f9bb5b4b..d8999463d6d 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -49,32 +49,6 @@ find_pset (int pos, size_t mapnum, unsigned short *kinds)
return kind == GOMP_MAP_TO_PSET;
}
-
-/* Ensure that the target device for DEVICE_TYPE is initialised (and that
- plugins have been loaded if appropriate). The ACC_dev variable for the
- current thread will be set appropriately for the given device type on
- return. */
-
-attribute_hidden void
-select_acc_device (int device_type)
-{
- goacc_lazy_initialize ();
-
- if (device_type == GOMP_DEVICE_HOST_FALLBACK)
- return;
-
- if (device_type == acc_device_none)
- device_type = acc_device_host;
-
- if (device_type >= 0)
- {
- /* NOTE: this will go badly if the surrounding data environment is set up
- to use a different device type. We'll just have to trust that users
- know what they're doing... */
- acc_set_device_type (device_type);
- }
-}
-
static void goacc_wait (int async, int num_waits, va_list ap);
void
@@ -111,7 +85,7 @@ GOACC_parallel (int device, void (*fn) (void *),
__FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds,
async);
#endif
- select_acc_device (device);
+ goacc_lazy_initialize ();
thr = goacc_thread ();
acc_dev = thr->dev;
@@ -151,7 +125,7 @@ GOACC_parallel (int device, void (*fn) (void *),
if (tgt_fn_key == NULL)
gomp_fatal ("target function wasn't mapped");
- tgt_fn = (void (*)) tgt_fn_key->tgt->tgt_start;
+ tgt_fn = (void (*)) tgt_fn_key->tgt_offset;
}
else
tgt_fn = (void (*)) fn;
@@ -195,7 +169,7 @@ GOACC_data_start (int device, size_t mapnum,
__FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds);
#endif
- select_acc_device (device);
+ goacc_lazy_initialize ();
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
@@ -242,7 +216,7 @@ GOACC_enter_exit_data (int device, size_t mapnum,
bool data_enter = false;
size_t i;
- select_acc_device (device);
+ goacc_lazy_initialize ();
thr = goacc_thread ();
acc_dev = thr->dev;
@@ -429,7 +403,7 @@ GOACC_update (int device, size_t mapnum,
bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
size_t i;
- select_acc_device (device);
+ goacc_lazy_initialize ();
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
diff --git a/libgomp/plugin/plugin-host.c b/libgomp/plugin/plugin-host.c
index bc60f72d05e..1faf5bc194e 100644
--- a/libgomp/plugin/plugin-host.c
+++ b/libgomp/plugin/plugin-host.c
@@ -119,31 +119,6 @@ GOMP_OFFLOAD_unload_image (int n __attribute__ ((unused)),
}
STATIC void *
-GOMP_OFFLOAD_openacc_open_device (int n)
-{
- return (void *) (intptr_t) n;
-}
-
-STATIC int
-GOMP_OFFLOAD_openacc_close_device (void *hnd)
-{
- return 0;
-}
-
-STATIC int
-GOMP_OFFLOAD_openacc_get_device_num (void)
-{
- return 0;
-}
-
-STATIC void
-GOMP_OFFLOAD_openacc_set_device_num (int n)
-{
- if (n > 0)
- GOMP (fatal) ("device number %u out of range for host execution", n);
-}
-
-STATIC void *
GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t s)
{
return GOMP (malloc) (s);
@@ -254,7 +229,7 @@ GOMP_OFFLOAD_openacc_async_wait_all_async (int async __attribute__ ((unused)))
}
STATIC void *
-GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data
+GOMP_OFFLOAD_openacc_create_thread_data (int ord
__attribute__ ((unused)))
{
return NULL;
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 483cb7559e8..583ec87aeee 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -133,7 +133,8 @@ struct targ_fn_descriptor
const char *name;
};
-static bool ptx_inited = false;
+static unsigned int instantiated_devices = 0;
+static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
struct ptx_stream
{
@@ -331,9 +332,21 @@ struct ptx_event
struct ptx_event *next;
};
+/* Bookkeeping record for one offload image loaded by
+   GOMP_OFFLOAD_load_image.  Records are chained through NEXT.  */
+struct ptx_image_data
+{
+ /* The TARGET_DATA pointer the image was loaded with; compared against
+    when the image is unloaded.  */
+ void *target_data;
+ /* CUDA module produced by linking the image's PTX code.  */
+ CUmodule module;
+ /* Next record in the list, or NULL at the end.  */
+ struct ptx_image_data *next;
+};
+
static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;
+static struct ptx_device **ptx_devices;
+
+static struct ptx_image_data *ptx_images = NULL;
+static pthread_mutex_t ptx_image_lock = PTHREAD_MUTEX_INITIALIZER;
+
#define _XSTR(s) _STR(s)
#define _STR(s) #s
@@ -450,8 +463,8 @@ fini_streams_for_device (struct ptx_device *ptx_dev)
struct ptx_stream *s = ptx_dev->active_streams;
ptx_dev->active_streams = ptx_dev->active_streams->next;
- cuStreamDestroy (s->stream);
map_fini (s);
+ cuStreamDestroy (s->stream);
free (s);
}
@@ -575,21 +588,21 @@ select_stream_for_async (int async, pthread_t thread, bool create,
return stream;
}
-static int nvptx_get_num_devices (void);
-
-/* Initialize the device. */
-static int
+/* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
+ should be locked on entry and remains locked on exit. */
+static bool
nvptx_init (void)
{
CUresult r;
int rc;
+ int ndevs;
- if (ptx_inited)
- return nvptx_get_num_devices ();
+ if (instantiated_devices != 0)
+ return true;
rc = verify_device_library ();
if (rc < 0)
- return -1;
+ return false;
r = cuInit (0);
if (r != CUDA_SUCCESS)
@@ -599,22 +612,64 @@ nvptx_init (void)
pthread_mutex_init (&ptx_event_lock, NULL);
- ptx_inited = true;
+ r = cuDeviceGetCount (&ndevs);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
- return nvptx_get_num_devices ();
+ ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
+ * ndevs);
+
+ return true;
}
+/* Select the N'th PTX device for the current host thread. The device must
+ have been previously opened before calling this function. */
+
static void
-nvptx_fini (void)
+nvptx_attach_host_thread_to_device (int n)
{
- ptx_inited = false;
+ CUdevice dev;
+ CUresult r;
+ struct ptx_device *ptx_dev;
+ CUcontext thd_ctx;
+
+ r = cuCtxGetDevice (&dev);
+ if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
+ GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
+
+ if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
+ return;
+ else
+ {
+ CUcontext old_ctx;
+
+ ptx_dev = ptx_devices[n];
+ assert (ptx_dev);
+
+ r = cuCtxGetCurrent (&thd_ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+
+ /* We don't necessarily have a current context (e.g. if it has been
+ destroyed).  Pop it if we do though. */
+ if (thd_ctx != NULL)
+ {
+ r = cuCtxPopCurrent (&old_ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
+ }
+
+ r = cuCtxPushCurrent (ptx_dev->ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
+ }
}
-static void *
+static struct ptx_device *
nvptx_open_device (int n)
{
struct ptx_device *ptx_dev;
- CUdevice dev;
+ CUdevice dev, ctx_dev;
CUresult r;
int async_engines, pi;
@@ -628,6 +683,21 @@ nvptx_open_device (int n)
ptx_dev->dev = dev;
ptx_dev->ctx_shared = false;
+ r = cuCtxGetDevice (&ctx_dev);
+ if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
+ GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
+
+ if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
+ {
+ /* The current host thread has an active context for a different device.
+ Detach it. */
+ CUcontext old_ctx;
+
+ r = cuCtxPopCurrent (&old_ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
+ }
+
r = cuCtxGetCurrent (&ptx_dev->ctx);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
@@ -678,17 +748,16 @@ nvptx_open_device (int n)
init_streams_for_device (ptx_dev, async_engines);
- return (void *) ptx_dev;
+ return ptx_dev;
}
-static int
-nvptx_close_device (void *targ_data)
+static void
+nvptx_close_device (struct ptx_device *ptx_dev)
{
CUresult r;
- struct ptx_device *ptx_dev = targ_data;
if (!ptx_dev)
- return 0;
+ return;
fini_streams_for_device (ptx_dev);
@@ -700,8 +769,6 @@ nvptx_close_device (void *targ_data)
}
free (ptx_dev);
-
- return 0;
}
static int
@@ -714,7 +781,7 @@ nvptx_get_num_devices (void)
order to enumerate available devices, but CUDA API routines can't be used
until cuInit has been called. Just call it now (but don't yet do any
further initialization). */
- if (!ptx_inited)
+ if (instantiated_devices == 0)
cuInit (0);
r = cuDeviceGetCount (&n);
@@ -1507,64 +1574,84 @@ GOMP_OFFLOAD_get_num_devices (void)
return nvptx_get_num_devices ();
}
-static void **kernel_target_data;
-static void **kernel_host_table;
-
void
-GOMP_OFFLOAD_register_image (void *host_table, void *target_data)
+GOMP_OFFLOAD_init_device (int n)
{
- kernel_target_data = target_data;
- kernel_host_table = host_table;
-}
+ pthread_mutex_lock (&ptx_dev_lock);
-void
-GOMP_OFFLOAD_init_device (int n __attribute__ ((unused)))
-{
- (void) nvptx_init ();
+ if (!nvptx_init () || ptx_devices[n] != NULL)
+ {
+ pthread_mutex_unlock (&ptx_dev_lock);
+ return;
+ }
+
+ ptx_devices[n] = nvptx_open_device (n);
+ instantiated_devices++;
+
+ pthread_mutex_unlock (&ptx_dev_lock);
}
void
-GOMP_OFFLOAD_fini_device (int n __attribute__ ((unused)))
+GOMP_OFFLOAD_fini_device (int n)
{
- nvptx_fini ();
+ pthread_mutex_lock (&ptx_dev_lock);
+
+ if (ptx_devices[n] != NULL)
+ {
+ nvptx_attach_host_thread_to_device (n);
+ nvptx_close_device (ptx_devices[n]);
+ ptx_devices[n] = NULL;
+ instantiated_devices--;
+ }
+
+ pthread_mutex_unlock (&ptx_dev_lock);
}
int
-GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)),
- struct mapping_table **tablep)
+GOMP_OFFLOAD_load_image (int ord, void *target_data,
+ struct addr_pair **target_table)
{
CUmodule module;
- void **fn_table;
- char **fn_names;
- int fn_entries, i;
+ char **fn_names, **var_names;
+ unsigned int fn_entries, var_entries, i, j;
CUresult r;
struct targ_fn_descriptor *targ_fns;
+ void **img_header = (void **) target_data;
+ struct ptx_image_data *new_image;
- if (nvptx_init () <= 0)
- return 0;
+ GOMP_OFFLOAD_init_device (ord);
- /* This isn't an error, because an image may legitimately have no offloaded
- regions and so will not call GOMP_offload_register. */
- if (kernel_target_data == NULL)
- return 0;
+ nvptx_attach_host_thread_to_device (ord);
+
+ link_ptx (&module, img_header[0]);
- link_ptx (&module, kernel_target_data[0]);
+ pthread_mutex_lock (&ptx_image_lock);
+ new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
+ new_image->target_data = target_data;
+ new_image->module = module;
+ new_image->next = ptx_images;
+ ptx_images = new_image;
+ pthread_mutex_unlock (&ptx_image_lock);
- /* kernel_target_data[0] -> ptx code
- kernel_target_data[1] -> variable mappings
- kernel_target_data[2] -> array of kernel names in ascii
+ /* The mkoffload utility emits a table of pointers/integers at the start of
+ each offload image:
- kernel_host_table[0] -> start of function addresses (__offload_func_table)
- kernel_host_table[1] -> end of function addresses (__offload_funcs_end)
+ img_header[0] -> ptx code
+ img_header[1] -> number of variables
+ img_header[2] -> array of variable names (pointers to strings)
+ img_header[3] -> number of kernels
+ img_header[4] -> array of kernel names (pointers to strings)
The array of kernel names and the functions addresses form a
one-to-one correspondence. */
- fn_table = kernel_host_table[0];
- fn_names = (char **) kernel_target_data[2];
- fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);
+ var_entries = (uintptr_t) img_header[1];
+ var_names = (char **) img_header[2];
+ fn_entries = (uintptr_t) img_header[3];
+ fn_names = (char **) img_header[4];
- *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries);
+ *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
+ * (fn_entries + var_entries));
targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
* fn_entries);
@@ -1579,38 +1666,86 @@ GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)),
targ_fns[i].fn = function;
targ_fns[i].name = (const char *) fn_names[i];
- (*tablep)[i].host_start = (uintptr_t) fn_table[i];
- (*tablep)[i].host_end = (*tablep)[i].host_start + 1;
- (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i];
- (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1;
+ (*target_table)[i].start = (uintptr_t) &targ_fns[i];
+ (*target_table)[i].end = (*target_table)[i].start + 1;
}
- return fn_entries;
+ for (j = 0; j < var_entries; j++, i++)
+ {
+ CUdeviceptr var;
+ size_t bytes;
+
+ r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
+
+ (*target_table)[i].start = (uintptr_t) var;
+ (*target_table)[i].end = (*target_table)[i].start + bytes;
+ }
+
+ return i;
+}
+
+/* Unload the offload image identified by TARGET_DATA: every record on the
+   PTX_IMAGES list whose target_data matches has its CUDA module unloaded and
+   is removed from the list and freed.  The list is walked and rewritten
+   under PTX_IMAGE_LOCK.  TID is unused.
+
+   TARGET_DATA points at the image header emitted by mkoffload, whose first
+   entry is the PTX code itself rather than heap memory owned by this plugin,
+   so it must not be passed to free.
+
+   NOTE(review): the function descriptor array allocated by
+   GOMP_OFFLOAD_load_image is not recorded in struct ptx_image_data, so it
+   cannot be released from here -- confirm whether it should be tracked.  */
+
+void
+GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data)
+{
+  struct ptx_image_data **link;
+
+  pthread_mutex_lock (&ptx_image_lock);
+  for (link = &ptx_images; *link != NULL;)
+    {
+      struct ptx_image_data *image = *link;
+
+      if (image->target_data == target_data)
+        {
+          /* Splice the record out of the list before releasing it; LINK
+             keeps pointing at the slot that now holds the successor.  */
+          *link = image->next;
+          cuModuleUnload (image->module);
+          free (image);
+        }
+      else
+        link = &image->next;
+    }
+  pthread_mutex_unlock (&ptx_image_lock);
+}
void *
-GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t size)
+GOMP_OFFLOAD_alloc (int ord, size_t size)
{
+ nvptx_attach_host_thread_to_device (ord);
return nvptx_alloc (size);
}
void
-GOMP_OFFLOAD_free (int n __attribute__ ((unused)), void *ptr)
+GOMP_OFFLOAD_free (int ord, void *ptr)
{
+ nvptx_attach_host_thread_to_device (ord);
nvptx_free (ptr);
}
void *
-GOMP_OFFLOAD_dev2host (int ord __attribute__ ((unused)), void *dst,
- const void *src, size_t n)
+GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
+ nvptx_attach_host_thread_to_device (ord);
return nvptx_dev2host (dst, src, n);
}
void *
-GOMP_OFFLOAD_host2dev (int ord __attribute__ ((unused)), void *dst,
- const void *src, size_t n)
+GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
+ nvptx_attach_host_thread_to_device (ord);
return nvptx_host2dev (dst, src, n);
}
@@ -1627,45 +1762,6 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
num_workers, vector_length, async, targ_mem_desc);
}
-void *
-GOMP_OFFLOAD_openacc_open_device (int n)
-{
- return nvptx_open_device (n);
-}
-
-int
-GOMP_OFFLOAD_openacc_close_device (void *h)
-{
- return nvptx_close_device (h);
-}
-
-void
-GOMP_OFFLOAD_openacc_set_device_num (int n)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- assert (n >= 0);
-
- if (!nvthd->ptx_dev || nvthd->ptx_dev->ord != n)
- (void) nvptx_open_device (n);
-}
-
-/* This can be called before the device is "opened" for the current thread, in
- which case we can't tell which device number should be returned. We don't
- actually want to open the device here, so just return -1 and let the caller
- (oacc-init.c:acc_get_device_num) handle it. */
-
-int
-GOMP_OFFLOAD_openacc_get_device_num (void)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- if (nvthd && nvthd->ptx_dev)
- return nvthd->ptx_dev->ord;
- else
- return -1;
-}
-
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
@@ -1729,14 +1825,18 @@ GOMP_OFFLOAD_openacc_async_set_async (int async)
}
void *
-GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data)
+GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
- struct ptx_device *ptx_dev = (struct ptx_device *) targ_data;
+ struct ptx_device *ptx_dev;
struct nvptx_thread *nvthd
= GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
CUresult r;
CUcontext thd_ctx;
+ ptx_dev = ptx_devices[ord];
+
+ assert (ptx_dev);
+
r = cuCtxGetCurrent (&thd_ctx);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
diff --git a/libgomp/target.c b/libgomp/target.c
index dfe7fb9dbf9..d8da7833aa9 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -178,7 +178,6 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
tgt->list_count = mapnum;
tgt->refcount = 1;
tgt->device_descr = devicep;
- tgt->mem_map = mem_map;
if (mapnum == 0)
return tgt;
@@ -597,7 +596,7 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
devicep->dev2host_func (devicep->target_id, (void *) k->host_start,
(void *) (k->tgt->tgt_start + k->tgt_offset),
k->host_end - k->host_start);
- splay_tree_remove (tgt->mem_map, k);
+ splay_tree_remove (&devicep->mem_map, k);
if (k->tgt->refcount > 1)
k->tgt->refcount--;
else
@@ -1159,10 +1158,6 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
{
optional_present = optional_total = 0;
DLSYM_OPT (openacc.exec, openacc_parallel);
- DLSYM_OPT (openacc.open_device, openacc_open_device);
- DLSYM_OPT (openacc.close_device, openacc_close_device);
- DLSYM_OPT (openacc.get_device_num, openacc_get_device_num);
- DLSYM_OPT (openacc.set_device_num, openacc_set_device_num);
DLSYM_OPT (openacc.register_async_cleanup,
openacc_register_async_cleanup);
DLSYM_OPT (openacc.async_test, openacc_async_test);
@@ -1271,7 +1266,6 @@ gomp_target_init (void)
current_device.mem_map.root = NULL;
current_device.is_initialized = false;
current_device.openacc.data_environ = NULL;
- current_device.openacc.target_data = NULL;
for (i = 0; i < new_num_devices; i++)
{
current_device.target_id = i;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c
index 84045dbe328..a4cf7f2e848 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c
@@ -58,7 +58,7 @@ main (int argc, char **argv)
acc_set_device_num (1, (acc_device_t) 0);
devnum = acc_get_device_num (devtype);
- if (devnum != 0)
+ if (devnum != 1)
abort ();
}