author     Andrey Churbanov <Andrey.Churbanov@intel.com>  2018-03-05 18:42:01 +0000
committer  Andrey Churbanov <Andrey.Churbanov@intel.com>  2018-03-05 18:42:01 +0000
commit     ec2d1867560e0536734dbf1a8f9f2c85c4647b29 (patch)
tree       5be8d731aa8038b1b61de47199cf73f5c3d19753
parent     74d67b2801fdb2a08024b896400d31a33496f8fa (diff)
Improve OpenMP threadprivate implementation.
Patch by Terry Wilmarth

Differential Revision: https://reviews.llvm.org/D41914

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@326733 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--  runtime/src/kmp.h                   |   4
-rw-r--r--  runtime/src/kmp_runtime.cpp         | 149
-rw-r--r--  runtime/src/kmp_threadprivate.cpp   | 140
3 files changed, 182 insertions(+), 111 deletions(-)
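
Before this change, __kmp_expand_threads treated an existing threadprivate cache as a hard limit: once __kmp_tp_cached was set, __kmp_threads could not grow past __kmp_tp_capacity and the expansion simply gave up. The patch instead records, for every cache, the compiler's cache pointer and the global data it serves (the new compiler_cache and data fields), so the runtime can reallocate each cache at the larger capacity and redirect the compiler's pointer to the new storage. A minimal stand-alone sketch of that bookkeeping idea, with invented names (cached_addr, cache_list, resize_all) and plain calloc in place of the runtime's __kmp_allocate:

#include <cstdlib>
#include <cstring>

struct cached_addr {
  void **addr;             // the cache array handed back to the compiler
  void ***compiler_cache;  // where the compiler keeps its pointer to addr
  void *data;              // the threadprivate global this cache serves
  cached_addr *next;       // linked list used for resize and cleanup
};

static cached_addr *cache_list = nullptr;

// Grow every registered cache to new_cap slots and repoint the compiler.
static void resize_all(int old_cap, int new_cap) {
  for (cached_addr *c = cache_list; c != nullptr; c = c->next) {
    void **bigger = (void **)calloc(new_cap, sizeof(void *));
    memcpy(bigger, c->addr, old_cap * sizeof(void *));
    *c->compiler_cache = bigger;  // the real patch does this with a CAS
    c->addr = bigger;             // the real patch links a fresh node instead
  }
}
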
diff --git a/runtime/src/kmp.h b/runtime/src/kmp.h
index b36bbea..c7e2965 100644
--- a/runtime/src/kmp.h
+++ b/runtime/src/kmp.h
@@ -1444,6 +1444,8 @@ typedef void *(*kmpc_cctor_vec)(void *, void *,
/* keeps track of threadprivate cache allocations for cleanup later */
typedef struct kmp_cached_addr {
void **addr; /* address of allocated cache */
+ void ***compiler_cache; /* pointer to compiler's cache */
+ void *data; /* pointer to global data */
struct kmp_cached_addr *next; /* pointer to next cached address */
} kmp_cached_addr_t;
@@ -3774,6 +3776,8 @@ void kmp_threadprivate_insert_private_data(int gtid, void *pc_addr,
struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr,
void *data_addr,
size_t pc_size);
+void __kmp_threadprivate_resize_cache(int newCapacity);
+void __kmp_cleanup_threadprivate_caches();
// ompc_, kmpc_ entries moved from omp.h.
#if KMP_OS_WINDOWS
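
The two new fields let a kmp_cached_addr_t node carry everything needed to resize or retire a cache later: the cache storage itself, the compiler's pointer to it, and the global data it belongs to. Elsewhere in the patch each node is placed at the tail of the cache's own allocation (tp_cache_addr = (kmp_cached_addr_t *)&my_cache[__kmp_tp_capacity]), so a single block holds both and freeing the cache frees the node as well. A small sketch of that layout under assumed names (cached_addr, make_cache), again with calloc standing in for __kmp_allocate:

#include <cstdlib>

struct cached_addr {
  void **addr;
  void ***compiler_cache;
  void *data;
  cached_addr *next;
};

static cached_addr *make_cache(int capacity, void *data,
                               void ***compiler_cache) {
  // One zeroed block: 'capacity' pointer slots followed by the node itself.
  void **cache =
      (void **)calloc(1, sizeof(void *) * capacity + sizeof(cached_addr));
  cached_addr *node = (cached_addr *)&cache[capacity];  // node sits at the tail
  node->addr = cache;  // so freeing the cache releases the node too
  node->data = data;
  node->compiler_cache = compiler_cache;
  return node;  // caller links the node into the cleanup list
}
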
diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp
index f902f04..a092a0e 100644
--- a/runtime/src/kmp_runtime.cpp
+++ b/runtime/src/kmp_runtime.cpp
@@ -3508,8 +3508,14 @@ static int __kmp_reclaim_dead_roots(void) {
If any argument is negative, the behavior is undefined. */
static int __kmp_expand_threads(int nNeed) {
int added = 0;
- int old_tp_cached;
- int __kmp_actual_max_nth;
+ int minimumRequiredCapacity;
+ int newCapacity;
+ kmp_info_t **newThreads;
+ kmp_root_t **newRoot;
+
+// All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
+// resizing __kmp_threads does not need additional protection, even if
+// foreign threads are present.
#if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
/* only for Windows static library */
@@ -3525,91 +3531,64 @@ static int __kmp_expand_threads(int nNeed) {
if (nNeed <= 0)
return added;
- while (1) {
- int nTarget;
- int minimumRequiredCapacity;
- int newCapacity;
- kmp_info_t **newThreads;
- kmp_root_t **newRoot;
-
- // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
- // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
- // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
- // > __kmp_max_nth in one of two ways:
- //
- // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
- // may not be reused by another thread, so we may need to increase
- // __kmp_threads_capacity to __kmp_max_nth + 1.
- //
- // 2) New foreign root(s) are encountered. We always register new foreign
- // roots. This may cause a smaller # of threads to be allocated at
- // subsequent parallel regions, but the worker threads hang around (and
- // eventually go to sleep) and need slots in the __kmp_threads[] array.
- //
- // Anyway, that is the reason for moving the check to see if
- // __kmp_max_nth was exceeded into __kmp_reserve_threads()
- // instead of having it performed here. -BB
- old_tp_cached = __kmp_tp_cached;
- __kmp_actual_max_nth =
- old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
- KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
-
- /* compute expansion headroom to check if we can expand */
- nTarget = nNeed;
- if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
- /* possible expansion too small -- give up */
- break;
- }
- minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
-
- newCapacity = __kmp_threads_capacity;
- do {
- newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1)
- ? (newCapacity << 1)
- : __kmp_actual_max_nth;
- } while (newCapacity < minimumRequiredCapacity);
- newThreads = (kmp_info_t **)__kmp_allocate(
- (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity +
- CACHE_LINE);
- newRoot = (kmp_root_t **)((char *)newThreads +
- sizeof(kmp_info_t *) * newCapacity);
- KMP_MEMCPY(newThreads, __kmp_threads,
- __kmp_threads_capacity * sizeof(kmp_info_t *));
- KMP_MEMCPY(newRoot, __kmp_root,
- __kmp_threads_capacity * sizeof(kmp_root_t *));
- memset(newThreads + __kmp_threads_capacity, 0,
- (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *));
- memset(newRoot + __kmp_threads_capacity, 0,
- (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *));
-
- if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
- /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has
- allocated a threadprivate cache while we were allocating the expanded
- array, and our new capacity is larger than the threadprivate cache
- capacity, so we should deallocate the expanded arrays and try again.
- This is the first check of a double-check pair. */
- __kmp_free(newThreads);
- continue; /* start over and try again */
- }
+ // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
+ // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
+ // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
+ // > __kmp_max_nth in one of two ways:
+ //
+ // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
+ // may not be reused by another thread, so we may need to increase
+ // __kmp_threads_capacity to __kmp_max_nth + 1.
+ //
+ // 2) New foreign root(s) are encountered. We always register new foreign
+ // roots. This may cause a smaller # of threads to be allocated at
+ // subsequent parallel regions, but the worker threads hang around (and
+ // eventually go to sleep) and need slots in the __kmp_threads[] array.
+ //
+ // Anyway, that is the reason for moving the check to see if
+ // __kmp_max_nth was exceeded into __kmp_reserve_threads()
+ // instead of having it performed here. -BB
+
+ KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
+
+ /* compute expansion headroom to check if we can expand */
+ if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
+ /* possible expansion too small -- give up */
+ return added;
+ }
+ minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
+
+ newCapacity = __kmp_threads_capacity;
+ do {
+ newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
+ : __kmp_sys_max_nth;
+ } while (newCapacity < minimumRequiredCapacity);
+ newThreads = (kmp_info_t **)__kmp_allocate(
+ (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
+ newRoot =
+ (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
+ KMP_MEMCPY(newThreads, __kmp_threads,
+ __kmp_threads_capacity * sizeof(kmp_info_t *));
+ KMP_MEMCPY(newRoot, __kmp_root,
+ __kmp_threads_capacity * sizeof(kmp_root_t *));
+
+ kmp_info_t **temp_threads = __kmp_threads;
+ *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
+ *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
+ __kmp_free(temp_threads);
+ added += newCapacity - __kmp_threads_capacity;
+ *(volatile int *)&__kmp_threads_capacity = newCapacity;
+
+ if (newCapacity > __kmp_tp_capacity) {
__kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
- if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
- /* Same check as above, but this time with the lock so we can be sure if
- we can succeed. */
- __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
- __kmp_free(newThreads);
- continue; /* start over and try again */
- } else {
- /* success */
- // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be
- // investigated.
- *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
- *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
- added += newCapacity - __kmp_threads_capacity;
- *(volatile int *)&__kmp_threads_capacity = newCapacity;
- __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
- break; /* succeeded, so we can exit the loop */
+ if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
+ __kmp_threadprivate_resize_cache(newCapacity);
+ } else { // increase __kmp_tp_capacity to correspond with __kmp_threads size
+ *(volatile int *)&__kmp_tp_capacity = newCapacity;
}
+ __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
}
+
return added;
}
@@ -7333,6 +7312,8 @@ void __kmp_cleanup(void) {
__kmp_init_serial = FALSE;
}
+ __kmp_cleanup_threadprivate_caches();
+
for (f = 0; f < __kmp_threads_capacity; f++) {
if (__kmp_root[f] != NULL) {
__kmp_free(__kmp_root[f]);
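
The rewritten __kmp_expand_threads above grows the capacity geometrically: it doubles the current __kmp_threads_capacity until the requested minimum is reached, never going past __kmp_sys_max_nth. A tiny stand-alone program walking through that loop with made-up numbers (32 existing slots, 100 more requested, a system cap of 1024), just to show the arithmetic:

#include <cstdio>

int main() {
  // Made-up values for illustration only.
  int sys_max_nth = 1024;     // stands in for __kmp_sys_max_nth
  int threads_capacity = 32;  // stands in for __kmp_threads_capacity
  int nNeed = 100;            // extra slots requested by the caller

  int minimumRequiredCapacity = threads_capacity + nNeed;  // 132
  int newCapacity = threads_capacity;
  do {  // 32 -> 64 -> 128 -> 256, stopping once 256 >= 132
    newCapacity = newCapacity <= (sys_max_nth >> 1) ? (newCapacity << 1)
                                                    : sys_max_nth;
  } while (newCapacity < minimumRequiredCapacity);

  printf("capacity grows from %d to %d (needed at least %d)\n",
         threads_capacity, newCapacity, minimumRequiredCapacity);
  return 0;
}
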
diff --git a/runtime/src/kmp_threadprivate.cpp b/runtime/src/kmp_threadprivate.cpp
index 362c075..e64097b 100644
--- a/runtime/src/kmp_threadprivate.cpp
+++ b/runtime/src/kmp_threadprivate.cpp
@@ -594,6 +594,13 @@ void *__kmpc_threadprivate(ident_t *loc, kmp_int32 global_tid, void *data,
return ret;
}
+static kmp_cached_addr_t *__kmp_find_cache(void *data) {
+ kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list;
+ while (ptr && ptr->data != data)
+ ptr = ptr->next;
+ return ptr;
+}
+
/*!
@ingroup THREADPRIVATE
@param loc source location information
@@ -620,35 +627,40 @@ __kmpc_threadprivate_cached(ident_t *loc,
if (TCR_PTR(*cache) == 0) {
__kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
- __kmp_tp_cached = 1;
- __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
+ // Compiler often passes in NULL cache, even if it's already been created
void **my_cache;
- KMP_ITT_IGNORE(
- my_cache = (void **)__kmp_allocate(
- sizeof(void *) * __kmp_tp_capacity + sizeof(kmp_cached_addr_t)););
- // No need to zero the allocated memory; __kmp_allocate does that.
- KC_TRACE(
- 50,
- ("__kmpc_threadprivate_cached: T#%d allocated cache at address %p\n",
- global_tid, my_cache));
-
- /* TODO: free all this memory in __kmp_common_destroy using
- * __kmp_threadpriv_cache_list */
- /* Add address of mycache to linked list for cleanup later */
kmp_cached_addr_t *tp_cache_addr;
-
- tp_cache_addr = (kmp_cached_addr_t *)&my_cache[__kmp_tp_capacity];
- tp_cache_addr->addr = my_cache;
- tp_cache_addr->next = __kmp_threadpriv_cache_list;
- __kmp_threadpriv_cache_list = tp_cache_addr;
-
+ // Look for an existing cache
+ tp_cache_addr = __kmp_find_cache(data);
+ if (!tp_cache_addr) { // Cache was never created; do it now
+ __kmp_tp_cached = 1;
+ KMP_ITT_IGNORE(my_cache = (void **)__kmp_allocate(
+ sizeof(void *) * __kmp_tp_capacity +
+ sizeof(kmp_cached_addr_t)););
+ // No need to zero the allocated memory; __kmp_allocate does that.
+ KC_TRACE(50, ("__kmpc_threadprivate_cached: T#%d allocated cache at "
+ "address %p\n",
+ global_tid, my_cache));
+ /* TODO: free all this memory in __kmp_common_destroy using
+ * __kmp_threadpriv_cache_list */
+ /* Add address of mycache to linked list for cleanup later */
+ tp_cache_addr = (kmp_cached_addr_t *)&my_cache[__kmp_tp_capacity];
+ tp_cache_addr->addr = my_cache;
+ tp_cache_addr->data = data;
+ tp_cache_addr->compiler_cache = cache;
+ tp_cache_addr->next = __kmp_threadpriv_cache_list;
+ __kmp_threadpriv_cache_list = tp_cache_addr;
+ } else { // A cache was already created; use it
+ my_cache = tp_cache_addr->addr;
+ tp_cache_addr->compiler_cache = cache;
+ }
KMP_MB();
TCW_PTR(*cache, my_cache);
+ __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
KMP_MB();
}
-
__kmp_release_lock(&__kmp_global_lock, global_tid);
}
@@ -661,10 +673,68 @@ __kmpc_threadprivate_cached(ident_t *loc,
KC_TRACE(10,
("__kmpc_threadprivate_cached: T#%d exiting; return value = %p\n",
global_tid, ret));
-
return ret;
}
+// This function should only be called when both __kmp_tp_cached_lock and
+// __kmp_forkjoin_lock are held.
+void __kmp_threadprivate_resize_cache(int newCapacity) {
+ KC_TRACE(10, ("__kmp_threadprivate_resize_cache: called with size: %d\n",
+ newCapacity));
+
+ kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list;
+
+ while (ptr) {
+ if (ptr->data) { // this location has an active cache; resize it
+ void **my_cache;
+ KMP_ITT_IGNORE(my_cache =
+ (void **)__kmp_allocate(sizeof(void *) * newCapacity +
+ sizeof(kmp_cached_addr_t)););
+ // No need to zero the allocated memory; __kmp_allocate does that.
+ KC_TRACE(50, ("__kmp_threadprivate_resize_cache: allocated cache at %p\n",
+ my_cache));
+ // Now copy old cache into new cache
+ void **old_cache = ptr->addr;
+ for (int i = 0; i < __kmp_tp_capacity; ++i) {
+ my_cache[i] = old_cache[i];
+ }
+
+ // Add address of new my_cache to linked list for cleanup later
+ kmp_cached_addr_t *tp_cache_addr;
+ tp_cache_addr = (kmp_cached_addr_t *)&my_cache[newCapacity];
+ tp_cache_addr->addr = my_cache;
+ tp_cache_addr->data = ptr->data;
+ tp_cache_addr->compiler_cache = ptr->compiler_cache;
+ tp_cache_addr->next = __kmp_threadpriv_cache_list;
+ __kmp_threadpriv_cache_list = tp_cache_addr;
+
+ // Copy new cache to compiler's location: We can copy directly
+ // to (*compiler_cache) if compiler guarantees it will keep
+ // using the same location for the cache. This is not yet true
+ // for some compilers, in which case we have to check if
+ // compiler_cache is still pointing at old cache, and if so, we
+ // can point it at the new cache with an atomic compare&swap
+ // operation. (Old method will always work, but we should shift
+ // to new method (commented line below) when Intel and Clang
+ // compilers use new method.)
+ (void)KMP_COMPARE_AND_STORE_PTR(tp_cache_addr->compiler_cache, old_cache,
+ my_cache);
+ //TCW_PTR(*(tp_cache_addr->compiler_cache), my_cache);
+
+ // If the store doesn't happen here, the compiler's old behavior will
+ // inevitably call __kmpc_threadprivate_cached with a new location for the
+ // cache, and that function will store the resized cache there at that
+ // point.
+
+ // Nullify old cache's data pointer so we skip it next time
+ ptr->data = NULL;
+ }
+ ptr = ptr->next;
+ }
+ // After all caches are resized, update __kmp_tp_capacity to the new size
+ *(volatile int *)&__kmp_tp_capacity = newCapacity;
+}
+
/*!
@ingroup THREADPRIVATE
@param loc source location information
@@ -701,14 +771,30 @@ void __kmpc_threadprivate_register_vec(ident_t *loc, void *data,
d_tn->dt.dtorv = dtor;
d_tn->is_vec = TRUE;
d_tn->vec_len = (size_t)vector_length;
- /*
- d_tn->obj_init = 0; // AC: commented out because __kmp_allocate
- zeroes the memory
- d_tn->pod_init = 0;
- */
+ // d_tn->obj_init = 0; // AC: __kmp_allocate zeroes the memory
+ // d_tn->pod_init = 0;
lnk_tn = &(__kmp_threadprivate_d_table.data[KMP_HASH(data)]);
d_tn->next = *lnk_tn;
*lnk_tn = d_tn;
}
}
+
+void __kmp_cleanup_threadprivate_caches() {
+ kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list;
+
+ while (ptr) {
+ void **cache = ptr->addr;
+ __kmp_threadpriv_cache_list = ptr->next;
+ if (*ptr->compiler_cache)
+ *ptr->compiler_cache = NULL;
+ ptr->compiler_cache = NULL;
+ ptr->data = NULL;
+ ptr->addr = NULL;
+ ptr->next = NULL;
+ // Threadprivate data pointed at by cache entries are destroyed at end of
+ // __kmp_launch_thread with __kmp_common_destroy_gtid.
+ __kmp_free(cache); // implicitly frees ptr too
+ ptr = __kmp_threadpriv_cache_list;
+ }
+}
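
__kmp_cleanup_threadprivate_caches depends on the single-allocation layout: each node lives inside the cache block it describes, so the list head is advanced and the node's fields cleared before the block is freed, and the one __kmp_free releases both. A self-contained sketch of that walk, reusing the assumed cached_addr layout from the sketches above and free in place of __kmp_free:

#include <cstdlib>

struct cached_addr {
  void **addr;
  void ***compiler_cache;
  void *data;
  cached_addr *next;
};

static cached_addr *cache_list = nullptr;  // stands in for __kmp_threadpriv_cache_list

static void cleanup_caches(void) {
  cached_addr *ptr = cache_list;
  while (ptr) {
    void **cache = ptr->addr;
    cache_list = ptr->next;            // unlink first: ptr dies with the cache
    if (*ptr->compiler_cache)
      *ptr->compiler_cache = nullptr;  // clear the compiler's now-stale pointer
    free(cache);                       // the node was allocated inside this block
    ptr = cache_list;                  // continue from the new list head
  }
}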