aboutsummaryrefslogtreecommitdiff
path: root/runtime/src/kmp_affinity.cpp
diff options
context:
space:
mode:
authorJonathan Peyton <jonathan.l.peyton@intel.com>2018-07-09 17:51:13 +0000
committerJonathan Peyton <jonathan.l.peyton@intel.com>2018-07-09 17:51:13 +0000
commit0afe745b655d87338689a9655aa466ad1acade4f (patch)
treede8a334e4ba12fa6d630a355a0eccb5499d2c982 /runtime/src/kmp_affinity.cpp
parentebe25f327bae857aeffdfb72aebc91db9ab3b218 (diff)
[OpenMP] Introduce hierarchical scheduling
This patch introduces the logic implementing hierarchical scheduling. First and foremost, hierarchical scheduling is off by default. To enable, use -DLIBOMP_USE_HIER_SCHED=On during CMake's configure stage. This work is based off of the IWOMP paper: "Workstealing and Nested Parallelism in SMP Systems" Hierarchical scheduling is the layering of OpenMP schedules for different layers of the memory hierarchy. One can have multiple layers between the threads and the global iterations space. The threads will go up the hierarchy to grab iterations, using possibly a different schedule & chunk for each layer. [ Global iteration space (0-999) ] (use static) [ L1 | L1 | L1 | L1 ] (use dynamic,1) [ T0 T1 | T2 T3 | T4 T5 | T6 T7 ] In the example shown above, there are 8 threads and 4 L1 caches being targeted. If the topology indicates that there are two threads per core, then two consecutive threads will share the data of one L1 cache unit. This example would have the iteration space (0-999) split statically across the four L1 caches (so the first L1 would get (0-249), the second would get (250-499), etc). Then the threads will use a dynamic,1 schedule to grab iterations from the L1 cache units. There are currently four supported layers: L1, L2, L3, NUMA. OMP_SCHEDULE can now read a hierarchical schedule with this syntax: OMP_SCHEDULE='EXPERIMENTAL LAYER,SCHED[,CHUNK][:LAYER,SCHED[,CHUNK]...]:SCHED,CHUNK' And OMP_SCHEDULE can still read the normal SCHED,CHUNK syntax from before. I've kept most of the hierarchical scheduling logic inside kmp_dispatch_hier.h to try to keep it separate from the rest of the code. Differential Revision: https://reviews.llvm.org/D47962 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@336571 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'runtime/src/kmp_affinity.cpp')
-rw-r--r--runtime/src/kmp_affinity.cpp87
1 file changed, 87 insertions, 0 deletions
diff --git a/runtime/src/kmp_affinity.cpp b/runtime/src/kmp_affinity.cpp
index b7da8d4..0ccbb45 100644
--- a/runtime/src/kmp_affinity.cpp
+++ b/runtime/src/kmp_affinity.cpp
@@ -17,6 +17,9 @@
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
+#if KMP_USE_HIER_SCHED
+#include "kmp_dispatch_hier.h"
+#endif
// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;
@@ -1895,6 +1898,76 @@ static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
return 0;
}
+#if KMP_USE_HIER_SCHED
// Set the array sizes for the hierarchy layers.
// Fills in __kmp_hier_max_units[] (number of units in each layer) and
// __kmp_hier_threads_per[] (hardware threads sharing one unit of each layer)
// from the topology globals nPackages, nCoresPerPkg, __kmp_nThreadsPerCore
// and __kmp_ncores.  Both arrays are indexed by layer value + 1.
static void __kmp_dispatch_set_hierarchy_values() {
  // Set the maximum number of L1's to number of cores
  // Set the maximum number of L2's to either number of cores / 2 for
  // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing
  // Or the number of cores for Intel(R) Xeon(R) processors
  // Set the maximum number of NUMA nodes and L3's to number of packages
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
  // On Knights Landing and newer MIC parts, two cores share one L2 unit.
  if (__kmp_mic_type >= mic3)
    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
  else
#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
  // The loop layer spans the whole iteration space: exactly one unit.
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
  // Set the number of threads per unit
  // Number of hardware threads per L1/L2/L3/NUMA/LOOP
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
      __kmp_nThreadsPerCore;
#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
  // Two cores per L2 on KNL => twice the threads of one core per L2 unit.
  if (__kmp_mic_type >= mic3)
    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
        2 * __kmp_nThreadsPerCore;
  else
#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
        __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
}
+
+// Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
+// i.e., this thread's L1 or this thread's L2, etc.
+int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
+ int index = type + 1;
+ int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
+ KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
+ if (type == kmp_hier_layer_e::LAYER_THREAD)
+ return tid;
+ else if (type == kmp_hier_layer_e::LAYER_LOOP)
+ return 0;
+ KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
+ if (tid >= num_hw_threads)
+ tid = tid % num_hw_threads;
+ return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
+}
+
+// Return the number of t1's per t2
+int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
+ int i1 = t1 + 1;
+ int i2 = t2 + 1;
+ KMP_DEBUG_ASSERT(i1 <= i2);
+ KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
+ KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
+ KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
+ // (nthreads/t2) / (nthreads/t1) = t1 / t2
+ return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
+}
+#endif // KMP_USE_HIER_SCHED
+
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
// affinity map.
static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
@@ -3953,12 +4026,22 @@ static AddrUnsPair *address2os = NULL;
static int *procarr = NULL;
static int __kmp_aff_depth = 0;
#if KMP_USE_HIER_SCHED
// Bail out of affinity initialization when affinity is "none".
// NOTE(review): expands to several statements ending in `return;`, so it may
// only be used as a complete statement inside a void function; deliberately
// NOT a do{}while(0) macro because it must return from the caller.
// The hierarchical-scheduling build additionally records topology sizes via
// __kmp_dispatch_set_hierarchy_values() before returning.
#define KMP_EXIT_AFF_NONE                                                      \
  KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
  KMP_ASSERT(address2os == NULL);                                              \
  __kmp_apply_thread_places(NULL, 0);                                          \
  __kmp_create_affinity_none_places();                                         \
  __kmp_dispatch_set_hierarchy_values();                                       \
  return;
#else
#define KMP_EXIT_AFF_NONE                                                      \
  KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
  KMP_ASSERT(address2os == NULL);                                              \
  __kmp_apply_thread_places(NULL, 0);                                          \
  __kmp_create_affinity_none_places();                                         \
  return;
#endif
// Create a one element mask array (set of places) which only contains the
// initial process's affinity mask
@@ -4300,6 +4383,10 @@ static void __kmp_aux_affinity_initialize(void) {
KMP_ASSERT(address2os != NULL);
}
+#if KMP_USE_HIER_SCHED
+ __kmp_dispatch_set_hierarchy_values();
+#endif
+
if (address2os == NULL) {
if (KMP_AFFINITY_CAPABLE() &&
(__kmp_affinity_verbose ||