aboutsummaryrefslogtreecommitdiff
path: root/runtime/src/kmp_affinity.h
blob: 8ed3415cbe32b4840a3e3d5a0b8f592149abc8f1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
/*
 * kmp_affinity.h -- header for affinity management
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

extern int __kmp_affinity_compact; /* Affinity 'compact' value */

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth)
      : depth(_depth), leader(FALSE) {
    }
    Address &operator=(const Address &b) {
        depth = b.depth;
        for (unsigned i = 0; i < depth; i++) {
            labels[i] = b.labels[i];
            childNums[i] = b.childNums[i];
        }
        leader = FALSE;
        return *this;
    }
    bool operator==(const Address &b) const {
        if (depth != b.depth)
            return false;
        for (unsigned i = 0; i < depth; i++)
            if(labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool isClose(const Address &b, int level) const {
        if (depth != b.depth)
            return false;
        if ((unsigned)level >= depth)
            return true;
        for (unsigned i = 0; i < (depth - level); i++)
            if(labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool operator!=(const Address &b) const {
        return !operator==(b);
    }
    void print() const {
        unsigned i;
        printf("Depth: %u --- ", depth);
        for(i=0;i<depth;i++) {
            printf("%u ", labels[i]);
        }
    }
};

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
    void print() const {
        printf("first = "); first.print();
        printf(" --- second = %u", second);
    }
    bool operator==(const AddrUnsPair &b) const {
        if(first != b.first) return false;
        if(second != b.second) return false;
        return true;
    }
    bool operator!=(const AddrUnsPair &b) const {
        return !operator==(b);
    }
};


static int
__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i  = 0; i < depth; i++) {
        if (aa->labels[i] < bb->labels[i]) return -1;
        if (aa->labels[i] > bb->labels[i]) return 1;
    }
    return 0;
}


static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}


/** A structure for holding machine-specific hierarchy info to be computed once at init.
    This structure represents a mapping of threads to the actual machine hierarchy, or to
    our best guess at what the hierarchy might be, for the purpose of performing an
    efficient barrier.  In the worst case, when there is no machine hierarchy information,
    it produces a tree suitable for a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
    /** Good default values for number of leaves and branching factor, given no affinity information.
	Behaves a bit like hyper barrier. */
    static const kmp_uint32 maxLeaves=4;
    static const kmp_uint32 minBranch=4;
    /** Number of levels in the hierarchy. Typical levels are threads/core, cores/package
	or socket, packages/node, nodes/machine, etc.  We don't want to get specific with
	nomenclature.  When the machine is oversubscribed we add levels to duplicate the
	hierarchy, doubling the thread capacity of the hierarchy each time we add a level. */
    kmp_uint32 maxLevels;

    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
        number of levels along the longest path from root to any leaf. It corresponds to the
        number of entries in numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    kmp_uint32 base_num_threads;
    enum init_status { initialized=0, not_initialized=1, initializing=2 };
    volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized, 2=initialization in progress
    volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

    /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
        node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
    kmp_uint32 *numPerLevel;
    kmp_uint32 *skipPerLevel;

    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i=hier_depth-1; i>=0; --i) {
            int max = -1;
            for (int j=0; j<num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max+1;
            ++level;
        }
    }

    hierarchy_info() : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

    void fini() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }

    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, not_initialized, initializing);
        if (bool_result == 0) { // Wait for initialization
            while (TCR_1(uninitialized) != initialized) KMP_CPU_PAUSE();
            return;
        }
        KMP_DEBUG_ASSERT(bool_result==1);

        /* Added explicit initialization of the data fields here to prevent usage of dirty value
           observed when static library is re-initialized multiple times (e.g. when
           non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
        depth = 1;
        resizing = 0;
        maxLevels = 7;
        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
        skipPerLevel = &(numPerLevel[maxLevels]);
        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Sort table by physical ID
        if (adr2os) {
            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
            deriveLevels(adr2os, num_addrs);
        }
        else {
            numPerLevel[0] = maxLeaves;
            numPerLevel[1] = num_addrs/maxLeaves;
            if (num_addrs%maxLeaves) numPerLevel[1]++;
        }

        base_num_threads = num_addrs;
        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
                depth++;

        kmp_uint32 branch = minBranch;
        if (numPerLevel[0] == 1) branch = num_addrs/maxLeaves;
        if (branch<minBranch) branch=minBranch;
        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>maxLeaves)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = minBranch;
            }
        }

        for (kmp_uint32 i=1; i<depth; ++i)
            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
        // Fill in hierarchy in the case of oversubscription
        for (kmp_uint32 i=depth; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        uninitialized = initialized; // One writer

    }

    // Resize the hierarchy if nproc changes to something larger than before
    void resize(kmp_uint32 nproc)
    {
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
        while (bool_result == 0) { // someone else is trying to resize
            KMP_CPU_PAUSE();
            if (nproc <= base_num_threads)  // happy with other thread's resize
                return;
            else // try to resize
                bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
        }
        KMP_DEBUG_ASSERT(bool_result!=0);
        if (nproc <= base_num_threads) return; // happy with other thread's resize

        // Calculate new maxLevels
        kmp_uint32 old_sz = skipPerLevel[depth-1];
        kmp_uint32 incs = 0, old_maxLevels = maxLevels;
        // First see if old maxLevels is enough to contain new size
        for (kmp_uint32 i=depth; i<maxLevels && nproc>old_sz; ++i) {
            skipPerLevel[i] = 2*skipPerLevel[i-1];
            numPerLevel[i-1] *= 2;
            old_sz *= 2;
            depth++;
        }
        if (nproc > old_sz) { // Not enough space, need to expand hierarchy
            while (nproc > old_sz) {
                old_sz *=2;
                incs++;
                depth++;
            }
            maxLevels += incs;

            // Resize arrays
            kmp_uint32 *old_numPerLevel = numPerLevel;
            kmp_uint32 *old_skipPerLevel = skipPerLevel;
            numPerLevel = skipPerLevel = NULL;
            numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
            skipPerLevel = &(numPerLevel[maxLevels]);

            // Copy old elements from old arrays
            for (kmp_uint32 i=0; i<old_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
                numPerLevel[i] = old_numPerLevel[i];
                skipPerLevel[i] = old_skipPerLevel[i];
            }

            // Init new elements in arrays to 1
            for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
                numPerLevel[i] = 1;
                skipPerLevel[i] = 1;
            }

            // Free old arrays
            __kmp_free(old_numPerLevel);
        }

        // Fill in oversubscription levels of hierarchy
        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        base_num_threads = nproc;
        resizing = 0; // One writer

    }
};
#endif // KMP_AFFINITY_H