LLVM OpenMP* Runtime Library
kmp_affinity.h
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 
19 #if KMP_AFFINITY_SUPPORTED
20 #if KMP_USE_HWLOC
21 class KMPHwlocAffinity : public KMPAffinity {
22 public:
23  class Mask : public KMPAffinity::Mask {
24  hwloc_cpuset_t mask;
25 
26  public:
27  Mask() {
28  mask = hwloc_bitmap_alloc();
29  this->zero();
30  }
31  ~Mask() { hwloc_bitmap_free(mask); }
32  void set(int i) override { hwloc_bitmap_set(mask, i); }
33  bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
34  void clear(int i) override { hwloc_bitmap_clr(mask, i); }
35  void zero() override { hwloc_bitmap_zero(mask); }
36  void copy(const KMPAffinity::Mask *src) override {
37  const Mask *convert = static_cast<const Mask *>(src);
38  hwloc_bitmap_copy(mask, convert->mask);
39  }
40  void bitwise_and(const KMPAffinity::Mask *rhs) override {
41  const Mask *convert = static_cast<const Mask *>(rhs);
42  hwloc_bitmap_and(mask, mask, convert->mask);
43  }
44  void bitwise_or(const KMPAffinity::Mask *rhs) override {
45  const Mask *convert = static_cast<const Mask *>(rhs);
46  hwloc_bitmap_or(mask, mask, convert->mask);
47  }
48  void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
49  int begin() const override { return hwloc_bitmap_first(mask); }
50  int end() const override { return -1; }
51  int next(int previous) const override {
52  return hwloc_bitmap_next(mask, previous);
53  }
54  int get_system_affinity(bool abort_on_error) override {
55  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
56  "Illegal get affinity operation when not capable");
57  long retval =
58  hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
59  if (retval >= 0) {
60  return 0;
61  }
62  int error = errno;
63  if (abort_on_error) {
64  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
65  }
66  return error;
67  }
68  int set_system_affinity(bool abort_on_error) const override {
69  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
70  "Illegal set affinity operation when not capable");
71  long retval =
72  hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
73  if (retval >= 0) {
74  return 0;
75  }
76  int error = errno;
77  if (abort_on_error) {
78  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
79  }
80  return error;
81  }
82 #if KMP_OS_WINDOWS
83  int set_process_affinity(bool abort_on_error) const override {
84  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
85  "Illegal set process affinity operation when not capable");
86  int error = 0;
87  const hwloc_topology_support *support =
88  hwloc_topology_get_support(__kmp_hwloc_topology);
89  if (support->cpubind->set_proc_cpubind) {
90  int retval;
91  retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
92  HWLOC_CPUBIND_PROCESS);
93  if (retval >= 0)
94  return 0;
95  error = errno;
96  if (abort_on_error)
97  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
98  }
99  return error;
100  }
101 #endif
102  int get_proc_group() const override {
103  int group = -1;
104 #if KMP_OS_WINDOWS
105  if (__kmp_num_proc_groups == 1) {
106  return 1;
107  }
108  for (int i = 0; i < __kmp_num_proc_groups; i++) {
109  // On Windows, the long type is always 32 bits
110  unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
111  unsigned long second_32_bits =
112  hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
113  if (first_32_bits == 0 && second_32_bits == 0) {
114  continue;
115  }
116  if (group >= 0) {
117  return -1;
118  }
119  group = i;
120  }
121 #endif /* KMP_OS_WINDOWS */
122  return group;
123  }
124  };
125  void determine_capable(const char *var) override {
126  const hwloc_topology_support *topology_support;
127  if (__kmp_hwloc_topology == NULL) {
128  if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
129  __kmp_hwloc_error = TRUE;
130  if (__kmp_affinity_verbose)
131  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
132  }
133  if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
134  __kmp_hwloc_error = TRUE;
135  if (__kmp_affinity_verbose)
136  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
137  }
138  }
139  topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
140  // Is the system capable of setting/getting this thread's affinity?
141  // Also, is topology discovery possible? (pu indicates ability to discover
142  // processing units). And finally, were there no errors when calling any
143  // hwloc_* API functions?
144  if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
145  topology_support->cpubind->get_thisthread_cpubind &&
146  topology_support->discovery->pu && !__kmp_hwloc_error) {
147  // enables affinity according to KMP_AFFINITY_CAPABLE() macro
148  KMP_AFFINITY_ENABLE(TRUE);
149  } else {
150  // indicate that hwloc didn't work and disable affinity
151  __kmp_hwloc_error = TRUE;
152  KMP_AFFINITY_DISABLE();
153  }
154  }
155  void bind_thread(int which) override {
156  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
157  "Illegal set affinity operation when not capable");
158  KMPAffinity::Mask *mask;
159  KMP_CPU_ALLOC_ON_STACK(mask);
160  KMP_CPU_ZERO(mask);
161  KMP_CPU_SET(which, mask);
162  __kmp_set_system_affinity(mask, TRUE);
163  KMP_CPU_FREE_FROM_STACK(mask);
164  }
165  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
166  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
167  KMPAffinity::Mask *allocate_mask_array(int num) override {
168  return new Mask[num];
169  }
170  void deallocate_mask_array(KMPAffinity::Mask *array) override {
171  Mask *hwloc_array = static_cast<Mask *>(array);
172  delete[] hwloc_array;
173  }
174  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
175  int index) override {
176  Mask *hwloc_array = static_cast<Mask *>(array);
177  return &(hwloc_array[index]);
178  }
179  api_type get_api_type() const override { return HWLOC; }
180 };
181 #endif /* KMP_USE_HWLOC */
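
// Illustrative sketch (not part of the upstream header): how an affinity
// backend such as KMPHwlocAffinity above is typically driven through the
// abstract KMPAffinity interface. Assumes affinity has already been
// determined to be capable; guarded out so it is never compiled.
#if 0
static void example_bind_current_thread(KMPAffinity *backend, int cpu) {
  KMPAffinity::Mask *mask = backend->allocate_mask(); // backend-specific mask
  mask->zero();                                        // start from an empty set
  mask->set(cpu);                                      // request a single CPU
  mask->set_system_affinity(/*abort_on_error=*/false); // bind calling thread
  backend->deallocate_mask(mask);
}
#endif
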
182 
183 #if KMP_OS_LINUX || KMP_OS_FREEBSD
184 #if KMP_OS_LINUX
185 /* On some of the older OSes that we build on, these constants aren't present
186    in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
187    all systems of the same arch where they are defined, and they cannot
188    change; they are set in stone forever. */
189 #include <sys/syscall.h>
190 #if KMP_ARCH_X86 || KMP_ARCH_ARM
191 #ifndef __NR_sched_setaffinity
192 #define __NR_sched_setaffinity 241
193 #elif __NR_sched_setaffinity != 241
194 #error Wrong code for setaffinity system call.
195 #endif /* __NR_sched_setaffinity */
196 #ifndef __NR_sched_getaffinity
197 #define __NR_sched_getaffinity 242
198 #elif __NR_sched_getaffinity != 242
199 #error Wrong code for getaffinity system call.
200 #endif /* __NR_sched_getaffinity */
201 #elif KMP_ARCH_AARCH64
202 #ifndef __NR_sched_setaffinity
203 #define __NR_sched_setaffinity 122
204 #elif __NR_sched_setaffinity != 122
205 #error Wrong code for setaffinity system call.
206 #endif /* __NR_sched_setaffinity */
207 #ifndef __NR_sched_getaffinity
208 #define __NR_sched_getaffinity 123
209 #elif __NR_sched_getaffinity != 123
210 #error Wrong code for getaffinity system call.
211 #endif /* __NR_sched_getaffinity */
212 #elif KMP_ARCH_RISCV64
213 #ifndef __NR_sched_setaffinity
214 #define __NR_sched_setaffinity 122
215 #elif __NR_sched_setaffinity != 122
216 #error Wrong code for setaffinity system call.
217 #endif /* __NR_sched_setaffinity */
218 #ifndef __NR_sched_getaffinity
219 #define __NR_sched_getaffinity 123
220 #elif __NR_sched_getaffinity != 123
221 #error Wrong code for getaffinity system call.
222 #endif /* __NR_sched_getaffinity */
223 #elif KMP_ARCH_X86_64
224 #ifndef __NR_sched_setaffinity
225 #define __NR_sched_setaffinity 203
226 #elif __NR_sched_setaffinity != 203
227 #error Wrong code for setaffinity system call.
228 #endif /* __NR_sched_setaffinity */
229 #ifndef __NR_sched_getaffinity
230 #define __NR_sched_getaffinity 204
231 #elif __NR_sched_getaffinity != 204
232 #error Wrong code for getaffinity system call.
233 #endif /* __NR_sched_getaffinity */
234 #elif KMP_ARCH_PPC64
235 #ifndef __NR_sched_setaffinity
236 #define __NR_sched_setaffinity 222
237 #elif __NR_sched_setaffinity != 222
238 #error Wrong code for setaffinity system call.
239 #endif /* __NR_sched_setaffinity */
240 #ifndef __NR_sched_getaffinity
241 #define __NR_sched_getaffinity 223
242 #elif __NR_sched_getaffinity != 223
243 #error Wrong code for getaffinity system call.
244 #endif /* __NR_sched_getaffinity */
245 #elif KMP_ARCH_MIPS
246 #ifndef __NR_sched_setaffinity
247 #define __NR_sched_setaffinity 4239
248 #elif __NR_sched_setaffinity != 4239
249 #error Wrong code for setaffinity system call.
250 #endif /* __NR_sched_setaffinity */
251 #ifndef __NR_sched_getaffinity
252 #define __NR_sched_getaffinity 4240
253 #elif __NR_sched_getaffinity != 4240
254 #error Wrong code for getaffinity system call.
255 #endif /* __NR_sched_getaffinity */
256 #elif KMP_ARCH_MIPS64
257 #ifndef __NR_sched_setaffinity
258 #define __NR_sched_setaffinity 5195
259 #elif __NR_sched_setaffinity != 5195
260 #error Wrong code for setaffinity system call.
261 #endif /* __NR_sched_setaffinity */
262 #ifndef __NR_sched_getaffinity
263 #define __NR_sched_getaffinity 5196
264 #elif __NR_sched_getaffinity != 5196
265 #error Wrong code for getaffinity system call.
266 #endif /* __NR_sched_getaffinity */
267 #elif KMP_ARCH_LOONGARCH64
268 #ifndef __NR_sched_setaffinity
269 #define __NR_sched_setaffinity 122
270 #elif __NR_sched_setaffinity != 122
271 #error Wrong code for setaffinity system call.
272 #endif /* __NR_sched_setaffinity */
273 #ifndef __NR_sched_getaffinity
274 #define __NR_sched_getaffinity 123
275 #elif __NR_sched_getaffinity != 123
276 #error Wrong code for getaffinity system call.
277 #endif /* __NR_sched_getaffinity */
278 #else
279 #error Unknown or unsupported architecture
280 #endif /* KMP_ARCH_* */
281 #elif KMP_OS_FREEBSD
282 #include <pthread.h>
283 #include <pthread_np.h>
284 #endif
285 class KMPNativeAffinity : public KMPAffinity {
286  class Mask : public KMPAffinity::Mask {
287  typedef unsigned long mask_t;
288  typedef decltype(__kmp_affin_mask_size) mask_size_type;
289  static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
290  static const mask_t ONE = 1;
291  mask_size_type get_num_mask_types() const {
292  return __kmp_affin_mask_size / sizeof(mask_t);
293  }
294 
295  public:
296  mask_t *mask;
297  Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
298  ~Mask() {
299  if (mask)
300  __kmp_free(mask);
301  }
302  void set(int i) override {
303  mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
304  }
305  bool is_set(int i) const override {
306  return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
307  }
308  void clear(int i) override {
309  mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
310  }
311  void zero() override {
312  mask_size_type e = get_num_mask_types();
313  for (mask_size_type i = 0; i < e; ++i)
314  mask[i] = (mask_t)0;
315  }
316  void copy(const KMPAffinity::Mask *src) override {
317  const Mask *convert = static_cast<const Mask *>(src);
318  mask_size_type e = get_num_mask_types();
319  for (mask_size_type i = 0; i < e; ++i)
320  mask[i] = convert->mask[i];
321  }
322  void bitwise_and(const KMPAffinity::Mask *rhs) override {
323  const Mask *convert = static_cast<const Mask *>(rhs);
324  mask_size_type e = get_num_mask_types();
325  for (mask_size_type i = 0; i < e; ++i)
326  mask[i] &= convert->mask[i];
327  }
328  void bitwise_or(const KMPAffinity::Mask *rhs) override {
329  const Mask *convert = static_cast<const Mask *>(rhs);
330  mask_size_type e = get_num_mask_types();
331  for (mask_size_type i = 0; i < e; ++i)
332  mask[i] |= convert->mask[i];
333  }
334  void bitwise_not() override {
335  mask_size_type e = get_num_mask_types();
336  for (mask_size_type i = 0; i < e; ++i)
337  mask[i] = ~(mask[i]);
338  }
339  int begin() const override {
340  int retval = 0;
341  while (retval < end() && !is_set(retval))
342  ++retval;
343  return retval;
344  }
345  int end() const override {
346  int e;
347  __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
348  return e;
349  }
350  int next(int previous) const override {
351  int retval = previous + 1;
352  while (retval < end() && !is_set(retval))
353  ++retval;
354  return retval;
355  }
356  int get_system_affinity(bool abort_on_error) override {
357  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
358  "Illegal get affinity operation when not capable");
359 #if KMP_OS_LINUX
360  long retval =
361  syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
362 #elif KMP_OS_FREEBSD
363  int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
364  reinterpret_cast<cpuset_t *>(mask));
365  int retval = (r == 0 ? 0 : -1);
366 #endif
367  if (retval >= 0) {
368  return 0;
369  }
370  int error = errno;
371  if (abort_on_error) {
372  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
373  }
374  return error;
375  }
376  int set_system_affinity(bool abort_on_error) const override {
377  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
378  "Illegal set affinity operation when not capable");
379 #if KMP_OS_LINUX
380  long retval =
381  syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
382 #elif KMP_OS_FREEBSD
383  int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
384  reinterpret_cast<cpuset_t *>(mask));
385  int retval = (r == 0 ? 0 : -1);
386 #endif
387  if (retval >= 0) {
388  return 0;
389  }
390  int error = errno;
391  if (abort_on_error) {
392  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
393  }
394  return error;
395  }
396  };
397  void determine_capable(const char *env_var) override {
398  __kmp_affinity_determine_capable(env_var);
399  }
400  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
401  KMPAffinity::Mask *allocate_mask() override {
402  KMPNativeAffinity::Mask *retval = new Mask();
403  return retval;
404  }
405  void deallocate_mask(KMPAffinity::Mask *m) override {
406  KMPNativeAffinity::Mask *native_mask =
407  static_cast<KMPNativeAffinity::Mask *>(m);
408  delete native_mask;
409  }
410  KMPAffinity::Mask *allocate_mask_array(int num) override {
411  return new Mask[num];
412  }
413  void deallocate_mask_array(KMPAffinity::Mask *array) override {
414  Mask *linux_array = static_cast<Mask *>(array);
415  delete[] linux_array;
416  }
417  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
418  int index) override {
419  Mask *linux_array = static_cast<Mask *>(array);
420  return &(linux_array[index]);
421  }
422  api_type get_api_type() const override { return NATIVE_OS; }
423 };
424 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
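
// Illustrative sketch (not part of the upstream header): a self-contained,
// Linux-only snippet showing the same technique KMPNativeAffinity::Mask uses
// above -- the affinity mask is an array of unsigned long words, and the raw
// sched_getaffinity syscall fills it for the calling thread (pid 0). On
// success the raw syscall returns a nonnegative value. Guarded out so it is
// never compiled.
#if 0
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void example_print_affinity(void) {
  unsigned long mask[16] = {0}; // enough words for 1024 logical CPUs
  long ret = syscall(__NR_sched_getaffinity, 0, sizeof(mask), mask);
  if (ret < 0) {
    perror("sched_getaffinity");
    return;
  }
  const unsigned bits = 8 * sizeof(unsigned long); // bits per mask word
  for (unsigned i = 0; i < (sizeof(mask) / sizeof(mask[0])) * bits; ++i)
    if (mask[i / bits] & (1UL << (i % bits))) // same indexing as Mask::is_set
      printf("CPU %u is in the calling thread's affinity mask\n", i);
}
#endif
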
425 
426 #if KMP_OS_WINDOWS
427 class KMPNativeAffinity : public KMPAffinity {
428  class Mask : public KMPAffinity::Mask {
429  typedef ULONG_PTR mask_t;
430  static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
431  mask_t *mask;
432 
433  public:
434  Mask() {
435  mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
436  }
437  ~Mask() {
438  if (mask)
439  __kmp_free(mask);
440  }
441  void set(int i) override {
442  mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
443  }
444  bool is_set(int i) const override {
445  return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
446  }
447  void clear(int i) override {
448  mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
449  }
450  void zero() override {
451  for (int i = 0; i < __kmp_num_proc_groups; ++i)
452  mask[i] = 0;
453  }
454  void copy(const KMPAffinity::Mask *src) override {
455  const Mask *convert = static_cast<const Mask *>(src);
456  for (int i = 0; i < __kmp_num_proc_groups; ++i)
457  mask[i] = convert->mask[i];
458  }
459  void bitwise_and(const KMPAffinity::Mask *rhs) override {
460  const Mask *convert = static_cast<const Mask *>(rhs);
461  for (int i = 0; i < __kmp_num_proc_groups; ++i)
462  mask[i] &= convert->mask[i];
463  }
464  void bitwise_or(const KMPAffinity::Mask *rhs) override {
465  const Mask *convert = static_cast<const Mask *>(rhs);
466  for (int i = 0; i < __kmp_num_proc_groups; ++i)
467  mask[i] |= convert->mask[i];
468  }
469  void bitwise_not() override {
470  for (int i = 0; i < __kmp_num_proc_groups; ++i)
471  mask[i] = ~(mask[i]);
472  }
473  int begin() const override {
474  int retval = 0;
475  while (retval < end() && !is_set(retval))
476  ++retval;
477  return retval;
478  }
479  int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
480  int next(int previous) const override {
481  int retval = previous + 1;
482  while (retval < end() && !is_set(retval))
483  ++retval;
484  return retval;
485  }
486  int set_process_affinity(bool abort_on_error) const override {
487  if (__kmp_num_proc_groups <= 1) {
488  if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
489  DWORD error = GetLastError();
490  if (abort_on_error) {
491  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
492  __kmp_msg_null);
493  }
494  return error;
495  }
496  }
497  return 0;
498  }
499  int set_system_affinity(bool abort_on_error) const override {
500  if (__kmp_num_proc_groups > 1) {
501  // Check for a valid mask.
502  GROUP_AFFINITY ga;
503  int group = get_proc_group();
504  if (group < 0) {
505  if (abort_on_error) {
506  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
507  }
508  return -1;
509  }
510  // Transform the bit vector into a GROUP_AFFINITY struct
511  // and make the system call to set affinity.
512  ga.Group = group;
513  ga.Mask = mask[group];
514  ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
515 
516  KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
517  if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
518  DWORD error = GetLastError();
519  if (abort_on_error) {
520  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
521  __kmp_msg_null);
522  }
523  return error;
524  }
525  } else {
526  if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
527  DWORD error = GetLastError();
528  if (abort_on_error) {
529  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
530  __kmp_msg_null);
531  }
532  return error;
533  }
534  }
535  return 0;
536  }
537  int get_system_affinity(bool abort_on_error) override {
538  if (__kmp_num_proc_groups > 1) {
539  this->zero();
540  GROUP_AFFINITY ga;
541  KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
542  if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
543  DWORD error = GetLastError();
544  if (abort_on_error) {
545  __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
546  KMP_ERR(error), __kmp_msg_null);
547  }
548  return error;
549  }
550  if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
551  (ga.Mask == 0)) {
552  return -1;
553  }
554  mask[ga.Group] = ga.Mask;
555  } else {
556  mask_t newMask, sysMask, retval;
557  if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
558  DWORD error = GetLastError();
559  if (abort_on_error) {
560  __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
561  KMP_ERR(error), __kmp_msg_null);
562  }
563  return error;
564  }
565  retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
566  if (!retval) {
567  DWORD error = GetLastError();
568  if (abort_on_error) {
569  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
570  KMP_ERR(error), __kmp_msg_null);
571  }
572  return error;
573  }
574  newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
575  if (!newMask) {
576  DWORD error = GetLastError();
577  if (abort_on_error) {
578  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
579  KMP_ERR(error), __kmp_msg_null);
580  }
581  }
582  *mask = retval;
583  }
584  return 0;
585  }
586  int get_proc_group() const override {
587  int group = -1;
588  if (__kmp_num_proc_groups == 1) {
589  return 1;
590  }
591  for (int i = 0; i < __kmp_num_proc_groups; i++) {
592  if (mask[i] == 0)
593  continue;
594  if (group >= 0)
595  return -1;
596  group = i;
597  }
598  return group;
599  }
600  };
601  void determine_capable(const char *env_var) override {
602  __kmp_affinity_determine_capable(env_var);
603  }
604  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
605  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
606  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
607  KMPAffinity::Mask *allocate_mask_array(int num) override {
608  return new Mask[num];
609  }
610  void deallocate_mask_array(KMPAffinity::Mask *array) override {
611  Mask *windows_array = static_cast<Mask *>(array);
612  delete[] windows_array;
613  }
614  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
615  int index) override {
616  Mask *windows_array = static_cast<Mask *>(array);
617  return &(windows_array[index]);
618  }
619  api_type get_api_type() const override { return NATIVE_OS; }
620 };
621 #endif /* KMP_OS_WINDOWS */
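
// Illustrative note (not part of the upstream header): on Windows the mask is
// stored as one ULONG_PTR per processor group (64 bits each on x64), so
// logical CPU i maps to mask[i / BITS_PER_MASK_T], bit i % BITS_PER_MASK_T;
// e.g., with two 64-processor groups, CPU 70 is bit 6 of mask[1].
// get_proc_group() returns the single group that has bits set, or -1 when the
// mask spans several groups, because a thread can only be bound within one
// processor group at a time.
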
622 #endif /* KMP_AFFINITY_SUPPORTED */
623 
624 class kmp_hw_thread_t {
625 public:
626  static const int UNKNOWN_ID = -1;
627  static int compare_ids(const void *a, const void *b);
628  static int compare_compact(const void *a, const void *b);
629  int ids[KMP_HW_LAST];
630  int sub_ids[KMP_HW_LAST];
631  bool leader;
632  int os_id;
633  void print() const;
634  void clear() {
635  for (int i = 0; i < (int)KMP_HW_LAST; ++i)
636  ids[i] = UNKNOWN_ID;
637  leader = false;
638  }
639 };
640 
641 class kmp_topology_t {
642 
643  struct flags_t {
644  int uniform : 1;
645  int reserved : 31;
646  };
647 
648  int depth;
649 
650  // The following arrays are all 'depth' long
651 
652  // Ordered array of the types in the topology
653  kmp_hw_t *types;
654 
655  // Quick topology ratios; for non-uniform topologies, each entry holds
656  // the maximum number of itemA per itemB,
657  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
658  int *ratio;
659 
660  // Storage containing the absolute number of each topology layer
661  int *count;
662 
663  // The hardware threads array
664  // hw_threads is num_hw_threads long
665  // Each hw_thread's ids and sub_ids are depth deep
666  int num_hw_threads;
667  kmp_hw_thread_t *hw_threads;
668 
669  // Equivalence hash where the key is the hardware topology item
670  // and the value is the equivalent hardware topology type in the
671  // types[] array. If the value is KMP_HW_UNKNOWN, then there is no
672  // known equivalence for the topology type.
673  kmp_hw_t equivalent[KMP_HW_LAST];
674 
675  // Flags describing the topology
676  flags_t flags;
677 
678  // Count each item & get the num x's per y
679  // e.g., get the number of cores and the number of threads per core
680  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
681  void _gather_enumeration_information();
682 
683  // Remove layers that don't add information to the topology.
684  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
685  void _remove_radix1_layers();
686 
687  // Find out if the topology is uniform
688  void _discover_uniformity();
689 
690  // Set all the sub_ids for each hardware thread
691  void _set_sub_ids();
692 
693  // Set global affinity variables describing the number of threads per
694  // core, the number of packages, the number of cores per package, and
695  // the number of cores.
696  void _set_globals();
697 
698  // Set the last level cache equivalent type
699  void _set_last_level_cache();
700 
701 public:
702  // Force use of allocate()/deallocate()
703  kmp_topology_t() = delete;
704  kmp_topology_t(const kmp_topology_t &t) = delete;
705  kmp_topology_t(kmp_topology_t &&t) = delete;
706  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
707  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
708 
709  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
710  static void deallocate(kmp_topology_t *);
711 
712  // Functions used in create_map() routines
713  kmp_hw_thread_t &at(int index) {
714  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
715  return hw_threads[index];
716  }
717  const kmp_hw_thread_t &at(int index) const {
718  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
719  return hw_threads[index];
720  }
721  int get_num_hw_threads() const { return num_hw_threads; }
722  void sort_ids() {
723  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
724  kmp_hw_thread_t::compare_ids);
725  }
726  // Check if the hardware ids are unique. Returns true if they are,
727  // false otherwise.
728  bool check_ids() const;
729 
730  // Function to call after the create_map() routine
731  void canonicalize();
732  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
733 
734  // Functions used after canonicalize() is called
735  bool filter_hw_subset();
736  bool is_close(int hwt1, int hwt2, int level) const;
737  bool is_uniform() const { return flags.uniform; }
738  // Get the equivalent type for a given type in the topology; a return value
739  // of KMP_HW_UNKNOWN means the type has no equivalent in this topology
740  kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
741  // Set type1 = type2
742  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
743  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
744  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
745  kmp_hw_t real_type2 = equivalent[type2];
746  if (real_type2 == KMP_HW_UNKNOWN)
747  real_type2 = type2;
748  equivalent[type1] = real_type2;
749  // This loop is required since any of the types may have been set to
750  // be equivalent to type1. They all must be checked and reset to type2.
751  KMP_FOREACH_HW_TYPE(type) {
752  if (equivalent[type] == type1) {
753  equivalent[type] = real_type2;
754  }
755  }
756  }
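// Illustrative note (not part of the upstream header, hypothetical scenario):
// if detection finds that every core has a private L2, the topology code can
// record set_equivalent_type(KMP_HW_L2, KMP_HW_CORE); afterwards
// get_equivalent_type(KMP_HW_L2) returns KMP_HW_CORE and get_level(KMP_HW_L2)
// resolves to the core layer.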
757  // Calculate the number of items at level1 per item at level2
758  // (e.g., the number of threads per core)
759  int calculate_ratio(int level1, int level2) const {
760  KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
761  KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
762  int r = 1;
763  for (int level = level1; level > level2; --level)
764  r *= ratio[level];
765  return r;
766  }
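// Illustrative note (not part of the upstream header): for the example
// topology above, [ 4 packages | 6 cores / package | 2 threads / core ],
// ratio = {4, 6, 2}. With level(package) = 0 and level(thread) = 2,
// calculate_ratio(2, 0) = ratio[2] * ratio[1] = 2 * 6 = 12 threads per package.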
767  int get_ratio(int level) const {
768  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
769  return ratio[level];
770  }
771  int get_depth() const { return depth; }
772  kmp_hw_t get_type(int level) const {
773  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
774  return types[level];
775  }
776  int get_level(kmp_hw_t type) const {
777  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
778  int eq_type = equivalent[type];
779  if (eq_type == KMP_HW_UNKNOWN)
780  return -1;
781  for (int i = 0; i < depth; ++i)
782  if (types[i] == eq_type)
783  return i;
784  return -1;
785  }
786  int get_count(int level) const {
787  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
788  return count[level];
789  }
790 #if KMP_AFFINITY_SUPPORTED
791  void sort_compact() {
792  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
793  kmp_hw_thread_t::compare_compact);
794  }
795 #endif
796  void print(const char *env_var = "KMP_AFFINITY") const;
797  void dump() const;
798 };
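
// Illustrative sketch (not part of the upstream header): how a create_map()
// style routine could fill in and query a kmp_topology_t. The machine shape,
// the ids, and the function name are made up for the example; guarded out so
// it is never compiled.
#if 0
static void example_build_topology(void) {
  // A toy machine: 2 sockets x 2 cores x 2 hardware threads = 8 CPUs.
  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
  kmp_topology_t *topo = kmp_topology_t::allocate(8, 3, types);
  for (int os = 0; os < 8; ++os) {
    kmp_hw_thread_t &hwt = topo->at(os);
    hwt.clear();
    hwt.os_id = os;
    hwt.ids[0] = os / 4; // socket id
    hwt.ids[1] = os / 2; // core id
    hwt.ids[2] = os % 2; // hardware thread id within the core
  }
  topo->sort_ids();
  topo->canonicalize();
  // After canonicalize(), levels can be queried symbolically:
  int thread_level = topo->get_level(KMP_HW_THREAD);
  int threads_per_core = topo->get_ratio(thread_level); // 2 in this example
  (void)threads_per_core;
  kmp_topology_t::deallocate(topo);
}
#endif
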
799 
800 class kmp_hw_subset_t {
801 public:
802  struct item_t {
803  int num;
804  kmp_hw_t type;
805  int offset;
806  };
807 
808 private:
809  int depth;
810  int capacity;
811  item_t *items;
812  kmp_uint64 set;
813  bool absolute;
814  // The set must be able to handle up to KMP_HW_LAST layers
815  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
816 
817 public:
818  // Force use of allocate()/deallocate()
819  kmp_hw_subset_t() = delete;
820  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
821  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
822  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
823  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
824 
825  static kmp_hw_subset_t *allocate() {
826  int initial_capacity = 5;
827  kmp_hw_subset_t *retval =
828  (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
829  retval->depth = 0;
830  retval->capacity = initial_capacity;
831  retval->set = 0ull;
832  retval->absolute = false;
833  retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
834  return retval;
835  }
836  static void deallocate(kmp_hw_subset_t *subset) {
837  __kmp_free(subset->items);
838  __kmp_free(subset);
839  }
840  void set_absolute() { absolute = true; }
841  bool is_absolute() const { return absolute; }
842  void push_back(int num, kmp_hw_t type, int offset) {
843  if (depth == capacity - 1) {
844  capacity *= 2;
845  item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
846  for (int i = 0; i < depth; ++i)
847  new_items[i] = items[i];
848  __kmp_free(items);
849  items = new_items;
850  }
851  items[depth].num = num;
852  items[depth].type = type;
853  items[depth].offset = offset;
854  depth++;
855  set |= (1ull << type);
856  }
857  int get_depth() const { return depth; }
858  const item_t &at(int index) const {
859  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
860  return items[index];
861  }
862  item_t &at(int index) {
863  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
864  return items[index];
865  }
866  void remove(int index) {
867  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
868  set &= ~(1ull << items[index].type);
869  for (int j = index + 1; j < depth; ++j) {
870  items[j - 1] = items[j];
871  }
872  depth--;
873  }
874  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
875  void dump() const {
876  printf("**********************\n");
877  printf("*** kmp_hw_subset: ***\n");
878  printf("* depth: %d\n", depth);
879  printf("* items:\n");
880  for (int i = 0; i < depth; ++i) {
881  printf("num: %d, type: %s, offset: %d\n", items[i].num,
882  __kmp_hw_get_keyword(items[i].type), items[i].offset);
883  }
884  printf("* set: 0x%llx\n", set);
885  printf("* absolute: %d\n", absolute);
886  printf("**********************\n");
887  }
888 };
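
// Illustrative sketch (not part of the upstream header): the kind of
// kmp_hw_subset_t the KMP_HW_SUBSET parser elsewhere in the runtime could
// build for a setting such as KMP_HW_SUBSET=2s,4c,2t (two sockets, four
// cores per socket, two threads per core, no offsets). Guarded out so it is
// never compiled.
#if 0
static kmp_hw_subset_t *example_make_subset(void) {
  kmp_hw_subset_t *subset = kmp_hw_subset_t::allocate();
  subset->push_back(/*num=*/2, KMP_HW_SOCKET, /*offset=*/0);
  subset->push_back(/*num=*/4, KMP_HW_CORE, /*offset=*/0);
  subset->push_back(/*num=*/2, KMP_HW_THREAD, /*offset=*/0);
  // subset->specified(KMP_HW_CORE) is now true; unlisted layers report false.
  return subset; // release with kmp_hw_subset_t::deallocate() when done
}
#endif
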
889 
890 extern kmp_topology_t *__kmp_topology;
891 extern kmp_hw_subset_t *__kmp_hw_subset;
892 
893 /* A structure for holding machine-specific hierarchy info to be computed once
894  at init. This structure represents a mapping of threads to the actual machine
895  hierarchy, or to our best guess at what the hierarchy might be, for the
896  purpose of performing an efficient barrier. In the worst case, when there is
897  no machine hierarchy information, it produces a tree suitable for a barrier,
898  similar to the tree used in the hyper barrier. */
899 class hierarchy_info {
900 public:
901  /* Good default values for number of leaves and branching factor, given no
902  affinity information. Behaves a bit like hyper barrier. */
903  static const kmp_uint32 maxLeaves = 4;
904  static const kmp_uint32 minBranch = 4;
910  kmp_uint32 maxLevels; // capacity of numPerLevel/skipPerLevel; can grow in resize()
911 
916  kmp_uint32 depth; // number of hierarchy levels currently in use
917  kmp_uint32 base_num_threads; // number of threads the hierarchy currently covers
918  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
919  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
920  // 2=initialization in progress
921  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
922 
927  kmp_uint32 *numPerLevel; // numPerLevel[i] = level-i nodes per level-(i+1) node; level 0 = threads
928  kmp_uint32 *skipPerLevel; // skipPerLevel[i] = threads (leaves) covered by one level-i node
929 
930  void deriveLevels() {
931  int hier_depth = __kmp_topology->get_depth();
932  for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
933  numPerLevel[level] = __kmp_topology->get_ratio(i);
934  }
935  }
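// Illustrative note (not part of the upstream header): deriveLevels() walks
// the detected topology from the innermost layer outwards. For a machine of
// [ 4 packages | 6 cores / package | 2 threads / core ] (depth 3) it yields
// numPerLevel[0] = 2 (threads per core), numPerLevel[1] = 6 (cores per
// package), numPerLevel[2] = 4 (packages); the remaining entries keep the
// value 1 they were given in init().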
936 
937  hierarchy_info()
938  : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
939 
940  void fini() {
941  if (!uninitialized && numPerLevel) {
942  __kmp_free(numPerLevel);
943  numPerLevel = NULL;
944  uninitialized = not_initialized;
945  }
946  }
947 
948  void init(int num_addrs) {
949  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
950  &uninitialized, not_initialized, initializing);
951  if (bool_result == 0) { // Wait for initialization
952  while (TCR_1(uninitialized) != initialized)
953  KMP_CPU_PAUSE();
954  return;
955  }
956  KMP_DEBUG_ASSERT(bool_result == 1);
957 
958  /* Explicitly initialize the data fields here to prevent use of dirty values
959  observed when the static library is re-initialized multiple times (e.g.,
960  when a non-OpenMP thread repeatedly launches/joins a thread that uses
961  OpenMP). */
962  depth = 1;
963  resizing = 0;
964  maxLevels = 7;
965  numPerLevel =
966  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
967  skipPerLevel = &(numPerLevel[maxLevels]);
968  for (kmp_uint32 i = 0; i < maxLevels;
969  ++i) { // init numPerLevel[*] to 1 item per level
970  numPerLevel[i] = 1;
971  skipPerLevel[i] = 1;
972  }
973 
974  // Derive the per-level counts from the machine topology when one is available
975  if (__kmp_topology && __kmp_topology->get_depth() > 0) {
976  deriveLevels();
977  } else {
978  numPerLevel[0] = maxLeaves;
979  numPerLevel[1] = num_addrs / maxLeaves;
980  if (num_addrs % maxLeaves)
981  numPerLevel[1]++;
982  }
983 
984  base_num_threads = num_addrs;
985  for (int i = maxLevels - 1; i >= 0;
986  --i) // count non-empty levels to get depth
987  if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
988  depth++;
989 
990  kmp_uint32 branch = minBranch;
991  if (numPerLevel[0] == 1)
992  branch = num_addrs / maxLeaves;
993  if (branch < minBranch)
994  branch = minBranch;
995  for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
996  while (numPerLevel[d] > branch ||
997  (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
998  if (numPerLevel[d] & 1)
999  numPerLevel[d]++;
1000  numPerLevel[d] = numPerLevel[d] >> 1;
1001  if (numPerLevel[d + 1] == 1)
1002  depth++;
1003  numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1004  }
1005  if (numPerLevel[0] == 1) {
1006  branch = branch >> 1;
1007  if (branch < 4)
1008  branch = minBranch;
1009  }
1010  }
1011 
1012  for (kmp_uint32 i = 1; i < depth; ++i)
1013  skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1014  // Fill in hierarchy in the case of oversubscription
1015  for (kmp_uint32 i = depth; i < maxLevels; ++i)
1016  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1017 
1018  uninitialized = initialized; // One writer
1019  }
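
// Illustrative note (not part of the upstream header): a worked example of
// init() when no topology information is available. With num_addrs = 16,
// numPerLevel becomes {4, 4, 1, 1, 1, 1, 1} (maxLeaves leaves per node and
// 16/4 nodes above them), depth ends up as 3, and skipPerLevel becomes
// {1, 4, 16, 32, 64, 128, 256}; the entries past depth are the doubled
// oversubscription levels filled in by the final loop above.
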
1020 
1021  // Resize the hierarchy if nproc changes to something larger than before
1022  void resize(kmp_uint32 nproc) {
1023  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1024  while (bool_result == 0) { // someone else is trying to resize
1025  KMP_CPU_PAUSE();
1026  if (nproc <= base_num_threads) // happy with other thread's resize
1027  return;
1028  else // try to resize
1029  bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1030  }
1031  KMP_DEBUG_ASSERT(bool_result != 0);
1032  if (nproc <= base_num_threads)
1033  return; // happy with other thread's resize
1034 
1035  // Calculate new maxLevels
1036  kmp_uint32 old_sz = skipPerLevel[depth - 1];
1037  kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1038  // First see if old maxLevels is enough to contain new size
1039  for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1040  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1041  numPerLevel[i - 1] *= 2;
1042  old_sz *= 2;
1043  depth++;
1044  }
1045  if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1046  while (nproc > old_sz) {
1047  old_sz *= 2;
1048  incs++;
1049  depth++;
1050  }
1051  maxLevels += incs;
1052 
1053  // Resize arrays
1054  kmp_uint32 *old_numPerLevel = numPerLevel;
1055  kmp_uint32 *old_skipPerLevel = skipPerLevel;
1056  numPerLevel = skipPerLevel = NULL;
1057  numPerLevel =
1058  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1059  skipPerLevel = &(numPerLevel[maxLevels]);
1060 
1061  // Copy old elements from old arrays
1062  for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1063  // carry over the previously computed per-level values
1064  numPerLevel[i] = old_numPerLevel[i];
1065  skipPerLevel[i] = old_skipPerLevel[i];
1066  }
1067 
1068  // Init new elements in arrays to 1
1069  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1070  // init numPerLevel[*] to 1 item per level
1071  numPerLevel[i] = 1;
1072  skipPerLevel[i] = 1;
1073  }
1074 
1075  // Free old arrays
1076  __kmp_free(old_numPerLevel);
1077  }
1078 
1079  // Fill in oversubscription levels of hierarchy
1080  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1081  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1082 
1083  base_num_threads = nproc;
1084  resizing = 0; // One writer
1085  }
1086 };
1087 #endif // KMP_AFFINITY_H